ViewVC Help
View File | Revision Log | View Changeset | Root Listing
root/Oni2/ValBot/Python/check_interwiki_links.py
(Generate patch)

Comparing ValBot/Python/check_interwiki_links.py (file contents):
Revision 1206 by iritscen, Mon Feb 9 03:21:59 2026 UTC vs.
Revision 1207 by iritscen, Sat Mar 21 23:06:14 2026 UTC

# Line 9 | Line 9 | import bs4
9   import pywikibot
10   import re
11   import requests # for listing members with dir() when debugging
12 + import time
13  
14   from bs4 import BeautifulSoup
15   from pywikibot import pagegenerators
# Line 19 | Line 20 | from urllib.parse import urljoin
20  
21   class IWLink:
22     def __init__(self, iw_prefix, prefix_url, full_url, page_name, page_name_only, page_slug, hosting_page, curl_response):
23 <      self.iw_prefix = iw_prefix # e.g. "wp"
23 >      self.iw_prefix = iw_prefix # e.g. "wp" as in [[wp:Marathon (series)#Rampancy]]
24        self.prefix_url = prefix_url # e.g. "https://en.wikipedia.org/wiki/"
25        self.full_url = full_url # e.g. "https://en.wikipedia.org/wiki/Marathon_(series)#Rampancy"
26        self.page_name = page_name # "Marathon (series)#Rampancy"
# Line 40 | Line 41 | iw_found = 0
41   errors_issued = 0
42   unintended_redirects_found = 0
43   name_printed = 0
44 + request_delay = 1.5
45 + max_retries = 3
46 + backoff_factor = 2
47  
48   # Prints the name of a page on which something occurred, if it has not been printed before
49   def possibly_print(the_link):
# Line 115 | Line 119 | def test_interwiki_link(the_link):
119     global errors_issued
120     global unintended_redirects_found
121    
122 <   the_link.curl_response = fetch(the_link.full_url)
122 >   # We have to carefully throttle requests because otherwise we will get hit with a 429: Too Many Requests
123 >   attempt = 0
124 >   delay = request_delay
125 >   while True:
126 >       time.sleep(delay)
127 >  
128 >       the_link.curl_response = fetch(the_link.full_url)
129 >  
130 >       if the_link.curl_response.status_code != 429:
131 >           break
132 >  
133 >       attempt += 1
134 >       if attempt > max_retries:
135 >          pywikibot.stdout(f'   ERROR: Maximum retries afer error 429 exceeded for "{the_link.page_slug}". Aborting script.')
136 >          raise SystemExit(1)
137 >  
138 >       # Back off: lengthen the delay before the next attempt (i.e. lower our request rate) since we got the error
139 >       delay *= backoff_factor
140 >       pywikibot.stdout(f'   WARNING: Received error 429 for "{the_link.page_slug}". Retrying in {delay:.1f}s...')
141  
142     # One way we tell that a redirect occurred is by checking fetch's history, as it automatically follows redirects. This will catch formal redirects which come from
143     # pages such as Special:PermanentLink.

Diff Legend

- Removed lines
+ Added lines
< Changed lines (old)
> Changed lines (new)