| 9 |
|
import pywikibot |
| 10 |
|
import re |
| 11 |
|
import requests # for listing members with dir() when debugging |
| 12 |
+ |
import time |
| 13 |
|
|
| 14 |
|
from bs4 import BeautifulSoup |
| 15 |
|
from pywikibot import pagegenerators |
| 20 |
|
|
| 21 |
|
class IWLink: |
| 22 |
|
def __init__(self, iw_prefix, prefix_url, full_url, page_name, page_name_only, page_slug, hosting_page, curl_response): |
| 23 |
< |
self.iw_prefix = iw_prefix # e.g. "wp" |
| 23 |
> |
self.iw_prefix = iw_prefix # e.g. "wp" as in [[wp:Marathon (series)#Rampancy]] |
| 24 |
|
self.prefix_url = prefix_url # e.g. "https://en.wikipedia.org/wiki/" |
| 25 |
|
self.full_url = full_url # e.g. "https://en.wikipedia.org/wiki/Marathon_(series)#Rampancy" |
| 26 |
|
self.page_name = page_name # "Marathon (series)#Rampancy" |
| 41 |
|
errors_issued = 0 |
| 42 |
|
unintended_redirects_found = 0 |
| 43 |
|
name_printed = 0 |
| 44 |
+ |
request_delay = 1.5 |
| 45 |
+ |
max_retries = 3 |
| 46 |
+ |
backoff_factor = 2 |
| 47 |
|
|
| 48 |
|
# Prints the name of a page on which something occurred, if it has not been printed before |
| 49 |
|
def possibly_print(the_link): |
| 119 |
|
global errors_issued |
| 120 |
|
global unintended_redirects_found |
| 121 |
|
|
| 122 |
< |
the_link.curl_response = fetch(the_link.full_url) |
| 122 |
> |
# We have to carefully throttle requests because otherwise we will get hit with a 429: Too Many Requests |
| 123 |
> |
attempt = 0 |
| 124 |
> |
delay = request_delay |
| 125 |
> |
while True: |
| 126 |
> |
time.sleep(delay) |
| 127 |
> |
|
| 128 |
> |
the_link.curl_response = fetch(the_link.full_url) |
| 129 |
> |
|
| 130 |
> |
if the_link.curl_response.status_code != 429: |
| 131 |
> |
break |
| 132 |
> |
|
| 133 |
> |
attempt += 1 |
| 134 |
> |
if attempt > max_retries: |
| 135 |
> |
pywikibot.stdout(f' ERROR: Maximum retries afer error 429 exceeded for "{the_link.page_slug}". Aborting script.') |
| 136 |
> |
raise SystemExit(1) |
| 137 |
> |
|
| 138 |
> |
# Back off: lengthen the delay before the next retry, since the server answered with a 429 |
| 139 |
> |
delay *= backoff_factor |
| 140 |
> |
pywikibot.stdout(f' WARNING: Received error 429 for "{the_link.page_slug}". Retrying in {delay:.1f}s...') |
| 141 |
|
|
| 142 |
|
# One way we tell that a redirect occurred is by checking fetch's history, as it automatically follows redirects. This will catch formal redirects which come from |
| 143 |
|
# pages such as Special:PermanentLink. |