--- ValBot/Python/check_interwiki_links.py 2025/08/15 20:55:01 1196 +++ ValBot/Python/check_interwiki_links.py 2025/09/29 19:18:16 1200 @@ -1,11 +1,9 @@ # Check Interwiki Links # by iritscen@yahoo.com -# Looks at each link on a page (or all the pages in a category) which uses a registered -# interwiki prefix and loads the linked page, verifying that it exists and that any section -# link, if present, is valid as well. The output will use the word "ERROR" when it cannot -# validate the interwiki link. +# Looks at each link on a page (or all the pages in a category) which uses a registered interwiki prefix and loads the linked page, verifying that it exists and that +# any section link, if present, is valid as well. The output will use the word "ERROR" when it cannot validate the interwiki link. # Recommended viewing width: -# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---| +# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ----| import bs4 import pywikibot @@ -21,12 +19,14 @@ from pywikibot.tools.formatter import co from urllib.parse import urljoin class IWLink: - def __init__(self, iw_prefix, prefix_url, full_url, page_name, page_slug, curl_response): + def __init__(self, iw_prefix, prefix_url, full_url, page_name, page_name_only, page_slug, hosting_page, curl_response): self.iw_prefix = iw_prefix # e.g. "wp" self.prefix_url = prefix_url # e.g. "https://en.wikipedia.org/wiki/" - self.full_url = full_url # e.g. "https://en.wikipedia.org/wiki/Easter_egg" - self.page_name = page_name # "Easter egg" - self.page_slug = page_slug # "Easter_egg" + self.full_url = full_url # e.g. "https://en.wikipedia.org/wiki/Marathon_(series)#Rampancy" + self.page_name = page_name # "Marathon (series)#Rampancy" + self.page_name_only = page_name # "Marathon (series)" + self.page_slug = page_slug # "Marathon_(series)#Rampancy" + self.hosting_page = hosting_page # "Easter eggs"; page where the link was found self.curl_response = curl_response # a class defined in the Requests library # Parallel arrays based on https://wiki.oni2.net/Special:Interwiki @@ -43,13 +43,13 @@ unintended_redirects_found = 0 name_printed = 0 # Prints the name of a page on which something occurred, if it has not been printed before -def possibly_print(page_name): +def possibly_print(the_link): global debug global name_printed if not name_printed and not debug: pywikibot.stdout('') - pywikibot.stdout('From page "{}":'.format(page_name)) + pywikibot.stdout('From page "{}":'.format(the_link.hosting_page)) name_printed = 1 # Search a page for the section specified in the link @@ -57,8 +57,7 @@ def find_section(the_link, print_result) global errors_issued # Isolate section link - target_page_name, anchor_name = the_link.page_slug.split('#') - target_page_name_human = target_page_name.replace('_', ' ') + _, anchor_name = the_link.page_slug.split('#') # Convert dot-notation hex entities to proper characters replacements = [(r'\.22', '"'), (r'\.27', "'"), (r'\.28', '('), (r'\.29', ')')] @@ -79,15 +78,13 @@ def find_section(the_link, print_result) # Tell user what we found if found_section == False: - possibly_print(the_link.page_name) - pywikibot.stdout(' ERROR: Could not find section "{0}" on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, target_page_name_human)) - # TODO: Check that page name has been corrected to redirected page if there was a redirect + possibly_print(the_link) + pywikibot.stdout(' ERROR: Could not find section "{0}" on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, the_link.page_name)) errors_issued = errors_issued + 1 elif print_result == True: - pywikibot.stdout(' The section "{0}" was found on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, target_page_name_human)) + pywikibot.stdout(' The section "{0}" was found on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, the_link.page_name)) -# For a link that redirected us to another page, extract the name of the target page from -# the target page's source +# For a link that redirected us to another page, extract the name of the target page from the target page's source def find_canonical_link(the_link): # Extract link from this markup which contains name of redirected-to page: # @@ -102,19 +99,17 @@ def find_canonical_link(the_link): else: canonical_name = canonical_name[:tag_end] if len(canonical_name) > 100: - # Certain things can cause the trim to fail; report error and avoid slamming the - # output with massive page source from a failed trim + # Certain things can cause the trim to fail; report error and avoid slamming the output with massive page source from a failed trim pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect to "{2}…" (string overflow).'.format(the_link.iw_prefix, the_link.page_slug, canonical_name[:100])) errors_issued = errors_issued + 1 else: - canonical_name = canonical_name.replace('_', ' ') + the_link.page_name = canonical_name.replace('_', ' ') if '#' in the_link.page_slug: - _, anchor_name = the_link.page_slug.split('#') - pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}#{3}", which is a valid page. Checking for section on that page….'.format(the_link.iw_prefix, the_link.page_slug, canonical_name, anchor_name)) - the_link.page_slug = the_link.page_slug.replace(the_link.page_name, canonical_name) # update page slug so that find_section() uses the right page name in its messages + the_link.page_name_only, _ = the_link.page_slug.split('#') + pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}", which is a valid page. Checking for section on that page….'.format(the_link.iw_prefix, the_link.page_name_only, the_link.page_name)) find_section(the_link, True) else: - pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}", which is a valid page.'.format(the_link.iw_prefix, the_link.page_slug, canonical_name)) + pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}", which is a valid page.'.format(the_link.iw_prefix, the_link.page_slug, the_link.page_name)) # Test an interwiki link and look for a section link if applicable def test_interwiki_link(the_link): @@ -123,11 +118,10 @@ def test_interwiki_link(the_link): the_link.curl_response = fetch(the_link.full_url) - # One way we tell that a redirect occurred is by checking fetch's history, as it - # automatically follows redirects. This will catch formal redirects which come from pages - # such as Special:PermanentLink. + # One way we tell that a redirect occurred is by checking fetch's history, as it automatically follows redirects. This will catch formal redirects which come from + # pages such as Special:PermanentLink. if the_link.curl_response.history != []: - possibly_print(the_link.page_name) + possibly_print(the_link) # If linked page is in all caps, e.g. WP:BEANS, it's likely a deliberate use of a redirect if the_link.page_slug.startswith('WP:') and the_link.page_slug == the_link.page_slug.upper(): @@ -144,18 +138,17 @@ def test_interwiki_link(the_link): pywikibot.stdout(' ERROR: Unrecognized type of redirection (code "{0}") for {1} link "{2}". You should check the link manually.'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug)) errors_issued = errors_issued + 1 elif the_link.curl_response.status_code != 200: - possibly_print(the_link.page_name) + possibly_print(the_link) pywikibot.stdout(' ERROR: Got response code {0} for {1} link "{2}". The page may not exist.'.format(the_link.curl_response.status_code, the_link.iw_prefix, the_link.page_slug)) errors_issued = errors_issued + 1 - # However the usual way that a redirect occurs is that MediaWiki redirects us sneakily - # using JavaScript, while returning code OK 200 as if the link was correct; this happens - # when a redirect page is accessed. We must detect these soft redirects by looking at the - # page source to find the redirect note inserted at the top of the page for the reader. + # However the usual way that a redirect occurs is that MediaWiki redirects us sneakily using JavaScript, while returning code OK 200 as if the link was correct; this + # happens when a redirect page is accessed. We must detect these soft redirects by looking at the page source to find the redirect note inserted at the top of the + # page for the reader. elif 'Redirected from