--- ValBot/Python/check_interwiki_links.py 2025/08/29 03:52:17 1197
+++ ValBot/Python/check_interwiki_links.py 2025/09/29 19:18:16 1200
@@ -1,11 +1,9 @@
# Check Interwiki Links
# by iritscen@yahoo.com
-# Looks at each link on a page (or all the pages in a category) which uses a registered
-# interwiki prefix and loads the linked page, verifying that it exists and that any section
-# link, if present, is valid as well. The output will use the word "ERROR" when it cannot
-# validate the interwiki link.
+# Looks at each link on a page (or all the pages in a category) which uses a registered interwiki prefix and loads the linked page, verifying that it exists and that
+# any section link, if present, is valid as well. The output will use the word "ERROR" when it cannot validate the interwiki link.
# Recommended viewing width:
-# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|
+# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ----|
import bs4
import pywikibot
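
The header comment above describes the per-link check. A minimal sketch of that check, using the Requests and BeautifulSoup libraries directly (the function and URL handling here are illustrative assumptions, not the script's own code):

import requests
from bs4 import BeautifulSoup

def link_and_section_exist(full_url):
    # Load the linked page; a non-200 response suggests the page does not exist
    response = requests.get(full_url)
    if response.status_code != 200:
        return False
    # If the link carries a section anchor, verify an element with that id is present
    if '#' in full_url:
        anchor_name = full_url.split('#', 1)[1]
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.find(id=anchor_name) is not None
    return True
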
@@ -21,12 +19,14 @@ from pywikibot.tools.formatter import co
from urllib.parse import urljoin
class IWLink:
- def __init__(self, iw_prefix, prefix_url, full_url, page_name, page_slug, curl_response):
+ def __init__(self, iw_prefix, prefix_url, full_url, page_name, page_name_only, page_slug, hosting_page, curl_response):
self.iw_prefix = iw_prefix # e.g. "wp"
self.prefix_url = prefix_url # e.g. "https://en.wikipedia.org/wiki/"
- self.full_url = full_url # e.g. "https://en.wikipedia.org/wiki/Easter_egg"
- self.page_name = page_name # "Easter egg"
- self.page_slug = page_slug # "Easter_egg"
+ self.full_url = full_url # e.g. "https://en.wikipedia.org/wiki/Marathon_(series)#Rampancy"
+ self.page_name = page_name # "Marathon (series)#Rampancy"
+ self.page_name_only = page_name_only # "Marathon (series)"
+ self.page_slug = page_slug # "Marathon_(series)#Rampancy"
+ self.hosting_page = hosting_page # "Easter eggs"; page where the link was found
self.curl_response = curl_response # a class defined in the Requests library
# Parallel arrays based on https://wiki.oni2.net/Special:Interwiki
@@ -43,13 +43,13 @@ unintended_redirects_found = 0
name_printed = 0
# Prints the name of a page on which something occurred, if it has not been printed before
-def possibly_print(page_name):
+def possibly_print(the_link):
global debug
global name_printed
if not name_printed and not debug:
pywikibot.stdout('')
- pywikibot.stdout('From page "{}":'.format(page_name))
+ pywikibot.stdout('From page "{}":'.format(the_link.hosting_page))
name_printed = 1
# Search a page for the section specified in the link
@@ -57,8 +57,7 @@ def find_section(the_link, print_result)
global errors_issued
# Isolate section link
- target_page_name, anchor_name = the_link.page_slug.split('#')
- target_page_name_human = target_page_name.replace('_', ' ')
+ _, anchor_name = the_link.page_slug.split('#')
# Convert dot-notation hex entities to proper characters
replacements = [(r'\.22', '"'), (r'\.27', "'"), (r'\.28', '('), (r'\.29', ')')]
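
A sketch of how the replacements above would be applied (the loop and the sample anchor are illustrative assumptions; MediaWiki encodes certain characters in section anchors as dot-notation hex, e.g. "(" becomes ".28"):

import re

anchor_name = 'Easter_egg_.28hidden_message.29'
replacements = [(r'\.22', '"'), (r'\.27', "'"), (r'\.28', '('), (r'\.29', ')')]
for pattern, replacement in replacements:
    anchor_name = re.sub(pattern, replacement, anchor_name)
# anchor_name is now 'Easter_egg_(hidden_message)'
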
@@ -79,14 +78,13 @@ def find_section(the_link, print_result)
# Tell user what we found
if found_section == False:
- possibly_print(the_link.page_name)
- pywikibot.stdout(' ERROR: Could not find section "{0}" on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, target_page_name_human))
+ possibly_print(the_link)
+ pywikibot.stdout(' ERROR: Could not find section "{0}" on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, the_link.page_name))
errors_issued = errors_issued + 1
elif print_result == True:
- pywikibot.stdout(' The section "{0}" was found on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, target_page_name_human))
+ pywikibot.stdout(' The section "{0}" was found on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, the_link.page_name))
-# For a link that redirected us to another page, extract the name of the target page from
-# the target page's source
+# For a link that redirected us to another page, extract the name of the target page from the target page's source
def find_canonical_link(the_link):
# Extract link from this markup which contains name of redirected-to page:
#
@@ -101,19 +99,17 @@ def find_canonical_link(the_link):
else:
canonical_name = canonical_name[:tag_end]
if len(canonical_name) > 100:
- # Certain things can cause the trim to fail; report error and avoid slamming the
- # output with massive page source from a failed trim
+ # Certain things can cause the trim to fail; report error and avoid slamming the output with massive page source from a failed trim
pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect to "{2}…" (string overflow).'.format(the_link.iw_prefix, the_link.page_slug, canonical_name[:100]))
errors_issued = errors_issued + 1
else:
- canonical_name = canonical_name.replace('_', ' ')
+ the_link.page_name = canonical_name.replace('_', ' ')
if '#' in the_link.page_slug:
- _, anchor_name = the_link.page_slug.split('#')
- pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}#{3}", which is a valid page. Checking for section on that page….'.format(the_link.iw_prefix, the_link.page_slug, canonical_name, anchor_name))
- the_link.page_slug = the_link.page_slug.replace(the_link.page_name, canonical_name) # update page slug so that find_section() uses the right page name in its messages
+ the_link.page_name_only, _ = the_link.page_slug.split('#')
+ pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}", which is a valid page. Checking for section on that page….'.format(the_link.iw_prefix, the_link.page_name_only, the_link.page_name))
find_section(the_link, True)
else:
- pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}", which is a valid page.'.format(the_link.iw_prefix, the_link.page_slug, canonical_name))
+ pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}", which is a valid page.'.format(the_link.iw_prefix, the_link.page_slug, the_link.page_name))
# Test an interwiki link and look for a section link if applicable
def test_interwiki_link(the_link):
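
The markup that find_canonical_link() trims by hand is elided above; assuming it is MediaWiki's standard <link rel="canonical"> tag, an equivalent extraction with BeautifulSoup would look roughly like this (a sketch under that assumption, not the script's own approach):

from bs4 import BeautifulSoup

def extract_canonical_name(page_source):
    # Hypothetical helper: pull the redirected-to page name out of the canonical link tag
    soup = BeautifulSoup(page_source, 'html.parser')
    canonical_tag = soup.find('link', rel='canonical')
    if canonical_tag is None:
        return None
    # e.g. "https://en.wikipedia.org/wiki/Marathon_(series)" -> "Marathon (series)"
    return canonical_tag['href'].rsplit('/', 1)[-1].replace('_', ' ')
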
@@ -122,11 +118,10 @@ def test_interwiki_link(the_link):
the_link.curl_response = fetch(the_link.full_url)
- # One way we tell that a redirect occurred is by checking fetch's history, as it
- # automatically follows redirects. This will catch formal redirects which come from pages
- # such as Special:PermanentLink.
+ # One way we tell that a redirect occurred is by checking fetch's history, as it automatically follows redirects. This will catch formal redirects which come from
+ # pages such as Special:PermanentLink.
if the_link.curl_response.history != []:
- possibly_print(the_link.page_name)
+ possibly_print(the_link)
# If linked page is in all caps, e.g. WP:BEANS, it's likely a deliberate use of a redirect
if the_link.page_slug.startswith('WP:') and the_link.page_slug == the_link.page_slug.upper():
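
A small illustration of the history check described in the comment above, using the Requests library directly (the URL is a hypothetical example of a title reached through a formal HTTP redirect):

import requests

response = requests.get('https://en.wikipedia.org/wiki/Some_redirecting_title')
if response.history:
    # history holds one Response object per redirect hop that Requests followed
    print(response.history[0].status_code, '->', response.url)
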
@@ -143,18 +138,17 @@ def test_interwiki_link(the_link):
pywikibot.stdout(' ERROR: Unrecognized type of redirection (code "{0}") for {1} link "{2}". You should check the link manually.'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
errors_issued = errors_issued + 1
elif the_link.curl_response.status_code != 200:
- possibly_print(the_link.page_name)
+ possibly_print(the_link)
pywikibot.stdout(' ERROR: Got response code {0} for {1} link "{2}". The page may not exist.'.format(the_link.curl_response.status_code, the_link.iw_prefix, the_link.page_slug))
errors_issued = errors_issued + 1
- # However the usual way that a redirect occurs is that MediaWiki redirects us sneakily
- # using JavaScript, while returning code OK 200 as if the link was correct; this happens
- # when a redirect page is accessed. We must detect these soft redirects by looking at the
- # page source to find the redirect note inserted at the top of the page for the reader.
+ # However, the usual way that a redirect occurs is that MediaWiki redirects us sneakily using JavaScript, while returning code 200 OK as if the link was correct; this
+ # happens when a redirect page is accessed. We must detect these soft redirects by looking at the page source to find the redirect note inserted at the top of the
+ # page for the reader.
elif 'Redirected from
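
A hedged sketch of the soft-redirect detection described in the comment above (the URL is hypothetical; the script itself searches the fetched page source for the same "Redirected from" note):

import requests

response = requests.get('https://en.wikipedia.org/wiki/Some_redirect_page')
if response.status_code == 200 and 'Redirected from' in response.text:
    # The server answered 200, so only the note MediaWiki renders for the reader
    # reveals that a redirect page was silently resolved to its target
    print('Soft redirect detected')
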