--- ValBot/Python/check_interwiki_links.py 2022/03/21 21:22:33 1170
+++ ValBot/Python/check_interwiki_links.py 2022/06/28 22:11:41 1174
@@ -13,7 +13,7 @@ from urllib.parse import urljoin
import pywikibot
import re
-import requests
+import requests # for inspecting the responses returned by fetch()
from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot import pagegenerators
@@ -50,10 +50,13 @@ def scan_for_iw_links(page_text):
# Sometimes we used a space char. instead of a '_', so fix that before querying
page_title = page_text[s:e].replace(' ', '_')
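+            # (e.g. "Easter egg" becomes "Easter_egg")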
+            # Use spaces instead of underscores when printing the title
+ page_title_human = page_title.replace('_', ' ')
+ pywikibot.stdout(' Validating {0} link "{1}"'.format(prefix, page_title_human))
+ iw_found = iw_found + 1
+
# Construct full URL for the particular wiki
iw_url = interwiki_urls[cur] + page_title
- pywikibot.stdout(' Validating {0} link "{1}"'.format(prefix, page_title))
- iw_found = iw_found + 1
# Adjust URL if this is a foreign-language WP link
if re.match("^[a-zA-Z]{2}:", page_title):
@@ -64,17 +67,37 @@ def scan_for_iw_links(page_text):
iw_url = iw_url.replace(page_title[0:3], '')
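+               # (strips the two-letter language prefix, e.g. "fr:", from the URL path)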
# Test the URL
- #pywikibot.stdout(' Testing URL "{}"'.format(iw_url))
response = fetch(iw_url)
- # Redirects are followed automatically by fetch() and treated as "200"s, so the
- # way we tell that a redirect occurred is by checking the history
+ # One way we tell that a redirect occurred is by checking the history
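+            # (fetch() follows redirects automatically, and requests records each 3xx
+            # hop it followed in response.history)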
if response.history != []:
pywikibot.stdout(' ERROR: Got redirection code ({0}) on URL "{1}".'.format(response.history[0], iw_url))
errors_issued = errors_issued + 1
elif response.status_code != 200:
pywikibot.stdout(' ERROR: Got response code {0} on URL "{1}".'.format(response.status_code, iw_url))
errors_issued = errors_issued + 1
+            # The usual way that a redirect occurs is that MediaWiki silently serves the
+            # target page's content while returning 200 OK as if the link was correct,
+            # so we must detect the redirect from the page source
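+            # (the served page includes a "(Redirected from ...)" notice in its HTML)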
+            elif 'Redirected from <a' in response.text:
+               # Isolate the destination page name from the <link rel="canonical"> tag
+               canonical_name = response.text.split('<link rel="canonical" href="' + interwiki_urls[cur])[-1]
+               tag_end = canonical_name.find('"/>')
+ if tag_end == -1:
+ pywikibot.stdout(' ERROR: This is a redirect page (but I could not isolate the correct page name).')
+ else:
+ canonical_name = canonical_name[:tag_end]
+ if len(canonical_name) > 100:
+ # Certain things can cause the trim to fail; here we avoid slamming
+ # the output with massive page source from a failed trim
+ pywikibot.stdout(' ERROR: This is a redirect to "{}" (string trimmed to 100 chars due to excessive length).'.format(canonical_name[:100]))
+ else:
+ canonical_name = canonical_name.replace('_', ' ')
+ pywikibot.stdout(' ERROR: This is a redirect to "{}".'.format(canonical_name))
+ errors_issued = errors_issued + 1
elif '#' in page_title:
# Isolate section link
page_name, anchor_name = page_title.split('#')
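+            # e.g. "Foo#Bar" yields page_name "Foo" and anchor_name "Bar"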
@@ -91,7 +114,6 @@ def scan_for_iw_links(page_text):
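+               # MediaWiki gives each section heading a <span> whose id is the anchor
+               # name, so look for a span with an id matching the requested anchor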
for span_tag in soup.findAll('span'):
span_name = span_tag.get('id', None)
if span_name == anchor_name:
- #pywikibot.stdout('Found section!')
found_section = True
break
if found_section == False: