--- ValBot/Python/check_interwiki_links.py	2022/03/21 21:22:33	1170
+++ ValBot/Python/check_interwiki_links.py	2022/06/28 22:11:41	1174
@@ -13,7 +13,7 @@
 from urllib.parse import urljoin
 import pywikibot
 import re
-import requests
+import requests # fetch() returns a requests.Response
 from pywikibot.bot import QuitKeyboardInterrupt
 from pywikibot import pagegenerators
 
@@ -50,10 +50,13 @@ def scan_for_iw_links(page_text):
 
       # Sometimes we used a space char. instead of a '_', so fix that before querying
       page_title = page_text[s:e].replace(' ', '_')
+      # Use only spaces for title when printing it
+      page_title_human = page_title.replace('_', ' ')
+      pywikibot.stdout('   Validating {0} link "{1}"'.format(prefix, page_title_human))
+      iw_found = iw_found + 1
+
       # Construct full URL for the particular wiki
       iw_url = interwiki_urls[cur] + page_title
-      pywikibot.stdout('   Validating {0} link "{1}"'.format(prefix, page_title))
-      iw_found = iw_found + 1
 
       # Adjust URL if this is a foreign-language WP link
       if re.match("^[a-zA-Z]{2}:", page_title):
@@ -64,17 +67,37 @@ def scan_for_iw_links(page_text):
          iw_url = iw_url.replace(page_title[0:3], '')
 
       # Test the URL
-      #pywikibot.stdout('   Testing URL "{}"'.format(iw_url))
       response = fetch(iw_url)
 
-      # Redirects are followed automatically by fetch() and treated as "200"s, so the
-      # way we tell that a redirect occurred is by checking the history
+      # One way we tell that a redirect occurred is by checking the history
       if response.history != []:
          pywikibot.stdout('   ERROR: Got redirection code ({0}) on URL "{1}".'.format(response.history[0], iw_url))
          errors_issued = errors_issued + 1
       elif response.status_code != 200:
          pywikibot.stdout('   ERROR: Got response code {0} on URL "{1}".'.format(response.status_code, iw_url))
          errors_issued = errors_issued + 1
+
+      # The usual way that a redirect occurs is that MediaWiki redirects us sneakily
+      # using JavaScript, while returning code OK 200 as if the link was correct; we
+      # must detect this from the page source
+      elif 'Redirected from <a' in response.text:
+         # Extract the page name from this line in the source: <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/>
+         canonical_name = response.text.split('<link rel="canonical" href="')[-1]
+         prefix_length = len(interwiki_urls[cur])
+         canonical_name = canonical_name[prefix_length:]
+         tag_end = canonical_name.find('"/>')
+         if tag_end == -1:
+            pywikibot.stdout('   ERROR: This is a redirect page (but I could not isolate the correct page name).')
+         else:
+            canonical_name = canonical_name[:tag_end]
+            if len(canonical_name) > 100:
+               # Certain things can cause the trim to fail; here we avoid slamming
+               # the output with massive page source from a failed trim
+               pywikibot.stdout('   ERROR: This is a redirect to "{}" (string trimmed to 100 chars due to excessive length).'.format(canonical_name[:100]))
+            else:
+               canonical_name = canonical_name.replace('_', ' ')
+               pywikibot.stdout('   ERROR: This is a redirect to "{}".'.format(canonical_name))
+         errors_issued = errors_issued + 1
       elif '#' in page_title:
          # Isolate section link
         page_name, anchor_name = page_title.split('#')
@@ -91,7 +114,6 @@ def scan_for_iw_links(page_text):
          for span_tag in soup.findAll('span'):
             span_name = span_tag.get('id', None)
             if span_name == anchor_name:
-               #pywikibot.stdout('Found section!')
                found_section = True
               break
          if found_section == False:
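
Editor's note: the silent-redirect branch added in the middle hunk is the heart of this change. Below is a minimal standalone sketch of the same technique, for illustration only. It assumes plain requests.get() in place of pywikibot's fetch(), an English-Wikipedia URL prefix, and a hypothetical helper name check_silent_redirect(); it also searches for the closing quote of the canonical href rather than the literal '"/>' so it tolerates either tag style.

import requests

def check_silent_redirect(url, wiki_prefix):
   # Return the canonical page title if `url` is a silent redirect, else None.
   # MediaWiki serves the redirect target with HTTP 200 and only notes the
   # redirect in the page source, e.g.: (Redirected from <a href="...">...</a>)
   response = requests.get(url)
   response.raise_for_status()
   if 'Redirected from <a' not in response.text:
      return None
   # The target is named in: <link rel="canonical" href=".../Page_name"...>
   canonical = response.text.split('<link rel="canonical" href="')[-1]
   quote_end = canonical.find('"')
   if quote_end == -1:
      return None # could not isolate the page name
   canonical = canonical[:quote_end]
   # Strip the wiki's URL prefix to get a bare page title
   if canonical.startswith(wiki_prefix):
      canonical = canonical[len(wiki_prefix):]
   return canonical.replace('_', ' ')

# Example: "UK" is a silent redirect to "United Kingdom" on English Wikipedia
prefix = 'https://en.wikipedia.org/wiki/'
target = check_silent_redirect(prefix + 'UK', prefix)
if target is not None:
   print('Silent redirect to "{}"'.format(target))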
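
Editor's note: the last hunk only deletes a debug print, but its context lines show the script's section-anchor check via the `soup` variable, which suggests BeautifulSoup. Here is a hedged sketch of that technique; the function name section_exists() and the use of requests.get() in place of fetch() are assumptions, not the script's actual API.

import requests
from bs4 import BeautifulSoup

def section_exists(page_url, anchor_name):
   # MediaWiki (as of this era) wraps each section heading in a tag like
   # <span class="mw-headline" id="Anchor_name">, so a "Page#Anchor" link is
   # valid when some span's id matches the anchor (ids are case-sensitive)
   soup = BeautifulSoup(requests.get(page_url).text, 'html.parser')
   for span_tag in soup.findAll('span'):
      if span_tag.get('id', None) == anchor_name:
         return True
   return False

# Usage: check a section link before reporting it as broken
print(section_exists('https://en.wikipedia.org/wiki/Oni_(video_game)', 'Gameplay'))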