--- ValBot/Python/check_intrawiki_section_links.py 2022/02/21 23:59:20 1169
+++ ValBot/Python/check_intrawiki_section_links.py 2022/09/25 23:58:33 1176
@@ -1,3 +1,11 @@
+# Check Intrawiki Section Links
+# by iritscen@yahoo.com
+# Looks at each wikilink on a page (or in all the pages in a category) for a section link ('#'),
+# and loads the linked page and verifies that the named section actually exists. The output will
+# use the keywords ADVICE, WARNING or ERROR depending on the nature of issue that it encounters.
+# Recommended viewing width:
+# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- --|
+
 import os
 
 from urllib.parse import urljoin
@@ -23,14 +31,18 @@ interwiki_prefixes = ('acronym', 'cache'
 
 pages_checked = 0
 iw_found = 0
-problems_found = 0
+advice_issued = 0
+warnings_issued = 0
+errors_issued = 0
 page_name = ''
 
 # Searches the given page text for intrawiki links with section links in them
 def scan_for_iw_links(page_text):
     global pages_checked
     global iw_found
-    global problems_found
+    global advice_issued
+    global warnings_issued
+    global errors_issued
     global page_name
 
     pages_checked = pages_checked + 1
@@ -50,31 +62,50 @@ def scan_for_iw_links(page_text):
 
         # Sometimes we used a space char. instead of a '_', so fix that before querying
         link_text = link_text.replace(' ', '_')
-        #pywikibot.output('Found link {0}.'.format(link_text))
+        #pywikibot.stdout('Found link {0}.'.format(link_text))
 
         # If this link doesn't have a section link in it, then we don't care about it, as
         # MediaWiki takes care of checking basic intrawiki links
         if not '#' in link_text:
-            #pywikibot.output('Link doesn\'t have a section anchor in it. Skipping.')
+            #pywikibot.stdout('Link doesn\'t have a section anchor in it. Skipping.')
+            continue
+
+        # If this link has an interwiki prefix, it can be ignored
+        is_interwiki = False
+        if found_iw_match == False:
+            for prefix in interwiki_prefixes:
+                if prefix + ":" in link_text:
+                    #pywikibot.stdout('Skipping link {} because it is an interwiki link.'.format(link_text))
+                    is_interwiki = True
+                    break
+        if is_interwiki:
             continue
 
         # If there is a '{' in the link, then probably it's a link built on transcluded text
        # like "Quotes/Diary#{{C3}}", which we cannot expand and work with, so skip it
         if '{' in link_text:
-            pywikibot.output('ADVICE: Link {} seems to use transclusion, so it can\'t be verified automatically. You should check it manually.'.format(link_text))
+            pywikibot.stdout('ADVICE: Link {} seems to use transclusion, so it can\'t be verified automatically. You should check it manually.'.format(link_text))
+            advice_issued = advice_issued + 1
             continue
-
-        # If this is a relative "../" link, find the parent page and set ourselves to that
-        # page, then remove the relative portion of the link. Note that this is only performed
-        # once, so if there's multiple steps back ("../../"), we're out of luck.
+
+        # If this is a relative "/" link, use the current page as the basis for the URL. Note
+        # that only a leading slash is looked for, so if there's multiple steps down ("/x/y"),
+        # we're out of luck.
+        if link_text.startswith('/'):
+            link_text = page_name + link_text
+            #pywikibot.stdout('Changed link_text to {} on account of "/".'.format(link_text))
+
+        # If this is a relative "../" link, find the parent page and set ourselves to that page,
+        # then remove the relative portion of the link. Note that this is only performed once,
+        # so if there's multiple steps back ("../../"), we're out of luck.
         if link_text.startswith('../'):
             last_slash = page_name.rfind('/')
             page_name2 = page_name[0:last_slash]
-            #pywikibot.output('Changed page_name to {} on account of "../".'.format(page_name2))
+            #pywikibot.stdout('Changed page_name to {} on account of "../".'.format(page_name2))
             link_text = link_text[3:len(link_text)]
-            #pywikibot.output('Changed link_text to {} on account of "../".'.format(link_text))
-            # If this is now going to be a bare section link for the parent page, don't add
-            # a slash, otherwise do because we are drilling down to another subpage
+            #pywikibot.stdout('Changed link_text to {} on account of "../".'.format(link_text))
+            # If this is now going to be a bare section link for the parent page, don't add a
+            # slash, otherwise do because we are drilling down to another subpage
             if link_text.startswith('#'):
                 link_text = page_name2 + link_text
             else:
@@ -84,66 +115,53 @@ def scan_for_iw_links(page_text):
         if link_text.startswith('#'):
             iw_url = onigalore_url + page_name2
             iw_found = iw_found + 1
-            #pywikibot.output('Found link to this very page, {}.'.format(link_text))
+            #pywikibot.stdout('Found link to this very page, {}.'.format(link_text))
             found_iw_match = True
             link_text = page_name2 + link_text
 
         # If there's no ":" in the link (before the section link, where a colon would just be
         # part of the text) then it's a Main namespace article, so construct URL
-        #if not ':' in link_text:
         if found_iw_match == False:
             if not re.search(":.*#", link_text):
                 iw_url = onigalore_url + link_text
                 iw_found = iw_found + 1
-                #pywikibot.output('Found link to OniGalore Main namespace page {}.'.format(link_text))
+                #pywikibot.stdout('Found link to OniGalore Main namespace page {}.'.format(link_text))
                 found_iw_match = True
 
         # If there is a ":", match the prefix against the intrawiki prefixes on OniGalore
         if found_iw_match == False:
             for prefix in intrawiki_prefixes:
-                #pywikibot.output('Comparing link against prefix {}.'.format(prefix))
+                #pywikibot.stdout('Comparing link against prefix {}.'.format(prefix))
                 if prefix + ":" in link_text:
                     iw_url = onigalore_url + link_text
                     _, post_ns = link_text.split(':', 1)
-                    #pywikibot.output('Found link to OniGalore {0} namespace page {1}.'.format(prefix, post_ns))
+                    #pywikibot.stdout('Found link to OniGalore {0} namespace page {1}.'.format(prefix, post_ns))
                     iw_found = iw_found + 1
                     found_iw_match = True
                     break
 
-        # If we didn't match the prefix against any intrawiki prefixes, see if it matches
-        # against an interwiki prefix; if so, this link can be ignored
-        is_interwiki = False
-        if found_iw_match == False:
-            for prefix in interwiki_prefixes:
-                if prefix + ":" in link_text:
-                    #pywikibot.output('Skipping link {} because it is an interwiki link.'.format(link_text))
-                    is_interwiki = True
-                    break
-        if is_interwiki:
-            continue
-
         # If we still haven't turned this match into a URL, something's gone wrong
         if (found_iw_match == False) or (iw_url == ""):
-            pywikibot.output('ERROR: Couldn\'t figure out link {}. Aborting script.'.format(link_text))
-            quit()
+            pywikibot.stdout('ERROR: Couldn\'t figure out link {}.'.format(link_text))
+            continue
 
         # Test the URL
         iw_url = iw_url.replace(' ', '_')
-        #pywikibot.output('Reading page at {}...'.format(iw_url))
+        #pywikibot.stdout('Reading page at {}...'.format(iw_url))
         response = fetch(iw_url)
 
-        # Redirects are followed automatically by fetch() and treated as "200"s, so the
-        # way we tell that a redirect occurred is by checking the history
+        # Redirects are followed automatically by fetch() and treated as "200"s; the way we can
+        # tell that a redirect occurred is by checking fetch's history
         if response.history != []:
-            pywikibot.output('WARNING: Redirected from {}.'.format(response.history))
-            problems_found = problems_found + 1
+            pywikibot.stdout('WARNING: Got redirection code ({0}) on URL "{1}".'.format(response.history[0], iw_url))
+            warnings_issued = warnings_issued + 1
         elif response.status_code != 200:
-            #pywikibot.output('WARNING: Got response code {}.'.format(response.status_code)) # commented out because fetch() already prints such a msg
-            problems_found = problems_found + 1
+            pywikibot.stdout('WARNING: Got response code {0} on URL {1}.'.format(response.status_code, iw_url))
+            warnings_issued = warnings_issued + 1
         else:
             # Isolate section link
             pre_section, section_name = link_text.split('#', 1)
-            #pywikibot.output('Searching for section link {} on page.'.format(section_name))
+            #pywikibot.stdout('Searching for section link {} on page.'.format(section_name))
 
             # Convert slash character to the dot-notation hex encoding that MediaWiki uses
             section_name = section_name.replace('/', '.2F')
@@ -154,12 +172,12 @@ def scan_for_iw_links(page_text):
             for span_tag in soup.findAll('span'):
                 span_name = span_tag.get('id', None)
                 if span_name == section_name:
-                    #pywikibot.output('Found section!')
+                    #pywikibot.stdout('Found section!')
                     found_section = True
                     break
             if found_section == False:
-                pywikibot.output('ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section))
-                problems_found = problems_found + 1
+                pywikibot.stdout('ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section))
+                errors_issued = errors_issued + 1
 
 def main(*args):
     cat_name = ''
@@ -193,8 +211,37 @@ def main(*args):
 
     global pages_checked
     global iw_found
-    global problems_found
-    pywikibot.stdout('Checked {0} page(s) and found {1} intrawiki link(s) with {2} section link problem(s).'.format(pages_checked, iw_found, problems_found))
+    global advice_issued
+    global warnings_issued
+    global errors_issued
+
+    page_str = "pages"
+    if pages_checked == 1:
+        page_str = "page"
+
+    link_str = "links"
+    if iw_found == 1:
+        link_str = "link"
+
+    pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
+    pywikibot.stdout('While attempting to follow section links...')
+
+    if advice_issued == 0:
+        pywikibot.stdout(' No advice on potential problems was issued.')
+    elif advice_issued == 1:
+        pywikibot.stdout(' 1 piece of advice on a potential problem was issued.')
+    else:
+        pywikibot.stdout(' {} pieces of advice on potential problems were issued.'.format(advice_issued))
+
+    warning_str = "warnings were"
+    if warnings_issued == 1:
+        warning_str = "warning was"
+    pywikibot.stdout(' {0} {1} issued.'.format(warnings_issued, warning_str))
+
+    error_str = "errors were"
+    if errors_issued == 1:
+        error_str = "error was"
+    pywikibot.stdout(' {0} {1} encountered.'.format(errors_issued, error_str))
 
 if __name__ == '__main__':
     main()
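
For reference, below is a minimal, standalone sketch of the section-verification step that the revised script performs for each URL it constructs. It assumes the third-party requests and beautifulsoup4 packages in place of pywikibot's fetch(); the function name and the example URL are illustrative only and are not part of the script above. As in the script, the anchor is rewritten the way MediaWiki encodes heading IDs (spaces become underscores, '/' becomes the dot-encoded '.2F') before the page's <span> IDs are searched.

# Standalone sketch; section_exists() is a hypothetical helper, not part of ValBot
import requests
from bs4 import BeautifulSoup

def section_exists(page_url, section_name):
    # Normalize the anchor the same way the script does before searching the page:
    # underscores instead of spaces, dot-encoded slashes
    anchor = section_name.replace(' ', '_').replace('/', '.2F')

    response = requests.get(page_url)
    if response.status_code != 200:
        return False

    # MediaWiki renders each section heading with an element carrying the anchor as
    # its id; like the script, this sketch looks for a matching <span> id
    soup = BeautifulSoup(response.text, 'html.parser')
    return any(span.get('id') == anchor for span in soup.find_all('span'))

# Illustrative usage with a placeholder URL:
# print(section_exists('https://example.org/wiki/Some_page', 'Some section/subsection'))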