--- ValBot/Python/check_intrawiki_section_links.py 2022/02/21 23:59:20 1169
+++ ValBot/Python/check_intrawiki_section_links.py 2022/09/25 23:58:33 1176
@@ -1,3 +1,11 @@
+# Check Intrawiki Section Links
+# by iritscen@yahoo.com
+# Looks at each wikilink on a page (or in all the pages in a category) for a section link ('#'),
+# and loads the linked page and verifies that the named section actually exists. The output will
+# use the keywords ADVICE, WARNING or ERROR depending on the nature of issue that it encounters.
+# Recommended viewing width:
+# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- --|
+
 import os
 
 from urllib.parse import urljoin
@@ -23,14 +31,18 @@ interwiki_prefixes = ('acronym', 'cache'
 
 pages_checked = 0
 iw_found = 0
-problems_found = 0
+advice_issued = 0
+warnings_issued = 0
+errors_issued = 0
 page_name = ''
 
 # Searches the given page text for intrawiki links with section links in them
 def scan_for_iw_links(page_text):
     global pages_checked
     global iw_found
-    global problems_found
+    global advice_issued
+    global warnings_issued
+    global errors_issued
     global page_name
 
     pages_checked = pages_checked + 1
@@ -50,31 +62,50 @@ def scan_for_iw_links(page_text):
 
         # Sometimes we used a space char. instead of a '_', so fix that before querying
         link_text = link_text.replace(' ', '_')
-        #pywikibot.output('Found link {0}.'.format(link_text))
+        #pywikibot.stdout('Found link {0}.'.format(link_text))
 
         # If this link doesn't have a section link in it, then we don't care about it, as
         # MediaWiki takes care of checking basic intrawiki links
         if not '#' in link_text:
-            #pywikibot.output('Link doesn\'t have a section anchor in it. Skipping.')
+            #pywikibot.stdout('Link doesn\'t have a section anchor in it. Skipping.')
+            continue
+
+        # If this link has an interwiki prefix, it can be ignored
+        is_interwiki = False
+        if found_iw_match == False:
+            for prefix in interwiki_prefixes:
+                if prefix + ":" in link_text:
+                    #pywikibot.stdout('Skipping link {} because it is an interwiki link.'.format(link_text))
+                    is_interwiki = True
+                    break
+        if is_interwiki:
             continue
 
         # If there is a '{' in the link, then probably it's a link built on transcluded text
        # like "Quotes/Diary#{{C3}}", which we cannot expand and work with, so skip it
         if '{' in link_text:
-            pywikibot.output('ADVICE: Link {} seems to use transclusion, so it can\'t be verified automatically. You should check it manually.'.format(link_text))
+            pywikibot.stdout('ADVICE: Link {} seems to use transclusion, so it can\'t be verified automatically. You should check it manually.'.format(link_text))
+            advice_issued = advice_issued + 1
             continue
-
-        # If this is a relative "../" link, find the parent page and set ourselves to that
-        # page, then remove the relative portion of the link. Note that this is only performed
-        # once, so if there's multiple steps back ("../../"), we're out of luck.
+
+        # If this is a relative "/" link, use the current page as the basis for the URL. Note
+        # that only a leading slash is looked for, so if there's multiple steps down ("/x/y"),
+        # we're out of luck.
+        if link_text.startswith('/'):
+            link_text = page_name + link_text
+            #pywikibot.stdout('Changed link_text to {} on account of "/".'.format(link_text))
+
+        # If this is a relative "../" link, find the parent page and set ourselves to that page,
+        # then remove the relative portion of the link. Note that this is only performed once,
+        # so if there's multiple steps back ("../../"), we're out of luck.
         if link_text.startswith('../'):
             last_slash = page_name.rfind('/')
             page_name2 = page_name[0:last_slash]
-            #pywikibot.output('Changed page_name to {} on account of "../".'.format(page_name2))
+            #pywikibot.stdout('Changed page_name to {} on account of "../".'.format(page_name2))
             link_text = link_text[3:len(link_text)]
-            #pywikibot.output('Changed link_text to {} on account of "../".'.format(link_text))
-            # If this is now going to be a bare section link for the parent page, don't add
-            # a slash, otherwise do because we are drilling down to another subpage
+            #pywikibot.stdout('Changed link_text to {} on account of "../".'.format(link_text))
+            # If this is now going to be a bare section link for the parent page, don't add a
+            # slash, otherwise do because we are drilling down to another subpage
             if link_text.startswith('#'):
                 link_text = page_name2 + link_text
             else:
@@ -84,66 +115,53 @@ def scan_for_iw_links(page_text):
         if link_text.startswith('#'):
             iw_url = onigalore_url + page_name2
             iw_found = iw_found + 1
-            #pywikibot.output('Found link to this very page, {}.'.format(link_text))
+            #pywikibot.stdout('Found link to this very page, {}.'.format(link_text))
             found_iw_match = True
             link_text = page_name2 + link_text
 
         # If there's no ":" in the link (before the section link, where a colon would just be
         # part of the text) then it's a Main namespace article, so construct URL
-        #if not ':' in link_text:
         if found_iw_match == False:
             if not re.search(":.*#", link_text):
                 iw_url = onigalore_url + link_text
                 iw_found = iw_found + 1
-                #pywikibot.output('Found link to OniGalore Main namespace page {}.'.format(link_text))
+                #pywikibot.stdout('Found link to OniGalore Main namespace page {}.'.format(link_text))
                 found_iw_match = True
 
         # If there is a ":", match the prefix against the intrawiki prefixes on OniGalore
         if found_iw_match == False:
             for prefix in intrawiki_prefixes:
-                #pywikibot.output('Comparing link against prefix {}.'.format(prefix))
+                #pywikibot.stdout('Comparing link against prefix {}.'.format(prefix))
                 if prefix + ":" in link_text:
                     iw_url = onigalore_url + link_text
                     _, post_ns = link_text.split(':', 1)
-                    #pywikibot.output('Found link to OniGalore {0} namespace page {1}.'.format(prefix, post_ns))
+                    #pywikibot.stdout('Found link to OniGalore {0} namespace page {1}.'.format(prefix, post_ns))
                     iw_found = iw_found + 1
                     found_iw_match = True
                     break
 
-        # If we didn't match the prefix against any intrawiki prefixes, see if it matches
-        # against an interwiki prefix; if so, this link can be ignored
-        is_interwiki = False
-        if found_iw_match == False:
-            for prefix in interwiki_prefixes:
-                if prefix + ":" in link_text:
-                    #pywikibot.output('Skipping link {} because it is an interwiki link.'.format(link_text))
-                    is_interwiki = True
-                    break
-        if is_interwiki:
-            continue
-
         # If we still haven't turned this match into a URL, something's gone wrong
         if (found_iw_match == False) or (iw_url == ""):
-            pywikibot.output('ERROR: Couldn\'t figure out link {}. Aborting script.'.format(link_text))
-            quit()
+            pywikibot.stdout('ERROR: Couldn\'t figure out link {}.'.format(link_text))
+            continue
 
         # Test the URL
         iw_url = iw_url.replace(' ', '_')
-        #pywikibot.output('Reading page at {}...'.format(iw_url))
+        #pywikibot.stdout('Reading page at {}...'.format(iw_url))
         response = fetch(iw_url)
 
-        # Redirects are followed automatically by fetch() and treated as "200"s, so the
-        # way we tell that a redirect occurred is by checking the history
+        # Redirects are followed automatically by fetch() and treated as "200"s; the way we can
+        # tell that a redirect occurred is by checking fetch's history
         if response.history != []:
-            pywikibot.output('WARNING: Redirected from {}.'.format(response.history))
-            problems_found = problems_found + 1
+            pywikibot.stdout('WARNING: Got redirection code ({0}) on URL "{1}".'.format(response.history[0], iw_url))
+            warnings_issued = warnings_issued + 1
         elif response.status_code != 200:
-            #pywikibot.output('WARNING: Got response code {}.'.format(response.status_code)) # commented out because fetch() already prints such a msg
-            problems_found = problems_found + 1
+            pywikibot.stdout('WARNING: Got response code {0} on URL {1}.'.format(response.status_code, iw_url))
+            warnings_issued = warnings_issued + 1
         else:
             # Isolate section link
             pre_section, section_name = link_text.split('#', 1)
-            #pywikibot.output('Searching for section link {} on page.'.format(section_name))
+            #pywikibot.stdout('Searching for section link {} on page.'.format(section_name))
 
             # Convert slash character to the dot-notation hex encoding that MediaWiki uses
             section_name = section_name.replace('/', '.2F')
@@ -154,12 +172,12 @@ def scan_for_iw_links(page_text):
             for span_tag in soup.findAll('span'):
                 span_name = span_tag.get('id', None)
                 if span_name == section_name:
-                    #pywikibot.output('Found section!')
+                    #pywikibot.stdout('Found section!')
                     found_section = True
                     break
             if found_section == False:
-                pywikibot.output('ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section))
-                problems_found = problems_found + 1
+                pywikibot.stdout('ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section))
+                errors_issued = errors_issued + 1
 
 def main(*args):
     cat_name = ''
@@ -193,8 +211,37 @@ def main(*args):
 
     global pages_checked
     global iw_found
-    global problems_found
-    pywikibot.stdout('Checked {0} page(s) and found {1} intrawiki link(s) with {2} section link problem(s).'.format(pages_checked, iw_found, problems_found))
+    global advice_issued
+    global warnings_issued
+    global errors_issued
+
+    page_str = "pages"
+    if pages_checked == 1:
+        page_str = "page"
+
+    link_str = "links"
+    if iw_found == 1:
+        link_str = "link"
+
+    pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
+    pywikibot.stdout('While attempting to follow section links...')
+
+    if advice_issued == 0:
+        pywikibot.stdout(' No advice on potential problems was issued.')
+    elif advice_issued == 1:
+        pywikibot.stdout(' 1 piece of advice on a potential problem was issued.')
+    else:
+        pywikibot.stdout(' {} pieces of advice on potential problems were issued.'.format(advice_issued))
+
+    warning_str = "warnings were"
+    if warnings_issued == 1:
+        warning_str = "warning was"
+    pywikibot.stdout(' {0} {1} issued.'.format(warnings_issued, warning_str))
+
+    error_str = "errors were"
+    if errors_issued == 1:
+        error_str = "error was"
+    pywikibot.stdout(' {0} {1} encountered.'.format(errors_issued, error_str))
 
 if __name__ == '__main__':
     main()
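
For reference, below is a minimal, standalone sketch of the section-verification step that the revised script performs for each URL it constructs. It assumes the third-party requests and beautifulsoup4 packages in place of pywikibot's fetch(); the function name and the example URL are illustrative only and are not part of the script above. As in the script, the anchor is rewritten the way MediaWiki encodes heading IDs (spaces become underscores, '/' becomes the dot-encoded '.2F') before the page's <span> IDs are searched.

# Standalone sketch; section_exists() is a hypothetical helper, not part of ValBot
import requests
from bs4 import BeautifulSoup

def section_exists(page_url, section_name):
    # Normalize the anchor the same way the script does before searching the page:
    # underscores instead of spaces, dot-encoded slashes
    anchor = section_name.replace(' ', '_').replace('/', '.2F')

    response = requests.get(page_url)
    if response.status_code != 200:
        return False

    # MediaWiki renders each section heading with an element carrying the anchor as
    # its id; like the script, this sketch looks for a matching <span> id
    soup = BeautifulSoup(response.text, 'html.parser')
    return any(span.get('id') == anchor for span in soup.find_all('span'))

# Illustrative usage with a placeholder URL:
# print(section_exists('https://example.org/wiki/Some_page', 'Some section/subsection'))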