--- ValBot/Python/check_interwiki_links.py 2022/02/21 23:59:20 1169 +++ ValBot/Python/check_interwiki_links.py 2022/03/21 21:22:33 1170 @@ -1,9 +1,19 @@ +# Check Interwiki Links +# by iritscen@yahoo.com +# Looks at each link on a page (or in all the pages in a category) which uses a registered +# interwiki prefix and loads the linked page, verifying that it exists and that any section +# link, if present, is valid as well. The output will use the word "ERROR" when it cannot +# validate the interwiki link. +# Recommended viewing width: +# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---| + import os from urllib.parse import urljoin import pywikibot import re +import requests from pywikibot.bot import QuitKeyboardInterrupt from pywikibot import pagegenerators @@ -19,13 +29,13 @@ interwiki_urls = ('http://www.acronymfin pages_checked = 0 iw_found = 0 -problems_found = 0 +errors_issued = 0 # Searches the given page text for interwiki links def scan_for_iw_links(page_text): global pages_checked global iw_found - global problems_found + global errors_issued pages_checked = pages_checked + 1 cur = 0 @@ -42,7 +52,7 @@ def scan_for_iw_links(page_text): # Construct full URL for the particular wiki iw_url = interwiki_urls[cur] + page_title - pywikibot.output('Found {0} link {1}.'.format(prefix, page_title)) + pywikibot.stdout(' Validating {0} link "{1}"'.format(prefix, page_title)) iw_found = iw_found + 1 # Adjust URL if this is a foreign-language WP link @@ -54,20 +64,19 @@ def scan_for_iw_links(page_text): iw_url = iw_url.replace(page_title[0:3], '') # Test the URL - #pywikibot.output('Testing URL {}...'.format(iw_url)) + #pywikibot.stdout(' Testing URL "{}"'.format(iw_url)) response = fetch(iw_url) # Redirects are followed automatically by fetch() and treated as "200"s, so the # way we tell that a redirect occurred is by checking the history if response.history != []: - pywikibot.output('WARNING: Redirected from {}.'.format(response.history)) - problems_found = problems_found + 1 + pywikibot.stdout(' ERROR: Got redirection code ({0}) on URL "{1}".'.format(response.history[0], iw_url)) + errors_issued = errors_issued + 1 elif response.status_code != 200: - #pywikibot.output('WARNING: Got response code {}.'.format(response.status_code)) # commented out because fetch() already prints such a msg - problems_found = problems_found + 1 + pywikibot.stdout(' ERROR: Got response code {0} on URL "{1}".'.format(response.status_code, iw_url)) + errors_issued = errors_issued + 1 elif '#' in page_title: # Isolate section link - pywikibot.output('Detected section link on page {0}.'.format(page_title)) page_name, anchor_name = page_title.split('#') # Convert dot-notation hex entities to proper characters @@ -79,22 +88,15 @@ def scan_for_iw_links(page_text): # Read linked page to see if it really has this anchor link soup = BeautifulSoup(response.text, 'html.parser') found_section = False - for tag in soup.findAll('a'): - link = tag.get('href', None) - if not link: - #pywikibot.output('It is not a link.') - continue - #pywikibot.output('Got link {0}.'.format(link)) - if not link.startswith('#'): - continue - - if link == '#' + anchor_name: - pywikibot.output('Found section link!') + for span_tag in soup.findAll('span'): + span_name = span_tag.get('id', None) + if span_name == anchor_name: + #pywikibot.stdout('Found section!') found_section = True break if found_section == False: - pywikibot.output('Could not find section {0} on page {1}.'.format(anchor_name, page_name)) - problems_found = problems_found + 1 + pywikibot.stdout(' ERROR: Could not find section {0} on page {1}.'.format(anchor_name, page_name)) + errors_issued = errors_issued + 1 cur = cur + 1 def main(*args): @@ -112,24 +114,39 @@ def main(*args): site = pywikibot.Site() - # This line of code enumerates the methods in the 'page' class - #pywikibot.stdout(format(dir(page))) + #pywikibot.stdout('The members of the requests.models.Response class are:') + #pywikibot.stdout(format(dir(requests.models.Response))) if cat_name != '': cat_obj = pywikibot.Category(site, cat_name) generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True) for page in pagegenerators.PreloadingGenerator(generator, 100): - pywikibot.stdout('Checking page {0}'.format(page.title())) + pywikibot.stdout('Checking page "{}"'.format(page.title())) scan_for_iw_links(page.text) elif page_name != '': page = pywikibot.Page(site, page_name) - pywikibot.stdout('Checking page {0}'.format(page.title())) + pywikibot.stdout('Checking page "{}"'.format(page.title())) scan_for_iw_links(page.text) global pages_checked global iw_found - global problems_found - pywikibot.stdout('Checked {0} page(s) and found {1} interwiki link(s) with {2} problem(s).'.format(pages_checked, iw_found, problems_found)) + global errors_issued + + page_str = "pages" + if pages_checked == 1: + page_str = "page" + + link_str = "links" + if iw_found == 1: + link_str = "link" + + pywikibot.stdout('Checked {0} {1} and found {2} interwiki {3}.'.format(pages_checked, page_str, iw_found, link_str)) + + error_str = "errors were" + if errors_issued == 1: + error_str = "error was" + + pywikibot.stdout('{0} {1} encountered in validating these links.'.format(errors_issued, error_str)) if __name__ == '__main__': main()