ValBot/Python/check_interwiki_links.py

# Check Interwiki Links
# by iritscen@yahoo.com
# Looks at each link on a page (or all the pages in a category) which uses a registered
# interwiki prefix and loads the linked page, verifying that it exists and that any section
# link, if present, is valid as well. The output will use the word "ERROR" when it cannot
# validate the interwiki link.
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|

import bs4
import pywikibot
import re
import requests # for listing members with dir() when debugging

from bs4 import BeautifulSoup
from pywikibot import pagegenerators
from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot
from pywikibot.tools.formatter import color_format
from urllib.parse import urljoin

class IWLink:
   """Bundle of everything the script tracks about a single interwiki link.

   Attributes (all stored exactly as passed in):
      iw_prefix      interwiki prefix used in the link, e.g. "wp"
      prefix_url     base URL registered for that prefix, e.g. "https://en.wikipedia.org/wiki/"
      full_url       URL to request, e.g. "https://en.wikipedia.org/wiki/Easter_egg"
      page_name      title of the page being scanned; possibly_print() prints it in its
                     'From page' heading
      page_slug      link target with underscores in place of spaces, e.g. "Easter_egg"
      curl_response  response object assigned by test_interwiki_link() from fetch(); the
                     scanner passes "" as a placeholder until then
   """

   def __init__(
      self, iw_prefix, prefix_url, full_url, page_name, page_slug, curl_response
   ):
      # Which wiki the link points at
      self.iw_prefix = iw_prefix
      self.prefix_url = prefix_url

      # What is being linked to, and from where
      self.page_slug = page_slug
      self.full_url = full_url
      self.page_name = page_name

      # Result of requesting full_url
      self.curl_response = curl_response

# Interwiki prefixes and the base URL each one expands to, based on
# https://wiki.oni2.net/Special:Interwiki
_interwiki_table = (
   ('acronym', 'http://www.acronymfinder.com/~/search/af.aspx?string=exact&Acronym='),
   ('cache', 'http://www.google.com/search?q=cache:'),
   ('commons', 'https://commons.wikimedia.org/wiki/'),
   ('dictionary', 'http://www.dict.org/bin/Dict?Database=*&Form=Dict1&Strategy=*&Query='),
   ('google', 'http://www.google.com/search?q='),
   ('metawikimedia', 'https://meta.wikimedia.org/wiki/'),
   ('mw', 'https://www.mediawiki.org/wiki/'),
   ('wikibooks', 'https://en.wikibooks.org/wiki/'),
   ('wikidata', 'https://www.wikidata.org/wiki/'),
   ('wikimedia', 'https://foundation.wikimedia.org/wiki/'),
   ('wikinews', 'https://en.wikinews.org/wiki/'),
   ('wikipedia', 'https://en.wikipedia.org/wiki/'),
   ('wikiquote', 'https://en.wikiquote.org/wiki/'),
   ('wikisource', 'https://wikisource.org/wiki/'),
   ('wikispecies', 'https://species.wikimedia.org/wiki/'),
   ('wikiversity', 'https://en.wikiversity.org/wiki/'),
   ('wikivoyage', 'https://en.wikivoyage.org/wiki/'),
   ('wikt', 'https://en.wiktionary.org/wiki/'),
   ('wiktionary', 'https://en.wiktionary.org/wiki/'),
   ('wp', 'https://en.wikipedia.org/wiki/'),
)

# Parallel tuples: entry N of interwiki_urls is the base URL for entry N of interwiki_prefixes
interwiki_prefixes = tuple(prefix for prefix, _ in _interwiki_table)
interwiki_urls = tuple(url for _, url in _interwiki_table)

# Initialize globals

# Flags
debug = 0          # set to 1 by main() when "-dbg" is passed
name_printed = 0   # set to 1 once possibly_print() has printed the current page's name

# Running tallies reported by print_summary()
pages_checked = 0
iw_found = 0
unintended_redirects_found = 0
errors_issued = 0

# Prints the name of a page on which something occurred, if it has not been printed before
def possibly_print(page_name):
   """Print a blank line and a 'From page' heading for page_name, at most once per page.

   Does nothing when the name_printed flag is already set or when debug mode is on
   (presumably because debug mode already prints each page's name as it is checked).
   Sets name_printed after printing.
   """
   global name_printed

   if name_printed or debug:
      return

   pywikibot.stdout('')
   pywikibot.stdout('From page "{}":'.format(page_name))
   name_printed = 1

# Search a page for the section specified in the link
def find_section(the_link, print_result):
   """Report whether the already-fetched page contains the section named in the link.

   The slug in the_link.page_slug is divided at its first '#' into a page name and an anchor.
   The anchor is then compared against the id attribute of every span, div, h2, h3 and h4
   element found in the_link.curl_response.text.

   the_link      IWLink whose curl_response has already been filled in
   print_result  when true, a success message is printed as well; a failure is always
                 printed and added to the errors_issued tally
   """
   global errors_issued

   # Isolate section link. partition() divides at the first '#' only, so a slug containing
   # more than one '#' no longer raises ValueError the way a two-name unpacking of split() did.
   target_page_name, _, anchor_name = the_link.page_slug.partition('#')
   target_page_name_human = target_page_name.replace('_', ' ')

   # Convert dot-notation hex entities to proper characters
   replacements = (('.22', '"'), ('.27', "'"), ('.28', '('), ('.29', ')'))
   for encoded, decoded in replacements:
      anchor_name = anchor_name.replace(encoded, decoded)

   # Read linked page to see if it really has this anchor link
   soup = BeautifulSoup(the_link.curl_response.text, 'html.parser')
   tags_to_search = ['span', 'div', 'h2', 'h3', 'h4']
   found_section = any(
      the_tag.get('id') == anchor_name for the_tag in soup.find_all(tags_to_search)
   )

   # Tell user what we found
   if not found_section:
      possibly_print(the_link.page_name)
      pywikibot.stdout('   ERROR: Could not find section "{0}" on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, target_page_name_human))
      # TODO: Check that page name has been corrected to redirected page if there was a redirect
      errors_issued = errors_issued + 1
   elif print_result:
      pywikibot.stdout('   The section "{0}" was found on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, target_page_name_human))

# For a link that redirected us to another page, extract the name of the target page from
# the target page's source
def find_canonical_link(the_link):
   """Report the page a redirecting link ended up on, and check its section if it has one.

   The target name is cut out of the text following the last '<link rel="canonical" href="'
   marker in the_link.curl_response.text. If the link's slug contains '#', the slug is
   rewritten to name the redirect target and find_section() is called on it.

   the_link  IWLink whose curl_response has already been filled in; its page_slug is
             modified when the link carries a section anchor

   Failures to isolate the target name are printed and added to the errors_issued tally.
   """
   # Without this declaration the increments below raise UnboundLocalError
   global errors_issued

   # Extract link from this markup which contains name of redirected-to page:
   # <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name">
   # NOTE(review): a self-closing tag ending in '"/>' would not match the '">' terminator
   # searched for below -- confirm which form the target wikis emit.
   canonical_name = the_link.curl_response.text.split('<link rel="canonical" href="')[-1]

   # Skip over the wiki's base URL; this assumes the canonical URL starts with a base URL of
   # the same length as prefix_url
   canonical_name = canonical_name[len(the_link.prefix_url):]
   tag_end = canonical_name.find('">')

   if tag_end == -1:
      pywikibot.stdout('   ERROR: The {0} link "{1}" is a redirect page, but this script could not isolate the target page name.'.format(the_link.iw_prefix, the_link.page_slug))
      errors_issued = errors_issued + 1
      return

   canonical_name = canonical_name[:tag_end]
   if len(canonical_name) > 100:
      # Certain things can cause the trim to fail; report error and avoid slamming the
      # output with massive page source from a failed trim
      pywikibot.stdout('   ERROR: The {0} link "{1}" is a redirect to "{2}…" (string overflow).'.format(the_link.iw_prefix, the_link.page_slug, canonical_name[:100]))
      errors_issued = errors_issued + 1
      return

   canonical_slug = canonical_name
   canonical_name = canonical_name.replace('_', ' ')
   if '#' in the_link.page_slug:
      # Divide at the first '#' only so that additional '#' characters cannot raise ValueError
      _, _, anchor_name = the_link.page_slug.partition('#')
      pywikibot.stdout('   The {0} link "{1}" is a redirect to "{2}#{3}", which is a valid page. Checking for section on that page….'.format(the_link.iw_prefix, the_link.page_slug, canonical_name, anchor_name))
      # Update page slug so that find_section() uses the right page name in its messages.
      # The slug is rebuilt from the redirect target; the_link.page_name holds the title of
      # the page being scanned, so substituting it inside the slug did not do this.
      the_link.page_slug = canonical_slug + '#' + anchor_name
      find_section(the_link, True)
   else:
      pywikibot.stdout('   The {0} link "{1}" is a redirect to "{2}", which is a valid page.'.format(the_link.iw_prefix, the_link.page_slug, canonical_name))

# Test an interwiki link and look for a section link if applicable
def test_interwiki_link(the_link):
   """Fetch the_link.full_url, store the response on the link, and report any problem.

   Exactly one of these cases is handled, checked in this order:
      1. fetch() followed a redirect (the response's history is not empty)
      2. the final status code is not 200
      3. the page text contains MediaWiki's "Redirected from" note
      4. the slug contains '#', so the section must be looked up
   A link matching none of them produces no output.

   the_link  IWLink with full_url, page_slug, iw_prefix and page_name filled in
   """
   global errors_issued
   global unintended_redirects_found

   response = fetch(the_link.full_url)
   the_link.curl_response = response
   slug = the_link.page_slug
   iw_prefix = the_link.iw_prefix

   # One way we tell that a redirect occurred is by checking fetch's history, as it
   # automatically follows redirects. This will catch formal redirects which come from pages
   # such as Special:PermanentLink.
   if response.history != []:
      possibly_print(the_link.page_name)
      first_hop = response.history[0]
      permalink_prefixes = ('Special:PermanentLink/'.lower(), 'Special:Permalink/'.lower())

      if slug.startswith('WP:') and slug == slug.upper():
         # If linked page is in all caps, e.g. WP:BEANS, it's likely a deliberate use of a
         # redirect
         pywikibot.stdout('   Got redirection code "{0}" for {1} link "{2}". This appears to be a deliberate use of a Wikipedia shortcut. Checking the target page….'.format(first_hop, iw_prefix, slug))
         find_canonical_link(the_link)
      elif slug.lower().startswith(permalink_prefixes):
         pywikibot.stdout('   Got redirection code "{0}" for {1} permanent revision link "{2}". Checking the target page….'.format(first_hop, iw_prefix, slug))
         find_canonical_link(the_link)
      else:
         pywikibot.stdout('   ERROR: Unrecognized type of redirection (code "{0}") for {1} link "{2}". You should check the link manually.'.format(first_hop, iw_prefix, slug))
         errors_issued = errors_issued + 1
      return

   if response.status_code != 200:
      possibly_print(the_link.page_name)
      pywikibot.stdout('   ERROR: Got response code {0} for {1} link "{2}". The page may not exist.'.format(response.status_code, iw_prefix, slug))
      errors_issued = errors_issued + 1
      return

   # However the usual way that a redirect occurs is that MediaWiki redirects us sneakily
   # using JavaScript, while returning code OK 200 as if the link was correct; this happens
   # when a redirect page is accessed. We must detect these soft redirects by looking at the
   # page source to find the redirect note inserted at the top of the page for the reader.
   if 'Redirected from <a' in response.text:
      unintended_redirects_found = unintended_redirects_found + 1
      possibly_print(the_link.page_name)
      pywikibot.stdout('   WARNING: Got silently redirected by {0} link "{1}". Checking the target page….'.format(iw_prefix, slug))
      find_canonical_link(the_link)
      return

   if '#' in slug:
      find_section(the_link, False)

# Searches the given page text for interwiki links
def scan_for_interwiki_links(page_text, page_name):
   """Find every interwiki link in page_text and validate each with test_interwiki_link().

   A link is any text of the form "[[prefix:target" followed by "|" or "]", where prefix is
   one of interwiki_prefixes (matched case-sensitively).

   page_text  wikitext of the page being scanned
   page_name  title of that page; stored on each IWLink for use in output headings

   Side effects: adds one to pages_checked, clears name_printed so that the heading for this
   page can be printed, and adds one to iw_found for every link found.
   """
   global pages_checked
   global iw_found
   global name_printed

   pages_checked = pages_checked + 1
   name_printed = 0

   for prefix, prefix_url in zip(interwiki_prefixes, interwiki_urls):
      # Isolate strings that start with "[[prefix:" and end with "|" or "]"; group 1 captures
      # the link target in between
      iw_link = r"\[\[" + prefix + r":([^|\]]*)(\||\])"
      for match in re.finditer(iw_link, page_text):
         # Commonly we use spaces instead of underscores, so fix that before querying
         page_slug = match.group(1).replace(' ', '_')

         # Construct full URL for the particular wiki
         the_link = IWLink(prefix, prefix_url, prefix_url + page_slug, page_name, page_slug, "")

         # But use spaces for title when printing it
         page_title_human = page_slug.replace('_', ' ')
         if debug: pywikibot.stdout('   Validating {0} link "{1}"'.format(the_link.iw_prefix, page_title_human))
         iw_found = iw_found + 1

         # Adjust URL if this is a foreign-language WP link
         if re.match("[a-zA-Z]{2}:", page_slug):
            lang_code = page_slug[0:2] + "."
            # "wp:" is the Wikipedia: namespace, not a language
            if lang_code != "wp." and lang_code != "WP.":
               # Rebuild the URL from its two parts instead of running replace() over the
               # whole URL: removing e.g. "ps:" everywhere also hit the "ps:" in "https:", and
               # swapping "en." everywhere could alter a page title containing "en."
               the_link.full_url = prefix_url.replace('en.', lang_code, 1) + page_slug[3:]

         # Test the URL
         test_interwiki_link(the_link)

# Print a wrap-up message
def print_summary():
   """Print three lines reporting the totals gathered during the run.

   Reads pages_checked, iw_found, errors_issued and unintended_redirects_found, and uses the
   singular wording for any tally that equals exactly 1.
   """
   page_str = "page" if pages_checked == 1 else "pages"
   link_str = "link" if iw_found == 1 else "links"
   pywikibot.stdout('Checked {0} {1} and found {2} interwiki {3}.'.format(pages_checked, page_str, iw_found, link_str))

   error_str = "error was" if errors_issued == 1 else "errors were"
   pywikibot.stdout('{0} {1} encountered in validating these links.'.format(errors_issued, error_str))

   if unintended_redirects_found == 1:
      warning_str = "likely-unintended redirect was"
   else:
      warning_str = "likely-unintended redirects were"
   pywikibot.stdout('{0} {1} encountered in validating these links.'.format(unintended_redirects_found, warning_str))

# Main function
def main(*args):
   """Parse the command-line arguments, scan the requested page(s), and print a summary.

   Arguments left over after pywikibot.handle_args() has processed args:
      -cat:NAME   check every page in category NAME, recursing into subcategories
      -page:NAME  check the single page NAME (ignored when -cat: is also given)
      -dbg        turn on debug output
   Any other argument prints a message and returns without scanning anything.
   """
   global debug
   cat_flag = '-cat:'
   page_flag = '-page:'
   search_cat = ''
   search_page = ''

   # Process arguments
   for arg in pywikibot.handle_args(args):
      if arg == '-dbg':
         debug = 1
      elif arg.startswith(cat_flag):
         search_cat = arg[len(cat_flag):]
      elif arg.startswith(page_flag):
         search_page = arg[len(page_flag):]
      else:
         pywikibot.stdout('Unknown argument "{}". Exiting.'.format(arg))
         return

   #pywikibot.stdout('The members of the requests.models.Response class are:')
   #pywikibot.stdout(format(dir(requests.models.Response)))
   #return

   # Work out which pages to check: all pages of the specified category, or the one
   # specified page, or nothing at all
   site = pywikibot.Site()
   if search_cat != '':
      cat_obj = pywikibot.Category(site, search_cat)
      members = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
      pages_to_check = pagegenerators.PreloadingGenerator(members, 100)
   elif search_page != '':
      pages_to_check = [pywikibot.Page(site, search_page)]
   else:
      pages_to_check = []

   for page in pages_to_check:
      if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
      scan_for_interwiki_links(page.text, page.title())

   # Print the results
   print_summary()

# Run the checker only when this file is executed as a script, not when it is imported.
# main() is called without arguments here.
if __name__ == '__main__':
   main()
Revision:	1196
Committed:	Fri Aug 15 20:55:01 2025 UTC (8 weeks ago) by iritscen
Content type:	text/x-python
File size:	13487 byte(s)
Log Message:	ValBot: check_interwiki_links.sh now tallies and more clearly marks redirects that are probably not intended. Redirect target page is now correctly stated in one message about redirects. Streamlined code somewhat.
#	Content
1	# Check Interwiki Links
2	# by iritscen@yahoo.com
3	# Looks at each link on a page (or all the pages in a category) which uses a registered
4	# interwiki prefix and loads the linked page, verifying that it exists and that any section
5	# link, if present, is valid as well. The output will use the word "ERROR" when it cannot
6	# validate the interwiki link.
7	# Recommended viewing width:
8	# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|
9
10	import bs4
11	import pywikibot
12	import re
13	import requests # for listing members with dir() when debugging
14
15	from bs4 import BeautifulSoup
16	from pywikibot import pagegenerators
17	from pywikibot.bot import QuitKeyboardInterrupt
18	from pywikibot.comms.http import fetch
19	from pywikibot.specialbots import UploadRobot
20	from pywikibot.tools.formatter import color_format
21	from urllib.parse import urljoin
22
23	class IWLink:
24	def __init__(self, iw_prefix, prefix_url, full_url, page_name, page_slug, curl_response):
25	self.iw_prefix = iw_prefix # e.g. "wp"
26	self.prefix_url = prefix_url # e.g. "https://en.wikipedia.org/wiki/"
27	self.full_url = full_url # e.g. "https://en.wikipedia.org/wiki/Easter_egg"
28	self.page_name = page_name # "Easter egg"
29	self.page_slug = page_slug # "Easter_egg"
30	self.curl_response = curl_response # a class defined in the Requests library
31
32	# Parallel arrays based on https://wiki.oni2.net/Special:Interwiki
33	interwiki_prefixes = ('acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw', 'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp')
34
35	interwiki_urls = ('http://www.acronymfinder.com/~/search/af.aspx?string=exact&Acronym=', 'http://www.google.com/search?q=cache:', 'https://commons.wikimedia.org/wiki/', 'http://www.dict.org/bin/Dict?Database=*&Form=Dict1&Strategy=*&Query=', 'http://www.google.com/search?q=', 'https://meta.wikimedia.org/wiki/', 'https://www.mediawiki.org/wiki/', 'https://en.wikibooks.org/wiki/', 'https://www.wikidata.org/wiki/', 'https://foundation.wikimedia.org/wiki/', 'https://en.wikinews.org/wiki/', 'https://en.wikipedia.org/wiki/', 'https://en.wikiquote.org/wiki/', 'https://wikisource.org/wiki/', 'https://species.wikimedia.org/wiki/', 'https://en.wikiversity.org/wiki/', 'https://en.wikivoyage.org/wiki/', 'https://en.wiktionary.org/wiki/', 'https://en.wiktionary.org/wiki/', 'https://en.wikipedia.org/wiki/')
36
37	# Initialize globals
38	debug = 0
39	pages_checked = 0
40	iw_found = 0
41	errors_issued = 0
42	unintended_redirects_found = 0
43	name_printed = 0
44
45	# Prints the name of a page on which something occurred, if it has not been printed before
46	def possibly_print(page_name):
47	global debug
48	global name_printed
49
50	if not name_printed and not debug:
51	pywikibot.stdout('')
52	pywikibot.stdout('From page "{}":'.format(page_name))
53	name_printed = 1
54
55	# Search a page for the section specified in the link
56	def find_section(the_link, print_result):
57	global errors_issued
58
59	# Isolate section link
60	target_page_name, anchor_name = the_link.page_slug.split('#')
61	target_page_name_human = target_page_name.replace('_', ' ')
62
63	# Convert dot-notation hex entities to proper characters
64	replacements = [(r'\.22', '"'), (r'\.27', "'"), (r'\.28', '('), (r'\.29', ')')]
65	for pattern, replacement in replacements:
66	anchor_name = re.sub(pattern, replacement, anchor_name)
67
68	# Read linked page to see if it really has this anchor link
69	soup = BeautifulSoup(the_link.curl_response.text, 'html.parser')
70	tags_to_search = ['span', 'div', 'h2', 'h3', 'h4']
71	found_section = False
72	for tag_name in tags_to_search:
73	for the_tag in soup.find_all(tag_name):
74	if the_tag.get('id') == anchor_name:
75	found_section = True
76	break
77	if found_section:
78	break
79
80	# Tell user what we found
81	if found_section == False:
82	possibly_print(the_link.page_name)
83	pywikibot.stdout(' ERROR: Could not find section "{0}" on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, target_page_name_human))
84	# TODO: Check that page name has been corrected to redirected page if there was a redirect
85	errors_issued = errors_issued + 1
86	elif print_result == True:
87	pywikibot.stdout(' The section "{0}" was found on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, target_page_name_human))
88
89	# For a link that redirected us to another page, extract the name of the target page from
90	# the target page's source
91	def find_canonical_link(the_link):
92	# Extract link from this markup which contains name of redirected-to page:
93	# <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/>
94	canonical_name = the_link.curl_response.text.split('<link rel="canonical" href="')[-1]
95	prefix_length = len(the_link.prefix_url)
96	canonical_name = canonical_name[prefix_length:]
97	tag_end = canonical_name.find('">')
98
99	if tag_end == -1:
100	pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect page, but this script could not isolate the target page name.'.format(the_link.iw_prefix, the_link.page_slug))
101	errors_issued = errors_issued + 1
102	else:
103	canonical_name = canonical_name[:tag_end]
104	if len(canonical_name) > 100:
105	# Certain things can cause the trim to fail; report error and avoid slamming the
106	# output with massive page source from a failed trim
107	pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect to "{2}…" (string overflow).'.format(the_link.iw_prefix, the_link.page_slug, canonical_name[:100]))
108	errors_issued = errors_issued + 1
109	else:
110	canonical_name = canonical_name.replace('_', ' ')
111	if '#' in the_link.page_slug:
112	_, anchor_name = the_link.page_slug.split('#')
113	pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}#{3}", which is a valid page. Checking for section on that page….'.format(the_link.iw_prefix, the_link.page_slug, canonical_name, anchor_name))
114	the_link.page_slug = the_link.page_slug.replace(the_link.page_name, canonical_name) # update page slug so that find_section() uses the right page name in its messages
115	find_section(the_link, True)
116	else:
117	pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}", which is a valid page.'.format(the_link.iw_prefix, the_link.page_slug, canonical_name))
118
119	# Test an interwiki link and look for a section link if applicable
120	def test_interwiki_link(the_link):
121	global errors_issued
122	global unintended_redirects_found
123
124	the_link.curl_response = fetch(the_link.full_url)
125
126	# One way we tell that a redirect occurred is by checking fetch's history, as it
127	# automatically follows redirects. This will catch formal redirects which come from pages
128	# such as Special:PermanentLink.
129	if the_link.curl_response.history != []:
130	possibly_print(the_link.page_name)
131
132	# If linked page is in all caps, e.g. WP:BEANS, it's likely a deliberate use of a redirect
133	if the_link.page_slug.startswith('WP:') and the_link.page_slug == the_link.page_slug.upper():
134	pywikibot.stdout(' Got redirection code "{0}" for {1} link "{2}". This appears to be a deliberate use of a Wikipedia shortcut. Checking the target page….'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
135	find_canonical_link(the_link)
136	else:
137	permalink1 = 'Special:PermanentLink/'.lower()
138	permalink2 = 'Special:Permalink/'.lower()
139	page_slug_lower = the_link.page_slug.lower()
140	if page_slug_lower.startswith(permalink1) or page_slug_lower.startswith(permalink2):
141	pywikibot.stdout(' Got redirection code "{0}" for {1} permanent revision link "{2}". Checking the target page….'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
142	find_canonical_link(the_link)
143	else:
144	pywikibot.stdout(' ERROR: Unrecognized type of redirection (code "{0}") for {1} link "{2}". You should check the link manually.'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
145	errors_issued = errors_issued + 1
146	elif the_link.curl_response.status_code != 200:
147	possibly_print(the_link.page_name)
148	pywikibot.stdout(' ERROR: Got response code {0} for {1} link "{2}". The page may not exist.'.format(the_link.curl_response.status_code, the_link.iw_prefix, the_link.page_slug))
149	errors_issued = errors_issued + 1
150	# However the usual way that a redirect occurs is that MediaWiki redirects us sneakily
151	# using JavaScript, while returning code OK 200 as if the link was correct; this happens
152	# when a redirect page is accessed. We must detect these soft redirects by looking at the
153	# page source to find the redirect note inserted at the top of the page for the reader.
154	elif 'Redirected from <a' in the_link.curl_response.text:
155	unintended_redirects_found = unintended_redirects_found + 1
156	possibly_print(the_link.page_name)
157	pywikibot.stdout(' WARNING: Got silently redirected by {0} link "{1}". Checking the target page….'.format(the_link.iw_prefix, the_link.page_slug))
158	find_canonical_link(the_link)
159	elif '#' in the_link.page_slug:
160	find_section(the_link, False)
161
162	# Searches the given page text for interwiki links
163	def scan_for_interwiki_links(page_text, page_name):
164	global debug
165	global pages_checked
166	global iw_found
167	global name_printed
168	pages_checked = pages_checked + 1
169	cur_prefix = 0
170	name_printed = 0
171
172	for prefix in interwiki_prefixes:
173	# Isolate strings that start with "[[prefix:" and end with "|" or "]"
174	iw_link = r"\[\[" + prefix + r":[^|\]]*(\||\])"
175	for match in re.finditer(iw_link, page_text):
176	the_link = IWLink(prefix, interwiki_urls[cur_prefix], "", page_name, "", "")
177
178	# Extract just the page title from this regex match
179	s = match.start() + 2 + len(the_link.iw_prefix) + 1
180	e = match.end() - 1
181
182	# Commonly we use spaces instead of underscores, so fix that before querying
183	the_link.page_slug = page_text[s:e].replace(' ', '_')
184
185	# But use spaces for title when printing it
186	page_title_human = the_link.page_slug.replace('_', ' ')
187	if debug: pywikibot.stdout(' Validating {0} link "{1}"'.format(the_link.iw_prefix, page_title_human))
188	iw_found = iw_found + 1
189
190	# Construct full URL for the particular wiki
191	the_link.full_url = the_link.prefix_url + the_link.page_slug
192
193	# Adjust URL if this is a foreign-language WP link
194	if re.match("^[a-zA-Z]{2}:", the_link.page_slug):
195	lang_code = the_link.page_slug[0:2] + "."
196	# "wp:" is the Wikipedia: namespace, not a language
197	if lang_code != "wp." and lang_code != "WP.":
198	the_link.full_url = the_link.full_url.replace('en.', lang_code)
199	the_link.full_url = the_link.full_url.replace(the_link.page_slug[0:3], '')
200
201	# Test the URL
202	test_interwiki_link(the_link)
203	cur_prefix = cur_prefix + 1
204
205	# Print a wrap-up message
206	def print_summary():
207	global pages_checked
208	global iw_found
209	global errors_issued
210	global unintended_redirects_found
211
212	page_str = "pages"
213	if pages_checked == 1:
214	page_str = "page"
215
216	link_str = "links"
217	if iw_found == 1:
218	link_str = "link"
219
220	pywikibot.stdout('Checked {0} {1} and found {2} interwiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
221
222	error_str = "errors were"
223	if errors_issued == 1:
224	error_str = "error was"
225
226	pywikibot.stdout('{0} {1} encountered in validating these links.'.format(errors_issued, error_str))
227
228	warning_str = "likely-unintended redirects were"
229	if unintended_redirects_found == 1:
230	warning_str = "likely-unintended redirect was"
231
232	pywikibot.stdout('{0} {1} encountered in validating these links.'.format(unintended_redirects_found, warning_str))
233
234	# Main function
235	def main(*args):
236	global debug
237	search_cat = ''
238	search_page = ''
239
240	# Process arguments
241	local_args = pywikibot.handle_args(args)
242	for arg in local_args:
243	if arg.startswith('-cat:'):
244	search_cat = arg[5:]
245	elif arg.startswith('-page:'):
246	search_page = arg[6:]
247	elif arg == '-dbg':
248	debug = 1
249	else:
250	pywikibot.stdout('Unknown argument "{}". Exiting.'.format(arg))
251	return
252
253	#pywikibot.stdout('The members of the requests.models.Response class are:')
254	#pywikibot.stdout(format(dir(requests.models.Response)))
255	#return
256
257	# Check specified page or loop through specified category and check all pages
258	site = pywikibot.Site()
259	if search_cat != '':
260	cat_obj = pywikibot.Category(site, search_cat)
261	generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
262	for page in pagegenerators.PreloadingGenerator(generator, 100):
263	if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
264	scan_for_interwiki_links(page.text, page.title())
265	elif search_page != '':
266	page = pywikibot.Page(site, search_page)
267	if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
268	scan_for_interwiki_links(page.text, page.title())
269
270	# Print the results
271	print_summary()
272
273	if __name__ == '__main__':
274	main()