ValBot/Python/check_intrawiki_section_links.py

# Check Intrawiki Section Links
# by iritscen@yahoo.com
# Looks at each wikilink on a page (or in all the pages in a category) for a section link ('#'),
# and loads the linked page and verifies that the named section actually exists. It also
# understands section links generated through a call to Template:SectionLink.
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- --|

import os

from urllib.parse import urljoin

import pywikibot
import re

from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot import pagegenerators
from pywikibot.tools.formatter import color_format
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot
from bs4 import BeautifulSoup

# Tuple of OniGalore's namespaces. A link whose page name contains a ":" is only followed if
# one of these prefixes (plus ":") appears in it.
intrawiki_prefixes = (
   'Image', 'Special', 'Talk', 'User', 'User_talk', 'OniGalore', 'OniGalore_talk', 'File',
   'File_talk', 'MediaWiki', 'MediaWiki_talk', 'Template', 'Template_talk', 'Help',
   'Help_talk', 'Category', 'Category_talk', 'BSL', 'BSL_talk', 'OBD', 'OBD_talk', 'AE',
   'AE_talk', 'Oni2', 'Oni2_talk', 'XML', 'XML_talk',
)

# URL for main namespace of our wiki; page names are appended directly to it
onigalore_url = 'https://wiki.oni2.net/'

# Tuple of interwiki prefixes, for passing over such links
interwiki_prefixes = (
   'acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw', 'wikibooks',
   'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote', 'wikisource',
   'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp',
)

# List of chapter names, for substitution into links that use "{{Cn}}" transclusion. The
# list index is the chapter number n.
chapter_names = [
   'CHAPTER_00_._COMBAT_TRAINING',
   'CHAPTER_01_._TRIAL_RUN',
   'CHAPTER_02_._ENGINES_OF_EVIL',
   'CHAPTER_03_._PUZZLE_PIECES',
   'CHAPTER_04_._TIGER_BY_THE_TAIL',
   'CHAPTER_05_._HOT_PURSUIT',
   'CHAPTER_06_._COUNTERATTACK',
   'CHAPTER_07_._A_FRIEND_IN_NEED',
   'CHAPTER_08_._AN_INNOCENT_LIFE',
   'CHAPTER_09_._TRUTH_AND_CONSEQUENCES',
   'CHAPTER_10_._CAT_AND_MOUSE',
   'CHAPTER_11_._DREAM_DIVER',
   'CHAPTER_12_._SINS_OF_THE_FATHER',
   'CHAPTER_13_._PHOENIX_RISING',
   'CHAPTER_14_._DAWN_OF_THE_CHRYSALIS',
]

# Tuple of patterns for recognizing wikilinks
# Pattern 1: Detect "[[anything]]", "[[any:thing]]", "[[any|thing]]", "[[any:thi|ng]]"
# Pattern 2: Detect "{{SectionLink|Page|Section name}}", "{{SectionLink||Section name}}"
# These are raw strings so that the regex backslashes are not read as (invalid) string escapes.
# Pattern 1 stops at the first "|" or "]", so a match ends one character past the link target.
link_patterns = (
   r"\[\[[^|\]]*(\||\])",
   r"\{\{SectionLink\|[^|\}]*\|[^|\}]*\}\}",
)

# Initialize globals
debug = 0           # set to 1 by the "-dbg" argument to enable verbose output
pages_checked = 0   # number of pages scanned
iw_found = 0        # number of intrawiki section links that were turned into URLs
advice_issued = 0   # number of "ADVICE" messages printed
errors_issued = 0   # number of "ERROR" messages printed

# Searches the given page text for intrawiki links with section links in them
def scan_for_intrawiki_links(page_text, page_name):
   """Check every intrawiki section link found in a page's wikitext.

   Each "[[Page#Section]]" wikilink and "{{SectionLink|Page|Section}}" call in page_text is
   turned into a URL under onigalore_url, the target page is fetched, and its HTML is searched
   for a <span> whose id equals the section name. Advice and errors are printed through
   pywikibot.stdout() and tallied in the module-level counters pages_checked, iw_found,
   advice_issued and errors_issued.

   page_text: wikitext of the page to scan.
   page_name: title of that page; used to resolve bare "#Section", "/Subpage" and "../" links
              and to label the output.
   """
   global pages_checked
   global iw_found
   global advice_issued
   global errors_issued
   pages_checked += 1
   name_printed = 0

   # Prints an advice/error message, preceded by a "From page" header the first time anything
   # is reported for this page. The header is suppressed in debug mode.
   def report(message):
      nonlocal name_printed
      if not name_printed and not debug:
         pywikibot.stdout('From page "{}":'.format(page_name))
         name_printed = 1
      pywikibot.stdout(message)

   for i, the_pattern in enumerate(link_patterns):
      if debug:
         if i == 0:
            pywikibot.stdout('   Checking page for wikilinks with section names.')
         elif i == 1:
            pywikibot.stdout('   Checking page for {{SectionLink}} calls.')

      for match in re.finditer(the_pattern, page_text):
         found_iw_match = False
         iw_url = ""
         page_name2 = page_name

         # Cut out the matched text from the page, isolating just the page+section name
         target_start = 2 # "[["
         target_end = 1 # "|" or "]" (we only match the first ending bracket)
         if i == 1:
            target_start = 14 # "{{SectionLink|"
            target_end = 2 # "}}"
         s = match.start() + target_start # remove the link-opening markup
         e = match.end() - target_end # remove the link-ending markup
         link_text = page_text[s:e]

         # The second link type will look like "Page|Section" or "|Section", so fix that pipe
         if i == 1:
            link_text = link_text.replace('|', '#')

         # Sometimes we use a space char. instead of a '_', so fix that before querying
         link_text = link_text.replace(' ', '_')
         if debug: pywikibot.stdout('      Found link {0}.'.format(link_text))

         # If this link doesn't have a section link in it, then we don't care about it, as
         # MediaWiki takes care of checking basic intrawiki links
         if '#' not in link_text:
            if debug: pywikibot.stdout('         Link doesn\'t have a section anchor in it. Skipping.')
            continue

         # If this link has an interwiki prefix, it can be ignored; see check_interwiki_links.py
         # for the task of checking interwiki page+section links
         is_interwiki = False
         for prefix in interwiki_prefixes:
            if prefix + ":" in link_text:
               if debug: pywikibot.stdout('         Skipping link {} because it is an interwiki link.'.format(link_text))
               is_interwiki = True
               break
         if is_interwiki:
            continue

         # If there is a '{' in the link, then probably it's a link built on transcluded text
         # like "Quotes/Diary#{{C3}}". Chapter-name transclusions ("{{Cn}}") are expanded from
         # chapter_names; anything else cannot be expanded here, so advice is issued instead.
         if '{' in link_text:
            ch_link = re.search(r"{{C[0-9]*}}", link_text)
            if not ch_link:
               report('   ADVICE: Link {0} seems to use transclusion. This script can understand chapter name transclusions such as "{1}" but it doesn\'t recognize this one so it can\'t be verified. You should check the link manually.'.format(link_text, "{{C7}}"))
               advice_issued += 1
               continue

            # Only read the match after confirming there is one, so debug mode cannot crash
            ch_link_match = ch_link.group(0)
            if debug: pywikibot.stdout('         Found transclusion in link: "{}".'.format(ch_link_match))

            ch_num = re.search("[0-9]+", ch_link_match)
            if not ch_num:
               report('   ADVICE: Link {} seems to be transcluding a chapter name, but this script couldn\'t read it.'.format(link_text))
               advice_issued += 1
               continue

            ch_num_match = int(ch_num.group(0))
            if ch_num_match >= len(chapter_names):
               report('   ADVICE: Link {0} transcludes a chapter name using an out-of-range number, {1}.'.format(link_text, ch_num_match))
               advice_issued += 1
               continue

            # ch_link_match is exactly "{{C<digits>}}", so a plain string replace suffices
            link_text = link_text.replace(ch_link_match, chapter_names[ch_num_match])
            if debug: pywikibot.stdout('         After performing transclusion, link is now "{}".'.format(link_text))

         # If this is a relative "/" link, use the current page as the basis for the URL. Note
         # that only a leading slash is looked for, so if there's multiple steps down ("/x/y"),
         # we're out of luck.
         if link_text.startswith('/'):
            link_text = page_name + link_text
            if debug: pywikibot.stdout('         Changed link_text to {} on account of "/".'.format(link_text))

         # If this is a relative "../" link, find the parent page, set ourselves to that page,
         # then remove the relative portion of the link. Note that this is only performed once,
         # so if there's multiple steps back ("../../"), we're out of luck.
         if link_text.startswith('../'):
            last_slash = page_name.rfind('/')
            if last_slash == -1:
               # This page has no parent, so the link cannot be resolved; without this check
               # the slice below would silently drop the last character of the page name
               report('   ERROR: Couldn\'t figure out link {}.'.format(link_text))
               errors_issued += 1
               continue
            page_name2 = page_name[0:last_slash]
            if debug: pywikibot.stdout('         Changed page_name to {} on account of "../".'.format(page_name2))
            link_text = link_text[3:]
            if debug: pywikibot.stdout('         Changed link_text to {} on account of "../".'.format(link_text))
            # If this is now going to be a bare section link for the parent page, don't add a
            # slash, otherwise do because we are drilling down to another subpage
            if link_text.startswith('#'):
               link_text = page_name2 + link_text
            else:
               link_text = page_name2 + '/' + link_text

         # If this is a bare section link, build URL based on this page
         if link_text.startswith('#'):
            iw_url = onigalore_url + page_name2
            iw_found += 1
            if debug: pywikibot.stdout('         Found link to this very page, {}.'.format(link_text))
            found_iw_match = True
            link_text = page_name2 + link_text

         # If there's no ":" in the link (before the section link, where a colon would just be
         # part of the text) then it's a Main namespace article; proceed with building URL
         if not found_iw_match:
            if not re.search(":.*#", link_text):
               iw_url = onigalore_url + link_text
               iw_found += 1
               if debug: pywikibot.stdout('         Link is to a Main namespace page.')
               found_iw_match = True

         # If there is a ":", match the prefix against the intrawiki prefixes on OniGalore
         # before building URL
         if not found_iw_match:
            for prefix in intrawiki_prefixes:
               if prefix + ":" in link_text:
                  iw_url = onigalore_url + link_text
                  if debug: pywikibot.stdout('         Identified namespace {}.'.format(prefix))
                  iw_found += 1
                  found_iw_match = True
                  break

         # If we still haven't turned this match into a URL, something's gone wrong. This is
         # counted as an error like every other "ERROR" message.
         if not found_iw_match or iw_url == "":
            report('   ERROR: Couldn\'t figure out link {}.'.format(link_text))
            errors_issued += 1
            continue

         # Test the URL
         iw_url = iw_url.replace(' ', '_')
         if debug: pywikibot.stdout('         Reading page at {}...'.format(iw_url))
         response = fetch(iw_url)

         # Redirects are followed automatically by fetch() and treated as "200"s; the way we can
         # tell that a redirect occurred is by checking fetch's history
         if response.history:
            report('   ADVICE: Got redirection code ({0}) on URL "{1}". You should check the link manually.'.format(response.history[0], iw_url))
            advice_issued += 1
         elif response.status_code != 200:
            report('   ERROR: Got response code {0} on URL {1}. The target page may not exist.'.format(response.status_code, iw_url))
            errors_issued += 1
         else:
            # Isolate section link
            pre_section, section_name = link_text.split('#', 1)
            if debug: pywikibot.stdout('         Searching for section link {} on page.'.format(section_name))

            # Convert slash character to the dot-notation hex encoding that MediaWiki uses
            section_name = section_name.replace('/', '.2F')

            # Read linked page to see if it really has this anchor link
            soup = BeautifulSoup(response.text, 'html.parser')
            found_section = False
            for span_tag in soup.find_all('span'):
               if span_tag.get('id', None) == section_name:
                  if debug: pywikibot.stdout('         Found section!')
                  found_section = True
                  break
            if not found_section:
               report('   ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section))
               errors_issued += 1

def main(*args):
   """Parse the arguments, scan the requested page(s), and print a summary of the results.

   The arguments are first passed through pywikibot.handle_args(); of what it returns, these
   are recognized:
      -cat:<name>    scan every page in the named category, recursing into subcategories
      -page:<name>   scan the single named page
      -dbg           turn on verbose output
   Any other argument aborts the run with a message. If both -cat: and -page: are given, the
   category is used. If neither is given, a usage hint is printed and nothing is scanned.
   """
   global debug
   search_cat = ''
   search_page = ''

   for arg in pywikibot.handle_args(args):
      if arg.startswith('-cat:'):
         search_cat = arg[5:]
      elif arg.startswith('-page:'):
         search_page = arg[6:]
      elif arg == '-dbg':
         debug = 1
      else:
         pywikibot.stdout('Unknown argument "{}".'.format(arg))
         return

   # Without a target there is nothing to scan; say so rather than print an all-zero summary
   if search_cat == '' and search_page == '':
      pywikibot.stdout('No page or category was specified. Use "-page:<title>" or "-cat:<category>" to choose what to check.')
      return

   site = pywikibot.Site()

   # This line of code enumerates the methods in the 'page' class
   #pywikibot.stdout(format(dir(page)))

   if search_cat != '':
      cat_obj = pywikibot.Category(site, search_cat)
      generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
      for page in pagegenerators.PreloadingGenerator(generator, 100):
         if debug: pywikibot.stdout('Checking page {0}'.format(page.title()))
         scan_for_intrawiki_links(page.text, page.title())
   else:
      page = pywikibot.Page(site, search_page)
      if debug: pywikibot.stdout('Checking page {0}'.format(page.title()))
      scan_for_intrawiki_links(page.text, page.title())

   # Pick singular or plural nouns for the summary line
   page_str = "page" if pages_checked == 1 else "pages"
   link_str = "link" if iw_found == 1 else "links"

   pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
   pywikibot.stdout('While attempting to follow section links...')

   if advice_issued == 0:
      pywikibot.stdout('   No advice on potential problems was issued.')
   elif advice_issued == 1:
      pywikibot.stdout('   1 piece of advice on a potential problem was issued.')
   else:
      pywikibot.stdout('   {} pieces of advice on potential problems were issued.'.format(advice_issued))

   error_str = "error was" if errors_issued == 1 else "errors were"
   pywikibot.stdout('   {0} {1} encountered.'.format(errors_issued, error_str))

# Run the checker only when this file is executed as a script, not when it is imported
if __name__ == '__main__':
   main()
Revision:	1179
Committed:	Fri Apr 28 00:53:24 2023 UTC (2 years, 6 months ago) by iritscen
Content type:	text/x-python
File size:	15057 byte(s)
Log Message:	ValBot: check_intrawiki_section_links.py: Simplified output to just advice and errors. Added support for SectionLink template. Added support for links built on chapter name transclusion. Placed verbose output under a "-dbg" argument.
#	Content
1	# Check Intrawiki Section Links
2	# by iritscen@yahoo.com
3	# Looks at each wikilink on a page (or in all the pages in a category) for a section link ('#'),
4	# and loads the linked page and verifies that the named section actually exists. It also
5	# understands section links generated through a call to Template:SectionLink.
6	# Recommended viewing width:
7	# \|---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- --\|
8
9	import os
10
11	from urllib.parse import urljoin
12
13	import pywikibot
14	import re
15
16	from pywikibot.bot import QuitKeyboardInterrupt
17	from pywikibot import pagegenerators
18	from pywikibot.tools.formatter import color_format
19	from pywikibot.comms.http import fetch
20	from pywikibot.specialbots import UploadRobot
21	from bs4 import BeautifulSoup
22
23	# Tuple of OniGalore's namespaces
24	intrawiki_prefixes = ('Image', 'Special', 'Talk', 'User', 'User_talk', 'OniGalore', 'OniGalore_talk', 'File', 'File_talk', 'MediaWiki', 'MediaWiki_talk', 'Template', 'Template_talk', 'Help', 'Help_talk', 'Category', 'Category_talk', 'BSL', 'BSL_talk', 'OBD', 'OBD_talk', 'AE', 'AE_talk', 'Oni2', 'Oni2_talk', 'XML', 'XML_talk')
25
26	# URL for main namespace of our wiki
27	onigalore_url = 'https://wiki.oni2.net/'
28
29	# Tuple of interwiki prefixes, for passing over such links
30	interwiki_prefixes = ('acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw', 'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp')
31
32	# List of chapter names, for substitution into links that use "{{Cn}}" transclusion
33	chapter_names = ['CHAPTER_00_._COMBAT_TRAINING', 'CHAPTER_01_._TRIAL_RUN', 'CHAPTER_02_._ENGINES_OF_EVIL', 'CHAPTER_03_._PUZZLE_PIECES', 'CHAPTER_04_._TIGER_BY_THE_TAIL', 'CHAPTER_05_._HOT_PURSUIT', 'CHAPTER_06_._COUNTERATTACK', 'CHAPTER_07_._A_FRIEND_IN_NEED', 'CHAPTER_08_._AN_INNOCENT_LIFE', 'CHAPTER_09_._TRUTH_AND_CONSEQUENCES', 'CHAPTER_10_._CAT_AND_MOUSE', 'CHAPTER_11_._DREAM_DIVER', 'CHAPTER_12_._SINS_OF_THE_FATHER', 'CHAPTER_13_._PHOENIX_RISING', 'CHAPTER_14_._DAWN_OF_THE_CHRYSALIS']
34
35	# Tuple of patterns for recognizing wikilinks
36	# Pattern 1: Detect "[[anything]]", "[[any:thing]]", "[[any\|thing]]", "[[any:thi\|ng]]"
37	# Pattern 2: Detect "{{SectionLink\|Page\|Section name}}", "{{SectionLink\|\|Section name}}"
38	link_patterns = ("\[\[[^\|\]](\\|\|\])", "\{\{SectionLink\\|[^\|\}]\\|[^\|\}]*\}\}")
39
40	# Initialize globals
41	debug = 0
42	pages_checked = 0
43	iw_found = 0
44	advice_issued = 0
45	errors_issued = 0
46
47	# Searches the given page text for intrawiki links with section links in them
48	def scan_for_intrawiki_links(page_text, page_name):
49	global debug
50	global pages_checked
51	global iw_found
52	global advice_issued
53	global errors_issued
54	pages_checked += 1
55	name_printed = 0
56
57	for i, the_pattern in enumerate(link_patterns):
58	if debug:
59	if i == 0:
60	pywikibot.stdout(' Checking page for wikilinks with section names.')
61	elif i == 1:
62	pywikibot.stdout(' Checking page for {{SectionLink}} calls.')
63
64	for match in re.finditer(the_pattern, page_text):
65	found_iw_match = False
66	iw_url = ""
67	page_name2 = page_name
68
69	# Cut out the matched text from the page, isolating just the page+section name
70	target_start = 2 # "[["
71	target_end = 1 # "\|" or "]" (we only match the first ending bracket)
72	if i == 1:
73	target_start = 14 # "{{SectionLink\|"
74	target_end = 2 # "}}"
75	s = match.start() + target_start # remove the link-opening markup
76	e = match.end() - target_end # remove the link-ending markup
77	link_text = page_text[s:e]
78
79	# The second link type will look like "Page\|Section" or "\|Section", so fix that pipe
80	if i == 1:
81	link_text = link_text.replace('\|', '#')
82
83	# Sometimes we use a space char. instead of a '_', so fix that before querying
84	link_text = link_text.replace(' ', '_')
85	if debug: pywikibot.stdout(' Found link {0}.'.format(link_text))
86
87	# If this link doesn't have a section link in it, then we don't care about it, as
88	# MediaWiki takes care of checking basic intrawiki links
89	if not '#' in link_text:
90	if debug: pywikibot.stdout(' Link doesn\'t have a section anchor in it. Skipping.')
91	continue
92
93	# If this link has an interwiki prefix, it can be ignored; see check_interwiki_links.py
94	# for the task of checking interwiki page+section links
95	is_interwiki = False
96	if found_iw_match == False:
97	for prefix in interwiki_prefixes:
98	if prefix + ":" in link_text:
99	if debug: pywikibot.stdout(' Skipping link {} because it is an interwiki link.'.format(link_text))
100	is_interwiki = True
101	break
102	if is_interwiki:
103	continue
104
105	# If there is a '{' in the link, then probably it's a link built on transcluded text
106	# like "Quotes/Diary#{{C3}}", which we cannot expand and work with, so skip it
107	if '{' in link_text:
108	ch_link_pattern = re.compile(r"{{C[0-9]*}}")
109	ch_link = ch_link_pattern.search(link_text)
110	if debug: pywikibot.stdout(' Found transclusion in link: "{}".'.format(ch_link.group(0)))
111	if ch_link:
112	ch_link_match = ch_link.group(0)
113	ch_num_pattern = re.compile("[0-9]+")
114	ch_num = ch_num_pattern.search(ch_link_match)
115	if ch_num:
116	ch_num_match = int(ch_num.group(0))
117	if ch_num_match >= 0 and ch_num_match <= 14:
118	ch_name = chapter_names[ch_num_match]
119	replace_pattern = re.compile(r"{{C" + ch_num.group(0) + r"}}")
120	link_text = replace_pattern.sub(ch_name, link_text)
121	if debug: pywikibot.stdout(' After performing transclusion, link is now "{}".'.format(link_text))
122	else:
123	if not name_printed and not debug:
124	pywikibot.stdout('From page "{}":'.format(page_name))
125	name_printed = 1
126	pywikibot.stdout(' ADVICE: Link {0} transcludes a chapter name using an out-of-range number, {1}.'.format(link_text, ch_num_match))
127	advice_issued += 1
128	continue
129	else:
130	if not name_printed and not debug:
131	pywikibot.stdout('From page "{}":'.format(page_name))
132	name_printed = 1
133	pywikibot.stdout(' ADVICE: Link {} seems to be transcluding a chapter name, but this script couldn\'t read it.'.format(link_text))
134	advice_issued += 1
135	continue
136	else:
137	if not name_printed and not debug:
138	pywikibot.stdout('From page "{}":'.format(page_name))
139	name_printed = 1
140	pywikibot.stdout(' ADVICE: Link {0} seems to use transclusion. This script can understand chapter name transclusions such as "{1}" but it doesn\'t recognize this one so it can\'t be verified. You should check the link manually.'.format(link_text, "{{C7}}"))
141	advice_issued += 1
142	continue
143
144	# If this is a relative "/" link, use the current page as the basis for the URL. Note
145	# that only a leading slash is looked for, so if there's multiple steps down ("/x/y"),
146	# we're out of luck.
147	if link_text.startswith('/'):
148	link_text = page_name + link_text
149	if debug: pywikibot.stdout(' Changed link_text to {} on account of "/".'.format(link_text))
150
151	# If this is a relative "../" link, find the parent page, set ourselves to that page,
152	# then remove the relative portion of the link. Note that this is only performed once,
153	# so if there's multiple steps back ("../../"), we're out of luck.
154	if link_text.startswith('../'):
155	last_slash = page_name.rfind('/')
156	page_name2 = page_name[0:last_slash]
157	if debug: pywikibot.stdout(' Changed page_name to {} on account of "../".'.format(page_name2))
158	link_text = link_text[3:len(link_text)]
159	if debug: pywikibot.stdout(' Changed link_text to {} on account of "../".'.format(link_text))
160	# If this is now going to be a bare section link for the parent page, don't add a
161	# slash, otherwise do because we are drilling down to another subpage
162	if link_text.startswith('#'):
163	link_text = page_name2 + link_text
164	else:
165	link_text = page_name2 + '/' + link_text
166
167	# If this is a bare section link, build URL based on this page
168	if link_text.startswith('#'):
169	iw_url = onigalore_url + page_name2
170	iw_found += 1
171	if debug: pywikibot.stdout(' Found link to this very page, {}.'.format(link_text))
172	found_iw_match = True
173	link_text = page_name2 + link_text
174
175	# If there's no ":" in the link (before the section link, where a colon would just be
176	# part of the text) then it's a Main namespace article; proceed with building URL
177	if found_iw_match == False:
178	if not re.search(":.*#", link_text):
179	iw_url = onigalore_url + link_text
180	iw_found += 1
181	if debug: pywikibot.stdout(' Link is to a Main namespace page.')
182	found_iw_match = True
183
184	# If there is a ":", match the prefix against the intrawiki prefixes on OniGalore
185	# before building URL
186	if found_iw_match == False:
187	for prefix in intrawiki_prefixes:
188	if prefix + ":" in link_text:
189	iw_url = onigalore_url + link_text
190	if debug: pywikibot.stdout(' Identified namespace {}.'.format(prefix))
191	iw_found += 1
192	found_iw_match = True
193	break
194
195	# If we still haven't turned this match into a URL, something's gone wrong
196	if (found_iw_match == False) or (iw_url == ""):
197	if not name_printed and not debug:
198	pywikibot.stdout('From page "{}":'.format(page_name))
199	name_printed = 1
200	pywikibot.stdout(' ERROR: Couldn\'t figure out link {}.'.format(link_text))
201	continue
202
203	# Test the URL
204	iw_url = iw_url.replace(' ', '_')
205	if debug: pywikibot.stdout(' Reading page at {}...'.format(iw_url))
206	response = fetch(iw_url)
207
208	# Redirects are followed automatically by fetch() and treated as "200"s; the way we can
209	# tell that a redirect occurred is by checking fetch's history
210	if response.history != []:
211	if not name_printed and not debug:
212	pywikibot.stdout('From page "{}":'.format(page_name))
213	name_printed = 1
214	pywikibot.stdout(' ADVICE: Got redirection code ({0}) on URL "{1}". You should check the link manually.'.format(response.history[0], iw_url))
215	advice_issued += 1
216	elif response.status_code != 200:
217	if not name_printed and not debug:
218	pywikibot.stdout('From page "{}":'.format(page_name))
219	name_printed = 1
220	pywikibot.stdout(' ERROR: Got response code {0} on URL {1}. The target page may not exist.'.format(response.status_code, iw_url))
221	errors_issued += 1
222	else:
223	# Isolate section link
224	pre_section, section_name = link_text.split('#', 1)
225	if debug: pywikibot.stdout(' Searching for section link {} on page.'.format(section_name))
226
227	# Convert slash character to the dot-notation hex encoding that MediaWiki uses
228	section_name = section_name.replace('/', '.2F')
229
230	# Read linked page to see if it really has this anchor link
231	soup = BeautifulSoup(response.text, 'html.parser')
232	found_section = False
233	for span_tag in soup.findAll('span'):
234	span_name = span_tag.get('id', None)
235	if span_name == section_name:
236	if debug: pywikibot.stdout(' Found section!')
237	found_section = True
238	break
239	if found_section == False:
240	if not name_printed and not debug:
241	pywikibot.stdout('From page "{}":'.format(page_name))
242	name_printed = 1
243	pywikibot.stdout(' ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section))
244	errors_issued += 1
245
246	def main(*args):
247	global debug
248	global pages_checked
249	global iw_found
250	global advice_issued
251	global errors_issued
252	search_cat = ''
253	search_page = ''
254
255	local_args = pywikibot.handle_args(args)
256	genFactory = pagegenerators.GeneratorFactory()
257
258	for arg in local_args:
259	if arg.startswith('-cat:'):
260	search_cat = arg[5:]
261	elif arg.startswith('-page:'):
262	search_page = arg[6:]
263	elif arg == '-dbg':
264	debug = 1
265	else:
266	pywikibot.stdout('Unknown argument "{}".'.format(arg))
267	return
268
269	site = pywikibot.Site()
270
271	# This line of code enumerates the methods in the 'page' class
272	#pywikibot.stdout(format(dir(page)))
273
274	if search_cat != '':
275	cat_obj = pywikibot.Category(site, search_cat)
276	generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
277	for page in pagegenerators.PreloadingGenerator(generator, 100):
278	if debug: pywikibot.stdout('Checking page {0}'.format(page.title()))
279	scan_for_intrawiki_links(page.text, page.title())
280	elif search_page != '':
281	page = pywikibot.Page(site, search_page)
282	if debug: pywikibot.stdout('Checking page {0}'.format(page.title()))
283	scan_for_intrawiki_links(page.text, page.title())
284
285	page_str = "pages"
286	if pages_checked == 1:
287	page_str = "page"
288
289	link_str = "links"
290	if iw_found == 1:
291	link_str = "link"
292
293	pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
294	pywikibot.stdout('While attempting to follow section links...')
295
296	if advice_issued == 0:
297	pywikibot.stdout(' No advice on potential problems was issued.')
298	elif advice_issued == 1:
299	pywikibot.stdout(' 1 piece of advice on a potential problem was issued.')
300	else:
301	pywikibot.stdout(' {} pieces of advice on potential problems were issued.'.format(advice_issued))
302
303	error_str = "errors were"
304	if errors_issued == 1:
305	error_str = "error was"
306	pywikibot.stdout(' {0} {1} encountered.'.format(errors_issued, error_str))
307
308	if __name__ == '__main__':
309	main()