root/Oni2/ValBot/Python/check_intrawiki_section_links.py
Revision: 1186
Committed: Mon Nov 20 02:18:07 2023 UTC (22 months, 3 weeks ago) by iritscen
Content type: text/x-python
File size: 18613 byte(s)
Log Message:
ValBot: Forgot to update one variable name in check_intrawiki_section_links.py.

File Contents

# Check Intrawiki Section Links
# by iritscen@yahoo.com
# Looks at each wikilink on a page (or in all the pages in a category) for a section link ('#'),
# and loads the linked page and verifies that the named section actually exists. It also
# understands section links generated through a call to Template:SectionLink.
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- --|
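#
# Example invocation (a sketch; it assumes the script is launched through Pywikibot's standard
# "pwb.py" wrapper, so adjust to however ValBot scripts are normally run in your setup):
#   python pwb.py check_intrawiki_section_links -cat:"Some category"    # checks every page in the category (recursively)
#   python pwb.py check_intrawiki_section_links -page:"Some page" -dbg  # checks one page, with verbose output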

import os

from urllib.parse import urljoin

import pywikibot
import re

from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot import pagegenerators
from pywikibot.tools.formatter import color_format
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot
from bs4 import BeautifulSoup

# Tuple of OniGalore's namespaces
intrawiki_prefixes = ('Image', 'Special', 'Talk', 'User', 'User_talk', 'OniGalore', 'OniGalore_talk', 'File', 'File_talk', 'MediaWiki', 'MediaWiki_talk', 'Template', 'Template_talk', 'Help', 'Help_talk', 'Category', 'Category_talk', 'BSL', 'BSL_talk', 'OBD', 'OBD_talk', 'AE', 'AE_talk', 'Oni2', 'Oni2_talk', 'XML', 'XML_talk')

# URL for main namespace of our wiki
onigalore_url = 'https://wiki.oni2.net/'

# Tuple of interwiki prefixes, for recognizing and passing over such links
interwiki_prefixes = ('acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw', 'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp')

# List of chapter names, for substitution into links that use "{{Cn}}" transclusion
chapter_names = ['CHAPTER_00_._COMBAT_TRAINING', 'CHAPTER_01_._TRIAL_RUN', 'CHAPTER_02_._ENGINES_OF_EVIL', 'CHAPTER_03_._PUZZLE_PIECES', 'CHAPTER_04_._TIGER_BY_THE_TAIL', 'CHAPTER_05_._HOT_PURSUIT', 'CHAPTER_06_._COUNTERATTACK', 'CHAPTER_07_._A_FRIEND_IN_NEED', 'CHAPTER_08_._AN_INNOCENT_LIFE', 'CHAPTER_09_._TRUTH_AND_CONSEQUENCES', 'CHAPTER_10_._CAT_AND_MOUSE', 'CHAPTER_11_._DREAM_DIVER', 'CHAPTER_12_._SINS_OF_THE_FATHER', 'CHAPTER_13_._PHOENIX_RISING', 'CHAPTER_14_._DAWN_OF_THE_CHRYSALIS']
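# For example, scan_for_intrawiki_links() below expands a link written as
# "Quotes/Diary#{{C3}}" to "Quotes/Diary#CHAPTER_03_._PUZZLE_PIECES" before checking it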

# Tuple of patterns for recognizing wikilinks
# Pattern 1: Detect "[[anything]]", "[[any:thing]]", "[[any|thing]]", "[[any:thi|ng]]"
# Pattern 2: Detect "{{SectionLink|Page|Section name}}", "{{SectionLink||Section name}}"
link_patterns = (r"\[\[[^|\]]*(\||\])", r"\{\{SectionLink\|[^|\}]*\|[^|\}]*\}\}")
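# For example, in "[[Page name#Section name|label]]" pattern 1 matches "[[Page name#Section name|",
# and pattern 2 matches an entire "{{SectionLink|Page name|Section name}}" call; the surrounding
# markup is trimmed off in scan_for_intrawiki_links() below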

# Initialize globals
debug = 0
pages_checked = 0
iw_found = 0
advice_issued = 0
errors_issued = 0
name_printed = 0

# Prints the name of a page on which something occurred, if it has not been printed before
def possibly_print(page_name):
    global debug
    global name_printed

    if not name_printed and not debug:
        pywikibot.stdout('')
        pywikibot.stdout('From page "{}":'.format(page_name))
        name_printed = 1

# Search a page for the section specified in the link
def find_section(page_text, page_name, page_slug, print_result):
    global errors_issued

    # Isolate section link
    target_page_name, anchor_name = page_slug.split('#', 1)
    target_page_name_human = target_page_name.replace('_', ' ')
    if debug: pywikibot.stdout(' Searching for section link {} on page.'.format(anchor_name))

    # Convert slash character to the dot-notation hex encoding that MediaWiki uses
    anchor_name = anchor_name.replace('/', '.2F')
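    # (e.g. a link to section "Foo/Bar" is looked up below as the element id "Foo.2FBar")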

    # Read linked page to see if it really has this anchor link
    soup = BeautifulSoup(page_text, 'html.parser')
    found_section = False
    for span_tag in soup.findAll('span'):
        span_name = span_tag.get('id', None)
        if span_name == anchor_name:
            if debug and not print_result: pywikibot.stdout(' Found section in a span!')
            found_section = True
            break
    if found_section == False:
        # Search for a div with this ID
        for span_tag in soup.findAll('div'):
            span_name = span_tag.get('id', None)
            if span_name == anchor_name:
                if debug and not print_result: pywikibot.stdout(' Found section in a div!')
                found_section = True
                break
    if found_section == False:
        possibly_print(page_name)
        pywikibot.stdout(' ERROR: Could not find section "{0}" on page {1}!'.format(anchor_name, target_page_name_human))
        errors_issued += 1
    elif debug and print_result:
        pywikibot.stdout(' The section "{0}" was found on page "{1}".'.format(anchor_name, target_page_name_human))

# For a link that redirected us to another page, extract the name of the target page from
# the target page's source
def find_canonical_link(page_text, page_name, page_slug):
    global errors_issued

    # The name of the redirected-to page appears in the target page's source in markup like:
    # <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/>
    # "wgPageName":"Namespace:Page_name",
    canonical_name = page_text.split('"wgPageName":"')[-1]
    tag_end = canonical_name.find('",')

    if tag_end == -1:
        pywikibot.stdout(' ERROR: The link "{}" is a redirect page, but this script could not isolate the target page name.'.format(page_slug))
        errors_issued = errors_issued + 1
    else:
        canonical_name = canonical_name[:tag_end]
        if len(canonical_name) > 100:
            # Certain things can cause the trim to fail; report error and avoid slamming the
            # output with massive page source from a failed trim
            pywikibot.stdout(' ERROR: The link "{0}" is a redirect to "{1}…" (string overflow).'.format(page_slug, canonical_name[:100]))
            errors_issued = errors_issued + 1
        else:
            canonical_name = canonical_name.replace('_', ' ')
            if '#' in page_slug:
                _, anchor_name = page_slug.split('#')
                if debug: pywikibot.stdout(' The link "{0}" is a redirect to "{1}#{2}", which is a valid page. Checking section link….'.format(page_slug, canonical_name, anchor_name))
                find_section(page_text, page_name, page_slug, True)
            else:
                pywikibot.stdout(' The link "{0}" is a redirect to "{1}", which is a valid page.'.format(page_slug, canonical_name))

# Test an intrawiki link and look for a section link if applicable
def test_intrawiki_link(iw_url, page_name, page_slug):
    global advice_issued
    global errors_issued

    response = fetch(iw_url)

    # One way we tell that a redirect occurred is by checking fetch's history, as it
    # automatically follows redirects. This will catch formal redirects which come from pages
    # such as Special:PermanentLink.
    if response.history != []:

        permalink1 = 'Special:PermanentLink/'.lower()
        permalink2 = 'Special:Permalink/'.lower()
        page_slug_lower = page_slug.lower()
        if page_slug_lower.startswith(permalink1) or page_slug_lower.startswith(permalink2):
            if debug:
                possibly_print(page_name)
                pywikibot.stdout(' Got redirection code "{0}" for permanent revision link "{1}". Checking the target page….'.format(response.history[0], page_slug))
            find_canonical_link(response.text, page_name, page_slug)
        else:
            possibly_print(page_name)
            pywikibot.stdout(' ERROR: Unrecognized type of redirection (code "{0}") for link "{1}". You should check the link manually.'.format(response.history[0], page_slug))
            advice_issued += 1
    elif response.status_code != 200:
        possibly_print(page_name)
        pywikibot.stdout(' ERROR: Got response code {0} on URL {1}. The target page may not exist.'.format(response.status_code, iw_url))
        errors_issued += 1
    # However, the usual way a redirect occurs is a soft redirect: when a redirect page is
    # accessed, MediaWiki quietly serves the target page while returning code OK 200 as if the
    # link was correct. We must detect these soft redirects by looking at the page source to
    # find the redirect note inserted at the top of the page for the reader.
    elif 'Redirected from <a' in response.text:
        if debug:
            possibly_print(page_name)
            pywikibot.stdout(' Got silently redirected by link "{}". Checking the target page….'.format(page_slug))
        find_canonical_link(response.text, page_name, page_slug)
    else: # URL is OK, so proceed
        find_section(response.text, page_name, page_slug, False)

# Searches the given page text for intrawiki links with section links in them
def scan_for_intrawiki_links(page_text, page_name):
    global debug
    global pages_checked
    global iw_found
    global advice_issued
    global errors_issued
    global name_printed
    pages_checked += 1
    name_printed = 0

    for i, the_pattern in enumerate(link_patterns):
        if debug:
            if i == 0:
                pywikibot.stdout(' Checking page for wikilinks with section names.')
            elif i == 1:
                pywikibot.stdout(' Checking page for {{SectionLink}} calls.')

        for match in re.finditer(the_pattern, page_text):
            found_iw_match = False
            iw_url = ""
            page_name2 = page_name

            # Cut out the matched text from the page, isolating just the page+section name
            target_start = 2 # "[["
            target_end = 1 # "|" or "]" (we only match the first ending bracket)
            if i == 1:
                target_start = 14 # "{{SectionLink|"
                target_end = 2 # "}}"
            s = match.start() + target_start # remove the link-opening markup
            e = match.end() - target_end # remove the link-ending markup
            page_slug = page_text[s:e]
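            # e.g. a pattern-1 match "[[Page name#Section name|" yields "Page name#Section name",
            # while a pattern-2 match "{{SectionLink|Page name|Section name}}" yields "Page name|Section name"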

            # The second link type will look like "Page|Section" or "|Section", so fix that pipe
            if i == 1:
                page_slug = page_slug.replace('|', '#')

            # Sometimes we use a space char. instead of a '_', so fix that before querying
            page_slug = page_slug.replace(' ', '_')
            if debug: pywikibot.stdout(' Found link {0}.'.format(page_slug))

            # If this link doesn't have a section link in it, then we don't care about it, as
            # MediaWiki takes care of checking basic intrawiki links
            if not '#' in page_slug:
                if debug: pywikibot.stdout(' Link doesn\'t have a section anchor in it. Skipping.')
                continue

            # If this link has an interwiki prefix, it can be ignored; see check_interwiki_links.py
            # for the task of checking interwiki page+section links
            is_interwiki = False
            if found_iw_match == False:
                for prefix in interwiki_prefixes:
                    if prefix + ":" in page_slug:
                        if debug: pywikibot.stdout(' Skipping link {} because it is an interwiki link.'.format(page_slug))
                        is_interwiki = True
                        break
            if is_interwiki:
                continue

            # If there is a '{' in the link, then probably it's a link built on transcluded text.
            # If it's a chapter template transclusion like "Quotes/Diary#{{C3}}", expand it using
            # our "chapter_names" array. If it's another type of transclusion, punt it to the user.
            if '{' in page_slug:
                ch_link_pattern = re.compile(r"{{C[0-9]*}}")
                ch_link = ch_link_pattern.search(page_slug)
                if ch_link:
                    if debug: pywikibot.stdout(' Found transclusion in link: "{}".'.format(ch_link.group(0)))
                    ch_link_match = ch_link.group(0)
                    ch_num_pattern = re.compile("[0-9]+")
                    ch_num = ch_num_pattern.search(ch_link_match)
                    if ch_num:
                        ch_num_match = int(ch_num.group(0))
                        if ch_num_match >= 0 and ch_num_match <= 14:
                            ch_name = chapter_names[ch_num_match]
                            replace_pattern = re.compile(r"{{C" + ch_num.group(0) + r"}}")
                            page_slug = replace_pattern.sub(ch_name, page_slug)
                            if debug: pywikibot.stdout(' After performing transclusion, link is now "{}".'.format(page_slug))
                        else:
                            possibly_print(page_name)
                            pywikibot.stdout(' ERROR: Link {0} transcludes a chapter name using an out-of-range number, {1}.'.format(page_slug, ch_num_match))
                            errors_issued += 1
                            continue
                    else:
                        possibly_print(page_name)
                        pywikibot.stdout(' ADVICE: Link {} seems to be transcluding a chapter name, but this script couldn\'t read it.'.format(page_slug))
                        advice_issued += 1
                        continue
                else:
                    possibly_print(page_name)
                    pywikibot.stdout(' ADVICE: Link {0} seems to use transclusion. This script can understand chapter name transclusions such as "{1}" but it doesn\'t recognize this one so it can\'t be verified. You should check the link manually.'.format(page_slug, "{{C7}}"))
                    advice_issued += 1
                    continue

            # If this is a relative "/" link, use the current page as the basis for the URL. Note
            # that only a leading slash is looked for, so if there's multiple steps down ("/x/y"),
            # we're out of luck.
            if page_slug.startswith('/'):
                page_slug = page_name + page_slug
                if debug: pywikibot.stdout(' Changed page_slug to {} on account of "/".'.format(page_slug))

            # If this is a relative "../" link, find the parent page, set ourselves to that page,
            # then remove the relative portion of the link. Note that this is only performed once,
            # so if there's multiple steps back ("../../"), we're out of luck.
            if page_slug.startswith('../'):
                last_slash = page_name.rfind('/')
                page_name2 = page_name[0:last_slash]
                if debug: pywikibot.stdout(' Changed page_name to {} on account of "../".'.format(page_name2))
                page_slug = page_slug[3:len(page_slug)]
                if debug: pywikibot.stdout(' Changed page_slug to {} on account of "../".'.format(page_slug))
                # If this is now going to be a bare section link for the parent page, don't add a
                # slash, otherwise do because we are drilling down to another subpage
                if page_slug.startswith('#'):
                    page_slug = page_name2 + page_slug
                else:
                    page_slug = page_name2 + '/' + page_slug

            # If this is a bare section link, build URL based on this page
            if page_slug.startswith('#'):
                iw_url = onigalore_url + page_name2
                iw_found += 1
                if debug: pywikibot.stdout(' Found link to this very page, {}.'.format(page_slug))
                found_iw_match = True
                page_slug = page_name2 + page_slug

            # If there's no ":" in the link (before the section link, where a colon would just be
            # part of the text) then it's a Main namespace article; proceed with building URL
            if found_iw_match == False:
                if not re.search(":.*#", page_slug):
                    iw_url = onigalore_url + page_slug
                    iw_found += 1
                    if debug: pywikibot.stdout(' Link is to a Main namespace page.')
                    found_iw_match = True

            # If there is a ":", match the prefix against the intrawiki prefixes on OniGalore
            # before building URL
            if found_iw_match == False:
                for prefix in intrawiki_prefixes:
                    if prefix + ":" in page_slug:
                        iw_url = onigalore_url + page_slug
                        if debug: pywikibot.stdout(' Identified namespace {}.'.format(prefix))
                        iw_found += 1
                        found_iw_match = True
                        break

            # If we still haven't turned this match into a URL, something's gone wrong
            if (found_iw_match == False) or (iw_url == ""):
                possibly_print(page_name)
                pywikibot.stdout(' ERROR: Couldn\'t figure out link {}.'.format(page_slug))
                continue

            # Test the URL
            iw_url = iw_url.replace(' ', '_')
            if debug: pywikibot.stdout(' Reading page at {}….'.format(iw_url))
            test_intrawiki_link(iw_url, page_name, page_slug)

# Print a wrap-up message
def print_summary():
    global pages_checked
    global iw_found
    global advice_issued
    global errors_issued

    page_str = "pages"
    if pages_checked == 1:
        page_str = "page"

    link_str = "links"
    if iw_found == 1:
        link_str = "link"

    pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
    pywikibot.stdout('While attempting to follow section links….')

    if advice_issued == 0:
        pywikibot.stdout(' No advice on potential problems was issued.')
    elif advice_issued == 1:
        pywikibot.stdout(' 1 piece of advice on a potential problem was issued.')
    else:
        pywikibot.stdout(' {} pieces of advice on potential problems were issued.'.format(advice_issued))

    error_str = "errors were"
    if errors_issued == 1:
        error_str = "error was"
    pywikibot.stdout(' {0} {1} encountered.'.format(errors_issued, error_str))

# Main function
def main(*args):
    global debug
    search_cat = ''
    search_page = ''

    # Process arguments
    local_args = pywikibot.handle_args(args)
    for arg in local_args:
        if arg.startswith('-cat:'):
            search_cat = arg[5:]
        elif arg.startswith('-page:'):
            search_page = arg[6:]
        elif arg == '-dbg':
            debug = 1
        else:
            pywikibot.stdout('Unknown argument "{}".'.format(arg))
            return

    site = pywikibot.Site()

    # This line of code enumerates the methods in the 'page' class
    #pywikibot.stdout(format(dir(page)))

    # Check specified page or loop through specified category and check all pages
    if search_cat != '':
        cat_obj = pywikibot.Category(site, search_cat)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        for page in pagegenerators.PreloadingGenerator(generator, 100):
            if debug: pywikibot.stdout('Checking page {0}'.format(page.title()))
            scan_for_intrawiki_links(page.text, page.title())
    elif search_page != '':
        page = pywikibot.Page(site, search_page)
        if debug: pywikibot.stdout('Checking page {0}'.format(page.title()))
        scan_for_intrawiki_links(page.text, page.title())

    # Print the results
    print_summary()

if __name__ == '__main__':
    main()
387 main()