ViewVC Help
View File | Revision Log | View Changeset | Root Listing
root/Oni2/ValBot/Python/check_intrawiki_section_links.py
Revision: 1194
Committed: Mon Nov 18 04:00:08 2024 UTC (10 months, 3 weeks ago) by iritscen
Content type: text/x-python
File size: 19822 byte(s)
Log Message:
ValBot: check_intrawiki_section_links.py now understands text fragment directives.

File Contents

# Content
1 # Check Intrawiki Section Links
2 # by iritscen@yahoo.com
3 # Looks at each wikilink on a page (or in all the pages in a category) for a section link ('#'),
4 # and loads the linked page and verifies that the named section actually exists. It also
5 # understands section links generated through a call to Template:SectionLink.
6 # Recommended viewing width:
7 # |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- --|
8
9 import os
10
11 from urllib.parse import urljoin
12
13 import pywikibot
14 import re
15
16 from pywikibot.bot import QuitKeyboardInterrupt
17 from pywikibot import pagegenerators
18 from pywikibot.tools.formatter import color_format
19 from pywikibot.comms.http import fetch
20 from pywikibot.specialbots import UploadRobot
21 from bs4 import BeautifulSoup
22
# Namespaces that exist on OniGalore; a link carrying one of these prefixes is treated as an
# intrawiki link to a page outside the Main namespace
intrawiki_prefixes = (
    'Image', 'Special', 'Talk', 'User', 'User_talk',
    'OniGalore', 'OniGalore_talk',
    'File', 'File_talk',
    'MediaWiki', 'MediaWiki_talk',
    'Template', 'Template_talk',
    'Help', 'Help_talk',
    'Category', 'Category_talk',
    'BSL', 'BSL_talk',
    'OBD', 'OBD_talk',
    'AE', 'AE_talk',
    'Oni2', 'Oni2_talk',
    'XML', 'XML_talk',
)

# Base URL that page names are appended to in order to fetch a page of our wiki
onigalore_url = 'https://wiki.oni2.net/'

# Interwiki prefixes; links carrying one of these are recognized and passed over
interwiki_prefixes = (
    'acronym', 'cache', 'commons', 'dictionary', 'google',
    'metawikimedia', 'mw',
    'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia',
    'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage',
    'wikt', 'wiktionary', 'wp',
)

# Chapter names, indexed by chapter number, for substitution into links that use "{{Cn}}"
# transclusion
chapter_names = [
    'CHAPTER_00_._COMBAT_TRAINING',
    'CHAPTER_01_._TRIAL_RUN',
    'CHAPTER_02_._ENGINES_OF_EVIL',
    'CHAPTER_03_._PUZZLE_PIECES',
    'CHAPTER_04_._TIGER_BY_THE_TAIL',
    'CHAPTER_05_._HOT_PURSUIT',
    'CHAPTER_06_._COUNTERATTACK',
    'CHAPTER_07_._A_FRIEND_IN_NEED',
    'CHAPTER_08_._AN_INNOCENT_LIFE',
    'CHAPTER_09_._TRUTH_AND_CONSEQUENCES',
    'CHAPTER_10_._CAT_AND_MOUSE',
    'CHAPTER_11_._DREAM_DIVER',
    'CHAPTER_12_._SINS_OF_THE_FATHER',
    'CHAPTER_13_._PHOENIX_RISING',
    'CHAPTER_14_._DAWN_OF_THE_CHRYSALIS',
]

# Regular expressions for recognizing wikilinks, in the order they are tried
link_patterns = (
    # "[[anything]]", "[[any:thing]]", "[[any|thing]]", "[[any:thi|ng]]"; the match stops
    # at the first "|" or "]"
    r"\[\[[^|\]]*(\||\])",
    # "{{SectionLink|Page|Section name}}", "{{SectionLink||Section name}}"
    r"\{\{SectionLink\|[^|\}]*\|[^|\}]*\}\}",
)

# Run-wide state shared by the functions below
debug = 0           # set to 1 by the "-dbg" argument
pages_checked = 0   # number of pages scanned so far
iw_found = 0        # number of intrawiki section links found so far
advice_issued = 0   # number of ADVICE-level reports counted so far
errors_issued = 0   # number of ERROR-level reports counted so far
name_printed = 0    # whether the current page's heading has been printed yet
47
48 # Prints the name of a page on which something occurred, if it has not been printed before
def possibly_print(page_name):
    """Print a 'From page "<page_name>":' heading, at most once per scanned page.

    Nothing is printed if the "name_printed" global is already set or if debug mode is on.
    After printing, "name_printed" is set so that later calls are no-ops until it is reset.
    """
    global name_printed

    if name_printed or debug:
        return

    pywikibot.stdout('')
    pywikibot.stdout('From page "{}":'.format(page_name))
    name_printed = 1
57
58 # Search a page for the section specified in the link
def find_section(page_text, page_name, page_slug, print_result):
    """Check that the anchor named in a link exists in the fetched target page.

    page_text    -- HTML source of the page that the link points to
    page_name    -- name of the page the link was found on (used for the error heading)
    page_slug    -- link target in the form "Page_name#anchor"; it must contain a '#'
    print_result -- when debugging, selects which success message is printed: the full
                    'The section "x" was found on page "y".' form if true, the terse
                    "Found ..." form if false

    If the anchor is a text fragment directive (":~:text=..."), the text of the first
    directive is searched for in the page source. Otherwise, or if that text is not found,
    a <span> or <div> whose ID equals the anchor is looked for. A failure is printed as an
    ERROR and counted in the "errors_issued" global. Nothing is returned.
    """
    global errors_issued
    found_section = False

    # Isolate section link or text fragment link
    target_page_name, anchor_name = page_slug.split('#', 1)
    target_page_name_human = target_page_name.replace('_', ' ')

    # First check if this is a text fragment directive, and look for it if so
    directive_marker = ':~:text='
    if anchor_name.startswith(directive_marker):
        if debug: pywikibot.stdout(' Found text fragment directive {} from URL {}.'.format(anchor_name, page_slug))
        anchor_name = anchor_name[len(directive_marker):]
        # We're only checking the first text directive, so strip add'l ones if present
        anchor_name = anchor_name.split('&text=', 1)[0]
        search_terms = anchor_name.split(',')
        # Delete prefix and suffix terms because they aren't needed. The list is re-checked
        # before looking at the suffix because a directive consisting of nothing but a
        # prefix term leaves it empty.
        if search_terms and search_terms[0].endswith('-'):
            search_terms.pop(0)
        if search_terms and search_terms[-1].startswith('-'):
            search_terms.pop()
        # Remake text directive with the terms separated by spaces as they should be in the
        # page text
        search_string = ' '.join(search_terms)
        if debug: pywikibot.stdout(' Converted text fragment to string "{}".'.format(search_string))
        # An empty string is a substring of every page, so it must not count as a match
        if search_string and search_string in page_text:
            found_section = True
            if debug and not print_result: pywikibot.stdout(' Found text fragment!')

    # If we're still here, it's a section link (or a text fragment that wasn't found); read
    # linked page to see if it really has this anchor link
    if not found_section:
        if debug: pywikibot.stdout(' Searching for section link {} on page.'.format(anchor_name))
        soup = BeautifulSoup(page_text, 'html.parser')
        # Search for a span with this ID, then for a div with this ID
        for tag_name in ('span', 'div'):
            if any(tag.get('id', None) == anchor_name for tag in soup.find_all(tag_name)):
                if debug and not print_result: pywikibot.stdout(' Found section in a {}!'.format(tag_name))
                found_section = True
                break

    if not found_section:
        possibly_print(page_name)
        pywikibot.stdout(' ERROR: Could not find section "{0}" on page {1}!'.format(anchor_name, target_page_name_human))
        errors_issued += 1
    elif debug and print_result:
        pywikibot.stdout(' The section "{0}" was found on page "{1}".'.format(anchor_name, target_page_name_human))
115
116 # For a link that redirected us to another page, extract the name of the target page from
117 # the target page's source
def find_canonical_link(page_text, page_name, page_slug):
    """Report where a redirecting link ended up and check its section anchor, if any.

    page_text -- HTML source of the page that was actually served after the redirect
    page_name -- name of the page the link was found on (used for the error heading)
    page_slug -- the link target as written, e.g. "Page_name#anchor"

    The name of the redirected-to page is read from the "wgPageName" value in the page
    source. If it cannot be isolated, an ERROR is printed and counted in the
    "errors_issued" global. Otherwise, a link with a '#' in it is handed to find_section()
    to be checked against this page source, and a link without one is simply reported as a
    valid redirect. Nothing is returned.
    """
    global errors_issued

    # Extract the page name from this markup in the page source:
    # "wgPageName":"Namespace:Page_name",
    canonical_name = page_text.split('"wgPageName":"')[-1]
    tag_end = canonical_name.find('",')

    if tag_end == -1:
        possibly_print(page_name)
        pywikibot.stdout(' ERROR: The link "{}" is a redirect page, but this script could not isolate the target page name.'.format(page_slug))
        errors_issued += 1
        return

    canonical_name = canonical_name[:tag_end]
    if len(canonical_name) > 100:
        # Certain things can cause the trim to fail; report error and avoid slamming the
        # output with massive page source from a failed trim
        possibly_print(page_name)
        pywikibot.stdout(' ERROR: The link "{0}" is a redirect to "{1}…" (string overflow).'.format(page_slug, canonical_name[:100]))
        errors_issued += 1
        return

    canonical_name = canonical_name.replace('_', ' ')
    if '#' in page_slug:
        if debug:
            # Split only on the first '#' so that an anchor containing '#' doesn't break this
            anchor_name = page_slug.split('#', 1)[1]
            pywikibot.stdout(' The link "{0}" is a redirect to "{1}#{2}", which is a valid page. Checking section link….'.format(page_slug, canonical_name, anchor_name))
        find_section(page_text, page_name, page_slug, True)
    else:
        pywikibot.stdout(' The link "{0}" is a redirect to "{1}", which is a valid page.'.format(page_slug, canonical_name))
143
144 # Test an intrawiki link and look for a section link if applicable
def test_intrawiki_link(iw_url, page_name, page_slug):
    """Fetch the page a link points to and check the link's section anchor on it.

    iw_url    -- full URL of the target page
    page_name -- name of the page the link was found on (used for the report heading)
    page_slug -- the link target, e.g. "Page_name#anchor"

    Depending on the response, the link is handed to find_canonical_link() (redirects) or
    find_section() (plain pages), or a problem is printed and counted in the
    "advice_issued" or "errors_issued" global. Nothing is returned.
    """
    global advice_issued
    global errors_issued

    response = fetch(iw_url)

    # fetch() follows redirects automatically, so a non-empty history tells us that a formal
    # redirect occurred. Those are only expected from permanent revision links.
    if response.history != []:
        slug_lowered = page_slug.lower()
        if slug_lowered.startswith(('special:permanentlink/', 'special:permalink/')):
            if debug:
                possibly_print(page_name)
                pywikibot.stdout(' Got redirection code "{0}" for permanent revision link "{1}". Checking the target page….'.format(response.history[0], page_slug))
            find_canonical_link(response.text, page_name, page_slug)
        else:
            possibly_print(page_name)
            pywikibot.stdout(' ERROR: Unrecognized type of redirection (code "{0}") for link "{1}". You should check the link manually.'.format(response.history[0], page_slug))
            advice_issued += 1
        return

    if response.status_code != 200:
        possibly_print(page_name)
        pywikibot.stdout(' ERROR: Got response code {0} on URL {1}. The target page may not exist.'.format(response.status_code, iw_url))
        errors_issued += 1
        return

    # The usual kind of redirect comes back as a plain 200 response when a redirect page is
    # accessed. Such soft redirects are detected by the note that is inserted at the top of
    # the page for the reader.
    if 'Redirected from <a' in response.text:
        if debug:
            possibly_print(page_name)
            pywikibot.stdout(' Got silently redirected by link "{}". Checking the target page….'.format(page_slug))
        find_canonical_link(response.text, page_name, page_slug)
        return

    # URL is OK, so proceed
    find_section(response.text, page_name, page_slug, False)
182
183 # Searches the given page text for intrawiki links with section links in them
def scan_for_intrawiki_links(page_text, page_name):
    """Find every intrawiki link with a section anchor in a page and test each one.

    page_text -- wikitext of the page to scan
    page_name -- title of that page; used for report headings and for resolving relative
                 ("/Subpage", "../Sibling") and bare ("#Section") links

    Each pattern in "link_patterns" is applied in turn. Links without a '#', and links
    containing an interwiki prefix, are skipped. "{{Cn}}" chapter transclusions are expanded
    from "chapter_names"; other transclusions are reported as ADVICE. Every link that can be
    turned into a URL is passed to test_intrawiki_link(). The "pages_checked", "iw_found",
    "advice_issued" and "errors_issued" globals are updated along the way, and
    "name_printed" is reset for this page. Nothing is returned.
    """
    global pages_checked
    global iw_found
    global advice_issued
    global errors_issued
    global name_printed
    pages_checked += 1
    name_printed = 0

    # Compiled once per page rather than once per link
    ch_link_pattern = re.compile(r"{{C[0-9]*}}")
    ch_num_pattern = re.compile("[0-9]+")

    for i, the_pattern in enumerate(link_patterns):
        if debug:
            if i == 0:
                pywikibot.stdout(' Checking page for wikilinks with section names.')
            elif i == 1:
                pywikibot.stdout(' Checking page for {{SectionLink}} calls.')

        # Number of markup characters to cut from each end of a match
        if i == 1:
            target_start = 14 # "{{SectionLink|"
            target_end = 2 # "}}"
        else:
            target_start = 2 # "[["
            target_end = 1 # "|" or "]" (we only match the first ending bracket)

        for match in re.finditer(the_pattern, page_text):
            found_iw_match = False
            iw_url = ""
            page_name2 = page_name

            # Cut out the matched text from the page, isolating just the page+section name
            page_slug = page_text[match.start() + target_start:match.end() - target_end]

            # The second link type will look like "Page|Section" or "|Section", so fix that pipe
            if i == 1:
                page_slug = page_slug.replace('|', '#')

            # Sometimes we use a space char. instead of a '_', so fix that before querying
            page_slug = page_slug.replace(' ', '_')
            if debug: pywikibot.stdout(' Found link {0}.'.format(page_slug))

            # If this link doesn't have a section link in it, then we don't care about it, as
            # MediaWiki takes care of checking basic intrawiki links
            if '#' not in page_slug:
                if debug: pywikibot.stdout(' Link doesn\'t have a section anchor in it. Skipping.')
                continue

            # If this link has an interwiki prefix, it can be ignored; see check_interwiki_links.py
            # for the task of checking interwiki page+section links
            if any(prefix + ":" in page_slug for prefix in interwiki_prefixes):
                if debug: pywikibot.stdout(' Skipping link {} because it is an interwiki link.'.format(page_slug))
                continue

            # If there is a '{' in the link, then probably it's a link built on transcluded text.
            # If it's a chapter template transclusion like "Quotes/Diary#{{C3}}", expand it using
            # our "chapter_names" array. If it's another type of transclusion, punt it to the user.
            if '{' in page_slug:
                ch_link = ch_link_pattern.search(page_slug)
                if not ch_link:
                    possibly_print(page_name)
                    pywikibot.stdout(' ADVICE: Link {0} seems to use transclusion. This script can understand chapter name transclusions such as "{1}" but it doesn\'t recognize this one so it can\'t be verified. You should check the link manually.'.format(page_slug, "{{C7}}"))
                    advice_issued += 1
                    continue

                # Only report the transclusion once we know that there was a match to report
                ch_link_match = ch_link.group(0)
                if debug: pywikibot.stdout(' Found transclusion in link: "{}".'.format(ch_link_match))

                ch_num = ch_num_pattern.search(ch_link_match)
                if not ch_num:
                    possibly_print(page_name)
                    pywikibot.stdout(' ADVICE: Link {} seems to be transcluding a chapter name, but this script couldn\'t read it.'.format(page_slug))
                    advice_issued += 1
                    continue

                ch_num_match = int(ch_num.group(0))
                if not 0 <= ch_num_match < len(chapter_names):
                    possibly_print(page_name)
                    pywikibot.stdout(' ERROR: Link {0} transcludes a chapter name using an out-of-range number, {1}.'.format(page_slug, ch_num_match))
                    errors_issued += 1
                    continue

                # The matched text is exactly "{{C<digits>}}", so a plain replace expands it
                page_slug = page_slug.replace(ch_link_match, chapter_names[ch_num_match])
                if debug: pywikibot.stdout(' After performing transclusion, link is now "{}".'.format(page_slug))

            # If this is a relative "/" link, use the current page as the basis for the URL. Note
            # that only a leading slash is looked for, so if there's multiple steps down ("/x/y"),
            # we're out of luck.
            if page_slug.startswith('/'):
                page_slug = page_name + page_slug
                if debug: pywikibot.stdout(' Changed page_slug to {} on account of "/".'.format(page_slug))

            # If this is a relative "../" link, find the parent page, set ourselves to that page,
            # then remove the relative portion of the link. Note that this is only performed once,
            # so if there's multiple steps back ("../../"), we're out of luck.
            if page_slug.startswith('../'):
                last_slash = page_name.rfind('/')
                page_name2 = page_name[0:last_slash]
                if debug: pywikibot.stdout(' Changed page_name to {} on account of "../".'.format(page_name2))
                page_slug = page_slug[3:]
                if debug: pywikibot.stdout(' Changed page_slug to {} on account of "../".'.format(page_slug))
                # If this is now going to be a bare section link for the parent page, don't add a
                # slash, otherwise do because we are drilling down to another subpage
                if page_slug.startswith('#'):
                    page_slug = page_name2 + page_slug
                else:
                    page_slug = page_name2 + '/' + page_slug

            # If this is a bare section link, build URL based on this page
            if page_slug.startswith('#'):
                iw_url = onigalore_url + page_name2
                iw_found += 1
                if debug: pywikibot.stdout(' Found link to this very page, {}.'.format(page_slug))
                found_iw_match = True
                page_slug = page_name2 + page_slug

            # If there's no ":" in the link (before the section link, where a colon would just be
            # part of the text) then it's a Main namespace article; proceed with building URL
            if not found_iw_match:
                if not re.search(":.*#", page_slug):
                    iw_url = onigalore_url + page_slug
                    iw_found += 1
                    if debug: pywikibot.stdout(' Link is to a Main namespace page.')
                    found_iw_match = True

            # If there is a ":", match the prefix against the intrawiki prefixes on OniGalore
            # before building URL
            if not found_iw_match:
                for prefix in intrawiki_prefixes:
                    if prefix + ":" in page_slug:
                        iw_url = onigalore_url + page_slug
                        if debug: pywikibot.stdout(' Identified namespace {}.'.format(prefix))
                        iw_found += 1
                        found_iw_match = True
                        break

            # If we still haven't turned this match into a URL, something's gone wrong. This is
            # reported as an ERROR, so it is counted as one for the summary.
            if (not found_iw_match) or (iw_url == ""):
                possibly_print(page_name)
                pywikibot.stdout(' ERROR: Couldn\'t figure out link {}.'.format(page_slug))
                errors_issued += 1
                continue

            # Test the URL
            iw_url = iw_url.replace(' ', '_')
            if debug: pywikibot.stdout(' Reading page at {}….'.format(iw_url))
            test_intrawiki_link(iw_url, page_name, page_slug)
337
338 # Print a wrap-up message
def print_summary():
    """Print the totals of pages checked, links found, advice issued and errors encountered.

    Reads the "pages_checked", "iw_found", "advice_issued" and "errors_issued" globals and
    picks singular or plural wording for each count.
    """
    page_str = "page" if pages_checked == 1 else "pages"
    link_str = "link" if iw_found == 1 else "links"

    pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
    pywikibot.stdout('While attempting to follow section links….')

    if advice_issued == 0:
        advice_line = ' No advice on potential problems was issued.'
    elif advice_issued == 1:
        advice_line = ' 1 piece of advice on a potential problem was issued.'
    else:
        advice_line = ' {} pieces of advice on potential problems were issued.'.format(advice_issued)
    pywikibot.stdout(advice_line)

    error_str = "error was" if errors_issued == 1 else "errors were"
    pywikibot.stdout(' {0} {1} encountered.'.format(errors_issued, error_str))
367
368 # Main function
def main(*args):
    """Parse the command line, scan the requested page(s), and print a summary.

    Recognized arguments (after pywikibot has handled its own):
    -cat:<name>   scan every page in the category, recursing into subcategories
    -page:<name>  scan a single page
    -dbg          turn on debug output

    An unrecognized argument is reported and ends the run without scanning anything. If
    both -cat and -page are given, only the category is scanned.
    """
    global debug
    search_cat = ''
    search_page = ''

    # Process arguments
    for arg in pywikibot.handle_args(args):
        if arg.startswith('-cat:'):
            search_cat = arg[len('-cat:'):]
        elif arg.startswith('-page:'):
            search_page = arg[len('-page:'):]
        elif arg == '-dbg':
            debug = 1
        else:
            pywikibot.stdout('Unknown argument "{}".'.format(arg))
            return

    site = pywikibot.Site()

    # This line of code enumerates the methods in the 'page' class
    #pywikibot.stdout(format(dir(page)))

    # Build the sequence of pages to check: all pages in the specified category, or the one
    # specified page, or nothing at all
    if search_cat != '':
        cat_obj = pywikibot.Category(site, search_cat)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        pages_to_check = pagegenerators.PreloadingGenerator(generator, 100)
    elif search_page != '':
        pages_to_check = [pywikibot.Page(site, search_page)]
    else:
        pages_to_check = []

    for page in pages_to_check:
        if debug: pywikibot.stdout('Checking page {0}'.format(page.title()))
        scan_for_intrawiki_links(page.text, page.title())

    # Print the results
    print_summary()

if __name__ == '__main__':
    main()