root/Oni2/ValBot/Python/check_intrawiki_section_links.py
Revision: 1188
Committed: Tue Jan 23 03:53:05 2024 UTC (20 months, 2 weeks ago) by iritscen
Content type: text/x-python
File size: 18481 byte(s)
Log Message:
ValBot: Removed line converting slashes to HTML notation that now seems to cause a problem rather than solve one.

File Contents

# Check Intrawiki Section Links
# by iritscen@yahoo.com
# Looks at each wikilink on a page (or in all the pages in a category) for a section link ('#'),
# and loads the linked page and verifies that the named section actually exists. It also
# understands section links generated through a call to Template:SectionLink.
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- --|
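#
# Illustrative usage (not part of the original header; it assumes the script is run through
# Pywikibot's standard "pwb.py" wrapper, and the page/category names are placeholders):
#   python pwb.py check_intrawiki_section_links -page:"Quotes/Diary"
#   python pwb.py check_intrawiki_section_links -cat:"Oni" -dbg
# "-page:" checks a single page, "-cat:" checks every page in a category (recursively), and
# "-dbg" turns on verbose output; see main() below for the argument handling.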

import os

from urllib.parse import urljoin

import pywikibot
import re

from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot import pagegenerators
from pywikibot.tools.formatter import color_format
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot
from bs4 import BeautifulSoup

# Tuple of OniGalore's namespaces
intrawiki_prefixes = ('Image', 'Special', 'Talk', 'User', 'User_talk', 'OniGalore', 'OniGalore_talk', 'File', 'File_talk', 'MediaWiki', 'MediaWiki_talk', 'Template', 'Template_talk', 'Help', 'Help_talk', 'Category', 'Category_talk', 'BSL', 'BSL_talk', 'OBD', 'OBD_talk', 'AE', 'AE_talk', 'Oni2', 'Oni2_talk', 'XML', 'XML_talk')

# URL for main namespace of our wiki
onigalore_url = 'https://wiki.oni2.net/'

# Tuple of interwiki prefixes, for recognizing and passing over such links
interwiki_prefixes = ('acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw', 'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp')

# List of chapter names, for substitution into links that use "{{Cn}}" transclusion
chapter_names = ['CHAPTER_00_._COMBAT_TRAINING', 'CHAPTER_01_._TRIAL_RUN', 'CHAPTER_02_._ENGINES_OF_EVIL', 'CHAPTER_03_._PUZZLE_PIECES', 'CHAPTER_04_._TIGER_BY_THE_TAIL', 'CHAPTER_05_._HOT_PURSUIT', 'CHAPTER_06_._COUNTERATTACK', 'CHAPTER_07_._A_FRIEND_IN_NEED', 'CHAPTER_08_._AN_INNOCENT_LIFE', 'CHAPTER_09_._TRUTH_AND_CONSEQUENCES', 'CHAPTER_10_._CAT_AND_MOUSE', 'CHAPTER_11_._DREAM_DIVER', 'CHAPTER_12_._SINS_OF_THE_FATHER', 'CHAPTER_13_._PHOENIX_RISING', 'CHAPTER_14_._DAWN_OF_THE_CHRYSALIS']
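# For example (illustrative note, based on the expansion logic in scan_for_intrawiki_links()
# below): a link written as "Quotes/Diary#{{C3}}" is expanded to
# "Quotes/Diary#CHAPTER_03_._PUZZLE_PIECES" before the target URL is built and checked.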

# Tuple of patterns for recognizing wikilinks
# Pattern 1: Detect "[[anything]]", "[[any:thing]]", "[[any|thing]]", "[[any:thi|ng]]"
# Pattern 2: Detect "{{SectionLink|Page|Section name}}", "{{SectionLink||Section name}}"
link_patterns = (r"\[\[[^|\]]*(\||\])", r"\{\{SectionLink\|[^|\}]*\|[^|\}]*\}\}")
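# Illustrative example (assumed input, showing how the slicing logic in
# scan_for_intrawiki_links() uses these patterns): in the wikitext
# "See [[Quotes/Diary#Chapter 3|the diary]]", pattern 1 matches "[[Quotes/Diary#Chapter 3|";
# trimming the opening "[[" and the trailing "|" leaves the slug "Quotes/Diary#Chapter 3",
# whose spaces are then converted to underscores before the URL is built.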

# Initialize globals
debug = 0
pages_checked = 0
iw_found = 0
advice_issued = 0
errors_issued = 0
name_printed = 0

# Prints the name of a page on which something occurred, if it has not been printed before
def possibly_print(page_name):
    global debug
    global name_printed

    if not name_printed and not debug:
        pywikibot.stdout('')
        pywikibot.stdout('From page "{}":'.format(page_name))
        name_printed = 1

# Search a page for the section specified in the link
def find_section(page_text, page_name, page_slug, print_result):
    global errors_issued

    # Isolate section link
    target_page_name, anchor_name = page_slug.split('#', 1)
    target_page_name_human = target_page_name.replace('_', ' ')
    if debug: pywikibot.stdout(' Searching for section link {} on page.'.format(anchor_name))

    # Read linked page to see if it really has this anchor link
    soup = BeautifulSoup(page_text, 'html.parser')
    found_section = False
    for span_tag in soup.findAll('span'):
        span_name = span_tag.get('id', None)
        if span_name == anchor_name:
            if debug and not print_result: pywikibot.stdout(' Found section in a span!')
            found_section = True
            break
    if found_section == False:
        # Search for a div with this ID
        for span_tag in soup.findAll('div'):
            span_name = span_tag.get('id', None)
            if span_name == anchor_name:
                if debug and not print_result: pywikibot.stdout(' Found section in a div!')
                found_section = True
                break
    if found_section == False:
        possibly_print(page_name)
        pywikibot.stdout(' ERROR: Could not find section "{0}" on page {1}!'.format(anchor_name, target_page_name_human))
        errors_issued += 1
    elif debug and print_result:
        pywikibot.stdout(' The section "{0}" was found on page "{1}".'.format(anchor_name, target_page_name_human))

# For a link that redirected us to another page, extract the name of the target page from
# the target page's source
def find_canonical_link(page_text, page_name, page_slug):
    global errors_issued

    # Extract link from this markup which contains name of redirected-to page:
    # <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/>
    # "wgPageName":"Namespace:Page_name",
    canonical_name = page_text.split('"wgPageName":"')[-1]
    tag_end = canonical_name.find('",')

    if tag_end == -1:
        pywikibot.stdout(' ERROR: The link "{}" is a redirect page, but this script could not isolate the target page name.'.format(page_slug))
        errors_issued += 1
    else:
        canonical_name = canonical_name[:tag_end]
        if len(canonical_name) > 100:
            # Certain things can cause the trim to fail; report error and avoid slamming the
            # output with massive page source from a failed trim
            pywikibot.stdout(' ERROR: The link "{0}" is a redirect to "{1}…" (string overflow).'.format(page_slug, canonical_name[:100]))
            errors_issued += 1
        else:
            canonical_name = canonical_name.replace('_', ' ')
            if '#' in page_slug:
                _, anchor_name = page_slug.split('#')
                if debug: pywikibot.stdout(' The link "{0}" is a redirect to "{1}#{2}", which is a valid page. Checking section link….'.format(page_slug, canonical_name, anchor_name))
                find_section(page_text, page_name, page_slug, True)
            else:
                pywikibot.stdout(' The link "{0}" is a redirect to "{1}", which is a valid page.'.format(page_slug, canonical_name))

# Test an intrawiki link and look for a section link if applicable
def test_intrawiki_link(iw_url, page_name, page_slug):
    global advice_issued
    global errors_issued

    response = fetch(iw_url)

    # One way we tell that a redirect occurred is by checking fetch's history, as it
    # automatically follows redirects. This will catch formal redirects which come from pages
    # such as Special:PermanentLink.
    if response.history != []:

        permalink1 = 'Special:PermanentLink/'.lower()
        permalink2 = 'Special:Permalink/'.lower()
        page_slug_lower = page_slug.lower()
        if page_slug_lower.startswith(permalink1) or page_slug_lower.startswith(permalink2):
            if debug:
                possibly_print(page_name)
                pywikibot.stdout(' Got redirection code "{0}" for permanent revision link "{1}". Checking the target page….'.format(response.history[0], page_slug))
            find_canonical_link(response.text, page_name, page_slug)
        else:
            possibly_print(page_name)
            pywikibot.stdout(' ERROR: Unrecognized type of redirection (code "{0}") for link "{1}". You should check the link manually.'.format(response.history[0], page_slug))
            advice_issued += 1
    elif response.status_code != 200:
        possibly_print(page_name)
        pywikibot.stdout(' ERROR: Got response code {0} on URL {1}. The target page may not exist.'.format(response.status_code, iw_url))
        errors_issued += 1
    # However the usual way that a redirect occurs is that MediaWiki redirects us sneakily
    # using JavaScript, while returning code OK 200 as if the link was correct; this happens
    # when a redirect page is accessed. We must detect these soft redirects by looking at the
    # page source to find the redirect note inserted at the top of the page for the reader.
    elif 'Redirected from <a' in response.text:
        if debug:
            possibly_print(page_name)
            pywikibot.stdout(' Got silently redirected by link "{}". Checking the target page….'.format(page_slug))
        find_canonical_link(response.text, page_name, page_slug)
    else: # URL is OK, so proceed
        find_section(response.text, page_name, page_slug, False)

# Searches the given page text for intrawiki links with section links in them
def scan_for_intrawiki_links(page_text, page_name):
    global debug
    global pages_checked
    global iw_found
    global advice_issued
    global errors_issued
    global name_printed
    pages_checked += 1
    name_printed = 0

    for i, the_pattern in enumerate(link_patterns):
        if debug:
            if i == 0:
                pywikibot.stdout(' Checking page for wikilinks with section names.')
            elif i == 1:
                pywikibot.stdout(' Checking page for {{SectionLink}} calls.')

        for match in re.finditer(the_pattern, page_text):
            found_iw_match = False
            iw_url = ""
            page_name2 = page_name

            # Cut out the matched text from the page, isolating just the page+section name
            target_start = 2 # "[["
            target_end = 1 # "|" or "]" (we only match the first ending bracket)
            if i == 1:
                target_start = 14 # "{{SectionLink|"
                target_end = 2 # "}}"
            s = match.start() + target_start # remove the link-opening markup
            e = match.end() - target_end # remove the link-ending markup
            page_slug = page_text[s:e]

            # The second link type will look like "Page|Section" or "|Section", so fix that pipe
            if i == 1:
                page_slug = page_slug.replace('|', '#')

            # Sometimes we use a space char. instead of a '_', so fix that before querying
            page_slug = page_slug.replace(' ', '_')
            if debug: pywikibot.stdout(' Found link {0}.'.format(page_slug))

            # If this link doesn't have a section link in it, then we don't care about it, as
            # MediaWiki takes care of checking basic intrawiki links
            if not '#' in page_slug:
                if debug: pywikibot.stdout(' Link doesn\'t have a section anchor in it. Skipping.')
                continue

            # If this link has an interwiki prefix, it can be ignored; see check_interwiki_links.py
            # for the task of checking interwiki page+section links
            is_interwiki = False
            if found_iw_match == False:
                for prefix in interwiki_prefixes:
                    if prefix + ":" in page_slug:
                        if debug: pywikibot.stdout(' Skipping link {} because it is an interwiki link.'.format(page_slug))
                        is_interwiki = True
                        break
            if is_interwiki:
                continue

            # If there is a '{' in the link, then probably it's a link built on transcluded text.
            # If it's a chapter template transclusion like "Quotes/Diary#{{C3}}", expand it using
            # our "chapter_names" array. If it's another type of transclusion, punt it to the user.
            if '{' in page_slug:
                ch_link_pattern = re.compile(r"{{C[0-9]*}}")
                ch_link = ch_link_pattern.search(page_slug)
                if ch_link:
                    if debug: pywikibot.stdout(' Found transclusion in link: "{}".'.format(ch_link.group(0)))
                    ch_link_match = ch_link.group(0)
                    ch_num_pattern = re.compile("[0-9]+")
                    ch_num = ch_num_pattern.search(ch_link_match)
                    if ch_num:
                        ch_num_match = int(ch_num.group(0))
                        if ch_num_match >= 0 and ch_num_match <= 14:
                            ch_name = chapter_names[ch_num_match]
                            replace_pattern = re.compile(r"{{C" + ch_num.group(0) + r"}}")
                            page_slug = replace_pattern.sub(ch_name, page_slug)
                            if debug: pywikibot.stdout(' After performing transclusion, link is now "{}".'.format(page_slug))
                        else:
                            possibly_print(page_name)
                            pywikibot.stdout(' ERROR: Link {0} transcludes a chapter name using an out-of-range number, {1}.'.format(page_slug, ch_num_match))
                            errors_issued += 1
                            continue
                    else:
                        possibly_print(page_name)
                        pywikibot.stdout(' ADVICE: Link {} seems to be transcluding a chapter name, but this script couldn\'t read it.'.format(page_slug))
                        advice_issued += 1
                        continue
                else:
                    possibly_print(page_name)
                    pywikibot.stdout(' ADVICE: Link {0} seems to use transclusion. This script can understand chapter name transclusions such as "{1}" but it doesn\'t recognize this one so it can\'t be verified. You should check the link manually.'.format(page_slug, "{{C7}}"))
                    advice_issued += 1
                    continue

            # If this is a relative "/" link, use the current page as the basis for the URL. Note
            # that only a leading slash is looked for, so if there's multiple steps down ("/x/y"),
            # we're out of luck.
            if page_slug.startswith('/'):
                page_slug = page_name + page_slug
                if debug: pywikibot.stdout(' Changed page_slug to {} on account of "/".'.format(page_slug))

            # If this is a relative "../" link, find the parent page, set ourselves to that page,
            # then remove the relative portion of the link. Note that this is only performed once,
            # so if there's multiple steps back ("../../"), we're out of luck.
            if page_slug.startswith('../'):
                last_slash = page_name.rfind('/')
                page_name2 = page_name[0:last_slash]
                if debug: pywikibot.stdout(' Changed page_name to {} on account of "../".'.format(page_name2))
                page_slug = page_slug[3:len(page_slug)]
                if debug: pywikibot.stdout(' Changed page_slug to {} on account of "../".'.format(page_slug))
                # If this is now going to be a bare section link for the parent page, don't add a
                # slash, otherwise do because we are drilling down to another subpage
                if page_slug.startswith('#'):
                    page_slug = page_name2 + page_slug
                else:
                    page_slug = page_name2 + '/' + page_slug

            # If this is a bare section link, build URL based on this page
            if page_slug.startswith('#'):
                iw_url = onigalore_url + page_name2
                iw_found += 1
                if debug: pywikibot.stdout(' Found link to this very page, {}.'.format(page_slug))
                found_iw_match = True
                page_slug = page_name2 + page_slug

            # If there's no ":" in the link (before the section link, where a colon would just be
            # part of the text) then it's a Main namespace article; proceed with building URL
            if found_iw_match == False:
                if not re.search(":.*#", page_slug):
                    iw_url = onigalore_url + page_slug
                    iw_found += 1
                    if debug: pywikibot.stdout(' Link is to a Main namespace page.')
                    found_iw_match = True

            # If there is a ":", match the prefix against the intrawiki prefixes on OniGalore
            # before building URL
            if found_iw_match == False:
                for prefix in intrawiki_prefixes:
                    if prefix + ":" in page_slug:
                        iw_url = onigalore_url + page_slug
                        if debug: pywikibot.stdout(' Identified namespace {}.'.format(prefix))
                        iw_found += 1
                        found_iw_match = True
                        break

            # If we still haven't turned this match into a URL, something's gone wrong
            if (found_iw_match == False) or (iw_url == ""):
                possibly_print(page_name)
                pywikibot.stdout(' ERROR: Couldn\'t figure out link {}.'.format(page_slug))
                continue

            # Test the URL
            iw_url = iw_url.replace(' ', '_')
            if debug: pywikibot.stdout(' Reading page at {}….'.format(iw_url))
            test_intrawiki_link(iw_url, page_name, page_slug)

# Print a wrap-up message
def print_summary():
    global pages_checked
    global iw_found
    global advice_issued
    global errors_issued

    page_str = "pages"
    if pages_checked == 1:
        page_str = "page"

    link_str = "links"
    if iw_found == 1:
        link_str = "link"

    pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
    pywikibot.stdout('While attempting to follow section links….')

    if advice_issued == 0:
        pywikibot.stdout(' No advice on potential problems was issued.')
    elif advice_issued == 1:
        pywikibot.stdout(' 1 piece of advice on a potential problem was issued.')
    else:
        pywikibot.stdout(' {} pieces of advice on potential problems were issued.'.format(advice_issued))

    error_str = "errors were"
    if errors_issued == 1:
        error_str = "error was"
    pywikibot.stdout(' {0} {1} encountered.'.format(errors_issued, error_str))

# Main function
def main(*args):
    global debug
    search_cat = ''
    search_page = ''

    # Process arguments
    local_args = pywikibot.handle_args(args)
    for arg in local_args:
        if arg.startswith('-cat:'):
            search_cat = arg[5:]
        elif arg.startswith('-page:'):
            search_page = arg[6:]
        elif arg == '-dbg':
            debug = 1
        else:
            pywikibot.stdout('Unknown argument "{}".'.format(arg))
            return

    site = pywikibot.Site()

    # This line of code enumerates the methods in the 'page' class
    #pywikibot.stdout(format(dir(page)))

    # Check specified page or loop through specified category and check all pages
    if search_cat != '':
        cat_obj = pywikibot.Category(site, search_cat)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        for page in pagegenerators.PreloadingGenerator(generator, 100):
            if debug: pywikibot.stdout('Checking page {0}'.format(page.title()))
            scan_for_intrawiki_links(page.text, page.title())
    elif search_page != '':
        page = pywikibot.Page(site, search_page)
        if debug: pywikibot.stdout('Checking page {0}'.format(page.title()))
        scan_for_intrawiki_links(page.text, page.title())

    # Print the results
    print_summary()

if __name__ == '__main__':
    main()