ViewVC Help
View File | Revision Log | View Changeset | Root Listing
root/Oni2/ValBot/Python/check_intrawiki_section_links.py
Revision: 1179
Committed: Fri Apr 28 00:53:24 2023 UTC (2 years, 5 months ago) by iritscen
Content type: text/x-python
File size: 15057 byte(s)
Log Message:
ValBot: check_intrawiki_section_links.py: Simplified output to just advice and errors.  Added support for SectionLink template.  Added support for links built on chapter name transclusion.  Placed verbose output under a "-dbg" argument.

File Contents

# Content
1 # Check Intrawiki Section Links
2 # by iritscen@yahoo.com
3 # Looks at each wikilink on a page (or in all the pages in a category) for a section link ('#'),
4 # and loads the linked page and verifies that the named section actually exists. It also
5 # understands section links generated through a call to Template:SectionLink.
6 # Recommended viewing width:
7 # |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- --|
8
9 import os
10
11 from urllib.parse import urljoin
12
13 import pywikibot
14 import re
15
16 from pywikibot.bot import QuitKeyboardInterrupt
17 from pywikibot import pagegenerators
18 from pywikibot.tools.formatter import color_format
19 from pywikibot.comms.http import fetch
20 from pywikibot.specialbots import UploadRobot
21 from bs4 import BeautifulSoup
22
# Tuple of OniGalore's namespaces
intrawiki_prefixes = (
    'Image', 'Special', 'Talk', 'User', 'User_talk', 'OniGalore', 'OniGalore_talk', 'File',
    'File_talk', 'MediaWiki', 'MediaWiki_talk', 'Template', 'Template_talk', 'Help', 'Help_talk',
    'Category', 'Category_talk', 'BSL', 'BSL_talk', 'OBD', 'OBD_talk', 'AE', 'AE_talk', 'Oni2',
    'Oni2_talk', 'XML', 'XML_talk',
)

# URL for main namespace of our wiki
onigalore_url = 'https://wiki.oni2.net/'

# Tuple of interwiki prefixes, for passing over such links
interwiki_prefixes = (
    'acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw', 'wikibooks',
    'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote', 'wikisource', 'wikispecies',
    'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp',
)

# List of chapter names, for substitution into links that use "{{Cn}}" transclusion; the index
# into the list is the chapter number "n"
chapter_names = [
    'CHAPTER_00_._COMBAT_TRAINING',
    'CHAPTER_01_._TRIAL_RUN',
    'CHAPTER_02_._ENGINES_OF_EVIL',
    'CHAPTER_03_._PUZZLE_PIECES',
    'CHAPTER_04_._TIGER_BY_THE_TAIL',
    'CHAPTER_05_._HOT_PURSUIT',
    'CHAPTER_06_._COUNTERATTACK',
    'CHAPTER_07_._A_FRIEND_IN_NEED',
    'CHAPTER_08_._AN_INNOCENT_LIFE',
    'CHAPTER_09_._TRUTH_AND_CONSEQUENCES',
    'CHAPTER_10_._CAT_AND_MOUSE',
    'CHAPTER_11_._DREAM_DIVER',
    'CHAPTER_12_._SINS_OF_THE_FATHER',
    'CHAPTER_13_._PHOENIX_RISING',
    'CHAPTER_14_._DAWN_OF_THE_CHRYSALIS',
]

# Tuple of patterns for recognizing wikilinks
# Pattern 1: Detect "[[anything]]", "[[any:thing]]", "[[any|thing]]", "[[any:thi|ng]]"
# Pattern 2: Detect "{{SectionLink|Page|Section name}}", "{{SectionLink||Section name}}"
# These are raw strings so that the regex escapes ("\[", "\|", "\{") reach the re module intact
# instead of being parsed (and warned about) as invalid Python string escapes.
link_patterns = (
    r"\[\[[^|\]]*(\||\])",
    r"\{\{SectionLink\|[^|\}]*\|[^|\}]*\}\}",
)

# Initialize globals
debug = 0           # set to 1 by the "-dbg" argument to enable verbose output
pages_checked = 0   # number of pages scanned so far
iw_found = 0        # number of intrawiki section links that were turned into URLs
advice_issued = 0   # number of "ADVICE" messages printed
errors_issued = 0   # number of "ERROR" messages printed
46
def scan_for_intrawiki_links(page_text, page_name):
    """Search the given page text for intrawiki links with section links in them.

    Each link found by one of the regexes in ``link_patterns`` is reduced to a
    "Page#Section" string, turned into a URL under ``onigalore_url``, fetched, and the
    returned HTML is searched for a <span> whose id equals the section name.

    Parameters:
        page_text: wikitext of the page being checked.
        page_name: title of that page; used to resolve bare "#Section", "/Subpage" and
            "../" links, and printed as a header before the first message for the page.

    Returns nothing. Findings are printed through pywikibot.stdout(), and the module-level
    counters pages_checked, iw_found, advice_issued and errors_issued are updated.
    """
    global pages_checked
    global iw_found
    global advice_issued
    global errors_issued
    pages_checked += 1
    name_printed = False

    def report(message):
        # Print an ADVICE/ERROR line, preceded by a one-time "From page" header. In debug
        # mode the header is never printed because main() already names each page.
        nonlocal name_printed
        if not name_printed and not debug:
            pywikibot.stdout('From page "{}":'.format(page_name))
            name_printed = True
        pywikibot.stdout(message)

    for i, the_pattern in enumerate(link_patterns):
        if debug:
            if i == 0:
                pywikibot.stdout(' Checking page for wikilinks with section names.')
            elif i == 1:
                pywikibot.stdout(' Checking page for {{SectionLink}} calls.')

        # Number of markup characters to trim from each end of a match
        if i == 1:
            target_start = 14  # "{{SectionLink|"
            target_end = 2     # "}}"
        else:
            target_start = 2   # "[["
            target_end = 1     # "|" or "]" (we only match the first ending bracket)

        for match in re.finditer(the_pattern, page_text):
            iw_url = ""
            page_name2 = page_name

            # Cut out the matched text from the page, isolating just the page+section name
            link_text = page_text[match.start() + target_start:match.end() - target_end]

            # The second link type will look like "Page|Section" or "|Section", so fix that pipe
            if i == 1:
                link_text = link_text.replace('|', '#')

            # Sometimes we use a space char. instead of a '_', so fix that before querying
            link_text = link_text.replace(' ', '_')
            if debug:
                pywikibot.stdout(' Found link {0}.'.format(link_text))

            # If this link doesn't have a section link in it, then we don't care about it, as
            # MediaWiki takes care of checking basic intrawiki links
            if '#' not in link_text:
                if debug:
                    pywikibot.stdout(" Link doesn't have a section anchor in it. Skipping.")
                continue

            # If this link has an interwiki prefix, it can be ignored; see
            # check_interwiki_links.py for the task of checking interwiki page+section links
            if any(prefix + ":" in link_text for prefix in interwiki_prefixes):
                if debug:
                    pywikibot.stdout(' Skipping link {} because it is an interwiki link.'.format(link_text))
                continue

            # If there is a '{' in the link, then probably it's a link built on transcluded text
            # like "Quotes/Diary#{{C3}}". Chapter-name transclusions ("{{Cn}}") are expanded
            # from chapter_names; anything else cannot be verified, so advice is issued instead.
            if '{' in link_text:
                ch_link = re.search(r"{{C([0-9]*)}}", link_text)
                if ch_link is None:
                    report(' ADVICE: Link {0} seems to use transclusion. This script can understand chapter name transclusions such as "{1}" but it doesn\'t recognize this one so it can\'t be verified. You should check the link manually.'.format(link_text, "{{C7}}"))
                    advice_issued += 1
                    continue

                # Only touch the match object once we know there is one; doing so earlier
                # raised AttributeError in debug mode for unrecognized transclusions
                if debug:
                    pywikibot.stdout(' Found transclusion in link: "{}".'.format(ch_link.group(0)))

                ch_digits = ch_link.group(1)
                if not ch_digits:
                    # "{{C}}" with no chapter number
                    report(" ADVICE: Link {} seems to be transcluding a chapter name, but this script couldn't read it.".format(link_text))
                    advice_issued += 1
                    continue

                ch_num = int(ch_digits)
                if not 0 <= ch_num < len(chapter_names):
                    report(' ADVICE: Link {0} transcludes a chapter name using an out-of-range number, {1}.'.format(link_text, ch_num))
                    advice_issued += 1
                    continue

                link_text = link_text.replace(ch_link.group(0), chapter_names[ch_num])
                if debug:
                    pywikibot.stdout(' After performing transclusion, link is now "{}".'.format(link_text))

            # If this is a relative "/" link, use the current page as the basis for the URL. Note
            # that only a leading slash is looked for, so if there's multiple steps down ("/x/y"),
            # we're out of luck.
            if link_text.startswith('/'):
                link_text = page_name + link_text
                if debug:
                    pywikibot.stdout(' Changed link_text to {} on account of "/".'.format(link_text))

            # If this is a relative "../" link, find the parent page, set ourselves to that page,
            # then remove the relative portion of the link. Note that this is only performed once,
            # so if there's multiple steps back ("../../"), we're out of luck.
            if link_text.startswith('../'):
                last_slash = page_name.rfind('/')
                # rfind() gives -1 when this page has no parent; slicing with that would
                # silently chop the last character off the page name, so leave the name alone
                if last_slash != -1:
                    page_name2 = page_name[:last_slash]
                if debug:
                    pywikibot.stdout(' Changed page_name to {} on account of "../".'.format(page_name2))
                link_text = link_text[3:]
                if debug:
                    pywikibot.stdout(' Changed link_text to {} on account of "../".'.format(link_text))
                # If this is now going to be a bare section link for the parent page, don't add a
                # slash, otherwise do because we are drilling down to another subpage
                if link_text.startswith('#'):
                    link_text = page_name2 + link_text
                else:
                    link_text = page_name2 + '/' + link_text

            if link_text.startswith('#'):
                # Bare section link, so build URL based on this page
                iw_url = onigalore_url + page_name2
                iw_found += 1
                if debug:
                    pywikibot.stdout(' Found link to this very page, {}.'.format(link_text))
                link_text = page_name2 + link_text
            elif not re.search(r":.*#", link_text):
                # No ":" in the link (before the section link, where a colon would just be part
                # of the text), so it's a Main namespace article
                iw_url = onigalore_url + link_text
                iw_found += 1
                if debug:
                    pywikibot.stdout(' Link is to a Main namespace page.')
            else:
                # There is a ":", so match the prefix against the intrawiki prefixes on
                # OniGalore before building URL
                prefix = next((p for p in intrawiki_prefixes if p + ":" in link_text), None)
                if prefix is not None:
                    iw_url = onigalore_url + link_text
                    if debug:
                        pywikibot.stdout(' Identified namespace {}.'.format(prefix))
                    iw_found += 1

            # If we still haven't turned this match into a URL, something's gone wrong
            if not iw_url:
                report(" ERROR: Couldn't figure out link {}.".format(link_text))
                errors_issued += 1  # count it so the summary agrees with the printed errors
                continue

            # Test the URL
            iw_url = iw_url.replace(' ', '_')
            if debug:
                pywikibot.stdout(' Reading page at {}...'.format(iw_url))
            response = fetch(iw_url)

            # Redirects are followed automatically by fetch() and treated as "200"s; the way we
            # can tell that a redirect occurred is by checking fetch's history
            if response.history:
                report(' ADVICE: Got redirection code ({0}) on URL "{1}". You should check the link manually.'.format(response.history[0], iw_url))
                advice_issued += 1
                continue

            if response.status_code != 200:
                report(' ERROR: Got response code {0} on URL {1}. The target page may not exist.'.format(response.status_code, iw_url))
                errors_issued += 1
                continue

            # Isolate section link
            pre_section, section_name = link_text.split('#', 1)
            if debug:
                pywikibot.stdout(' Searching for section link {} on page.'.format(section_name))

            # Convert slash character to the dot-notation hex encoding that MediaWiki uses
            section_name = section_name.replace('/', '.2F')

            # Read linked page to see if it really has this anchor link
            soup = BeautifulSoup(response.text, 'html.parser')
            if any(span_tag.get('id', None) == section_name for span_tag in soup.find_all('span')):
                if debug:
                    pywikibot.stdout(' Found section!')
            else:
                report(' ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section))
                errors_issued += 1
245
def main(*args):
    """Parse the command line, scan the requested page(s), and print a summary.

    Recognized arguments (after pywikibot.handle_args() has consumed the global ones):
        -cat:<name>   check every page in the category, recursing into subcategories
        -page:<name>  check a single page
        -dbg          turn on verbose output

    If both -cat: and -page: are supplied, the category takes precedence. Any other argument
    is reported and the function returns without scanning anything.
    """
    global debug
    search_cat = ''
    search_page = ''

    local_args = pywikibot.handle_args(args)

    for arg in local_args:
        if arg.startswith('-cat:'):
            search_cat = arg[5:]
        elif arg.startswith('-page:'):
            search_page = arg[6:]
        elif arg == '-dbg':
            debug = 1
        else:
            pywikibot.stdout('Unknown argument "{}".'.format(arg))
            return

    # Without a target there is nothing to scan; say so rather than printing an all-zero summary
    if not search_cat and not search_page:
        pywikibot.stdout('No target given. Use "-cat:<category>" or "-page:<page>" to choose what to check.')
        return

    site = pywikibot.Site()

    if search_cat:
        cat_obj = pywikibot.Category(site, search_cat)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        pages = pagegenerators.PreloadingGenerator(generator, 100)
    else:
        pages = [pywikibot.Page(site, search_page)]

    for page in pages:
        if debug:
            pywikibot.stdout('Checking page {0}'.format(page.title()))
        scan_for_intrawiki_links(page.text, page.title())

    page_str = "page" if pages_checked == 1 else "pages"
    link_str = "link" if iw_found == 1 else "links"

    pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
    pywikibot.stdout('While attempting to follow section links...')

    if advice_issued == 0:
        pywikibot.stdout(' No advice on potential problems was issued.')
    elif advice_issued == 1:
        pywikibot.stdout(' 1 piece of advice on a potential problem was issued.')
    else:
        pywikibot.stdout(' {} pieces of advice on potential problems were issued.'.format(advice_issued))

    error_str = "error was" if errors_issued == 1 else "errors were"
    pywikibot.stdout(' {0} {1} encountered.'.format(errors_issued, error_str))


if __name__ == '__main__':
    main()