ViewVC Help
View File | Revision Log | View Changeset | Root Listing
root/Oni2/ValBot/Python/check_intrawiki_section_links.py
Revision: 1176
Committed: Sun Sep 25 23:58:33 2022 UTC (3 years ago) by iritscen
Content type: text/x-python
File size: 11009 byte(s)
Log Message:
ValBot: check_intrawiki_section_links.py should now always ignore interwiki links.

File Contents

# Content
1 # Check Intrawiki Section Links
2 # by iritscen@yahoo.com
3 # Looks at each wikilink on a page (or in all the pages in a category) for a section link ('#'),
4 # and loads the linked page and verifies that the named section actually exists. The output will
5 # use the keywords ADVICE, WARNING or ERROR depending on the nature of the issue that it encounters.
6 # Recommended viewing width:
7 # |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- --|
8
9 import os
10
11 from urllib.parse import urljoin
12
13 import pywikibot
14 import re
15
16 from pywikibot.bot import QuitKeyboardInterrupt
17 from pywikibot import pagegenerators
18 from pywikibot.tools.formatter import color_format
19 from pywikibot.comms.http import fetch
20 from pywikibot.specialbots import UploadRobot
21 from bs4 import BeautifulSoup
22
# Namespace prefixes used on OniGalore, for recognizing links to pages on our own wiki
intrawiki_prefixes = (
    'Image', 'Special', 'Talk', 'User', 'User_talk', 'OniGalore', 'OniGalore_talk', 'File',
    'File_talk', 'MediaWiki', 'MediaWiki_talk', 'Template', 'Template_talk', 'Help',
    'Help_talk', 'Category', 'Category_talk', 'BSL', 'BSL_talk', 'OBD', 'OBD_talk', 'AE',
    'AE_talk', 'Oni2', 'Oni2_talk', 'XML', 'XML_talk',
)

# URL for main namespace of our wiki
onigalore_url = 'https://wiki.oni2.net/'

# Interwiki prefixes, for ruling out these links
interwiki_prefixes = (
    'acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw', 'wikibooks',
    'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote', 'wikisource', 'wikispecies',
    'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp',
)

# Running totals, updated by scan_for_iw_links() and reported by main() at the end of the run
pages_checked = 0
iw_found = 0
advice_issued = 0
warnings_issued = 0
errors_issued = 0

# Title of the page being scanned; set by main() and used by scan_for_iw_links() to resolve
# relative links ("/", "../") and bare section links ("#")
page_name = ''
38
39 # Searches the given page text for intrawiki links with section links in them
def scan_for_iw_links(page_text):
    """Verify every section link ("[[Page#Section]]") found in the given wikitext.

    Each wikilink that contains a '#' is turned into a URL on our wiki (relative "/" and "../"
    links and bare "#" links are resolved against the global page_name), the target page is
    fetched, and its HTML is searched for a <span> whose id equals the section name. Links
    carrying an interwiki prefix are skipped. Problems are reported through pywikibot.stdout()
    using the keywords ADVICE, WARNING or ERROR, and the module-level counters pages_checked,
    iw_found, advice_issued, warnings_issued and errors_issued are updated accordingly.

    page_text: wikitext of the page named by the global page_name.
    Returns nothing.
    """
    global pages_checked
    global iw_found
    global advice_issued
    global warnings_issued
    global errors_issued
    pages_checked += 1

    # Isolate strings of pattern "[[anything]]", "[[any:thing]]", "[[any|thing]]" or
    # "[[any:thi|ng]]". This must be a raw string so that the backslashes reach the regex
    # engine instead of being read as (invalid) string escape sequences.
    iw_link = r"\[\[[^|\]]*(\||\])"
    for match in re.finditer(iw_link, page_text):
        iw_url = ""
        # Page that a bare section link refers to; changes if we step up via "../"
        page_name2 = page_name

        # Cut out the matched text from the page, and in the process remove the "[[" from the
        # front and the "|" or "]" from the end
        link_text = page_text[match.start() + 2:match.end() - 1]

        # Sometimes we used a space char. instead of a '_', so fix that before querying
        link_text = link_text.replace(' ', '_')

        # If this link doesn't have a section link in it, then we don't care about it, as
        # MediaWiki takes care of checking basic intrawiki links
        if '#' not in link_text:
            continue

        # If this link has an interwiki prefix, it can be ignored
        if any(prefix + ":" in link_text for prefix in interwiki_prefixes):
            continue

        # If there is a '{' in the link, then probably it's a link built on transcluded text
        # like "Quotes/Diary#{{C3}}", which we cannot expand and work with, so skip it
        if '{' in link_text:
            pywikibot.stdout('ADVICE: Link {} seems to use transclusion, so it can\'t be verified automatically. You should check it manually.'.format(link_text))
            advice_issued += 1
            continue

        # If this is a relative "/" link, use the current page as the basis for the URL. Note
        # that only a leading slash is looked for, so if there's multiple steps down ("/x/y"),
        # we're out of luck.
        if link_text.startswith('/'):
            link_text = page_name + link_text

        # If this is a relative "../" link, find the parent page and set ourselves to that page,
        # then remove the relative portion of the link. Note that this is only performed once,
        # so if there's multiple steps back ("../../"), we're out of luck.
        if link_text.startswith('../'):
            last_slash = page_name.rfind('/')
            if last_slash == -1:
                # Without this check, slicing with -1 would silently chop the last character
                # off the page name and we would go on to test the wrong page
                pywikibot.stdout('ERROR: Link {0} is relative to a parent page, but page {1} has no parent.'.format(link_text, page_name))
                errors_issued += 1
                continue
            page_name2 = page_name[:last_slash]
            link_text = link_text[3:]
            # If this is now going to be a bare section link for the parent page, don't add a
            # slash, otherwise do because we are drilling down to another subpage
            if link_text.startswith('#'):
                link_text = page_name2 + link_text
            else:
                link_text = page_name2 + '/' + link_text

        if link_text.startswith('#'):
            # Bare section link, so build the URL based on this page
            iw_url = onigalore_url + page_name2
            link_text = page_name2 + link_text
        elif not re.search(":.*#", link_text):
            # There's no ":" in the link (before the section link, where a colon would just be
            # part of the text), so it's a Main namespace article
            iw_url = onigalore_url + link_text
        elif any(prefix + ":" in link_text for prefix in intrawiki_prefixes):
            # There is a ":" and the link carries one of OniGalore's namespace prefixes
            iw_url = onigalore_url + link_text

        # If we still haven't turned this match into a URL, something's gone wrong. Count it
        # so that the totals printed at the end of the run agree with the ERROR lines printed.
        if iw_url == "":
            pywikibot.stdout('ERROR: Couldn\'t figure out link {}.'.format(link_text))
            errors_issued += 1
            continue
        iw_found += 1

        # Test the URL
        iw_url = iw_url.replace(' ', '_')
        response = fetch(iw_url)

        # Redirects are followed automatically by fetch() and treated as "200"s; the way we can
        # tell that a redirect occurred is by checking fetch's history
        if response.history:
            pywikibot.stdout('WARNING: Got redirection code ({0}) on URL "{1}".'.format(response.history[0], iw_url))
            warnings_issued += 1
        elif response.status_code != 200:
            pywikibot.stdout('WARNING: Got response code {0} on URL {1}.'.format(response.status_code, iw_url))
            warnings_issued += 1
        else:
            # Isolate section link
            pre_section, section_name = link_text.split('#', 1)

            # Convert slash character to the dot-notation hex encoding that MediaWiki uses
            section_name = section_name.replace('/', '.2F')

            # Read linked page to see if it really has this anchor link
            soup = BeautifulSoup(response.text, 'html.parser')
            found_section = any(span_tag.get('id') == section_name
                                for span_tag in soup.find_all('span'))
            if not found_section:
                pywikibot.stdout('ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section))
                errors_issued += 1
181
def main(*args):
    """Check the section links on one page or on every page in a category, then print totals.

    Recognized arguments (after pywikibot.handle_args() has processed the argument list):
      -cat:<name>   scan every page in the category, recursing into subcategories
      -page:<name>  scan a single page
    If both are given, the category takes precedence. If neither yields a target, a usage hint
    is printed and nothing is scanned.
    """
    global page_name
    cat_name = ''

    local_args = pywikibot.handle_args(args)
    for arg in local_args:
        if arg.startswith('-cat:'):
            cat_name = arg[5:]
        elif arg.startswith('-page:'):
            page_name = arg[6:]

    if cat_name == '' and page_name == '':
        pywikibot.stdout('Nothing to check. Please specify "-page:<name>" or "-cat:<name>".')
        return

    site = pywikibot.Site()

    if cat_name != '':
        cat_obj = pywikibot.Category(site, cat_name)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        for page in pagegenerators.PreloadingGenerator(generator, 100):
            pywikibot.stdout('Checking page {0}'.format(page.title()))
            # scan_for_iw_links() resolves relative links against this global
            page_name = page.title()
            scan_for_iw_links(page.text)
    else:
        page = pywikibot.Page(site, page_name)
        pywikibot.stdout('Checking page {0}'.format(page.title()))
        scan_for_iw_links(page.text)

    # Report the totals gathered by scan_for_iw_links(), with singular/plural wording
    page_str = "page" if pages_checked == 1 else "pages"
    link_str = "link" if iw_found == 1 else "links"
    pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
    pywikibot.stdout('While attempting to follow section links...')

    if advice_issued == 0:
        pywikibot.stdout(' No advice on potential problems was issued.')
    elif advice_issued == 1:
        pywikibot.stdout(' 1 piece of advice on a potential problem was issued.')
    else:
        pywikibot.stdout(' {} pieces of advice on potential problems were issued.'.format(advice_issued))

    warning_str = "warning was" if warnings_issued == 1 else "warnings were"
    pywikibot.stdout(' {0} {1} issued.'.format(warnings_issued, warning_str))

    error_str = "error was" if errors_issued == 1 else "errors were"
    pywikibot.stdout(' {0} {1} encountered.'.format(errors_issued, error_str))
245
# Run the checker only when this file is executed as a script, not when it is imported
if __name__ == '__main__':
    main()