ViewVC Help
View File | Revision Log | View Changeset | Root Listing
root/Oni2/ValBot/Python/check_intrawiki_section_links.py
Revision: 1173
Committed: Tue Jun 28 22:06:29 2022 UTC (3 years, 3 months ago) by iritscen
Content type: text/x-python
File size: 11112 byte(s)
Log Message:
ValBot: check_intrawiki_section_links.py won't quit when a link cannot be understood; it will just move on. find_external_images.py is now polished and robust.

File Contents

# Content
1 # Check Intrawiki Section Links
2 # by iritscen@yahoo.com
3 # Looks at each wikilink on a page (or in all the pages in a category) for a section link ('#'),
4 # and loads the linked page and verifies that the named section actually exists. The output will
5 # use the keywords ADVICE, WARNING or ERROR depending on the nature of the issue that it encounters.
6 # Recommended viewing width:
7 # |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- --|
8
9 import os
10
11 from urllib.parse import urljoin
12
13 import pywikibot
14 import re
15
16 from pywikibot.bot import QuitKeyboardInterrupt
17 from pywikibot import pagegenerators
18 from pywikibot.tools.formatter import color_format
19 from pywikibot.comms.http import fetch
20 from pywikibot.specialbots import UploadRobot
21 from bs4 import BeautifulSoup
22
# Namespaces that exist on OniGalore; a link carrying one of these prefixes points
# at a page on our own wiki
intrawiki_prefixes = (
    'Image', 'Special', 'Talk', 'User', 'User_talk',
    'OniGalore', 'OniGalore_talk',
    'File', 'File_talk',
    'MediaWiki', 'MediaWiki_talk',
    'Template', 'Template_talk',
    'Help', 'Help_talk',
    'Category', 'Category_talk',
    'BSL', 'BSL_talk',
    'OBD', 'OBD_talk',
    'AE', 'AE_talk',
    'Oni2', 'Oni2_talk',
    'XML', 'XML_talk',
)

# Base URL under which Main-namespace pages of our wiki are served
onigalore_url = 'https://wiki.oni2.net/'

# Prefixes of interwiki links; links carrying one of these are ruled out of checking
interwiki_prefixes = (
    'acronym', 'cache', 'commons', 'dictionary', 'google',
    'metawikimedia', 'mw',
    'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia',
    'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage',
    'wikt', 'wiktionary', 'wp',
)

# Running totals, updated by scan_for_iw_links() and reported at the end of main()
pages_checked = iw_found = advice_issued = warnings_issued = errors_issued = 0

# Title of the page currently being scanned; relative links are resolved against it
page_name = ''
38
39 # Searches the given page text for intrawiki links with section links in them
def scan_for_iw_links(page_text):
    """Verify every intrawiki section link found in the given wikitext.

    Each "[[...]]" link in page_text that contains a '#' is turned into a URL on
    OniGalore, the linked page is fetched, and its HTML is searched for a <span>
    whose id equals the section name. Problems are reported through
    pywikibot.stdout() using the keywords ADVICE, WARNING or ERROR, and the
    module-level counters (pages_checked, iw_found, advice_issued, warnings_issued,
    errors_issued) are updated accordingly. Relative links are resolved against the
    module-level page_name, which the caller must set before calling.

    Parameters:
        page_text: the wikitext of the page to scan.

    Returns:
        None. Results are reported via pywikibot.stdout() and the global counters.
    """
    global pages_checked
    global iw_found
    global advice_issued
    global warnings_issued
    global errors_issued
    global page_name
    pages_checked = pages_checked + 1

    # Isolate strings of pattern "[[anything]]", "[[any:thing]]", "[[any|thing]]" or
    # "[[any:thi|ng]]". A raw string is used so the backslashes reach the regex engine
    # without relying on Python passing unknown escape sequences through.
    iw_link = r"\[\[[^|\]]*(\||\])"
    for match in re.finditer(iw_link, page_text):
        found_iw_match = False
        iw_url = ""
        page_name2 = page_name

        # Cut out the matched text from the page, and in the process remove the "[[" from
        # the front and the "|" or "]" from the end
        link_text = page_text[match.start() + 2:match.end() - 1]

        # Sometimes we used a space char. instead of a '_', so fix that before querying
        link_text = link_text.replace(' ', '_')

        # If this link doesn't have a section link in it, then we don't care about it, as
        # MediaWiki takes care of checking basic intrawiki links
        if '#' not in link_text:
            continue

        # If there is a '{' in the link, then probably it's a link built on transcluded
        # text like "Quotes/Diary#{{C3}}", which we cannot expand and work with, so skip it
        if '{' in link_text:
            pywikibot.stdout('ADVICE: Link {} seems to use transclusion, so it can\'t be verified automatically. You should check it manually.'.format(link_text))
            advice_issued = advice_issued + 1
            continue

        # If this is a relative "/" link, use the current page as the basis for the URL.
        # Note that only a leading slash is looked for, so if there's multiple steps down
        # ("/x/y"), we're out of luck.
        if link_text.startswith('/'):
            link_text = page_name + link_text

        # If this is a relative "../" link, find the parent page and set ourselves to that
        # page, then remove the relative portion of the link. Note that this is only
        # performed once, so if there's multiple steps back ("../../"), we're out of luck.
        if link_text.startswith('../'):
            # NOTE(review): if page_name contains no '/', rfind() returns -1 and the slice
            # below drops the last character of page_name -- confirm the intended behavior
            # for "../" links on pages that are not subpages.
            last_slash = page_name.rfind('/')
            page_name2 = page_name[0:last_slash]
            link_text = link_text[3:]
            # If this is now going to be a bare section link for the parent page, don't add
            # a slash, otherwise do because we are drilling down to another subpage
            if link_text.startswith('#'):
                link_text = page_name2 + link_text
            else:
                link_text = page_name2 + '/' + link_text

        # If this is a bare section link, build URL based on this page
        if link_text.startswith('#'):
            iw_url = onigalore_url + page_name2
            iw_found = iw_found + 1
            found_iw_match = True
            link_text = page_name2 + link_text

        # If there's no ":" in the link (before the section link, where a colon would just
        # be part of the text) then it's a Main namespace article, so construct URL
        if not found_iw_match and not re.search(r":.*#", link_text):
            iw_url = onigalore_url + link_text
            iw_found = iw_found + 1
            found_iw_match = True

        # If there is a ":", compare the text in front of it against the known prefixes.
        # Only the link's actual leading prefix is looked at, so that a prefix name which
        # merely occurs somewhere inside the link (e.g. the "Category:" in
        # "wp:Category:Foo#Bar") cannot cause a misclassification.
        if not found_iw_match:
            # A leading ':' (as in "[[:Category:Foo#Bar]]") is not part of the namespace
            # name, and a namespace taken over from page_name may still contain spaces
            namespace = link_text.lstrip(':').split(':', 1)[0].replace(' ', '_')
            if namespace in intrawiki_prefixes:
                iw_url = onigalore_url + link_text
                iw_found = iw_found + 1
                found_iw_match = True
            elif namespace in interwiki_prefixes:
                # Interwiki links point away from our wiki, so they can be ignored
                continue

        # If we still haven't turned this match into a URL, something's gone wrong; count
        # it so that the final summary reflects every ERROR line that was printed
        if not found_iw_match or iw_url == "":
            pywikibot.stdout('ERROR: Couldn\'t figure out link {}.'.format(link_text))
            errors_issued = errors_issued + 1
            continue

        # Test the URL
        iw_url = iw_url.replace(' ', '_')
        response = fetch(iw_url)

        # Redirects are followed automatically by fetch() and treated as "200"s; the way we
        # can tell that a redirect occurred is by checking fetch's history
        if response.history:
            pywikibot.stdout('WARNING: Got redirection code ({0}) on URL "{1}".'.format(response.history[0], iw_url))
            warnings_issued = warnings_issued + 1
        elif response.status_code != 200:
            pywikibot.stdout('WARNING: Got response code {0} on URL {1}.'.format(response.status_code, iw_url))
            warnings_issued = warnings_issued + 1
        else:
            # Isolate section link
            pre_section, section_name = link_text.split('#', 1)

            # Convert slash character to the dot-notation hex encoding that MediaWiki uses
            section_name = section_name.replace('/', '.2F')

            # Read linked page to see if it really has this anchor link
            soup = BeautifulSoup(response.text, 'html.parser')
            found_section = any(
                span_tag.get('id', None) == section_name
                for span_tag in soup.find_all('span')
            )
            if not found_section:
                pywikibot.stdout('ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section))
                errors_issued = errors_issued + 1
182
def main(*args):
    """Check the section links of one page or of every page in a category.

    Recognized arguments (after pywikibot's own global arguments are handled):
        -cat:NAME   scan every page in category NAME, recursing into subcategories
        -page:NAME  scan the single page NAME
    If both are given, the category takes precedence. When the scan is finished, a
    summary of the module-level counters is printed through pywikibot.stdout().

    Parameters:
        *args: command-line style arguments, passed on to pywikibot.handle_args().

    Returns:
        None.
    """
    global page_name
    global pages_checked
    global iw_found
    global advice_issued
    global warnings_issued
    global errors_issued

    cat_name = ''

    local_args = pywikibot.handle_args(args)

    for arg in local_args:
        if arg.startswith('-cat:'):
            cat_name = arg[5:]
        elif arg.startswith('-page:'):
            page_name = arg[6:]

    site = pywikibot.Site()

    if cat_name != '':
        cat_obj = pywikibot.Category(site, cat_name)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        for page in pagegenerators.PreloadingGenerator(generator, 100):
            pywikibot.stdout('Checking page {0}'.format(page.title()))
            # scan_for_iw_links() resolves relative links against the global page_name
            page_name = page.title()
            scan_for_iw_links(page.text)
    elif page_name != '':
        page = pywikibot.Page(site, page_name)
        pywikibot.stdout('Checking page {0}'.format(page.title()))
        # Use the title as pywikibot reports it, exactly as the category branch does, so
        # that relative and bare section links are built from the same form of the name
        # regardless of how the user typed the -page: argument
        page_name = page.title()
        scan_for_iw_links(page.text)

    page_str = "page" if pages_checked == 1 else "pages"
    link_str = "link" if iw_found == 1 else "links"

    pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
    pywikibot.stdout('While attempting to follow section links...')

    if advice_issued == 0:
        pywikibot.stdout(' No advice on potential problems was issued.')
    elif advice_issued == 1:
        pywikibot.stdout(' 1 piece of advice on a potential problem was issued.')
    else:
        pywikibot.stdout(' {} pieces of advice on potential problems were issued.'.format(advice_issued))

    warning_str = "warning was" if warnings_issued == 1 else "warnings were"
    pywikibot.stdout(' {0} {1} issued.'.format(warnings_issued, warning_str))

    error_str = "error was" if errors_issued == 1 else "errors were"
    pywikibot.stdout(' {0} {1} encountered.'.format(errors_issued, error_str))
246
# Run the checker only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()