ViewVC Help
View File | Revision Log | View Changeset | Root Listing
root/Oni2/ValBot/Python/check_intrawiki_section_links.py
(Generate patch)

Comparing ValBot/Python/check_intrawiki_section_links.py (file contents):
Revision 1169 by iritscen, Mon Feb 21 23:59:20 2022 UTC vs.
Revision 1171 by iritscen, Mon Mar 21 21:23:25 2022 UTC

# Line 1 | Line 1
1 + # Check Intrawiki Section Links
2 + # by iritscen@yahoo.com
3 + # Looks at each wikilink on a page (or in all the pages in a category) for a section link ('#'),
4 + # and loads the linked page and verifies that the named section actually exists. The output will
5 + # use the keywords ADVICE, WARNING or ERROR depending on the nature of the issue that it encounters.
6 + # Recommended viewing width:
7 + # |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- --|
8 +
9   import os
10  
11   from urllib.parse import urljoin
# Line 23 | Line 31 | interwiki_prefixes = ('acronym', 'cache'
31  
32   pages_checked = 0
33   iw_found = 0
34 < problems_found = 0
34 > advice_issued = 0
35 > warnings_issued = 0
36 > errors_issued = 0
37   page_name = ''
38  
39   # Searches the given page text for intrawiki links with section links in them
40   def scan_for_iw_links(page_text):
41      global pages_checked
42      global iw_found
43 <    global problems_found
43 >    global advice_issued
44 >    global warnings_issued
45 >    global errors_issued
46      global page_name
47      pages_checked = pages_checked + 1
48  
# Line 50 | Line 62 | def scan_for_iw_links(page_text):
62  
63          # Sometimes we used a space char. instead of a '_', so fix that before querying
64          link_text = link_text.replace(' ', '_')
65 <        #pywikibot.output('Found link {0}.'.format(link_text))
65 >        #pywikibot.stdout('Found link {0}.'.format(link_text))
66          
67          # If this link doesn't have a section link in it, then we don't care about it, as
68          # MediaWiki takes care of checking basic intrawiki links
69          if not '#' in link_text:
70 <            #pywikibot.output('Link doesn\'t have a section anchor in it. Skipping.')
70 >            #pywikibot.stdout('Link doesn\'t have a section anchor in it. Skipping.')
71              continue
72          
73          # If there is a '{' in the link, then probably it's a link built on transcluded text
74          # like "Quotes/Diary#{{C3}}", which we cannot expand and work with, so skip it
75          if '{' in link_text:
76 <            pywikibot.output('ADVICE: Link {} seems to use transclusion, so it can\'t be verified automatically. You should check it manually.'.format(link_text))
76 >            pywikibot.stdout('ADVICE: Link {} seems to use transclusion, so it can\'t be verified automatically. You should check it manually.'.format(link_text))
77 >            advice_issued = advice_issued + 1
78              continue
79 <        
80 <        # If this is a relative "../" link, find the parent page and set ourselves to that
81 <        # page, then remove the relative portion of the link. Note that this is only performed
82 <        # once, so if there's multiple steps back ("../../"), we're out of luck.
79 >
80 >        # If this is a relative "/" link, use the current page as the basis for the URL. Note
81 >        # that only a leading slash is looked for, so if there are multiple steps down ("/x/y"),
82 >        # we're out of luck.
83 >        if link_text.startswith('/'):
84 >            link_text = page_name + link_text
85 >            pywikibot.stdout('Changed link_text to {} on account of "/".'.format(link_text))
86 >        
87 >        # If this is a relative "../" link, find the parent page and set ourselves to that page,
88 >        # then remove the relative portion of the link. Note that this is only performed once,
89 >        # so if there are multiple steps back ("../../"), we're out of luck.
90          if link_text.startswith('../'):
91              last_slash = page_name.rfind('/')
92              page_name2 = page_name[0:last_slash]
93 <            #pywikibot.output('Changed page_name to {} on account of "../".'.format(page_name2))
93 >            #pywikibot.stdout('Changed page_name to {} on account of "../".'.format(page_name2))
94              link_text = link_text[3:len(link_text)]
95 <            #pywikibot.output('Changed link_text to {} on account of "../".'.format(link_text))
96 <            # If this is now going to be a bare section link for the parent page, don't add
97 <            # a slash, otherwise do because we are drilling down to another subpage
95 >            #pywikibot.stdout('Changed link_text to {} on account of "../".'.format(link_text))
96 >            # If this is now going to be a bare section link for the parent page, don't add a
97 >            # slash, otherwise do because we are drilling down to another subpage
98              if link_text.startswith('#'):
99                  link_text = page_name2 + link_text
100              else:
# Line 84 | Line 104 | def scan_for_iw_links(page_text):
104          if link_text.startswith('#'):
105              iw_url = onigalore_url + page_name2
106              iw_found = iw_found + 1
107 <            #pywikibot.output('Found link to this very page, {}.'.format(link_text))
107 >            #pywikibot.stdout('Found link to this very page, {}.'.format(link_text))
108              found_iw_match = True
109              link_text = page_name2 + link_text
110          
111          # If there's no ":" in the link (before the section link, where a colon would just be
112          # part of the text) then it's a Main namespace article, so construct URL
93        #if not ':' in link_text:
113          if found_iw_match == False:
114              if not re.search(":.*#", link_text):
115                  iw_url = onigalore_url + link_text
116                  iw_found = iw_found + 1
117 <                #pywikibot.output('Found link to OniGalore Main namespace page {}.'.format(link_text))
117 >                #pywikibot.stdout('Found link to OniGalore Main namespace page {}.'.format(link_text))
118                  found_iw_match = True
119              
120          # If there is a ":", match the prefix against the intrawiki prefixes on OniGalore
121          if found_iw_match == False:
122              for prefix in intrawiki_prefixes:
123 <                #pywikibot.output('Comparing link against prefix {}.'.format(prefix))
123 >                #pywikibot.stdout('Comparing link against prefix {}.'.format(prefix))
124                  if prefix + ":" in link_text:
125                      iw_url = onigalore_url + link_text
126                      _, post_ns = link_text.split(':', 1)
127 <                    #pywikibot.output('Found link to OniGalore {0} namespace page {1}.'.format(prefix, post_ns))
127 >                    #pywikibot.stdout('Found link to OniGalore {0} namespace page {1}.'.format(prefix, post_ns))
128                      iw_found = iw_found + 1
129                      found_iw_match = True
130                      break
# Line 116 | Line 135 | def scan_for_iw_links(page_text):
135          if found_iw_match == False:
136              for prefix in interwiki_prefixes:
137                  if prefix + ":" in link_text:
138 <                    #pywikibot.output('Skipping link {} because it is an interwiki link.'.format(link_text))
138 >                    #pywikibot.stdout('Skipping link {} because it is an interwiki link.'.format(link_text))
139                      is_interwiki = True
140                      break
141          if is_interwiki:
# Line 124 | Line 143 | def scan_for_iw_links(page_text):
143          
144          # If we still haven't turned this match into a URL, something's gone wrong
145          if (found_iw_match == False) or (iw_url == ""):
146 <            pywikibot.output('ERROR: Couldn\'t figure out link {}. Aborting script.'.format(link_text))
146 >            pywikibot.stdout('ERROR: Couldn\'t figure out link {}. Aborting script.'.format(link_text))
147              quit()
148  
149          # Test the URL
150          iw_url = iw_url.replace(' ', '_')
151 <        #pywikibot.output('Reading page at {}...'.format(iw_url))
151 >        #pywikibot.stdout('Reading page at {}...'.format(iw_url))
152          response = fetch(iw_url)
153  
154 <        # Redirects are followed automatically by fetch() and treated as "200"s, so the
155 <        # way we tell that a redirect occurred is by checking the history
154 >        # Redirects are followed automatically by fetch() and treated as "200"s; the way we can
155 >        # tell that a redirect occurred is by checking fetch's history
156          if response.history != []:
157 <            pywikibot.output('WARNING: Redirected from {}.'.format(response.history))
158 <            problems_found = problems_found + 1
157 >            pywikibot.stdout('WARNING: Got redirection code ({0}) on URL "{1}".'.format(response.history[0], iw_url))
158 >            warnings_issued = warnings_issued + 1
159          elif response.status_code != 200:
160 <            #pywikibot.output('WARNING: Got response code {}.'.format(response.status_code)) # commented out because fetch() already prints such a msg
161 <            problems_found = problems_found + 1
160 >            pywikibot.stdout('WARNING: Got response code {0} on URL {1}.'.format(response.status_code, iw_url))
161 >            warnings_issued = warnings_issued + 1
162          else:
163              # Isolate section link
164              pre_section, section_name = link_text.split('#', 1)
165 <            #pywikibot.output('Searching for section link {} on page.'.format(section_name))
165 >            #pywikibot.stdout('Searching for section link {} on page.'.format(section_name))
166              
167              # Convert slash character to the dot-notation hex encoding that MediaWiki uses
168              section_name = section_name.replace('/', '.2F')
# Line 154 | Line 173 | def scan_for_iw_links(page_text):
173              for span_tag in soup.findAll('span'):
174                  span_name = span_tag.get('id', None)
175                  if span_name == section_name:
176 <                    #pywikibot.output('Found section!')
176 >                    #pywikibot.stdout('Found section!')
177                      found_section = True
178                      break
179              if found_section == False:
180 <                pywikibot.output('ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section))
181 <                problems_found = problems_found + 1
180 >                pywikibot.stdout('ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section))
181 >                errors_issued = errors_issued + 1
182  
183   def main(*args):
184      cat_name = ''
# Line 193 | Line 212 | def main(*args):
212  
213      global pages_checked
214      global iw_found
215 <    global problems_found
216 <    pywikibot.stdout('Checked {0} page(s) and found {1} intrawiki link(s) with {2} section link problem(s).'.format(pages_checked, iw_found, problems_found))
215 >    global advice_issued
216 >    global warnings_issued
217 >    global errors_issued
218 >
219 >    page_str = "pages"
220 >    if pages_checked == 1:
221 >        page_str = "page"
222 >
223 >    link_str = "links"
224 >    if iw_found == 1:
225 >        link_str = "link"
226 >
227 >    pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
228 >    pywikibot.stdout('While attempting to follow section links...')
229 >
230 >    if advice_issued == 0:
231 >        pywikibot.stdout('  No advice on potential problems was issued.')
232 >    elif advice_issued == 1:
233 >        pywikibot.stdout('  1 piece of advice on a potential problem was issued.')
234 >    else:
235 >        pywikibot.stdout('  {} pieces of advice on potential problems were issued.'.format(advice_issued))
236 >
237 >    warning_str = "warnings were"
238 >    if warnings_issued == 1:
239 >        warning_str = "warning was"
240 >    pywikibot.stdout('  {0} {1} issued.'.format(warnings_issued, warning_str))
241 >
242 >    error_str = "errors were"
243 >    if errors_issued == 1:
244 >        error_str = "error was"
245 >    pywikibot.stdout('  {0} {1} encountered.'.format(errors_issued, error_str))
246  
247   if __name__ == '__main__':
248      main()

Diff Legend

Removed lines
+ Added lines
< Changed lines (old)
> Changed lines (new)