ViewVC Help
View File | Revision Log | View Changeset | Root Listing
root/Oni2/ValBot/Python/check_intrawiki_section_links.py
(Generate patch)

Comparing ValBot/Python/check_intrawiki_section_links.py (file contents):
Revision 1169 by iritscen, Mon Feb 21 23:59:20 2022 UTC vs.
Revision 1176 by iritscen, Sun Sep 25 23:58:33 2022 UTC

# Line 1 | Line 1
1 + # Check Intrawiki Section Links
2 + # by iritscen@yahoo.com
3 + # Looks at each wikilink on a page (or in all the pages in a category) for a section link ('#'),
4 + # and loads the linked page and verifies that the named section actually exists. The output will
5 + # use the keywords ADVICE, WARNING or ERROR depending on the nature of the issue that it encounters.
6 + # Recommended viewing width:
7 + # |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- --|
8 +
9   import os
10  
11   from urllib.parse import urljoin
# Line 23 | Line 31 | interwiki_prefixes = ('acronym', 'cache'
31  
32   pages_checked = 0
33   iw_found = 0
34 < problems_found = 0
34 > advice_issued = 0
35 > warnings_issued = 0
36 > errors_issued = 0
37   page_name = ''
38  
39   # Searches the given page text for intrawiki links with section links in them
40   def scan_for_iw_links(page_text):
41      global pages_checked
42      global iw_found
43 <    global problems_found
43 >    global advice_issued
44 >    global warnings_issued
45 >    global errors_issued
46      global page_name
47      pages_checked = pages_checked + 1
48  
# Line 50 | Line 62 | def scan_for_iw_links(page_text):
62  
63          # Sometimes we used a space char. instead of a '_', so fix that before querying
64          link_text = link_text.replace(' ', '_')
65 <        #pywikibot.output('Found link {0}.'.format(link_text))
65 >        #pywikibot.stdout('Found link {0}.'.format(link_text))
66          
67          # If this link doesn't have a section link in it, then we don't care about it, as
68          # MediaWiki takes care of checking basic intrawiki links
69          if not '#' in link_text:
70 <            #pywikibot.output('Link doesn\'t have a section anchor in it. Skipping.')
70 >            #pywikibot.stdout('Link doesn\'t have a section anchor in it. Skipping.')
71 >            continue
72 >
73 >        # If this link has an interwiki prefix, it can be ignored
74 >        is_interwiki = False
75 >        if found_iw_match == False:
76 >            for prefix in interwiki_prefixes:
77 >                if prefix + ":" in link_text:
78 >                    #pywikibot.stdout('Skipping link {} because it is an interwiki link.'.format(link_text))
79 >                    is_interwiki = True
80 >                    break
81 >        if is_interwiki:
82              continue
83          
84          # If there is a '{' in the link, then probably it's a link built on transcluded text
85          # like "Quotes/Diary#{{C3}}", which we cannot expand and work with, so skip it
86          if '{' in link_text:
87 <            pywikibot.output('ADVICE: Link {} seems to use transclusion, so it can\'t be verified automatically. You should check it manually.'.format(link_text))
87 >            pywikibot.stdout('ADVICE: Link {} seems to use transclusion, so it can\'t be verified automatically. You should check it manually.'.format(link_text))
88 >            advice_issued = advice_issued + 1
89              continue
90 <        
91 <        # If this is a relative "../" link, find the parent page and set ourselves to that
92 <        # page, then remove the relative portion of the link. Note that this is only performed
93 <        # once, so if there's multiple steps back ("../../"), we're out of luck.
90 >
91 >        # If this is a relative "/" link, use the current page as the basis for the URL. Note
92 >        # that only a leading slash is looked for, so if there are multiple steps down
93 >        # ("/x/y"), we're out of luck.
94 >        if link_text.startswith('/'):
95 >            link_text = page_name + link_text
96 >            #pywikibot.stdout('Changed link_text to {} on account of "/".'.format(link_text))
97 >        
98 >        # If this is a relative "../" link, find the parent page and set ourselves to that page,
99 >        # then remove the relative portion of the link. Note that this is only performed once,
100 >        # so if there are multiple steps back ("../../"), we're out of luck.
101          if link_text.startswith('../'):
102              last_slash = page_name.rfind('/')
103              page_name2 = page_name[0:last_slash]
104 <            #pywikibot.output('Changed page_name to {} on account of "../".'.format(page_name2))
104 >            #pywikibot.stdout('Changed page_name to {} on account of "../".'.format(page_name2))
105              link_text = link_text[3:len(link_text)]
106 <            #pywikibot.output('Changed link_text to {} on account of "../".'.format(link_text))
107 <            # If this is now going to be a bare section link for the parent page, don't add
108 <            # a slash, otherwise do because we are drilling down to another subpage
106 >            #pywikibot.stdout('Changed link_text to {} on account of "../".'.format(link_text))
107 >            # If this is now going to be a bare section link for the parent page, don't add a
108 >            # slash, otherwise do because we are drilling down to another subpage
109              if link_text.startswith('#'):
110                  link_text = page_name2 + link_text
111              else:
# Line 84 | Line 115 | def scan_for_iw_links(page_text):
115          if link_text.startswith('#'):
116              iw_url = onigalore_url + page_name2
117              iw_found = iw_found + 1
118 <            #pywikibot.output('Found link to this very page, {}.'.format(link_text))
118 >            #pywikibot.stdout('Found link to this very page, {}.'.format(link_text))
119              found_iw_match = True
120              link_text = page_name2 + link_text
121          
122          # If there's no ":" in the link (before the section link, where a colon would just be
123          # part of the text) then it's a Main namespace article, so construct URL
93        #if not ':' in link_text:
124          if found_iw_match == False:
125              if not re.search(":.*#", link_text):
126                  iw_url = onigalore_url + link_text
127                  iw_found = iw_found + 1
128 <                #pywikibot.output('Found link to OniGalore Main namespace page {}.'.format(link_text))
128 >                #pywikibot.stdout('Found link to OniGalore Main namespace page {}.'.format(link_text))
129                  found_iw_match = True
130              
131          # If there is a ":", match the prefix against the intrawiki prefixes on OniGalore
132          if found_iw_match == False:
133              for prefix in intrawiki_prefixes:
134 <                #pywikibot.output('Comparing link against prefix {}.'.format(prefix))
134 >                #pywikibot.stdout('Comparing link against prefix {}.'.format(prefix))
135                  if prefix + ":" in link_text:
136                      iw_url = onigalore_url + link_text
137                      _, post_ns = link_text.split(':', 1)
138 <                    #pywikibot.output('Found link to OniGalore {0} namespace page {1}.'.format(prefix, post_ns))
138 >                    #pywikibot.stdout('Found link to OniGalore {0} namespace page {1}.'.format(prefix, post_ns))
139                      iw_found = iw_found + 1
140                      found_iw_match = True
141                      break
142          
113        # If we didn't match the prefix against any intrawiki prefixes, see if it matches
114        # against an interwiki prefix; if so, this link can be ignored
115        is_interwiki = False
116        if found_iw_match == False:
117            for prefix in interwiki_prefixes:
118                if prefix + ":" in link_text:
119                    #pywikibot.output('Skipping link {} because it is an interwiki link.'.format(link_text))
120                    is_interwiki = True
121                    break
122        if is_interwiki:
123            continue
124        
143          # If we still haven't turned this match into a URL, something's gone wrong
144          if (found_iw_match == False) or (iw_url == ""):
145 <            pywikibot.output('ERROR: Couldn\'t figure out link {}. Aborting script.'.format(link_text))
146 <            quit()
145 >            pywikibot.stdout('ERROR: Couldn\'t figure out link {}.'.format(link_text))
146 >            continue
147  
148          # Test the URL
149          iw_url = iw_url.replace(' ', '_')
150 <        #pywikibot.output('Reading page at {}...'.format(iw_url))
150 >        #pywikibot.stdout('Reading page at {}...'.format(iw_url))
151          response = fetch(iw_url)
152  
153 <        # Redirects are followed automatically by fetch() and treated as "200"s, so the
154 <        # way we tell that a redirect occurred is by checking the history
153 >        # Redirects are followed automatically by fetch() and treated as "200"s; the way we can
154 >        # tell that a redirect occurred is by checking fetch's history
155          if response.history != []:
156 <            pywikibot.output('WARNING: Redirected from {}.'.format(response.history))
157 <            problems_found = problems_found + 1
156 >            pywikibot.stdout('WARNING: Got redirection code ({0}) on URL "{1}".'.format(response.history[0], iw_url))
157 >            warnings_issued = warnings_issued + 1
158          elif response.status_code != 200:
159 <            #pywikibot.output('WARNING: Got response code {}.'.format(response.status_code)) # commented out because fetch() already prints such a msg
160 <            problems_found = problems_found + 1
159 >            pywikibot.stdout('WARNING: Got response code {0} on URL {1}.'.format(response.status_code, iw_url))
160 >            warnings_issued = warnings_issued + 1
161          else:
162              # Isolate section link
163              pre_section, section_name = link_text.split('#', 1)
164 <            #pywikibot.output('Searching for section link {} on page.'.format(section_name))
164 >            #pywikibot.stdout('Searching for section link {} on page.'.format(section_name))
165              
166              # Convert slash character to the dot-notation hex encoding that MediaWiki uses
167              section_name = section_name.replace('/', '.2F')
# Line 154 | Line 172 | def scan_for_iw_links(page_text):
172              for span_tag in soup.findAll('span'):
173                  span_name = span_tag.get('id', None)
174                  if span_name == section_name:
175 <                    #pywikibot.output('Found section!')
175 >                    #pywikibot.stdout('Found section!')
176                      found_section = True
177                      break
178              if found_section == False:
179 <                pywikibot.output('ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section))
180 <                problems_found = problems_found + 1
179 >                pywikibot.stdout('ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section))
180 >                errors_issued = errors_issued + 1
181  
182   def main(*args):
183      cat_name = ''
# Line 193 | Line 211 | def main(*args):
211  
212      global pages_checked
213      global iw_found
214 <    global problems_found
215 <    pywikibot.stdout('Checked {0} page(s) and found {1} intrawiki link(s) with {2} section link problem(s).'.format(pages_checked, iw_found, problems_found))
214 >    global advice_issued
215 >    global warnings_issued
216 >    global errors_issued
217 >
218 >    page_str = "pages"
219 >    if pages_checked == 1:
220 >        page_str = "page"
221 >
222 >    link_str = "links"
223 >    if iw_found == 1:
224 >        link_str = "link"
225 >
226 >    pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
227 >    pywikibot.stdout('While attempting to follow section links...')
228 >
229 >    if advice_issued == 0:
230 >        pywikibot.stdout('  No advice on potential problems was issued.')
231 >    elif advice_issued == 1:
232 >        pywikibot.stdout('  1 piece of advice on a potential problem was issued.')
233 >    else:
234 >        pywikibot.stdout('  {} pieces of advice on potential problems were issued.'.format(advice_issued))
235 >
236 >    warning_str = "warnings were"
237 >    if warnings_issued == 1:
238 >        warning_str = "warning was"
239 >    pywikibot.stdout('  {0} {1} issued.'.format(warnings_issued, warning_str))
240 >
241 >    error_str = "errors were"
242 >    if errors_issued == 1:
243 >        error_str = "error was"
244 >    pywikibot.stdout('  {0} {1} encountered.'.format(errors_issued, error_str))
245  
246   if __name__ == '__main__':
247      main()

Diff Legend

Removed lines
+ Added lines
< Changed lines (old)
> Changed lines (new)