ViewVC Help
View File | Revision Log | View Changeset | Root Listing
root/Oni2/ValBot/Python/check_interwiki_links.py
(Generate patch)

Comparing ValBot/Python/check_interwiki_links.py (file contents):
Revision 1169 by iritscen, Mon Feb 21 23:59:20 2022 UTC vs.
Revision 1170 by iritscen, Mon Mar 21 21:22:33 2022 UTC

# Line 1 | Line 1
1 + # Check Interwiki Links
2 + # by iritscen@yahoo.com
3 + # Looks at each link on a page (or in all the pages in a category) which uses a registered
4 + # interwiki prefix and loads the linked page, verifying that it exists and that any section
5 + # link, if present, is valid as well. The output will use the word "ERROR" when it cannot
6 + # validate the interwiki link.
7 + # Recommended viewing width:
8 + # |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|
9 +
10   import os
11  
12   from urllib.parse import urljoin
13  
14   import pywikibot
15   import re
16 + import requests
17  
18   from pywikibot.bot import QuitKeyboardInterrupt
19   from pywikibot import pagegenerators
# Line 19 | Line 29 | interwiki_urls = ('http://www.acronymfin
29  
30   pages_checked = 0
31   iw_found = 0
32 < problems_found = 0
32 > errors_issued = 0
33  
34   # Searches the given page text for interwiki links
35   def scan_for_iw_links(page_text):
36      global pages_checked
37      global iw_found
38 <    global problems_found
38 >    global errors_issued
39      pages_checked = pages_checked + 1
40      cur = 0
41  
# Line 42 | Line 52 | def scan_for_iw_links(page_text):
52  
53              # Construct full URL for the particular wiki
54              iw_url = interwiki_urls[cur] + page_title
55 <            pywikibot.output('Found {0} link {1}.'.format(prefix, page_title))
55 >            pywikibot.stdout('   Validating {0} link "{1}"'.format(prefix, page_title))
56              iw_found = iw_found + 1
57  
58              # Adjust URL if this is a foreign-language WP link
# Line 54 | Line 64 | def scan_for_iw_links(page_text):
64                      iw_url = iw_url.replace(page_title[0:3], '')
65  
66              # Test the URL
67 <            #pywikibot.output('Testing URL {}...'.format(iw_url))
67 >            #pywikibot.stdout('   Testing URL "{}"'.format(iw_url))
68              response = fetch(iw_url)
69  
70              # Redirects are followed automatically by fetch() and treated as "200"s, so the
71              # way we tell that a redirect occurred is by checking the history
72              if response.history != []:
73 <                pywikibot.output('WARNING: Redirected from {}.'.format(response.history))
74 <                problems_found = problems_found + 1
73 >                pywikibot.stdout('   ERROR: Got redirection code ({0}) on URL "{1}".'.format(response.history[0], iw_url))
74 >                errors_issued = errors_issued + 1
75              elif response.status_code != 200:
76 <                #pywikibot.output('WARNING: Got response code {}.'.format(response.status_code)) # commented out because fetch() already prints such a msg
77 <                problems_found = problems_found + 1
76 >                pywikibot.stdout('   ERROR: Got response code {0} on URL "{1}".'.format(response.status_code, iw_url))
77 >                errors_issued = errors_issued + 1
78              elif '#' in page_title:
79                  # Isolate section link
70                pywikibot.output('Detected section link on page {0}.'.format(page_title))
80                  page_name, anchor_name = page_title.split('#')
81                  
82                  # Convert dot-notation hex entities to proper characters
# Line 79 | Line 88 | def scan_for_iw_links(page_text):
88                  # Read linked page to see if it really has this anchor link
89                  soup = BeautifulSoup(response.text, 'html.parser')
90                  found_section = False
91 <                for tag in soup.findAll('a'):
92 <                    link = tag.get('href', None)
93 <                    if not link:
94 <                        #pywikibot.output('It is not a link.')
86 <                        continue
87 <                    #pywikibot.output('Got link {0}.'.format(link))
88 <                    if not link.startswith('#'):
89 <                        continue
90 <                        
91 <                    if link == '#' + anchor_name:
92 <                        pywikibot.output('Found section link!')
91 >                for span_tag in soup.findAll('span'):
92 >                    span_name = span_tag.get('id', None)
93 >                    if span_name == anchor_name:
94 >                        #pywikibot.stdout('Found section!')
95                          found_section = True
96                          break
97                  if found_section == False:
98 <                    pywikibot.output('Could not find section {0} on page {1}.'.format(anchor_name, page_name))
99 <                    problems_found = problems_found + 1
98 >                    pywikibot.stdout('   ERROR: Could not find section {0} on page {1}.'.format(anchor_name, page_name))
99 >                    errors_issued = errors_issued + 1
100          cur = cur + 1
101  
102   def main(*args):
# Line 112 | Line 114 | def main(*args):
114  
115      site = pywikibot.Site()
116  
117 <    # This line of code enumerates the methods in the 'page' class
118 <    #pywikibot.stdout(format(dir(page)))
117 >    #pywikibot.stdout('The members of the requests.models.Response class are:')
118 >    #pywikibot.stdout(format(dir(requests.models.Response)))
119  
120      if cat_name != '':
121          cat_obj = pywikibot.Category(site, cat_name)
122          generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
123          for page in pagegenerators.PreloadingGenerator(generator, 100):
124 <            pywikibot.stdout('Checking page {0}'.format(page.title()))
124 >            pywikibot.stdout('Checking page "{}"'.format(page.title()))
125              scan_for_iw_links(page.text)
126      elif page_name != '':
127          page = pywikibot.Page(site, page_name)
128 <        pywikibot.stdout('Checking page {0}'.format(page.title()))
128 >        pywikibot.stdout('Checking page "{}"'.format(page.title()))
129          scan_for_iw_links(page.text)
130  
131      global pages_checked
132      global iw_found
133 <    global problems_found
134 <    pywikibot.stdout('Checked {0} page(s) and found {1} interwiki link(s) with {2} problem(s).'.format(pages_checked, iw_found, problems_found))
133 >    global errors_issued
134 >
135 >    page_str = "pages"
136 >    if pages_checked == 1:
137 >        page_str = "page"
138 >
139 >    link_str = "links"
140 >    if iw_found == 1:
141 >        link_str = "link"
142 >
143 >    pywikibot.stdout('Checked {0} {1} and found {2} interwiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
144 >
145 >    error_str = "errors were"
146 >    if errors_issued == 1:
147 >        error_str = "error was"
148 >
149 >    pywikibot.stdout('{0} {1} encountered in validating these links.'.format(errors_issued, error_str))
150  
151   if __name__ == '__main__':
152      main()

Diff Legend

- Removed lines
+ Added lines
< Changed lines (old)
> Changed lines (new)