
Comparing ValBot/Python/check_interwiki_links.py (file contents):
Revision 1174 by iritscen, Tue Jun 28 22:11:41 2022 UTC vs.
Revision 1180 by iritscen, Fri Apr 28 00:54:21 2023 UTC

# Line 27 | Line 27 | interwiki_prefixes = ('acronym', 'cache'
27  
28   interwiki_urls = ('http://www.acronymfinder.com/~/search/af.aspx?string=exact&Acronym=', 'http://www.google.com/search?q=cache:', 'https://commons.wikimedia.org/wiki/', 'http://www.dict.org/bin/Dict?Database=*&Form=Dict1&Strategy=*&Query=', 'http://www.google.com/search?q=', 'https://meta.wikimedia.org/wiki/', 'https://www.mediawiki.org/wiki/', 'https://en.wikibooks.org/wiki/', 'https://www.wikidata.org/wiki/', 'https://foundation.wikimedia.org/wiki/', 'https://en.wikinews.org/wiki/', 'https://en.wikipedia.org/wiki/', 'https://en.wikiquote.org/wiki/', 'https://wikisource.org/wiki/', 'https://species.wikimedia.org/wiki/', 'https://en.wikiversity.org/wiki/', 'https://en.wikivoyage.org/wiki/', 'https://en.wiktionary.org/wiki/', 'https://en.wiktionary.org/wiki/', 'https://en.wikipedia.org/wiki/')
29  
30 + # Initialize globals
31 + debug = 0
32   pages_checked = 0
33   iw_found = 0
34   errors_issued = 0
35  
36   # Searches the given page text for interwiki links
37 < def scan_for_iw_links(page_text):
38 <    global pages_checked
39 <    global iw_found
40 <    global errors_issued
41 <    pages_checked = pages_checked + 1
42 <    cur = 0
43 <
44 <    for prefix in interwiki_prefixes:
45 <        # Isolate strings that start with "[[prefix:" and end with "|" or "]"
46 <        iw_link = "\[\[" + prefix + ":[^|\]]*(\||\])"
47 <        for match in re.finditer(iw_link, page_text):
48 <            # Extract just the page title from this regex match
49 <            s = match.start() + 2 + len(prefix) + 1
50 <            e = match.end() - 1
51 <
52 <            # Sometimes we used a space char. instead of a '_', so fix that before querying
53 <            page_title = page_text[s:e].replace(' ', '_')
54 <
55 <            # Use only spaces for title when printing it
56 <            page_title_human = page_title.replace('_', ' ')
57 <            pywikibot.stdout('   Validating {0} link "{1}"'.format(prefix, page_title_human))
58 <            iw_found = iw_found + 1
59 <
60 <            # Construct full URL for the particular wiki
61 <            iw_url = interwiki_urls[cur] + page_title
62 <
63 <            # Adjust URL if this is a foreign-language WP link
64 <            if re.match("^[a-zA-Z]{2}:", page_title):
65 <                lang_code = page_title[0:2] + "."
66 <                # "wp:" is the Wikipedia: namespace, not a language
67 <                if lang_code != "wp." and lang_code != "WP.":
68 <                    iw_url = iw_url.replace('en.', lang_code)
69 <                    iw_url = iw_url.replace(page_title[0:3], '')
70 <
71 <            # Test the URL
72 <            response = fetch(iw_url)
73 <
74 <            # One way we tell that a redirect occurred is by checking the history
75 <            if response.history != []:
76 <                pywikibot.stdout('   ERROR: Got redirection code ({0}) on URL "{1}".'.format(response.history[0], iw_url))
77 <                errors_issued = errors_issued + 1
78 <            elif response.status_code != 200:
79 <                pywikibot.stdout('   ERROR: Got response code {0} on URL "{1}".'.format(response.status_code, iw_url))
80 <                errors_issued = errors_issued + 1
81 <            # The usual way that a redirect occurs is that MediaWiki redirects us sneakily
82 <            # using JavaScript, while returning code OK 200 as if the link was correct; we
83 <            # must detect this from the page source
84 <            elif 'Redirected from <a' in response.text:
85 <                # Extract link from this source which contains name of redirected-to page:
86 <                # <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/>
87 <                canonical_name = response.text.split('<link rel="canonical" href="')[-1]
88 <                prefix_length = len(interwiki_urls[cur])
89 <                canonical_name = canonical_name[prefix_length:]
90 <                tag_end = canonical_name.find('"/>')
91 <                if tag_end == -1:
92 <                   pywikibot.stdout('   ERROR: This is a redirect page (but I could not isolate the correct page name).')
93 <                else:
94 <                   canonical_name = canonical_name[:tag_end]
95 <                   if len(canonical_name) > 100:
96 <                      # Certain things can cause the trim to fail; here we avoid slamming
97 <                      # the output with massive page source from a failed trim
98 <                      pywikibot.stdout('   ERROR: This is a redirect to "{}" (string trimmed to 100 chars due to excessive length).'.format(canonical_name[:100]))
99 <                   else:
100 <                      canonical_name = canonical_name.replace('_', ' ')
101 <                      pywikibot.stdout('   ERROR: This is a redirect to "{}".'.format(canonical_name))
102 <                errors_issued = errors_issued + 1
103 <            elif '#' in page_title:
104 <                # Isolate section link
105 <                page_name, anchor_name = page_title.split('#')
106 <                
107 <                # Convert dot-notation hex entities to proper characters
108 <                anchor_name = anchor_name.replace('.22', '"')
109 <                anchor_name = anchor_name.replace('.27', '\'')
110 <                anchor_name = anchor_name.replace('.28', '(')
111 <                anchor_name = anchor_name.replace('.29', ')')
112 <                
113 <                # Read linked page to see if it really has this anchor link
114 <                soup = BeautifulSoup(response.text, 'html.parser')
115 <                found_section = False
116 <                for span_tag in soup.findAll('span'):
117 <                    span_name = span_tag.get('id', None)
118 <                    if span_name == anchor_name:
119 <                        found_section = True
120 <                        break
121 <                if found_section == False:
122 <                    pywikibot.stdout('   ERROR: Could not find section {0} on page {1}.'.format(anchor_name, page_name))
123 <                    errors_issued = errors_issued + 1
124 <        cur = cur + 1
37 > def scan_for_interwiki_links(page_text, page_name):
38 >   global debug
39 >   global pages_checked
40 >   global iw_found
41 >   global errors_issued
42 >   pages_checked = pages_checked + 1
43 >   cur = 0
44 >   name_printed = 0
45 >
46 >   for prefix in interwiki_prefixes:
47 >      # Isolate strings that start with "[[prefix:" and end with "|" or "]"
48 >      iw_link = r"\[\[" + prefix + r":[^|\]]*(\||\])"
49 >      for match in re.finditer(iw_link, page_text):
50 >         # Extract just the page title from this regex match
51 >         s = match.start() + 2 + len(prefix) + 1
52 >         e = match.end() - 1
53 >
54 >         # Sometimes we used a space char. instead of a '_', so fix that before querying
55 >         page_title = page_text[s:e].replace(' ', '_')
56 >
57 >         # Use only spaces for title when printing it
58 >         page_title_human = page_title.replace('_', ' ')
59 >         if debug: pywikibot.stdout('   Validating {0} link "{1}"'.format(prefix, page_title_human))
60 >         iw_found = iw_found + 1
61 >
62 >         # Construct full URL for the particular wiki
63 >         iw_url = interwiki_urls[cur] + page_title
64 >
65 >         # Adjust URL if this is a foreign-language WP link
66 >         if re.match("^[a-zA-Z]{2}:", page_title):
67 >            lang_code = page_title[0:2] + "."
68 >            # "wp:" is the Wikipedia: namespace, not a language
69 >            if lang_code != "wp." and lang_code != "WP.":
70 >               iw_url = iw_url.replace('en.', lang_code)
71 >               iw_url = iw_url.replace(page_title[0:3], '')
72 >
73 >         # Test the URL
74 >         response = fetch(iw_url)
75 >
76 >         # One way we tell that a redirect occurred is by checking the history
77 >         if response.history != []:
78 >            if not name_printed and not debug:
79 >               pywikibot.stdout('From page "{}":'.format(page_name))
80 >               name_printed = 1
81 >            if page_title.startswith('WP:') and page_title == page_title.upper():
82 >               pywikibot.stdout('   ERROR: Got redirection code ({0}) for {1} link "{2}", but this appears to be a deliberate use of a Wikipedia shortcut. You should check the link manually.'.format(response.history[0].status_code, prefix, page_title))
83 >            else:
84 >               pywikibot.stdout('   ERROR: Got redirection code ({0}) for {1} link "{2}". You should check the link manually.'.format(response.history[0].status_code, prefix, page_title))
85 >            errors_issued = errors_issued + 1
86 >         elif response.status_code != 200:
87 >            if not name_printed and not debug:
88 >               pywikibot.stdout('From page "{}":'.format(page_name))
89 >               name_printed = 1
90 >            pywikibot.stdout('   ERROR: Got response code {0} for {1} link "{2}". The page may not exist.'.format(response.status_code, prefix, page_title))
91 >            errors_issued = errors_issued + 1
92 >         # The usual way that a redirect occurs is that MediaWiki redirects us sneakily
93 >         # using JavaScript, while returning code 200 (OK) as if the link were correct; we
94 >         # must detect this from the page source
95 >         elif 'Redirected from <a' in response.text:
96 >            if not name_printed and not debug:
97 >               pywikibot.stdout('From page "{}":'.format(page_name))
98 >               name_printed = 1
99 >            # Extract the canonical link from the page source; it names the redirected-to page:
100 >            # <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/>
101 >            canonical_name = response.text.split('<link rel="canonical" href="')[-1]
102 >            prefix_length = len(interwiki_urls[cur])
103 >            canonical_name = canonical_name[prefix_length:]
104 >            tag_end = canonical_name.find('"/>')
105 >            if tag_end == -1:
106 >               pywikibot.stdout('   ERROR: The {0} link "{1}" is a redirect page, but this script could not isolate the target page name.'.format(prefix, page_title))
107 >            else:
108 >               canonical_name = canonical_name[:tag_end]
109 >               if len(canonical_name) > 100:
110 >                 # Certain things can cause the trim to fail; here we avoid slamming
111 >                 # the output with massive page source from a failed trim
112 >                 pywikibot.stdout('   ERROR: The {0} link "{1}" is a redirect to "{2}…" (string trimmed to 100 chars).'.format(prefix, page_title, canonical_name[:100]))
113 >               else:
114 >                 canonical_name = canonical_name.replace('_', ' ')
115 >                 pywikibot.stdout('   ERROR: The {0} link "{1}" is a redirect to "{2}".'.format(prefix, page_title, canonical_name))
116 >            errors_issued = errors_issued + 1
117 >         elif '#' in page_title:
118 >            # Isolate section link
119 >            target_page_name, anchor_name = page_title.split('#')
120 >            
121 >            # Convert dot-notation hex entities to proper characters
122 >            anchor_name = anchor_name.replace('.22', '"')
123 >            anchor_name = anchor_name.replace('.27', '\'')
124 >            anchor_name = anchor_name.replace('.28', '(')
125 >            anchor_name = anchor_name.replace('.29', ')')
126 >            
127 >            # Read linked page to see if it really has this anchor link
128 >            soup = BeautifulSoup(response.text, 'html.parser')
129 >            found_section = False
130 >            for span_tag in soup.find_all('span'):
131 >               span_name = span_tag.get('id', None)
132 >               if span_name == anchor_name:
133 >                  found_section = True
134 >                  break
135 >            if not found_section:
136 >               if not name_printed and not debug:
137 >                  pywikibot.stdout('From page "{}":'.format(page_name))
138 >                  name_printed = 1
139 >               target_page_name_human = target_page_name.replace('_', ' ')
140 >               pywikibot.stdout('   ERROR: Could not find section "{0}" on {1} page "{2}".'.format(anchor_name, prefix, target_page_name_human))
141 >               errors_issued = errors_issued + 1
142 >      cur = cur + 1
143  
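
A note on the loop above: scan_for_interwiki_links() walks the parallel tuples interwiki_prefixes and interwiki_urls, keeping them in step by hand with the counter cur. A minimal sketch of the same pairing using zip(), which drops the counter; the two-entry tuples below are stand-ins for the script's full lists:

   prefixes = ('wikipedia', 'wiktionary')
   urls = ('https://en.wikipedia.org/wiki/', 'https://en.wiktionary.org/wiki/')
   for prefix, url_base in zip(prefixes, urls):
      # Each pass sees one prefix together with its matching base URL
      print('{0} links resolve against {1}'.format(prefix, url_base))
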
144   def main(*args):
145 <    cat_name = ''
146 <    page_name = ''
145 >   global debug
146 >   search_cat = ''
147 >   search_page = ''
148 >
149 >   local_args = pywikibot.handle_args(args)
150 >   genFactory = pagegenerators.GeneratorFactory()
151 >
152 >   for arg in local_args:
153 >      if arg.startswith('-cat:'):
154 >         search_cat = arg[5:]
155 >      elif arg.startswith('-page:'):
156 >         search_page = arg[6:]
157 >      elif arg == '-dbg':
158 >         debug = 1
159 >      else:
160 >         pywikibot.stdout('Unknown argument "{}".'.format(arg))
161 >         return
162 >
163 >   site = pywikibot.Site()
164 >
165 >   #pywikibot.stdout('The members of the requests.models.Response class are:')
166 >   #pywikibot.stdout(format(dir(requests.models.Response)))
167 >
168 >   if search_cat != '':
169 >      cat_obj = pywikibot.Category(site, search_cat)
170 >      generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
171 >      for page in pagegenerators.PreloadingGenerator(generator, 100):
172 >         if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
173 >         scan_for_interwiki_links(page.text, page.title())
174 >   elif search_page != '':
175 >      page = pywikibot.Page(site, search_page)
176 >      if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
177 >      scan_for_interwiki_links(page.text, page.title())
178 >
179 >   global pages_checked
180 >   global iw_found
181 >   global errors_issued
182 >
183 >   page_str = "pages"
184 >   if pages_checked == 1:
185 >      page_str = "page"
186 >
187 >   link_str = "links"
188 >   if iw_found == 1:
189 >      link_str = "link"
190 >
191 >   pywikibot.stdout('Checked {0} {1} and found {2} interwiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
192 >
193 >   error_str = "errors were"
194 >   if errors_issued == 1:
195 >      error_str = "error was"
196  
197 <    local_args = pywikibot.handle_args(args)
129 <    genFactory = pagegenerators.GeneratorFactory()
130 <
131 <    for arg in local_args:
132 <        if arg.startswith('-cat:'):
133 <            cat_name = arg[5:]
134 <        elif arg.startswith('-page:'):
135 <            page_name = arg[6:]
136 <
137 <    site = pywikibot.Site()
138 <
139 <    #pywikibot.stdout('The members of the requests.models.Response class are:')
140 <    #pywikibot.stdout(format(dir(requests.models.Response)))
141 <
142 <    if cat_name != '':
143 <        cat_obj = pywikibot.Category(site, cat_name)
144 <        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
145 <        for page in pagegenerators.PreloadingGenerator(generator, 100):
146 <            pywikibot.stdout('Checking page "{}"'.format(page.title()))
147 <            scan_for_iw_links(page.text)
148 <    elif page_name != '':
149 <        page = pywikibot.Page(site, page_name)
150 <        pywikibot.stdout('Checking page "{}"'.format(page.title()))
151 <        scan_for_iw_links(page.text)
152 <
153 <    global pages_checked
154 <    global iw_found
155 <    global errors_issued
156 <
157 <    page_str = "pages"
158 <    if pages_checked == 1:
159 <        page_str = "page"
160 <
161 <    link_str = "links"
162 <    if iw_found == 1:
163 <        link_str = "link"
164 <
165 <    pywikibot.stdout('Checked {0} {1} and found {2} interwiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
166 <
167 <    error_str = "errors were"
168 <    if errors_issued == 1:
169 <        error_str = "error was"
170 <
171 <    pywikibot.stdout('{0} {1} encountered in validating these links.'.format(errors_issued, error_str))
197 >   pywikibot.stdout('{0} {1} encountered in validating these links.'.format(errors_issued, error_str))
198  
199   if __name__ == '__main__':
200 <    main()
200 >   main()
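
A note on the "sneaky" redirect detection in scan_for_interwiki_links(): the script isolates the redirect target by splitting the page source around the canonical <link> tag. A minimal sketch of the same extraction using BeautifulSoup, which the script already imports; redirect_target() and its parameters are illustrative names, not part of ValBot:

   from bs4 import BeautifulSoup

   def redirect_target(page_source, wiki_url_prefix):
      # Look for <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/>
      soup = BeautifulSoup(page_source, 'html.parser')
      link_tag = soup.find('link', attrs={'rel': 'canonical'})
      if link_tag is None or not link_tag.get('href', '').startswith(wiki_url_prefix):
         return None
      # Strip the wiki's base URL, leaving just the redirected-to page name
      return link_tag['href'][len(wiki_url_prefix):].replace('_', ' ')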

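Similarly, the anchor-name replacements (.22, .27, .28, .29) cover only the characters encountered on the wiki so far. MediaWiki's legacy anchor encoding is general: each problem byte becomes '.' plus two uppercase hex digits, like percent-encoding with '.' in place of '%'. A sketch of a generalized decoder, with caveats: it handles single ASCII bytes only (multi-byte UTF-8 sequences such as .C3.A9 would need byte-level decoding), and it would also rewrite a literal '.XX' that was never an encoding. decode_legacy_anchor() is an illustrative name:

   import re

   def decode_legacy_anchor(anchor_name):
      # Turn each '.XX' hex pair back into its ASCII character, so that
      # '.22' becomes '"' and '.28' becomes '('
      return re.sub(r'\.([0-9A-F]{2})',
                    lambda m: chr(int(m.group(1), 16)), anchor_name)
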
Diff Legend

- Removed lines
+ Added lines
< Changed lines (old)
> Changed lines (new)