root/Oni2/ValBot/Python/check_interwiki_links.py
Revision: 1170
Committed: Mon Mar 21 21:22:33 2022 UTC by iritscen
Content type: text/x-python
File size: 6920 byte(s)
Log Message:
ValBot: check_interwiki_links.py: Added code header, improved output, improved section name detection in target page.

File Contents

# Check Interwiki Links
# by iritscen@yahoo.com
# Looks at each link on a page (or in all the pages in a category) which uses a registered
# interwiki prefix and loads the linked page, verifying that it exists and that any section
# link, if present, is valid as well. The output will use the word "ERROR" when it cannot
# validate the interwiki link.
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|

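# Usage sketch (inferred from the argument parsing in main() below; assumes a
# Pywikibot installation configured for the target wiki):
#   python check_interwiki_links.py -page:"Some Page"
#   python check_interwiki_links.py -cat:"Some Category"
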
import os

from urllib.parse import urljoin

import pywikibot
import re
import requests

from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot import pagegenerators
from pywikibot.tools.formatter import color_format
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot
from bs4 import BeautifulSoup

# Parallel arrays based on https://wiki.oni2.net/Special:Interwiki
interwiki_prefixes = ('acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw', 'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp')

interwiki_urls = ('http://www.acronymfinder.com/~/search/af.aspx?string=exact&Acronym=', 'http://www.google.com/search?q=cache:', 'https://commons.wikimedia.org/wiki/', 'http://www.dict.org/bin/Dict?Database=*&Form=Dict1&Strategy=*&Query=', 'http://www.google.com/search?q=', 'https://meta.wikimedia.org/wiki/', 'https://www.mediawiki.org/wiki/', 'https://en.wikibooks.org/wiki/', 'https://www.wikidata.org/wiki/', 'https://foundation.wikimedia.org/wiki/', 'https://en.wikinews.org/wiki/', 'https://en.wikipedia.org/wiki/', 'https://en.wikiquote.org/wiki/', 'https://wikisource.org/wiki/', 'https://species.wikimedia.org/wiki/', 'https://en.wikiversity.org/wiki/', 'https://en.wikivoyage.org/wiki/', 'https://en.wiktionary.org/wiki/', 'https://en.wiktionary.org/wiki/', 'https://en.wikipedia.org/wiki/')

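# The two tuples must stay index-aligned: for example, interwiki_prefixes[11]
# is 'wikipedia', so a link like [[wikipedia:Oni]] is resolved against
# interwiki_urls[11] and tested as https://en.wikipedia.org/wiki/Oni
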
pages_checked = 0
iw_found = 0
errors_issued = 0

# Searches the given page text for interwiki links
def scan_for_iw_links(page_text):
    global pages_checked
    global iw_found
    global errors_issued
    pages_checked = pages_checked + 1
    cur = 0

    for prefix in interwiki_prefixes:
        # Isolate strings that start with "[[prefix:" and end with "|" or "]"
        iw_link = r"\[\[" + prefix + r":[^|\]]*(\||\])"
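        # For example, with prefix 'wp' this pattern matches wikitext such as
        # "[[wp:Konoko]" or "[[wp:Konoko|", i.e. up to the first "|" or "]"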
        for match in re.finditer(iw_link, page_text):
            # Extract just the page title from this regex match
            s = match.start() + 2 + len(prefix) + 1
            e = match.end() - 1

            # Sometimes we used a space char. instead of a '_', so fix that before querying
            page_title = page_text[s:e].replace(' ', '_')

            # Construct full URL for the particular wiki
            iw_url = interwiki_urls[cur] + page_title
            pywikibot.stdout(' Validating {0} link "{1}"'.format(prefix, page_title))
            iw_found = iw_found + 1

            # Adjust URL if this is a foreign-language WP link
            if re.match("^[a-zA-Z]{2}:", page_title):
                lang_code = page_title[0:2] + "."
                # "wp:" is the Wikipedia: namespace, not a language
                if lang_code != "wp." and lang_code != "WP.":
                    iw_url = iw_url.replace('en.', lang_code)
                    iw_url = iw_url.replace(page_title[0:3], '')

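            # For example, a link like [[wp:fr:Paris]] reaches this point as
            # page_title "fr:Paris"; the two replace() calls above turn
            # https://en.wikipedia.org/wiki/fr:Paris into
            # https://fr.wikipedia.org/wiki/Paris
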
            # Test the URL
            #pywikibot.stdout(' Testing URL "{}"'.format(iw_url))
            response = fetch(iw_url)

            # Redirects are followed automatically by fetch() and treated as "200"s, so the
            # way we tell that a redirect occurred is by checking the history
            if response.history != []:
                pywikibot.stdout(' ERROR: Got redirection code ({0}) on URL "{1}".'.format(response.history[0].status_code, iw_url))
                errors_issued = errors_issued + 1
            elif response.status_code != 200:
                pywikibot.stdout(' ERROR: Got response code {0} on URL "{1}".'.format(response.status_code, iw_url))
                errors_issued = errors_issued + 1
            elif '#' in page_title:
                # Isolate section link (split only on the first '#' in case the
                # anchor itself contains one)
                page_name, anchor_name = page_title.split('#', 1)

                # Convert dot-notation hex entities to proper characters
                anchor_name = anchor_name.replace('.22', '"')
                anchor_name = anchor_name.replace('.27', '\'')
                anchor_name = anchor_name.replace('.28', '(')
                anchor_name = anchor_name.replace('.29', ')')

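                # For example, an anchor written as The_.22Big.22_One is checked
                # against the target page as The_"Big"_One
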
                # Read linked page to see if it really has this anchor link
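                # (MediaWiki, at least in the versions this script targets, emits
                # each section heading with an anchor such as
                # <span class="mw-headline" id="Section_name">, which is why
                # scanning <span> ids below locates the section)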
                soup = BeautifulSoup(response.text, 'html.parser')
                found_section = False
                for span_tag in soup.find_all('span'):
                    span_name = span_tag.get('id', None)
                    if span_name == anchor_name:
                        #pywikibot.stdout('Found section!')
                        found_section = True
                        break
                if not found_section:
                    pywikibot.stdout(' ERROR: Could not find section {0} on page {1}.'.format(anchor_name, page_name))
                    errors_issued = errors_issued + 1
        cur = cur + 1

def main(*args):
    cat_name = ''
    page_name = ''

    local_args = pywikibot.handle_args(args)
    genFactory = pagegenerators.GeneratorFactory()

    for arg in local_args:
        if arg.startswith('-cat:'):
            cat_name = arg[5:]
        elif arg.startswith('-page:'):
            page_name = arg[6:]

    site = pywikibot.Site()

    #pywikibot.stdout('The members of the requests.models.Response class are:')
    #pywikibot.stdout(format(dir(requests.models.Response)))

    if cat_name != '':
        cat_obj = pywikibot.Category(site, cat_name)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
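        # PreloadingGenerator fetches page text in batches (up to 100 pages per
        # request here) rather than one page at a time, cutting API round-trips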
        for page in pagegenerators.PreloadingGenerator(generator, 100):
            pywikibot.stdout('Checking page "{}"'.format(page.title()))
            scan_for_iw_links(page.text)
    elif page_name != '':
        page = pywikibot.Page(site, page_name)
        pywikibot.stdout('Checking page "{}"'.format(page.title()))
        scan_for_iw_links(page.text)

    global pages_checked
    global iw_found
    global errors_issued

    page_str = "pages"
    if pages_checked == 1:
        page_str = "page"

    link_str = "links"
    if iw_found == 1:
        link_str = "link"

    pywikibot.stdout('Checked {0} {1} and found {2} interwiki {3}.'.format(pages_checked, page_str, iw_found, link_str))

    error_str = "errors were"
    if errors_issued == 1:
        error_str = "error was"

    pywikibot.stdout('{0} {1} encountered in validating these links.'.format(errors_issued, error_str))

if __name__ == '__main__':
    main()