# Check Interwiki Links
# by iritscen@yahoo.com
# Looks at each link on a page (or in all the pages in a category) which uses a registered
# interwiki prefix and loads the linked page, verifying that it exists and that any section
# link, if present, is valid as well. The output will use the word "ERROR" when it cannot
# validate the interwiki link.
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|
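
# Usage sketch (the script filename below is an assumption; use whatever name this
# file is saved under):
#   python check_interwiki_links.py -page:"Page name"
#   python check_interwiki_links.py -cat:"Category name"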

import re

import pywikibot
from pywikibot import pagegenerators
from pywikibot.comms.http import fetch
from bs4 import BeautifulSoup

# Parallel arrays based on https://wiki.oni2.net/Special:Interwiki
interwiki_prefixes = ('acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw', 'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp')

interwiki_urls = ('http://www.acronymfinder.com/~/search/af.aspx?string=exact&Acronym=', 'http://www.google.com/search?q=cache:', 'https://commons.wikimedia.org/wiki/', 'http://www.dict.org/bin/Dict?Database=*&Form=Dict1&Strategy=*&Query=', 'http://www.google.com/search?q=', 'https://meta.wikimedia.org/wiki/', 'https://www.mediawiki.org/wiki/', 'https://en.wikibooks.org/wiki/', 'https://www.wikidata.org/wiki/', 'https://foundation.wikimedia.org/wiki/', 'https://en.wikinews.org/wiki/', 'https://en.wikipedia.org/wiki/', 'https://en.wikiquote.org/wiki/', 'https://wikisource.org/wiki/', 'https://species.wikimedia.org/wiki/', 'https://en.wikiversity.org/wiki/', 'https://en.wikivoyage.org/wiki/', 'https://en.wiktionary.org/wiki/', 'https://en.wiktionary.org/wiki/', 'https://en.wikipedia.org/wiki/')
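
# Sanity check (added as a defensive sketch): the tuples above are parallel arrays,
# so a length mismatch would silently pair a prefix with the wrong URL
assert len(interwiki_prefixes) == len(interwiki_urls), 'interwiki tables are out of sync'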

pages_checked = 0
iw_found = 0
errors_issued = 0

# Searches the given page text for interwiki links
def scan_for_iw_links(page_text):
    global pages_checked
    global iw_found
    global errors_issued
    pages_checked += 1
    cur = 0

    for prefix in interwiki_prefixes:
        # Isolate strings that start with "[[prefix:" and end with "|" or "]",
        # e.g. "[[wp:Example]]" or "[[wp:Example|some text]]"
        iw_link = r"\[\[" + prefix + r":[^|\]]*(\||\])"
        for match in re.finditer(iw_link, page_text):
            # Extract just the page title from this regex match
            s = match.start() + 2 + len(prefix) + 1
            e = match.end() - 1

            # Sometimes we used a space char. instead of a '_', so fix that before querying
            page_title = page_text[s:e].replace(' ', '_')

            # Construct full URL for the particular wiki
            iw_url = interwiki_urls[cur] + page_title
            pywikibot.stdout(' Validating {0} link "{1}"'.format(prefix, page_title))
            iw_found += 1

            # Adjust URL if this is a foreign-language WP link
            if re.match("^[a-zA-Z]{2}:", page_title):
                lang_code = page_title[0:2] + "."
                # "wp:" is the Wikipedia: namespace, not a language
                if lang_code != "wp." and lang_code != "WP.":
                    # Limit each replacement to the first occurrence so a title that
                    # happens to contain "en." or the language prefix is not mangled
                    iw_url = iw_url.replace('en.', lang_code, 1)
                    iw_url = iw_url.replace(page_title[0:3], '', 1)
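                    # Worked example (illustrative values, not from the source): the
                    # link "[[wp:fr:Paris]]" yields page_title "fr:Paris", so the URL
                    # "https://en.wikipedia.org/wiki/fr:Paris" is rewritten to
                    # "https://fr.wikipedia.org/wiki/Paris"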

            # Test the URL
            #pywikibot.stdout(' Testing URL "{}"'.format(iw_url))
            response = fetch(iw_url)

            # Redirects are followed automatically by fetch() and treated as "200"s, so the
            # way we tell that a redirect occurred is by checking the history
            if response.history:
                pywikibot.stdout(' ERROR: Got redirection code ({0}) on URL "{1}".'.format(response.history[0].status_code, iw_url))
                errors_issued += 1
            elif response.status_code != 200:
                pywikibot.stdout(' ERROR: Got response code {0} on URL "{1}".'.format(response.status_code, iw_url))
                errors_issued += 1
            elif '#' in page_title:
                # Isolate section link (split only on the first '#' to be safe)
                page_name, anchor_name = page_title.split('#', 1)

                # Convert dot-notation hex entities to proper characters
                anchor_name = anchor_name.replace('.22', '"')
                anchor_name = anchor_name.replace('.27', '\'')
                anchor_name = anchor_name.replace('.28', '(')
                anchor_name = anchor_name.replace('.29', ')')
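
                # A more general decoding sketch (not enabled; the replacements above
                # cover the characters encountered so far): MediaWiki dot-encodes anchor
                # bytes as ".XX" hex, so any encoded byte could be handled with:
                #   anchor_name = re.sub(r'\.([0-9A-F]{2})',
                #                        lambda m: chr(int(m.group(1), 16)), anchor_name)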

                # Read linked page to see if it really has this anchor link
                soup = BeautifulSoup(response.text, 'html.parser')
                found_section = False
                for span_tag in soup.find_all('span'):
                    span_name = span_tag.get('id', None)
                    if span_name == anchor_name:
                        #pywikibot.stdout('Found section!')
                        found_section = True
                        break
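                # Broader alternative sketch (an assumption about the wiki's HTML, not
                # part of the original approach): anchor ids can appear on elements
                # other than <span> in some MediaWiki skins, so one could instead check:
                #   found_section = soup.find(id=anchor_name) is not None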
                if not found_section:
                    pywikibot.stdout(' ERROR: Could not find section {0} on page {1}.'.format(anchor_name, page_name))
                    errors_issued += 1
        cur += 1

def main(*args):
    cat_name = ''
    page_name = ''

    local_args = pywikibot.handle_args(args)

    # Read the remaining command-line arguments for the category or page to check
    for arg in local_args:
        if arg.startswith('-cat:'):
            cat_name = arg[5:]
        elif arg.startswith('-page:'):
            page_name = arg[6:]

    site = pywikibot.Site()

    if cat_name != '':
        cat_obj = pywikibot.Category(site, cat_name)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        # Preload pages in batches of 100 to cut down on API round-trips
        for page in pagegenerators.PreloadingGenerator(generator, 100):
            pywikibot.stdout('Checking page "{}"'.format(page.title()))
            scan_for_iw_links(page.text)
    elif page_name != '':
        page = pywikibot.Page(site, page_name)
        pywikibot.stdout('Checking page "{}"'.format(page.title()))
        scan_for_iw_links(page.text)
    else:
        pywikibot.stdout('No -cat or -page argument given; nothing to check.')
        return

    global pages_checked
    global iw_found
    global errors_issued

    page_str = "pages"
    if pages_checked == 1:
        page_str = "page"

    link_str = "links"
    if iw_found == 1:
        link_str = "link"

    pywikibot.stdout('Checked {0} {1} and found {2} interwiki {3}.'.format(pages_checked, page_str, iw_found, link_str))

    error_str = "errors were"
    if errors_issued == 1:
        error_str = "error was"

    pywikibot.stdout('{0} {1} encountered in validating these links.'.format(errors_issued, error_str))

if __name__ == '__main__':
    main()