ValBot/Python/check_intrawiki_section_links.py

# Check Intrawiki Section Links
# by iritscen@yahoo.com
# Looks at each wikilink on a page (or on all the pages in a category) for a section link ('#'),
# then loads the linked page and verifies that the named section actually exists. The output
# uses the keywords ADVICE, WARNING or ERROR depending on the nature of the issue encountered.
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- --|

import os

from urllib.parse import urljoin

import pywikibot
import re

from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot import pagegenerators
from pywikibot.tools.formatter import color_format
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot
from bs4 import BeautifulSoup

# Names of the namespaces that exist on OniGalore, used to recognize intrawiki links
intrawiki_prefixes = (
    'Image', 'Special', 'Talk',
    'User', 'User_talk',
    'OniGalore', 'OniGalore_talk',
    'File', 'File_talk',
    'MediaWiki', 'MediaWiki_talk',
    'Template', 'Template_talk',
    'Help', 'Help_talk',
    'Category', 'Category_talk',
    'BSL', 'BSL_talk',
    'OBD', 'OBD_talk',
    'AE', 'AE_talk',
    'Oni2', 'Oni2_talk',
    'XML', 'XML_talk',
)

# Base URL under which the wiki's pages are requested
onigalore_url = 'https://wiki.oni2.net/'

# Prefixes that denote links to other wikis/sites; such links are not checked
interwiki_prefixes = (
    'acronym', 'cache', 'commons', 'dictionary', 'google',
    'metawikimedia', 'mw',
    'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia',
    'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage',
    'wikt', 'wiktionary', 'wp',
)

# Running totals reported in the summary at the end of the run
pages_checked = iw_found = advice_issued = warnings_issued = errors_issued = 0

# Title of the page currently being scanned (basis for relative and bare section links)
page_name = ''

# Searches the given page text for intrawiki links with section links in them
def scan_for_iw_links(page_text):
    """Check every section link ("[[Page#Section]]") found in page_text.

    Each wikilink containing a '#' is turned into a URL on the wiki at onigalore_url
    (relative "/" and "../" links and bare "#Section" links are resolved against the
    global page_name), the linked page is fetched, and its HTML is searched for a <span>
    whose id equals the section name. Problems are printed through pywikibot.stdout()
    using the keywords ADVICE, WARNING or ERROR, and the module-level counters
    pages_checked, iw_found, advice_issued, warnings_issued and errors_issued are updated.

    Parameters:
        page_text: wikitext of the page whose title is stored in the global page_name.

    Returns:
        None. Results are reported on the console and through the counters.
    """
    global pages_checked
    global iw_found
    global advice_issued
    global warnings_issued
    global errors_issued
    global page_name
    pages_checked = pages_checked + 1

    # Isolate strings of pattern "[[anything]]", "[[any:thing]]", "[[any|thing]]" or
    # "[[any:thi|ng]]". A raw string is used so the backslashes reach the regex engine
    # untouched instead of being treated as (invalid) string escape sequences.
    iw_link = r"\[\[[^|\]]*(\||\])"
    for match in re.finditer(iw_link, page_text):
        # Page against which a relative or bare section link gets resolved
        page_name2 = page_name

        # Cut out the matched text from the page, and in the process remove the "[[" from the
        # front and the "|" or "]" from the end
        link_text = page_text[match.start() + 2:match.end() - 1]

        # Sometimes we used a space char. instead of a '_', so fix that before querying
        link_text = link_text.replace(' ', '_')

        # If this link doesn't have a section link in it, then we don't care about it, as
        # MediaWiki takes care of checking basic intrawiki links
        if '#' not in link_text:
            continue

        # If there is a '{' in the link, then probably it's a link built on transcluded text
        # like "Quotes/Diary#{{C3}}", which we cannot expand and work with, so skip it
        if '{' in link_text:
            pywikibot.stdout('ADVICE: Link {} seems to use transclusion, so it can\'t be verified automatically. You should check it manually.'.format(link_text))
            advice_issued = advice_issued + 1
            continue

        # If this is a relative "/" link, use the current page as the basis for the URL. Note
        # that only a leading slash is looked for, so if there's multiple steps down ("/x/y"),
        # we're out of luck.
        if link_text.startswith('/'):
            link_text = page_name + link_text

        # If this is a relative "../" link, find the parent page and set ourselves to that page,
        # then remove the relative portion of the link. Note that this is only performed once,
        # so if there's multiple steps back ("../../"), we're out of luck.
        if link_text.startswith('../'):
            last_slash = page_name.rfind('/')
            if last_slash == -1:
                # Without this guard the slice below would quietly chop the last character
                # off the page name and produce a URL for a page nobody linked to
                pywikibot.stdout('ERROR: Link {0} is relative to a parent page, but {1} is not a subpage.'.format(link_text, page_name))
                errors_issued = errors_issued + 1
                continue
            page_name2 = page_name[:last_slash]
            link_text = link_text[3:]
            # If this is now going to be a bare section link for the parent page, don't add a
            # slash, otherwise do because we are drilling down to another subpage
            if link_text.startswith('#'):
                link_text = page_name2 + link_text
            else:
                link_text = page_name2 + '/' + link_text

        # Work out which URL to load; iw_url stays empty if the link can't be classified
        iw_url = ""
        if link_text.startswith('#'):
            # A bare section link points at the page being scanned
            iw_url = onigalore_url + page_name2
            link_text = page_name2 + link_text
        elif not re.search(":.*#", link_text):
            # If there's no ":" in the link (before the section link, where a colon would just
            # be part of the text) then it's a Main namespace article
            iw_url = onigalore_url + link_text
        else:
            # Compare only the text before the first colon (ignoring a leading ":" as in
            # "[[:Category:X#Y]]") so that a prefix appearing later in the link, as in
            # "wp:User:X#Y", cannot be mistaken for the link's own namespace. Spaces are
            # normalized because page_name may have contributed a name like "User talk".
            namespace = link_text.lstrip(':').split(':', 1)[0].replace(' ', '_')
            if namespace in interwiki_prefixes:
                # Interwiki links lead off-site, so they are ignored
                continue
            if namespace in intrawiki_prefixes:
                iw_url = onigalore_url + link_text

        # If we still haven't turned this match into a URL, something's gone wrong; count it
        # so that the final summary agrees with the messages printed
        if not iw_url:
            pywikibot.stdout('ERROR: Couldn\'t figure out link {}.'.format(link_text))
            errors_issued = errors_issued + 1
            continue
        iw_found = iw_found + 1

        # Test the URL
        iw_url = iw_url.replace(' ', '_')
        response = fetch(iw_url)

        # Redirects are followed automatically by fetch() and treated as "200"s; the way we can
        # tell that a redirect occurred is by checking fetch's history
        if response.history:
            pywikibot.stdout('WARNING: Got redirection code ({0}) on URL "{1}".'.format(response.history[0], iw_url))
            warnings_issued = warnings_issued + 1
            continue
        if response.status_code != 200:
            pywikibot.stdout('WARNING: Got response code {0} on URL {1}.'.format(response.status_code, iw_url))
            warnings_issued = warnings_issued + 1
            continue

        # Isolate section link
        pre_section, section_name = link_text.split('#', 1)

        # Convert slash character to the dot-notation hex encoding that MediaWiki uses
        section_name = section_name.replace('/', '.2F')

        # Read linked page to see if it really has this anchor link
        soup = BeautifulSoup(response.text, 'html.parser')
        found_section = any(span_tag.get('id', None) == section_name
                            for span_tag in soup.findAll('span'))
        if not found_section:
            pywikibot.stdout('ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section))
            errors_issued = errors_issued + 1

def main(*args):
    """Parse the command line, scan the requested page(s) and print a summary.

    Arguments left over after pywikibot.handle_args() has processed them:
        -cat:<name>   check every page in the category, with recurse=True
        -page:<name>  check a single page
    If both are given, -cat takes precedence. If neither is given, a usage hint is
    printed and nothing is checked.

    The global page_name is set to the title of each page before it is handed to
    scan_for_iw_links(), which uses it to resolve relative and bare section links.
    """
    global page_name

    cat_name = ''
    for arg in pywikibot.handle_args(args):
        if arg.startswith('-cat:'):
            cat_name = arg[5:]
        elif arg.startswith('-page:'):
            page_name = arg[6:]

    # Without a target there is nothing to scan, so say so instead of printing an empty summary
    if cat_name == '' and page_name == '':
        pywikibot.stdout('Nothing to check: please pass -cat:<category name> or -page:<page name>.')
        return

    site = pywikibot.Site()

    if cat_name != '':
        cat_obj = pywikibot.Category(site, cat_name)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        for page in pagegenerators.PreloadingGenerator(generator, 100):
            page_name = page.title()
            pywikibot.stdout('Checking page {0}'.format(page_name))
            scan_for_iw_links(page.text)
    else:
        page = pywikibot.Page(site, page_name)
        # Use the title as reported by the page object, exactly as the category branch does,
        # rather than the raw command-line text
        page_name = page.title()
        pywikibot.stdout('Checking page {0}'.format(page_name))
        scan_for_iw_links(page.text)

    # The counters below are only read here, so no "global" declarations are needed
    page_str = "page" if pages_checked == 1 else "pages"
    link_str = "link" if iw_found == 1 else "links"

    pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
    pywikibot.stdout('While attempting to follow section links...')

    if advice_issued == 0:
        pywikibot.stdout('  No advice on potential problems was issued.')
    elif advice_issued == 1:
        pywikibot.stdout('  1 piece of advice on a potential problem was issued.')
    else:
        pywikibot.stdout('  {} pieces of advice on potential problems were issued.'.format(advice_issued))

    warning_str = "warning was" if warnings_issued == 1 else "warnings were"
    pywikibot.stdout('  {0} {1} issued.'.format(warnings_issued, warning_str))

    error_str = "error was" if errors_issued == 1 else "errors were"
    pywikibot.stdout('  {0} {1} encountered.'.format(errors_issued, error_str))

# Run the check only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()
Revision:	1173
Committed:	Tue Jun 28 22:06:29 2022 UTC (3 years, 3 months ago) by iritscen
Content type:	text/x-python
File size:	11112 byte(s)
Log Message:	ValBot: check_intrawiki_section_links.py won't quit when a link cannot be understood; it will just move on. find_external_images.py is now polished and robust.
#	Content
1	# Check Intrawiki Section Links
2	# by iritscen@yahoo.com
3	# Looks at each wikilink on a page (or in all the pages in a category) for a section link ('#'),
4	# and loads the linked page and verifies that the named section actually exists. The output will
5	# use the keywords ADVICE, WARNING or ERROR depending on the nature of issue that it encounters.
6	# Recommended viewing width:
7	# \|---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- --\|
8
9	import os
10
11	from urllib.parse import urljoin
12
13	import pywikibot
14	import re
15
16	from pywikibot.bot import QuitKeyboardInterrupt
17	from pywikibot import pagegenerators
18	from pywikibot.tools.formatter import color_format
19	from pywikibot.comms.http import fetch
20	from pywikibot.specialbots import UploadRobot
21	from bs4 import BeautifulSoup
22
23	# Array of OniGalore's namespaces
24	intrawiki_prefixes = ('Image', 'Special', 'Talk', 'User', 'User_talk', 'OniGalore', 'OniGalore_talk', 'File', 'File_talk', 'MediaWiki', 'MediaWiki_talk', 'Template', 'Template_talk', 'Help', 'Help_talk', 'Category', 'Category_talk', 'BSL', 'BSL_talk', 'OBD', 'OBD_talk', 'AE', 'AE_talk', 'Oni2', 'Oni2_talk', 'XML', 'XML_talk')
25
26	# URL for main namespace of our wiki
27	onigalore_url = 'https://wiki.oni2.net/'
28
29	# Interwiki prefixes, for ruling out these links
30	interwiki_prefixes = ('acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw', 'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp')
31
32	pages_checked = 0
33	iw_found = 0
34	advice_issued = 0
35	warnings_issued = 0
36	errors_issued = 0
37	page_name = ''
38
39	# Searches the given page text for intrawiki links with section links in them
40	def scan_for_iw_links(page_text):
41	global pages_checked
42	global iw_found
43	global advice_issued
44	global warnings_issued
45	global errors_issued
46	global page_name
47	pages_checked = pages_checked + 1
48
49	# Isolate strings of pattern "[[anything]]", "[[any:thing]]", "[[any\|thing]]" or
50	# "[[any:thi\|ng]]"
51	iw_link = "\[\[[^\|\]]*(\\|\|\])"
52	for match in re.finditer(iw_link, page_text):
53	found_iw_match = False
54	iw_url = ""
55	page_name2 = page_name
56
57	# Cut out the matched text from the page, and in the process remove the "[[" from the
58	# front and the "\|" or "]" from the end
59	s = match.start() + 2
60	e = match.end() - 1
61	link_text = page_text[s:e]
62
63	# Sometimes we used a space char. instead of a '_', so fix that before querying
64	link_text = link_text.replace(' ', '_')
65	#pywikibot.stdout('Found link {0}.'.format(link_text))
66
67	# If this link doesn't have a section link in it, then we don't care about it, as
68	# MediaWiki takes care of checking basic intrawiki links
69	if not '#' in link_text:
70	#pywikibot.stdout('Link doesn\'t have a section anchor in it. Skipping.')
71	continue
72
73	# If there is a '{' in the link, then probably it's a link built on transcluded text
74	# like "Quotes/Diary#{{C3}}", which we cannot expand and work with, so skip it
75	if '{' in link_text:
76	pywikibot.stdout('ADVICE: Link {} seems to use transclusion, so it can\'t be verified automatically. You should check it manually.'.format(link_text))
77	advice_issued = advice_issued + 1
78	continue
79
80	# If this is a relative "/" link, use the current page as the basis for the URL. Note
81	# that only a leading slash is looked for, so if there's multiple steps down ("/x/y"),
82	# we're out of luck.
83	if link_text.startswith('/'):
84	link_text = page_name + link_text
85	#pywikibot.stdout('Changed link_text to {} on account of "/".'.format(link_text))
86
87	# If this is a relative "../" link, find the parent page and set ourselves to that page,
88	# then remove the relative portion of the link. Note that this is only performed once,
89	# so if there's multiple steps back ("../../"), we're out of luck.
90	if link_text.startswith('../'):
91	last_slash = page_name.rfind('/')
92	page_name2 = page_name[0:last_slash]
93	#pywikibot.stdout('Changed page_name to {} on account of "../".'.format(page_name2))
94	link_text = link_text[3:len(link_text)]
95	#pywikibot.stdout('Changed link_text to {} on account of "../".'.format(link_text))
96	# If this is now going to be a bare section link for the parent page, don't add a
97	# slash, otherwise do because we are drilling down to another subpage
98	if link_text.startswith('#'):
99	link_text = page_name2 + link_text
100	else:
101	link_text = page_name2 + '/' + link_text
102
103	# If this is a bare section link, build URL based on this page
104	if link_text.startswith('#'):
105	iw_url = onigalore_url + page_name2
106	iw_found = iw_found + 1
107	#pywikibot.stdout('Found link to this very page, {}.'.format(link_text))
108	found_iw_match = True
109	link_text = page_name2 + link_text
110
111	# If there's no ":" in the link (before the section link, where a colon would just be
112	# part of the text) then it's a Main namespace article, so construct URL
113	if found_iw_match == False:
114	if not re.search(":.*#", link_text):
115	iw_url = onigalore_url + link_text
116	iw_found = iw_found + 1
117	#pywikibot.stdout('Found link to OniGalore Main namespace page {}.'.format(link_text))
118	found_iw_match = True
119
120	# If there is a ":", match the prefix against the intrawiki prefixes on OniGalore
121	if found_iw_match == False:
122	for prefix in intrawiki_prefixes:
123	#pywikibot.stdout('Comparing link against prefix {}.'.format(prefix))
124	if prefix + ":" in link_text:
125	iw_url = onigalore_url + link_text
126	_, post_ns = link_text.split(':', 1)
127	#pywikibot.stdout('Found link to OniGalore {0} namespace page {1}.'.format(prefix, post_ns))
128	iw_found = iw_found + 1
129	found_iw_match = True
130	break
131
132	# If we didn't match the prefix against any intrawiki prefixes, see if it matches
133	# against an interwiki prefix; if so, this link can be ignored
134	is_interwiki = False
135	if found_iw_match == False:
136	for prefix in interwiki_prefixes:
137	if prefix + ":" in link_text:
138	#pywikibot.stdout('Skipping link {} because it is an interwiki link.'.format(link_text))
139	is_interwiki = True
140	break
141	if is_interwiki:
142	continue
143
144	# If we still haven't turned this match into a URL, something's gone wrong
145	if (found_iw_match == False) or (iw_url == ""):
146	pywikibot.stdout('ERROR: Couldn\'t figure out link {}.'.format(link_text))
147	continue
148
149	# Test the URL
150	iw_url = iw_url.replace(' ', '_')
151	#pywikibot.stdout('Reading page at {}...'.format(iw_url))
152	response = fetch(iw_url)
153
154	# Redirects are followed automatically by fetch() and treated as "200"s; the way we can
155	# tell that a redirect occurred is by checking fetch's history
156	if response.history != []:
157	pywikibot.stdout('WARNING: Got redirection code ({0}) on URL "{1}".'.format(response.history[0], iw_url))
158	warnings_issued = warnings_issued + 1
159	elif response.status_code != 200:
160	pywikibot.stdout('WARNING: Got response code {0} on URL {1}.'.format(response.status_code, iw_url))
161	warnings_issued = warnings_issued + 1
162	else:
163	# Isolate section link
164	pre_section, section_name = link_text.split('#', 1)
165	#pywikibot.stdout('Searching for section link {} on page.'.format(section_name))
166
167	# Convert slash character to the dot-notation hex encoding that MediaWiki uses
168	section_name = section_name.replace('/', '.2F')
169
170	# Read linked page to see if it really has this anchor link
171	soup = BeautifulSoup(response.text, 'html.parser')
172	found_section = False
173	for span_tag in soup.findAll('span'):
174	span_name = span_tag.get('id', None)
175	if span_name == section_name:
176	#pywikibot.stdout('Found section!')
177	found_section = True
178	break
179	if found_section == False:
180	pywikibot.stdout('ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section))
181	errors_issued = errors_issued + 1
182
183	def main(*args):
184	cat_name = ''
185	global page_name
186
187	local_args = pywikibot.handle_args(args)
188	genFactory = pagegenerators.GeneratorFactory()
189
190	for arg in local_args:
191	if arg.startswith('-cat:'):
192	cat_name = arg[5:]
193	elif arg.startswith('-page:'):
194	page_name = arg[6:]
195
196	site = pywikibot.Site()
197
198	# This line of code enumerates the methods in the 'page' class
199	#pywikibot.stdout(format(dir(page)))
200
201	if cat_name != '':
202	cat_obj = pywikibot.Category(site, cat_name)
203	generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
204	for page in pagegenerators.PreloadingGenerator(generator, 100):
205	pywikibot.stdout('Checking page {0}'.format(page.title()))
206	page_name = page.title()
207	scan_for_iw_links(page.text)
208	elif page_name != '':
209	page = pywikibot.Page(site, page_name)
210	pywikibot.stdout('Checking page {0}'.format(page.title()))
211	scan_for_iw_links(page.text)
212
213	global pages_checked
214	global iw_found
215	global advice_issued
216	global warnings_issued
217	global errors_issued
218
219	page_str = "pages"
220	if pages_checked == 1:
221	page_str = "page"
222
223	link_str = "links"
224	if iw_found == 1:
225	link_str = "link"
226
227	pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
228	pywikibot.stdout('While attempting to follow section links...')
229
230	if advice_issued == 0:
231	pywikibot.stdout(' No advice on potential problems was issued.')
232	elif advice_issued == 1:
233	pywikibot.stdout(' 1 piece of advice on a potential problem was issued.')
234	else:
235	pywikibot.stdout(' {} pieces of advice on potential problems were issued.'.format(advice_issued))
236
237	warning_str = "warnings were"
238	if warnings_issued == 1:
239	warning_str = "warning was"
240	pywikibot.stdout(' {0} {1} issued.'.format(warnings_issued, warning_str))
241
242	error_str = "errors were"
243	if errors_issued == 1:
244	error_str = "error was"
245	pywikibot.stdout(' {0} {1} encountered.'.format(errors_issued, error_str))
246
247	if __name__ == '__main__':
248	main()