# Check Intrawiki Section Links
# by iritscen@yahoo.com
# Looks at each wikilink on a page (or in all the pages in a category) for a section link ('#'),
# and loads the linked page and verifies that the named section actually exists. The output will
# use the keywords ADVICE, WARNING or ERROR depending on the nature of the issue it encounters.
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- --|
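#
# Usage (per the argument handling in main() below): pass "-cat:Category_name" to check every page
# in that category (and its subcategories), or "-page:Page_name" to check a single page. The usual
# Pywikibot global arguments are also accepted via pywikibot.handle_args().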

import os

from urllib.parse import urljoin

import pywikibot
import re

from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot import pagegenerators
from pywikibot.tools.formatter import color_format
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot
from bs4 import BeautifulSoup

# Tuple of OniGalore's namespace prefixes
intrawiki_prefixes = ('Image', 'Special', 'Talk', 'User', 'User_talk', 'OniGalore', 'OniGalore_talk', 'File', 'File_talk', 'MediaWiki', 'MediaWiki_talk', 'Template', 'Template_talk', 'Help', 'Help_talk', 'Category', 'Category_talk', 'BSL', 'BSL_talk', 'OBD', 'OBD_talk', 'AE', 'AE_talk', 'Oni2', 'Oni2_talk', 'XML', 'XML_talk')

# URL for the main namespace of our wiki
onigalore_url = 'https://wiki.oni2.net/'

# Interwiki prefixes, used to rule out links that point to external wikis
interwiki_prefixes = ('acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw', 'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp')

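# Running totals for the summary that main() prints at the end, plus the name of the page being checked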
pages_checked = 0
iw_found = 0
advice_issued = 0
warnings_issued = 0
errors_issued = 0
page_name = ''

# Searches the given page text for intrawiki links with section links in them
def scan_for_iw_links(page_text):
    global pages_checked
    global iw_found
    global advice_issued
    global warnings_issued
    global errors_issued
    global page_name
    pages_checked += 1

    # Isolate strings of pattern "[[anything]]", "[[any:thing]]", "[[any|thing]]" or
    # "[[any:thi|ng]]"
    iw_link = r"\[\[[^|\]]*(\||\])"
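    # (The "[^|\]]*" part stops at the first '|' or ']', so only the link target, never any display
    # text after a pipe, ends up in the match.)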
    for match in re.finditer(iw_link, page_text):
        found_iw_match = False
        iw_url = ""
        page_name2 = page_name

        # Cut out the matched text from the page, and in the process remove the "[[" from the
        # front and the "|" or "]" from the end
        s = match.start() + 2
        e = match.end() - 1
        link_text = page_text[s:e]

        # Sometimes we used a space char. instead of a '_', so fix that before querying
        link_text = link_text.replace(' ', '_')
        #pywikibot.stdout('Found link {0}.'.format(link_text))

        # If this link doesn't have a section link in it, then we don't care about it, as
        # MediaWiki takes care of checking basic intrawiki links
        if '#' not in link_text:
            #pywikibot.stdout('Link doesn\'t have a section anchor in it. Skipping.')
            continue

        # If this link has an interwiki prefix, it can be ignored
        is_interwiki = False
        if not found_iw_match:
            for prefix in interwiki_prefixes:
                if prefix + ":" in link_text:
                    #pywikibot.stdout('Skipping link {} because it is an interwiki link.'.format(link_text))
                    is_interwiki = True
                    break
        if is_interwiki:
            continue

        # If there is a '{' in the link, then probably it's a link built on transcluded text
        # like "Quotes/Diary#{{C3}}", which we cannot expand and work with, so skip it
        if '{' in link_text:
            pywikibot.stdout('ADVICE: Link {} seems to use transclusion, so it can\'t be verified automatically. You should check it manually.'.format(link_text))
            advice_issued += 1
            continue

        # If this is a relative "/" link, use the current page as the basis for the URL. Note
        # that only a leading slash is looked for, so if there are multiple steps down ("/x/y"),
        # we're out of luck.
        if link_text.startswith('/'):
            link_text = page_name + link_text
            #pywikibot.stdout('Changed link_text to {} on account of "/".'.format(link_text))

        # If this is a relative "../" link, find the parent page and set ourselves to that page,
        # then remove the relative portion of the link. Note that this is only performed once,
        # so if there are multiple steps back ("../../"), we're out of luck.
        if link_text.startswith('../'):
            last_slash = page_name.rfind('/')
            page_name2 = page_name[0:last_slash]
            #pywikibot.stdout('Changed page_name to {} on account of "../".'.format(page_name2))
            link_text = link_text[3:]
            #pywikibot.stdout('Changed link_text to {} on account of "../".'.format(link_text))
            # If this is now going to be a bare section link for the parent page, don't add a
            # slash, otherwise do because we are drilling down to another subpage
            if link_text.startswith('#'):
                link_text = page_name2 + link_text
            else:
                link_text = page_name2 + '/' + link_text
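        # (For example, a "/Diary#Intro" link found on page "Quotes" will be checked as
        # "Quotes/Diary#Intro", and a "../#History" link found on "Quotes/Diary" will be checked
        # as "Quotes#History".)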

        # If this is a bare section link, build URL based on this page
        if link_text.startswith('#'):
            iw_url = onigalore_url + page_name2
            iw_found += 1
            #pywikibot.stdout('Found link to this very page, {}.'.format(link_text))
            found_iw_match = True
            link_text = page_name2 + link_text

        # If there's no ":" in the link (before the section link, where a colon would just be
        # part of the text), then it's a Main namespace article, so construct the URL
        if not found_iw_match:
            if not re.search(":.*#", link_text):
                iw_url = onigalore_url + link_text
                iw_found += 1
                #pywikibot.stdout('Found link to OniGalore Main namespace page {}.'.format(link_text))
                found_iw_match = True

        # If there is a ":", match the prefix against the intrawiki prefixes on OniGalore
        if not found_iw_match:
            for prefix in intrawiki_prefixes:
                #pywikibot.stdout('Comparing link against prefix {}.'.format(prefix))
                if prefix + ":" in link_text:
                    iw_url = onigalore_url + link_text
                    _, post_ns = link_text.split(':', 1)
                    #pywikibot.stdout('Found link to OniGalore {0} namespace page {1}.'.format(prefix, post_ns))
                    iw_found += 1
                    found_iw_match = True
                    break

        # If we still haven't turned this match into a URL, something's gone wrong
        if not found_iw_match or iw_url == "":
            pywikibot.stdout('ERROR: Couldn\'t figure out link {}.'.format(link_text))
            errors_issued += 1
            continue

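        # fetch() is pywikibot.comms.http.fetch (imported above); the object it returns behaves
        # like a requests Response here, which is why .history, .status_code and .text are used below.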
        # Test the URL
        iw_url = iw_url.replace(' ', '_')
        #pywikibot.stdout('Reading page at {}...'.format(iw_url))
        response = fetch(iw_url)

        # Redirects are followed automatically by fetch() and treated as "200"s; the way we can
        # tell that a redirect occurred is by checking fetch's history
        if response.history:
            pywikibot.stdout('WARNING: Got redirection code ({0}) on URL "{1}".'.format(response.history[0].status_code, iw_url))
            warnings_issued += 1
        elif response.status_code != 200:
            pywikibot.stdout('WARNING: Got response code {0} on URL {1}.'.format(response.status_code, iw_url))
            warnings_issued += 1
        else:
            # Isolate section link
            pre_section, section_name = link_text.split('#', 1)
            #pywikibot.stdout('Searching for section link {} on page.'.format(section_name))

            # Convert slash character to the dot-notation hex encoding that MediaWiki uses
            section_name = section_name.replace('/', '.2F')

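            # NOTE: MediaWiki normally gives each section heading a <span> whose "id" is the
            # section's anchor (e.g. <span class="mw-headline" id="Some_section">), which is why
            # the code below looks for a matching <span> id rather than the heading text itself.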
            # Read linked page to see if it really has this anchor link
            soup = BeautifulSoup(response.text, 'html.parser')
            found_section = False
            for span_tag in soup.find_all('span'):
                span_name = span_tag.get('id', None)
                if span_name == section_name:
                    #pywikibot.stdout('Found section!')
                    found_section = True
                    break
            if not found_section:
                pywikibot.stdout('ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section))
                errors_issued += 1

def main(*args):
    cat_name = ''
    global page_name

    local_args = pywikibot.handle_args(args)
    genFactory = pagegenerators.GeneratorFactory()

    for arg in local_args:
        if arg.startswith('-cat:'):
            cat_name = arg[5:]
        elif arg.startswith('-page:'):
            page_name = arg[6:]

    site = pywikibot.Site()

    # This line of code enumerates the methods in the 'page' class
    #pywikibot.stdout(format(dir(page)))

    if cat_name != '':
        cat_obj = pywikibot.Category(site, cat_name)
        # recurse=True also visits the pages in subcategories of the given category
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        for page in pagegenerators.PreloadingGenerator(generator, 100):
            pywikibot.stdout('Checking page {0}'.format(page.title()))
            page_name = page.title()
            scan_for_iw_links(page.text)
    elif page_name != '':
        page = pywikibot.Page(site, page_name)
        pywikibot.stdout('Checking page {0}'.format(page.title()))
        scan_for_iw_links(page.text)

    global pages_checked
    global iw_found
    global advice_issued
    global warnings_issued
    global errors_issued

    page_str = "pages"
    if pages_checked == 1:
        page_str = "page"

    link_str = "links"
    if iw_found == 1:
        link_str = "link"

    pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
    pywikibot.stdout('While attempting to follow section links...')

    if advice_issued == 0:
        pywikibot.stdout(' No advice on potential problems was issued.')
    elif advice_issued == 1:
        pywikibot.stdout(' 1 piece of advice on a potential problem was issued.')
    else:
        pywikibot.stdout(' {} pieces of advice on potential problems were issued.'.format(advice_issued))

    warning_str = "warnings were"
    if warnings_issued == 1:
        warning_str = "warning was"
    pywikibot.stdout(' {0} {1} issued.'.format(warnings_issued, warning_str))

    error_str = "errors were"
    if errors_issued == 1:
        error_str = "error was"
    pywikibot.stdout(' {0} {1} encountered.'.format(errors_issued, error_str))

if __name__ == '__main__':
    main()