ValBot/Python/check_intrawiki_section_links.py

import os

from urllib.parse import urljoin

import pywikibot
import re

from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot import pagegenerators
from pywikibot.tools.formatter import color_format
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot
from bs4 import BeautifulSoup

# Namespace names that exist on OniGalore; a link prefixed with one of these
# (followed by ':') points at a page on our own wiki
intrawiki_prefixes = (
    'Image',
    'Special',
    'Talk',
    'User',
    'User_talk',
    'OniGalore',
    'OniGalore_talk',
    'File',
    'File_talk',
    'MediaWiki',
    'MediaWiki_talk',
    'Template',
    'Template_talk',
    'Help',
    'Help_talk',
    'Category',
    'Category_talk',
    'BSL',
    'BSL_talk',
    'OBD',
    'OBD_talk',
    'AE',
    'AE_talk',
    'Oni2',
    'Oni2_talk',
    'XML',
    'XML_talk',
)

# Base URL that page titles are appended to in order to reach a page on our wiki
onigalore_url = 'https://wiki.oni2.net/'

# Prefixes of links that lead to other wikis/sites; such links are not checked
interwiki_prefixes = (
    'acronym',
    'cache',
    'commons',
    'dictionary',
    'google',
    'metawikimedia',
    'mw',
    'wikibooks',
    'wikidata',
    'wikimedia',
    'wikinews',
    'wikipedia',
    'wikiquote',
    'wikisource',
    'wikispecies',
    'wikiversity',
    'wikivoyage',
    'wikt',
    'wiktionary',
    'wp',
)

# Running totals, updated by scan_for_iw_links() and reported by main()
pages_checked = 0
iw_found = 0
problems_found = 0

# Title of the page currently being scanned; set by main()
page_name = ''

# Searches the given page text for intrawiki links with section links in them
def scan_for_iw_links(page_text):
    """Verify the section anchors of the intrawiki links found in page_text.

    Every "[[...]]" link that contains a '#' is turned into a URL on our wiki
    (relative links and bare "#Section" links are resolved against the
    module-level page_name), the URL is fetched, and the returned HTML is
    searched for a <span> whose id equals the section name. Interwiki links
    and links built from transcluded text are skipped.

    Side effects: increments the module-level counters pages_checked (once per
    call), iw_found (once per link that is checked) and problems_found (once
    per redirect, non-200 response, missing section or unresolvable "../").

    Args:
        page_text: The wikitext of the page named by the global page_name.

    Raises:
        SystemExit: If a link has a "prefix:" that is neither a known
            namespace nor a known interwiki prefix.
    """
    global pages_checked
    global iw_found
    global problems_found
    pages_checked += 1

    # Isolate strings of pattern "[[anything]]", "[[any:thing]]", "[[any|thing]]" or
    # "[[any:thi|ng]]". Raw string so that the backslashes reach the regex engine
    # without triggering Python's invalid-escape-sequence warning.
    link_pattern = re.compile(r"\[\[[^|\]]*(\||\])")

    # A prefix only counts when it opens the link; looking for it anywhere in the text
    # would misread "wikipedia:Help:Foo#Bar" or a section name that contains "Talk:"
    intrawiki_starts = tuple(prefix + ':' for prefix in intrawiki_prefixes)
    interwiki_starts = tuple(prefix + ':' for prefix in interwiki_prefixes)

    for match in link_pattern.finditer(page_text):
        # Page that a bare or relative section link refers to
        target_page = page_name

        # Cut out the matched text from the page, and in the process remove the "[[" from
        # the front and the "|" or "]" from the end
        link_text = page_text[match.start() + 2:match.end() - 1]

        # Sometimes we used a space char. instead of a '_', so fix that before querying
        link_text = link_text.replace(' ', '_')

        # If this link doesn't have a section link in it, then we don't care about it, as
        # MediaWiki takes care of checking basic intrawiki links
        if '#' not in link_text:
            continue

        # If there is a '{' in the link, then probably it's a link built on transcluded
        # text like "Quotes/Diary#{{C3}}", which we cannot expand and work with, so skip it
        if '{' in link_text:
            pywikibot.output('ADVICE: Link {} seems to use transclusion, so it can\'t be verified automatically. You should check it manually.'.format(link_text))
            continue

        # A leading ':' (as in "[[:Category:Foo#Bar]]") only changes how the link is
        # rendered; it is not part of the page title, so drop it before classifying
        link_text = link_text.lstrip(':')

        # If this is a relative "../" link, find the parent page and set ourselves to that
        # page, then remove the relative portion of the link. Note that this is only
        # performed once, so if there's multiple steps back ("../../"), we're out of luck.
        if link_text.startswith('../'):
            last_slash = page_name.rfind('/')
            if last_slash == -1:
                # No parent page exists; slicing with -1 would silently chop the last
                # character off the title and test a bogus page
                pywikibot.output('ERROR: Link {0} points to a parent page, but {1} is not a subpage.'.format(link_text, page_name))
                problems_found += 1
                continue
            target_page = page_name[:last_slash]
            link_text = link_text[3:]
            # If this is now going to be a bare section link for the parent page, don't
            # add a slash, otherwise do because we are drilling down to another subpage
            if link_text.startswith('#'):
                link_text = target_page + link_text
            else:
                link_text = target_page + '/' + link_text

        if link_text.startswith('#'):
            # Bare section link: build URL based on this page
            iw_url = onigalore_url + target_page
            link_text = target_page + link_text
        elif not re.search(":.*#", link_text):
            # No ":" before the section link (after the '#' a colon would just be part
            # of the section name), so it's a Main namespace article
            iw_url = onigalore_url + link_text
        elif link_text.startswith(intrawiki_starts):
            # Page in one of OniGalore's other namespaces
            iw_url = onigalore_url + link_text
        elif link_text.startswith(interwiki_starts):
            # Interwiki link; these are not ours to check
            continue
        else:
            # We couldn't turn this match into a URL, so something's gone wrong. Exit
            # with a non-zero status so that callers can tell the run was aborted.
            pywikibot.output('ERROR: Couldn\'t figure out link {}. Aborting script.'.format(link_text))
            raise SystemExit(1)
        iw_found += 1

        # Test the URL
        iw_url = iw_url.replace(' ', '_')
        response = fetch(iw_url)

        # Redirects are followed automatically by fetch() and treated as "200"s, so the
        # way we tell that a redirect occurred is by checking the history
        if response.history:
            pywikibot.output('WARNING: Redirected from {}.'.format(response.history))
            problems_found += 1
        elif response.status_code != 200:
            # No message here because fetch() already prints one for bad response codes
            problems_found += 1
        else:
            # Isolate section link
            pre_section, section_name = link_text.split('#', 1)

            # Convert slash character to the dot-notation hex encoding that MediaWiki uses
            # NOTE(review): only '/' is converted; any other character that the wiki
            # encodes in anchor ids is compared verbatim -- confirm this is sufficient
            section_name = section_name.replace('/', '.2F')

            # Read linked page to see if it really has this anchor link
            soup = BeautifulSoup(response.text, 'html.parser')
            found_section = any(span_tag.get('id') == section_name
                                for span_tag in soup.find_all('span'))
            if not found_section:
                pywikibot.output('ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section))
                problems_found += 1

def main(*args):
    """Check the section links of one page or of every page in a category.

    Recognized arguments (after pywikibot's own global arguments are consumed
    by handle_args()):
        -cat:<name>    check all pages in the category, recursing into
                       subcategories
        -page:<title>  check a single page

    If both are given, the category is checked. Sets the module-level
    page_name to the page being scanned before each scan_for_iw_links() call,
    and prints a summary of the module-level counters at the end.

    Args:
        *args: Command-line style arguments, passed to pywikibot.handle_args().
    """
    global page_name

    cat_name = ''
    for arg in pywikibot.handle_args(args):
        if arg.startswith('-cat:'):
            cat_name = arg[5:]
        elif arg.startswith('-page:'):
            page_name = arg[6:]

    site = pywikibot.Site()

    if cat_name:
        cat_obj = pywikibot.Category(site, cat_name)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        for page in pagegenerators.PreloadingGenerator(generator, 100):
            pywikibot.stdout('Checking page {0}'.format(page.title()))
            # scan_for_iw_links() resolves relative links against this global
            page_name = page.title()
            scan_for_iw_links(page.text)
    elif page_name:
        page = pywikibot.Page(site, page_name)
        pywikibot.stdout('Checking page {0}'.format(page.title()))
        scan_for_iw_links(page.text)
    else:
        # Without this hint, a run with no target just reports zero pages
        pywikibot.stdout('Nothing to check: specify "-page:<title>" or "-cat:<name>".')

    pywikibot.stdout('Checked {0} page(s) and found {1} intrawiki link(s) with {2} section link problem(s).'.format(pages_checked, iw_found, problems_found))

if __name__ == '__main__':
    main()
Revision:	1169
Committed:	Mon Feb 21 23:59:20 2022 UTC (3 years, 7 months ago) by iritscen
Content type:	text/x-python
File size:	9244 byte(s)
Log Message:	ValBot: Reorganized files. Updated docs with more helpful information.
#	Content
1	import os
2
3	from urllib.parse import urljoin
4
5	import pywikibot
6	import re
7
8	from pywikibot.bot import QuitKeyboardInterrupt
9	from pywikibot import pagegenerators
10	from pywikibot.tools.formatter import color_format
11	from pywikibot.comms.http import fetch
12	from pywikibot.specialbots import UploadRobot
13	from bs4 import BeautifulSoup
14
15	# Array of OniGalore's namespaces
16	intrawiki_prefixes = ('Image', 'Special', 'Talk', 'User', 'User_talk', 'OniGalore', 'OniGalore_talk', 'File', 'File_talk', 'MediaWiki', 'MediaWiki_talk', 'Template', 'Template_talk', 'Help', 'Help_talk', 'Category', 'Category_talk', 'BSL', 'BSL_talk', 'OBD', 'OBD_talk', 'AE', 'AE_talk', 'Oni2', 'Oni2_talk', 'XML', 'XML_talk')
17
18	# URL for main namespace of our wiki
19	onigalore_url = 'https://wiki.oni2.net/'
20
21	# Interwiki prefixes, for ruling out these links
22	interwiki_prefixes = ('acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia', 'mw', 'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia', 'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage', 'wikt', 'wiktionary', 'wp')
23
24	pages_checked = 0
25	iw_found = 0
26	problems_found = 0
27	page_name = ''
28
29	# Searches the given page text for intrawiki links with section links in them
30	def scan_for_iw_links(page_text):
31	global pages_checked
32	global iw_found
33	global problems_found
34	global page_name
35	pages_checked = pages_checked + 1
36
37	# Isolate strings of pattern "[[anything]]", "[[any:thing]]", "[[any\|thing]]" or
38	# "[[any:thi\|ng]]"
39	iw_link = "\[\[[^\|\]]*(\\|\|\])"
40	for match in re.finditer(iw_link, page_text):
41	found_iw_match = False
42	iw_url = ""
43	page_name2 = page_name
44
45	# Cut out the matched text from the page, and in the process remove the "[[" from the
46	# front and the "\|" or "]" from the end
47	s = match.start() + 2
48	e = match.end() - 1
49	link_text = page_text[s:e]
50
51	# Sometimes we used a space char. instead of a '_', so fix that before querying
52	link_text = link_text.replace(' ', '_')
53	#pywikibot.output('Found link {0}.'.format(link_text))
54
55	# If this link doesn't have a section link in it, then we don't care about it, as
56	# MediaWiki takes care of checking basic intrawiki links
57	if not '#' in link_text:
58	#pywikibot.output('Link doesn\'t have a section anchor in it. Skipping.')
59	continue
60
61	# If there is a '{' in the link, then probably it's a link built on transcluded text
62	# like "Quotes/Diary#{{C3}}", which we cannot expand and work with, so skip it
63	if '{' in link_text:
64	pywikibot.output('ADVICE: Link {} seems to use transclusion, so it can\'t be verified automatically. You should check it manually.'.format(link_text))
65	continue
66
67	# If this is a relative "../" link, find the parent page and set ourselves to that
68	# page, then remove the relative portion of the link. Note that this is only performed
69	# once, so if there's multiple steps back ("../../"), we're out of luck.
70	if link_text.startswith('../'):
71	last_slash = page_name.rfind('/')
72	page_name2 = page_name[0:last_slash]
73	#pywikibot.output('Changed page_name to {} on account of "../".'.format(page_name2))
74	link_text = link_text[3:len(link_text)]
75	#pywikibot.output('Changed link_text to {} on account of "../".'.format(link_text))
76	# If this is now going to be a bare section link for the parent page, don't add
77	# a slash, otherwise do because we are drilling down to another subpage
78	if link_text.startswith('#'):
79	link_text = page_name2 + link_text
80	else:
81	link_text = page_name2 + '/' + link_text
82
83	# If this is a bare section link, build URL based on this page
84	if link_text.startswith('#'):
85	iw_url = onigalore_url + page_name2
86	iw_found = iw_found + 1
87	#pywikibot.output('Found link to this very page, {}.'.format(link_text))
88	found_iw_match = True
89	link_text = page_name2 + link_text
90
91	# If there's no ":" in the link (before the section link, where a colon would just be
92	# part of the text) then it's a Main namespace article, so construct URL
93	#if not ':' in link_text:
94	if found_iw_match == False:
95	if not re.search(":.*#", link_text):
96	iw_url = onigalore_url + link_text
97	iw_found = iw_found + 1
98	#pywikibot.output('Found link to OniGalore Main namespace page {}.'.format(link_text))
99	found_iw_match = True
100
101	# If there is a ":", match the prefix against the intrawiki prefixes on OniGalore
102	if found_iw_match == False:
103	for prefix in intrawiki_prefixes:
104	#pywikibot.output('Comparing link against prefix {}.'.format(prefix))
105	if prefix + ":" in link_text:
106	iw_url = onigalore_url + link_text
107	_, post_ns = link_text.split(':', 1)
108	#pywikibot.output('Found link to OniGalore {0} namespace page {1}.'.format(prefix, post_ns))
109	iw_found = iw_found + 1
110	found_iw_match = True
111	break
112
113	# If we didn't match the prefix against any intrawiki prefixes, see if it matches
114	# against an interwiki prefix; if so, this link can be ignored
115	is_interwiki = False
116	if found_iw_match == False:
117	for prefix in interwiki_prefixes:
118	if prefix + ":" in link_text:
119	#pywikibot.output('Skipping link {} because it is an interwiki link.'.format(link_text))
120	is_interwiki = True
121	break
122	if is_interwiki:
123	continue
124
125	# If we still haven't turned this match into a URL, something's gone wrong
126	if (found_iw_match == False) or (iw_url == ""):
127	pywikibot.output('ERROR: Couldn\'t figure out link {}. Aborting script.'.format(link_text))
128	quit()
129
130	# Test the URL
131	iw_url = iw_url.replace(' ', '_')
132	#pywikibot.output('Reading page at {}...'.format(iw_url))
133	response = fetch(iw_url)
134
135	# Redirects are followed automatically by fetch() and treated as "200"s, so the
136	# way we tell that a redirect occurred is by checking the history
137	if response.history != []:
138	pywikibot.output('WARNING: Redirected from {}.'.format(response.history))
139	problems_found = problems_found + 1
140	elif response.status_code != 200:
141	#pywikibot.output('WARNING: Got response code {}.'.format(response.status_code)) # commented out because fetch() already prints such a msg
142	problems_found = problems_found + 1
143	else:
144	# Isolate section link
145	pre_section, section_name = link_text.split('#', 1)
146	#pywikibot.output('Searching for section link {} on page.'.format(section_name))
147
148	# Convert slash character to the dot-notation hex encoding that MediaWiki uses
149	section_name = section_name.replace('/', '.2F')
150
151	# Read linked page to see if it really has this anchor link
152	soup = BeautifulSoup(response.text, 'html.parser')
153	found_section = False
154	for span_tag in soup.findAll('span'):
155	span_name = span_tag.get('id', None)
156	if span_name == section_name:
157	#pywikibot.output('Found section!')
158	found_section = True
159	break
160	if found_section == False:
161	pywikibot.output('ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section))
162	problems_found = problems_found + 1
163
164	def main(*args):
165	cat_name = ''
166	global page_name
167
168	local_args = pywikibot.handle_args(args)
169	genFactory = pagegenerators.GeneratorFactory()
170
171	for arg in local_args:
172	if arg.startswith('-cat:'):
173	cat_name = arg[5:]
174	elif arg.startswith('-page:'):
175	page_name = arg[6:]
176
177	site = pywikibot.Site()
178
179	# This line of code enumerates the methods in the 'page' class
180	#pywikibot.stdout(format(dir(page)))
181
182	if cat_name != '':
183	cat_obj = pywikibot.Category(site, cat_name)
184	generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
185	for page in pagegenerators.PreloadingGenerator(generator, 100):
186	pywikibot.stdout('Checking page {0}'.format(page.title()))
187	page_name = page.title()
188	scan_for_iw_links(page.text)
189	elif page_name != '':
190	page = pywikibot.Page(site, page_name)
191	pywikibot.stdout('Checking page {0}'.format(page.title()))
192	scan_for_iw_links(page.text)
193
194	global pages_checked
195	global iw_found
196	global problems_found
197	pywikibot.stdout('Checked {0} page(s) and found {1} intrawiki link(s) with {2} section link problem(s).'.format(pages_checked, iw_found, problems_found))
198
199	if __name__ == '__main__':
200	main()