  1 + # Check Intrawiki Section Links
  2 + # by iritscen@yahoo.com
  3 + # Looks at each wikilink on a page (or in all the pages in a category) for a section link ('#'),
  4 + # and loads the linked page and verifies that the named section actually exists. The output will
  5 + # use the keywords ADVICE, WARNING or ERROR depending on the nature of issue that it encounters.
  6 + # Recommended viewing width:
  7 + # |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- --|
  8 +
  9   import os
 10
 11   from urllib.parse import urljoin
 31
 32   pages_checked = 0
 33   iw_found = 0
 34 < problems_found = 0
 34 > advice_issued = 0
 35 > warnings_issued = 0
 36 > errors_issued = 0
 37   page_name = ''
 38
 39   # Searches the given page text for intrawiki links with section links in them
 40   def scan_for_iw_links(page_text):
 41     global pages_checked
 42     global iw_found
 43 <   global problems_found
 43 >   global advice_issued
 44 >   global warnings_issued
 45 >   global errors_issued
 46     global page_name
 47     pages_checked = pages_checked + 1
 48
 62
 63       # Sometimes we used a space char. instead of a '_', so fix that before querying
 64       link_text = link_text.replace(' ', '_')
 65 <     #pywikibot.output('Found link {0}.'.format(link_text))
 65 >     #pywikibot.stdout('Found link {0}.'.format(link_text))
 66
 67       # If this link doesn't have a section link in it, then we don't care about it, as
 68       # MediaWiki takes care of checking basic intrawiki links
 69       if not '#' in link_text:
 70 <       #pywikibot.output('Link doesn\'t have a section anchor in it. Skipping.')
 70 >       #pywikibot.stdout('Link doesn\'t have a section anchor in it. Skipping.')
 71         continue
 72
 73       # If there is a '{' in the link, then probably it's a link built on transcluded text
 74       # like "Quotes/Diary#{{C3}}", which we cannot expand and work with, so skip it
 75       if '{' in link_text:
 76 <       pywikibot.output('ADVICE: Link {} seems to use transclusion, so it can\'t be verified automatically. You should check it manually.'.format(link_text))
 76 >       pywikibot.stdout('ADVICE: Link {} seems to use transclusion, so it can\'t be verified automatically. You should check it manually.'.format(link_text))
 77 >       advice_issued = advice_issued + 1
 78         continue
 79 <
 80 <     # If this is a relative "../" link, find the parent page and set ourselves to that
 81 <     # page, then remove the relative portion of the link. Note that this is only performed
 82 <     # once, so if there's multiple steps back ("../../"), we're out of luck.
 79 >
 80 >     # If this is a relative "/" link, use the current page as the basis for the URL. Note
 81 >     # that only a leading slash is looked for, so if there's multiple steps down ("/x/y"),
 82 >     # we're out of luck.
 83 >     if link_text.startswith('/'):
 84 >       link_text = page_name + link_text
 85 >       pywikibot.stdout('Changed link_text to {} on account of "/".'.format(link_text))
 86 >
 87 >     # If this is a relative "../" link, find the parent page and set ourselves to that page,
 88 >     # then remove the relative portion of the link. Note that this is only performed once,
 89 >     # so if there's multiple steps back ("../../"), we're out of luck.
 90       if link_text.startswith('../'):
 91         last_slash = page_name.rfind('/')
 92         page_name2 = page_name[0:last_slash]
 93 <       #pywikibot.output('Changed page_name to {} on account of "../".'.format(page_name2))
 93 >       #pywikibot.stdout('Changed page_name to {} on account of "../".'.format(page_name2))
 94         link_text = link_text[3:len(link_text)]
 95 <       #pywikibot.output('Changed link_text to {} on account of "../".'.format(link_text))
 96 <       # If this is now going to be a bare section link for the parent page, don't add
 97 <       # a slash, otherwise do because we are drilling down to another subpage
 95 >       #pywikibot.stdout('Changed link_text to {} on account of "../".'.format(link_text))
 96 >       # If this is now going to be a bare section link for the parent page, don't add a
 97 >       # slash, otherwise do because we are drilling down to another subpage
 98         if link_text.startswith('#'):
 99           link_text = page_name2 + link_text
100         else:
104       if link_text.startswith('#'):
105         iw_url = onigalore_url + page_name2
106         iw_found = iw_found + 1
107 <       #pywikibot.output('Found link to this very page, {}.'.format(link_text))
107 >       #pywikibot.stdout('Found link to this very page, {}.'.format(link_text))
108         found_iw_match = True
109         link_text = page_name2 + link_text
110
111       # If there's no ":" in the link (before the section link, where a colon would just be
112       # part of the text) then it's a Main namespace article, so construct URL
 93 -     #if not ':' in link_text:
113       if found_iw_match == False:
114         if not re.search(":.*#", link_text):
115           iw_url = onigalore_url + link_text
116           iw_found = iw_found + 1
117 <         #pywikibot.output('Found link to OniGalore Main namespace page {}.'.format(link_text))
117 >         #pywikibot.stdout('Found link to OniGalore Main namespace page {}.'.format(link_text))
118           found_iw_match = True
119
120       # If there is a ":", match the prefix against the intrawiki prefixes on OniGalore
121       if found_iw_match == False:
122         for prefix in intrawiki_prefixes:
123 <         #pywikibot.output('Comparing link against prefix {}.'.format(prefix))
123 >         #pywikibot.stdout('Comparing link against prefix {}.'.format(prefix))
124           if prefix + ":" in link_text:
125             iw_url = onigalore_url + link_text
126             _, post_ns = link_text.split(':', 1)
127 <           #pywikibot.output('Found link to OniGalore {0} namespace page {1}.'.format(prefix, post_ns))
127 >           #pywikibot.stdout('Found link to OniGalore {0} namespace page {1}.'.format(prefix, post_ns))
128             iw_found = iw_found + 1
129             found_iw_match = True
130             break
135       if found_iw_match == False:
136         for prefix in interwiki_prefixes:
137           if prefix + ":" in link_text:
138 <           #pywikibot.output('Skipping link {} because it is an interwiki link.'.format(link_text))
138 >           #pywikibot.stdout('Skipping link {} because it is an interwiki link.'.format(link_text))
139             is_interwiki = True
140             break
141       if is_interwiki:
143
144       # If we still haven't turned this match into a URL, something's gone wrong
145       if (found_iw_match == False) or (iw_url == ""):
146 <       pywikibot.output('ERROR: Couldn\'t figure out link {}. Aborting script.'.format(link_text))
146 >       pywikibot.stdout('ERROR: Couldn\'t figure out link {}. Aborting script.'.format(link_text))
147         quit()
148
149       # Test the URL
150       iw_url = iw_url.replace(' ', '_')
151 <     #pywikibot.output('Reading page at {}...'.format(iw_url))
151 >     #pywikibot.stdout('Reading page at {}...'.format(iw_url))
152       response = fetch(iw_url)
153
154 <     # Redirects are followed automatically by fetch() and treated as "200"s, so the
155 <     # way we tell that a redirect occurred is by checking the history
154 >     # Redirects are followed automatically by fetch() and treated as "200"s; the way we can
155 >     # tell that a redirect occurred is by checking fetch's history
156       if response.history != []:
157 <       pywikibot.output('WARNING: Redirected from {}.'.format(response.history))
158 <       problems_found = problems_found + 1
157 >       pywikibot.stdout('WARNING: Got redirection code ({0}) on URL "{1}".'.format(response.history[0], iw_url))
158 >       warnings_issued = warnings_issued + 1
159       elif response.status_code != 200:
160 <       #pywikibot.output('WARNING: Got response code {}.'.format(response.status_code)) # commented out because fetch() already prints such a msg
161 <       problems_found = problems_found + 1
160 >       pywikibot.stdout('WARNING: Got response code {0} on URL {1}.'.format(response.status_code, iw_url))
161 >       warnings_issued = warnings_issued + 1
162       else:
163         # Isolate section link
164         pre_section, section_name = link_text.split('#', 1)
165 <       #pywikibot.output('Searching for section link {} on page.'.format(section_name))
165 >       #pywikibot.stdout('Searching for section link {} on page.'.format(section_name))
166
167         # Convert slash character to the dot-notation hex encoding that MediaWiki uses
168         section_name = section_name.replace('/', '.2F')
173         for span_tag in soup.findAll('span'):
174           span_name = span_tag.get('id', None)
175           if span_name == section_name:
176 <           #pywikibot.output('Found section!')
176 >           #pywikibot.stdout('Found section!')
177             found_section = True
178             break
179         if found_section == False:
180 <         pywikibot.output('ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section))
181 <         problems_found = problems_found + 1
180 >         pywikibot.stdout('ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section))
181 >         errors_issued = errors_issued + 1
182
183   def main(*args):
184     cat_name = ''
212
213     global pages_checked
214     global iw_found
215 <   global problems_found
216 <   pywikibot.stdout('Checked {0} page(s) and found {1} intrawiki link(s) with {2} section link problem(s).'.format(pages_checked, iw_found, problems_found))
215 >   global advice_issued
216 >   global warnings_issued
217 >   global errors_issued
218 >
219 >   page_str = "pages"
220 >   if pages_checked == 1:
221 >     page_str = "page"
222 >
223 >   link_str = "links"
224 >   if iw_found == 1:
225 >     link_str = "link"
226 >
227 >   pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
228 >   pywikibot.stdout('While attempting to follow section links...')
229 >
230 >   if advice_issued == 0:
231 >     pywikibot.stdout(' No advice on potential problems was issued.')
232 >   elif advice_issued == 1:
233 >     pywikibot.stdout(' 1 piece of advice on a potential problem was issued.')
234 >   else:
235 >     pywikibot.stdout(' {} pieces of advice on potential problems were issued.'.format(advice_issued))
236 >
237 >   warning_str = "warnings were"
238 >   if warnings_issued == 1:
239 >     warning_str = "warning was"
240 >   pywikibot.stdout(' {0} {1} issued.'.format(warnings_issued, warning_str))
241 >
242 >   error_str = "errors were"
243 >   if errors_issued == 1:
244 >     error_str = "error was"
245 >   pywikibot.stdout(' {0} {1} encountered.'.format(errors_issued, error_str))
246
247   if __name__ == '__main__':
248     main()
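
For illustration only (not part of the diff above): assuming a hypothetical run that checked one page, found five section links, and ended with no advice, one warning, and two errors, the new summary code should print something like:

Checked 1 page and found 5 intrawiki links.
While attempting to follow section links...
 No advice on potential problems was issued.
 1 warning was issued.
 2 errors were encountered.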