| 42 |
|
|
| 43 |
|
# Construct full URL for the particular wiki |
| 44 |
|
iw_url = interwiki_urls[cur] + page_title |
| 45 |
< |
pywikibot.output('Found {0} link {1}'.format(prefix, page_title)) |
| 45 |
> |
pywikibot.output('Found {0} link {1}.'.format(prefix, page_title)) |
| 46 |
|
iw_found = iw_found + 1 |
| 47 |
|
|
| 48 |
|
# Adjust URL if this is a foreign-language WP link |
| 54 |
|
iw_url = iw_url.replace(page_title[0:3], '') |
| 55 |
|
|
| 56 |
|
# Test the URL |
| 57 |
< |
#pywikibot.output('Testing URL {}'.format(iw_url)) |
| 57 |
> |
#pywikibot.output('Testing URL {}...'.format(iw_url)) |
| 58 |
|
response = fetch(iw_url) |
| 59 |
|
|
| 60 |
|
# Redirects are followed automatically by fetch() and treated as "200"s, so the |
| 61 |
|
# way we tell that a redirect occurred is by checking the history |
| 62 |
|
if response.history != []: |
| 63 |
< |
pywikibot.output('WARNING: Initially got {}.'.format(response.history)) |
| 63 |
> |
pywikibot.output('WARNING: Redirected from {}.'.format(response.history)) |
| 64 |
|
problems_found = problems_found + 1 |
| 65 |
|
elif response.status_code != 200: |
| 66 |
|
#pywikibot.output('WARNING: Got response code {}.'.format(response.status_code)) # commented out because fetch() already prints such a msg |
| 67 |
|
problems_found = problems_found + 1 |
| 68 |
+ |
elif '#' in page_title: |
| 69 |
+ |
# Isolate section link |
| 70 |
+ |
pywikibot.output('Detected section link on page {0}.'.format(page_title)) |
| 71 |
+ |
page_name, anchor_name = page_title.split('#') |
| 72 |
+ |
|
| 73 |
+ |
# Convert dot-notation hex entities to proper characters |
| 74 |
+ |
anchor_name = anchor_name.replace('.22', '"') |
| 75 |
+ |
anchor_name = anchor_name.replace('.27', '\'') |
| 76 |
+ |
anchor_name = anchor_name.replace('.28', '(') |
| 77 |
+ |
anchor_name = anchor_name.replace('.29', ')') |
| 78 |
+ |
|
| 79 |
+ |
# Read linked page to see if it really has this anchor link |
| 80 |
+ |
soup = BeautifulSoup(response.text, 'html.parser') |
| 81 |
+ |
found_section = False |
| 82 |
+ |
for tag in soup.findAll('a'): |
| 83 |
+ |
link = tag.get('href', None) |
| 84 |
+ |
if not link: |
| 85 |
+ |
#pywikibot.output('It is not a link.') |
| 86 |
+ |
continue |
| 87 |
+ |
#pywikibot.output('Got link {0}.'.format(link)) |
| 88 |
+ |
if not link.startswith('#'): |
| 89 |
+ |
continue |
| 90 |
+ |
|
| 91 |
+ |
if link == '#' + anchor_name: |
| 92 |
+ |
pywikibot.output('Found section link!') |
| 93 |
+ |
found_section = True |
| 94 |
+ |
break |
| 95 |
+ |
if found_section == False: |
| 96 |
+ |
pywikibot.output('Could not find section {0} on page {1}.'.format(anchor_name, page_name)) |
| 97 |
+ |
problems_found = problems_found + 1 |
| 98 |
|
cur = cur + 1 |
| 99 |
|
|
| 100 |
|
def main(*args): |