| 15 |
|
|
| 16 |
|
from pywikibot.bot import QuitKeyboardInterrupt |
| 17 |
|
from pywikibot import pagegenerators |
| 18 |
– |
from pywikibot.tools.formatter import color_format |
| 18 |
|
from pywikibot.comms.http import fetch |
| 19 |
|
from pywikibot.specialbots import UploadRobot |
| 20 |
|
from bs4 import BeautifulSoup |
| 77 |
|
search_terms.pop(0) |
| 78 |
|
if search_terms[-1].startswith('-'): |
| 79 |
|
search_terms.pop() |
| 80 |
< |
# Remake text directive with the terms separated by spaces as they should be in the page text |
| 80 |
> |
# Remake text directive with the terms separated by spaces as they should be in the page |
| 81 |
> |
# text |
| 82 |
|
newSep = ' ' |
| 83 |
|
search_string = newSep.join(search_terms) |
| 84 |
|
if debug: pywikibot.stdout(' Converted text fragment to string "{}".'.format(search_string)) |
| 113 |
|
elif debug and print_result: |
| 114 |
|
pywikibot.stdout(' The section "{0}" was found on page "{1}".'.format(anchor_name, target_page_name_human)) |
| 115 |
|
|
| 116 |
< |
# For a link that redirected us to another page, extract the name of the target page from |
| 117 |
< |
# the target page's source |
| 116 |
> |
# For a link that redirected us to another page, extract the name of the target page from the |
| 117 |
> |
# target page's source |
| 118 |
|
def find_canonical_link(page_text, page_name, page_slug): |
| 119 |
|
# Extract link from this markup which contains name of redirected-to page: |
| 120 |
|
# <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/> |
| 128 |
|
else: |
| 129 |
|
canonical_name = canonical_name[:tag_end] |
| 130 |
|
if len(canonical_name) > 100: |
| 131 |
< |
# Certain things can cause the trim to fail; report error and avoid slamming the |
| 132 |
< |
# output with massive page source from a failed trim |
| 131 |
> |
# Certain things can cause the trim to fail; report error and avoid slamming the output |
| 132 |
> |
# with massive page source from a failed trim |
| 133 |
|
# Both replacement fields carry explicit indices: str.format() raises ValueError when an auto-numbered "{}" is mixed with a numbered field, and only two arguments (indices 0 and 1) are supplied
pywikibot.stdout(' ERROR: The link "{0}" is a redirect to "{1}…" (string overflow).'.format(page_slug, canonical_name[:100]))
| 134 |
|
errors_issued = errors_issued + 1 |
| 135 |
|
else: |
| 151 |
|
# One way we tell that a redirect occurred is by checking fetch's history, as it |
| 152 |
|
# automatically follows redirects. This will catch formal redirects which come from pages |
| 153 |
|
# such as Special:PermanentLink. |
| 154 |
< |
if response.history != []: |
| 155 |
< |
permalink1 = 'Special:PermanentLink/'.lower() |
| 156 |
< |
permalink2 = 'Special:Permalink/'.lower() |
| 157 |
< |
page_slug_lower = page_slug.lower() |
| 158 |
< |
if page_slug_lower.startswith(permalink1) or page_slug_lower.startswith(permalink2): |
| 159 |
< |
if debug: |
| 160 |
< |
possibly_print(page_name) |
| 161 |
< |
pywikibot.stdout(' Got redirection code "{0}" for permanent revision link "{1}". Checking the target page….'.format(response.history[0], page_slug)) |
| 162 |
< |
find_canonical_link(response.text, page_name, page_slug) |
| 163 |
< |
else: |
| 154 |
> |
permalink1 = 'Special:PermanentLink/'.lower() |
| 155 |
> |
permalink2 = 'Special:Permalink/'.lower() |
| 156 |
> |
page_slug_lower = page_slug.lower() |
| 157 |
> |
if response.history != [] and (page_slug_lower.startswith(permalink1) or page_slug_lower.startswith(permalink2)): |
| 158 |
> |
if debug: |
| 159 |
|
possibly_print(page_name) |
| 160 |
< |
pywikibot.stdout(' ERROR: Unrecognized type of redirection (code "{0}") for link "{1}". You should check the link manually.'.format(response.history[0], page_slug)) |
| 161 |
< |
advice_issued += 1 |
| 162 |
< |
elif response.status_code != 200: |
| 163 |
< |
possibly_print(page_name) |
| 164 |
< |
pywikibot.stdout(' ERROR: Got response code {0} on URL {1}. The target page may not exist.'.format(response.status_code, iw_url)) |
| 165 |
< |
errors_issued += 1 |
| 166 |
< |
# However the usual way that a redirect occurs is that MediaWiki redirects us sneakily |
| 172 |
< |
# using JavaScript, while returning code OK 200 as if the link was correct; this happens |
| 173 |
< |
# when a redirect page is accessed. We must detect these soft redirects by looking at the |
| 174 |
< |
# page source to find the redirect note inserted at the top of the page for the reader. |
| 160 |
> |
pywikibot.stdout(' Got redirection code "{0}" for permanent revision link "{1}". Checking the target page….'.format(response.history[0], page_slug)) |
| 161 |
> |
find_canonical_link(response.text, page_name, page_slug) |
| 162 |
> |
# However the usual way that a redirect occurs is that a redirect page is visited and |
| 163 |
> |
# MediaWiki sends us to the new page using JavaScript while returning code 301. Formerly it |
| 164 |
> |
# used to return 200 as if the link was correct, so rather than looking for code 301 we |
| 165 |
> |
# detect these soft redirects by looking at the page source to find the redirect note that |
| 166 |
> |
# gets inserted at the top of the page for the reader. |
| 167 |
|
elif 'Redirected from <a' in response.text: |
| 168 |
|
if debug: |
| 169 |
|
possibly_print(page_name) |
| 170 |
|
pywikibot.stdout(' Got silently redirected by link "{}". Checking the target page….'.format(page_slug)) |
| 171 |
|
find_canonical_link(response.text, page_name, page_slug) |
| 172 |
+ |
# This handles response codes other than 200 and 301 (301 is returned in the above case of a |
| 173 |
+ |
# silent redirect) |
| 174 |
+ |
elif response.status_code != 200: |
| 175 |
+ |
possibly_print(page_name) |
| 176 |
+ |
pywikibot.stdout(' ERROR: Got response code {0} on URL {1}. The target page may not exist.'.format(response.status_code, iw_url)) |
| 177 |
+ |
errors_issued += 1 |
| 178 |
|
else: # URL is OK, so proceed |
| 179 |
|
find_section(response.text, page_name, page_slug, False) |
| 180 |
|
|