ViewVC Help
View File | Revision Log | View Changeset | Root Listing
root/Oni2/ValBot/Python/check_intrawiki_section_links.py
(Generate patch)

Comparing ValBot/Python/check_intrawiki_section_links.py (file contents):
Revision 1194 by iritscen, Mon Nov 18 04:00:08 2024 UTC vs.
Revision 1205 by iritscen, Mon Feb 9 03:24:16 2026 UTC

# Line 15 | Line 15 | import re
15  
16   from pywikibot.bot import QuitKeyboardInterrupt
17   from pywikibot import pagegenerators
18 from pywikibot.tools.formatter import color_format
18   from pywikibot.comms.http import fetch
19   from pywikibot.specialbots import UploadRobot
20   from bs4 import BeautifulSoup
# Line 78 | Line 77 | def find_section(page_text, page_name, p
77           search_terms.pop(0)
78        if search_terms[-1].startswith('-'):
79           search_terms.pop()
80 <      # Remake text directive with the terms separated by spaces as they should be in the page text
80 >      # Remake text directive with the terms separated by spaces as they should be in the page
81 >      # text
82        newSep = ' '
83        search_string = newSep.join(search_terms)
84        if debug: pywikibot.stdout('         Converted text fragment to string "{}".'.format(search_string))
# Line 113 | Line 113 | def find_section(page_text, page_name, p
113     elif debug and print_result:
114        pywikibot.stdout('   The section "{0}" was found on page "{1}".'.format(anchor_name, target_page_name_human))
115  
116 < # For a link that redirected us to another page, extract the name of the target page from
117 < # the target page's source
116 > # For a link that redirected us to another page, extract the name of the target page from the
117 > # target page's source
118   def find_canonical_link(page_text, page_name, page_slug):
119     # Extract link from this markup which contains name of redirected-to page:
120     # <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/>
# Line 128 | Line 128 | def find_canonical_link(page_text, page_
128     else:
129        canonical_name = canonical_name[:tag_end]
130        if len(canonical_name) > 100:
131 <         # Certain things can cause the trim to fail; report error and avoid slamming the
132 <         # output with massive page source from a failed trim
131 >         # Certain things can cause the trim to fail; report error and avoid slamming the output
132 >         # with massive page source from a failed trim
133           pywikibot.stdout('   ERROR: The link "{}" is a redirect to "{2}…" (string overflow).'.format(page_slug, canonical_name[:100]))
134           errors_issued = errors_issued + 1
135        else:
# Line 151 | Line 151 | def test_intrawiki_link(iw_url, page_nam
151     # One way we tell that a redirect occurred is by checking fetch's history, as it
152     # automatically follows redirects. This will catch formal redirects which come from pages
153     # such as Special:PermanentLink.
154 <   if response.history != []:
155 <      permalink1 = 'Special:PermanentLink/'.lower()
156 <      permalink2 = 'Special:Permalink/'.lower()
157 <      page_slug_lower = page_slug.lower()
158 <      if page_slug_lower.startswith(permalink1) or page_slug_lower.startswith(permalink2):
159 <         if debug:
160 <            possibly_print(page_name)
161 <            pywikibot.stdout('   Got redirection code "{0}" for permanent revision link "{1}". Checking the target page….'.format(response.history[0], page_slug))
162 <         find_canonical_link(response.text, page_name, page_slug)
163 <      else:
154 >   permalink1 = 'Special:PermanentLink/'.lower()
155 >   permalink2 = 'Special:Permalink/'.lower()
156 >   page_slug_lower = page_slug.lower()
157 >   if response.history != [] and (page_slug_lower.startswith(permalink1) or page_slug_lower.startswith(permalink2)):
158 >      if debug:
159           possibly_print(page_name)
160 <         pywikibot.stdout('   ERROR: Unrecognized type of redirection (code "{0}") for link "{1}". You should check the link manually.'.format(response.history[0], page_slug))
161 <         advice_issued += 1
162 <   elif response.status_code != 200:
163 <      possibly_print(page_name)
164 <      pywikibot.stdout('   ERROR: Got response code {0} on URL {1}. The target page may not exist.'.format(response.status_code, iw_url))
165 <      errors_issued += 1
166 <   # However the usual way that a redirect occurs is that MediaWiki redirects us sneakily
172 <   # using JavaScript, while returning code OK 200 as if the link was correct; this happens
173 <   # when a redirect page is accessed. We must detect these soft redirects by looking at the
174 <   # page source to find the redirect note inserted at the top of the page for the reader.
160 >         pywikibot.stdout('   Got redirection code "{0}" for permanent revision link "{1}". Checking the target page….'.format(response.history[0], page_slug))
161 >      find_canonical_link(response.text, page_name, page_slug)
162 >   # However the usual way that a redirect occurs is that a redirect page is visited and
163 >   # MediaWiki sends us to the new page using JavaScript while returning code 301. It formerly
164 >   # returned 200 as if the link were correct, so rather than looking for code 301 we
165 >   # detect these soft redirects by looking at the page source to find the redirect note that
166 >   # gets inserted at the top of the page for the reader.
167     elif 'Redirected from <a' in response.text:
168        if debug:
169           possibly_print(page_name)
170           pywikibot.stdout('   Got silently redirected by link "{}". Checking the target page….'.format(page_slug))
171        find_canonical_link(response.text, page_name, page_slug)
172 +   # This handles response codes other than 200 and 301 (301 is returned in the above case of a
173 +   # silent redirect)
174 +   elif response.status_code != 200:
175 +      possibly_print(page_name)
176 +      pywikibot.stdout('   ERROR: Got response code {0} on URL {1}. The target page may not exist.'.format(response.status_code, iw_url))
177 +      errors_issued += 1
178     else: # URL is OK, so proceed
179        find_section(response.text, page_name, page_slug, False)
180  

Diff Legend

Removed lines (shown without a marker)
+ Added lines
< Changed lines (old)
> Changed lines (new)