[ViewVC] Diff of: Oni2/ValBot/Python/check_intrawiki_section_links.py

Comparing ValBot/Python/check_intrawiki_section_links.py (file contents):
Revision 1194 by iritscen, Mon Nov 18 04:00:08 2024 UTC vs.
Revision 1205 by iritscen, Mon Feb 9 03:24:16 2026 UTC

+from pywikibot.bot import QuitKeyboardInterrupt
+from pywikibot import pagegenerators
-–
+from pywikibot.tools.formatter import color_format
+from pywikibot.comms.http import fetch
+from pywikibot.specialbots import UploadRobot
+from bs4 import BeautifulSoup
+         search_terms.pop(0)
+      if search_terms[-1].startswith('-'):
+         search_terms.pop()
-<
+      # Remake text directive with the terms separated by spaces as they should be in the page text
->
+      # Remake text directive with the terms separated by spaces as they should be in the page
->
+      # text
+      newSep = ' '
+      search_string = newSep.join(search_terms)
+      if debug: pywikibot.stdout('         Converted text fragment to string "{}".'.format(search_string))
+   elif debug and print_result:
+      pywikibot.stdout('   The section "{0}" was found on page "{1}".'.format(anchor_name, target_page_name_human))
-<
+# For a link that redirected us to another page, extract the name of the target page from
-<
+# the target page's source
->
+# For a link that redirected us to another page, extract the name of the target page from the
->
+# target page's source
+def find_canonical_link(page_text, page_name, page_slug):
+   # Extract link from this markup which contains name of redirected-to page:
+   # <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/>
+   else:
+      canonical_name = canonical_name[:tag_end]
+      if len(canonical_name) > 100:
-<
+         # Certain things can cause the trim to fail; report error and avoid slamming the
-<
+         # output with massive page source from a failed trim
->
+         # Certain things can cause the trim to fail; report error and avoid slamming the output
->
+         # with massive page source from a failed trim
+         pywikibot.stdout('   ERROR: The link "{}" is a redirect to "{2}…" (string overflow).'.format(page_slug, canonical_name[:100]))
+         errors_issued = errors_issued + 1
+      else:
+   # One way we tell that a redirect occurred is by checking fetch's history, as it
+   # automatically follows redirects. This will catch formal redirects which come from pages
+   # such as Special:PermanentLink.
-<
+   if response.history != []:
-<
+      permalink1 = 'Special:PermanentLink/'.lower()
-<
+      permalink2 = 'Special:Permalink/'.lower()
-<
+      page_slug_lower = page_slug.lower()
-<
+      if page_slug_lower.startswith(permalink1) or page_slug_lower.startswith(permalink2):
-<
+         if debug:
-<
+            possibly_print(page_name)
-<
+            pywikibot.stdout('   Got redirection code "{0}" for permanent revision link "{1}". Checking the target page….'.format(response.history[0], page_slug))
-<
+         find_canonical_link(response.text, page_name, page_slug)
-<
+      else:
->
+   permalink1 = 'Special:PermanentLink/'.lower()
->
+   permalink2 = 'Special:Permalink/'.lower()
->
+   page_slug_lower = page_slug.lower()
->
+   if response.history != [] and (page_slug_lower.startswith(permalink1) or page_slug_lower.startswith(permalink2)):
->
+      if debug:
+         possibly_print(page_name)
-<
+         pywikibot.stdout('   ERROR: Unrecognized type of redirection (code "{0}") for link "{1}". You should check the link manually.'.format(response.history[0], page_slug))
-<
+         advice_issued += 1
-<
+   elif response.status_code != 200:
-<
+      possibly_print(page_name)
-<
+      pywikibot.stdout('   ERROR: Got response code {0} on URL {1}. The target page may not exist.'.format(response.status_code, iw_url))
-<
+      errors_issued += 1
-<
+   # However the usual way that a redirect occurs is that MediaWiki redirects us sneakily
-<
+   # using JavaScript, while returning code OK 200 as if the link was correct; this happens
-<
+   # when a redirect page is accessed. We must detect these soft redirects by looking at the
-<
+   # page source to find the redirect note inserted at the top of the page for the reader.
->
+         pywikibot.stdout('   Got redirection code "{0}" for permanent revision link "{1}". Checking the target page….'.format(response.history[0], page_slug))
->
+      find_canonical_link(response.text, page_name, page_slug)
->
+   # However the usual way that a redirect occurs is that a redirect page is visited and
->
+   # MediaWiki sends us to the new page using JavaScript while returning code 301. Formerly it
->
+   # used to return 200 as if the link was correct, so rather than looking for code 301 we
->
+   # detect these soft redirects by looking at the page source to find the redirect note that
->
+   # gets inserted at the top of the page for the reader.
+   elif 'Redirected from <a' in response.text:
+      if debug:
+         possibly_print(page_name)
+         pywikibot.stdout('   Got silently redirected by link "{}". Checking the target page….'.format(page_slug))
+      find_canonical_link(response.text, page_name, page_slug)
-+
+   # This handles response codes other than 200 and 301 (301 is returned in the above case of a
-+
+   # silent redirect)
-+
+   elif response.status_code != 200:
-+
+      possibly_print(page_name)
-+
+      pywikibot.stdout('   ERROR: Got response code {0} on URL {1}. The target page may not exist.'.format(response.status_code, iw_url))
-+
+      errors_issued += 1
+   else: # URL is OK, so proceed
+      find_section(response.text, page_name, page_slug, False)

Diff Legend

-–  the line that follows was removed
-+  the line that follows was added
-<  the line that follows was changed (old version)
->  the line that follows was changed (new version)

Comparing ValBot/Python/check_intrawiki_section_links.py (file contents): Revision 1194 by iritscen, Mon Nov 18 04:00:08 2024 UTC vs. Revision 1205 by iritscen, Mon Feb 9 03:24:16 2026 UTC

Diff Legend

Comparing ValBot/Python/check_intrawiki_section_links.py (file contents):
Revision 1194 by iritscen, Mon Nov 18 04:00:08 2024 UTC vs.
Revision 1205 by iritscen, Mon Feb 9 03:24:16 2026 UTC