--- ValBot/Python/check_intrawiki_section_links.py	2024/01/23 03:53:05	1188
+++ ValBot/Python/check_intrawiki_section_links.py	2024/11/18 04:00:08	1194
@@ -35,7 +35,7 @@ chapter_names = ['CHAPTER_00_._COMBAT_TR
 # Tuple of patterns for recognizing wikilinks
 # Pattern 1: Detect "[[anything]]", "[[any:thing]]", "[[any|thing]]", "[[any:thi|ng]]"
 # Pattern 2: Detect "{{SectionLink|Page|Section name}}", "{{SectionLink||Section name}}"
-link_patterns = ("\[\[[^|\]]*(\||\])", "\{\{SectionLink\|[^|\}]*\|[^|\}]*\}\}")
+link_patterns = (r"\[\[[^|\]]*(\||\])", r"\{\{SectionLink\|[^|\}]*\|[^|\}]*\}\}")
 
 # Initialize globals
 debug = 0
@@ -58,21 +58,46 @@ def possibly_print(page_name):
 # Search a page for the section specified in the link
 def find_section(page_text, page_name, page_slug, print_result):
    global errors_issued
-
-   # Isolate section link
+   found_section = False
+
+   # Isolate section link or text fragment link
    target_page_name, anchor_name = page_slug.split('#', 1)
    target_page_name_human = target_page_name.replace('_', ' ')
-   if debug: pywikibot.stdout('   Searching for section link {} on page.'.format(anchor_name))
-
-   # Read linked page to see if it really has this anchor link
-   soup = BeautifulSoup(page_text, 'html.parser')
-   found_section = False
-   for span_tag in soup.findAll('span'):
-      span_name = span_tag.get('id', None)
-      if span_name == anchor_name:
-         if debug and not print_result: pywikibot.stdout('   Found section in a span!')
+
+   # First check if this is a text fragment directive, and look for it if so
+   if anchor_name.startswith(':~:text='):
+      if debug: pywikibot.stdout('   Found text fragment directive {} from URL {}.'.format(anchor_name, page_slug))
+      anchor_name = anchor_name[8:]
+      # We're only checking the first text directive, so strip add'l ones if present
+      addl_fragment = anchor_name.find('&text=')
+      if addl_fragment != -1:
+         anchor_name = anchor_name[:addl_fragment]
+      search_terms = anchor_name.split(',')
+      # Delete prefix and suffix terms because they aren't needed
+      if search_terms[0].endswith('-'):
+         search_terms.pop(0)
+      if search_terms[-1].startswith('-'):
+         search_terms.pop()
+      # Remake text directive with the terms separated by spaces as they should be in the page text
+      newSep = ' '
+      search_string = newSep.join(search_terms)
+      if debug: pywikibot.stdout('   Converted text fragment to string "{}".'.format(search_string))
+      if search_string in page_text:
          found_section = True
-         break
+         if debug and not print_result: pywikibot.stdout('   Found text fragment!')
+
+   # If we're still here, it's a section link; read linked page to see if it really has this
+   # anchor link
+   if found_section == False:
+      if debug: pywikibot.stdout('   Searching for section link {} on page.'.format(anchor_name))
+      soup = BeautifulSoup(page_text, 'html.parser')
+      # Search for a span with this ID
+      for span_tag in soup.findAll('span'):
+         span_name = span_tag.get('id', None)
+         if span_name == anchor_name:
+            if debug and not print_result: pywikibot.stdout('   Found section in a span!')
+            found_section = True
+            break
    if found_section == False:
       # Search for a div with this ID
       for span_tag in soup.findAll('div'):
@@ -127,7 +152,6 @@ def test_intrawiki_link(iw_url, page_nam
 # automatically follows redirects. This will catch formal redirects which come from pages
 # such as Special:PermanentLink.
 if response.history != []:
-   permalink1 = 'Special:PermanentLink/'.lower()
    permalink2 = 'Special:Permalink/'.lower()
    page_slug_lower = page_slug.lower()