--- ValBot/Python/check_interwiki_links.py 2023/08/15 02:03:16 1185 +++ ValBot/Python/check_interwiki_links.py 2024/09/16 23:08:26 1192 @@ -62,15 +62,33 @@ def find_section(page_text, page_name, p # Read linked page to see if it really has this anchor link soup = BeautifulSoup(page_text, 'html.parser') found_section = False - for span_tag in soup.findAll('span'): # search for span with ID matching the section name - span_name = span_tag.get('id', None) - if span_name == anchor_name: + for the_tag in soup.findAll('span'): # search for span with ID matching the section name + tag_name = the_tag.get('id', None) + if tag_name == anchor_name: found_section = True break if found_section == False: - for span_tag in soup.findAll('div'): # search for div with ID matching the section name - span_name = span_tag.get('id', None) - if span_name == anchor_name: + for the_tag in soup.findAll('div'): # search for div with ID matching the section name + tag_name = the_tag.get('id', None) + if tag_name == anchor_name: + found_section = True + break + if found_section == False: + for the_tag in soup.findAll('h2'): # search for h2 with ID matching the section name + tag_name = the_tag.get('id', None) + if tag_name == anchor_name: + found_section = True + break + if found_section == False: + for the_tag in soup.findAll('h3'): # search for h3 with ID matching the section name + tag_name = the_tag.get('id', None) + if tag_name == anchor_name: + found_section = True + break + if found_section == False: + for the_tag in soup.findAll('h4'): # search for h4 with ID matching the section name + tag_name = the_tag.get('id', None) + if tag_name == anchor_name: found_section = True break if found_section == False: @@ -161,7 +179,7 @@ def scan_for_interwiki_links(page_text, for prefix in interwiki_prefixes: # Isolate strings that start with "[[prefix:" and end with "|" or "]" - iw_link = "\[\[" + prefix + ":[^|\]]*(\||\])" + iw_link = r"\[\[" + prefix + r":[^|\]]*(\||\])" for match in re.finditer(iw_link, page_text): # Extract just the page title from this regex match s = match.start() + 2 + len(prefix) + 1