| 58 |   # Search a page for the section specified in the link
| 59 |   def find_section(page_text, page_name, page_slug, print_result):
| 60 |       global errors_issued
| 61 | <
| 62 | <     # Isolate section link
| 61 | >     found_section = False
| 62 | >
| 63 | >     # Isolate section link or text fragment link
| 64 |       target_page_name, anchor_name = page_slug.split('#', 1)
| 65 |       target_page_name_human = target_page_name.replace('_', ' ')
| 66 | <     if debug: pywikibot.stdout(' Searching for section link {} on page.'.format(anchor_name))
| 67 | <
| 68 | <     # Read linked page to see if it really has this anchor link
| 69 | <     soup = BeautifulSoup(page_text, 'html.parser')
| 70 | <     found_section = False
| 71 | <     for span_tag in soup.findAll('span'):
| 72 | <         span_name = span_tag.get('id', None)
| 73 | <         if span_name == anchor_name:
| 74 | <             if debug and not print_result: pywikibot.stdout(' Found section in a span!')
| 66 | >
| 67 | >     # First check if this is a text fragment directive, and look for it if so
| 68 | >     if anchor_name.startswith(':~:text='):
| 69 | >         if debug: pywikibot.stdout(' Found text fragment directive {} from URL {}.'.format(anchor_name, page_slug))
| 70 | >         anchor_name = anchor_name[8:]
| 71 | >         # We're only checking the first text directive, so strip add'l ones if present
| 72 | >         addl_fragment = anchor_name.find('&text=')
| 73 | >         if addl_fragment != -1:
| 74 | >             anchor_name = anchor_name[:addl_fragment]
| 75 | >         search_terms = anchor_name.split(',')
| 76 | >         # Delete prefix and suffix terms because they aren't needed
| 77 | >         if search_terms[0].endswith('-'):
| 78 | >             search_terms.pop(0)
| 79 | >         if search_terms[-1].startswith('-'):
| 80 | >             search_terms.pop()
| 81 | >         # Remake text directive with the terms separated by spaces as they should be in the page text
| 82 | >         newSep = ' '
| 83 | >         search_string = newSep.join(search_terms)
| 84 | >         if debug: pywikibot.stdout(' Converted text fragment to string "{}".'.format(search_string))
| 85 | >         if search_string in page_text:
| 86 |               found_section = True
| 87 | <             break
| 87 | >             if debug and not print_result: pywikibot.stdout(' Found text fragment!')
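A quick way to sanity-check the new fragment handling is to pull the conversion steps into a standalone helper. This is only a sketch mirroring the logic above; the function name and sample fragment are made up for illustration.

    # Sketch of the fragment-to-search-string conversion above; the helper
    # name and example value are illustrative, not part of the script.
    def fragment_to_search_string(anchor_name):
        anchor_name = anchor_name[8:]               # drop the 8-char ':~:text=' prefix
        addl_fragment = anchor_name.find('&text=')  # honor only the first directive
        if addl_fragment != -1:
            anchor_name = anchor_name[:addl_fragment]
        search_terms = anchor_name.split(',')
        if search_terms[0].endswith('-'):           # discard the 'prefix-' context term
            search_terms.pop(0)
        if search_terms[-1].startswith('-'):        # discard the '-suffix' context term
            search_terms.pop()
        return ' '.join(search_terms)

    print(fragment_to_search_string(':~:text=blue,whale'))  # prints 'blue whale'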
| 88 | >
| 89 | >     # If we're still here, it's a section link; read linked page to see if it really has this
| 90 | >     # anchor link
| 91 | >     if found_section == False:
| 92 | >         if debug: pywikibot.stdout(' Searching for section link {} on page.'.format(anchor_name))
| 93 | >         soup = BeautifulSoup(page_text, 'html.parser')
| 94 | >         # Search for a span with this ID
| 95 | >         for span_tag in soup.findAll('span'):
| 96 | >             span_name = span_tag.get('id', None)
| 97 | >             if span_name == anchor_name:
| 98 | >                 if debug and not print_result: pywikibot.stdout(' Found section in a span!')
| 99 | >                 found_section = True
| 100 | >                 break
| 101 |       if found_section == False:
| 102 |           # Search for a div with this ID
| 103 |           for span_tag in soup.findAll('div'):
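For reference, here is what the span/div scan is doing, reduced to a self-contained snippet; the HTML is a made-up stand-in for the kind of heading anchor MediaWiki generates.

    from bs4 import BeautifulSoup

    # Made-up snippet standing in for a rendered wiki page
    html = '<h2><span id="Early_history">Early history</span></h2><div id="notes"></div>'
    soup = BeautifulSoup(html, 'html.parser')

    anchor_name = 'Early_history'
    found = any(tag.get('id') == anchor_name for tag in soup.findAll(['span', 'div']))
    print(found)  # True

BeautifulSoup could also do this in one call with soup.find(id=anchor_name), which matches any tag carrying that ID; the patch keeps the span and div passes separate, presumably so the debug messages can report which kind of tag matched.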
| 152 |       # automatically follows redirects. This will catch formal redirects which come from pages
| 153 |       # such as Special:PermanentLink.
| 154 |       if response.history != []:
| 130 | <
| 155 |           permalink1 = 'Special:PermanentLink/'.lower()
| 156 |           permalink2 = 'Special:Permalink/'.lower()
| 157 |           page_slug_lower = page_slug.lower()
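The check above leans on the requests library's default behavior: redirects are followed automatically, and each intermediate response is kept in response.history. A minimal sketch, with a placeholder URL:

    import requests

    # Placeholder URL; any page that answers with a 301/302 will do
    response = requests.get('http://example.com/some-redirecting-page')
    if response.history != []:
        for hop in response.history:
            print('{} {}'.format(hop.status_code, hop.url))  # each redirected request
        print('Final URL: {}'.format(response.url))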