--- ValBot/Python/find_external_images.py	2022/06/28 22:06:29	1173
+++ ValBot/Python/find_external_images.py	2023/04/28 00:55:00	1181
@@ -1,9 +1,11 @@
 # Find External Images
 # by iritscen@yahoo.com
 # Looks at each link on a page (or in all the pages in a category) and prints the links to
-# images that are externally-hosted. You must pass in one or both of the following args:
-#   -inlined: Show any plain URLs leading to images (these create embedded images, <img>)
-#   -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
+# images that are external to the wiki. Distinction is made between images hosted on oni2.net
+# and on third-party domains. You must pass in one or both of the following args:
+#   -embedded: Show any plain URLs leading to images (these create embedded images, <img>)
+#   -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
+#
 # Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
 #
 # Recommended viewing width:
@@ -14,145 +16,186 @@
 import os
 from urllib.parse import urljoin
 import pywikibot
-from pywikibot.bot import QuitKeyboardInterrupt
 from pywikibot import pagegenerators
 from pywikibot.comms.http import fetch
 from pywikibot.specialbots import UploadRobot
-#import bs4 # for listing members with dir()
+
+import bs4
 from bs4 import BeautifulSoup

+# Initialize globals
+debug = 0
 pages_checked = 0
 page_errors = 0
-ext_images = 0
-oni2_images = 0
+image_errors = 0
+linked_ext_images = 0
+linked_oni2_images = 0
+embedded_ext_images = 0
+embedded_oni2_images = 0
 file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')
 tag_names = []

-# Scrapes the HTML at the given URL for image tags
-def get_image_links(url):
-    global pages_checked
-    global page_errors
-    global ext_images
-    global oni2_images
-    global file_formats
-    global tag_names
-
-    response = fetch(url)
-    if response.status_code != 200:
-        pywikibot.stdout(' ERROR: Could not load page at URL "{}"'.format(url))
-        page_errors = page_errors + 1
-        return
-
-    soup = BeautifulSoup(response.text, 'html.parser')
-    pages_checked = pages_checked + 1
-    for tag in soup.findAll(tag_names):
-        link = tag.get('href')
-        if not link:
-            link = tag.get('src')
-
-        # Filter out empty links
-        if not link:
-            if tag.get('id') == "top":
-                continue
-
-            class_names = tag.get('class')
-            if "selflink" in class_names:
-                continue
+# Pass this function a singular noun and it will add an 's' to "noun" if "quantity" is not 1
+def plural_check(noun, quantity):
+    if quantity != 1:
+        return noun + "s"
+    else:
+        return noun

-            pywikibot.stdout(' Could not process mystery link {}'.format(tag.get_text))
-            pywikibot.stdout(' Class is "{}".'.format(tag.get('class')))
-            continue
-
-        # A "src" or "href" starting with "/" would be a link to a local page or file; a
-        # link starting with "#" is a section link
-        if link.startswith('/') or link.startswith('#'):
-            continue
+# Scrapes the HTML at the given URL for image tags
+def get_image_links(page_url, page_name):
+    global debug
+    global pages_checked
+    global page_errors
+    global image_errors
+    global linked_ext_images
+    global linked_oni2_images
+    global embedded_ext_images
+    global embedded_oni2_images
+    global file_formats
+    global tag_names
+    name_printed = 0
+
+    response = fetch(page_url)
+    if response.status_code != 200:
+        pywikibot.stdout(' ERROR: Could not load page at URL "{}".'.format(page_url))
+        page_errors += 1
+        return
+
+    soup = BeautifulSoup(response.text, 'html.parser')
+    pages_checked += 1
+    for tag in soup.findAll(tag_names):
+        link = tag.get('href')
+        if not link:
+            link = tag.get('src')
+
+        # Filter out empty links
+        if not link:
+            if tag.get('id') == "top":
+                continue

-        # The gnu.org link to the Free Documentation License is at the bottom of every page
-        if link == "http://www.gnu.org/copyleft/fdl.html":
-            continue

+            class_names = tag.get('class')
+            if "selflink" in class_names:
+                continue

-        _, ext = os.path.splitext(link)
-        if ext.lower() in file_formats:
-            if "oni2.net" in link:
-                pywikibot.stdout(' Oni2.net image: {}'.format(link))
-                oni2_images = oni2_images + 1
+            if not name_printed and not debug:
+                pywikibot.stdout('From page "{}":'.format(page_name))
+                name_printed = 1
+            pywikibot.stdout(' ERROR: Could not process mystery link {}.'.format(tag.get_text))
+            pywikibot.stdout(' Class is "{}".'.format(tag.get('class')))
+            page_errors += 1
+            continue
+
+        # A "src" or "href" starting with "/" would be a link to a local page or file; a
+        # link starting with "#" is a section link
+        if link.startswith('/') or link.startswith('#'):
+            continue
+
+        # The gnu.org link to the Free Documentation License is at the bottom of every page
+        if link == "http://www.gnu.org/copyleft/fdl.html":
+            continue
+
+        # Determine if link is to an image
+        _, ext = os.path.splitext(link)
+        if ext.lower() in file_formats:
+            if not name_printed and not debug:
+                pywikibot.stdout('Found on page "{}":'.format(page_name))
+                name_printed = 1
+            tag_text = format(tag)
+            if "oni2.net" in link:
+                if tag_text.startswith('<img'):
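
Aside: below is a minimal, self-contained sketch of the link-classification logic the revised get_image_links() implements, for trying the approach outside the bot. It is not ValBot's code: it uses the requests library in place of pywikibot.comms.http.fetch, distinguishes embedded from linked images by tag.name rather than by the rendered tag text the script checks with tag_text.startswith('<img'), and the page URL in the usage comment is a placeholder.

# Standalone sketch (assumptions noted above; not part of the diff)
import os
import requests
from bs4 import BeautifulSoup

FILE_FORMATS = ('.jpg', '.jpeg', '.png', '.gif', '.svg')

def classify_image_links(page_url):
    # The four counters mirror the script's embedded/linked x oni2/ext globals
    counts = {'embedded_oni2': 0, 'linked_oni2': 0, 'embedded_ext': 0, 'linked_ext': 0}
    response = requests.get(page_url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    for tag in soup.find_all(['a', 'img']):
        link = tag.get('href') or tag.get('src')  # <a> uses href, <img> uses src
        if not link:
            continue
        # Skip local pages/files ("/") and section links ("#"), as the script does
        if link.startswith('/') or link.startswith('#'):
            continue
        _, ext = os.path.splitext(link)
        if ext.lower() not in FILE_FORMATS:
            continue
        kind = 'embedded' if tag.name == 'img' else 'linked'
        host = 'oni2' if 'oni2.net' in link else 'ext'
        counts[kind + '_' + host] += 1
        print('{} {} image: {}'.format(kind, host, link))
    return counts

# Example usage (placeholder URL):
# classify_image_links('https://wiki.oni2.net/Some_Page')

Checking tag.name gives the same embedded-vs-linked split as the script's startswith test on format(tag), since format(tag) renders the tag's HTML starting with "<img" or "<a".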