# Find External Images
# by iritscen@yahoo.com
# Looks at each link on a page (or in all the pages in a category) and prints the links to
# images that are externally-hosted. You must pass in one or both of the following args:
#   -inlined: Show any plain URLs leading to images (these create embedded images, <img>)
#   -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
# Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
#
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|

import os

from urllib.parse import urljoin, urlsplit

import pywikibot
from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot import pagegenerators
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot
#import bs4 # for listing members with dir()
from bs4 import BeautifulSoup

# Running totals, updated by get_image_links() and reported at the end of main()
pages_checked = 0
page_errors = 0
ext_images = 0
oni2_images = 0

# A link counts as an image when the path it points to ends in one of these extensions
file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')

# HTML tag names to inspect; filled in by main() from the -linked/-inlined arguments
tag_names = []


# Returns the lower-cased extension of the path portion of 'link'. Only the path is
# examined so that a query string or fragment following the file name (as in
# "pic.png?width=200") does not hide the extension.
def _link_extension(link):
    try:
        path = urlsplit(link).path
    except ValueError:
        # urlsplit() rejects some malformed URLs; fall back to the raw text
        path = link
    _, ext = os.path.splitext(path)
    return ext.lower()


# Returns 'word' as-is when 'count' is 1, otherwise 'word' with an "s" appended
def _pluralize(count, word):
    return word if count == 1 else word + 's'


# Scrapes the HTML at the given URL for the tags named in 'tag_names' and prints each
# link that leads to an image which is not a local ("/...") or section ("#...") link.
# Updates the module-level counters and returns nothing.
def get_image_links(url):
    global pages_checked
    global page_errors
    global ext_images
    global oni2_images

    response = fetch(url)
    if response.status_code != 200:
        pywikibot.stdout(' ERROR: Could not load page at URL "{}"'.format(url))
        page_errors = page_errors + 1
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    pages_checked = pages_checked + 1
    for tag in soup.find_all(tag_names):
        link = tag.get('href') or tag.get('src')

        # Filter out empty links
        if not link:
            if tag.get('id') == "top":
                continue

            # A tag without a "class" attribute yields None, which cannot be searched
            class_names = tag.get('class') or []
            if "selflink" in class_names:
                continue

            pywikibot.stdout(' Could not process mystery link {}'.format(tag.get_text()))
            pywikibot.stdout(' Class is "{}".'.format(tag.get('class')))
            continue

        # A "src" or "href" starting with "/" would be a link to a local page or file; a
        # link starting with "#" is a section link
        if link.startswith(('/', '#')):
            continue

        # The gnu.org link to the Free Documentation License is at the bottom of every page
        if link == "http://www.gnu.org/copyleft/fdl.html":
            continue

        if _link_extension(link) in file_formats:
            if "oni2.net" in link:
                pywikibot.stdout(' Oni2.net image: {}'.format(link))
                oni2_images = oni2_images + 1
            else:
                pywikibot.stdout(' External image: {}'.format(link))
                ext_images = ext_images + 1
        #else:
            #pywikibot.stdout(' Other external link: {}'.format(link))


# Parses the command-line arguments, checks the requested page (or every page in the
# requested category, recursively) and prints a summary of what was found
def main(*args):
    cat_name = ''
    page_name = ''

    #pywikibot.stdout('The members of the bs4.element.Tag class are:')
    #pywikibot.stdout(format(dir(bs4.element.Tag)))

    local_args = pywikibot.handle_args(args)

    for arg in local_args:
        if arg.startswith('-cat:'):
            cat_name = arg[5:]
        elif arg.startswith('-page:'):
            page_name = arg[6:]
        elif arg == '-linked':
            # The list is modified in place so that get_image_links() sees the change
            tag_names.append('a')
        elif arg == '-inlined':
            tag_names.append('img')
        else:
            pywikibot.stdout('Unknown argument "{}".'.format(arg))
            return

    if not tag_names:
        pywikibot.stdout('You need to pass this script either "-linked", "-inlined", or '
                         'both arguments in order to specify which image links you want '
                         'to find.')
        return

    site = pywikibot.Site()
    if cat_name != '':
        cat_obj = pywikibot.Category(site, cat_name)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        pages = pagegenerators.PreloadingGenerator(generator, 100)
    elif page_name != '':
        pages = [pywikibot.Page(site, page_name)]
    else:
        pywikibot.stdout('No page name or category name received.')
        return

    for page in pages:
        pywikibot.stdout('Checking page "{}"'.format(page.title()))
        page_url = page.full_url().replace("%2F", "/")
        get_image_links(page_url)

    pywikibot.stdout('-------------------------')
    pywikibot.stdout('Checked {0} {1} and failed to check {2} {3}.'.format(
        pages_checked, _pluralize(pages_checked, 'page'),
        page_errors, _pluralize(page_errors, 'page')))
    pywikibot.stdout('Found {0} {1} from oni2.net and {2} {3} from other domains.'.format(
        oni2_images, _pluralize(oni2_images, 'image'),
        ext_images, _pluralize(ext_images, 'image')))


if __name__ == '__main__':
    main()