| 1 | # Find External Images | 
 
 
 
 
 | 2 | # by iritscen@yahoo.com | 
 
 
 
 
 | 3 | # Looks at each link on a page (or in all the pages in a category) and prints the links to | 
 
 
 
 
 | 4 | # images that are externally-hosted. You must pass in one or both of the following args: | 
 
 
 
 
 | 5 | # -inlined: Show any plain URLs leading to images (these create embedded images, <img>) | 
 
 
 
 
 | 6 | # -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>) | 
 
 
 
 
 | 7 | # Specify the page or category to search with -page:"Some Page" or -cat:"Some Category". | 
 
 
 
 
 | 8 | # | 
 
 
 
 
 | 9 | # Recommended viewing width: | 
 
 
 
 
 | 10 | # |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---| | 
 
 
 
 
 | 11 |  | 
 
 
 
 
 | 12 | import os | 
 
 
 
 
 | 13 |  | 
 
 
 
 
 | 14 | from urllib.parse import urljoin | 
 
 
 
 
 | 15 |  | 
 
 
 
 
 | 16 | import pywikibot | 
 
 
 
 
 | 17 |  | 
 
 
 
 
 | 18 | from pywikibot.bot import QuitKeyboardInterrupt | 
 
 
 
 
 | 19 | from pywikibot import pagegenerators | 
 
 
 
 
 | 20 | from pywikibot.comms.http import fetch | 
 
 
 
 
 | 21 | from pywikibot.specialbots import UploadRobot | 
 
 
 
 
 | 22 | #import bs4 # for listing members with dir() | 
 
 
 
 
 | 23 | from bs4 import BeautifulSoup | 
 
 
 
 
 | 24 |  | 
 
 
 
 
 | 25 | pages_checked = 0 | 
 
 
 
 
 | 26 | page_errors = 0 | 
 
 
 
 
 | 27 | ext_images = 0 | 
 
 
 
 
 | 28 | oni2_images = 0 | 
 
 
 
 
 | 29 | file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg') | 
 
 
 
 
 | 30 | tag_names = [] | 
 
 
 
 
 | 31 |  | 
 
 
 
 
 | 32 | # Scrapes the HTML at the given URL for image tags | 
 
 
 
 
 | 33 | def get_image_links(url): | 
 
 
 
 
 | 34 | global pages_checked | 
 
 
 
 
 | 35 | global page_errors | 
 
 
 
 
 | 36 | global ext_images | 
 
 
 
 
 | 37 | global oni2_images | 
 
 
 
 
 | 38 | global file_formats | 
 
 
 
 
 | 39 | global tag_names | 
 
 
 
 
 | 40 |  | 
 
 
 
 
 | 41 | response = fetch(url) | 
 
 
 
 
 | 42 | if response.status_code != 200: | 
 
 
 
 
 | 43 | pywikibot.stdout('   ERROR: Could not load page at URL "{}"'.format(url)) | 
 
 
 
 
 | 44 | page_errors = page_errors + 1 | 
 
 
 
 
 | 45 | return | 
 
 
 
 
 | 46 |  | 
 
 
 
 
 | 47 | soup = BeautifulSoup(response.text, 'html.parser') | 
 
 
 
 
 | 48 | pages_checked = pages_checked + 1 | 
 
 
 
 
 | 49 | for tag in soup.findAll(tag_names): | 
 
 
 
 
 | 50 | link = tag.get('href') | 
 
 
 
 
 | 51 | if not link: | 
 
 
 
 
 | 52 | link = tag.get('src') | 
 
 
 
 
 | 53 |  | 
 
 
 
 
 | 54 | # Filter out empty links | 
 
 
 
 
 | 55 | if not link: | 
 
 
 
 
 | 56 | if tag.get('id') == "top": | 
 
 
 
 
 | 57 | continue | 
 
 
 
 
 | 58 |  | 
 
 
 
 
 | 59 | class_names = tag.get('class') | 
 
 
 
 
 | 60 | if "selflink" in class_names: | 
 
 
 
 
 | 61 | continue | 
 
 
 
 
 | 62 |  | 
 
 
 
 
 | 63 | pywikibot.stdout('   Could not process mystery link {}'.format(tag.get_text)) | 
 
 
 
 
 | 64 | pywikibot.stdout('   Class is "{}".'.format(tag.get('class'))) | 
 
 
 
 
 | 65 | continue | 
 
 
 
 
 | 66 |  | 
 
 
 
 
 | 67 | # A "src" or "href" starting with "/" would be a link to a local page or file; a | 
 
 
 
 
 | 68 | # link starting with "#" is a section link | 
 
 
 
 
 | 69 | if link.startswith('/') or link.startswith('#'): | 
 
 
 
 
 | 70 | continue | 
 
 
 
 
 | 71 |  | 
 
 
 
 
 | 72 | # The gnu.org link to the Free Documentation License is at the bottom of every page | 
 
 
 
 
 | 73 | if link == "http://www.gnu.org/copyleft/fdl.html": | 
 
 
 
 
 | 74 | continue | 
 
 
 
 
 | 75 |  | 
 
 
 
 
 | 76 | _, ext = os.path.splitext(link) | 
 
 
 
 
 | 77 | if ext.lower() in file_formats: | 
 
 
 
 
 | 78 | if "oni2.net" in link: | 
 
 
 
 
 | 79 | pywikibot.stdout('   Oni2.net image: {}'.format(link)) | 
 
 
 
 
 | 80 | oni2_images = oni2_images + 1 | 
 
 
 
 
 | 81 | else: | 
 
 
 
 
 | 82 | pywikibot.stdout('   External image: {}'.format(link)) | 
 
 
 
 
 | 83 | ext_images = ext_images + 1 | 
 
 
 
 
 | 84 | #else: | 
 
 
 
 
 | 85 | #pywikibot.stdout('   Other external link: {}'.format(link)) | 
 
 
 
 
 | 86 |  | 
 
 
 
 
 | 87 | def main(*args): | 
 
 
 
 
 | 88 | global pages_checked | 
 
 
 
 
 | 89 | global page_errors | 
 
 
 
 
 | 90 | global ext_images | 
 
 
 
 
 | 91 | global oni2_images | 
 
 
 
 
 | 92 | global tag_names | 
 
 
 
 
 | 93 |  | 
 
 
 
 
 | 94 | cat_name = '' | 
 
 
 
 
 | 95 | page_name = '' | 
 
 
 
 
 | 96 |  | 
 
 
 
 
 | 97 | #pywikibot.stdout('The members of the bs4.element.Tag class are:') | 
 
 
 
 
 | 98 | #pywikibot.stdout(format(dir(bs4.element.Tag))) | 
 
 
 
 
 | 99 |  | 
 
 
 
 
 | 100 | local_args = pywikibot.handle_args(args) | 
 
 
 
 
 | 101 | genFactory = pagegenerators.GeneratorFactory() | 
 
 
 
 
 | 102 |  | 
 
 
 
 
 | 103 | for arg in local_args: | 
 
 
 
 
 | 104 | if arg.startswith('-cat:'): | 
 
 
 
 
 | 105 | cat_name = arg[5:] | 
 
 
 
 
 | 106 | elif arg.startswith('-page:'): | 
 
 
 
 
 | 107 | page_name = arg[6:] | 
 
 
 
 
 | 108 | elif arg == '-linked': | 
 
 
 
 
 | 109 | tag_names += ['a'] | 
 
 
 
 
 | 110 | elif arg == '-inlined': | 
 
 
 
 
 | 111 | tag_names += ['img'] | 
 
 
 
 
 | 112 | else: | 
 
 
 
 
 | 113 | pywikibot.stdout('Unknown argument "{}".'.format(arg)) | 
 
 
 
 
 | 114 | return | 
 
 
 
 
 | 115 |  | 
 
 
 
 
 | 116 | if not tag_names: | 
 
 
 
 
 | 117 | pywikibot.stdout('You need to pass this script either "-linked", "-inlined", or both arguments in order to specify which image links you want to find.') | 
 
 
 
 
 | 118 | return | 
 
 
 
 
 | 119 |  | 
 
 
 
 
 | 120 | site = pywikibot.Site() | 
 
 
 
 
 | 121 | if cat_name != '': | 
 
 
 
 
 | 122 | cat_obj = pywikibot.Category(site, cat_name) | 
 
 
 
 
 | 123 | generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True) | 
 
 
 
 
 | 124 | for page in pagegenerators.PreloadingGenerator(generator, 100): | 
 
 
 
 
 | 125 | pywikibot.stdout('Checking page "{}"'.format(page.title())) | 
 
 
 
 
 | 126 | page_url = page.full_url().replace("%2F", "/") | 
 
 
 
 
 | 127 | get_image_links(page_url) | 
 
 
 
 
 | 128 | elif page_name != '': | 
 
 
 
 
 | 129 | page = pywikibot.Page(site, page_name) | 
 
 
 
 
 | 130 | pywikibot.stdout('Checking page "{}"'.format(page.title())) | 
 
 
 
 
 | 131 | page_url = page.full_url().replace("%2F", "/") | 
 
 
 
 
 | 132 | get_image_links(page_url) | 
 
 
 
 
 | 133 | else: | 
 
 
 
 
 | 134 | pywikibot.stdout('No page name or category name received.'.format(arg)) | 
 
 
 
 
 | 135 | return | 
 
 
 
 
 | 136 |  | 
 
 
 
 
 | 137 | chk_page_str = "pages" | 
 
 
 
 
 | 138 | if pages_checked == 1: | 
 
 
 
 
 | 139 | chk_page_str = "page" | 
 
 
 
 
 | 140 |  | 
 
 
 
 
 | 141 | err_page_str = "pages" | 
 
 
 
 
 | 142 | if page_errors == 1: | 
 
 
 
 
 | 143 | err_page_str = "page" | 
 
 
 
 
 | 144 |  | 
 
 
 
 
 | 145 | ext_image_str = "images" | 
 
 
 
 
 | 146 | if ext_images == 1: | 
 
 
 
 
 | 147 | ext_image_str = "image" | 
 
 
 
 
 | 148 |  | 
 
 
 
 
 | 149 | oni2_image_str = "images" | 
 
 
 
 
 | 150 | if oni2_images == 1: | 
 
 
 
 
 | 151 | oni2_image_str = "image" | 
 
 
 
 
 | 152 |  | 
 
 
 
 
 | 153 | pywikibot.stdout('-------------------------') | 
 
 
 
 
 | 154 | pywikibot.stdout('Checked {0} {1} and failed to check {2} {3}.'.format(pages_checked, chk_page_str, page_errors, err_page_str)) | 
 
 
 
 
 | 155 | pywikibot.stdout('Found {0} {1} from oni2.net and {2} {3} from other domains.'.format(oni2_images, oni2_image_str, ext_images, ext_image_str)) | 
 
 
 
 
 | 156 |  | 
 
 
 
 
 | 157 | if __name__ == '__main__': | 
 
 
 
 
 | 158 | main() |