# Find External Images
# by iritscen@yahoo.com
# Looks at each link on a page (or in all the pages in a category) and prints the links to
# images that are external to the wiki. A distinction is made between images hosted on oni2.net
# and on third-party domains. You must pass in one or both of the following args:
#    -embedded: Show any plain URLs leading to images (these create embedded images, <img>)
#    -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
#
# Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
#
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|
| 13 |
|
| 14 |
import os |
| 15 |
|
| 16 |
from urllib.parse import urljoin |
| 17 |
|
| 18 |
import pywikibot |
| 19 |
from pywikibot.bot import QuitKeyboardInterrupt |
| 20 |
from pywikibot import pagegenerators |
| 21 |
from pywikibot.comms.http import fetch |
| 22 |
from pywikibot.specialbots import UploadRobot |
| 23 |
|
| 24 |
import bs4 |
| 25 |
from bs4 import BeautifulSoup |
| 26 |
|
| 27 |
# Initialize globals

# Verbosity switch; main() sets it to 1 when "-dbg" is passed
debug = 0

# Tallies accumulated while pages are scanned and reported by main() at the end
pages_checked = page_errors = image_errors = 0
linked_ext_images = linked_oni2_images = 0
embedded_ext_images = embedded_oni2_images = 0

# File suffixes that mark a link as pointing to an image
file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')

# HTML tag names to scan; main() adds 'a' for "-linked" and 'img' for "-embedded"
tag_names = []
| 38 |
|
| 39 |
# Returns "noun" as-is when "quantity" is exactly 1; for any other quantity (including 0)
# the noun is returned with an 's' appended
def plural_check(noun, quantity):
    return noun if quantity == 1 else noun + "s"
| 45 |
|
| 46 |
def get_image_links(page_url, page_name):
    """Scrape the HTML at "page_url" and print every non-local link that points to an image.

    The tags listed in the global "tag_names" are scanned. For each one the "href"
    attribute (or, failing that, "src") is taken as the link. Links whose extension is
    in "file_formats" are printed and counted, split by whether the URL contains
    "oni2.net" and by whether the tag is an <a> (linked) or an <img> (embedded).

    Args:
        page_url: URL of the rendered wiki page to fetch.
        page_name: Page title, used only in the printed header line.

    Side effects:
        Prints through pywikibot.stdout() and updates the module-level counters
        (pages_checked, page_errors, image_errors and the four *_images tallies).
    """
    global pages_checked, page_errors, image_errors
    global linked_ext_images, linked_oni2_images
    global embedded_ext_images, embedded_oni2_images

    # The page name is printed once, before the first finding, and never in debug mode
    name_printed = False

    response = fetch(page_url)
    if response.status_code != 200:
        pywikibot.stdout(' ERROR: Could not load page at URL "{}".'.format(page_url))
        page_errors += 1
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    pages_checked += 1
    for tag in soup.find_all(tag_names):
        link = tag.get('href') or tag.get('src')

        # Filter out empty links
        if not link:
            if tag.get('id') == "top":
                continue

            # A tag without a "class" attribute yields None, so fall back to an empty list
            class_names = tag.get('class') or []
            if "selflink" in class_names:
                continue

            if not name_printed and not debug:
                pywikibot.stdout('From page "{}":'.format(page_name))
                name_printed = True
            pywikibot.stdout(' ERROR: Could not process mystery link {}.'.format(tag.get_text()))
            pywikibot.stdout(' Class is "{}".'.format(tag.get('class')))
            page_errors += 1
            continue

        # A "src" or "href" starting with "/" would be a link to a local page or file; a
        # link starting with "#" is a section link
        if link.startswith(('/', '#')):
            continue

        # The gnu.org link to the Free Documentation License is at the bottom of every page
        if link == "http://www.gnu.org/copyleft/fdl.html":
            continue

        # Determine if link is to an image
        _, ext = os.path.splitext(link)
        if ext.lower() not in file_formats:
            continue

        if not name_printed and not debug:
            pywikibot.stdout('Found on page "{}":'.format(page_name))
            name_printed = True

        on_oni2 = "oni2.net" in link
        host_label = "oni2.net" if on_oni2 else "external"
        if tag.name == 'a':
            pywikibot.stdout(' Linked {} image: {}'.format(host_label, link))
            if on_oni2:
                linked_oni2_images += 1
            else:
                linked_ext_images += 1
        elif tag.name == 'img':
            pywikibot.stdout(' Embedded {} image: {}'.format(host_label, link))
            if on_oni2:
                embedded_oni2_images += 1
            else:
                embedded_ext_images += 1
        else:
            # Guard for a tag type other than <a>/<img>; this gives up on the rest of the page
            pywikibot.stdout(' ERROR: Could not process {} image link {}.'.format(host_label, link))
            image_errors += 1
            return
| 128 |
|
| 129 |
def main(*args):
    """Parse the command line, scan the requested page(s) and print a summary.

    Recognized arguments (after pywikibot.handle_args() has consumed its own):
        -cat:NAME   scan every page in category NAME (recursing into subcategories)
        -page:NAME  scan the single page NAME (ignored when -cat: is also given)
        -linked     look at <a> tags
        -embedded   look at <img> tags
        -dbg        print each page title as it is checked

    At least one of -linked/-embedded and one of -cat:/-page: is required; otherwise
    a message is printed and the function returns without scanning anything. Any
    unknown argument also aborts the run.
    """
    global debug

    search_cat = ''
    search_page = ''

    local_args = pywikibot.handle_args(args)

    for arg in local_args:
        if arg.startswith('-cat:'):
            search_cat = arg[5:]
        elif arg.startswith('-page:'):
            search_page = arg[6:]
        elif arg == '-linked':
            # Guard against the flag being repeated on the command line
            if 'a' not in tag_names:
                tag_names.append('a')
        elif arg == '-embedded':
            if 'img' not in tag_names:
                tag_names.append('img')
        elif arg == '-dbg':
            debug = 1
        else:
            pywikibot.stdout('Unknown argument "{}".'.format(arg))
            return

    if not tag_names:
        pywikibot.stdout('You need to pass this script either "-linked", "-embedded", or both '
                         'arguments in order to specify which image links you want to find.')
        return

    site = pywikibot.Site()
    if search_cat:
        cat_obj = pywikibot.Category(site, search_cat)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        pages = pagegenerators.PreloadingGenerator(generator, 100)
    elif search_page:
        pages = [pywikibot.Page(site, search_page)]
    else:
        pywikibot.stdout('No page name or category name received.')
        return

    for page in pages:
        if debug:
            pywikibot.stdout('Checking page "{}"'.format(page.title()))
        # Un-escape slashes so subpage titles keep their "/" in the URL
        page_url = page.full_url().replace("%2F", "/")
        get_image_links(page_url, page.title())

    chk_page_str = plural_check("page", pages_checked)
    err_page_str = plural_check("page", page_errors)
    err_img_str = plural_check("image", image_errors)
    linked_ext_image_str = plural_check("image", linked_ext_images)
    linked_oni2_image_str = plural_check("image", linked_oni2_images)
    embedded_ext_image_str = plural_check("image", embedded_ext_images)
    embedded_oni2_image_str = plural_check("image", embedded_oni2_images)

    pywikibot.stdout('-------------------------')
    pywikibot.stdout(
        'Checked {0} {1} and failed to process {2} {3} on them. Failed to check {4} {5}.'.format(
            pages_checked, chk_page_str, image_errors, err_img_str, page_errors, err_page_str))
    # Only report the kinds of image link that were actually searched for
    if 'a' in tag_names:
        pywikibot.stdout(
            'Found {0} linked {1} from oni2.net and {2} linked {3} from other domains.'.format(
                linked_oni2_images, linked_oni2_image_str,
                linked_ext_images, linked_ext_image_str))
    if 'img' in tag_names:
        pywikibot.stdout(
            'Found {0} embedded {1} from oni2.net and {2} embedded {3} from other domains.'.format(
                embedded_oni2_images, embedded_oni2_image_str,
                embedded_ext_images, embedded_ext_image_str))
| 199 |
|
| 200 |
# Run the search only when this file is executed as a script
if "__main__" == __name__:
    main()