# Find External Images
# by iritscen@yahoo.com
# Looks at each link on a page (or in all the pages in a category) and prints the links to
# images that are externally-hosted. You must pass in one or both of the following args:
# -inlined: Show any plain URLs leading to images (these create embedded images, <img>)
# -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
# Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
#
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|
11 |
|
12 |
import os |
13 |
|
14 |
from urllib.parse import urljoin |
15 |
|
16 |
import pywikibot |
17 |
|
18 |
from pywikibot.bot import QuitKeyboardInterrupt |
19 |
from pywikibot import pagegenerators |
20 |
from pywikibot.comms.http import fetch |
21 |
from pywikibot.specialbots import UploadRobot |
22 |
#import bs4 # for listing members with dir() |
23 |
from bs4 import BeautifulSoup |
24 |
|
25 |
# Running tallies, updated by get_image_links() and reported by main():
#   pages_checked - pages whose HTML was fetched successfully
#   page_errors   - pages that could not be loaded
#   ext_images    - image links found on domains other than oni2.net
#   oni2_images   - image links found whose URL contains "oni2.net"
pages_checked = page_errors = ext_images = oni2_images = 0

# File extensions (lower-case, with the dot) that mark a link as an image
file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')

# HTML tag names to scan; filled in by main() from the -linked/-inlined args
tag_names = []
31 |
|
32 |
# Scrapes the HTML at the given URL for image tags |
33 |
def get_image_links(url):
    """Scan the HTML at ``url`` and report links to externally hosted images.

    Every tag whose name appears in the module-level ``tag_names`` list is
    examined. Its ``href`` (or, failing that, ``src``) value is reported when
    its path ends in one of the extensions in ``file_formats``.

    Side effects on module-level counters:
      * ``page_errors`` is incremented when the page does not return HTTP 200.
      * ``pages_checked`` is incremented when the page loads.
      * ``oni2_images`` is incremented for image links containing "oni2.net".
      * ``ext_images`` is incremented for all other image links.

    :param url: address of the rendered page to fetch
    :return: None; results are written with ``pywikibot.stdout``
    """
    # Only the counters are rebound here; file_formats and tag_names are
    # merely read, so they need no "global" declaration.
    global pages_checked
    global page_errors
    global ext_images
    global oni2_images

    response = fetch(url)
    if response.status_code != 200:
        pywikibot.stdout(' ERROR: Could not load page at URL "{}"'.format(url))
        page_errors += 1
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    pages_checked += 1
    for tag in soup.find_all(tag_names):
        # <a> tags carry the target in "href", <img> tags in "src"
        link = tag.get('href') or tag.get('src')

        # Filter out empty links
        if not link:
            if tag.get('id') == "top":
                continue

            # A tag may have no "class" attribute at all, in which case
            # get() returns None; treat that as "no classes" rather than
            # crashing on the membership test.
            class_names = tag.get('class')
            if "selflink" in (class_names or ()):
                continue

            pywikibot.stdout(' Could not process mystery link {}'.format(tag.get_text()))
            pywikibot.stdout(' Class is "{}".'.format(class_names))
            continue

        # A "src" or "href" starting with "/" would be a link to a local page or file; a
        # link starting with "#" is a section link
        if link.startswith(('/', '#')):
            continue

        # The gnu.org link to the Free Documentation License is at the bottom of every page
        if link == "http://www.gnu.org/copyleft/fdl.html":
            continue

        # Drop any fragment or query string before looking at the extension,
        # so that e.g. "pic.png?width=200" is still recognized as an image.
        path = link.split('#', 1)[0].split('?', 1)[0]
        _, ext = os.path.splitext(path)
        if ext.lower() not in file_formats:
            continue

        if "oni2.net" in link:
            pywikibot.stdout(' Oni2.net image: {}'.format(link))
            oni2_images += 1
        else:
            pywikibot.stdout(' External image: {}'.format(link))
            ext_images += 1
86 |
|
87 |
def main(*args):
    """Parse the command line, scan the requested page(s), and print a summary.

    Recognized script arguments:
      * ``-cat:NAME``  - check every page in category NAME (recursively)
      * ``-page:NAME`` - check the single page NAME
      * ``-linked``    - look at ``<a>`` tags
      * ``-inlined``   - look at ``<img>`` tags

    At least one of ``-linked``/``-inlined`` and one of ``-cat:``/``-page:``
    is required. If both ``-cat:`` and ``-page:`` are given, the category
    takes precedence. Any other argument aborts the run with a message.

    :param args: command-line arguments, passed to ``pywikibot.handle_args``
    :return: None
    """
    cat_name = ''
    page_name = ''

    local_args = pywikibot.handle_args(args)

    for arg in local_args:
        if arg.startswith('-cat:'):
            cat_name = arg[len('-cat:'):]
        elif arg.startswith('-page:'):
            page_name = arg[len('-page:'):]
        elif arg == '-linked':
            # Mutate the shared list in place; skip duplicates so a repeated
            # flag (or a second call to main) does not grow the list.
            if 'a' not in tag_names:
                tag_names.append('a')
        elif arg == '-inlined':
            if 'img' not in tag_names:
                tag_names.append('img')
        else:
            pywikibot.stdout('Unknown argument "{}".'.format(arg))
            return

    if not tag_names:
        pywikibot.stdout('You need to pass this script either "-linked", "-inlined", or both arguments in order to specify which image links you want to find.')
        return

    # Build the collection of pages to check from whichever selector was given
    site = pywikibot.Site()
    if cat_name:
        cat_obj = pywikibot.Category(site, cat_name)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        pages = pagegenerators.PreloadingGenerator(generator, 100)
    elif page_name:
        pages = [pywikibot.Page(site, page_name)]
    else:
        pywikibot.stdout('No page name or category name received.')
        return

    for page in pages:
        pywikibot.stdout('Checking page "{}"'.format(page.title()))
        # Restore literal slashes that full_url() percent-encoded
        page_url = page.full_url().replace("%2F", "/")
        get_image_links(page_url)

    def pluralize(count, singular):
        """Return ``singular`` for a count of exactly 1, else its plural."""
        return singular if count == 1 else singular + 's'

    pywikibot.stdout('-------------------------')
    pywikibot.stdout('Checked {0} {1} and failed to check {2} {3}.'.format(
        pages_checked, pluralize(pages_checked, 'page'),
        page_errors, pluralize(page_errors, 'page')))
    pywikibot.stdout('Found {0} {1} from oni2.net and {2} {3} from other domains.'.format(
        oni2_images, pluralize(oni2_images, 'image'),
        ext_images, pluralize(ext_images, 'image')))
156 |
|
157 |
# Run the scan only when this file is executed as a script, not when imported
if __name__ == "__main__":
    main()