ViewVC Help
View File | Revision Log | View Changeset | Root Listing
root/Oni2/ValBot/Python/find_external_images.py
Revision: 1173
Committed: Tue Jun 28 22:06:29 2022 UTC (3 years, 3 months ago) by iritscen
Content type: text/x-python
File size: 5418 byte(s)
Log Message:
ValBot: check_intrawiki_section_links.py won't quit when a link cannot be understood; it will just move on. find_external_images.py is now polished and robust.

File Contents

# Content
1 # Find External Images
2 # by iritscen@yahoo.com
3 # Looks at each link on a page (or in all the pages in a category) and prints the links to
4 # images that are externally-hosted. You must pass in one or both of the following args:
5 # -inlined: Show any plain URLs leading to images (these create embedded images, <img>)
6 # -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
7 # Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
8 #
9 # Recommended viewing width:
10 # |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|
11
12 import os
13
14 from urllib.parse import urljoin
15
16 import pywikibot
17
18 from pywikibot.bot import QuitKeyboardInterrupt
19 from pywikibot import pagegenerators
20 from pywikibot.comms.http import fetch
21 from pywikibot.specialbots import UploadRobot
22 #import bs4 # for listing members with dir()
23 from bs4 import BeautifulSoup
24
# Running totals; get_image_links() updates them and main() reports them at the end
pages_checked = page_errors = ext_images = oni2_images = 0

# Lower-case file suffixes (dot included) that mark a link as pointing to an image
file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')

# Names of the HTML tags to inspect; main() fills this in from "-linked"/"-inlined"
tag_names = list()
31
32 # Scrapes the HTML at the given URL for image tags
def get_image_links(url):
    """Fetch the page at `url` and print the non-local image links found in it.

    Every tag whose name is listed in the module-level `tag_names` is examined. Its
    "href" (or, failing that, its "src") is reported if the path portion ends in one of
    the extensions in `file_formats`. Links starting with "/" or "#" are skipped as
    local, as is the GNU FDL link.

    Results are printed with pywikibot.stdout() and tallied in the module-level counters
    `pages_checked`, `page_errors`, `ext_images` and `oni2_images`. Nothing is returned.
    """
    # Only the counters are rebound here; file_formats and tag_names are merely read
    global pages_checked
    global page_errors
    global ext_images
    global oni2_images

    response = fetch(url)
    if response.status_code != 200:
        pywikibot.stdout(' ERROR: Could not load page at URL "{}"'.format(url))
        page_errors += 1
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    pages_checked += 1
    for tag in soup.find_all(tag_names):
        link = tag.get('href') or tag.get('src')

        # Filter out empty links
        if not link:
            if tag.get('id') == "top":
                continue

            # A tag with no "class" attribute gives None, which cannot be searched with
            # "in", so fall back to an empty list
            class_names = tag.get('class') or []
            if "selflink" in class_names:
                continue

            pywikibot.stdout(' Could not process mystery link {}'.format(tag.get_text()))
            pywikibot.stdout(' Class is "{}".'.format(tag.get('class')))
            continue

        # A "src" or "href" starting with "/" would be a link to a local page or file; a
        # link starting with "#" is a section link
        if link.startswith(('/', '#')):
            continue

        # The gnu.org link to the Free Documentation License is at the bottom of every page
        if link == "http://www.gnu.org/copyleft/fdl.html":
            continue

        # Look at the extension of the path only: drop any "#fragment" and "?query" so
        # that "pic.jpg?width=200" is recognized and "index.php?title=File:X.png" is not
        path = link.partition('#')[0].partition('?')[0]
        _, ext = os.path.splitext(path)
        if ext.lower() in file_formats:
            if "oni2.net" in link:
                pywikibot.stdout(' Oni2.net image: {}'.format(link))
                oni2_images += 1
            else:
                pywikibot.stdout(' External image: {}'.format(link))
                ext_images += 1
86
def main(*args):
    """Parse the script arguments, scan the requested page(s) and print a summary.

    Arguments understood (after pywikibot.handle_args() has consumed the global ones):
      -cat:"Some Category"   scan every page in the category, recursing into subcategories
      -page:"Some Page"      scan a single page (not used if -cat: is also given)
      -linked                examine <a> tags
      -inlined               examine <img> tags
    At least one of -linked/-inlined and one of -cat:/-page: is required. Any other
    argument, or a missing required one, prints a message and returns without scanning.
    """
    cat_name = ''
    page_name = ''

    local_args = pywikibot.handle_args(args)
    for arg in local_args:
        if arg.startswith('-cat:'):
            cat_name = arg[5:]
        elif arg.startswith('-page:'):
            page_name = arg[6:]
        elif arg == '-linked':
            # Mutate the shared list in place; skip repeats of the same switch
            if 'a' not in tag_names:
                tag_names.append('a')
        elif arg == '-inlined':
            if 'img' not in tag_names:
                tag_names.append('img')
        else:
            pywikibot.stdout('Unknown argument "{}".'.format(arg))
            return

    if not tag_names:
        pywikibot.stdout('You need to pass this script either "-linked", "-inlined", or both arguments in order to specify which image links you want to find.')
        return

    # Build the list of pages to scan; a category takes precedence over a single page
    site = pywikibot.Site()
    if cat_name:
        cat_obj = pywikibot.Category(site, cat_name)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        pages = pagegenerators.PreloadingGenerator(generator, 100)
    elif page_name:
        pages = [pywikibot.Page(site, page_name)]
    else:
        pywikibot.stdout('No page name or category name received.')
        return

    for page in pages:
        pywikibot.stdout('Checking page "{}"'.format(page.title()))
        # Turn escaped slashes in the URL back into literal "/" before fetching
        page_url = page.full_url().replace("%2F", "/")
        get_image_links(page_url)

    def noun(count, singular):
        """Return `singular` when count is exactly 1, otherwise its plural with "s"."""
        return singular if count == 1 else singular + 's'

    pywikibot.stdout('-------------------------')
    pywikibot.stdout('Checked {0} {1} and failed to check {2} {3}.'.format(pages_checked, noun(pages_checked, 'page'), page_errors, noun(page_errors, 'page')))
    pywikibot.stdout('Found {0} {1} from oni2.net and {2} {3} from other domains.'.format(oni2_images, noun(oni2_images, 'image'), ext_images, noun(ext_images, 'image')))
156
# Run the scan only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()