--- ValBot/Python/find_external_images.py 2022/02/21 23:59:20 1169
+++ ValBot/Python/find_external_images.py 2022/06/28 22:06:29 1173
@@ -1,3 +1,14 @@
+# Find External Images
+# by iritscen@yahoo.com
+# Looks at each link on a page (or in all the pages in a category) and prints the links to
+# images that are externally-hosted. You must pass in one or both of the following args:
+# -inlined: Show any plain URLs leading to images (these create embedded images,
+#  i.e. "img" tags)
+# -linked: Show any external URLs ("[URL]") leading to images (these create links,
+#  i.e. "a" tags)
+# Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
+#
+# Recommended viewing width:
+# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|
+
import os
from urllib.parse import urljoin
@@ -8,82 +19,140 @@ from pywikibot.bot import QuitKeyboardIn
from pywikibot import pagegenerators
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot
+#import bs4 # for listing members with dir()
from bs4 import BeautifulSoup
-first_run = False
pages_checked = 0
+page_errors = 0  # pages that could not be loaded (pages_checked counts the ones that were)
+ext_images = 0  # image links found on domains other than oni2.net (see oni2_images)
oni2_images = 0
file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')
+tag_names = []  # HTML tags to scan; main() adds 'a' for -linked and 'img' for -inlined
# Scrapes the HTML at the given URL for image tags
-def get_image_links(url, shown):
-links = []
-global oni2_images
+def get_image_links(url):
+# Fetches the page at "url" and prints every externally-hosted image link found in the
+# tags named in "tag_names". Nothing is returned; the results are tallied in the global
+# counters "pages_checked", "page_errors", "ext_images" and "oni2_images". If the page
+# does not load with status 200, an error is printed and only "page_errors" is bumped.
global pages_checked
+global page_errors
+global ext_images
+global oni2_images
+global file_formats
+global tag_names
response = fetch(url)
if response.status_code != 200:
-pywikibot.output('Skipping url: {}'.format(url))
-return links
+pywikibot.stdout(' ERROR: Could not load page at URL "{}"'.format(url))
+page_errors = page_errors + 1
+return
soup = BeautifulSoup(response.text, 'html.parser')
pages_checked = pages_checked + 1
-if not shown:
-tagname = 'a'
-elif shown == 'just':
-tagname = 'img'
-else:
-tagname = ['a', 'img']
-#pywikibot.output('Looking at tags.')
-for tag in soup.findAll(tagname):
-link = tag.get('src', tag.get('href', None))
+# find_all() is the current name of the deprecated findAll() alias
+for tag in soup.find_all(tag_names):
+# Prefer "href" and fall back to "src" when the tag has no "href"
+link = tag.get('href')
+if not link:
+link = tag.get('src')
+
+# Filter out empty links
if not link:
-#pywikibot.output('It is not a link.')
+if tag.get('id') == "top":
+continue
+
+# Tags without a "class" attribute yield None here, so fall back to an empty list
+# to keep the membership test below from raising TypeError
+class_names = tag.get('class') or []
+if "selflink" in class_names:
+continue
+
+# get_text must be called; formatting the bare method prints its repr instead
+pywikibot.stdout(' Could not process mystery link {}'.format(tag.get_text()))
+pywikibot.stdout(' Class is "{}".'.format(tag.get('class')))
+continue
+
+# A "src" or "href" starting with "/" would be a link to a local page or file; a
+# link starting with "#" is a section link
+if link.startswith('/') or link.startswith('#'):
+continue
+
+# The gnu.org link to the Free Documentation License is at the bottom of every page
+if link == "http://www.gnu.org/copyleft/fdl.html":
continue
-#pywikibot.output('Got link {0}.'.format(link))
+
+# Classify the link by its file extension, compared in lower case
_, ext = os.path.splitext(link)
if ext.lower() in file_formats:
-pywikibot.output('Found image link {0}.'.format(ext))
if "oni2.net" in link:
-pywikibot.stdout('Found an oni2.net image: {0}'.format(link))
+pywikibot.stdout(' Oni2.net image: {}'.format(link))
oni2_images = oni2_images + 1
-return links
-
+else:
+pywikibot.stdout(' External image: {}'.format(link))
+ext_images = ext_images + 1
+#else:
+#pywikibot.stdout(' Other external link: {}'.format(link))
def main(*args):
+# Parses the arguments, scans either every page in the category given with -cat: or the
+# single page given with -page: (the category wins if both are passed), and then prints
+# a summary of the global counters. At least one of -linked / -inlined is required; an
+# unrecognized argument, or a missing page/category, ends the run before the summary.
-cat = ''
-url = ''
-image_url = False
-shown = False
-desc = []
+global pages_checked
+global page_errors
+global ext_images
+global oni2_images
+global tag_names
+
+cat_name = ''
+page_name = ''
+
+#pywikibot.stdout('The members of the bs4.element.Tag class are:')
+#pywikibot.stdout(format(dir(bs4.element.Tag)))
local_args = pywikibot.handle_args(args)
genFactory = pagegenerators.GeneratorFactory()
for arg in local_args:
if arg.startswith('-cat:'):
-cat = arg[5:]
-elif arg == '-shown':
-shown = True
-elif arg == '-justshown':
-shown = 'just'
-elif url == '':
-url = arg
+cat_name = arg[5:]
+elif arg.startswith('-page:'):
+page_name = arg[6:]
+elif arg == '-linked':
+tag_names += ['a']
+elif arg == '-inlined':
+tag_names += ['img']
else:
-desc += [arg]
-desc = ' '.join(desc)
+pywikibot.stdout('Unknown argument "{}".'.format(arg))
+return
+
+# Without -linked or -inlined there would be no tags to look at
+if not tag_names:
+pywikibot.stdout('You need to pass this script either "-linked", "-inlined", or both arguments in order to specify which image links you want to find.')
+return
site = pywikibot.Site()
-cat_obj = pywikibot.Category(site, cat)
-generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
-for page in pagegenerators.PreloadingGenerator(generator, 100):
-pywikibot.stdout('Checking page {0}'.format(page.title()))
+if cat_name != '':
+cat_obj = pywikibot.Category(site, cat_name)
+generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
+for page in pagegenerators.PreloadingGenerator(generator, 100):
+pywikibot.stdout('Checking page "{}"'.format(page.title()))
+page_url = page.full_url().replace("%2F", "/")
+get_image_links(page_url)
+elif page_name != '':
+page = pywikibot.Page(site, page_name)
+pywikibot.stdout('Checking page "{}"'.format(page.title()))
page_url = page.full_url().replace("%2F", "/")
-get_image_links(page_url, shown)
+get_image_links(page_url)
+else:
+# The message has no placeholder, so it is printed as-is rather than formatted
+pywikibot.stdout('No page name or category name received.')
+return
-global pages_checked
-global oni2_images
-pywikibot.stdout('Checked {0} page(s) and found {1} image(s) from oni2.net.'.format(pages_checked, oni2_images))
+# Pick the singular or plural noun for each count in the summary
+chk_page_str = "pages"
+if pages_checked == 1:
+chk_page_str = "page"
+
+err_page_str = "pages"
+if page_errors == 1:
+err_page_str = "page"
+
+ext_image_str = "images"
+if ext_images == 1:
+ext_image_str = "image"
+
+oni2_image_str = "images"
+if oni2_images == 1:
+oni2_image_str = "image"
+
+pywikibot.stdout('-------------------------')
+pywikibot.stdout('Checked {0} {1} and failed to check {2} {3}.'.format(pages_checked, chk_page_str, page_errors, err_page_str))
+pywikibot.stdout('Found {0} {1} from oni2.net and {2} {3} from other domains.'.format(oni2_images, oni2_image_str, ext_images, ext_image_str))
if __name__ == '__main__':  # run main() only when executed as a script, not when imported
main()