ValBot/Python/find_external_images.py

# Find External Images
# by iritscen@yahoo.com
# Looks at each link on a page (or in all the pages in a category) and prints the links to
# images that are externally-hosted. You must pass in one or both of the following args:
# -inlined: Show any plain URLs leading to images (these create embedded images, <img>)
# -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
# Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
#
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|

import os

from urllib.parse import urljoin

import pywikibot

from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot import pagegenerators
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot
#import bs4 # for listing members with dir()
from bs4 import BeautifulSoup

# Running totals: incremented by get_image_links() and reported by main()
pages_checked = page_errors = ext_images = oni2_images = 0

# Lowercase file suffixes that mark a link as pointing to an image
file_formats = (
    '.jpg',
    '.jpeg',
    '.png',
    '.gif',
    '.svg',
)

# HTML tag names to inspect; main() adds 'a' for -linked and 'img' for -inlined
tag_names = []

# Scrapes the HTML at the given URL for image tags
def get_image_links(url):
    """Fetch the page at `url` and print every externally-hosted image link found in it.

    Every tag whose name is listed in the global `tag_names` is examined. The tag's
    "href" is used as the link, falling back to "src". Links whose extension is in the
    global `file_formats` are printed and counted in `oni2_images` (link contains
    "oni2.net") or `ext_images` (anything else).

    Side effects: increments `pages_checked` when the page loads with HTTP status 200,
    otherwise increments `page_errors` and returns without scanning.

    :param url: address of the rendered page to scan
    :return: None
    """
    global pages_checked
    global page_errors
    global ext_images
    global oni2_images

    response = fetch(url)
    if response.status_code != 200:
        pywikibot.stdout('   ERROR: Could not load page at URL "{}"'.format(url))
        page_errors += 1
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    pages_checked += 1
    for tag in soup.find_all(tag_names):
        # Anchors carry their target in "href", images in "src"
        link = tag.get('href') or tag.get('src')

        # Filter out empty links
        if not link:
            if tag.get('id') == "top":
                continue

            # A tag without a "class" attribute yields None here, so fall back to an
            # empty list to keep the membership test from raising TypeError
            class_names = tag.get('class') or []
            if "selflink" in class_names:
                continue

            # get_text must be *called*; otherwise the bound method's repr is printed
            pywikibot.stdout('   Could not process mystery link {}'.format(tag.get_text()))
            pywikibot.stdout('   Class is "{}".'.format(tag.get('class')))
            continue

        # A "src" or "href" starting with "/" would be a link to a local page or file; a
        # link starting with "#" is a section link
        # NOTE: protocol-relative URLs ("//host/...") also start with "/" and are skipped
        if link.startswith(('/', '#')):
            continue

        # The gnu.org link to the Free Documentation License is at the bottom of every page
        if link == "http://www.gnu.org/copyleft/fdl.html":
            continue

        # Only links whose extension names a known image format are reported
        _, ext = os.path.splitext(link)
        if ext.lower() not in file_formats:
            continue

        if "oni2.net" in link:
            pywikibot.stdout('   Oni2.net image: {}'.format(link))
            oni2_images += 1
        else:
            pywikibot.stdout('   External image: {}'.format(link))
            ext_images += 1

def _plural(count, singular):
    """Return `singular` as-is when `count` is 1, otherwise with an "s" appended."""
    return singular if count == 1 else singular + 's'

def _check_page(page):
    """Announce `page` on the console and scan its rendered HTML for image links."""
    pywikibot.stdout('Checking page "{}"'.format(page.title()))
    # Turn any "%2F" escapes in the URL back into literal slashes
    # (presumably these come from subpage titles -- TODO confirm)
    page_url = page.full_url().replace("%2F", "/")
    get_image_links(page_url)

def main(*args):
    """Parse the script's arguments, scan the requested page(s) and print a summary.

    Recognized arguments (after pywikibot.handle_args() has consumed its own):
      -cat:NAME   scan every page in category NAME, recursing into subcategories
      -page:NAME  scan the single page NAME (only used when no -cat: is given)
      -linked     inspect <a> tags
      -inlined    inspect <img> tags

    At least one of -linked/-inlined and one of -cat:/-page: is required; otherwise a
    message is printed and the function returns without scanning. Any unrecognized
    argument also aborts the run.

    :param args: command-line arguments, passed through to pywikibot.handle_args()
    :return: None
    """
    cat_name = ''
    page_name = ''

    #pywikibot.stdout('The members of the bs4.element.Tag class are:')
    #pywikibot.stdout(format(dir(bs4.element.Tag)))

    local_args = pywikibot.handle_args(args)

    for arg in local_args:
        if arg.startswith('-cat:'):
            cat_name = arg[5:]
        elif arg.startswith('-page:'):
            page_name = arg[6:]
        elif arg == '-linked':
            # In-place mutation of the module-level list read by get_image_links()
            tag_names.append('a')
        elif arg == '-inlined':
            tag_names.append('img')
        else:
            pywikibot.stdout('Unknown argument "{}".'.format(arg))
            return

    if not tag_names:
        pywikibot.stdout('You need to pass this script either "-linked", "-inlined", or both arguments in order to specify which image links you want to find.')
        return

    site = pywikibot.Site()
    if cat_name:
        cat_obj = pywikibot.Category(site, cat_name)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        for page in pagegenerators.PreloadingGenerator(generator, 100):
            _check_page(page)
    elif page_name:
        _check_page(pywikibot.Page(site, page_name))
    else:
        # The message has no placeholders, so no .format() call is needed
        pywikibot.stdout('No page name or category name received.')
        return

    pywikibot.stdout('-------------------------')
    pywikibot.stdout('Checked {0} {1} and failed to check {2} {3}.'.format(
        pages_checked, _plural(pages_checked, 'page'),
        page_errors, _plural(page_errors, 'page')))
    pywikibot.stdout('Found {0} {1} from oni2.net and {2} {3} from other domains.'.format(
        oni2_images, _plural(oni2_images, 'image'),
        ext_images, _plural(ext_images, 'image')))

# Run the scan only when this file is executed as a script, not when it is imported
if __name__ == '__main__':
    main()
Revision:	1173
Committed:	Tue Jun 28 22:06:29 2022 UTC (3 years, 4 months ago) by iritscen
Content type:	text/x-python
File size:	5418 byte(s)
Log Message:	ValBot: check_intrawiki_section_links.py won't quit when a link cannot be understood; it will just move on. find_external_images.py is now polished and robust.
#	Content
1	# Find External Images
2	# by iritscen@yahoo.com
3	# Looks at each link on a page (or in all the pages in a category) and prints the links to
4	# images that are externally-hosted. You must pass in one or both of the following args:
5	# -inlined: Show any plain URLs leading to images (these create embedded images, <img>)
6	# -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
7	# Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
8	#
9	# Recommended viewing width:
10	# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|
11
12	import os
13
14	from urllib.parse import urljoin
15
16	import pywikibot
17
18	from pywikibot.bot import QuitKeyboardInterrupt
19	from pywikibot import pagegenerators
20	from pywikibot.comms.http import fetch
21	from pywikibot.specialbots import UploadRobot
22	#import bs4 # for listing members with dir()
23	from bs4 import BeautifulSoup
24
25	pages_checked = 0
26	page_errors = 0
27	ext_images = 0
28	oni2_images = 0
29	file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')
30	tag_names = []
31
32	# Scrapes the HTML at the given URL for image tags
33	def get_image_links(url):
34	global pages_checked
35	global page_errors
36	global ext_images
37	global oni2_images
38	global file_formats
39	global tag_names
40
41	response = fetch(url)
42	if response.status_code != 200:
43	pywikibot.stdout(' ERROR: Could not load page at URL "{}"'.format(url))
44	page_errors = page_errors + 1
45	return
46
47	soup = BeautifulSoup(response.text, 'html.parser')
48	pages_checked = pages_checked + 1
49	for tag in soup.findAll(tag_names):
50	link = tag.get('href')
51	if not link:
52	link = tag.get('src')
53
54	# Filter out empty links
55	if not link:
56	if tag.get('id') == "top":
57	continue
58
59	class_names = tag.get('class')
60	if "selflink" in class_names:
61	continue
62
63	pywikibot.stdout(' Could not process mystery link {}'.format(tag.get_text))
64	pywikibot.stdout(' Class is "{}".'.format(tag.get('class')))
65	continue
66
67	# A "src" or "href" starting with "/" would be a link to a local page or file; a
68	# link starting with "#" is a section link
69	if link.startswith('/') or link.startswith('#'):
70	continue
71
72	# The gnu.org link to the Free Documentation License is at the bottom of every page
73	if link == "http://www.gnu.org/copyleft/fdl.html":
74	continue
75
76	_, ext = os.path.splitext(link)
77	if ext.lower() in file_formats:
78	if "oni2.net" in link:
79	pywikibot.stdout(' Oni2.net image: {}'.format(link))
80	oni2_images = oni2_images + 1
81	else:
82	pywikibot.stdout(' External image: {}'.format(link))
83	ext_images = ext_images + 1
84	#else:
85	#pywikibot.stdout(' Other external link: {}'.format(link))
86
87	def main(*args):
88	global pages_checked
89	global page_errors
90	global ext_images
91	global oni2_images
92	global tag_names
93
94	cat_name = ''
95	page_name = ''
96
97	#pywikibot.stdout('The members of the bs4.element.Tag class are:')
98	#pywikibot.stdout(format(dir(bs4.element.Tag)))
99
100	local_args = pywikibot.handle_args(args)
101	genFactory = pagegenerators.GeneratorFactory()
102
103	for arg in local_args:
104	if arg.startswith('-cat:'):
105	cat_name = arg[5:]
106	elif arg.startswith('-page:'):
107	page_name = arg[6:]
108	elif arg == '-linked':
109	tag_names += ['a']
110	elif arg == '-inlined':
111	tag_names += ['img']
112	else:
113	pywikibot.stdout('Unknown argument "{}".'.format(arg))
114	return
115
116	if not tag_names:
117	pywikibot.stdout('You need to pass this script either "-linked", "-inlined", or both arguments in order to specify which image links you want to find.')
118	return
119
120	site = pywikibot.Site()
121	if cat_name != '':
122	cat_obj = pywikibot.Category(site, cat_name)
123	generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
124	for page in pagegenerators.PreloadingGenerator(generator, 100):
125	pywikibot.stdout('Checking page "{}"'.format(page.title()))
126	page_url = page.full_url().replace("%2F", "/")
127	get_image_links(page_url)
128	elif page_name != '':
129	page = pywikibot.Page(site, page_name)
130	pywikibot.stdout('Checking page "{}"'.format(page.title()))
131	page_url = page.full_url().replace("%2F", "/")
132	get_image_links(page_url)
133	else:
134	pywikibot.stdout('No page name or category name received.'.format(arg))
135	return
136
137	chk_page_str = "pages"
138	if pages_checked == 1:
139	chk_page_str = "page"
140
141	err_page_str = "pages"
142	if page_errors == 1:
143	err_page_str = "page"
144
145	ext_image_str = "images"
146	if ext_images == 1:
147	ext_image_str = "image"
148
149	oni2_image_str = "images"
150	if oni2_images == 1:
151	oni2_image_str = "image"
152
153	pywikibot.stdout('-------------------------')
154	pywikibot.stdout('Checked {0} {1} and failed to check {2} {3}.'.format(pages_checked, chk_page_str, page_errors, err_page_str))
155	pywikibot.stdout('Found {0} {1} from oni2.net and {2} {3} from other domains.'.format(oni2_images, oni2_image_str, ext_images, ext_image_str))
156
157	if __name__ == '__main__':
158	main()