--- ValBot/Python/find_external_images.py 2022/06/28 22:06:29 1173
+++ ValBot/Python/find_external_images.py 2023/04/28 00:55:00 1181
@@ -1,9 +1,11 @@
# Find External Images
# by iritscen@yahoo.com
# Looks at each link on a page (or in all the pages in a category) and prints the links to
-# images that are externally-hosted. You must pass in one or both of the following args:
-#   -inlined:  Show any plain URLs leading to images (these create embedded images, )
-# -linked: Show any external URLs ("[URL]") leading to images (these create links, )
+# images that are external to the wiki. Distinction is made between images hosted on oni2.net
+# and on third-party domains. You must pass in one or both of the following args:
+#   -embedded: Show any plain URLs leading to images (these create embedded images, )
+# -linked: Show any external URLs ("[URL]") leading to images (these create links, )
+#
# Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
#
# Recommended viewing width:
@@ -14,145 +16,186 @@ import os
from urllib.parse import urljoin
import pywikibot
-
from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot import pagegenerators
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot
-#import bs4 # for listing members with dir()
+
+import bs4
from bs4 import BeautifulSoup
+# Initialize globals
+debug = 0
pages_checked = 0
page_errors = 0
-ext_images = 0
-oni2_images = 0
+image_errors = 0
+linked_ext_images = 0
+linked_oni2_images = 0
+embedded_ext_images = 0
+embedded_oni2_images = 0
file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')
tag_names = []
-# Scrapes the HTML at the given URL for image tags
-def get_image_links(url):
- global pages_checked
- global page_errors
- global ext_images
- global oni2_images
- global file_formats
- global tag_names
-
- response = fetch(url)
- if response.status_code != 200:
- pywikibot.stdout(' ERROR: Could not load page at URL "{}"'.format(url))
- page_errors = page_errors + 1
- return
-
- soup = BeautifulSoup(response.text, 'html.parser')
- pages_checked = pages_checked + 1
- for tag in soup.findAll(tag_names):
- link = tag.get('href')
- if not link:
- link = tag.get('src')
-
- # Filter out empty links
- if not link:
- if tag.get('id') == "top":
- continue
-
- class_names = tag.get('class')
- if "selflink" in class_names:
- continue
+# Pass this function a singular noun and it will add an 's' to "noun" if "quantity" is not 1
+def plural_check(noun, quantity):
+ if quantity != 1:
+ return noun + "s"
+ else:
+ return noun
- pywikibot.stdout(' Could not process mystery link {}'.format(tag.get_text))
- pywikibot.stdout(' Class is "{}".'.format(tag.get('class')))
- continue
-
- # A "src" or "href" starting with "/" would be a link to a local page or file; a
- # link starting with "#" is a section link
- if link.startswith('/') or link.startswith('#'):
+# Scrapes the HTML at the given URL for image tags
+def get_image_links(page_url, page_name):
+ global debug
+ global pages_checked
+ global page_errors
+ global image_errors
+ global linked_ext_images
+ global linked_oni2_images
+ global embedded_ext_images
+ global embedded_oni2_images
+ global file_formats
+ global tag_names
+ name_printed = 0
+
+ response = fetch(page_url)
+ if response.status_code != 200:
+ pywikibot.stdout(' ERROR: Could not load page at URL "{}".'.format(page_url))
+ page_errors += 1
+ return
+
+ soup = BeautifulSoup(response.text, 'html.parser')
+ pages_checked += 1
+ for tag in soup.findAll(tag_names):
+ link = tag.get('href')
+ if not link:
+ link = tag.get('src')
+
+ # Filter out empty links
+ if not link:
+ if tag.get('id') == "top":
continue
- # The gnu.org link to the Free Documentation License is at the bottom of every page
- if link == "http://www.gnu.org/copyleft/fdl.html":
+ class_names = tag.get('class')
+ if "selflink" in class_names:
continue
- _, ext = os.path.splitext(link)
- if ext.lower() in file_formats:
- if "oni2.net" in link:
- pywikibot.stdout(' Oni2.net image: {}'.format(link))
- oni2_images = oni2_images + 1
+ if not name_printed and not debug:
+ pywikibot.stdout('From page "{}":'.format(page_name))
+ name_printed = 1
+ pywikibot.stdout(' ERROR: Could not process mystery link {}.'.format(tag.get_text))
+ pywikibot.stdout(' Class is "{}".'.format(tag.get('class')))
+ page_errors += 1
+ continue
+
+ # A "src" or "href" starting with "/" would be a link to a local page or file; a
+ # link starting with "#" is a section link
+ if link.startswith('/') or link.startswith('#'):
+ continue
+
+ # The gnu.org link to the Free Documentation License is at the bottom of every page
+ if link == "http://www.gnu.org/copyleft/fdl.html":
+ continue
+
+ # Determine if link is to an image
+ _, ext = os.path.splitext(link)
+ if ext.lower() in file_formats:
+ if not name_printed and not debug:
+ pywikibot.stdout('Found on page "{}":'.format(page_name))
+ name_printed = 1
+ tag_text = format(tag)
+ if "oni2.net" in link:
+ if tag_text.startswith('