--- ValBot/Python/find_external_images.py	2022/02/21 23:59:20	1169
+++ ValBot/Python/find_external_images.py	2023/04/28 00:55:00	1181
@@ -1,89 +1,201 @@
+# Find External Images
+# by iritscen@yahoo.com
+# Looks at each link on a page (or in all the pages in a category) and prints the links to
+# images that are external to the wiki. Distinction is made between images hosted on oni2.net
+# and on third-party domains. You must pass in one or both of the following args:
+#  -embedded: Show any plain URLs leading to images (these create embedded images, <img>)
+#  -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
+#
+# Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
+# Add -dbg to print the name of every page as it is checked, not just those with findings.
+# Recommended viewing width:
+# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|
+
 import os
 
 from urllib.parse import urljoin
 
 import pywikibot
-
 from pywikibot.bot import QuitKeyboardInterrupt
 from pywikibot import pagegenerators
 from pywikibot.comms.http import fetch
 from pywikibot.specialbots import UploadRobot
+
+import bs4
 from bs4 import BeautifulSoup
 
-first_run = False
+# Settings and running tallies shared by get_image_links() and main()
+debug = 0 # set to 1 by the -dbg argument
 pages_checked = 0
-oni2_images = 0
+page_errors = 0 # pages that failed to load, plus links on a page that had no URL
+image_errors = 0 # image links that were neither an <a> nor an <img> tag
+linked_ext_images = 0
+linked_oni2_images = 0
+embedded_ext_images = 0
+embedded_oni2_images = 0
 file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')
+tag_names = [] # tags to search for: 'a' is added by -linked, 'img' by -embedded
+
+# Returns "noun" as-is when "quantity" is exactly 1, and with an 's' appended for any other
+# quantity (including 0), so that the counts printed in the summary read naturally
+def plural_check(noun, quantity):
+   if quantity == 1:
+      return noun
+   return noun + "s"
 
 # Scrapes the HTML at the given URL for image tags
-def get_image_links(url, shown):
-    links = []
-    global oni2_images
-    global pages_checked
-
-    response = fetch(url)
-    if response.status_code != 200:
-        pywikibot.output('Skipping url: {}'.format(url))
-        return links
-
-    soup = BeautifulSoup(response.text, 'html.parser')
-    pages_checked = pages_checked + 1
-    if not shown:
-        tagname = 'a'
-    elif shown == 'just':
-        tagname = 'img'
-    else:
-        tagname = ['a', 'img']
-    #pywikibot.output('Looking at tags.')
-    for tag in soup.findAll(tagname):
-        link = tag.get('src', tag.get('href', None))
-        if not link:
-            #pywikibot.output('It is not a link.')
+def get_image_links(page_url, page_name):
+   # page_url: address of the rendered page, which is downloaded and parsed as HTML
+   # page_name: title printed once as a heading above this page's output (unless debugging)
+   # Returns nothing; results are printed and tallied in the global counters named below
+   global pages_checked
+   global page_errors
+   global image_errors
+   global linked_ext_images
+   global linked_oni2_images
+   global embedded_ext_images
+   global embedded_oni2_images
+   name_printed = 0
+
+   response = fetch(page_url)
+   if response.status_code != 200:
+      pywikibot.stdout('   ERROR: Could not load page at URL "{}".'.format(page_url))
+      page_errors += 1
+      return
+
+   soup = BeautifulSoup(response.text, 'html.parser')
+   pages_checked += 1
+   for tag in soup.findAll(tag_names):
+      link = tag.get('href')
+      if not link:
+         link = tag.get('src')
+
+      # Filter out empty links
+      if not link:
+         if tag.get('id') == "top":
+            continue
+
+         class_names = tag.get('class')
+         if class_names and "selflink" in class_names: # "class" is None when absent
             continue
-        #pywikibot.output('Got link {0}.'.format(link))
-        _, ext = os.path.splitext(link)
-        if ext.lower() in file_formats:
-            pywikibot.output('Found image link {0}.'.format(ext))
-            if "oni2.net" in link:
-                pywikibot.stdout('Found an oni2.net image: {0}'.format(link))
-                oni2_images = oni2_images + 1
-    return links
 
+         if not name_printed and not debug:
+            pywikibot.stdout('From page "{}":'.format(page_name))
+            name_printed = 1
+         pywikibot.stdout('   ERROR: Could not process mystery link {}.'.format(tag.get_text()))
+         pywikibot.stdout('   Class is "{}".'.format(tag.get('class')))
+         page_errors += 1
+         continue
+
+      # A "src" or "href" starting with "/" would be a link to a local page or file; a
+      # link starting with "#" is a section link
+      if link.startswith('/') or link.startswith('#'):
+         continue
+
+      # The gnu.org link to the Free Documentation License is at the bottom of every page
+      if link == "http://www.gnu.org/copyleft/fdl.html":
+         continue
+
+      # Determine if link is to an image
+      _, ext = os.path.splitext(link)
+      if ext.lower() in file_formats:
+         if not name_printed and not debug:
+            pywikibot.stdout('Found on page "{}":'.format(page_name))
+            name_printed = 1
+         tag_text = format(tag)
+         if "oni2.net" in link:
+            if tag_text.startswith('<a'):
+               pywikibot.stdout('   Linked oni2.net image: {}'.format(link))
+               linked_oni2_images += 1
+            elif tag_text.startswith('<img'):
+               pywikibot.stdout('   Embedded oni2.net image: {}'.format(link))
+               embedded_oni2_images += 1
+            else:
+               pywikibot.stdout('   ERROR: Could not process oni2.net image link {}.'.format(link))
+               image_errors += 1
+               return
+         else:
+            if tag_text.startswith('<a'):
+               pywikibot.stdout('   Linked external image: {}'.format(link))
+               linked_ext_images += 1
+            elif tag_text.startswith('<img'):
+               pywikibot.stdout('   Embedded external image: {}'.format(link))
+               embedded_ext_images += 1
+            else:
+               pywikibot.stdout('   ERROR: Could not process external image link {}.'.format(link))
+               image_errors += 1
+               return
 
 def main(*args):
-    cat = ''
-    url = ''
-    image_url = False
-    shown = False
-    desc = []
-
-    local_args = pywikibot.handle_args(args)
-    genFactory = pagegenerators.GeneratorFactory()
-
-    for arg in local_args:
-        if arg.startswith('-cat:'):
-            cat = arg[5:]
-        elif arg == '-shown':
-            shown = True
-        elif arg == '-justshown':
-            shown = 'just'
-        elif url == '':
-            url = arg
-        else:
-            desc += [arg]
-    desc = ' '.join(desc)
-
-    site = pywikibot.Site()
-    cat_obj = pywikibot.Category(site, cat)
-    generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
-    for page in pagegenerators.PreloadingGenerator(generator, 100):
-        pywikibot.stdout('Checking page {0}'.format(page.title()))
-        page_url = page.full_url().replace("%2F", "/")
-        get_image_links(page_url, shown)
-
-    global pages_checked
-    global oni2_images
-    pywikibot.stdout('Checked {0} page(s) and found {1} image(s) from oni2.net.'.format(pages_checked, oni2_images))
+   # Entry point. Interprets the arguments returned by pywikibot.handle_args(), scans the
+   # requested category or page with get_image_links(), then prints a summary of the tallies.
+   #  -cat:"Name" or -page:"Name": what to search
+   #  -linked and/or -embedded: scan <a> tags and/or <img> tags
+   #  -dbg: print the name of each page as it is checked
+   # Returns nothing; all results are printed with pywikibot.stdout().
+   # Only "debug" and "tag_names" are assigned here, so only they need a "global" statement
+   global debug
+   global tag_names
+
+   search_cat = ''
+   search_page = ''
+
+   local_args = pywikibot.handle_args(args)
+
+   # Anything that is not one of our own arguments is treated as a fatal mistake
+   for arg in local_args:
+      if arg.startswith('-cat:'):
+         search_cat = arg[5:]
+      elif arg.startswith('-page:'):
+         search_page = arg[6:]
+      elif arg == '-linked':
+         tag_names += ['a']
+      elif arg == '-embedded':
+         tag_names += ['img']
+      elif arg == '-dbg':
+         debug = 1
+      else:
+         pywikibot.stdout('Unknown argument "{}".'.format(arg))
+         return
+
+   if not tag_names:
+      pywikibot.stdout('You need to pass this script either "-linked", "-embedded", or both arguments in order to specify which image links you want to find.')
+      return
+
+   # A category takes precedence over a single page if both were specified
+   site = pywikibot.Site()
+   if search_cat != '':
+      cat_obj = pywikibot.Category(site, search_cat)
+      generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
+      for page in pagegenerators.PreloadingGenerator(generator, 100):
+         if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
+         page_url = page.full_url().replace("%2F", "/")
+         get_image_links(page_url, page.title())
+   elif search_page != '':
+      page = pywikibot.Page(site, search_page)
+      if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
+      page_url = page.full_url().replace("%2F", "/")
+      get_image_links(page_url, page.title())
+   else:
+      pywikibot.stdout('No page name or category name received.')
+      return
+
+   # Pick the singular or plural noun to go with each count in the summary
+   chk_page_str = plural_check("page", pages_checked)
+   err_page_str = plural_check("page", page_errors)
+   err_img_str = plural_check("image", image_errors)
+   linked_ext_image_str = plural_check("image", linked_ext_images)
+   linked_oni2_image_str = plural_check("image", linked_oni2_images)
+   embedded_ext_image_str = plural_check("image", embedded_ext_images)
+   embedded_oni2_image_str = plural_check("image", embedded_oni2_images)
+
+   pywikibot.stdout('-------------------------')
+   pywikibot.stdout('Checked {0} {1} and failed to process {2} {3} on them. Failed to check {4} {5}.'.format(pages_checked, chk_page_str, image_errors, err_img_str, page_errors, err_page_str))
+   # Only report on the kind(s) of image link that the user asked to see
+   if 'a' in tag_names:
+      pywikibot.stdout('Found {0} linked {1} from oni2.net and {2} linked {3} from other domains.'.format(linked_oni2_images, linked_oni2_image_str, linked_ext_images, linked_ext_image_str))
+   if 'img' in tag_names:
+      pywikibot.stdout('Found {0} embedded {1} from oni2.net and {2} embedded {3} from other domains.'.format(embedded_oni2_images, embedded_oni2_image_str, embedded_ext_images, embedded_ext_image_str))
 
 if __name__ == '__main__':
-    main()
+   main() # start the search only when this file is executed directly as a script