root/Oni2/ValBot/Python/find_external_images.py

Comparing ValBot/Python/find_external_images.py (file contents):
Revision 1173 by iritscen, Tue Jun 28 22:06:29 2022 UTC vs.
Revision 1181 by iritscen, Fri Apr 28 00:55:00 2023 UTC

# Line 1 | Line 1
1   # Find External Images
2   # by iritscen@yahoo.com
3   # Looks at each link on a page (or in all the pages in a category) and prints the links to
4 < # images that are externally-hosted. You must pass in one or both of the following args:
5 < # -inlined: Show any plain URLs leading to images (these create embedded images, <img>)
6 < # -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
4 > # images that are external to the wiki. Distinction is made between images hosted on oni2.net
5 > # and on third-party domains. You must pass in one or both of the following args:
6 > #  -embedded: Show any plain URLs leading to images (these create embedded images, <img>)
7 > #  -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
8 > #
9   # Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
10   #
11   # Recommended viewing width:
# Line 14 | Line 16 | import os
16   from urllib.parse import urljoin
17  
18   import pywikibot
17
19   from pywikibot.bot import QuitKeyboardInterrupt
20   from pywikibot import pagegenerators
21   from pywikibot.comms.http import fetch
22   from pywikibot.specialbots import UploadRobot
23 < #import bs4 # for listing members with dir()
23 >
24 > import bs4
25   from bs4 import BeautifulSoup
26  
27 + # Initialize globals
28 + debug = 0
29   pages_checked = 0
30   page_errors = 0
31 < ext_images = 0
32 < oni2_images = 0
31 > image_errors = 0
32 > linked_ext_images = 0
33 > linked_oni2_images = 0
34 > embedded_ext_images = 0
35 > embedded_oni2_images = 0
36   file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')
37   tag_names = []
38  
39 < # Scrapes the HTML at the given URL for image tags
40 < def get_image_links(url):
41 <    global pages_checked
42 <    global page_errors
43 <    global ext_images
44 <    global oni2_images
38 <    global file_formats
39 <    global tag_names
40 <
41 <    response = fetch(url)
42 <    if response.status_code != 200:
43 <        pywikibot.stdout('   ERROR: Could not load page at URL "{}"'.format(url))
44 <        page_errors = page_errors + 1
45 <        return
46 <
47 <    soup = BeautifulSoup(response.text, 'html.parser')
48 <    pages_checked = pages_checked + 1
49 <    for tag in soup.findAll(tag_names):
50 <        link = tag.get('href')
51 <        if not link:
52 <            link = tag.get('src')
53 <
54 <        # Filter out empty links
55 <        if not link:
56 <            if tag.get('id') == "top":
57 <                continue
58 <
59 <            class_names = tag.get('class')
60 <            if "selflink" in class_names:
61 <                continue
39 > # Pass this function a singular noun and it will add an 's' to "noun" if "quantity" is not 1
40 > def plural_check(noun, quantity):
41 >   if quantity != 1:
42 >      return noun + "s"
43 >   else:
44 >      return noun
45  
46 <            pywikibot.stdout('   Could not process mystery link {}'.format(tag.get_text))
47 <            pywikibot.stdout('   Class is "{}".'.format(tag.get('class')))
48 <            continue
49 <
50 <        # A "src" or "href" starting with "/" would be a link to a local page or file; a
51 <        # link starting with "#" is a section link
52 <        if link.startswith('/') or link.startswith('#'):
46 > # Scrapes the HTML at the given URL for image tags
47 > def get_image_links(page_url, page_name):
48 >   global debug
49 >   global pages_checked
50 >   global page_errors
51 >   global image_errors
52 >   global linked_ext_images
53 >   global linked_oni2_images
54 >   global embedded_ext_images
55 >   global embedded_oni2_images
56 >   global file_formats
57 >   global tag_names
58 >   name_printed = 0
59 >
60 >   response = fetch(page_url)
61 >   if response.status_code != 200:
62 >      pywikibot.stdout('   ERROR: Could not load page at URL "{}".'.format(page_url))
63 >      page_errors += 1
64 >      return
65 >
66 >   soup = BeautifulSoup(response.text, 'html.parser')
67 >   pages_checked += 1
68 >   for tag in soup.findAll(tag_names):
69 >      link = tag.get('href')
70 >      if not link:
71 >         link = tag.get('src')
72 >
73 >      # Filter out empty links
74 >      if not link:
75 >         if tag.get('id') == "top":
76              continue
77  
78 <        # The gnu.org link to the Free Documentation License is at the bottom of every page
79 <        if link == "http://www.gnu.org/copyleft/fdl.html":
78 >         class_names = tag.get('class')
79 >         if "selflink" in class_names:
80              continue
81  
82 <        _, ext = os.path.splitext(link)
83 <        if ext.lower() in file_formats:
84 <            if "oni2.net" in link:
85 <                pywikibot.stdout('   Oni2.net image: {}'.format(link))
86 <                oni2_images = oni2_images + 1
82 >         if not name_printed and not debug:
83 >            pywikibot.stdout('From page "{}":'.format(page_name))
84 >            name_printed = 1
85 >         pywikibot.stdout('   ERROR: Could not process mystery link {}.'.format(tag.get_text))
86 >         pywikibot.stdout('   Class is "{}".'.format(tag.get('class')))
87 >         page_errors += 1
88 >         continue
89 >
90 >      # A "src" or "href" starting with "/" would be a link to a local page or file; a
91 >      # link starting with "#" is a section link
92 >      if link.startswith('/') or link.startswith('#'):
93 >         continue
94 >
95 >      # The gnu.org link to the Free Documentation License is at the bottom of every page
96 >      if link == "http://www.gnu.org/copyleft/fdl.html":
97 >         continue
98 >
99 >      # Determine if link is to an image
100 >      _, ext = os.path.splitext(link)
101 >      if ext.lower() in file_formats:
102 >         if not name_printed and not debug:
103 >            pywikibot.stdout('Found on page "{}":'.format(page_name))
104 >            name_printed = 1
105 >         tag_text = format(tag)
106 >         if "oni2.net" in link:
107 >            if tag_text.startswith('<a'):
108 >               pywikibot.stdout('   Linked oni2.net image: {}'.format(link))
109 >               linked_oni2_images += 1
110 >            elif tag_text.startswith('<img'):
111 >               pywikibot.stdout('   Embedded oni2.net image: {}'.format(link))
112 >               embedded_oni2_images += 1
113 >            else:
114 >               pywikibot.stdout('   ERROR: Could not process oni2.net image link {}.'.format(link))
115 >               image_errors += 1
116 >               return
117 >         else:
118 >            if tag_text.startswith('<a'):
119 >               pywikibot.stdout('   Linked external image: {}'.format(link))
120 >               linked_ext_images += 1
121 >            elif tag_text.startswith('<img'):
122 >               pywikibot.stdout('   Embedded external image: {}'.format(link))
123 >               embedded_ext_images += 1
124              else:
125 <                pywikibot.stdout('   External image: {}'.format(link))
126 <                ext_images = ext_images + 1
127 <        #else:
85 <           #pywikibot.stdout('   Other external link: {}'.format(link))
125 >               pywikibot.stdout('   ERROR: Could not process external image link {}.'.format(link))
126 >               image_errors += 1
127 >               return
128  
129   def main(*args):
130 <    global pages_checked
131 <    global page_errors
132 <    global ext_images
133 <    global oni2_images
134 <    global tag_names
135 <
136 <    cat_name = ''
137 <    page_name = ''
138 <
139 <    #pywikibot.stdout('The members of the bs4.element.Tag class are:')
140 <    #pywikibot.stdout(format(dir(bs4.element.Tag)))
141 <
142 <    local_args = pywikibot.handle_args(args)
143 <    genFactory = pagegenerators.GeneratorFactory()
144 <
145 <    for arg in local_args:
146 <        if arg.startswith('-cat:'):
147 <            cat_name = arg[5:]
148 <        elif arg.startswith('-page:'):
149 <            page_name = arg[6:]
150 <        elif arg == '-linked':
151 <            tag_names += ['a']
152 <        elif arg == '-inlined':
153 <            tag_names += ['img']
154 <        else:
155 <            pywikibot.stdout('Unknown argument "{}".'.format(arg))
156 <            return
157 <
158 <    if not tag_names:
159 <        pywikibot.stdout('You need to pass this script either "-linked", "-inlined", or both arguments in order to specify which image links you want to find.')
160 <        return
161 <
162 <    site = pywikibot.Site()
163 <    if cat_name != '':
164 <        cat_obj = pywikibot.Category(site, cat_name)
165 <        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
166 <        for page in pagegenerators.PreloadingGenerator(generator, 100):
167 <            pywikibot.stdout('Checking page "{}"'.format(page.title()))
168 <            page_url = page.full_url().replace("%2F", "/")
169 <            get_image_links(page_url)
170 <    elif page_name != '':
171 <        page = pywikibot.Page(site, page_name)
172 <        pywikibot.stdout('Checking page "{}"'.format(page.title()))
173 <        page_url = page.full_url().replace("%2F", "/")
174 <        get_image_links(page_url)
175 <    else:
176 <        pywikibot.stdout('No page name or category name received.'.format(arg))
177 <        return
178 <
179 <    chk_page_str = "pages"
180 <    if pages_checked == 1:
181 <        chk_page_str = "page"
182 <
183 <    err_page_str = "pages"
184 <    if page_errors == 1:
185 <        err_page_str = "page"
186 <
187 <    ext_image_str = "images"
188 <    if ext_images == 1:
189 <        ext_image_str = "image"
190 <
191 <    oni2_image_str = "images"
192 <    if oni2_images == 1:
193 <        oni2_image_str = "image"
194 <
195 <    pywikibot.stdout('-------------------------')
196 <    pywikibot.stdout('Checked {0} {1} and failed to check {2} {3}.'.format(pages_checked, chk_page_str, page_errors, err_page_str))
197 <    pywikibot.stdout('Found {0} {1} from oni2.net and {2} {3} from other domains.'.format(oni2_images, oni2_image_str, ext_images, ext_image_str))
130 >   global debug
131 >   global pages_checked
132 >   global page_errors
133 >   global image_errors
134 >   global linked_ext_images
135 >   global linked_oni2_images
136 >   global embedded_ext_images
137 >   global embedded_oni2_images
138 >   global tag_names
139 >
140 >   search_cat = ''
141 >   search_page = ''
142 >
143 >   #pywikibot.stdout('The members of the bs4.element.Tag class are:')
144 >   #pywikibot.stdout(format(dir(bs4.element.Tag)))
145 >
146 >   local_args = pywikibot.handle_args(args)
147 >   genFactory = pagegenerators.GeneratorFactory()
148 >
149 >   for arg in local_args:
150 >      if arg.startswith('-cat:'):
151 >         search_cat = arg[5:]
152 >      elif arg.startswith('-page:'):
153 >         search_page = arg[6:]
154 >      elif arg == '-linked':
155 >         tag_names += ['a']
156 >      elif arg == '-embedded':
157 >         tag_names += ['img']
158 >      elif arg == '-dbg':
159 >         debug = 1
160 >      else:
161 >         pywikibot.stdout('Unknown argument "{}".'.format(arg))
162 >         return
163 >
164 >   if not tag_names:
165 >      pywikibot.stdout('You need to pass this script either "-linked", "-embedded", or both arguments in order to specify which image links you want to find.')
166 >      return
167 >
168 >   site = pywikibot.Site()
169 >   if search_cat != '':
170 >      cat_obj = pywikibot.Category(site, search_cat)
171 >      generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
172 >      for page in pagegenerators.PreloadingGenerator(generator, 100):
173 >         if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
174 >         page_url = page.full_url().replace("%2F", "/")
175 >         get_image_links(page_url, page.title())
176 >   elif search_page != '':
177 >      page = pywikibot.Page(site, search_page)
178 >      if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
179 >      page_url = page.full_url().replace("%2F", "/")
180 >      get_image_links(page_url, page.title())
181 >   else:
182 >      pywikibot.stdout('No page name or category name received.'.format(arg))
183 >      return
184 >
185 >   chk_page_str = plural_check("page", pages_checked)
186 >   err_page_str = plural_check("page", page_errors)
187 >   err_img_str = plural_check("image", image_errors)
188 >   linked_ext_image_str = plural_check("image", linked_ext_images)
189 >   linked_oni2_image_str = plural_check("image", linked_oni2_images)
190 >   embedded_ext_image_str = plural_check("image", embedded_ext_images)
191 >   embedded_oni2_image_str = plural_check("image", embedded_oni2_images)
192 >
193 >   pywikibot.stdout('-------------------------')
194 >   pywikibot.stdout('Checked {0} {1} and failed to process {2} {3} on them. Failed to check {4} {5}.'.format(pages_checked, chk_page_str, image_errors, err_img_str, page_errors, err_page_str))
195 >   if 'a' in tag_names:
196 >      pywikibot.stdout('Found {0} linked {1} from oni2.net and {2} linked {3} from other domains.'.format(linked_oni2_images, linked_oni2_image_str, linked_ext_images, linked_ext_image_str))
197 >   if 'img' in tag_names:
198 >      pywikibot.stdout('Found {0} embedded {1} from oni2.net and {2} embedded {3} from other domains.'.format(embedded_oni2_images, embedded_oni2_image_str, embedded_ext_images, embedded_ext_image_str))
199  
200   if __name__ == '__main__':
201 <    main()
201 >   main()

Diff Legend

- Removed lines
+ Added lines
< Changed lines (old)
> Changed lines (new)
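
For anyone skimming the diff, the heart of the revision 1181 change is the classification step added to get_image_links(): a matching tag whose string form starts with "<a" is counted as a linked image, one starting with "<img" as an embedded image, and oni2.net hosts are tallied separately from other domains. The following is a minimal sketch of that logic outside pywikibot, searching both <a> and <img> tags as if both -linked and -embedded had been passed; the HTML fragment and URLs are made-up examples, not wiki content, and the only assumption is a Python 3 environment with BeautifulSoup installed.

import os
from bs4 import BeautifulSoup

file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')

# Hypothetical page fragment standing in for the HTML that fetch() would return
html = ('<a href="http://oni2.net/screens/shot.png">a linked image</a>'
        '<img src="http://example.com/pic.jpg"/>')

for tag in BeautifulSoup(html, 'html.parser').find_all(['a', 'img']):
   link = tag.get('href') or tag.get('src')  # <a> carries href, <img> carries src
   if not link:
      continue  # skip empty links, as the script does
   _, ext = os.path.splitext(link)
   if ext.lower() not in file_formats:
      continue  # only image file types are reported
   kind = 'Linked' if format(tag).startswith('<a') else 'Embedded'
   host = 'oni2.net' if 'oni2.net' in link else 'external'
   print('   {} {} image: {}'.format(kind, host, link))

Run as-is, the sketch prints one "Linked oni2.net image" line and one "Embedded external image" line, mirroring the four counters (linked/embedded, oni2.net/external) that main() now sums up in its closing report.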