root/Oni2/ValBot/Python/find_external_images.py

Comparing ValBot/Python/find_external_images.py (file contents):
Revision 1173 by iritscen, Tue Jun 28 22:06:29 2022 UTC vs.
Revision 1181 by iritscen, Fri Apr 28 00:55:00 2023 UTC

# Line 1 | Line 1
1   # Find External Images
2   # by iritscen@yahoo.com
3   # Looks at each link on a page (or in all the pages in a category) and prints the links to
4 < # images that are externally-hosted. You must pass in one or both of the following args:
5 < # -inlined: Show any plain URLs leading to images (these create embedded images, <img>)
6 < # -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
4 > # images that are external to the wiki. Distinction is made between images hosted on oni2.net
5 > # and on third-party domains. You must pass in one or both of the following args:
6 > #  -embedded: Show any plain URLs leading to images (these create embedded images, <img>)
7 > #  -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
8 > #
9   # Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
10   #
11   # Recommended viewing width:
# Line 14 | Line 16 | import os
16   from urllib.parse import urljoin
17  
18   import pywikibot
17
19   from pywikibot.bot import QuitKeyboardInterrupt
20   from pywikibot import pagegenerators
21   from pywikibot.comms.http import fetch
22   from pywikibot.specialbots import UploadRobot
23 < #import bs4 # for listing members with dir()
23 >
24 > import bs4
25   from bs4 import BeautifulSoup
26  
27 + # Initialize globals
28 + debug = 0
29   pages_checked = 0
30   page_errors = 0
31 < ext_images = 0
32 < oni2_images = 0
31 > image_errors = 0
32 > linked_ext_images = 0
33 > linked_oni2_images = 0
34 > embedded_ext_images = 0
35 > embedded_oni2_images = 0
36   file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')
37   tag_names = []
38  
39 < # Scrapes the HTML at the given URL for image tags
40 < def get_image_links(url):
41 <    global pages_checked
42 <    global page_errors
43 <    global ext_images
44 <    global oni2_images
38 <    global file_formats
39 <    global tag_names
40 <
41 <    response = fetch(url)
42 <    if response.status_code != 200:
43 <        pywikibot.stdout('   ERROR: Could not load page at URL "{}"'.format(url))
44 <        page_errors = page_errors + 1
45 <        return
46 <
47 <    soup = BeautifulSoup(response.text, 'html.parser')
48 <    pages_checked = pages_checked + 1
49 <    for tag in soup.findAll(tag_names):
50 <        link = tag.get('href')
51 <        if not link:
52 <            link = tag.get('src')
53 <
54 <        # Filter out empty links
55 <        if not link:
56 <            if tag.get('id') == "top":
57 <                continue
58 <
59 <            class_names = tag.get('class')
60 <            if "selflink" in class_names:
61 <                continue
39 > # Pass this function a singular noun and it will add an 's' to "noun" if "quantity" is not 1
40 > def plural_check(noun, quantity):
41 >   if quantity != 1:
42 >      return noun + "s"
43 >   else:
44 >      return noun
45  
46 <            pywikibot.stdout('   Could not process mystery link {}'.format(tag.get_text))
47 <            pywikibot.stdout('   Class is "{}".'.format(tag.get('class')))
48 <            continue
49 <
50 <        # A "src" or "href" starting with "/" would be a link to a local page or file; a
51 <        # link starting with "#" is a section link
52 <        if link.startswith('/') or link.startswith('#'):
46 > # Scrapes the HTML at the given URL for image tags
47 > def get_image_links(page_url, page_name):
48 >   global debug
49 >   global pages_checked
50 >   global page_errors
51 >   global image_errors
52 >   global linked_ext_images
53 >   global linked_oni2_images
54 >   global embedded_ext_images
55 >   global embedded_oni2_images
56 >   global file_formats
57 >   global tag_names
58 >   name_printed = 0
59 >
60 >   response = fetch(page_url)
61 >   if response.status_code != 200:
62 >      pywikibot.stdout('   ERROR: Could not load page at URL "{}".'.format(page_url))
63 >      page_errors += 1
64 >      return
65 >
66 >   soup = BeautifulSoup(response.text, 'html.parser')
67 >   pages_checked += 1
68 >   for tag in soup.findAll(tag_names):
69 >      link = tag.get('href')
70 >      if not link:
71 >         link = tag.get('src')
72 >
73 >      # Filter out empty links
74 >      if not link:
75 >         if tag.get('id') == "top":
76              continue
77  
78 <        # The gnu.org link to the Free Documentation License is at the bottom of every page
79 <        if link == "http://www.gnu.org/copyleft/fdl.html":
78 >         class_names = tag.get('class')
79 >         if "selflink" in class_names:
80              continue
81  
82 <        _, ext = os.path.splitext(link)
83 <        if ext.lower() in file_formats:
84 <            if "oni2.net" in link:
85 <                pywikibot.stdout('   Oni2.net image: {}'.format(link))
86 <                oni2_images = oni2_images + 1
82 >         if not name_printed and not debug:
83 >            pywikibot.stdout('From page "{}":'.format(page_name))
84 >            name_printed = 1
85 >         pywikibot.stdout('   ERROR: Could not process mystery link {}.'.format(tag.get_text))
86 >         pywikibot.stdout('   Class is "{}".'.format(tag.get('class')))
87 >         page_errors += 1
88 >         continue
89 >
90 >      # A "src" or "href" starting with "/" would be a link to a local page or file; a
91 >      # link starting with "#" is a section link
92 >      if link.startswith('/') or link.startswith('#'):
93 >         continue
94 >
95 >      # The gnu.org link to the Free Documentation License is at the bottom of every page
96 >      if link == "http://www.gnu.org/copyleft/fdl.html":
97 >         continue
98 >
99 >      # Determine if link is to an image
100 >      _, ext = os.path.splitext(link)
101 >      if ext.lower() in file_formats:
102 >         if not name_printed and not debug:
103 >            pywikibot.stdout('Found on page "{}":'.format(page_name))
104 >            name_printed = 1
105 >         tag_text = format(tag)
106 >         if "oni2.net" in link:
107 >            if tag_text.startswith('<a'):
108 >               pywikibot.stdout('   Linked oni2.net image: {}'.format(link))
109 >               linked_oni2_images += 1
110 >            elif tag_text.startswith('<img'):
111 >               pywikibot.stdout('   Embedded oni2.net image: {}'.format(link))
112 >               embedded_oni2_images += 1
113 >            else:
114 >               pywikibot.stdout('   ERROR: Could not process oni2.net image link {}.'.format(link))
115 >               image_errors += 1
116 >               return
117 >         else:
118 >            if tag_text.startswith('<a'):
119 >               pywikibot.stdout('   Linked external image: {}'.format(link))
120 >               linked_ext_images += 1
121 >            elif tag_text.startswith('<img'):
122 >               pywikibot.stdout('   Embedded external image: {}'.format(link))
123 >               embedded_ext_images += 1
124              else:
125 <                pywikibot.stdout('   External image: {}'.format(link))
126 <                ext_images = ext_images + 1
127 <        #else:
85 <           #pywikibot.stdout('   Other external link: {}'.format(link))
125 >               pywikibot.stdout('   ERROR: Could not process external image link {}.'.format(link))
126 >               image_errors += 1
127 >               return
128  
129   def main(*args):
130 <    global pages_checked
131 <    global page_errors
132 <    global ext_images
133 <    global oni2_images
134 <    global tag_names
135 <
136 <    cat_name = ''
137 <    page_name = ''
138 <
139 <    #pywikibot.stdout('The members of the bs4.element.Tag class are:')
140 <    #pywikibot.stdout(format(dir(bs4.element.Tag)))
141 <
142 <    local_args = pywikibot.handle_args(args)
143 <    genFactory = pagegenerators.GeneratorFactory()
144 <
145 <    for arg in local_args:
146 <        if arg.startswith('-cat:'):
147 <            cat_name = arg[5:]
148 <        elif arg.startswith('-page:'):
149 <            page_name = arg[6:]
150 <        elif arg == '-linked':
151 <            tag_names += ['a']
152 <        elif arg == '-inlined':
153 <            tag_names += ['img']
154 <        else:
155 <            pywikibot.stdout('Unknown argument "{}".'.format(arg))
156 <            return
157 <
158 <    if not tag_names:
159 <        pywikibot.stdout('You need to pass this script either "-linked", "-inlined", or both arguments in order to specify which image links you want to find.')
160 <        return
161 <
162 <    site = pywikibot.Site()
163 <    if cat_name != '':
164 <        cat_obj = pywikibot.Category(site, cat_name)
165 <        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
166 <        for page in pagegenerators.PreloadingGenerator(generator, 100):
167 <            pywikibot.stdout('Checking page "{}"'.format(page.title()))
168 <            page_url = page.full_url().replace("%2F", "/")
169 <            get_image_links(page_url)
170 <    elif page_name != '':
171 <        page = pywikibot.Page(site, page_name)
172 <        pywikibot.stdout('Checking page "{}"'.format(page.title()))
173 <        page_url = page.full_url().replace("%2F", "/")
174 <        get_image_links(page_url)
175 <    else:
176 <        pywikibot.stdout('No page name or category name received.'.format(arg))
177 <        return
178 <
179 <    chk_page_str = "pages"
180 <    if pages_checked == 1:
181 <        chk_page_str = "page"
182 <
183 <    err_page_str = "pages"
184 <    if page_errors == 1:
185 <        err_page_str = "page"
186 <
187 <    ext_image_str = "images"
188 <    if ext_images == 1:
189 <        ext_image_str = "image"
190 <
191 <    oni2_image_str = "images"
192 <    if oni2_images == 1:
193 <        oni2_image_str = "image"
194 <
195 <    pywikibot.stdout('-------------------------')
196 <    pywikibot.stdout('Checked {0} {1} and failed to check {2} {3}.'.format(pages_checked, chk_page_str, page_errors, err_page_str))
197 <    pywikibot.stdout('Found {0} {1} from oni2.net and {2} {3} from other domains.'.format(oni2_images, oni2_image_str, ext_images, ext_image_str))
130 >   global debug
131 >   global pages_checked
132 >   global page_errors
133 >   global image_errors
134 >   global linked_ext_images
135 >   global linked_oni2_images
136 >   global embedded_ext_images
137 >   global embedded_oni2_images
138 >   global tag_names
139 >
140 >   search_cat = ''
141 >   search_page = ''
142 >
143 >   #pywikibot.stdout('The members of the bs4.element.Tag class are:')
144 >   #pywikibot.stdout(format(dir(bs4.element.Tag)))
145 >
146 >   local_args = pywikibot.handle_args(args)
147 >   genFactory = pagegenerators.GeneratorFactory()
148 >
149 >   for arg in local_args:
150 >      if arg.startswith('-cat:'):
151 >         search_cat = arg[5:]
152 >      elif arg.startswith('-page:'):
153 >         search_page = arg[6:]
154 >      elif arg == '-linked':
155 >         tag_names += ['a']
156 >      elif arg == '-embedded':
157 >         tag_names += ['img']
158 >      elif arg == '-dbg':
159 >         debug = 1
160 >      else:
161 >         pywikibot.stdout('Unknown argument "{}".'.format(arg))
162 >         return
163 >
164 >   if not tag_names:
165 >      pywikibot.stdout('You need to pass this script either "-linked", "-embedded", or both arguments in order to specify which image links you want to find.')
166 >      return
167 >
168 >   site = pywikibot.Site()
169 >   if search_cat != '':
170 >      cat_obj = pywikibot.Category(site, search_cat)
171 >      generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
172 >      for page in pagegenerators.PreloadingGenerator(generator, 100):
173 >         if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
174 >         page_url = page.full_url().replace("%2F", "/")
175 >         get_image_links(page_url, page.title())
176 >   elif search_page != '':
177 >      page = pywikibot.Page(site, search_page)
178 >      if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
179 >      page_url = page.full_url().replace("%2F", "/")
180 >      get_image_links(page_url, page.title())
181 >   else:
182 >      pywikibot.stdout('No page name or category name received.'.format(arg))
183 >      return
184 >
185 >   chk_page_str = plural_check("page", pages_checked)
186 >   err_page_str = plural_check("page", page_errors)
187 >   err_img_str = plural_check("image", image_errors)
188 >   linked_ext_image_str = plural_check("image", linked_ext_images)
189 >   linked_oni2_image_str = plural_check("image", linked_oni2_images)
190 >   embedded_ext_image_str = plural_check("image", embedded_ext_images)
191 >   embedded_oni2_image_str = plural_check("image", embedded_oni2_images)
192 >
193 >   pywikibot.stdout('-------------------------')
194 >   pywikibot.stdout('Checked {0} {1} and failed to process {2} {3} on them. Failed to check {4} {5}.'.format(pages_checked, chk_page_str, image_errors, err_img_str, page_errors, err_page_str))
195 >   if 'a' in tag_names:
196 >      pywikibot.stdout('Found {0} linked {1} from oni2.net and {2} linked {3} from other domains.'.format(linked_oni2_images, linked_oni2_image_str, linked_ext_images, linked_ext_image_str))
197 >   if 'img' in tag_names:
198 >      pywikibot.stdout('Found {0} embedded {1} from oni2.net and {2} embedded {3} from other domains.'.format(embedded_oni2_images, embedded_oni2_image_str, embedded_ext_images, embedded_ext_image_str))
199  
200   if __name__ == '__main__':
201 <    main()
201 >   main()

Diff Legend

- Removed lines
+ Added lines
< Changed lines (old)
> Changed lines (new)
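
For anyone skimming the diff, the heart of the revision 1181 change is the classification step added to get_image_links(): a matching tag whose string form starts with "<a" is counted as a linked image, one starting with "<img" as an embedded image, and oni2.net hosts are tallied separately from other domains. The following is a minimal sketch of that logic outside pywikibot, searching both <a> and <img> tags as if both -linked and -embedded had been passed; the HTML fragment and URLs are made-up examples, not wiki content, and the only assumption is a Python 3 environment with BeautifulSoup installed.

import os
from bs4 import BeautifulSoup

file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')

# Hypothetical page fragment standing in for the HTML that fetch() would return
html = ('<a href="http://oni2.net/screens/shot.png">a linked image</a>'
        '<img src="http://example.com/pic.jpg"/>')

for tag in BeautifulSoup(html, 'html.parser').find_all(['a', 'img']):
   link = tag.get('href') or tag.get('src')  # <a> carries href, <img> carries src
   if not link:
      continue  # skip empty links, as the script does
   _, ext = os.path.splitext(link)
   if ext.lower() not in file_formats:
      continue  # only image file types are reported
   kind = 'Linked' if format(tag).startswith('<a') else 'Embedded'
   host = 'oni2.net' if 'oni2.net' in link else 'external'
   print('   {} {} image: {}'.format(kind, host, link))

Run as-is, the sketch prints one "Linked oni2.net image" line and one "Embedded external image" line, mirroring the four counters (linked/embedded, oni2.net/external) that main() now sums up in its closing report.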