ViewVC Help
View File | Revision Log | View Changeset | Root Listing
root/Oni2/ValBot/Python/find_external_images.py
(Generate patch)

Comparing ValBot/Python/find_external_images.py (file contents):
Revision 1169 by iritscen, Mon Feb 21 23:59:20 2022 UTC vs.
Revision 1181 by iritscen, Fri Apr 28 00:55:00 2023 UTC

# Line 1 | Line 1
1 + # Find External Images
2 + # by iritscen@yahoo.com
3 + # Looks at each link on a page (or in all the pages in a category) and prints the links to
4 + # images that are external to the wiki. Distinction is made between images hosted on oni2.net
5 + # and on third-party domains. You must pass in one or both of the following args:
6 + #  -embedded: Show any plain URLs leading to images (these create embedded images, <img>)
7 + #  -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
8 + #
9 + # Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
10 + #
11 + # Recommended viewing width:
12 + # |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|
13 +
14   import os
15  
16   from urllib.parse import urljoin
17  
18   import pywikibot
6
19   from pywikibot.bot import QuitKeyboardInterrupt
20   from pywikibot import pagegenerators
21   from pywikibot.comms.http import fetch
22   from pywikibot.specialbots import UploadRobot
23 +
24 + import bs4
25   from bs4 import BeautifulSoup
26  
27 < first_run = False
27 > # Initialize globals
28 > debug = 0
29   pages_checked = 0
30 < oni2_images = 0
30 > page_errors = 0
31 > image_errors = 0
32 > linked_ext_images = 0
33 > linked_oni2_images = 0
34 > embedded_ext_images = 0
35 > embedded_oni2_images = 0
36   file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')
37 + tag_names = []
38 +
39 + # Pass this function a singular noun and it will add an 's' to "noun" if "quantity" is not 1
40 + def plural_check(noun, quantity):
41 +   if quantity != 1:
42 +      return noun + "s"
43 +   else:
44 +      return noun
45  
46   # Scrapes the HTML at the given URL for image tags
47 < def get_image_links(url, shown):
48 <    links = []
49 <    global oni2_images
50 <    global pages_checked
51 <
52 <    response = fetch(url)
53 <    if response.status_code != 200:
54 <        pywikibot.output('Skipping url: {}'.format(url))
55 <        return links
56 <
57 <    soup = BeautifulSoup(response.text, 'html.parser')
58 <    pages_checked = pages_checked + 1
59 <    if not shown:
60 <        tagname = 'a'
61 <    elif shown == 'just':
62 <        tagname = 'img'
63 <    else:
64 <        tagname = ['a', 'img']
65 <    #pywikibot.output('Looking at tags.')
66 <    for tag in soup.findAll(tagname):
67 <        link = tag.get('src', tag.get('href', None))
68 <        if not link:
69 <            #pywikibot.output('It is not a link.')
47 > def get_image_links(page_url, page_name):
48 >   global debug
49 >   global pages_checked
50 >   global page_errors
51 >   global image_errors
52 >   global linked_ext_images
53 >   global linked_oni2_images
54 >   global embedded_ext_images
55 >   global embedded_oni2_images
56 >   global file_formats
57 >   global tag_names
58 >   name_printed = 0
59 >
60 >   response = fetch(page_url)
61 >   if response.status_code != 200:
62 >      pywikibot.stdout('   ERROR: Could not load page at URL "{}".'.format(page_url))
63 >      page_errors += 1
64 >      return
65 >
66 >   soup = BeautifulSoup(response.text, 'html.parser')
67 >   pages_checked += 1
68 >   for tag in soup.findAll(tag_names):
69 >      link = tag.get('href')
70 >      if not link:
71 >         link = tag.get('src')
72 >
73 >      # Filter out empty links
74 >      if not link:
75 >         if tag.get('id') == "top":
76 >            continue
77 >
78 >         class_names = tag.get('class')
79 >         if "selflink" in class_names:
80              continue
43        #pywikibot.output('Got link {0}.'.format(link))
44        _, ext = os.path.splitext(link)
45        if ext.lower() in file_formats:
46            pywikibot.output('Found image link {0}.'.format(ext))
47            if "oni2.net" in link:
48                pywikibot.stdout('Found an oni2.net image: {0}'.format(link))
49                oni2_images = oni2_images + 1
50    return links
81  
82 +         if not name_printed and not debug:
83 +            pywikibot.stdout('From page "{}":'.format(page_name))
84 +            name_printed = 1
85 +         pywikibot.stdout('   ERROR: Could not process mystery link {}.'.format(tag.get_text))
86 +         pywikibot.stdout('   Class is "{}".'.format(tag.get('class')))
87 +         page_errors += 1
88 +         continue
89 +
90 +      # A "src" or "href" starting with "/" would be a link to a local page or file; a
91 +      # link starting with "#" is a section link
92 +      if link.startswith('/') or link.startswith('#'):
93 +         continue
94 +
95 +      # The gnu.org link to the Free Documentation License is at the bottom of every page
96 +      if link == "http://www.gnu.org/copyleft/fdl.html":
97 +         continue
98 +
99 +      # Determine if link is to an image
100 +      _, ext = os.path.splitext(link)
101 +      if ext.lower() in file_formats:
102 +         if not name_printed and not debug:
103 +            pywikibot.stdout('Found on page "{}":'.format(page_name))
104 +            name_printed = 1
105 +         tag_text = format(tag)
106 +         if "oni2.net" in link:
107 +            if tag_text.startswith('<a'):
108 +               pywikibot.stdout('   Linked oni2.net image: {}'.format(link))
109 +               linked_oni2_images += 1
110 +            elif tag_text.startswith('<img'):
111 +               pywikibot.stdout('   Embedded oni2.net image: {}'.format(link))
112 +               embedded_oni2_images += 1
113 +            else:
114 +               pywikibot.stdout('   ERROR: Could not process oni2.net image link {}.'.format(link))
115 +               image_errors += 1
116 +               return
117 +         else:
118 +            if tag_text.startswith('<a'):
119 +               pywikibot.stdout('   Linked external image: {}'.format(link))
120 +               linked_ext_images += 1
121 +            elif tag_text.startswith('<img'):
122 +               pywikibot.stdout('   Embedded external image: {}'.format(link))
123 +               embedded_ext_images += 1
124 +            else:
125 +               pywikibot.stdout('   ERROR: Could not process external image link {}.'.format(link))
126 +               image_errors += 1
127 +               return
128  
129   def main(*args):
130 <    cat = ''
131 <    url = ''
132 <    image_url = False
133 <    shown = False
134 <    desc = []
135 <
136 <    local_args = pywikibot.handle_args(args)
137 <    genFactory = pagegenerators.GeneratorFactory()
138 <
139 <    for arg in local_args:
140 <        if arg.startswith('-cat:'):
141 <            cat = arg[5:]
142 <        elif arg == '-shown':
143 <            shown = True
144 <        elif arg == '-justshown':
145 <            shown = 'just'
146 <        elif url == '':
147 <            url = arg
148 <        else:
149 <            desc += [arg]
150 <    desc = ' '.join(desc)
151 <
152 <    site = pywikibot.Site()
153 <    cat_obj = pywikibot.Category(site, cat)
154 <    generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
155 <    for page in pagegenerators.PreloadingGenerator(generator, 100):
156 <        pywikibot.stdout('Checking page {0}'.format(page.title()))
157 <        page_url = page.full_url().replace("%2F", "/")
158 <        get_image_links(page_url, shown)
159 <
160 <    global pages_checked
161 <    global oni2_images
162 <    pywikibot.stdout('Checked {0} page(s) and found {1} image(s) from oni2.net.'.format(pages_checked, oni2_images))
130 >   global debug
131 >   global pages_checked
132 >   global page_errors
133 >   global image_errors
134 >   global linked_ext_images
135 >   global linked_oni2_images
136 >   global embedded_ext_images
137 >   global embedded_oni2_images
138 >   global tag_names
139 >
140 >   search_cat = ''
141 >   search_page = ''
142 >
143 >   #pywikibot.stdout('The members of the bs4.element.Tag class are:')
144 >   #pywikibot.stdout(format(dir(bs4.element.Tag)))
145 >
146 >   local_args = pywikibot.handle_args(args)
147 >   genFactory = pagegenerators.GeneratorFactory()
148 >
149 >   for arg in local_args:
150 >      if arg.startswith('-cat:'):
151 >         search_cat = arg[5:]
152 >      elif arg.startswith('-page:'):
153 >         search_page = arg[6:]
154 >      elif arg == '-linked':
155 >         tag_names += ['a']
156 >      elif arg == '-embedded':
157 >         tag_names += ['img']
158 >      elif arg == '-dbg':
159 >         debug = 1
160 >      else:
161 >         pywikibot.stdout('Unknown argument "{}".'.format(arg))
162 >         return
163 >
164 >   if not tag_names:
165 >      pywikibot.stdout('You need to pass this script either "-linked", "-embedded", or both arguments in order to specify which image links you want to find.')
166 >      return
167 >
168 >   site = pywikibot.Site()
169 >   if search_cat != '':
170 >      cat_obj = pywikibot.Category(site, search_cat)
171 >      generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
172 >      for page in pagegenerators.PreloadingGenerator(generator, 100):
173 >         if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
174 >         page_url = page.full_url().replace("%2F", "/")
175 >         get_image_links(page_url, page.title())
176 >   elif search_page != '':
177 >      page = pywikibot.Page(site, search_page)
178 >      if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
179 >      page_url = page.full_url().replace("%2F", "/")
180 >      get_image_links(page_url, page.title())
181 >   else:
182 >      pywikibot.stdout('No page name or category name received.'.format(arg))
183 >      return
184 >
185 >   chk_page_str = plural_check("page", pages_checked)
186 >   err_page_str = plural_check("page", page_errors)
187 >   err_img_str = plural_check("image", image_errors)
188 >   linked_ext_image_str = plural_check("image", linked_ext_images)
189 >   linked_oni2_image_str = plural_check("image", linked_oni2_images)
190 >   embedded_ext_image_str = plural_check("image", embedded_ext_images)
191 >   embedded_oni2_image_str = plural_check("image", embedded_oni2_images)
192 >
193 >   pywikibot.stdout('-------------------------')
194 >   pywikibot.stdout('Checked {0} {1} and failed to process {2} {3} on them. Failed to check {4} {5}.'.format(pages_checked, chk_page_str, image_errors, err_img_str, page_errors, err_page_str))
195 >   if 'a' in tag_names:
196 >      pywikibot.stdout('Found {0} linked {1} from oni2.net and {2} linked {3} from other domains.'.format(linked_oni2_images, linked_oni2_image_str, linked_ext_images, linked_ext_image_str))
197 >   if 'img' in tag_names:
198 >      pywikibot.stdout('Found {0} embedded {1} from oni2.net and {2} embedded {3} from other domains.'.format(embedded_oni2_images, embedded_oni2_image_str, embedded_ext_images, embedded_ext_image_str))
199  
200   if __name__ == '__main__':
201 <    main()
201 >   main()

Diff Legend

(no marker) Removed lines
+ Added lines
< Changed lines (old)
> Changed lines (new)