ViewVC Help
View File | Revision Log | View Changeset | Root Listing
root/Oni2/ValBot/Python/find_external_images.py
(Generate patch)

Comparing ValBot/Python/find_external_images.py (file contents):
Revision 1169 by iritscen, Mon Feb 21 23:59:20 2022 UTC vs.
Revision 1173 by iritscen, Tue Jun 28 22:06:29 2022 UTC

# Line 1 | Line 1
1 + # Find External Images
2 + # by iritscen@yahoo.com
3 + # Looks at each link on a page (or in all the pages in a category) and prints the links to
4 + # images that are externally-hosted. You must pass in one or both of the following args:
5 + # -inlined: Show any plain URLs leading to images (these create embedded images, <img>)
6 + # -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
7 + # Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
8 + #
9 + # Recommended viewing width:
10 + # |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|
11 +
12   import os
13  
14   from urllib.parse import urljoin
# Line 8 | Line 19 | from pywikibot.bot import QuitKeyboardInterrupt
19   from pywikibot import pagegenerators
20   from pywikibot.comms.http import fetch
21   from pywikibot.specialbots import UploadRobot
22 + #import bs4 # for listing members with dir()
23   from bs4 import BeautifulSoup
24  
13 first_run = False
25   pages_checked = 0
26 + page_errors = 0
27 + ext_images = 0
28   oni2_images = 0
29   file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')
30 + tag_names = []
31  
32   # Scrapes the HTML at the given URL for image tags
33 < def get_image_links(url, shown):
20 <    links = []
21 <    global oni2_images
33 > def get_image_links(url):
34      global pages_checked
35 +    global page_errors
36 +    global ext_images
37 +    global oni2_images
38 +    global file_formats
39 +    global tag_names
40  
41      response = fetch(url)
42      if response.status_code != 200:
43 <        pywikibot.output('Skipping url: {}'.format(url))
44 <        return links
43 >        pywikibot.stdout('   ERROR: Could not load page at URL "{}"'.format(url))
44 >        page_errors = page_errors + 1
45 >        return
46  
47      soup = BeautifulSoup(response.text, 'html.parser')
48      pages_checked = pages_checked + 1
49 <    if not shown:
50 <        tagname = 'a'
51 <    elif shown == 'just':
52 <        tagname = 'img'
53 <    else:
54 <        tagname = ['a', 'img']
37 <    #pywikibot.output('Looking at tags.')
38 <    for tag in soup.findAll(tagname):
39 <        link = tag.get('src', tag.get('href', None))
49 >    for tag in soup.findAll(tag_names):
50 >        link = tag.get('href')
51 >        if not link:
52 >            link = tag.get('src')
53 >
54 >        # Filter out empty links
55          if not link:
56 <            #pywikibot.output('It is not a link.')
56 >            if tag.get('id') == "top":
57 >                continue
58 >
59 >            class_names = tag.get('class')
60 >            if "selflink" in class_names:
61 >                continue
62 >
63 >            pywikibot.stdout('   Could not process mystery link {}'.format(tag.get_text))
64 >            pywikibot.stdout('   Class is "{}".'.format(tag.get('class')))
65 >            continue
66 >
67 >        # A "src" or "href" starting with "/" would be a link to a local page or file; a
68 >        # link starting with "#" is a section link
69 >        if link.startswith('/') or link.startswith('#'):
70 >            continue
71 >
72 >        # The gnu.org link to the Free Documentation License is at the bottom of every page
73 >        if link == "http://www.gnu.org/copyleft/fdl.html":
74              continue
75 <        #pywikibot.output('Got link {0}.'.format(link))
75 >
76          _, ext = os.path.splitext(link)
77          if ext.lower() in file_formats:
46            pywikibot.output('Found image link {0}.'.format(ext))
78              if "oni2.net" in link:
79 <                pywikibot.stdout('Found an oni2.net image: {0}'.format(link))
79 >                pywikibot.stdout('   Oni2.net image: {}'.format(link))
80                  oni2_images = oni2_images + 1
81 <    return links
82 <
81 >            else:
82 >                pywikibot.stdout('   External image: {}'.format(link))
83 >                ext_images = ext_images + 1
84 >        #else:
85 >           #pywikibot.stdout('   Other external link: {}'.format(link))
86  
87   def main(*args):
88 <    cat = ''
89 <    url = ''
90 <    image_url = False
91 <    shown = False
92 <    desc = []
88 >    global pages_checked
89 >    global page_errors
90 >    global ext_images
91 >    global oni2_images
92 >    global tag_names
93 >
94 >    cat_name = ''
95 >    page_name = ''
96 >
97 >    #pywikibot.stdout('The members of the bs4.element.Tag class are:')
98 >    #pywikibot.stdout(format(dir(bs4.element.Tag)))
99  
100      local_args = pywikibot.handle_args(args)
101      genFactory = pagegenerators.GeneratorFactory()
102  
103      for arg in local_args:
104          if arg.startswith('-cat:'):
105 <            cat = arg[5:]
106 <        elif arg == '-shown':
107 <            shown = True
108 <        elif arg == '-justshown':
109 <            shown = 'just'
110 <        elif url == '':
111 <            url = arg
105 >            cat_name = arg[5:]
106 >        elif arg.startswith('-page:'):
107 >            page_name = arg[6:]
108 >        elif arg == '-linked':
109 >            tag_names += ['a']
110 >        elif arg == '-inlined':
111 >            tag_names += ['img']
112          else:
113 <            desc += [arg]
114 <    desc = ' '.join(desc)
113 >            pywikibot.stdout('Unknown argument "{}".'.format(arg))
114 >            return
115 >
116 >    if not tag_names:
117 >        pywikibot.stdout('You need to pass this script either "-linked", "-inlined", or both arguments in order to specify which image links you want to find.')
118 >        return
119  
120      site = pywikibot.Site()
121 <    cat_obj = pywikibot.Category(site, cat)
122 <    generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
123 <    for page in pagegenerators.PreloadingGenerator(generator, 100):
124 <        pywikibot.stdout('Checking page {0}'.format(page.title()))
121 >    if cat_name != '':
122 >        cat_obj = pywikibot.Category(site, cat_name)
123 >        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
124 >        for page in pagegenerators.PreloadingGenerator(generator, 100):
125 >            pywikibot.stdout('Checking page "{}"'.format(page.title()))
126 >            page_url = page.full_url().replace("%2F", "/")
127 >            get_image_links(page_url)
128 >    elif page_name != '':
129 >        page = pywikibot.Page(site, page_name)
130 >        pywikibot.stdout('Checking page "{}"'.format(page.title()))
131          page_url = page.full_url().replace("%2F", "/")
132 <        get_image_links(page_url, shown)
132 >        get_image_links(page_url)
133 >    else:
134 >        pywikibot.stdout('No page name or category name received.'.format(arg))
135 >        return
136  
137 <    global pages_checked
138 <    global oni2_images
139 <    pywikibot.stdout('Checked {0} page(s) and found {1} image(s) from oni2.net.'.format(pages_checked, oni2_images))
137 >    chk_page_str = "pages"
138 >    if pages_checked == 1:
139 >        chk_page_str = "page"
140 >
141 >    err_page_str = "pages"
142 >    if page_errors == 1:
143 >        err_page_str = "page"
144 >
145 >    ext_image_str = "images"
146 >    if ext_images == 1:
147 >        ext_image_str = "image"
148 >
149 >    oni2_image_str = "images"
150 >    if oni2_images == 1:
151 >        oni2_image_str = "image"
152 >
153 >    pywikibot.stdout('-------------------------')
154 >    pywikibot.stdout('Checked {0} {1} and failed to check {2} {3}.'.format(pages_checked, chk_page_str, page_errors, err_page_str))
155 >    pywikibot.stdout('Found {0} {1} from oni2.net and {2} {3} from other domains.'.format(oni2_images, oni2_image_str, ext_images, ext_image_str))
156  
157   if __name__ == '__main__':
158      main()

Diff Legend

(no marker) Removed lines
+ Added lines
< Changed lines (old)
> Changed lines (new)