1 |
import os |
2 |
|
3 |
from urllib.parse import urljoin |
4 |
|
5 |
import pywikibot |
6 |
|
7 |
from pywikibot.bot import QuitKeyboardInterrupt |
8 |
from pywikibot import pagegenerators |
9 |
from pywikibot.comms.http import fetch |
10 |
from pywikibot.specialbots import UploadRobot |
11 |
from bs4 import BeautifulSoup |
12 |
|
13 |
# NOTE(review): first_run is not referenced anywhere in the visible part of
# this file -- confirm whether it is still needed.
first_run = False

# Running totals: get_image_links() increments them, main() reports them.
pages_checked = oni2_images = 0

# File extensions (including the leading dot) that mark a link as an image.
# Compared against the lower-cased extension of each link.
file_formats = (
    '.jpg',
    '.jpeg',
    '.png',
    '.gif',
    '.svg',
)
17 |
|
18 |
def get_image_links(url, shown):
    """Scrape the HTML at the given URL for links to image files.

    The page is fetched and parsed with BeautifulSoup, then the ``src``
    attribute (or ``href`` when there is no ``src``) of each selected tag is
    checked against ``file_formats``.  Every page that is fetched with
    status 200 and parsed increments the module-level ``pages_checked``
    counter; every image link whose text contains ``oni2.net`` increments
    ``oni2_images``.

    :param url: address of the page to scan.
    :param shown: selects the tags to inspect: a false value means ``<a>``
        tags only, the string ``'just'`` means ``<img>`` tags only, and any
        other true value means both.
    :return: list of the image links found, each resolved against ``url``.
        The list is empty when the response status is not 200.
    """
    global oni2_images
    global pages_checked

    links = []

    response = fetch(url)
    if response.status_code != 200:
        pywikibot.output('Skipping url: {}'.format(url))
        return links

    soup = BeautifulSoup(response.text, 'html.parser')
    pages_checked += 1

    if not shown:
        tagname = 'a'
    elif shown == 'just':
        tagname = 'img'
    else:
        tagname = ['a', 'img']

    for tag in soup.find_all(tagname):
        link = tag.get('src', tag.get('href', None))
        if not link:
            continue

        _, ext = os.path.splitext(link)
        if ext.lower() not in file_formats:
            continue

        # This message reports the matched extension, not the link itself.
        pywikibot.output('Found image link {0}.'.format(ext))

        # Previously nothing was ever added to the result, so callers always
        # got an empty list.  Relative links are made absolute here.
        links.append(urljoin(url, link))

        # The host check is done on the raw attribute value on purpose:
        # resolving relative links first would change what gets counted.
        if "oni2.net" in link:
            pywikibot.stdout('Found an oni2.net image: {0}'.format(link))
            oni2_images += 1

    return links
51 |
|
52 |
|
53 |
def main(*args):
    """Scan the pages of a category and count image links from oni2.net.

    Arguments understood by this script (pywikibot's global options are
    consumed first by ``pywikibot.handle_args``):

    -cat:<name>   category whose pages are scanned; subcategories are
                  included recursively
    -shown        inspect both ``<a>`` and ``<img>`` tags
    -justshown    inspect ``<img>`` tags only

    Without ``-shown`` or ``-justshown`` only ``<a>`` tags are inspected.
    Any other argument is ignored.

    :param args: command line arguments.
    """
    cat = ''
    shown = False

    for arg in pywikibot.handle_args(args):
        if arg.startswith('-cat:'):
            cat = arg[5:]
        elif arg == '-shown':
            shown = True
        elif arg == '-justshown':
            shown = 'just'
        # Remaining arguments were only ever stored in unused variables,
        # so they are simply skipped.

    # An empty title cannot be turned into a Category object; stop with a
    # clear message instead of failing inside pywikibot.
    if not cat:
        pywikibot.error('No category given; use -cat:<name>.')
        return

    site = pywikibot.Site()
    cat_obj = pywikibot.Category(site, cat)
    generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
    for page in pagegenerators.PreloadingGenerator(generator, 100):
        pywikibot.stdout('Checking page {0}'.format(page.title()))
        # Undo the percent-encoding of slashes in the page URL before
        # fetching it.
        page_url = page.full_url().replace("%2F", "/")
        get_image_links(page_url, shown)

    # The counters are module-level globals updated by get_image_links();
    # they are only read here, so no ``global`` statement is needed.
    pywikibot.stdout('Checked {0} page(s) and found {1} image(s) from oni2.net.'.format(pages_checked, oni2_images))
87 |
|
88 |
# Start the scan only when this file is executed as a script; importing it
# as a module must not trigger any network access.
if __name__ == '__main__':
    main()