| 1 |
import os |
| 2 |
|
| 3 |
from urllib.parse import urljoin |
| 4 |
|
| 5 |
import pywikibot |
| 6 |
|
| 7 |
from pywikibot.bot import QuitKeyboardInterrupt |
| 8 |
from pywikibot import pagegenerators |
| 9 |
from pywikibot.comms.http import fetch |
| 10 |
from pywikibot.specialbots import UploadRobot |
| 11 |
from bs4 import BeautifulSoup |
| 12 |
|
| 13 |
# Not referenced by any code visible in this file; kept as a module-level name.
first_run = False

# Running totals updated by get_image_links() and reported by main().
pages_checked = oni2_images = 0

# Lower-case file extensions (leading dot included) that mark a link as an
# image link.
file_formats = (
    '.jpg',
    '.jpeg',
    '.png',
    '.gif',
    '.svg',
    '.ogg',
)
| 17 |
|
| 18 |
def get_image_links(url, shown):
    """Scan the HTML at ``url`` for image links and count oni2.net ones.

    The page is fetched and parsed, and the ``src`` attribute (or, when that
    is absent, the ``href`` attribute) of every selected tag is inspected. A
    link is treated as an image link when its extension, lower-cased, is one
    of ``file_formats``. Image links containing "oni2.net" are reported and
    added to the module-level ``oni2_images`` counter. The module-level
    ``pages_checked`` counter is incremented for every page that was fetched
    with status 200.

    :param url: address of the page to scan
    :param shown: selects the tags to inspect: a falsy value means ``<a>``
        tags only, the string ``'just'`` means ``<img>`` tags only, and any
        other truthy value means both
    :return: a list that is always empty -- nothing is ever appended to it;
        it is returned only so the function keeps its existing return type
    """
    global oni2_images
    global pages_checked

    links = []

    response = fetch(url)
    if response.status_code != 200:
        pywikibot.output('Skipping url: {}'.format(url))
        return links

    soup = BeautifulSoup(response.text, 'html.parser')
    pages_checked += 1

    if not shown:
        tagname = 'a'
    elif shown == 'just':
        tagname = 'img'
    else:
        tagname = ['a', 'img']

    # find_all() is the Beautiful Soup 4 spelling; findAll() is the
    # deprecated alias left over from Beautiful Soup 3.
    for tag in soup.find_all(tagname):
        # Prefer 'src' (images); fall back to 'href' (anchors).
        link = tag.get('src', tag.get('href', None))
        if not link:
            continue

        _, ext = os.path.splitext(link)
        if ext.lower() not in file_formats:
            continue

        pywikibot.output('Found image link {0}.'.format(ext))
        if "oni2.net" in link:
            pywikibot.stdout('Found an oni2.net image: {0}'.format(link))
            oni2_images += 1

    return links
| 51 |
|
| 52 |
|
| 53 |
def main(*args):
    """Check every page of a category, recursively, for oni2.net images.

    Arguments left over after ``pywikibot.handle_args`` has processed
    ``args`` are interpreted as follows:

    -cat:<name>   category whose pages are checked (required)
    -shown        inspect both <a> and <img> tags
    -justshown    inspect <img> tags only

    With neither -shown nor -justshown only <a> tags are inspected. Any
    other argument is ignored. When no category is given an error is
    reported and nothing is checked.

    :param args: command-line style arguments, passed to
        ``pywikibot.handle_args``
    """
    cat = ''
    shown = False

    for arg in pywikibot.handle_args(args):
        if arg.startswith('-cat:'):
            cat = arg[5:]
        elif arg == '-shown':
            shown = True
        elif arg == '-justshown':
            shown = 'just'

    # Without a category there is nothing to iterate over, so report the
    # missing option rather than building a Category from an empty title.
    if not cat:
        pywikibot.error('No category given; use -cat:<category name>.')
        return

    site = pywikibot.Site()
    cat_obj = pywikibot.Category(site, cat)
    generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
    for page in pagegenerators.PreloadingGenerator(generator, 100):
        pywikibot.stdout('Checking page {0}'.format(page.title()))
        # Turn any percent-encoded slash (%2F) in the page URL back into a
        # literal "/" before fetching.
        page_url = page.full_url().replace("%2F", "/")
        get_image_links(page_url, shown)

    # The counters are module-level names updated by get_image_links(); they
    # are only read here, so no ``global`` declaration is needed.
    pywikibot.stdout('Checked {0} page(s) and found {1} image(s) from oni2.net.'.format(pages_checked, oni2_images))
| 87 |
|
| 88 |
# Run the check only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()