| 1 | import os | 
 
 
 
 
 | 2 |  | 
 
 
 
 
 | 3 | from urllib.parse import urljoin | 
 
 
 
 
 | 4 |  | 
 
 
 
 
 | 5 | import pywikibot | 
 
 
 
 
 | 6 |  | 
 
 
 
 
 | 7 | from pywikibot.bot import QuitKeyboardInterrupt | 
 
 
 
 
 | 8 | from pywikibot import pagegenerators | 
 
 
 
 
 | 9 | from pywikibot.comms.http import fetch | 
 
 
 
 
 | 10 | from pywikibot.specialbots import UploadRobot | 
 
 
 
 
 | 11 | from bs4 import BeautifulSoup | 
 
 
 
 
 | 12 |  | 
 
 
 
 
 | 13 | first_run = False | 
 
 
 
 
 | 14 | pages_checked = 0 | 
 
 
 
 
 | 15 | oni2_images = 0 | 
 
 
 
 
 | 16 | file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg', '.ogg') | 
 
 
 
 
 | 17 |  | 
 
 
 
 
 | 18 | # Scrapes the HTML at the given URL for image tags | 
 
 
 
 
 | 19 | def get_image_links(url, shown): | 
 
 
 
 
 | 20 | links = [] | 
 
 
 
 
 | 21 | global oni2_images | 
 
 
 
 
 | 22 | global pages_checked | 
 
 
 
 
 | 23 |  | 
 
 
 
 
 | 24 | response = fetch(url) | 
 
 
 
 
 | 25 | if response.status_code != 200: | 
 
 
 
 
 | 26 | pywikibot.output('Skipping url: {}'.format(url)) | 
 
 
 
 
 | 27 | return links | 
 
 
 
 
 | 28 |  | 
 
 
 
 
 | 29 | soup = BeautifulSoup(response.text, 'html.parser') | 
 
 
 
 
 | 30 | pages_checked = pages_checked + 1 | 
 
 
 
 
 | 31 | if not shown: | 
 
 
 
 
 | 32 | tagname = 'a' | 
 
 
 
 
 | 33 | elif shown == 'just': | 
 
 
 
 
 | 34 | tagname = 'img' | 
 
 
 
 
 | 35 | else: | 
 
 
 
 
 | 36 | tagname = ['a', 'img'] | 
 
 
 
 
 | 37 | #pywikibot.output('Looking at tags.') | 
 
 
 
 
 | 38 | for tag in soup.findAll(tagname): | 
 
 
 
 
 | 39 | link = tag.get('src', tag.get('href', None)) | 
 
 
 
 
 | 40 | if not link: | 
 
 
 
 
 | 41 | #pywikibot.output('It is not a link.') | 
 
 
 
 
 | 42 | continue | 
 
 
 
 
 | 43 | #pywikibot.output('Got link {0}.'.format(link)) | 
 
 
 
 
 | 44 | _, ext = os.path.splitext(link) | 
 
 
 
 
 | 45 | if ext.lower() in file_formats: | 
 
 
 
 
 | 46 | pywikibot.output('Found image link {0}.'.format(ext)) | 
 
 
 
 
 | 47 | if "oni2.net" in link: | 
 
 
 
 
 | 48 | pywikibot.stdout('Found an oni2.net image: {0}'.format(link)) | 
 
 
 
 
 | 49 | oni2_images = oni2_images + 1 | 
 
 
 
 
 | 50 | return links | 
 
 
 
 
 | 51 |  | 
 
 
 
 
 | 52 |  | 
 
 
 
 
 | 53 | def main(*args): | 
 
 
 
 
 | 54 | cat = '' | 
 
 
 
 
 | 55 | url = '' | 
 
 
 
 
 | 56 | image_url = False | 
 
 
 
 
 | 57 | shown = False | 
 
 
 
 
 | 58 | desc = [] | 
 
 
 
 
 | 59 |  | 
 
 
 
 
 | 60 | local_args = pywikibot.handle_args(args) | 
 
 
 
 
 | 61 | genFactory = pagegenerators.GeneratorFactory() | 
 
 
 
 
 | 62 |  | 
 
 
 
 
 | 63 | for arg in local_args: | 
 
 
 
 
 | 64 | if arg.startswith('-cat:'): | 
 
 
 
 
 | 65 | cat = arg[5:] | 
 
 
 
 
 | 66 | elif arg == '-shown': | 
 
 
 
 
 | 67 | shown = True | 
 
 
 
 
 | 68 | elif arg == '-justshown': | 
 
 
 
 
 | 69 | shown = 'just' | 
 
 
 
 
 | 70 | elif url == '': | 
 
 
 
 
 | 71 | url = arg | 
 
 
 
 
 | 72 | else: | 
 
 
 
 
 | 73 | desc += [arg] | 
 
 
 
 
 | 74 | desc = ' '.join(desc) | 
 
 
 
 
 | 75 |  | 
 
 
 
 
 | 76 | site = pywikibot.Site() | 
 
 
 
 
 | 77 | cat_obj = pywikibot.Category(site, cat) | 
 
 
 
 
 | 78 | generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True) | 
 
 
 
 
 | 79 | for page in pagegenerators.PreloadingGenerator(generator, 100): | 
 
 
 
 
 | 80 | pywikibot.stdout('Checking page {0}'.format(page.title())) | 
 
 
 
 
 | 81 | page_url = page.full_url().replace("%2F", "/") | 
 
 
 
 
 | 82 | get_image_links(page_url, shown) | 
 
 
 
 
 | 83 |  | 
 
 
 
 
 | 84 | global pages_checked | 
 
 
 
 
 | 85 | global oni2_images | 
 
 
 
 
 | 86 | pywikibot.stdout('Checked {0} page(s) and found {1} image(s) from oni2.net.'.format(pages_checked, oni2_images)) | 
 
 
 
 
 | 87 |  | 
 
 
 
 
 | 88 | if __name__ == '__main__': | 
 
 
 
 
 | 89 | main() |