"""Count images hosted on oni2.net that are referenced from wiki pages.

Every page in a category (sub-categories included) is fetched as rendered
HTML and its tags are scanned for links that end in a known image
extension.  Links containing "oni2.net" are reported and counted.

Arguments:
    -cat:<name>   Category whose pages are checked (required).
    -shown        Inspect both <a> and <img> tags.
    -justshown    Inspect only <img> tags.

Without -shown/-justshown only <a> tags are inspected.
"""
import os
from urllib.parse import urljoin, urlparse

import pywikibot
from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot import pagegenerators
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot
from bs4 import BeautifulSoup

first_run = False
# Running totals, updated by get_image_links() and reported by main().
pages_checked = 0
oni2_images = 0
# Lower-case file extensions that identify a link as an image.
file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')


def get_image_links(url, shown):
    """Scrape the HTML at *url* for links to image files.

    Increments the module-level ``pages_checked`` counter for every page
    that is fetched successfully and ``oni2_images`` for every image link
    that contains "oni2.net".

    Args:
        url: Address of the page to fetch.
        shown: Selects which tags are inspected. A falsy value means only
            ``<a>`` tags, the string ``'just'`` means only ``<img>`` tags,
            and any other truthy value means both.

    Returns:
        A list of the image links found, exactly as written in the HTML
        (relative links are not resolved). The list is empty when the page
        could not be fetched with status 200.
    """
    global oni2_images
    global pages_checked

    links = []

    response = fetch(url)
    if response.status_code != 200:
        pywikibot.output('Skipping url: {}'.format(url))
        return links

    soup = BeautifulSoup(response.text, 'html.parser')
    pages_checked += 1

    if not shown:
        tagname = 'a'
    elif shown == 'just':
        tagname = 'img'
    else:
        tagname = ['a', 'img']

    for tag in soup.find_all(tagname):
        # Prefer the 'src' attribute and fall back to 'href'.
        link = tag.get('src', tag.get('href', None))
        if not link:
            continue

        # Take the extension from the URL path only, so that a query string
        # or fragment ("pic.png?v=2") does not hide the extension and a
        # query value ("index.php?title=File:Pic.png") is not mistaken for
        # one.
        _, ext = os.path.splitext(urlparse(link).path)
        if ext.lower() not in file_formats:
            continue

        pywikibot.output('Found image link {0}.'.format(ext))
        links.append(link)
        if "oni2.net" in link:
            pywikibot.stdout('Found an oni2.net image: {0}'.format(link))
            oni2_images += 1

    return links


def main(*args):
    """Check every page of the requested category and print a summary.

    Args:
        *args: Command-line style arguments. They are passed through
            ``pywikibot.handle_args`` first; of what remains, ``-cat:``,
            ``-shown`` and ``-justshown`` are recognised and anything else
            is ignored.
    """
    cat = ''
    shown = False

    local_args = pywikibot.handle_args(args)
    for arg in local_args:
        if arg.startswith('-cat:'):
            cat = arg[5:]
        elif arg == '-shown':
            shown = True
        elif arg == '-justshown':
            shown = 'just'
        # Any other argument is ignored.

    # A Category cannot be built from an empty title, so stop with a clear
    # message instead of failing inside pywikibot.
    if not cat:
        pywikibot.error('No category given; use -cat:<category name>.')
        return

    site = pywikibot.Site()
    cat_obj = pywikibot.Category(site, cat)
    generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
    for page in pagegenerators.PreloadingGenerator(generator, 100):
        pywikibot.stdout('Checking page {0}'.format(page.title()))
        # Restore literal slashes that were percent-encoded in the URL.
        page_url = page.full_url().replace("%2F", "/")
        get_image_links(page_url, shown)

    pywikibot.stdout('Checked {0} page(s) and found {1} image(s) from oni2.net.'.format(pages_checked, oni2_images))


if __name__ == '__main__':
    main()