[1151] | 1 | import os
|
---|
| 2 |
|
---|
| 3 | from urllib.parse import urljoin
|
---|
| 4 |
|
---|
| 5 | import pywikibot
|
---|
| 6 |
|
---|
| 7 | from pywikibot.bot import QuitKeyboardInterrupt
|
---|
| 8 | from pywikibot import pagegenerators
|
---|
| 9 | from pywikibot.comms.http import fetch
|
---|
| 10 | from pywikibot.specialbots import UploadRobot
|
---|
| 11 | from bs4 import BeautifulSoup
|
---|
| 12 |
|
---|
# Module-level state shared between get_image_links() and main().

# Not referenced anywhere in the visible part of this script.
first_run = False

# Running totals updated by get_image_links() and reported by main().
pages_checked = 0
oni2_images = 0

# File extensions (lower-case, with leading dot) treated as media links.
file_formats = (
    '.jpg',
    '.jpeg',
    '.png',
    '.gif',
    '.svg',
    '.ogg',
)
|
---|
| 17 |
|
---|
| 18 | # Scrapes the HTML at the given URL for image tags
|
---|
def get_image_links(url, shown):
    """Scan the HTML at ``url`` for media links and count oni2.net ones.

    The page is fetched and parsed; every inspected tag whose ``src`` (or,
    failing that, ``href``) attribute ends in one of ``file_formats`` is
    reported. Links containing "oni2.net" are additionally reported and
    counted in the module-level ``oni2_images`` counter. The module-level
    ``pages_checked`` counter is incremented for each page that was fetched
    with HTTP status 200.

    Args:
        url: Address of the page to scan.
        shown: Selects which tags are inspected. A falsy value inspects
            ``<a>`` tags only, the string ``'just'`` inspects ``<img>``
            tags only, and any other truthy value inspects both.

    Returns:
        A list of absolute URLs (resolved against ``url``) for every
        matching media link found; empty if the page could not be fetched
        or contained no matches.
    """
    global oni2_images
    global pages_checked

    links = []

    response = fetch(url)
    if response.status_code != 200:
        pywikibot.output('Skipping url: {}'.format(url))
        return links

    soup = BeautifulSoup(response.text, 'html.parser')
    pages_checked = pages_checked + 1

    if not shown:
        tagname = 'a'
    elif shown == 'just':
        tagname = 'img'
    else:
        tagname = ['a', 'img']

    for tag in soup.find_all(tagname):
        # Prefer the "src" attribute and fall back to "href".
        link = tag.get('src', tag.get('href', None))
        if not link:
            continue

        _, ext = os.path.splitext(link)
        if ext.lower() not in file_formats:
            continue

        pywikibot.output('Found image link {0}.'.format(ext))
        # Resolve relative links against the page URL so callers receive
        # usable absolute addresses.
        links.append(urljoin(url, link))

        if "oni2.net" in link:
            pywikibot.stdout('Found an oni2.net image: {0}'.format(link))
            oni2_images = oni2_images + 1

    return links
|
---|
| 51 |
|
---|
| 52 |
|
---|
def main(*args):
    """Scan every page of a wiki category for oni2.net image links.

    Recognized script arguments (after pywikibot's own global arguments
    have been consumed by ``pywikibot.handle_args``):

    * ``-cat:<name>``  category whose pages are scanned, recursing into
      subcategories (required)
    * ``-shown``       passes ``shown=True`` to ``get_image_links``
    * ``-justshown``   passes ``shown='just'`` to ``get_image_links``

    Any other argument is ignored. After all pages have been processed, a
    summary with the number of pages checked and oni2.net images found is
    printed.

    Args:
        *args: Command-line style arguments; forwarded to
            ``pywikibot.handle_args``.
    """
    cat = ''
    shown = False

    for arg in pywikibot.handle_args(args):
        if arg.startswith('-cat:'):
            cat = arg[5:]
        elif arg == '-shown':
            shown = True
        elif arg == '-justshown':
            shown = 'just'

    # Without a category name there is nothing to iterate over, and building
    # a Category object from an empty title would fail further down.
    if not cat:
        pywikibot.output('No category given; use -cat:<category name>.')
        return

    site = pywikibot.Site()
    cat_obj = pywikibot.Category(site, cat)
    generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
    for page in pagegenerators.PreloadingGenerator(generator, 100):
        pywikibot.stdout('Checking page {0}'.format(page.title()))
        # Turn percent-encoded slashes back into literal ones before
        # fetching (presumably for subpage titles -- confirm against wiki).
        page_url = page.full_url().replace("%2F", "/")
        get_image_links(page_url, shown)

    # The counters are module-level globals updated by get_image_links();
    # they are only read here, so no ``global`` declaration is required.
    pywikibot.stdout('Checked {0} page(s) and found {1} image(s) from oni2.net.'.format(pages_checked, oni2_images))
|
---|
| 87 |
|
---|
# Run the scan only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
---|