1 | import os
|
---|
2 |
|
---|
3 | from urllib.parse import urljoin
|
---|
4 |
|
---|
5 | import pywikibot
|
---|
6 |
|
---|
7 | from pywikibot.bot import QuitKeyboardInterrupt
|
---|
8 | from pywikibot import pagegenerators
|
---|
9 | from pywikibot.comms.http import fetch
|
---|
10 | from pywikibot.specialbots import UploadRobot
|
---|
11 | from bs4 import BeautifulSoup
|
---|
12 |
|
---|
# Module-level state shared by the functions below.
first_run = False

# Running totals: get_image_links() updates them, main() reports them.
pages_checked = oni2_images = 0

# Lower-case file extensions (leading dot included) that count as images.
file_formats = (
    '.jpg',
    '.jpeg',
    '.png',
    '.gif',
    '.svg',
)
|
---|
17 |
|
---|
18 | # Scrapes the HTML at the given URL for image tags
|
---|
def get_image_links(url, shown):
    """Scan the HTML at ``url`` for image links and count those on oni2.net.

    Args:
        url: Address of the page to fetch and scan.
        shown: Selects which tags are inspected. A falsy value scans ``<a>``
            tags only, the string ``'just'`` scans ``<img>`` tags only, and
            any other truthy value scans both.

    Returns:
        A list that is always empty, because nothing is ever appended to it.
        Findings are reported through pywikibot output and through the
        module-level ``pages_checked`` and ``oni2_images`` counters.
    """
    # Imported here so the function carries its own dependency.
    from urllib.parse import urlsplit

    global oni2_images
    global pages_checked

    links = []

    response = fetch(url)
    if response.status_code != 200:
        pywikibot.output('Skipping url: {}'.format(url))
        return links

    soup = BeautifulSoup(response.text, 'html.parser')
    # Only pages that were fetched successfully count as checked.
    pages_checked += 1

    if not shown:
        tagname = 'a'
    elif shown == 'just':
        tagname = 'img'
    else:
        tagname = ['a', 'img']

    # find_all() is the current name of the deprecated findAll() alias.
    for tag in soup.find_all(tagname):
        # Prefer the 'src' attribute and fall back to 'href'.
        link = tag.get('src', tag.get('href', None))
        if not link:
            continue

        # Take the extension from the URL path only, so a query string or
        # fragment (e.g. 'pic.png?1234') does not hide an image extension.
        try:
            path = urlsplit(link).path
        except ValueError:
            # Unparseable URL: fall back to inspecting the raw link text.
            path = link
        _, ext = os.path.splitext(path)
        if ext.lower() not in file_formats:
            continue

        pywikibot.output('Found image link {0}.'.format(ext))
        if "oni2.net" in link:
            pywikibot.stdout('Found an oni2.net image: {0}'.format(link))
            oni2_images += 1

    return links
|
---|
51 |
|
---|
52 |
|
---|
def main(*args):
    """Scan every page of a wiki category for images hosted on oni2.net.

    Arguments recognized after pywikibot has handled its global ones:

        -cat:<name>   category whose pages are scanned; subcategories are
                      included
        -shown        inspect both <a> and <img> tags
        -justshown    inspect only <img> tags

    With neither -shown nor -justshown, only <a> tags are inspected. Any
    other argument is ignored. A summary of the pages checked and the
    oni2.net images found is printed once the scan is finished.
    """
    cat = ''
    shown = False

    for arg in pywikibot.handle_args(args):
        if arg.startswith('-cat:'):
            cat = arg[len('-cat:'):]
        elif arg == '-shown':
            shown = True
        elif arg == '-justshown':
            shown = 'just'

    # Without a category there is nothing to iterate over; an empty title
    # would otherwise be passed straight to pywikibot.Category.
    if not cat:
        pywikibot.error('No category given; use -cat:<category name>.')
        return

    site = pywikibot.Site()
    cat_obj = pywikibot.Category(site, cat)
    generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
    for page in pagegenerators.PreloadingGenerator(generator, 100):
        pywikibot.stdout('Checking page {0}'.format(page.title()))
        # Restore literal slashes that were percent-encoded in the page URL.
        page_url = page.full_url().replace("%2F", "/")
        get_image_links(page_url, shown)

    # The counters are only read here, so no 'global' declaration is needed.
    pywikibot.stdout('Checked {0} page(s) and found {1} image(s) from oni2.net.'.format(pages_checked, oni2_images))
|
---|
87 |
|
---|
# Run the scan only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|
---|