source: ValBot/Python/find_external_images.py@ 1170

Last change on this file since 1170 was 1169, checked in by iritscen, 3 years ago

ValBot: Reorganized files. Updated docs with more helpful information.

File size: 2.5 KB
Line 
1import os
2
3from urllib.parse import urljoin
4
5import pywikibot
6
7from pywikibot.bot import QuitKeyboardInterrupt
8from pywikibot import pagegenerators
9from pywikibot.comms.http import fetch
10from pywikibot.specialbots import UploadRobot
11from bs4 import BeautifulSoup
12
# Not referenced by any other code visible in this file.
first_run = False

# Running totals: get_image_links() increments them, main() reports them.
pages_checked = 0
oni2_images = 0

# File extensions (compared in lower case) that mark a link as an image.
file_formats = (
    '.jpg',
    '.jpeg',
    '.png',
    '.gif',
    '.svg',
)
17
18# Scrapes the HTML at the given URL for image tags
def get_image_links(url, shown):
    """Scrape the HTML at ``url`` and return the image links found in it.

    A link counts as an image when the path part of its ``src``/``href``
    ends in one of the extensions in ``file_formats`` (case-insensitive).

    Args:
        url: Address of the page to fetch.
        shown: Selects which tags are inspected. A falsy value inspects
            only ``<a>`` tags, the string ``'just'`` inspects only
            ``<img>`` tags, and any other truthy value inspects both.

    Returns:
        A list of the matching link strings exactly as they appear in the
        HTML (they are not resolved against ``url``). The list is empty
        when the page does not answer with HTTP status 200.

    Side effects:
        Increments the module-level ``pages_checked`` for every page that
        was fetched successfully, and ``oni2_images`` for every image link
        containing ``"oni2.net"``.
    """
    # Function-scope import: the file only imports ``urljoin`` at the top.
    from urllib.parse import urlsplit

    global oni2_images
    global pages_checked

    links = []

    response = fetch(url)
    if response.status_code != 200:
        pywikibot.output('Skipping url: {}'.format(url))
        return links

    soup = BeautifulSoup(response.text, 'html.parser')
    pages_checked += 1

    if not shown:
        tagname = 'a'
    elif shown == 'just':
        tagname = 'img'
    else:
        tagname = ['a', 'img']

    # ``find_all`` is the current name of the deprecated ``findAll``.
    for tag in soup.find_all(tagname):
        # Prefer ``src`` (images); fall back to ``href`` (anchors).
        link = tag.get('src', tag.get('href', None))
        if not link:
            continue

        # Take the extension from the URL path only, so a query string or
        # fragment ("pic.png?v=2", "page#x.png") cannot hide or fake it.
        _, ext = os.path.splitext(urlsplit(link).path)
        if ext.lower() not in file_formats:
            continue

        pywikibot.output('Found image link {0}.'.format(ext))
        # The original never filled the list it returned.
        links.append(link)

        # NOTE(review): the source's indentation was lost; this check is
        # assumed to be nested inside the image-extension check.
        if "oni2.net" in link:
            pywikibot.stdout('Found an oni2.net image: {0}'.format(link))
            oni2_images += 1

    return links
51
52
def main(*args):
    """Scan every page of a wiki category for image links.

    Recognized script arguments (after Pywikibot's global arguments have
    been consumed by ``pywikibot.handle_args``):

    * ``-cat:<name>``  -- category whose pages are scanned, recursing into
      subcategories (required).
    * ``-shown``       -- inspect both ``<a>`` and ``<img>`` tags.
    * ``-justshown``   -- inspect only ``<img>`` tags.

    Without ``-shown``/``-justshown`` only ``<a>`` tags are inspected.
    Any other argument is ignored.

    Args:
        *args: Command-line style arguments; passed to
            ``pywikibot.handle_args``.
    """
    cat = ''
    shown = False

    for arg in pywikibot.handle_args(args):
        if arg.startswith('-cat:'):
            cat = arg[len('-cat:'):]
        elif arg == '-shown':
            shown = True
        elif arg == '-justshown':
            shown = 'just'

    # Without a category there is nothing to iterate over; stop with a
    # clear message instead of building a Category from an empty title.
    if not cat:
        pywikibot.error('No category given; use -cat:<category name>.')
        return

    site = pywikibot.Site()
    cat_obj = pywikibot.Category(site, cat)
    generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
    for page in pagegenerators.PreloadingGenerator(generator, 100):
        pywikibot.stdout('Checking page {0}'.format(page.title()))
        # Restore literal slashes that were percent-encoded in the URL.
        page_url = page.full_url().replace("%2F", "/")
        get_image_links(page_url, shown)

    # The counters are module-level names that are only read here, so no
    # ``global`` declaration is needed.
    pywikibot.stdout('Checked {0} page(s) and found {1} image(s) from oni2.net.'.format(pages_checked, oni2_images))
87
# Run the scan only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Note: See TracBrowser for help on using the repository browser.