Context Navigation

find_external_images.py@ 1181

Last change on this file since 1181 was 1181, checked in by iritscen, 19 months ago
ValBot: find_external_images.py: Changed argument "-inlined" to "-embedded". Now clearly distinguishing linked from embedded images. Placed some output under a "-dbg" argument.
File size: 7.3 KB

Line
1	# Find External Images
2	# by iritscen@yahoo.com
3	# Looks at each link on a page (or in all the pages in a category) and prints the links to
4	# images that are external to the wiki. Distinction is made between images hosted on oni2.net
5	# and on third-party domains. You must pass in one or both of the following args:
6	# -embedded: Show any plain URLs leading to images (these create embedded images, <img>)
7	# -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
8	#
9	# Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
10	#
11	# Recommended viewing width:
12	# \|---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---\|
13
14	import os
15
16	from urllib.parse import urljoin
17
18	import pywikibot
19	from pywikibot.bot import QuitKeyboardInterrupt
20	from pywikibot import pagegenerators
21	from pywikibot.comms.http import fetch
22	from pywikibot.specialbots import UploadRobot
23
24	import bs4
25	from bs4 import BeautifulSoup
26
# Module-level state shared by get_image_links() and main()

# Set to 1 by the "-dbg" argument to print per-page progress messages
debug = 0

# Running totals that are reported in the summary at the end of the run
pages_checked = 0
page_errors = 0
image_errors = 0
linked_ext_images = 0
linked_oni2_images = 0
embedded_ext_images = 0
embedded_oni2_images = 0

# A link counts as an image when its extension (lower-cased) is one of these
file_formats = (
    '.jpg',
    '.jpeg',
    '.png',
    '.gif',
    '.svg',
)

# HTML tag names to search for; filled in from the "-linked"/"-embedded" arguments
tag_names = []
38
# Returns "noun" unchanged when "quantity" is exactly 1, otherwise "noun" with an 's' appended
def plural_check(noun, quantity):
    """Return the singular or plural form of noun to match quantity.

    noun is expected to be a singular noun that pluralizes by adding "s".
    Any quantity other than 1 (including 0) yields the plural form.
    """
    if quantity == 1:
        return noun
    return noun + "s"
45
# Scrapes the HTML at the given URL for image tags
def get_image_links(page_url, page_name):
    """Report links to non-local images found in the HTML served at page_url.

    The page is fetched and parsed, and every tag whose name is listed in the
    module-level "tag_names" is inspected. The tag's "href" (or, failing that,
    "src") is taken as the link. Links that start with "/" or "#" and the
    gnu.org FDL link are skipped. A remaining link whose extension is in
    "file_formats" is printed and counted as a linked (<a>) or embedded (<img>)
    image, hosted either on oni2.net or on another domain.

    page_url:  URL of the rendered page to fetch.
    page_name: Page title, used only in the printed output.

    Returns None. Results are printed with pywikibot.stdout() and accumulated
    in the module-level counters.
    """
    global pages_checked
    global page_errors
    global image_errors
    global linked_ext_images
    global linked_oni2_images
    global embedded_ext_images
    global embedded_oni2_images
    name_printed = 0

    response = fetch(page_url)
    if response.status_code != 200:
        pywikibot.stdout(' ERROR: Could not load page at URL "{}".'.format(page_url))
        page_errors += 1
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    pages_checked += 1
    for tag in soup.find_all(tag_names):
        # Anchors carry their target in "href", images in "src"
        link = tag.get('href') or tag.get('src')

        # Filter out empty links
        if not link:
            if tag.get('id') == "top":
                continue

            # A tag without a "class" attribute yields None here, so fall back to an
            # empty list before testing membership
            class_names = tag.get('class') or []
            if "selflink" in class_names:
                continue

            if not name_printed and not debug:
                pywikibot.stdout('From page "{}":'.format(page_name))
                name_printed = 1
            pywikibot.stdout(' ERROR: Could not process mystery link {}.'.format(tag.get_text()))
            pywikibot.stdout(' Class is "{}".'.format(tag.get('class')))
            page_errors += 1
            continue

        # A "src" or "href" starting with "/" would be a link to a local page or file; a
        # link starting with "#" is a section link
        if link.startswith(('/', '#')):
            continue

        # The gnu.org link to the Free Documentation License is at the bottom of every page
        if link == "http://www.gnu.org/copyleft/fdl.html":
            continue

        # Determine if link is to an image
        _, ext = os.path.splitext(link)
        if ext.lower() not in file_formats:
            continue

        if not name_printed and not debug:
            pywikibot.stdout('Found on page "{}":'.format(page_name))
            name_printed = 1

        # Use the parsed tag name rather than the serialized markup to tell links from
        # embedded images
        is_linked = tag.name == 'a'
        is_embedded = tag.name == 'img'

        if "oni2.net" in link:
            if is_linked:
                pywikibot.stdout(' Linked oni2.net image: {}'.format(link))
                linked_oni2_images += 1
            elif is_embedded:
                pywikibot.stdout(' Embedded oni2.net image: {}'.format(link))
                embedded_oni2_images += 1
            else:
                # Neither <a> nor <img>: count the error and stop scanning this page
                pywikibot.stdout(' ERROR: Could not process oni2.net image link {}.'.format(link))
                image_errors += 1
                return
        else:
            if is_linked:
                pywikibot.stdout(' Linked external image: {}'.format(link))
                linked_ext_images += 1
            elif is_embedded:
                pywikibot.stdout(' Embedded external image: {}'.format(link))
                embedded_ext_images += 1
            else:
                # Neither <a> nor <img>: count the error and stop scanning this page
                pywikibot.stdout(' ERROR: Could not process external image link {}.'.format(link))
                image_errors += 1
                return
128
def main(*args):
    """Parse the command-line arguments, scan the requested pages and print a summary.

    Recognized arguments (after pywikibot.handle_args() has consumed its own):
      -cat:"Name"   check every page in the category, recursing into subcategories
      -page:"Name"  check a single page (ignored when -cat is also given)
      -linked       look at <a> tags
      -embedded     look at <img> tags
      -dbg          print the title of each page as it is checked

    At least one of -linked/-embedded and one of -cat/-page is required; otherwise
    a message is printed and the function returns without scanning anything. Any
    unrecognized argument also aborts the run.
    """
    global debug
    global tag_names

    search_cat = ''
    search_page = ''

    local_args = pywikibot.handle_args(args)

    for arg in local_args:
        if arg.startswith('-cat:'):
            search_cat = arg[5:]
        elif arg.startswith('-page:'):
            search_page = arg[6:]
        elif arg == '-linked':
            # Guard against the same flag being passed more than once
            if 'a' not in tag_names:
                tag_names += ['a']
        elif arg == '-embedded':
            if 'img' not in tag_names:
                tag_names += ['img']
        elif arg == '-dbg':
            debug = 1
        else:
            pywikibot.stdout('Unknown argument "{}".'.format(arg))
            return

    if not tag_names:
        pywikibot.stdout('You need to pass this script either "-linked", "-embedded", or both arguments in order to specify which image links you want to find.')
        return

    # Build the sequence of pages to check; a category takes precedence over a single page
    site = pywikibot.Site()
    if search_cat != '':
        cat_obj = pywikibot.Category(site, search_cat)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        pages = pagegenerators.PreloadingGenerator(generator, 100)
    elif search_page != '':
        pages = [pywikibot.Page(site, search_page)]
    else:
        pywikibot.stdout('No page name or category name received.')
        return

    for page in pages:
        if debug:
            pywikibot.stdout('Checking page "{}"'.format(page.title()))
        # Undo the percent-encoding of "/" in the URL before fetching
        page_url = page.full_url().replace("%2F", "/")
        get_image_links(page_url, page.title())

    chk_page_str = plural_check("page", pages_checked)
    err_page_str = plural_check("page", page_errors)
    err_img_str = plural_check("image", image_errors)
    linked_ext_image_str = plural_check("image", linked_ext_images)
    linked_oni2_image_str = plural_check("image", linked_oni2_images)
    embedded_ext_image_str = plural_check("image", embedded_ext_images)
    embedded_oni2_image_str = plural_check("image", embedded_oni2_images)

    pywikibot.stdout('-------------------------')
    pywikibot.stdout('Checked {0} {1} and failed to process {2} {3} on them. Failed to check {4} {5}.'.format(pages_checked, chk_page_str, image_errors, err_img_str, page_errors, err_page_str))
    # Only report the categories of image that were actually searched for
    if 'a' in tag_names:
        pywikibot.stdout('Found {0} linked {1} from oni2.net and {2} linked {3} from other domains.'.format(linked_oni2_images, linked_oni2_image_str, linked_ext_images, linked_ext_image_str))
    if 'img' in tag_names:
        pywikibot.stdout('Found {0} embedded {1} from oni2.net and {2} embedded {3} from other domains.'.format(embedded_oni2_images, embedded_oni2_image_str, embedded_ext_images, embedded_ext_image_str))
199
# Run the bot only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: ValBot/Python/find_external_images.py@ 1181

Download in other formats: