Context Navigation

find_external_images.py@ 1188

Last change on this file since 1188 was 1181, checked in by iritscen, 19 months ago
ValBot: find_external_images.py: Changed argument "-inlined" to "-embedded". Now clearly distinguishing linked from embedded images. Placed some output under a "-dbg" argument.
File size: 7.3 KB

Rev	Line
[1173]	1	# Find External Images
	2	# by iritscen@yahoo.com
	3	# Looks at each link on a page (or in all the pages in a category) and prints the links to
[1181]	4	# images that are external to the wiki. Distinction is made between images hosted on oni2.net
	5	# and on third-party domains. You must pass in one or both of the following args:
	6	# -embedded: Show any plain URLs leading to images (these create embedded images, <img>)
	7	# -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
	8	#
[1173]	9	# Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
	10	#
	11	# Recommended viewing width:
	12	# \|---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---\|
	13
[1169]	14	import os
	15
	16	from urllib.parse import urljoin
	17
	18	import pywikibot
	19	from pywikibot.bot import QuitKeyboardInterrupt
	20	from pywikibot import pagegenerators
	21	from pywikibot.comms.http import fetch
	22	from pywikibot.specialbots import UploadRobot
[1181]	23
	24	import bs4
[1169]	25	from bs4 import BeautifulSoup
	26
# Initialize globals
# Debug switch; 0 means off
debug = 0

# Running tallies; all start at zero
pages_checked = page_errors = image_errors = 0
linked_ext_images = linked_oni2_images = 0
embedded_ext_images = embedded_oni2_images = 0

# File suffixes that are treated as images, and the HTML tag names to search for (empty to start)
file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')
tag_names = list()
[1169]	38
# Returns "noun" untouched when "quantity" is exactly 1; for any other quantity the noun comes
# back with an 's' appended
def plural_check(noun, quantity):
    return noun if quantity == 1 else noun + "s"
	45
# Scrapes the HTML at the given URL for image tags
def get_image_links(page_url, page_name):
    """Fetch a rendered page and report the external image URLs found in it.

    Every tag whose name is in the module-level "tag_names" list is examined.
    Its "href" (or, failing that, "src") is ignored when it starts with "/" or
    "#" or is the gnu.org FDL link. Otherwise, if it ends in one of the
    suffixes in "file_formats", it is printed and tallied as linked (<a>) or
    embedded (<img>), and as hosted on oni2.net or on another domain.

    Args:
        page_url: URL of the page to download.
        page_name: Page title; only used in the printed output.

    Returns:
        None. Findings are printed with pywikibot.stdout() and added to the
        module-level counters. A page that does not load with status 200 is
        counted in "page_errors" and not scanned.
    """
    # Only the counters are rebound here; debug, file_formats and tag_names are read-only
    global pages_checked
    global page_errors
    global image_errors
    global linked_ext_images
    global linked_oni2_images
    global embedded_ext_images
    global embedded_oni2_images
    name_printed = False

    response = fetch(page_url)
    if response.status_code != 200:
        pywikibot.stdout(' ERROR: Could not load page at URL "{}".'.format(page_url))
        page_errors += 1
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    pages_checked += 1
    for tag in soup.find_all(tag_names):
        link = tag.get('href') or tag.get('src')

        # Filter out empty links
        if not link:
            if tag.get('id') == "top":
                continue

            # A tag without a "class" attribute yields None, which cannot be searched with
            # "in"; fall back to an empty list so such tags reach the error report below
            class_names = tag.get('class') or []
            if "selflink" in class_names:
                continue

            if not name_printed and not debug:
                pywikibot.stdout('From page "{}":'.format(page_name))
                name_printed = True
            # get_text must be called; formatting the bare attribute prints a method repr
            pywikibot.stdout(' ERROR: Could not process mystery link {}.'.format(tag.get_text()))
            pywikibot.stdout(' Class is "{}".'.format(tag.get('class')))
            page_errors += 1
            continue

        # A "src" or "href" starting with "/" would be a link to a local page or file; a
        # link starting with "#" is a section link
        if link.startswith(('/', '#')):
            continue

        # The gnu.org link to the Free Documentation License is at the bottom of every page
        if link == "http://www.gnu.org/copyleft/fdl.html":
            continue

        # Determine if link is to an image
        _, ext = os.path.splitext(link)
        if ext.lower() not in file_formats:
            continue

        if not name_printed and not debug:
            pywikibot.stdout('Found on page "{}":'.format(page_name))
            name_printed = True

        # Same wording is used for both hosts; only the label and the counter differ
        on_oni2 = "oni2.net" in link
        origin = 'oni2.net' if on_oni2 else 'external'
        if tag.name == 'a':
            pywikibot.stdout(' Linked {} image: {}'.format(origin, link))
            if on_oni2:
                linked_oni2_images += 1
            else:
                linked_ext_images += 1
        elif tag.name == 'img':
            pywikibot.stdout(' Embedded {} image: {}'.format(origin, link))
            if on_oni2:
                embedded_oni2_images += 1
            else:
                embedded_ext_images += 1
        else:
            # Neither <a> nor <img>: report it and stop scanning this page
            pywikibot.stdout(' ERROR: Could not process {} image link {}.'.format(origin, link))
            image_errors += 1
            return
[1169]	128
	129	def main(*args):
[1181]	130	global debug
	131	global pages_checked
	132	global page_errors
	133	global image_errors
	134	global linked_ext_images
	135	global linked_oni2_images
	136	global embedded_ext_images
	137	global embedded_oni2_images
	138	global tag_names
[1169]	139
[1181]	140	search_cat = ''
	141	search_page = ''
[1173]	142
[1181]	143	#pywikibot.stdout('The members of the bs4.element.Tag class are:')
	144	#pywikibot.stdout(format(dir(bs4.element.Tag)))
[1173]	145
[1181]	146	local_args = pywikibot.handle_args(args)
	147	genFactory = pagegenerators.GeneratorFactory()
[1169]	148
[1181]	149	for arg in local_args:
	150	if arg.startswith('-cat:'):
	151	search_cat = arg[5:]
	152	elif arg.startswith('-page:'):
	153	search_page = arg[6:]
	154	elif arg == '-linked':
	155	tag_names += ['a']
	156	elif arg == '-embedded':
	157	tag_names += ['img']
	158	elif arg == '-dbg':
	159	debug = 1
	160	else:
	161	pywikibot.stdout('Unknown argument "{}".'.format(arg))
	162	return
[1169]	163
[1181]	164	if not tag_names:
	165	pywikibot.stdout('You need to pass this script either "-linked", "-embedded", or both arguments in order to specify which image links you want to find.')
	166	return
[1173]	167
[1181]	168	site = pywikibot.Site()
	169	if search_cat != '':
	170	cat_obj = pywikibot.Category(site, search_cat)
	171	generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
	172	for page in pagegenerators.PreloadingGenerator(generator, 100):
	173	if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
	174	page_url = page.full_url().replace("%2F", "/")
	175	get_image_links(page_url, page.title())
	176	elif search_page != '':
	177	page = pywikibot.Page(site, search_page)
	178	if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
	179	page_url = page.full_url().replace("%2F", "/")
	180	get_image_links(page_url, page.title())
	181	else:
	182	pywikibot.stdout('No page name or category name received.'.format(arg))
	183	return
[1169]	184
[1181]	185	chk_page_str = plural_check("page", pages_checked)
	186	err_page_str = plural_check("page", page_errors)
	187	err_img_str = plural_check("image", image_errors)
	188	linked_ext_image_str = plural_check("image", linked_ext_images)
	189	linked_oni2_image_str = plural_check("image", linked_oni2_images)
	190	embedded_ext_image_str = plural_check("image", embedded_ext_images)
	191	embedded_oni2_image_str = plural_check("image", embedded_oni2_images)
[1169]	192
[1181]	193	pywikibot.stdout('-------------------------')
	194	pywikibot.stdout('Checked {0} {1} and failed to process {2} {3} on them. Failed to check {4} {5}.'.format(pages_checked, chk_page_str, image_errors, err_img_str, page_errors, err_page_str))
	195	if 'a' in tag_names:
	196	pywikibot.stdout('Found {0} linked {1} from oni2.net and {2} linked {3} from other domains.'.format(linked_oni2_images, linked_oni2_image_str, linked_ext_images, linked_ext_image_str))
	197	if 'img' in tag_names:
	198	pywikibot.stdout('Found {0} embedded {1} from oni2.net and {2} embedded {3} from other domains.'.format(embedded_oni2_images, embedded_oni2_image_str, embedded_ext_images, embedded_ext_image_str))
[1173]	199
[1169]	200	if __name__ == '__main__':
[1181]	201	main()

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: ValBot/Python/find_external_images.py@ 1188

Download in other formats: