Context Navigation

source: ValBot/Python/find_external_images.py@ 1180

Last change on this file since 1180 was 1173, checked in by iritscen, 2 years ago
ValBot: check_intrawiki_section_links.py won't quit when a link cannot be understood; it will just move on. find_external_images.py is now polished and robust.
File size: 5.3 KB

Rev	Line
[1173]	1	# Find External Images
	2	# by iritscen@yahoo.com
	3	# Looks at each link on a page (or in all the pages in a category) and prints the links to
	4	# images that are externally-hosted. You must pass in one or both of the following args:
	5	# -inlined: Show any plain URLs leading to images (these create embedded images, <img>)
	6	# -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
	7	# Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
	8	#
	9	# Recommended viewing width:
	10	# \|---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---\|
	11
[1169]	12	import os
	13
	14	from urllib.parse import urljoin
	15
	16	import pywikibot
	17
	18	from pywikibot.bot import QuitKeyboardInterrupt
	19	from pywikibot import pagegenerators
	20	from pywikibot.comms.http import fetch
	21	from pywikibot.specialbots import UploadRobot
[1173]	22	#import bs4 # for listing members with dir()
[1169]	23	from bs4 import BeautifulSoup
	24
	25	pages_checked = 0
[1173]	26	page_errors = 0
	27	ext_images = 0
[1169]	28	oni2_images = 0
	29	file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')
[1173]	30	tag_names = []
[1169]	31
	32	# Scrapes the HTML at the given URL for image tags
def get_image_links(url):
    """Scrape the HTML at ``url`` and print links to externally-hosted images.

    The page is fetched and every tag whose name appears in the module-level
    ``tag_names`` list is inspected. The tag's "href" (or, failing that, its
    "src") is printed when its file extension is listed in ``file_formats``.

    Nothing is returned. Results are accumulated in the module-level counters:
    ``page_errors`` is incremented when the fetch does not return HTTP 200,
    ``pages_checked`` when the page is parsed, and ``oni2_images`` or
    ``ext_images`` for each image link found.
    """
    global pages_checked
    global page_errors
    global ext_images
    global oni2_images

    response = fetch(url)
    if response.status_code != 200:
        pywikibot.stdout(' ERROR: Could not load page at URL "{}"'.format(url))
        page_errors += 1
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    pages_checked += 1
    for tag in soup.find_all(tag_names):
        # Prefer "href" (<a>); fall back to "src" (<img>)
        link = tag.get('href') or tag.get('src')

        # Filter out empty links
        if not link:
            if tag.get('id') == "top":
                continue

            # A tag without a "class" attribute yields None here, so fall back
            # to an empty list to keep the membership test from raising
            class_names = tag.get('class') or []
            if "selflink" in class_names:
                continue

            # get_text must be called; formatting the bare attribute would
            # print the bound-method repr instead of the link's text
            pywikibot.stdout(' Could not process mystery link {}'.format(tag.get_text()))
            pywikibot.stdout(' Class is "{}".'.format(tag.get('class')))
            continue

        # A "src" or "href" starting with "/" would be a link to a local page or file; a
        # link starting with "#" is a section link
        if link.startswith(('/', '#')):
            continue

        # The gnu.org link to the Free Documentation License is at the bottom of every page
        if link == "http://www.gnu.org/copyleft/fdl.html":
            continue

        _, ext = os.path.splitext(link)
        if ext.lower() not in file_formats:
            continue

        if "oni2.net" in link:
            pywikibot.stdout(' Oni2.net image: {}'.format(link))
            oni2_images += 1
        else:
            pywikibot.stdout(' External image: {}'.format(link))
            ext_images += 1
[1169]	86
	87	def main(*args):
[1173]	88	global pages_checked
	89	global page_errors
	90	global ext_images
	91	global oni2_images
	92	global tag_names
[1169]	93
[1173]	94	cat_name = ''
	95	page_name = ''
	96
	97	#pywikibot.stdout('The members of the bs4.element.Tag class are:')
	98	#pywikibot.stdout(format(dir(bs4.element.Tag)))
	99
[1169]	100	local_args = pywikibot.handle_args(args)
	101	genFactory = pagegenerators.GeneratorFactory()
	102
	103	for arg in local_args:
	104	if arg.startswith('-cat:'):
[1173]	105	cat_name = arg[5:]
	106	elif arg.startswith('-page:'):
	107	page_name = arg[6:]
	108	elif arg == '-linked':
	109	tag_names += ['a']
	110	elif arg == '-inlined':
	111	tag_names += ['img']
[1169]	112	else:
[1173]	113	pywikibot.stdout('Unknown argument "{}".'.format(arg))
	114	return
[1169]	115
[1173]	116	if not tag_names:
	117	pywikibot.stdout('You need to pass this script either "-linked", "-inlined", or both arguments in order to specify which image links you want to find.')
	118	return
	119
[1169]	120	site = pywikibot.Site()
[1173]	121	if cat_name != '':
	122	cat_obj = pywikibot.Category(site, cat_name)
	123	generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
	124	for page in pagegenerators.PreloadingGenerator(generator, 100):
	125	pywikibot.stdout('Checking page "{}"'.format(page.title()))
	126	page_url = page.full_url().replace("%2F", "/")
	127	get_image_links(page_url)
	128	elif page_name != '':
	129	page = pywikibot.Page(site, page_name)
	130	pywikibot.stdout('Checking page "{}"'.format(page.title()))
[1169]	131	page_url = page.full_url().replace("%2F", "/")
[1173]	132	get_image_links(page_url)
	133	else:
	134	pywikibot.stdout('No page name or category name received.'.format(arg))
	135	return
[1169]	136
[1173]	137	chk_page_str = "pages"
	138	if pages_checked == 1:
	139	chk_page_str = "page"
[1169]	140
[1173]	141	err_page_str = "pages"
	142	if page_errors == 1:
	143	err_page_str = "page"
	144
	145	ext_image_str = "images"
	146	if ext_images == 1:
	147	ext_image_str = "image"
	148
	149	oni2_image_str = "images"
	150	if oni2_images == 1:
	151	oni2_image_str = "image"
	152
	153	pywikibot.stdout('-------------------------')
	154	pywikibot.stdout('Checked {0} {1} and failed to check {2} {3}.'.format(pages_checked, chk_page_str, page_errors, err_page_str))
	155	pywikibot.stdout('Found {0} {1} from oni2.net and {2} {3} from other domains.'.format(oni2_images, oni2_image_str, ext_images, ext_image_str))
	156
# Run the scan only when this file is executed as a script, not when imported.
if "__main__" == __name__:
    main()

Note: See TracBrowser for help on using the repository browser.

Download in other formats: