# Find External Images
# by iritscen@yahoo.com
# Looks at each link on a page (or in all the pages in a category) and prints the links to
# images that are externally-hosted. You must pass in one or both of the following args:
# -inlined: Show any plain URLs leading to images (these create embedded images, <img>)
# -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
# Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
#
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|
import os
from urllib.parse import urljoin
import pywikibot
from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot import pagegenerators
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot
#import bs4 # for listing members with dir()
from bs4 import BeautifulSoup
# Running totals, updated by get_image_links() and reported at the end of main()
pages_checked = page_errors = 0
ext_images = oni2_images = 0
# A link counts as an image when its lowercased extension is one of these
file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')
# HTML tag names to examine; filled in by main() from "-linked" and "-inlined"
tag_names = list()
# Scrapes the HTML at the given URL for image tags
def get_image_links(url):
    """Fetch the page at `url` and report the image links found in its HTML.

    Every tag whose name is listed in the module-level `tag_names` is examined. The tag's
    "href" attribute is used as the link, falling back to "src" when there is no "href".
    Links that start with "/" or "#", and the gnu.org FDL link, are skipped. A remaining
    link is reported when its extension (case-insensitive) is in `file_formats`.

    Side effects on module-level counters:
      pages_checked -- incremented when the page loads with status 200
      page_errors   -- incremented when the page returns any other status
      oni2_images   -- incremented for each image link containing "oni2.net"
      ext_images    -- incremented for each other image link

    Args:
        url: address of the rendered page to scan.

    Returns:
        None. Findings are printed with pywikibot.stdout().
    """
    # Only the names that are rebound here need a "global" declaration; file_formats and
    # tag_names are merely read
    global pages_checked
    global page_errors
    global ext_images
    global oni2_images

    response = fetch(url)
    if response.status_code != 200:
        pywikibot.stdout(' ERROR: Could not load page at URL "{}"'.format(url))
        page_errors += 1
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    pages_checked += 1
    for tag in soup.find_all(tag_names):
        link = tag.get('href') or tag.get('src')

        # Filter out empty links
        if not link:
            # NOTE(review): presumably the page-top anchor, which carries no URL -- confirm
            if tag.get('id') == "top":
                continue

            # A tag without a "class" attribute yields None, which cannot be searched with
            # "in", so fall back to an empty list
            class_names = tag.get('class') or []
            # NOTE(review): "selflink" looks like the class given to a page's link to
            # itself -- confirm against the wiki's HTML
            if "selflink" in class_names:
                continue

            pywikibot.stdout(' Could not process mystery link {}'.format(tag.get_text()))
            pywikibot.stdout(' Class is "{}".'.format(tag.get('class')))
            continue

        # A "src" or "href" starting with "/" would be a link to a local page or file; a
        # link starting with "#" is a section link
        if link.startswith(('/', '#')):
            continue

        # The gnu.org link to the Free Documentation License is at the bottom of every page
        if link == "http://www.gnu.org/copyleft/fdl.html":
            continue

        _, ext = os.path.splitext(link)
        if ext.lower() not in file_formats:
            # Any other external link is not an image and is deliberately not reported
            continue

        if "oni2.net" in link:
            pywikibot.stdout(' Oni2.net image: {}'.format(link))
            oni2_images += 1
        else:
            pywikibot.stdout(' External image: {}'.format(link))
            ext_images += 1
def _plural(count, singular):
    """Return `singular` as-is when `count` is 1, otherwise with an "s" appended."""
    return singular if count == 1 else singular + 's'


def _check_page(page):
    """Announce `page`, then scan its rendered HTML with get_image_links()."""
    pywikibot.stdout('Checking page "{}"'.format(page.title()))
    # NOTE(review): presumably restores literal slashes (e.g. in subpage titles) that
    # full_url() percent-encodes -- confirm
    get_image_links(page.full_url().replace("%2F", "/"))


def main(*args):
    """Parse the command line, scan the requested page(s) and print a summary.

    Recognized arguments (anything else aborts with an "Unknown argument" message):
      -cat:"Some Category"  scan every page yielded for that category (recursive)
      -page:"Some Page"     scan a single page; ignored when -cat: is also given
      -linked               examine "a" tags
      -inlined              examine "img" tags

    At least one of -linked/-inlined and one of -cat:/-page: must be supplied; otherwise a
    message is printed and the function returns without scanning anything.

    Args:
        *args: command-line arguments, handed to pywikibot.handle_args().

    Returns:
        None.
    """
    cat_name = ''
    page_name = ''

    local_args = pywikibot.handle_args(args)
    for arg in local_args:
        if arg.startswith('-cat:'):
            cat_name = arg[5:]
        elif arg.startswith('-page:'):
            page_name = arg[6:]
        elif arg == '-linked':
            # Appending in place updates the module-level list that get_image_links()
            # reads; skip duplicates if the option is repeated
            if 'a' not in tag_names:
                tag_names.append('a')
        elif arg == '-inlined':
            if 'img' not in tag_names:
                tag_names.append('img')
        else:
            pywikibot.stdout('Unknown argument "{}".'.format(arg))
            return

    if not tag_names:
        pywikibot.stdout('You need to pass this script either "-linked", "-inlined", or both arguments in order to specify which image links you want to find.')
        return

    site = pywikibot.Site()
    if cat_name != '':
        cat_obj = pywikibot.Category(site, cat_name)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        for page in pagegenerators.PreloadingGenerator(generator, 100):
            _check_page(page)
    elif page_name != '':
        _check_page(pywikibot.Page(site, page_name))
    else:
        pywikibot.stdout('No page name or category name received.')
        return

    # The counters below are module-level totals maintained by get_image_links()
    pywikibot.stdout('-------------------------')
    pywikibot.stdout('Checked {0} {1} and failed to check {2} {3}.'.format(
        pages_checked, _plural(pages_checked, 'page'),
        page_errors, _plural(page_errors, 'page')))
    pywikibot.stdout('Found {0} {1} from oni2.net and {2} {3} from other domains.'.format(
        oni2_images, _plural(oni2_images, 'image'),
        ext_images, _plural(ext_images, 'image')))
# Run the scan only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()