# Find External Images
# by iritscen@yahoo.com
# Looks at each link on a page (or in all the pages in a category) and prints the links to
# images that are externally-hosted. You must pass in one or both of the following args:
# -inlined: Show any plain URLs leading to images (these create embedded images, <img>)
# -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
# Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
#
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|

import os

from urllib.parse import urljoin

import pywikibot
from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot import pagegenerators
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot

#import bs4 # for listing members with dir()
from bs4 import BeautifulSoup

# Running totals, updated by get_image_links() and reported by main()
pages_checked = 0
page_errors = 0
ext_images = 0
oni2_images = 0

# Lower-case file suffixes that are treated as images
file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')

# HTML tag names to scan; filled in by main() from the -linked/-inlined arguments
tag_names = []


def _link_extension(link):
    """Return the lower-cased file extension of the given link.

    Any fragment ("#...") or query string ("?...") is dropped first, so that a
    link such as "http://host/pic.PNG?123" yields ".png" instead of ".png?123".
    """
    path = link.partition('#')[0].partition('?')[0]
    _, ext = os.path.splitext(path)
    return ext.lower()


def _pluralize(count, singular, plural):
    """Return 'singular' when count is exactly 1, otherwise 'plural'."""
    return singular if count == 1 else plural


# Scrapes the HTML at the given URL for image tags
def get_image_links(url):
    """Fetch the page at 'url' and print each external image link found in it.

    Only the tags listed in the module-level 'tag_names' are examined. The
    module-level counters are updated as a side effect: 'page_errors' when the
    page does not load with status 200, otherwise 'pages_checked', plus
    'oni2_images' or 'ext_images' for every image link that is printed.
    Nothing is returned.
    """
    global pages_checked
    global page_errors
    global ext_images
    global oni2_images

    response = fetch(url)
    if response.status_code != 200:
        pywikibot.stdout(' ERROR: Could not load page at URL "{}"'.format(url))
        page_errors += 1
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    pages_checked += 1

    for tag in soup.find_all(tag_names):
        # Prefer "href" (anchors) and fall back to "src" (images)
        link = tag.get('href') or tag.get('src')

        # Filter out empty links
        if not link:
            if tag.get('id') == "top":
                continue

            # A tag without a "class" attribute yields None, which cannot be searched
            class_names = tag.get('class') or []
            if "selflink" in class_names:
                continue

            pywikibot.stdout(' Could not process mystery link {}'.format(tag.get_text()))
            pywikibot.stdout(' Class is "{}".'.format(tag.get('class')))
            continue

        # A "src" or "href" starting with "/" would be a link to a local page or file; a
        # link starting with "#" is a section link
        if link.startswith(('/', '#')):
            continue

        # The gnu.org link to the Free Documentation License is at the bottom of every page
        if link == "http://www.gnu.org/copyleft/fdl.html":
            continue

        if _link_extension(link) in file_formats:
            if "oni2.net" in link:
                pywikibot.stdout(' Oni2.net image: {}'.format(link))
                oni2_images += 1
            else:
                pywikibot.stdout(' External image: {}'.format(link))
                ext_images += 1
        #else:
            #pywikibot.stdout(' Other external link: {}'.format(link))


def _check_page(page):
    """Announce the given wiki page and scan its rendered HTML for image links."""
    pywikibot.stdout('Checking page "{}"'.format(page.title()))
    # full_url() percent-encodes slashes in the title; restore them before fetching
    page_url = page.full_url().replace("%2F", "/")
    get_image_links(page_url)


def main(*args):
    """Parse the command-line arguments, scan the requested pages, print a summary.

    Recognized script arguments are -cat:<name>, -page:<name>, -linked and
    -inlined. At least one of -linked/-inlined is required. When both -cat and
    -page are given, only the category is processed. An unknown argument, or
    the absence of a required one, prints a message and returns without
    scanning anything.
    """
    global tag_names

    cat_name = ''
    page_name = ''

    #pywikibot.stdout('The members of the bs4.element.Tag class are:')
    #pywikibot.stdout(format(dir(bs4.element.Tag)))

    # handle_args() consumes pywikibot's own global options and returns the rest
    local_args = pywikibot.handle_args(args)
    for arg in local_args:
        if arg.startswith('-cat:'):
            cat_name = arg[5:]
        elif arg.startswith('-page:'):
            page_name = arg[6:]
        elif arg == '-linked':
            tag_names += ['a']
        elif arg == '-inlined':
            tag_names += ['img']
        else:
            pywikibot.stdout('Unknown argument "{}".'.format(arg))
            return

    if not tag_names:
        pywikibot.stdout('You need to pass this script either "-linked", "-inlined", or both arguments in order to specify which image links you want to find.')
        return

    site = pywikibot.Site()
    if cat_name != '':
        cat_obj = pywikibot.Category(site, cat_name)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        for page in pagegenerators.PreloadingGenerator(generator, 100):
            _check_page(page)
    elif page_name != '':
        _check_page(pywikibot.Page(site, page_name))
    else:
        pywikibot.stdout('No page name or category name received.')
        return

    chk_page_str = _pluralize(pages_checked, "page", "pages")
    err_page_str = _pluralize(page_errors, "page", "pages")
    ext_image_str = _pluralize(ext_images, "image", "images")
    oni2_image_str = _pluralize(oni2_images, "image", "images")

    pywikibot.stdout('-------------------------')
    pywikibot.stdout('Checked {0} {1} and failed to check {2} {3}.'.format(pages_checked, chk_page_str, page_errors, err_page_str))
    pywikibot.stdout('Found {0} {1} from oni2.net and {2} {3} from other domains.'.format(oni2_images, oni2_image_str, ext_images, ext_image_str))


if __name__ == '__main__':
    main()