# Find External Images
# by iritscen@yahoo.com
# Looks at each link on a page (or in all the pages in a category) and prints the links to
# images that are external to the wiki. Distinction is made between images hosted on oni2.net
# and on third-party domains. You must pass in one or both of the following args:
# -embedded: Show any plain URLs leading to images (these create embedded images, )
# -linked: Show any external URLs ("[URL]") leading to images (these create links, )
#
# Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
#
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|

import os

from urllib.parse import urljoin

import pywikibot
from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot import pagegenerators
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot

import bs4
from bs4 import BeautifulSoup

# Initialize globals
debug = 0                  # nonzero suppresses/changes per-page headers below
pages_checked = 0          # pages successfully fetched and parsed
page_errors = 0            # pages that failed to load or links that could not be classified
image_errors = 0
linked_ext_images = 0      # "[URL]"-style links to third-party-hosted images
linked_oni2_images = 0     # "[URL]"-style links to oni2.net-hosted images
embedded_ext_images = 0    # bare-URL (embedded) third-party images
embedded_oni2_images = 0   # bare-URL (embedded) oni2.net images
file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')  # extensions treated as images
tag_names = []  # HTML tag names to scan; presumably populated from the args elsewhere — TODO confirm


# Pass this function a singular noun and it will add an 's' to "noun" if "quantity" is not 1
def plural_check(noun: str, quantity: int) -> str:
    """Return *noun* with a trailing 's' appended when *quantity* is not 1."""
    if quantity != 1:
        return noun + "s"
    else:
        return noun


# Scrapes the HTML at the given URL for image tags
def get_image_links(page_url: str, page_name: str) -> None:
    """Fetch *page_url*, scan its tags for links to external images, and
    update the module-level counters; *page_name* labels the output only."""
    global debug
    global pages_checked
    global page_errors
    global image_errors
    global linked_ext_images
    global linked_oni2_images
    global embedded_ext_images
    global embedded_oni2_images
    global file_formats
    global tag_names

    name_printed = 0  # ensures the page-name header is printed at most once per page

    response = fetch(page_url)
    if response.status_code != 200:
        pywikibot.stdout(' ERROR: Could not load page at URL "{}".'.format(page_url))
        page_errors += 1
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    pages_checked += 1
    for tag in soup.findAll(tag_names):
        # Prefer an anchor-style target; fall back to an embedded source
        link = tag.get('href')
        if not link:
            link = tag.get('src')

        # Filter out empty links
        if not link:
            if tag.get('id') == "top":
                continue
            class_names = tag.get('class')
            # NOTE(review): tag.get('class') returns None when the attribute is
            # absent, and `"selflink" in None` raises TypeError — confirm that
            # every tag reaching this point carries a class attribute.
            if "selflink" in class_names:
                continue
            if not name_printed and not debug:
                pywikibot.stdout('From page "{}":'.format(page_name))
                name_printed = 1
            # NOTE(review): tag.get_text is passed without parentheses, so the
            # bound-method repr is printed instead of the tag's text — this
            # looks like it was meant to be tag.get_text().
            pywikibot.stdout(' ERROR: Could not process mystery link {}.'.format(tag.get_text))
            pywikibot.stdout(' Class is "{}".'.format(tag.get('class')))
            page_errors += 1
            continue

        # A "src" or "href" starting with "/" would be a link to a local page or file; a
        # link starting with "#" is a section link
        if link.startswith('/') or link.startswith('#'):
            continue

        # The gnu.org link to the Free Documentation License is at the bottom of every page
        if link == "http://www.gnu.org/copyleft/fdl.html":
            continue

        # Determine if link is to an image
        _, ext = os.path.splitext(link)
        if ext.lower() in file_formats:
            if not name_printed and not debug:
                pywikibot.stdout('Found on page "{}":'.format(page_name))
                name_printed = 1
            tag_text = format(tag)  # the tag's full HTML markup as a string
            if "oni2.net" in link:
                if tag_text.startswith('