# Find External Images
# by iritscen@yahoo.com
# Looks at each link on a page (or in all the pages in a category) and prints the links to
# images that are external to the wiki. Distinction is made between images hosted on oni2.net
# and on third-party domains. You must pass in one or both of the following args:
# -embedded: Show any plain URLs leading to images (these create embedded images)
# -linked: Show any external URLs ("[URL]") leading to images (these create links)
#
# Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
#
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|
import os
from urllib.parse import urljoin
import pywikibot
from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot import pagegenerators
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot
import bs4
from bs4 import BeautifulSoup
# Initialize globals
debug = 0  # when nonzero, the per-page "From/Found on page ..." header is not printed (see get_image_links)
pages_checked = 0  # pages successfully fetched and scanned
page_errors = 0  # pages or links that could not be processed
image_errors = 0  # image-related errors; incremented in code not visible in this chunk — TODO confirm
linked_ext_images = 0  # "[URL]"-style links to images on third-party domains (per file header) — counted later
linked_oni2_images = 0  # "[URL]"-style links to images hosted on oni2.net (per file header) — counted later
embedded_ext_images = 0  # bare-URL (embedded) images on third-party domains (per file header) — counted later
embedded_oni2_images = 0  # bare-URL (embedded) images hosted on oni2.net (per file header) — counted later
file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')  # extensions treated as image links (compared lowercased)
tag_names = []  # HTML tag names passed to soup.findAll(); presumably populated elsewhere in the file — verify
def plural_check(noun, quantity):
    """Return the singular *noun* unchanged when *quantity* is exactly 1,
    otherwise return it with an 's' appended (naive English pluralization)."""
    return noun if quantity == 1 else noun + "s"
# Scrapes the HTML at the given URL for image tags
def get_image_links(page_url, page_name):
global debug
global pages_checked
global page_errors
global image_errors
global linked_ext_images
global linked_oni2_images
global embedded_ext_images
global embedded_oni2_images
global file_formats
global tag_names
name_printed = 0
response = fetch(page_url)
if response.status_code != 200:
pywikibot.stdout(' ERROR: Could not load page at URL "{}".'.format(page_url))
page_errors += 1
return
soup = BeautifulSoup(response.text, 'html.parser')
pages_checked += 1
for tag in soup.findAll(tag_names):
link = tag.get('href')
if not link:
link = tag.get('src')
# Filter out empty links
if not link:
if tag.get('id') == "top":
continue
class_names = tag.get('class')
if "selflink" in class_names:
continue
if not name_printed and not debug:
pywikibot.stdout('From page "{}":'.format(page_name))
name_printed = 1
pywikibot.stdout(' ERROR: Could not process mystery link {}.'.format(tag.get_text))
pywikibot.stdout(' Class is "{}".'.format(tag.get('class')))
page_errors += 1
continue
# A "src" or "href" starting with "/" would be a link to a local page or file; a
# link starting with "#" is a section link
if link.startswith('/') or link.startswith('#'):
continue
# The gnu.org link to the Free Documentation License is at the bottom of every page
if link == "http://www.gnu.org/copyleft/fdl.html":
continue
# Determine if link is to an image
_, ext = os.path.splitext(link)
if ext.lower() in file_formats:
if not name_printed and not debug:
pywikibot.stdout('Found on page "{}":'.format(page_name))
name_printed = 1
tag_text = format(tag)
if "oni2.net" in link:
if tag_text.startswith('