# Find External Images
# by iritscen@yahoo.com
# Looks at each link on a page (or in all the pages in a category) and prints the links to
# images that are external to the wiki. Distinction is made between images hosted on oni2.net
# and on third-party domains. You must pass in one or both of the following args:
#  -embedded: Show any plain URLs leading to images (these create embedded images, <img>)
#  -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
# 
# Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
#
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|

import os

from urllib.parse import urljoin

import pywikibot
from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot import pagegenerators
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot

import bs4
from bs4 import BeautifulSoup

# Initialize globals
debug = 0                   # set by -dbg; suppresses per-page headers, adds page-by-page trace
pages_checked = 0           # pages successfully fetched and scanned
page_errors = 0             # pages that failed to load or links that couldn't be classified
image_errors = 0            # image links whose tag type couldn't be determined
linked_ext_images = 0       # <a> links to images on third-party domains
linked_oni2_images = 0      # <a> links to images hosted on oni2.net
embedded_ext_images = 0     # <img> tags pulling images from third-party domains
embedded_oni2_images = 0    # <img> tags pulling images from oni2.net
# Extensions treated as images when found at the end of a URL path
file_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg')
# HTML tag names to scan for ('a' and/or 'img'), filled in from command-line args
tag_names = []

# Pass this function a singular noun and it will add an 's' to "noun" if "quantity" is not 1
# Pass this function a singular noun and it will add an 's' to "noun" if "quantity" is not 1
def plural_check(noun, quantity):
   return noun if quantity == 1 else noun + "s"

# Scrapes the HTML at the given URL for image tags
def get_image_links(page_url, page_name):
   """Fetch the page at page_url, scan it for the tags named in 'tag_names'
   ('a' and/or 'img'), and print every link to an image file that is external
   to the wiki, distinguishing oni2.net from third-party hosts. Updates the
   module-level counters; 'page_name' is used only in output messages."""
   global debug
   global pages_checked
   global page_errors
   global image_errors
   global linked_ext_images
   global linked_oni2_images
   global embedded_ext_images
   global embedded_oni2_images
   global file_formats
   global tag_names
   name_printed = 0

   response = fetch(page_url)
   if response.status_code != 200:
      pywikibot.stdout('   ERROR: Could not load page at URL "{}".'.format(page_url))
      page_errors += 1
      return

   soup = BeautifulSoup(response.text, 'html.parser')
   pages_checked += 1
   for tag in soup.findAll(tag_names):
      # An <a> carries its target in "href"; an <img> carries it in "src"
      link = tag.get('href')
      if not link:
         link = tag.get('src')

      # Filter out empty links
      if not link:
         if tag.get('id') == "top":
            continue

         # tag.get('class') returns None when the attribute is absent, so
         # guard the membership test ("x in None" raises TypeError)
         class_names = tag.get('class')
         if class_names and "selflink" in class_names:
            continue

         if not name_printed and not debug:
            pywikibot.stdout('From page "{}":'.format(page_name))
            name_printed = 1
         # get_text() must be called; passing the bound method printed its repr
         pywikibot.stdout('   ERROR: Could not process mystery link {}.'.format(tag.get_text()))
         pywikibot.stdout('   Class is "{}".'.format(tag.get('class')))
         page_errors += 1
         continue

      # A "src" or "href" starting with "/" would be a link to a local page or file; a
      # link starting with "#" is a section link
      if link.startswith('/') or link.startswith('#'):
         continue

      # The gnu.org link to the Free Documentation License is at the bottom of every page
      if link == "http://www.gnu.org/copyleft/fdl.html":
         continue

      # Determine if link is to an image
      _, ext = os.path.splitext(link)
      if ext.lower() in file_formats:
         if not name_printed and not debug:
            pywikibot.stdout('Found on page "{}":'.format(page_name))
            name_printed = 1
         # str(tag) starts with '<a' for link tags and '<img' for embeds
         tag_text = format(tag)
         if "oni2.net" in link:
            if tag_text.startswith('<a'):
               pywikibot.stdout('   Linked oni2.net image: {}'.format(link))
               linked_oni2_images += 1
            elif tag_text.startswith('<img'):
               pywikibot.stdout('   Embedded oni2.net image: {}'.format(link))
               embedded_oni2_images += 1
            else:
               pywikibot.stdout('   ERROR: Could not process oni2.net image link {}.'.format(link))
               image_errors += 1
               return
         else:
            if tag_text.startswith('<a'):
               pywikibot.stdout('   Linked external image: {}'.format(link))
               linked_ext_images += 1
            elif tag_text.startswith('<img'):
               pywikibot.stdout('   Embedded external image: {}'.format(link))
               embedded_ext_images += 1
            else:
               pywikibot.stdout('   ERROR: Could not process external image link {}.'.format(link))
               image_errors += 1
               return

def main(*args):
   """Parse command-line arguments, walk the requested page or category, and
   print a summary of the external image links found.

   Recognized args (besides global pywikibot args): -cat:"Name", -page:"Name",
   -linked (scan <a> tags), -embedded (scan <img> tags), -dbg."""
   global debug
   global pages_checked
   global page_errors
   global image_errors
   global linked_ext_images
   global linked_oni2_images
   global embedded_ext_images
   global embedded_oni2_images
   global tag_names

   search_cat = ''
   search_page = ''

   #pywikibot.stdout('The members of the bs4.element.Tag class are:')
   #pywikibot.stdout(format(dir(bs4.element.Tag)))

   local_args = pywikibot.handle_args(args)

   for arg in local_args:
      if arg.startswith('-cat:'):
         search_cat = arg[5:]
      elif arg.startswith('-page:'):
         search_page = arg[6:]
      elif arg == '-linked':
         tag_names += ['a']
      elif arg == '-embedded':
         tag_names += ['img']
      elif arg == '-dbg':
         debug = 1
      else:
         pywikibot.stdout('Unknown argument "{}".'.format(arg))
         return

   if not tag_names:
      pywikibot.stdout('You need to pass this script either "-linked", "-embedded", or both arguments in order to specify which image links you want to find.')
      return

   site = pywikibot.Site()
   if search_cat != '':
      cat_obj = pywikibot.Category(site, search_cat)
      generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
      for page in pagegenerators.PreloadingGenerator(generator, 100):
         if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
         # full_url() percent-encodes subpage slashes; restore them for fetch()
         page_url = page.full_url().replace("%2F", "/")
         get_image_links(page_url, page.title())
   elif search_page != '':
      page = pywikibot.Page(site, search_page)
      if debug: pywikibot.stdout('Checking page "{}"'.format(page.title()))
      page_url = page.full_url().replace("%2F", "/")
      get_image_links(page_url, page.title())
   else:
      # NOTE: the original called .format(arg) here, which raised NameError
      # when no arguments were given at all (the loop variable never bound)
      pywikibot.stdout('No page name or category name received.')
      return

   chk_page_str = plural_check("page", pages_checked)
   err_page_str = plural_check("page", page_errors)
   err_img_str = plural_check("image", image_errors)
   linked_ext_image_str = plural_check("image", linked_ext_images)
   linked_oni2_image_str = plural_check("image", linked_oni2_images)
   embedded_ext_image_str = plural_check("image", embedded_ext_images)
   embedded_oni2_image_str = plural_check("image", embedded_oni2_images)

   pywikibot.stdout('-------------------------')
   pywikibot.stdout('Checked {0} {1} and failed to process {2} {3} on them. Failed to check {4} {5}.'.format(pages_checked, chk_page_str, image_errors, err_img_str, page_errors, err_page_str))
   if 'a' in tag_names:
      pywikibot.stdout('Found {0} linked {1} from oni2.net and {2} linked {3} from other domains.'.format(linked_oni2_images, linked_oni2_image_str, linked_ext_images, linked_ext_image_str))
   if 'img' in tag_names:
      pywikibot.stdout('Found {0} embedded {1} from oni2.net and {2} embedded {3} from other domains.'.format(embedded_oni2_images, embedded_oni2_image_str, embedded_ext_images, embedded_ext_image_str))

if __name__ == '__main__':
   main()
