# Find External Images
# by iritscen@yahoo.com
# Looks at each link on a page (or in all the pages in a category) and prints the links to
# images that are externally-hosted. You must pass in one or both of the following args:
# -inlined: Show any plain URLs leading to images (these create embedded images, <img>)
# -linked: Show any external URLs ("[URL]") leading to images (these create links, <a>)
# Specify the page or category to search with -page:"Some Page" or -cat:"Some Category".
#
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|

import os

from urllib.parse import urljoin

import pywikibot

from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot import pagegenerators
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot
#import bs4 # for listing members with dir()
from bs4 import BeautifulSoup

# Running totals, updated by get_image_links() and reported at the end of main()
pages_checked = page_errors = ext_images = oni2_images = 0

# A link counts as an image when its extension (compared in lower case) is one of these
file_formats = (
    '.jpg',
    '.jpeg',
    '.png',
    '.gif',
    '.svg',
)

# HTML tag names to inspect; main() fills this in from the -linked/-inlined arguments
tag_names = list()

# Scrapes the HTML at the given URL for image tags
def get_image_links(url):
    """Fetch the page at `url` and print every non-local link that points to an image.

    Every tag whose name is listed in the module-level `tag_names` is examined. Its
    "href" attribute (or "src" when there is no "href") is taken as the link. Links
    starting with "/" or "#", and the gnu.org FDL link, are skipped. A remaining link
    whose extension is in `file_formats` is printed and counted as either an oni2.net
    image or an external image.

    Args:
        url: Address of the HTML page to fetch and scan.

    Returns:
        None. Findings are printed with pywikibot.stdout() and the module-level
        counters `pages_checked`, `page_errors`, `ext_images` and `oni2_images` are
        updated.
    """
    # Only the counters are rebound here; file_formats and tag_names are just read
    global pages_checked
    global page_errors
    global ext_images
    global oni2_images

    response = fetch(url)
    if response.status_code != 200:
        pywikibot.stdout('   ERROR: Could not load page at URL "{}"'.format(url))
        page_errors += 1
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    pages_checked += 1
    for tag in soup.find_all(tag_names):
        # <a> tags carry their target in "href", <img> tags in "src"
        link = tag.get('href') or tag.get('src')

        # Filter out empty links
        if not link:
            if tag.get('id') == "top":
                continue

            # get() returns None when the tag has no "class" attribute at all, so fall
            # back to an empty list before testing membership
            class_names = tag.get('class') or []
            if "selflink" in class_names:
                continue

            # get_text must be called; formatting the bare attribute would print the
            # bound method's repr instead of the tag's text
            pywikibot.stdout('   Could not process mystery link {}'.format(tag.get_text()))
            pywikibot.stdout('   Class is "{}".'.format(tag.get('class')))
            continue

        # A "src" or "href" starting with "/" would be a link to a local page or file; a
        # link starting with "#" is a section link
        # NOTE(review): protocol-relative URLs ("//host/path") also start with "/" and
        # are therefore skipped here too -- confirm that this is intended
        if link.startswith(('/', '#')):
            continue

        # The gnu.org link to the Free Documentation License is at the bottom of every page
        if link == "http://www.gnu.org/copyleft/fdl.html":
            continue

        # Links with any other extension are not images and are silently ignored
        _, ext = os.path.splitext(link)
        if ext.lower() in file_formats:
            if "oni2.net" in link:
                pywikibot.stdout('   Oni2.net image: {}'.format(link))
                oni2_images += 1
            else:
                pywikibot.stdout('   External image: {}'.format(link))
                ext_images += 1

def _pluralize(count, singular, plural):
    """Return `singular` when `count` is exactly 1, otherwise `plural`."""
    return singular if count == 1 else plural


def _check_page(page):
    """Announce `page`, then scan the HTML at its URL for image links."""
    pywikibot.stdout('Checking page "{}"'.format(page.title()))
    # Turn any "%2F" in the URL back into a literal "/" before fetching
    # (presumably for subpage titles -- TODO confirm)
    get_image_links(page.full_url().replace("%2F", "/"))


def main(*args):
    """Parse the command line, scan the requested page(s) and print a summary.

    Recognized arguments (after pywikibot has consumed its own global options):
        -cat:NAME   scan every page in category NAME, recursing into subcategories
        -page:NAME  scan the single page NAME (ignored when -cat: is also given)
        -linked     inspect <a> tags
        -inlined    inspect <img> tags

    At least one of -linked/-inlined and one of -cat:/-page: must be supplied;
    otherwise a message is printed and nothing is scanned.

    Args:
        *args: Command-line arguments, passed on to pywikibot.handle_args().

    Returns:
        None.
    """
    cat_name = ''
    page_name = ''

    local_args = pywikibot.handle_args(args)

    for arg in local_args:
        if arg.startswith('-cat:'):
            cat_name = arg[5:]
        elif arg.startswith('-page:'):
            page_name = arg[6:]
        elif arg == '-linked':
            # Mutate the shared list in place; skip duplicates if the flag is repeated
            if 'a' not in tag_names:
                tag_names.append('a')
        elif arg == '-inlined':
            if 'img' not in tag_names:
                tag_names.append('img')
        else:
            pywikibot.stdout('Unknown argument "{}".'.format(arg))
            return

    if not tag_names:
        pywikibot.stdout('You need to pass this script either "-linked", "-inlined", or both arguments in order to specify which image links you want to find.')
        return

    site = pywikibot.Site()
    if cat_name:
        cat_obj = pywikibot.Category(site, cat_name)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        for page in pagegenerators.PreloadingGenerator(generator, 100):
            _check_page(page)
    elif page_name:
        _check_page(pywikibot.Page(site, page_name))
    else:
        pywikibot.stdout('No page name or category name received.')
        return

    # Pick singular or plural nouns so the summary reads naturally
    chk_page_str = _pluralize(pages_checked, "page", "pages")
    err_page_str = _pluralize(page_errors, "page", "pages")
    ext_image_str = _pluralize(ext_images, "image", "images")
    oni2_image_str = _pluralize(oni2_images, "image", "images")

    pywikibot.stdout('-------------------------')
    pywikibot.stdout('Checked {0} {1} and failed to check {2} {3}.'.format(pages_checked, chk_page_str, page_errors, err_page_str))
    pywikibot.stdout('Found {0} {1} from oni2.net and {2} {3} from other domains.'.format(oni2_images, oni2_image_str, ext_images, ext_image_str))

# Run the search only when this file is executed as a script, not when it is imported
if __name__ == '__main__':
    main()
