import os

from urllib.parse import urljoin

import pywikibot
import re

from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot import pagegenerators
from pywikibot.tools.formatter import color_format
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot
from bs4 import BeautifulSoup

# Namespace prefixes that exist on OniGalore; a link using one of these is an intrawiki link
intrawiki_prefixes = (
    'Image', 'Special', 'Talk',
    'User', 'User_talk',
    'OniGalore', 'OniGalore_talk',
    'File', 'File_talk',
    'MediaWiki', 'MediaWiki_talk',
    'Template', 'Template_talk',
    'Help', 'Help_talk',
    'Category', 'Category_talk',
    'BSL', 'BSL_talk',
    'OBD', 'OBD_talk',
    'AE', 'AE_talk',
    'Oni2', 'Oni2_talk',
    'XML', 'XML_talk',
)

# Base URL of the wiki; a page name appended to it gives that page's address
onigalore_url = 'https://wiki.oni2.net/'

# Interwiki prefixes; links using one of these point off-wiki and are not checked
interwiki_prefixes = (
    'acronym', 'cache', 'commons', 'dictionary', 'google',
    'metawikimedia', 'mw',
    'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia',
    'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage',
    'wikt', 'wiktionary', 'wp',
)

# Running totals for the whole run, printed as a summary when the script finishes
pages_checked = iw_found = problems_found = 0

# Title of the page whose text is currently being scanned
page_name = ''

def scan_for_iw_links(page_text):
    """Check the section anchors of the intrawiki links found in a page's wikitext.

    Every "[[...]]" link that contains a '#' is turned into a URL on the wiki, the
    target page is fetched, and the returned HTML is searched for a <span> whose id
    equals the section name. Each problem (redirect, non-200 response, missing
    anchor, unresolvable "../") is printed and counted.

    The global page_name is read to resolve bare "#Section" links and relative
    "../" links. The globals pages_checked, iw_found and problems_found are updated.

    Args:
        page_text: Wikitext of the page named by the global page_name.

    Raises:
        SystemExit: If a link has a "prefix:" that matches neither
            intrawiki_prefixes nor interwiki_prefixes.
    """
    global pages_checked
    global iw_found
    global problems_found
    global page_name
    pages_checked += 1

    # Isolate strings of pattern "[[anything]]", "[[any:thing]]", "[[any|thing]]" or
    # "[[any:thi|ng]]". Raw string, so the backslashes reach the regex engine as written.
    iw_link = r"\[\[[^|\]]*(\||\])"
    for match in re.finditer(iw_link, page_text):
        iw_url = ""
        page_name2 = page_name

        # Cut the link target out of the match: drop the leading "[[" and the trailing
        # "|" or "]"
        link_text = page_text[match.start() + 2:match.end() - 1]

        # Sometimes we used a space char. instead of a '_', so fix that before querying
        link_text = link_text.replace(' ', '_')

        # Links without a section anchor are not our concern; MediaWiki itself takes
        # care of checking basic intrawiki links
        if '#' not in link_text:
            continue

        # If there is a '{' in the link, then probably it's a link built on transcluded
        # text like "Quotes/Diary#{{C3}}", which we cannot expand and work with, so skip it
        if '{' in link_text:
            pywikibot.output('ADVICE: Link {} seems to use transclusion, so it can\'t be verified automatically. You should check it manually.'.format(link_text))
            continue

        # If this is a relative "../" link, find the parent page and set ourselves to
        # that page, then remove the relative portion of the link. Note that this is only
        # performed once, so if there's multiple steps back ("../../"), we're out of luck.
        if link_text.startswith('../'):
            last_slash = page_name.rfind('/')
            if last_slash == -1:
                # Without this guard the slice below would be page_name[0:-1], silently
                # chopping the last character off the title and checking a bogus page
                pywikibot.output('ERROR: Link {0} uses "../" but page {1} has no parent page.'.format(link_text, page_name))
                iw_found += 1
                problems_found += 1
                continue
            page_name2 = page_name[:last_slash]
            link_text = link_text[3:]
            # If this is now going to be a bare section link for the parent page, don't
            # add a slash, otherwise do because we are drilling down to another subpage
            if link_text.startswith('#'):
                link_text = page_name2 + link_text
            else:
                link_text = page_name2 + '/' + link_text

        if link_text.startswith('#'):
            # Bare section link: the target is the page being scanned
            iw_url = onigalore_url + page_name2
            link_text = page_name2 + link_text
        elif not re.search(":.*#", link_text):
            # No ':' before the '#' (after it, a colon is just part of the section
            # name), so this is a Main namespace article
            iw_url = onigalore_url + link_text
        elif any(prefix + ":" in link_text for prefix in intrawiki_prefixes):
            # The link names one of OniGalore's own namespaces
            iw_url = onigalore_url + link_text
        elif any(prefix + ":" in link_text for prefix in interwiki_prefixes):
            # Interwiki links point off-wiki and are not verified
            continue
        else:
            # Unknown prefix: we cannot turn this link into a URL
            pywikibot.output('ERROR: Couldn\'t figure out link {}. Aborting script.'.format(link_text))
            # quit() is only a convenience installed by the site module; raising
            # SystemExit directly ends the script the same way without relying on it
            raise SystemExit
        iw_found += 1

        # Test the URL
        iw_url = iw_url.replace(' ', '_')
        response = fetch(iw_url)

        # Redirects are followed automatically by fetch() and treated as "200"s, so the
        # way we tell that a redirect occurred is by checking the history
        if response.history:
            pywikibot.output('WARNING: Redirected from {}.'.format(response.history))
            problems_found += 1
        elif response.status_code != 200:
            # No message here because fetch() already prints one for such responses
            problems_found += 1
        else:
            # Isolate section link
            pre_section, section_name = link_text.split('#', 1)

            # Convert slash character to the dot-notation hex encoding that MediaWiki uses
            section_name = section_name.replace('/', '.2F')

            # Read linked page to see if it really has this anchor link.
            # NOTE(review): only <span> ids are considered, so an anchor carried by any
            # other element is reported as missing - confirm this matches the wiki's HTML.
            soup = BeautifulSoup(response.text, 'html.parser')
            found_section = any(
                span_tag.get('id') == section_name
                for span_tag in soup.find_all('span')
            )
            if not found_section:
                pywikibot.output('ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section))
                problems_found += 1

def main(*args):
    """Scan the pages selected on the command line and print a summary.

    Recognised arguments (after pywikibot's own global options are handled):
        -cat:<name>    check every page in the category, recursing into subcategories
        -page:<title>  check a single page
    If both are given, the category takes precedence.

    Args:
        *args: Command-line arguments, passed to pywikibot.handle_args().
    """
    global page_name
    global pages_checked
    global iw_found
    global problems_found

    cat_name = ''
    local_args = pywikibot.handle_args(args)

    for arg in local_args:
        if arg.startswith('-cat:'):
            cat_name = arg[5:]
        elif arg.startswith('-page:'):
            page_name = arg[6:]

    site = pywikibot.Site()

    if cat_name != '':
        cat_obj = pywikibot.Category(site, cat_name)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        for page in pagegenerators.PreloadingGenerator(generator, 100):
            pywikibot.stdout('Checking page {0}'.format(page.title()))
            # scan_for_iw_links() reads page_name to resolve bare and relative links
            page_name = page.title()
            scan_for_iw_links(page.text)
    elif page_name != '':
        page = pywikibot.Page(site, page_name)
        pywikibot.stdout('Checking page {0}'.format(page.title()))
        # Use the title as pywikibot reports it, exactly as the category branch does,
        # so links are resolved against the same form of the name in both modes
        page_name = page.title()
        scan_for_iw_links(page.text)
    else:
        # Nothing was selected; say so instead of only reporting zero pages
        pywikibot.stdout('No pages to check. Use -cat:<category name> or -page:<page title>.')

    pywikibot.stdout('Checked {0} page(s) and found {1} intrawiki link(s) with {2} section link problem(s).'.format(pages_checked, iw_found, problems_found))

# Run the checker only when this file is executed as a script, not when it is imported.
# main() is called without arguments, which are passed as-is to pywikibot.handle_args().
if __name__ == '__main__':
    main()
