# Check Intrawiki Section Links
# by iritscen@yahoo.com
# Looks at each wikilink on a page (or in all the pages in a category) for a section link ('#'),
# and loads the linked page and verifies that the named section actually exists. The output will
# use the keywords ADVICE, WARNING or ERROR depending on the nature of the issue that it
# encounters.
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- --|

import os
import re

from urllib.parse import urljoin

import pywikibot
from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot import pagegenerators
from pywikibot.tools.formatter import color_format
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot
from bs4 import BeautifulSoup

# Array of OniGalore's namespaces
intrawiki_prefixes = ('Image', 'Special', 'Talk', 'User', 'User_talk', 'OniGalore',
                      'OniGalore_talk', 'File', 'File_talk', 'MediaWiki', 'MediaWiki_talk',
                      'Template', 'Template_talk', 'Help', 'Help_talk', 'Category',
                      'Category_talk', 'BSL', 'BSL_talk', 'OBD', 'OBD_talk', 'AE', 'AE_talk',
                      'Oni2', 'Oni2_talk', 'XML', 'XML_talk')

# URL for main namespace of our wiki
onigalore_url = 'https://wiki.oni2.net/'

# Interwiki prefixes, for ruling out these links
interwiki_prefixes = ('acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia',
                      'mw', 'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia',
                      'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage',
                      'wikt', 'wiktionary', 'wp')

pages_checked = 0
iw_found = 0
advice_issued = 0
warnings_issued = 0
errors_issued = 0
page_name = ''

# Searches the given page text for intrawiki links with section links in them
def scan_for_iw_links(page_text):
    global pages_checked
    global iw_found
    global advice_issued
    global warnings_issued
    global errors_issued
    global page_name
    pages_checked = pages_checked + 1

    # Isolate strings of pattern "[[anything]]", "[[any:thing]]", "[[any|thing]]" or
    # "[[any:thi|ng]]"
    iw_link = r"\[\[[^|\]]*(\||\])"
    for match in re.finditer(iw_link, page_text):
        found_iw_match = False
        iw_url = ""
        page_name2 = page_name

        # Cut out the matched text from the page, and in the process remove the "[[" from the
        # front and the "|" or "]" from the end
        s = match.start() + 2
        e = match.end() - 1
        link_text = page_text[s:e]

        # Sometimes we used a space char. instead of a '_', so fix that before querying
        link_text = link_text.replace(' ', '_')
        #pywikibot.stdout('Found link {0}.'.format(link_text))

        # If this link doesn't have a section link in it, then we don't care about it, as
        # MediaWiki takes care of checking basic intrawiki links
        if '#' not in link_text:
            #pywikibot.stdout('Link doesn\'t have a section anchor in it. Skipping.')
            continue

        # If there is a '{' in the link, then probably it's a link built on transcluded text
        # like "Quotes/Diary#{{C3}}", which we cannot expand and work with, so skip it
        if '{' in link_text:
            pywikibot.stdout('ADVICE: Link {} seems to use transclusion, so it can\'t be verified automatically. You should check it manually.'.format(link_text))
            advice_issued = advice_issued + 1
            continue

        # If this is a relative "/" link, use the current page as the basis for the URL. Note
        # that only a leading slash is looked for, so if there's multiple steps down ("/x/y"),
        # we're out of luck.
        if link_text.startswith('/'):
            link_text = page_name + link_text
            pywikibot.stdout('Changed link_text to {} on account of "/".'.format(link_text))

        # If this is a relative "../" link, find the parent page and set ourselves to that page,
        # then remove the relative portion of the link. Note that this is only performed once,
        # so if there's multiple steps back ("../../"), we're out of luck.
        if link_text.startswith('../'):
            last_slash = page_name.rfind('/')
            page_name2 = page_name[0:last_slash]
            #pywikibot.stdout('Changed page_name to {} on account of "../".'.format(page_name2))
            link_text = link_text[3:len(link_text)]
            #pywikibot.stdout('Changed link_text to {} on account of "../".'.format(link_text))
            # If this is now going to be a bare section link for the parent page, don't add a
            # slash, otherwise do because we are drilling down to another subpage
            if link_text.startswith('#'):
                link_text = page_name2 + link_text
            else:
                link_text = page_name2 + '/' + link_text

        # If this is a bare section link, build URL based on this page
        if link_text.startswith('#'):
            iw_url = onigalore_url + page_name2
            iw_found = iw_found + 1
            #pywikibot.stdout('Found link to this very page, {}.'.format(link_text))
            found_iw_match = True
            link_text = page_name2 + link_text

        # If there's no ":" in the link (before the section link, where a colon would just be
        # part of the text) then it's a Main namespace article, so construct URL
        if found_iw_match == False:
            if not re.search(":.*#", link_text):
                iw_url = onigalore_url + link_text
                iw_found = iw_found + 1
                #pywikibot.stdout('Found link to OniGalore Main namespace page {}.'.format(link_text))
                found_iw_match = True

        # If there is a ":", match the prefix against the intrawiki prefixes on OniGalore
        if found_iw_match == False:
            for prefix in intrawiki_prefixes:
                #pywikibot.stdout('Comparing link against prefix {}.'.format(prefix))
                if prefix + ":" in link_text:
                    iw_url = onigalore_url + link_text
                    _, post_ns = link_text.split(':', 1)
                    #pywikibot.stdout('Found link to OniGalore {0} namespace page {1}.'.format(prefix, post_ns))
                    iw_found = iw_found + 1
                    found_iw_match = True
                    break

        # If we didn't match the prefix against any intrawiki prefixes, see if it matches
        # against an interwiki prefix; if so, this link can be ignored
        is_interwiki = False
        if found_iw_match == False:
            for prefix in interwiki_prefixes:
                if prefix + ":" in link_text:
                    #pywikibot.stdout('Skipping link {} because it is an interwiki link.'.format(link_text))
                    is_interwiki = True
                    break
        if is_interwiki:
            continue

        # If we still haven't turned this match into a URL, something's gone wrong
        if (found_iw_match == False) or (iw_url == ""):
            pywikibot.stdout('ERROR: Couldn\'t figure out link {}. Aborting script.'.format(link_text))
            quit()

        # Test the URL
        iw_url = iw_url.replace(' ', '_')
        #pywikibot.stdout('Reading page at {}...'.format(iw_url))
        response = fetch(iw_url)

        # Redirects are followed automatically by fetch() and treated as "200"s; the way we can
        # tell that a redirect occurred is by checking fetch's history
        if response.history != []:
            pywikibot.stdout('WARNING: Got redirection code ({0}) on URL "{1}".'.format(response.history[0].status_code, iw_url))
            warnings_issued = warnings_issued + 1
        elif response.status_code != 200:
            pywikibot.stdout('WARNING: Got response code {0} on URL {1}.'.format(response.status_code, iw_url))
            warnings_issued = warnings_issued + 1
        else:
            # Isolate section link
            pre_section, section_name = link_text.split('#', 1)
            #pywikibot.stdout('Searching for section link {} on page.'.format(section_name))

            # Convert slash character to the dot-notation hex encoding that MediaWiki uses
            section_name = section_name.replace('/', '.2F')

            # Read linked page to see if it really has this anchor link
            soup = BeautifulSoup(response.text, 'html.parser')
            found_section = False
            for span_tag in soup.find_all('span'):
                span_name = span_tag.get('id', None)
                if span_name == section_name:
                    #pywikibot.stdout('Found section!')
                    found_section = True
                    break
            if found_section == False:
                pywikibot.stdout('ERROR: Could not find section {0} on page {1}!'.format(section_name, pre_section))
                errors_issued = errors_issued + 1


def main(*args):
    cat_name = ''
    global page_name

    local_args = pywikibot.handle_args(args)
    genFactory = pagegenerators.GeneratorFactory()

    for arg in local_args:
        if arg.startswith('-cat:'):
            cat_name = arg[5:]
        elif arg.startswith('-page:'):
            page_name = arg[6:]

    site = pywikibot.Site()

    # This line of code enumerates the methods in the 'page' class
    #pywikibot.stdout(format(dir(page)))

    if cat_name != '':
        cat_obj = pywikibot.Category(site, cat_name)
        generator = pagegenerators.CategorizedPageGenerator(cat_obj, recurse=True)
        for page in pagegenerators.PreloadingGenerator(generator, 100):
            pywikibot.stdout('Checking page {0}'.format(page.title()))
            page_name = page.title()
            scan_for_iw_links(page.text)
    elif page_name != '':
        page = pywikibot.Page(site, page_name)
        pywikibot.stdout('Checking page {0}'.format(page.title()))
        scan_for_iw_links(page.text)

    global pages_checked
    global iw_found
    global advice_issued
    global warnings_issued
    global errors_issued

    page_str = "pages"
    if pages_checked == 1:
        page_str = "page"

    link_str = "links"
    if iw_found == 1:
        link_str = "link"

    pywikibot.stdout('Checked {0} {1} and found {2} intrawiki {3}.'.format(pages_checked, page_str, iw_found, link_str))
    pywikibot.stdout('While attempting to follow section links...')

    if advice_issued == 0:
        pywikibot.stdout(' No advice on potential problems was issued.')
    elif advice_issued == 1:
        pywikibot.stdout(' 1 piece of advice on a potential problem was issued.')
    else:
        pywikibot.stdout(' {} pieces of advice on potential problems were issued.'.format(advice_issued))

    warning_str = "warnings were"
    if warnings_issued == 1:
        warning_str = "warning was"
    pywikibot.stdout(' {0} {1} issued.'.format(warnings_issued, warning_str))

    error_str = "errors were"
    if errors_issued == 1:
        error_str = "error was"
    pywikibot.stdout(' {0} {1} encountered.'.format(errors_issued, error_str))


if __name__ == '__main__':
    main()
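
# A minimal usage sketch, assuming this file is saved as "check_intrawiki_section_links.py"
# and run through Pywikibot's pwb.py wrapper; the file name and the category argument below
# are placeholders, not part of the script itself:
#
#   python pwb.py check_intrawiki_section_links.py -page:"Quotes/Diary"
#   python pwb.py check_intrawiki_section_links.py -cat:"Some_Category"
#
# "-page:" checks a single page; "-cat:" checks every page in the named category, recursing
# into subcategories because CategorizedPageGenerator is called with recurse=True above.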