# Check Interwiki Links
# by iritscen@yahoo.com
# Looks at each link on a page (or all the pages in a category) which uses a registered
# interwiki prefix and loads the linked page, verifying that it exists and that any section
# link, if present, is valid as well. The output will use the word "ERROR" when it cannot
# validate the interwiki link.
# Recommended viewing width:
# |---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---|

import bs4
import pywikibot
import re
import requests # for listing members with dir() when debugging

from bs4 import BeautifulSoup
from pywikibot import pagegenerators
from pywikibot.bot import QuitKeyboardInterrupt
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot
from pywikibot.tools.formatter import color_format
from urllib.parse import urljoin

class IWLink:
    def __init__(self, iw_prefix, prefix_url, full_url, page_name, page_slug, curl_response):
        self.iw_prefix = iw_prefix         # e.g. "wp"
        self.prefix_url = prefix_url       # e.g. "https://en.wikipedia.org/wiki/"
        self.full_url = full_url           # e.g. "https://en.wikipedia.org/wiki/Easter_egg"
        self.page_name = page_name         # "Easter egg"
        self.page_slug = page_slug         # "Easter_egg"
        self.curl_response = curl_response # a class defined in the Requests library

# Parallel arrays based on https://wiki.oni2.net/Special:Interwiki
interwiki_prefixes = ('acronym', 'cache', 'commons', 'dictionary', 'google', 'metawikimedia',
                      'mw', 'wikibooks', 'wikidata', 'wikimedia', 'wikinews', 'wikipedia',
                      'wikiquote', 'wikisource', 'wikispecies', 'wikiversity', 'wikivoyage',
                      'wikt', 'wiktionary', 'wp')

interwiki_urls = ('http://www.acronymfinder.com/~/search/af.aspx?string=exact&Acronym=',
                  'http://www.google.com/search?q=cache:',
                  'https://commons.wikimedia.org/wiki/',
                  'http://www.dict.org/bin/Dict?Database=*&Form=Dict1&Strategy=*&Query=',
                  'http://www.google.com/search?q=',
                  'https://meta.wikimedia.org/wiki/',
                  'https://www.mediawiki.org/wiki/',
                  'https://en.wikibooks.org/wiki/',
                  'https://www.wikidata.org/wiki/',
                  'https://foundation.wikimedia.org/wiki/',
                  'https://en.wikinews.org/wiki/',
                  'https://en.wikipedia.org/wiki/',
                  'https://en.wikiquote.org/wiki/',
                  'https://wikisource.org/wiki/',
                  'https://species.wikimedia.org/wiki/',
                  'https://en.wikiversity.org/wiki/',
                  'https://en.wikivoyage.org/wiki/',
                  'https://en.wiktionary.org/wiki/',
                  'https://en.wiktionary.org/wiki/',
                  'https://en.wikipedia.org/wiki/')
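
# Note: the two tuples above are parallel, so a prefix's base URL sits at the same index as
# the prefix itself; e.g. interwiki_urls[interwiki_prefixes.index('wp')] evaluates to
# 'https://en.wikipedia.org/wiki/'. The sanity check below is not part of the original
# checking logic, only a guard on the assumption that the tuples stay the same length and in
# the same order, which is what makes that lookup valid.
assert len(interwiki_prefixes) == len(interwiki_urls), 'Parallel interwiki arrays are out of sync'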

# Initialize globals
debug = 0
pages_checked = 0
iw_found = 0
errors_issued = 0
unintended_redirects_found = 0
name_printed = 0

# Prints the name of a page on which something occurred, if it has not been printed before
def possibly_print(page_name):
    global debug
    global name_printed

    if not name_printed and not debug:
        pywikibot.stdout('')
        pywikibot.stdout('From page "{}":'.format(page_name))
        name_printed = 1

# Search a page for the section specified in the link
def find_section(the_link, print_result):
    global errors_issued

    # Isolate section link
    target_page_name, anchor_name = the_link.page_slug.split('#')
    target_page_name_human = target_page_name.replace('_', ' ')

    # Convert dot-notation hex entities to proper characters
    replacements = [(r'\.22', '"'), (r'\.27', "'"), (r'\.28', '('), (r'\.29', ')')]
    for pattern, replacement in replacements:
        anchor_name = re.sub(pattern, replacement, anchor_name)

    # Read linked page to see if it really has this anchor link
    soup = BeautifulSoup(the_link.curl_response.text, 'html.parser')
    tags_to_search = ['span', 'div', 'h2', 'h3', 'h4']
    found_section = False
    for tag_name in tags_to_search:
        for the_tag in soup.find_all(tag_name):
            if the_tag.get('id') == anchor_name:
                found_section = True
                break
        if found_section:
            break

    # Tell user what we found
    if found_section == False:
        possibly_print(the_link.page_name)
        pywikibot.stdout(' ERROR: Could not find section "{0}" on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, target_page_name_human))
        # TODO: Check that page name has been corrected to redirected page if there was a redirect
        errors_issued = errors_issued + 1
    elif print_result == True:
        pywikibot.stdout(' The section "{0}" was found on {1} page "{2}".'.format(anchor_name, the_link.iw_prefix, target_page_name_human))
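
# Note on the anchor decoding in find_section() above: MediaWiki links sometimes carry
# section anchors with dot-encoded punctuation, so a hypothetical anchor written as
# "Easter_egg_.28media.29" in a link should match an element whose id is
# "Easter_egg_(media)" in the target page's HTML. The replacements list only covers the
# characters encountered so far; any other dot-encoded character would need its own entry.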

# For a link that redirected us to another page, extract the name of the target page from
# the target page's source
def find_canonical_link(the_link):
    global errors_issued

    # Extract link from this markup, which contains the name of the redirected-to page:
    # <link rel="canonical" href="https://en.wikipedia.org/wiki/Page_name"/>
    canonical_name = the_link.curl_response.text.split('<link rel="canonical" href="')[-1]
    # Strip the prefix URL so that only the page slug remains, then look for the closing quote
    canonical_name = canonical_name[len(the_link.prefix_url):]
    tag_end = canonical_name.find('"')
    if tag_end == -1:
        pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect page, but this script could not isolate the target page name.'.format(the_link.iw_prefix, the_link.page_slug))
        errors_issued = errors_issued + 1
    else:
        canonical_name = canonical_name[:tag_end]
        if len(canonical_name) > 100:
            # Certain things can cause the trim to fail; report error and avoid slamming the
            # output with massive page source from a failed trim
            pywikibot.stdout(' ERROR: The {0} link "{1}" is a redirect to "{2}…" (string overflow).'.format(the_link.iw_prefix, the_link.page_slug, canonical_name[:100]))
            errors_issued = errors_issued + 1
        else:
            canonical_name = canonical_name.replace('_', ' ')
            if '#' in the_link.page_slug:
                _, anchor_name = the_link.page_slug.split('#')
                pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}#{3}", which is a valid page. Checking for section on that page….'.format(the_link.iw_prefix, the_link.page_slug, canonical_name, anchor_name))
                # Update page slug so that find_section() uses the right page name in its messages
                the_link.page_slug = the_link.page_slug.replace(the_link.page_name, canonical_name)
                find_section(the_link, True)
            else:
                pywikibot.stdout(' The {0} link "{1}" is a redirect to "{2}", which is a valid page.'.format(the_link.iw_prefix, the_link.page_slug, canonical_name))

# Test an interwiki link and look for a section link if applicable
def test_interwiki_link(the_link):
    global errors_issued
    global unintended_redirects_found

    the_link.curl_response = fetch(the_link.full_url)

    # One way we tell that a redirect occurred is by checking fetch's history, as it
    # automatically follows redirects. This will catch formal redirects which come from pages
    # such as Special:PermanentLink.
    if the_link.curl_response.history != []:
        possibly_print(the_link.page_name)

        # If the linked page is in all caps, e.g. WP:BEANS, it's likely a deliberate use of a redirect
        if the_link.page_slug.startswith('WP:') and the_link.page_slug == the_link.page_slug.upper():
            pywikibot.stdout(' Got redirection code "{0}" for {1} link "{2}". This appears to be a deliberate use of a Wikipedia shortcut. Checking the target page….'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
            find_canonical_link(the_link)
        else:
            permalink1 = 'Special:PermanentLink/'.lower()
            permalink2 = 'Special:Permalink/'.lower()
            page_slug_lower = the_link.page_slug.lower()
            if page_slug_lower.startswith(permalink1) or page_slug_lower.startswith(permalink2):
                pywikibot.stdout(' Got redirection code "{0}" for {1} permanent revision link "{2}". Checking the target page….'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
                find_canonical_link(the_link)
            else:
                pywikibot.stdout(' ERROR: Unrecognized type of redirection (code "{0}") for {1} link "{2}". You should check the link manually.'.format(the_link.curl_response.history[0], the_link.iw_prefix, the_link.page_slug))
                errors_issued = errors_issued + 1
    elif the_link.curl_response.status_code != 200:
        possibly_print(the_link.page_name)
        pywikibot.stdout(' ERROR: Got response code {0} for {1} link "{2}". The page may not exist.'.format(the_link.curl_response.status_code, the_link.iw_prefix, the_link.page_slug))
        errors_issued = errors_issued + 1
    # However, the usual way that a redirect occurs is that MediaWiki silently serves the
    # content of the target page while returning code OK 200 as if the link was correct;
    # this happens when a redirect page is accessed. We must detect these soft redirects by
    # looking at the page source to find the redirect note inserted at the top of the page
    # for the reader.
    elif 'Redirected from