#!/bin/bash

# Validate External Links by Iritscen
# Provided with a list of external links found in the OniGalore wiki, this script validates them.
# The resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF
# (for reading as a local file with clickable links), and HTML (for uploading as a web page).
# Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
# Recommended rule:
# ------------------------------------------------------------------------------------------------------

# Set separator token to newline. Every unquoted expansion in this script is therefore split on
# newlines only, not on spaces or tabs.
IFS="
"

### GLOBALS ###
# Settings -- these will be changed from their defaults by the arguments passed in to the script
LINKS_URL=""        # use 'curl' to download file with links from this location (can be file://)
EXCEPT_URL=""       # ditto above for file with exceptions to NG results
OUTPUT_DIR=""       # place reports and all other output in a folder inside this existing folder
RECORD_OK_LINKS=0   # record response code to the log whether it's a value in OK_CODES or NG_CODES
SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
TAKE_PAGE_SHOT=0    # take a screenshot of each OK page
URL_START=1         # start at this URL in LINKS_FILE (1 by default)
URL_LIMIT=0         # if non-zero, stop at this URL in LINKS_FILE
UPLOAD_INFO=""      # path to a file on your hard drive with the login info needed to upload a report

# Fixed strings -- see the occurrences of these variables to learn their purpose
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:54.0) Gecko/20100101 Firefox/54.0"
ARCHIVE_API="http://archive.org/wayback/available"
ARCHIVE_GENERIC="https://web.archive.org/web/*"
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
CHROME="/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary"
CHROME_SCREENSHOT="screenshot.png"
CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt"
EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt"
MY_WIKI_PAGE="http://wiki.oni2.net/User:Iritscen"
# Directory holding this script. The expansions are quoted so that a script path containing glob
# characters is taken literally, and 'pwd' only runs if the 'cd' worked, so a failed 'cd' can no
# longer make the caller's working directory pass for the script's directory.
THIS_DIR=$(cd "$(dirname "$0")" && pwd)
WORKING_DIR=$(pwd)
WIKI_PATH="wiki.oni2.net"

# These are parallel arrays of the IDs and names of OniGalore's current namespaces
declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")

# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
# This determines whether the script tries to take a screenshot of the page or just gets its HTTP code.
declare -a HTTP_FILES=(txt zip wmv jpg png m4a bsl rar oni mp3 mov ONWC vbs TRMA mp4 doc avi log gif pdf dmg exe cpp tga 7z wav east BINA xml dll dae xaf fbx 3ds blend flv csv)
declare -a HTTP_TLDS_AND_PAGES=(com net org uk ru de it htm html php pl asp aspx shtml pgi cgi php3 x jsp phtml cfm css action stm js)

# These arrays tell us which HTTP response codes are OK (good) and which are NG (no good). Pages that
# return NG codes will not be screenshotted. Remember to update http_codes.txt if you add a new code.
declare -a OK_CODES=(200 301 307 401 405 406 501)
declare -a NG_CODES=(000 302 403 404 410 500 503)

# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
# transcluded text, and if the transclusion fails, then the braces show up in the URL
ILLEGAL_CHARS="{ }"

# These are parallel arrays giving the prefixes that can be used in place of normal external links to
# some wikis and other sites
declare -a INTERWIKI_PREFIXES=(metawikipedia wikipedia wikiquote wiktionary)
declare -a INTERWIKI_DOMAINS=(meta.wikipedia.org wikipedia.org wikiquote.org wiktionary.org)

# Variables for keeping track of main loop progress and findings
LINK_NUM=0
OK_LINKS=0
NG_LINKS=0
SKIP_UNK_NS=0
SKIP_JS_PAGE=0
SKIP_BAD_URL=0
SKIP_NON_ASCII=0
SKIP_UNK_SUFFIX=0
SKIP_UNK_CODE=0
SKIP_EXCEPT=0
FILE_LINKS=0
PAGE_LINKS=0
SKIPPED_HEADER_ROW=0
FINISHED_LIST="no" # "no" = still running or interrupted, "limit" = stopped at URL_LIMIT, "yes" = done


### HELP ###
# A pseudo-man page, written to standard output. The option placeholders match what the script really
# does with each value: --exceptions is fetched with 'curl' just like --links, so it is a URL, and
# --output must be an existing directory. Here is the 80-character rule for the page text:
# 234567890123456789012345678901234567890123456789012345678901234567890123456789
function printHelp()
{
  cat << EOF

NAME
       Validate External Links

SYNOPSIS
       validate_external_links.sh --help
       validate_external_links.sh --links URL --output DIR [--exceptions URL]
          [--record-ok-links] [--suggest-snapshots] [--take-screenshots]
          [--start-url NUM] [--end-url NUM] [--upload FILE]

DESCRIPTION
       This script parses a list of external links found in the OniGalore wiki
       (which is dumped by the Oni2.net domain periodically in a particular
       format), validates them using the Unix tool 'curl', and produces a report
       of which links were OK (responded to an HTTP query) and which were NG (no
       good). This report can then be automatically uploaded to the location of
       your choice. The script can also suggest Internet Archive snapshots for
       NG links, and take screenshots of OK links for visual verification by the
       reader that the page in question is the one intended to be displayed.

       You must pass this script the URL at which the list of links is found
       (--links) and the path where logs should be outputted (--output). All
       other arguments are optional.

OPTIONS
       --help              Show this page
       --links URL         URL from which to download file with external links
                           (note that this can be a local file if you use the
                           file:// protocol) (required)
       --output DIR        Place the folder which will contain the reports and
                           optional screenshots at this path (required)
       --exceptions URL    Don't log an NG link if it is listed in the file
                           found at this URL (file:// is allowed), as long as
                           the response code is the same as the one associated
                           with the link
       --record-ok-links   Log a link in the report whether its response code is
                           in the OK_CODES or the NG_CODES array
       --suggest-snapshots Query the Internet Archive for a possible snapshot
                           URL for each NG page
       --take-screenshots  Save screenshots of each OK page (requires Google
                           Chrome to be found at the path in CHROME)
       --start-url NUM     Start at this link in the links file
       --end-url NUM       Stop at this link in the links file
       --upload FILE       Upload report using info in this local file

BUGS
       The script cannot properly parse any line in the external links file
       which contains a comma in the name of the wiki page containing a link.
       Commas in the link itself are not an issue.
EOF
}


### SETUP ###
# If first argument is a help request, or if nothing was passed in at all, print help page and quit
if [ "$#" -eq 0 ] || [ "$1" == "--help" ]; then
  printHelp | less
  exit 0
fi

# Parse arguments as long as there are more arguments to process
while (( "$#" )); do
   # An argument that takes a value must be followed by one. Without this check, "shift 2" on a lone
   # trailing argument fails without consuming anything, and this loop would never terminate.
   case "$1" in
      --links | --exceptions | --output | --start-url | --end-url | --upload )
         if [ "$#" -lt 2 ]; then
            echo "Error: The argument $1 requires a value. Aborting."
            exit 1
         fi;;
   esac

   case "$1" in
      --links )             LINKS_URL="$2";      shift 2;;
      --exceptions )        EXCEPT_URL="$2";     shift 2;;
      --output )            OUTPUT_DIR="$2";     shift 2;;
      --record-ok-links )   RECORD_OK_LINKS=1;   shift;;
      --suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;;
      --take-screenshots )  TAKE_PAGE_SHOT=1;    shift;;
      --start-url )         URL_START="$2";      shift 2;;
      --end-url )           URL_LIMIT="$2";      shift 2;;
      --upload )            UPLOAD_INFO="$2";    shift 2;;
      * )                   echo "Invalid argument $1 detected. Aborting."; exit 1;;
   esac
done

# If the required arguments were not supplied, print help page and quit
if [ -z "$LINKS_URL" ] || [ -z "$OUTPUT_DIR" ]; then
   printHelp
   echo "Error: I did not receive one or both required arguments."
   exit 2
fi

# The start and end positions are used in numeric tests and arithmetic throughout the script, so
# reject anything that is not a whole number, and drop leading zeroes so that a value such as "08"
# is not later read as an invalid octal number
if ! [[ "$URL_START" =~ ^[0-9]+$ ]] || ! [[ "$URL_LIMIT" =~ ^[0-9]+$ ]]; then
   echo "Error: The --start-url and --end-url arguments must be whole numbers. Aborting."
   exit 1
fi
URL_START=$((10#$URL_START))
URL_LIMIT=$((10#$URL_LIMIT))

# Check that UPLOAD_INFO exists, if this argument was supplied
if [ -n "$UPLOAD_INFO" ] && [ ! -f "$UPLOAD_INFO" ]; then
   echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
   exit 3
fi

# Check that OUTPUT_DIR is a directory
if [ ! -d "$OUTPUT_DIR" ]; then
   echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
   exit 4
fi

# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
SHOT_PATH="$OUTPUT_PATH/Screenshots"
LOG_NAME="ValExtLinks report"
LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt"
LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf"
LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm"
mkdir "$OUTPUT_PATH"

# Check that 'mkdir' succeeded before trying to create anything inside the new folder
if [ ! -d "$OUTPUT_PATH" ]; then
   echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_DIR. Aborting."
   exit 5
fi
if [ "$TAKE_PAGE_SHOT" -eq 1 ]; then
   mkdir "$SHOT_PATH"
fi

# Get date on the file at LINKS_URL and print to log. Header names are case-insensitive (HTTP/2
# servers send "last-modified"), and header lines end in a carriage return that must not leak into
# the reports.
LINKS_DATE=$(curl --silent --head "$LINKS_URL" | grep -i --max-count=1 "^Last-Modified:" | tr -d '\r')
if [ -z "$LINKS_DATE" ]; then
   echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
   exit 6
fi
# Strip the header name, then the single space that normally follows the colon
LINKS_DATE=${LINKS_DATE#*:}
LINKS_DATE=${LINKS_DATE# }


### UTILITY FUNCTIONS ###
# Emits the title block of the TXT log, one 'valPrint t' call per line: report title, generation
# time (NICE_TIME), date of the source data (LINKS_DATE), author contact, then a blank spacer line
function printTXTheader()
{
   local HEADER_LINE
   for HEADER_LINE in \
      "Validate External Links report" \
      "generated $NICE_TIME" \
      "from data of $LINKS_DATE" \
      "script by Iritscen (contact: $MY_WIKI_PAGE)" \
      ""
   do
      valPrint t "$HEADER_LINE"
   done
}

# Sends the opening of the RTF document to the RTF log in a single 'valPrint r' call: the document
# preamble, a centered (\qc) title block with a "contact" hyperlink to MY_WIKI_PAGE, and finally a
# second paragraph definition without \qc so that the text that follows is left-justified
function printRTFheader()
{
   # Both paragraph definitions use the same run of tab stops
   local TAB_STOPS="\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640"
   # RTF field that renders as the clickable word "contact"
   local CONTACT_LINK="{\field{\*\fldinst{HYPERLINK \"$MY_WIKI_PAGE\"}}{\fldrslt contact}}"

   valPrint r "{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww12600\viewh12100\viewkind0
\pard$TAB_STOPS\li360\pardirnatural\qc\partightenfactor0

\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
generated $NICE_TIME\\
from data of $LINKS_DATE\\
script by Iritscen ($CONTACT_LINK)
\\
\pard$TAB_STOPS\li360\pardirnatural\partightenfactor0
\cf0 "
}

# Ends the RTF log by writing the brace that closes the group opened by the RTF header
function printRTFfooter()
{
   local RTF_GROUP_END="}"
   valPrint r "$RTF_GROUP_END"
}

# Sends the opening of the HTML report to the HTML log in a single 'valPrint h' call: the <head>
# with the page title, then a heading block giving the generation time (NICE_TIME), the date of the
# source data (LINKS_DATE) and a "contact" link to MY_WIKI_PAGE. <body> is left open for the report.
function printHTMheader()
{
   local REPORT_TITLE="Validate External Links report"
   local CONTACT_LINK="<a href=\"$MY_WIKI_PAGE\" target=\"_blank\">contact</a>"

   valPrint h "<html>
<head>
<title>$REPORT_TITLE</title>
</head>
<body>
<h2>$REPORT_TITLE</h2>
<h3>generated $NICE_TIME<br />
from data of $LINKS_DATE<br />
script by Iritscen ($CONTACT_LINK)</h3>"
}

# Ends the HTML log by closing the <body> and <html> elements opened by the HTML header
function printHTMfooter()
{
   valPrint h $'</body>\n</html>'
}

# The central logging function.
# Parameter 1 is a string of one or more flag characters choosing the outputs and their formatting:
#   c  print to the console
#   t  append to the TXT log (LOG_TXT)
#   r  append to the RTF log (LOG_RTF)
#   h  append to the HTML log (LOG_HTM)
#   n  console and TXT: don't print a newline at the end of the text. RTF and HTML: the text is still
#      written on a line of its own, but without the markup that produces a visible line break ("\"
#      in RTF, "<br />" in HTML), which is what you want when the text is itself markup.
#   w  don't pass console output through 'fmt' ("fmt" fits the output to an 80-column CLI but can
#      break special formatting; 'n' also bypasses it)
# Parameter 2 is the text to print. It is written with 'printf' rather than 'echo' so that text
# which looks like an 'echo' option (e.g. "-n") is printed instead of being swallowed.
function valPrint()
{
   local TARGETS="$1"
   local TEXT="$2"

   if [[ "$TARGETS" == *c* ]]; then
      if [[ "$TARGETS" == *n* ]]; then
         printf '%s' "$TEXT"
      elif [[ "$TARGETS" == *w* ]]; then
         printf '%s\n' "$TEXT"
      else
         printf '%s\n' "$TEXT" | fmt -w 80
      fi
   fi
   if [[ "$TARGETS" == *t* ]]; then
      if [[ "$TARGETS" == *n* ]]; then
         printf '%s' "$TEXT" >> "$LOG_TXT"
      else
         printf '%s\n' "$TEXT" >> "$LOG_TXT"
      fi
   fi
   if [[ "$TARGETS" == *r* ]]; then
      if [[ "$TARGETS" == *n* ]]; then
         printf '%s\n' "$TEXT" >> "$LOG_RTF"
      else
         # A trailing backslash is RTF's line break
         printf '%s\\\n' "$TEXT" >> "$LOG_RTF"
      fi
   fi
   if [[ "$TARGETS" == *h* ]]; then
      if [[ "$TARGETS" == *n* ]]; then
         printf '%s\n' "$TEXT" >> "$LOG_HTM"
      else
         printf '%s<br />\n' "$TEXT" >> "$LOG_HTM"
      fi
   fi
}

# Print the noun in parameter 1, pluralized if the number in parameter 2 is not 1. A noun ending in
# "x" takes "es" ("suffix" -> "suffixes"); every other noun takes "s". The noun is quoted throughout
# so that it is printed exactly as given rather than being word-split or glob-expanded first.
function pluralCheckNoun()
{
   if [ "$2" -ne 1 ]; then
      if [[ "$1" =~ x$ ]]; then
         printf '%s\n' "${1}es"
      else
         printf '%s\n' "${1}s"
      fi
   else
      printf '%s\n' "$1"
   fi
}

# Print the past-tense verb agreeing with the count in parameter 1: "was" for exactly 1, "were" for
# any other number
function pluralCheckWas()
{
   local VERB="was"
   if [ $1 -ne 1 ]; then
      VERB="were"
   fi
   echo "$VERB"
}

# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
# reports being saved to disk have already been closed.
#
# The file at UPLOAD_INFO is expected to contain one line for each of the markers "user:", "pw:",
# "port:" and "path:", with the value following the marker directly. Only the first line that STARTS
# with a marker is used, so a marker that merely appears in the middle of some other line cannot
# inject a second value. The values are handed to the 'expect' script next to this script as
# individually quoted arguments, so values containing spaces or glob characters arrive intact and an
# empty value cannot shift the arguments after it out of position.
# NOTE(review): the password is passed to 'expect' on its command line, where other local users may
# be able to see it in the process list -- confirm this is acceptable for your machine.
function uploadReport()
{
   valPrint c "Uploading HTML report..."

   local SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
   local SFTP_USER_NAME_MARKER="user:"
   local SFTP_PASSWORD_MARKER="pw:"
   local SFTP_PORT_MARKER="port:"
   local SFTP_PATH_MARKER="path:"
   local SFTP_USER_NAME SFTP_PASSWORD SFTP_PORT SFTP_PATH

   SFTP_USER_NAME=$(grep --max-count=1 -- "^$SFTP_USER_NAME_MARKER" "$UPLOAD_INFO")
   SFTP_USER_NAME=${SFTP_USER_NAME#"$SFTP_USER_NAME_MARKER"}
   SFTP_PASSWORD=$(grep --max-count=1 -- "^$SFTP_PASSWORD_MARKER" "$UPLOAD_INFO")
   SFTP_PASSWORD=${SFTP_PASSWORD#"$SFTP_PASSWORD_MARKER"}
   SFTP_PORT=$(grep --max-count=1 -- "^$SFTP_PORT_MARKER" "$UPLOAD_INFO")
   SFTP_PORT=${SFTP_PORT#"$SFTP_PORT_MARKER"}
   SFTP_PATH=$(grep --max-count=1 -- "^$SFTP_PATH_MARKER" "$UPLOAD_INFO")
   SFTP_PATH=${SFTP_PATH#"$SFTP_PATH_MARKER"}

   expect "$SCRIPT_PATH" "$LOG_HTM" "$SFTP_USER_NAME" "$SFTP_PASSWORD" "$SFTP_PORT" "$SFTP_PATH" "$LOG_NAME.htm"

   valPrint c "Report was uploaded, unless an error message appears above."
}

# Prints session summary when script is done, closes the markup of the RTF and HTML logs, uploads
# the report if that was requested and the run was not cut short, and exits with status 0. This is
# called at the end of the links list, when the --end-url limit is reached, when a download or the
# header-row check fails, and (through the trap installed below) when the user presses Ctrl-C.
# Because some of those calls happen before any link has been counted or processed, the figures
# printed in the summary are clamped so that they are never blank or negative.
function wrapupAndExit()
{
   # Get off progress line on console, drop down a line from last link in log, and close HTML table
   valPrint ctr ""
   valPrint h "</table><br />"

   # If we didn't finish processing the last URL, then the iterator is one too high
   if [ "$FINISHED_LIST" != "yes" ]; then
      let LINK_NUM-=1
      if [ "$FINISHED_LIST" == "no" ]; then
         valPrint ctrh "The session was canceled by the user."
      fi
   fi

   # LINK_COUNT only exists once the links file has been downloaded and counted, and it is -1 for an
   # empty file; report such cases as zero links
   LINK_COUNT=${LINK_COUNT:-0}
   if [ "$LINK_COUNT" -lt 0 ]; then
      LINK_COUNT=0
   fi

   # Output results of session and close the log file's markup
   LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
   if [ "$LINKS_PROCESSED" -lt 0 ]; then
      LINKS_PROCESSED=0 # we stopped before reaching the first link we were asked to process
   fi
   LINKS_SKIPPED=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
   LINKS_CHECKED=$((LINKS_PROCESSED-LINKS_SKIPPED))
   valPrint ct "Summary:"
   valPrint r "\b1 Summary \b0"
   valPrint hn "<h3><span id=\"summary\">Summary</span></h3>"
   valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT)."
   valPrint ctrh "I skipped $LINKS_SKIPPED $(pluralCheckNoun link $LINKS_SKIPPED), and found $FILE_LINKS $(pluralCheckNoun file $FILE_LINKS) and $PAGE_LINKS $(pluralCheckNoun page $PAGE_LINKS)."
   if [ "$LINKS_SKIPPED" -gt 0 ]; then valPrint ctrh "Skip breakdown: "; fi
   if [ "$SKIP_UNK_NS" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
   if [ "$SKIP_JS_PAGE" -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE links on JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
   if [ "$SKIP_BAD_URL" -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
   if [ "$SKIP_NON_ASCII" -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
   if [ "$SKIP_UNK_SUFFIX" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
   if [ "$SKIP_UNK_CODE" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
   valPrint ctrh "Out of the $LINKS_CHECKED links checked, $OK_LINKS $(pluralCheckWas $OK_LINKS) OK and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG."
   if [ "$SKIP_EXCEPT" -gt 0 ]; then
      valPrint ctrh "$SKIP_EXCEPT/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS) went unlisted due to being found in the exceptions file."
   fi
   printRTFfooter
   printHTMfooter

   # Upload report if this was requested; a run that ended in the "no" state is not uploaded
   if [ -n "$UPLOAD_INFO" ] && [ "$FINISHED_LIST" != "no" ]; then
      uploadReport
   fi

   # Really quit now
   valPrint c "ValExtLinks says goodbye."
   exit 0
}
# Let Ctrl-C end the session with a summary and properly closed logs
trap wrapupAndExit INT


### INITIALIZATION ###
# Print opening message to console and log files
valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
printTXTheader
printRTFheader
printHTMheader

# Attempt to download file at LINKS_URL, then check that it succeeded. The URL is quoted so that the
# shell cannot glob-expand characters such as '?' in it, and --fail makes 'curl' report an HTTP error
# instead of saving the server's error page as if it were the requested file.
valPrint ctrh "Downloading list of external links from $LINKS_URL."
LINKS_FILE_NAME=${LINKS_URL##*/} # everything after the last slash
LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
if ! curl --silent --fail -o "$LINKS_FILE" "$LINKS_URL" || [ ! -f "$LINKS_FILE" ]; then
   echo "The download of $LINKS_URL appears to have failed. Aborting."
   wrapupAndExit
fi

# Attempt to download file at EXCEPT_URL, then check that it succeeded
if [ -n "$EXCEPT_URL" ]; then
   valPrint ctrh "Downloading list of NG exceptions from $EXCEPT_URL."
   EXCEPT_FILE_NAME=${EXCEPT_URL##*/}
   EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
   if ! curl --silent --fail -o "$EXCEPT_FILE" "$EXCEPT_URL" || [ ! -f "$EXCEPT_FILE" ]; then
      echo "The download of $EXCEPT_URL appears to have failed. Aborting."
      wrapupAndExit
   fi
fi

# Feed LINKS_FILE to 'wc' on standard input because passing it by name would print the file name
# after the line count; some versions of 'wc' also pad the number with spaces, so strip those
LINK_COUNT=$(wc -l < "$LINKS_FILE" | tr -d '[:space:]')

# Number of URLs is number of lines minus one (first line is column header row for the CSV)
let LINK_COUNT-=1

# Calculate number of URLs to consider
if [ "$URL_LIMIT" -ne 0 ] && [ "$URL_LIMIT" -lt "$LINK_COUNT" ]; then
   valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((URL_LIMIT-URL_START+1)) of them, from link $URL_START to $URL_LIMIT."
elif [ "$URL_START" -ne 1 ]; then
   valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((LINK_COUNT-URL_START+1)) of them, from link $URL_START to $LINK_COUNT."
else
   valPrint ctrh "Found $LINK_COUNT links to process. I will be considering all of them."
fi

# Print settings to console and log. The message starts out describing every feature as enabled; each
# bare word below is one array element and each quoted phrase is one element, and the indices used
# afterwards (10, 22, 26 and 40) are the positions of the four quoted phrases, which are swapped out
# or blanked according to the settings. Keep the indices in step with the text if you reword it.
declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not print NG links that are listed in the exceptions file.")
if [ "$TAKE_PAGE_SHOT" -eq 0 ]; then SETTINGS_MSG[10]="but will not"; fi
if [ "$RECORD_OK_LINKS" -eq 0 ]; then SETTINGS_MSG[22]="not"; fi
if [ "$SUGGEST_SNAPSHOTS" -eq 0 ]; then SETTINGS_MSG[26]="will not"; fi
if [ -z "$EXCEPT_URL" ] || [ "$RECORD_OK_LINKS" -eq 1 ]; then SETTINGS_MSG[40]=""; fi
SETTINGS_STR=${SETTINGS_MSG[@]}
valPrint ctrh "$SETTINGS_STR"
valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
valPrint trh ""

# Print legend to logs
valPrint t "Legend:"
valPrint r "\b1 Legend \b0"
valPrint hn "<h3>Legend</h3>"
valPrint trh "OK = URL seems to be working."
valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it. False negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to Iritscen."
valPrint trh "IW = URL is working but should be converted to interwiki link using the suggested markup."
valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $HTTP_CODES)."
valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$HTTP_CODES\"}}{\fldrslt here}} for code reference)."
valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$HTTP_CODES\" target=\"_blank\">here</a> for code reference)."
valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $CURL_CODES)."
valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$CURL_CODES\"}}{\fldrslt here}} for code reference)."
valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$CURL_CODES\" target=\"_blank\">here</a> for code reference)."
valPrint trh "IA suggests = Last available snapshot suggested by the Internet Archive."
valPrint trh "Try browsing = The Archive occasionally fails to return a snapshot URL even when one exists, so you will need to check for a snapshot manually using the Wayback Machine before concluding that a site has not been archived."
valPrint trh ""


### MAIN LOOP ###
# Process each line of the .csv in LINKS_FILE. Each line has the form "namespace ID,page name,URL".
# The file is read with 'read -r' so that every line is taken literally (a "for ... in `cat`" loop
# would expose characters such as '?' and '*' in the URLs to pathname expansion), and on its own file
# descriptor (3) so that the programs run inside the loop cannot consume the list through their
# standard input. The "|| [ -n ... ]" clause picks up a final line that lacks a trailing newline.

# A literal tab, named so that the tabs used for indentation in the RTF log are visible in this code
RTF_TAB=$'\t'

while IFS= read -r LINE <&3 || [ -n "$LINE" ]; do
   # Blank lines carry no link and are not counted
   if [ -z "$LINE" ]; then
      continue
   fi

   let LINK_NUM+=1

   # First line is the column header row for the CSV, so let's verify that the format hasn't changed
   if [ "$SKIPPED_HEADER_ROW" -eq 0 ]; then
      if [ "$LINE" == "namespace,title,target" ]; then
         SKIPPED_HEADER_ROW=1
         LINK_NUM=0 # this line is not a link, so reset the link counter
         valPrint hn "<table>"
         continue
      else
         valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
         wrapupAndExit
      fi
   fi

   # Skip this link if we are not at URL_START yet
   if [ "$LINK_NUM" -lt "$URL_START" ]; then
      continue
   fi

   # Stop if we are at the limit declared for testing purposes
   if [ "$URL_LIMIT" -gt 0 ] && [ "$LINK_NUM" -gt "$URL_LIMIT" ]; then
      FINISHED_LIST="limit"
      wrapupAndExit
   fi

   # Print progress to screen
   if [ "$LINK_NUM" -gt 1 ]; then
      printf "\e[1A\n" # erase previous progress message so that new one appears in its place
   fi
   valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."

   # The number of the namespace is the element before the first comma on the line
   NS_ID=${LINE%%,*}

   # The name of the page is everything between the namespace ID and the next comma on the line (commas
   # in page names will break this). This is extracted before the namespace lookup so that the "unknown
   # namespace" message below names the page of THIS line, not the page of the previous line.
   PAGE_NAME=${LINE#"$NS_ID",}
   PAGE_NAME=${PAGE_NAME%%,*}

   # Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES. The IDs are
   # compared as text so that a malformed, non-numeric ID is simply not found instead of making the
   # shell print arithmetic errors.
   NS_NAME=""
   for ((a = 0; a < ${#NS_IDS[@]}; ++a)); do
      if [ "$NS_ID" == "${NS_IDS[$a]}" ]; then
         NS_NAME="${NS_NAMES[$a]}"
         break
      fi
   done
   if [ -z "$NS_NAME" ]; then
      valPrint tr "Skipping URL found on page $PAGE_NAME because I could not find a name for namespace ID $NS_ID."
      let SKIP_UNK_NS+=1
      continue
   fi

   # We don't want to consider wiki pages ending in .js, as the parser cannot reliably isolate URLS in
   # JavaScript code, so it will return erroneous links
   PAGE_NAME_SUFFIX=${PAGE_NAME##*.} # everything after the last period
   if [ "$PAGE_NAME_SUFFIX" == "js" ]; then
      valPrint tr "Skipping URL found on JavaScript page $PAGE_NAME."
      let SKIP_JS_PAGE+=1
      continue
   fi

   # The URL being linked to is everything after the previous two fields (this allows commas to be in
   # the URLs, but a comma in the previous field, the page name, will break this). The prefix is quoted
   # so that punctuation in the page name is matched literally rather than as a glob pattern.
   URL=${LINE#"$NS_ID,$PAGE_NAME,"}

   # Scan for illegal characters
   if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
      valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because it contains characters illegal in a URL."
      let SKIP_BAD_URL+=1
      continue
   fi

   # Now we need to know if the URL is for a file or a web page. First step is to determine if the
   # URL ends in a suffix
   HAS_SUFFIX=0

   # If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
   SAN_URL=${URL%%\?*}

   # If the URL ends in something like "#section_15", strip everything from the '#' onward
   SAN_URL=${SAN_URL%%\#*}

   # This script does not attempt to classify URLs containing non-ASCII characters, so skip this URL
   # and make user check it
   if [[ $SAN_URL == *[![:ascii:]]* ]]; then
      valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I cannot handle non-ASCII characters."
      let SKIP_NON_ASCII+=1
      continue
   fi

   # Isolate the characters after the last period and after the last slash
   POST_DOT=${SAN_URL##*.}
   POST_SLASH=${SAN_URL##*/}

   # If the last period comes after the last slash, then the URL ends in a suffix
   if [ "${#POST_DOT}" -lt "${#POST_SLASH}" ]; then
      HAS_SUFFIX=1
   else
      HAS_SUFFIX=0
   fi

   # Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
   # known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES
   IS_FILE=-1
   if [ "$HAS_SUFFIX" -eq 0 ]; then
      IS_FILE=0
   else
      # Turn off case sensitivity while we compare suffixes
      shopt -s nocasematch

      # Special case: URLs ending in something like "/productID.304297400" are pages, not files, so if
      # the URL's suffix is all numbers, we are looking at the end of a web page URL
      if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
         IS_FILE=0
      fi

      # If we did not identify this URL as a web page above, we need to compare the suffix against known
      # file extensions
      if [ "$IS_FILE" -eq -1 ]; then
         for EXTENSION in "${HTTP_FILES[@]}"; do
            if [[ $EXTENSION == $POST_DOT ]]; then # double brackets respect the nocasematch setting
               IS_FILE=1
               break
            fi
         done
      fi

      # If we did not identify this URL as a file above, we need to compare the suffix against known
      # pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
      # needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
      if [ "$IS_FILE" -eq -1 ]; then
         for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
            if [[ $PAGE_OR_TLD == $POST_DOT ]]; then
               IS_FILE=0
               break
            fi
         done
      fi

      # Turn case sensitivity back on in Bash
      shopt -u nocasematch
   fi

   # If this suffix escaped identification as either a file, page or TLD, inform the user
   STR_TYPE=""
   if [ "$IS_FILE" -eq -1 ]; then
      valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown URL ending $POST_DOT. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
      let SKIP_UNK_SUFFIX+=1
      continue
   elif [ "$IS_FILE" -eq 1 ]; then
      STR_TYPE="file"
      let FILE_LINKS+=1
   elif [ "$IS_FILE" -eq 0 ]; then
      STR_TYPE="page"
      let PAGE_LINKS+=1
   fi

   # Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
   # issue with sites that require HTTPS. AGENT is passed in double quotes so that the browser
   # identification string itself is sent (single quotes would send the literal text "$AGENT"), and
   # the URL is quoted so that the shell passes it to 'curl' untouched.
   CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{http_code}\n' "$URL")
   CURL_ERR=$?
   CURL_RESULT=$CURL_CODE

   # Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
   if [ "$CURL_CODE" == "000" ]; then
      CURL_RESULT="$CURL_RESULT-$CURL_ERR"
   fi

   # Determine if this code is in our "OK" list
   STATUS="??"
   INTERWIKI_INDEX=-1
   for CODE in "${OK_CODES[@]}"; do
      if [[ $CODE == $CURL_CODE ]]; then
         let OK_LINKS+=1

         # Determine if this is a link to a domain that we have an interwiki prefix for
         for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
            if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]]; then
               STATUS="IW"
               INTERWIKI_INDEX=$i
               break
            fi
         done

         # If this link is OK and no interwiki advisory is needed, just mark as "OK"
         if [ "$INTERWIKI_INDEX" == -1 ]; then
            STATUS="OK"
         fi
         break
      fi
   done

   # If we didn't get a match with the "OK" codes, check it against the "NG" codes
   if [ "$STATUS" == "??" ]; then
      for CODE in "${NG_CODES[@]}"; do
         if [[ $CODE == $CURL_CODE ]]; then
            STATUS="NG"
            let NG_LINKS+=1
            break
         fi
      done
   fi

   # If we didn't match a known status code, advise the reader
   if [ "$STATUS" == "??" ]; then
      valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown return code $CURL_CODE."
      let SKIP_UNK_CODE+=1
      continue
   fi

   # If link is "NG" and there is an exceptions file, compare URL against the list before logging it.
   # The URL is searched for as a fixed string (-F), since the periods, question marks and other
   # punctuation in a URL would otherwise be interpreted as regular expression operators. The
   # expected result code is the text before the first comma on the matching line.
   if [ "$STATUS" == "NG" ] && [ -n "$EXCEPT_URL" ]; then
      GREP_RESULT=$(grep -F --max-count=1 -- "$URL" "$EXCEPT_FILE")
      EXCEPT_CODE=${GREP_RESULT%%,*}
      if [ "$EXCEPT_CODE" == "$CURL_RESULT" ]; then
         valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because its status code, $CURL_RESULT, is listed in the exceptions file."
         let SKIP_EXCEPT+=1
         continue
      fi
   fi

   # If appropriate, record this link to the log, with clickable URLs when possible
   if [ "$STATUS" != "OK" ] || [ "$RECORD_OK_LINKS" -eq 1 ]; then
      FULL_PAGE_PATH=http://$WIKI_PATH/$NS_NAME:$PAGE_NAME
      LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
      # Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it explicitly breaks the link
      if [ "$NS_ID" -eq 0 ]; then
         FULL_PAGE_PATH=http://$WIKI_PATH/$PAGE_NAME
         LOCAL_PAGE_PATH=$PAGE_NAME
      fi

      # Stupid hack since the text "IW" is narrower than "OK" or "NG" and it takes an extra tab to get
      # to the desired level of indentation in the RTF log
      RTF_TABS="$RTF_TAB"
      if [ "$STATUS" == "IW" ]; then
         RTF_TABS="$RTF_TAB$RTF_TAB"
      fi

      # Record link and its wiki page in TXT, RTF, and HTML markup
      valPrint t "$STATUS ($CURL_RESULT) $STR_TYPE $URL"
      valPrint t "  linked from $FULL_PAGE_PATH"
      valPrint r "$STATUS ($CURL_RESULT)$RTF_TABS$STR_TYPE${RTF_TAB}{\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
      valPrint r "${RTF_TAB}${RTF_TAB}linked from${RTF_TAB}{\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
      valPrint hn "<tr><td style=\"white-space:nowrap\">$STATUS ($CURL_RESULT)</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
      valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"

      # Notify reader if we can use an interwiki prefix for this URL
      if [ "$STATUS" == "IW" ]; then
         valPrint t "  You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_SLASH]]"
         valPrint r "${RTF_TAB}${RTF_TAB}You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_SLASH]]"
         valPrint hn "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_SLASH]]</td></tr>"
      fi

      # Query Internet Archive for latest "OK" snapshot for "NG" page
      if [ "$STATUS" == "NG" ] && [ "$SUGGEST_SNAPSHOTS" -eq 1 ]; then
         ARCHIVE_QUERY=$(curl --silent --max-time 10 "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")

         # Isolate "url" property in response and log it if a "closest" snapshot was received...
         if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
            SNAPSHOT_URL=${ARCHIVE_QUERY##*\"url\": \"}
            SNAPSHOT_URL=${SNAPSHOT_URL%\", \"timestamp*}
            valPrint t "  IA suggests $SNAPSHOT_URL"
            valPrint r "${RTF_TAB}${RTF_TAB}IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
            valPrint hn "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
         else # ...otherwise give generic Wayback Machine link for this URL
            valPrint t " Try browsing $ARCHIVE_GENERIC/$URL"
            valPrint r "${RTF_TAB}${RTF_TAB}Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
            valPrint hn "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
         fi
      fi
   fi

   # If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
   if [ "$IS_FILE" -eq 0 ] && [ "$TAKE_PAGE_SHOT" -eq 1 ] && [ "$STATUS" == "OK" ]; then
      # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
      SHOT_NAME=$(echo "$URL" | sed 's/https*\:\/\///' | sed 'y/:\//__/')
      SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"

      # Don't take screenshot if we already encountered this page and screenshotted it
      if [ ! -f "$SHOT_FILE" ]; then
         # Chrome is expected to leave its screenshot in the working directory under the name in
         # CHROME_SCREENSHOT, from where it is moved into the Screenshots folder
         "$CHROME" --headless --disable-gpu --screenshot --window-size=1500,900 "$URL" > /dev/null 2>&1
         if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
            mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
         else
            valPrint trh "Screenshot of URL $URL seems to have failed!"
         fi
      else
         valPrint trh "Skipping screenshot of URL $URL because $SHOT_FILE already exists."
      fi
   fi
done 3< "$LINKS_FILE"
FINISHED_LIST="yes"
wrapupAndExit