#!/bin/bash

# Validate External Links by Iritscen (iritscen@yahoo.com)
#
# Validates a list of external links in CSV format. The resulting logs are produced in three formats:
# - TXT (for easy diffing with an earlier log)
# - RTF (for reading as a local file with clickable links)
# - HTML (for reading as a web page)
# Call script with "--help" argument for documentation. Also see Read Me.rtf for critical notes.
#
# Recommended rule:
# |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|
#
# Table of contents (sections of script in order of appearance, not execution):
# • Globals
# • Help Output
# • Setup
# • Utility Functions
# • Summary Output
# • Initialization
#   • Data Sourcing
#   • Config Output
#   • Legend Output
# • Main Loop

# Split words on newlines only, so unquoted expansions are not broken apart at spaces
IFS=$'\n'

### GLOBALS ###
# Settings -- the defaults below are replaced by the arguments passed in to the script

# Where the input comes from and where the output goes
LINKS_URL=''           # download external link CSV from this location (can use "file://" protocol)
EXCEPT_URL=''          # location of wiki page with a list of exceptions for NG results
OUTPUT_DIR=''          # place reports and all other output in a folder inside this existing folder
UPLOAD_INFO=''         # path to a file on your hard drive with the login info needed to upload a report

# Which findings are written to the report
RECORD_OK_LINKS=0      # record response code to the log even when it's a value in OK_CODES
ONLY_200_OK=0          # only treat code 200 as "OK" and not any other code in OK_CODES
SHOW_SLASH=0           # record issue when a slash is added to the end of a URL
SHOW_HTTPS=0           # record issue when "http" is upgraded to "https"
SHOW_YT_RD=0           # record redirection for a youtu.be URL expanding to the full URL
CHECK_ARCHIVE_LINKS=0  # check URLs on archive.org and archive.is

# Internet Archive suggestions and screenshots
SUGGEST_SNAPSHOTS_NG=0 # query the Internet Archive for a possible snapshot URL for each NG page
SUGGEST_SNAPSHOTS_OK=0 # query the Internet Archive for an existing snapshot of each OK page
TAKE_PAGE_SHOT=0       # take a screenshot of each OK page
CHROME_PATH=''         # path to a copy of Google Chrome that has the command-line screenshot feature

# Query timing and the range of links to process
TIMEOUT=10             # time to wait for a response when querying a site
URL_START=1            # start at this URL in LINKS_FILE
URL_LIMIT=0            # if non-zero, stop at this URL in LINKS_FILE

# Fixed strings -- see the occurrences of these variables to learn their purpose
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
ARCHIVE_API="http://archive.org/wayback/available"
ARCHIVE_GENERIC="https://web.archive.org/web/*"
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
CHROME_SCREENSHOT="screenshot.png"
EXCEPT_FILE_NAME="exceptions.txt"
EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
WIKI_CURL="https://wiki.oni2.net/Validate_External_Links/Curl_codes"
WIKI_HTTP="https://wiki.oni2.net/Validate_External_Links/HTTP_codes"
WIKI_MAIN="https://wiki.oni2.net/Validate_External_Links"
WIKI_ME="http://iritscen.oni2.net"
# Directory holding this script. The path is quoted so glob characters in it are left alone, and the
# output of 'cd' is discarded because 'cd' prints the directory when it resolves through CDPATH, which
# would otherwise end up in THIS_DIR as an extra line.
THIS_DIR=$(cd "$(dirname "$0")" > /dev/null; pwd)
# Directory the script was launched from
WORKING_DIR=$(pwd)
WIKI_PATH="wiki.oni2.net"

# Parallel arrays: the ID of each of OniGalore's current namespaces, and its name at the same index
declare -a NS_IDS=(
   -2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
   100 101 102 103 104 105
   108 109 110 111
)
declare -a NS_NAMES=(
   "Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk"
   "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk"
   "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk"
   "Oni2" "Oni2_talk" "XML" "XML_talk"
)

# URL suffixes that represent files, and URL suffixes that represent pages (or top-level domains).
# This distinction decides whether the script tries to take a screenshot of the URL (when screenshots
# are requested).
declare -a HTTP_FILES=(
   3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg
   last32 log m4a mhm mov mp3 mp4 mpg oni ONWC pdf png psd py rar tga tif TRMA txt vbs wav wmv
   xaf xcf xlsx xml zip
)
declare -a HTTP_TLDS_AND_PAGES=(
   abstract action amp ars asp aspx cfm cgi com css de do full htm html htmldem it js jsp less
   net org pgi php php3 phtml pl ru shtml stm uk x
)

# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
# are NG (no good). Pages that return OK codes will be screenshotted when screenshots are asked for.
# Remember to update http_codes.txt if you add a new code.
declare -a OK_CODES=(200 202 204 401 402 405 406 418 501)
declare -a RD_CODES=(301 302 303 307 308)
declare -a NG_CODES=(000 400 403 404 410 429 500 502 503 504 520 530)

# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
# transcluded text, and if the transclusion fails, then the braces show up in the URL
ILLEGAL_CHARS="{ }"

# The shortest URL possible, used for sanity-checking some URLs: http://a.co
MIN_URL_LENGTH=11

# Parallel arrays: each prefix that can be used in place of a normal external link to some wikis and
# other sites, and the domain it stands for; based on https://wiki.oni2.net/Special:Interwiki
declare -a INTERWIKI_PREFIXES=(
   commons metawikimedia mw wikibooks wikidata wikimedia wikinews
   wikiquote wikisource wikispecies wikiversity wikivoyage wikt wp
)
declare -a INTERWIKI_DOMAINS=(
   commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org
   wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org
   wikiversity.org wikivoyage.org wiktionary.org wikipedia.org
)

# Variables for keeping track of main loop progress and findings

# Position in the list, and tallies of links by result
LINK_NUM=0
EI_LINKS=0; IW_LINKS=0; OK_LINKS=0; RD_LINKS=0; NG_LINKS=0

# Tallies of links that could not be processed
SKIP_PARSE_FAIL=0; SKIP_UNK_PROT=0; SKIP_UNK_NS=0; SKIP_JS_PAGE=0
SKIP_BAD_URL=0; SKIP_NON_ASCII=0; SKIP_UNK_SUFFIX=0; SKIP_UNK_CODE=0

# Tallies of findings left out of the report because of the exceptions list
SKIP_EXPECT_NG=0; SKIP_EXPECT_RD=0; SKIP_EXPECT_EI=0; SKIP_EXPECT_IW=0

# Tallies of trivial redirects and of archive links that were not checked
SKIP_HTTPS_UP=0; SKIP_SLASH_ADD=0; SKIP_YOUTU_BE=0
SKIP_ARCHIVES=0

# Tallies of links by kind of target
FILE_LINKS=0; PAGE_LINKS=0

# Run state
SKIPPED_HEADER_ROW=0
FINISHED_LIST="no"
START_RUN=0; END_RUN=0


### HELP OUTPUT ###
# A pseudo-man page. Here is the 80-character rule for the page text:
# 345678901234567890123456789012345678901234567890123456789012345678901234567890
# Writes the help text to stdout. The here-document delimiter is quoted so that the text is emitted
# literally, with no parameter or command expansion. The synopsis lists the options under the same
# names that the argument parser and the OPTIONS section use.
function printHelp()
{
  cat << 'EOF'

NAME
       Validate External Links

SYNOPSIS
       validate_external_links.sh --help
       validate_external_links.sh --links URL --output DIR [--exceptions URL]
          [--record-ok-links] [--only-200-ok] [--show-added-slashes]
          [--show-https-upgrades] [--show-yt-redirects] [--suggest-snapshots-ng]
          [--suggest-snapshots-ok] [--check-archive-links]
          [--take-screenshots FILE] [--timeout NUM] [--start-url NUM]
          [--end-url NUM] [--upload FILE]

DESCRIPTION
       This script parses a list of external links found in the OniGalore wiki
       (which is dumped by the Oni2.net server periodically in a particular
       format), validates them using the Unix tool 'curl', and produces a report
       of which links were "OK" (responded positively to an HTTP query), which
       were "RD" (responded with a 3xx redirect code), which could be "IW"
       (interwiki) links, which are "EI" (external internal) links and could be
       intrawiki links, and which were "NG" (no good; a negative response to the
       query). This report can then be automatically uploaded to the location of
       your choice. The script can also suggest Internet Archive snapshots for
       "NG" links, and take screenshots of "OK" links for visual verification by
       the reader that the page in question is the one intended to be displayed.

       You must pass this script the URL at which the list of links is found
       (--links) and the path where the directory of logs should be outputted
       (--output). All other arguments are optional.

OPTIONS
       --help                  Show this page.
       --links URL             (required) URL from which to download the CSV
                               file with external links. Note that this URL can
                               be a local file if you supply a file:// path.
       --output DIR            (required) Unix path to directory in which Val
                               should place its reports.
       --exceptions URL        In order to remove links from the report which
                               Val finds an issue with but which you regard as
                               OK, list those desired exceptions on a wiki page.
                               See the sample file "exceptions.pdf" for the
                               required format of the page. Note that this URL
                               can point to a local file if you supply a path
                               beginning with "file://".
       --record-ok-links       Log a link in the report even if its response
                               code is "OK".
       --only-200-ok           Only treat response code 200 as "OK". Normally
                               several additional codes are treated as "OK" (see
                               the array OK_CODES in script) because they are
                               typically not an indicator of a bad link.
       --show-added-slashes    Report on redirects that simply add a '/' to the
                               end of the URL.
       --show-https-upgrades   Report on redirects that simply upgrade a
                               "http://" URL to a "https://" URL.
       --show-yt-redirects     Report on redirects that expand a youtu.be URL.
       --suggest-snapshots-ng  Query the Internet Archive for a possible
                               snapshot URL for each "NG" page.
       --suggest-snapshots-ok  Query the Internet Archive for a snapshot of each
                               "OK" page just to make sure it's available. Note
                               that this will add a tremendous amount of time to
                               the script execution because there is a rate
                               limit to the Archive API. Note that this option
                               does nothing unless you also use the
                               --record-ok-links argument.
       --check-archive-links   Check links that are already pointing to a page
                               on the Internet Archive or archive.is (AKA
                               archive.today). In theory these links should be
                               totally stable and not need validation.
       --take-screenshots FILE Call the Google Chrome binary at this path to
                               take screenshots of each "OK" page.
       --timeout NUM           Wait this many seconds for a site to respond. The
                               default is 10. Important note: Val will attempt
                               to reach each URL three times, so the time taken
                               to ping an unresponsive site will be three times
                               this setting.
       --start-url NUM         Start at this link in the links CSV file.
       --end-url NUM           Stop at this link in the links CSV file.
       --upload FILE           Upload report using the credentials and path
                               given in this local text file. See sftp_login.txt
                               for template.

BUGS
       The script cannot properly parse any line in the external links file
       which contains a comma in the name of the wiki page containing a link.
       Commas in the link itself are not an issue.
EOF
}


### SETUP ###
# Show the help page in the pager and quit when the script is run without any arguments or when the
# first argument asks for help
if [[ $# -eq 0 || "$1" == "--help" ]]; then
  printHelp | less
  exit 0
fi

# Parses the arguments it is given into the settings globals, consuming them one option at a time.
# An option that takes a value must be followed by one: otherwise 'shift 2' would fail without
# shifting anything and the loop would never end, so that case is reported and the script exits with
# status 1, like any other invalid argument.
function parseArgs()
{
   while (( $# )); do
      case "$1" in
         --links | --exceptions | --output | --take-screenshots | --timeout | --start-url | --end-url | --upload )
            if [ "$#" -lt 2 ]; then
               echo "Argument '$1' requires a value. Aborting."
               exit 1
            fi;;
      esac

      case "$1" in
         --links )                LINKS_URL="$2";                     shift 2;;
         --exceptions )           EXCEPT_URL="$2";                    shift 2;;
         --output )               OUTPUT_DIR="$2";                    shift 2;;
         --record-ok-links )      RECORD_OK_LINKS=1;                  shift;;
         --only-200-ok )          ONLY_200_OK=1;                      shift;;
         --show-added-slashes )   SHOW_SLASH=1;                       shift;;
         --show-https-upgrades )  SHOW_HTTPS=1;                       shift;;
         --show-yt-redirects )    SHOW_YT_RD=1;                       shift;;
         --suggest-snapshots-ng ) SUGGEST_SNAPSHOTS_NG=1;             shift;;
         --suggest-snapshots-ok ) SUGGEST_SNAPSHOTS_OK=1;             shift;;
         --check-archive-links )  CHECK_ARCHIVE_LINKS=1;              shift;;
         --take-screenshots )     TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
         --timeout )              TIMEOUT="$2";                       shift 2;;
         --start-url )            URL_START="$2";                     shift 2;;
         --end-url )              URL_LIMIT="$2";                     shift 2;;
         --upload )               UPLOAD_INFO="$2";                   shift 2;;
         * )                      echo "Invalid argument '$1' detected. Aborting."; exit 1;;
      esac
   done
}

# Parse the script's arguments, then drop them all so that "$@" is left empty, as it was after the
# in-place parsing loop that this function replaces
parseArgs "$@"
shift "$#"

# If the required arguments were not supplied, print an error and quit. The variables are quoted so
# that the tests see exactly one operand even when a value is empty or contains glob characters.
if [ -z "$LINKS_URL" ] || [ -z "$OUTPUT_DIR" ]; then
   echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
   exit 2
fi

# If user wants screenshots, make sure path to Chrome was passed in and is valid
if [ "$TAKE_PAGE_SHOT" -eq 1 ]; then
   if [ ! -f "$CHROME_PATH" ]; then
      echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
      exit 3
   fi
fi

# Check that UPLOAD_INFO exists, if this argument was supplied
if [ -n "$UPLOAD_INFO" ] && [ ! -f "$UPLOAD_INFO" ]; then
   echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
   exit 4
fi

# Check that OUTPUT_DIR is a directory
if [ ! -d "$OUTPUT_DIR" ]; then
   echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
   exit 5
fi

# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
# NICE_TIME is meant to match the format of LINKS_DATE, which is taken from a Last-Modified header;
# that format uses a zero-padded two-digit hour, hence %H rather than the space-padded %k
NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %H:%M:%S GMT")
OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
SHOT_PATH="$OUTPUT_PATH/Screenshots"
LOG_NAME="ValExtLinks report"
LOG_NAME_TXT="$LOG_NAME.txt"
LOG_NAME_RTF="$LOG_NAME.rtf"
LOG_NAME_HTM="$LOG_NAME.htm"
LOG_PATH="$OUTPUT_PATH/$LOG_NAME"
LOG_PATH_TXT="$LOG_PATH.txt"
LOG_PATH_RTF="$LOG_PATH.rtf"
LOG_PATH_HTM="$LOG_PATH.htm"
mkdir "$OUTPUT_PATH"

# Check that 'mkdir' succeeded before trying to create anything inside the new folder
if [ ! -d "$OUTPUT_PATH" ]; then
   echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_DIR. Aborting."
   exit 6
fi

# The screenshot folder is only needed when screenshots were requested
if [ "$TAKE_PAGE_SHOT" -eq 1 ]; then
   mkdir "$SHOT_PATH"
fi

# Get date on the file at LINKS_URL, for printing to the log. Header names are matched without regard
# to case because they are not guaranteed to arrive capitalized as "Last-Modified", and the carriage
# return that ends each header line is removed so it does not leak into the reports.
LINKS_DATE=$(curl --silent --head "$LINKS_URL" | grep -i -m 1 "^Last-Modified:" | tr -d '\r')
if [ -z "$LINKS_DATE" ]; then
   echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
   exit 7
fi
# Keep only the header's value: drop the name and colon, then any whitespace before the date
LINKS_DATE=${LINKS_DATE#*:}
LINKS_DATE=${LINKS_DATE#"${LINKS_DATE%%[![:space:]]*}"}


### UTILITY FUNCTIONS ###
# Writes the plain-text report header to the TXT log file, one line per valPrint call: title,
# generation time, date of the source data, author contact, and a closing blank line
function printTXTheader()
{
   local header_line
   for header_line in \
      "Validate External Links report" \
      "generated $NICE_TIME" \
      "from data of $LINKS_DATE" \
      "script by Iritscen (contact: $WIKI_ME)" \
      ""
   do
      valPrint t "$header_line"
   done
}

# Writes the opening of the RTF log file in a single valPrint call: the RTF prolog, then a centered
# title block (report name, generation time, date of the source data, contact link), then the
# paragraph settings that return the text to left-justified
function printRTFheader()
{
   local rtf_header="{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww12600\viewh12100\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0

\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
generated $NICE_TIME\\
from data of $LINKS_DATE\\
script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$WIKI_ME\"}}{\fldrslt contact}})
\\
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
\cf0 "

   valPrint r "$rtf_header"
}

# Ends the RTF log file by writing the brace that closes its RTF markup
function printRTFfooter()
{
   valPrint r '}'
}

# Writes the opening of the HTML log file in a single valPrint call: the document head, then headings
# with the report name, generation time, date of the source data, and a contact link
function printHTMheader()
{
   local html_header="<html>
<head>
<title>Validate External Links report</title>
</head>
<body>
<h2>Validate External Links report</h2>
<h3>generated $NICE_TIME<br />
from data of $LINKS_DATE<br />
script by Iritscen (<a href=\"$WIKI_ME\" target=\"_blank\">contact</a>)</h3>"

   valPrint h "$html_header"
}

# Ends the HTML log file by writing the tags that close its body and document
function printHTMfooter()
{
   valPrint h $'</body>\n</html>'
}

# The central logging function. The first parameter is a string composed of one or more characters that
# indicate which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
# 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 's' means "Print an
# extra newline at the end." 'w' means "Don't pass console output through 'fmt'" ("fmt" fits the output
# to an 80-column CLI but can break special formatting and the 'n' option).
# The second parameter is the message. It is always written verbatim: 'printf' is used instead of
# 'echo' so that backslash sequences inside the message are not interpreted and a message that looks
# like an echo option (such as "-n") is not swallowed.
# In the RTF and HTML logs a line break in the file is still written for 'n'; only the visible break
# markup (\line or <br />) is left out. 's' appends a second \line in RTF and an empty table row in HTML.
function valPrint()
{
   local flags=$1
   local msg=$2

   if [[ $flags == *c* ]]; then
      if [[ $flags == *n* ]]; then
         printf '%s' "$msg"
      elif [[ $flags == *w* ]]; then
         printf '%s\n' "$msg"
      elif [[ $flags == *s* ]]; then
         printf '%s\n\n' "$msg"
      else
         printf '%s\n' "$msg" | fmt -w 80
      fi
   fi
   if [[ $flags == *t* ]]; then
      if [[ $flags == *n* ]]; then
         printf '%s' "$msg" >> "$LOG_PATH_TXT"
      elif [[ $flags == *s* ]]; then
         printf '%s\n\n' "$msg" >> "$LOG_PATH_TXT"
      else
         printf '%s\n' "$msg" >> "$LOG_PATH_TXT"
      fi
   fi
   if [[ $flags == *r* ]]; then
      if [[ $flags == *n* ]]; then
         printf '%s\n' "$msg" >> "$LOG_PATH_RTF"
      elif [[ $flags == *s* ]]; then
         printf '%s\\line\\line\n' "$msg" >> "$LOG_PATH_RTF"
      else
         printf '%s\\line\n' "$msg" >> "$LOG_PATH_RTF"
      fi
   fi
   if [[ $flags == *h* ]]; then
      # Unlike the other outputs, 's' takes precedence over 'n' here
      if [[ $flags == *s* ]]; then
         printf '%s<tr><td>&nbsp;</td></tr>\n' "$msg" >> "$LOG_PATH_HTM"
      elif [[ $flags == *n* ]]; then
         printf '%s\n' "$msg" >> "$LOG_PATH_HTM"
      else
         printf '%s<br />\n' "$msg" >> "$LOG_PATH_HTM"
      fi
   fi
}

# Prints the noun in parameter 1, pluralized unless the number in parameter 2 is 1. A noun ending in
# "x" gets "es" appended; every other noun gets "s".
function pluralCheckNoun()
{
   local noun=$1

   # Singular: print the noun untouched and stop
   if ! [ $2 -ne 1 ]; then
      echo $noun
      return
   fi

   case $noun in
      *x ) echo ${noun}es;;
      *  ) echo ${noun}s;;
   esac
}

# Prints the form of "to be" that agrees with the number in parameter 1: "is" for 1, "are" otherwise
function pluralCheckIs()
{
   local verb="is"
   if [ $1 -ne 1 ]; then
      verb="are"
   fi
   echo "$verb"
}

# Prints the past-tense verb that agrees with the number in parameter 1: "was" for 1, "were" otherwise
function pluralCheckWas()
{
   local verb="was"
   if [ $1 -ne 1 ]; then
      verb="were"
   fi
   echo "$verb"
}

# Prints the article "a " (with its trailing space) when parameter 1 is 1; prints nothing otherwise
function pluralCheckA()
{
   [ $1 -eq 1 ] || return 0
   echo "a "
}

# Prints the article "an " (with its trailing space) when parameter 1 is 1; prints nothing otherwise
function pluralCheckAn()
{
   [ $1 -eq 1 ] || return 0
   echo "an "
}

# Upload the reports using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
# reports being saved to disk have already been closed.
# The user name, password, port and path are taken from the lines of UPLOAD_INFO that contain the
# markers "user:", "pw:", "port:" and "path:", with the marker stripped from the start of the line.
# Each of the three reports (htm, rtf, txt) is then handed to the 'expect' script, and the outcome of
# each upload is printed to the console.
function uploadReport()
{
   local upload_status

   valPrint c "Uploading reports..."

   SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
   SFTP_USER_NAME_MARKER="user:"
   SFTP_PASSWORD_MARKER="pw:"
   SFTP_PORT_MARKER="port:"
   SFTP_PATH_MARKER="path:"
   SFTP_USER_NAME=$(grep -- "$SFTP_USER_NAME_MARKER" "$UPLOAD_INFO")
   SFTP_USER_NAME=${SFTP_USER_NAME#"$SFTP_USER_NAME_MARKER"}
   SFTP_PASSWORD=$(grep -- "$SFTP_PASSWORD_MARKER" "$UPLOAD_INFO")
   SFTP_PASSWORD=${SFTP_PASSWORD#"$SFTP_PASSWORD_MARKER"}
   SFTP_PORT=$(grep -- "$SFTP_PORT_MARKER" "$UPLOAD_INFO")
   SFTP_PORT=${SFTP_PORT#"$SFTP_PORT_MARKER"}
   SFTP_PATH=$(grep -- "$SFTP_PATH_MARKER" "$UPLOAD_INFO")
   SFTP_PATH=${SFTP_PATH#"$SFTP_PATH_MARKER"}

   for SUFFIX in htm rtf txt; do
      # Every argument is quoted so that glob characters in a credential are passed through untouched
      # and an empty field keeps its position instead of shifting the arguments that follow it
      expect "$SCRIPT_PATH" "$LOG_PATH.$SUFFIX" "$SFTP_USER_NAME" "$SFTP_PASSWORD" "$SFTP_PORT" "$SFTP_PATH" "$LOG_NAME.$SUFFIX"

      # Save the status right away; any later command, including the test below, overwrites $?
      upload_status=$?
      if [ "$upload_status" -ne 0 ]; then
         valPrint c "Error $upload_status occurred when attempting to upload $LOG_NAME.$SUFFIX!"
      else
         valPrint c "Report in $(echo "$SUFFIX" | tr '[:lower:]' '[:upper:]') format was uploaded."
      fi
   done
}

# Prints the session summary to the console and the logs, audits the exceptions list, closes the
# markup of the RTF and HTML logs, uploads the reports if that was requested, and exits the script
# with status 0. It is also installed (below the function) as the handler for the interrupt signal.
# FINISHED_LIST tells the function how the session ended: any value other than "yes" means the last
# URL was not finished, so LINK_NUM is stepped back by one; the value "no" is additionally reported
# as a cancellation by the user and suppresses the upload.
function wrapupAndExit()
{
   # Working variables for the exceptions audit; kept local so that the EXCEPT_URL setting is not
   # overwritten by the loop
   local i except_line except_url except_page except_code

   # Get off progress line on console, drop down a line from last link in log, and close HTML table
   valPrint ctr ""
   valPrint h "</table><br />"

   # If we didn't finish processing the last URL, then the iterator is one too high
   if [ "$FINISHED_LIST" != "yes" ]; then
      LINK_NUM=$((LINK_NUM - 1))
      if [ "$FINISHED_LIST" == "no" ]; then
         valPrint ctrh "The session was canceled by the user."
      fi
   fi

   # Generate string with elapsed time
   END_RUN=$(date +%s)
   ELAPSED=$(echo $((END_RUN - START_RUN)) | awk '{printf "%d min. %d sec. elapsed", int($1/60), int($1%60)}')

   # Do some math on results of session
   LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
   TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
   LINK_ERRORS=$((SKIP_PARSE_FAIL+SKIP_UNK_PROT+SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
   LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_RD+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
   LINK_PROBLEMS_TOTAL=$((NG_LINKS+RD_LINKS+EI_LINKS+IW_LINKS))
   LINK_PROBLEMS_NG=$((NG_LINKS-SKIP_EXPECT_NG))
   LINK_PROBLEMS_RD=$((RD_LINKS-SKIP_EXPECT_RD))
   LINK_PROBLEMS_EI=$((EI_LINKS-SKIP_EXPECT_EI))
   LINK_PROBLEMS_IW=$((IW_LINKS-SKIP_EXPECT_IW))
   LINK_PROBLEMS_NET=$((LINK_PROBLEMS_NG+LINK_PROBLEMS_RD+LINK_PROBLEMS_EI+LINK_PROBLEMS_IW))

   # Print something in the Links section if no link issues were printed
   if [ $LINK_PROBLEMS_NET -eq 0 ]; then
      valPrint h "<i>No link problems to report! See the <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for a list of links with issues that were not reported.</i>"
   fi
   if [ $LINK_PROBLEMS_TOTAL -eq 0 ]; then
      valPrint t "No link problems to report!"
      valPrint r "\i1 No link problems to report! \i0"
   fi

   ## SUMMARY OUTPUT ##
   valPrint ct "Summary ($ELAPSED):"
   valPrint r "\b1 Summary \b0 ($ELAPSED)"
   valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
   valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT) (there $(pluralCheckWas $FILE_LINKS) $FILE_LINKS file $(pluralCheckNoun link $FILE_LINKS) and $PAGE_LINKS page $(pluralCheckNoun link $PAGE_LINKS))."

   # Print processed link totals
   if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
   if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
   if [ $SKIP_ARCHIVES -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVES archive.org/archive.is $(pluralCheckNoun link $SKIP_ARCHIVES) $(pluralCheckWas $SKIP_ARCHIVES) not checked"; fi
   if [ $LINK_PROBLEMS_TOTAL -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_TOTAL processed $(pluralCheckNoun link $LINK_PROBLEMS_TOTAL) had $(pluralCheckAn $LINK_PROBLEMS_TOTAL)$(pluralCheckNoun issue $LINK_PROBLEMS_TOTAL)"; fi
   if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr "  (excepted $LINKS_EXCEPTED link $(pluralCheckNoun issue $LINKS_EXCEPTED) from report)"; valPrint h "&nbsp;&nbsp;(excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
   if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi
   if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr "  (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h "&nbsp;&nbsp;(counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi

   # Print errored link totals
   if [ $LINK_ERRORS -gt 0 ]; then
      valPrint c "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"
      valPrint h "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
      valPrint rt "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS):"
   fi
   if [ $SKIP_PARSE_FAIL -gt 0 ]; then valPrint ctrh "- $SKIP_PARSE_FAIL line-parsing $(pluralCheckNoun failure $SKIP_PARSE_FAIL)"; fi
   if [ $SKIP_UNK_PROT -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_PROT unknown $(pluralCheckNoun protocol $SKIP_UNK_PROT)"; fi
   if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
   if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
   if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
   if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
   if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
   if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi

   # Print excepted link totals
   if [ $LINKS_EXCEPTED -gt 0 ]; then
      valPrint c "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"
      valPrint h "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
      valPrint rt "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted:"
   fi
   if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
   if [ $SKIP_EXPECT_RD -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_RD/$RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
   if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
   # IW links are the ones that could be interwiki links (the EI links are the potential intrawiki ones)
   if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential interwiki $(pluralCheckNoun link $IW_LINKS)"; fi

   # Perform exceptions audit. Each exception line has the form "code,URL,page"; the URL may itself
   # contain commas, so it is taken as everything between the first and the last comma.
   EXCEPTION_ISSUES=0
   valPrint ctrh "Exceptions list audit:"
   for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
      except_line="${EXCEPT_ARRAY[$i]}"
      except_line=$(echo "$except_line" | sed 's/\&amp;/\&/g') # copied from exception-matching code

      if [ "${EXCEPT_FOUND[$i]}" -eq 0 ]; then
         # The exception's URL never came up with a problem on the page it names
         except_url="${except_line#*,}"
         except_url="${except_url%,*}"
         except_page="${except_line##*,}"
         except_page="${except_page%% *}"
         if [ "$except_page" == "*" ]; then
            valPrint tr "- The link '$except_url' did not return an error or is not on any page."
         else
            valPrint tr "- The link '$except_url' did not return an error or is not on page '$except_page'."
         fi
         EXCEPTION_ISSUES=$((EXCEPTION_ISSUES + 1))
      elif [ "${EXCEPT_USED[$i]}" -eq 0 ]; then
         # The exception's URL was found, but not with the code that the exception names
         except_url="${except_line#*,}"
         except_url="${except_url%,*}"
         except_code=${except_line%%,*}
         valPrint tr "- The link '$except_url' did not return error code $except_code."
         EXCEPTION_ISSUES=$((EXCEPTION_ISSUES + 1))
      fi
   done
   if [ $EXCEPTION_ISSUES -eq 0 ]; then
      valPrint ctrh "- No issues found."
   else
      valPrint c "$EXCEPTION_ISSUES exception list $(pluralCheckNoun issue $EXCEPTION_ISSUES) detected (see RTF or TXT report for details)."
      valPrint h "$EXCEPTION_ISSUES exception list $(pluralCheckNoun issue $EXCEPTION_ISSUES) detected (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for details)."
   fi

   # Print checked link totals
   if [ $LINK_PROBLEMS_NET -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS_NET link $(pluralCheckNoun issue $LINK_PROBLEMS_NET):"; fi
   if [ $LINK_PROBLEMS_NG -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_NG NG $(pluralCheckNoun link $LINK_PROBLEMS_NG)"; fi
   if [ $LINK_PROBLEMS_RD -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_RD $(pluralCheckNoun redirection $LINK_PROBLEMS_RD)"; fi
   if [ $LINK_PROBLEMS_EI -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_EI $(pluralCheckNoun link $LINK_PROBLEMS_EI) that could be intrawiki"; fi
   if [ $LINK_PROBLEMS_IW -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_IW $(pluralCheckNoun link $LINK_PROBLEMS_IW) that could be interwiki"; fi

   # Close the log files' markup
   valPrint trh "ValExtLinks says goodbye."
   printRTFfooter
   printHTMfooter

   # Upload report if this was requested
   if [ -n "$UPLOAD_INFO" ] && [ "$FINISHED_LIST" != "no" ]; then
      uploadReport
   fi

   # Really quit now
   valPrint c "ValExtLinks says goodbye."
   exit 0
}
# Print the summary and exit cleanly when the user interrupts the script
trap wrapupAndExit INT


### INITIALIZATION ###
# Print opening message to console and log files
valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
printTXTheader
printRTFheader
printHTMheader

## DATA SOURCING ##
valPrint t "Startup:"
valPrint r "\b1 Startup \b0"
valPrint hn "<h3>Startup</h3>"

# Attempt to download file at LINKS_URL, then check that it succeeded. The file is saved in the
# session's output folder under the name that follows the last slash of the URL. The URL is quoted
# when handed to curl so that characters such as '?' and '*' in it are never treated as a glob.
valPrint cwtrhn "Downloading list of external links from $LINKS_URL..."
LINKS_FILE_NAME=${LINKS_URL##*/}
LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
curl --silent -o "$LINKS_FILE" "$LINKS_URL"
if [ ! -f "$LINKS_FILE" ]; then
   echo -e "\nThe download of $LINKS_URL appears to have failed. Aborting."
   wrapupAndExit
else
   valPrint ctrh " success."
fi

# Attempt to download file at EXCEPT_URL, then check that it succeeded
if [ -n "$EXCEPT_URL" ]; then
   valPrint cwtrhn "Downloading list of reporting exceptions from $EXCEPT_URL..."
   EXCEPT_DATA=$(curl --silent "$EXCEPT_URL")
   if [ -z "$EXCEPT_DATA" ]; then
      echo -e "\nThe download of the exceptions data from '$EXCEPT_URL' appears to have failed. Aborting."
      wrapupAndExit
   else
      valPrint ctrh " success."
   fi

   # Keep only the text between the "BEGIN LIST" and "END LIST" markers of the page
   EXCEPT_DATA=${EXCEPT_DATA%END LIST*}
   EXCEPT_DATA=${EXCEPT_DATA#*BEGIN LIST}
   EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"

   # Store on disk for debugging purposes
   echo "$EXCEPT_DATA" > "$EXCEPT_FILE"

   # Transfer to array for easy searching later, one non-empty line per element. The lines are read
   # one at a time rather than produced by an unquoted expansion, because exception lines can contain
   # '*' and '?', which an unquoted expansion would try to match against file names.
   declare -a EXCEPT_ARRAY=()
   while IFS= read -r EXCEPT_ENTRY; do
      if [ -n "$EXCEPT_ENTRY" ]; then
         EXCEPT_ARRAY+=("$EXCEPT_ENTRY")
      fi
   done <<< "$EXCEPT_DATA"

   # Create parallel arrays for marking which exceptions get used later
   declare -a EXCEPT_USED=()
   declare -a EXCEPT_FOUND=()
   for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
      EXCEPT_USED+=(0)
      EXCEPT_FOUND+=(0)
   done
fi

# Count the lines of LINKS_FILE. The file is fed to 'wc' through a redirect because naming the file as
# an argument would make 'wc' print the file name after the line count
LINK_COUNT_STRING=$(wc -l < "$LINKS_FILE")

# Number of URLs is number of lines minus one (first line is column header row for the CSV); any
# whitespace that 'wc' put around the number is removed first
LINK_COUNT=${LINK_COUNT_STRING//[[:space:]]/}
LINK_COUNT=$((LINK_COUNT - 1))
valPrint ctrh "Found $LINK_COUNT links to process."
valPrint trh ""

## CONFIG OUTPUT ##
# Echo the settings in effect for this run to the console and the reports
valPrint t "Config:"
valPrint r "\b1 Config \b0"
valPrint hn "<h3>Config</h3>"

# Number of links in the requested range, along with the range itself when it is not the full list
valPrint ctrhn "Links to consider: "
if (( URL_LIMIT != 0 && URL_LIMIT < LINK_COUNT )); then
   valPrint ctrh "$((URL_LIMIT - URL_START + 1)), from link $URL_START to $URL_LIMIT"
elif (( URL_START != 1 )); then
   valPrint ctrh "$((LINK_COUNT - URL_START + 1)), from link $URL_START to $LINK_COUNT"
else
   valPrint ctrh "$LINK_COUNT"
fi

valPrint ctrh "Site query timeout: $TIMEOUT seconds"

valPrint ctrhn "Show OK links: "
if (( RECORD_OK_LINKS == 1 )); then
   valPrint ctrh "Yes"
else
   valPrint ctrh "No"
fi

valPrint ctrhn "Treat these response codes as OK: "
if (( ONLY_200_OK == 1 )); then
   valPrint ctrh "200"
else
   valPrint ctrh "${OK_CODES[*]}"
fi

valPrint ctrhn "Take screenshots: "
if (( TAKE_PAGE_SHOT == 1 )); then
   valPrint ctrh "Yes"
else
   valPrint ctrh "No"
fi

valPrint ctrhn "Suggest archive.org snapshots for NG pages: "
if (( SUGGEST_SNAPSHOTS_NG == 1 )); then
   valPrint ctrh "Yes"
else
   valPrint ctrh "No"
fi

valPrint ctrhn "Suggest archive.org snapshots for OK pages: "
if (( SUGGEST_SNAPSHOTS_OK == 1 )); then
   valPrint ctrh "Yes"
else
   valPrint ctrh "No"
fi

# The next three settings are phrased as "show", but reported as "ignore", hence the inverted answers
valPrint ctrhn "Ignore slash-adding redirects: "
if (( SHOW_SLASH != 1 )); then
   valPrint ctrh "Yes"
else
   valPrint ctrh "No"
fi

valPrint ctrhn "Ignore HTTPS-upgrading redirects: "
if (( SHOW_HTTPS != 1 )); then
   valPrint ctrh "Yes"
else
   valPrint ctrh "No"
fi

valPrint ctrhn "Ignore youtu.be redirects: "
if (( SHOW_YT_RD != 1 )); then
   valPrint ctrh "Yes"
else
   valPrint ctrh "No"
fi

valPrint ctrhn "Check archive.org and archive.is links: "
if (( CHECK_ARCHIVE_LINKS == 1 )); then
   valPrint ctrh "Yes"
else
   valPrint ctrh "No"
fi

# Point the reader at the summary; only the HTML report can link to it
valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
valPrint trh ""

## LEGEND OUTPUT ##
# Explain the status codes and annotations that appear in the link listing
valPrint t "Legend:"
valPrint r "\b1 Legend \b0"
valPrint hn "<h3>Legend</h3>"

# Only mention the exceptions list when an exceptions URL was actually supplied; otherwise the
# reports would contain a sentence pointing at an empty link
if [ -n "$EXCEPT_URL" ]; then
   valPrint t "(For guidance in fixing these links, see $WIKI_MAIN. The exceptions list is at $EXCEPT_URL.)"
   valPrint r "(For guidance in fixing these links, see {\field{\*\fldinst{HYPERLINK \"$WIKI_MAIN\"}}{\fldrslt here}}. The exceptions list is {\field{\*\fldinst{HYPERLINK \"$EXCEPT_URL\"}}{\fldrslt here}}.)"
   valPrint h "(For guidance in fixing these links, see <a href=\"$WIKI_MAIN\" target=\"_blank\">here</a>. The exceptions list is <a href=\"$EXCEPT_URL\" target=\"_blank\">here</a>.)"
else
   valPrint t "(For guidance in fixing these links, see $WIKI_MAIN.)"
   valPrint r "(For guidance in fixing these links, see {\field{\*\fldinst{HYPERLINK \"$WIKI_MAIN\"}}{\fldrslt here}}.)"
   valPrint h "(For guidance in fixing these links, see <a href=\"$WIKI_MAIN\" target=\"_blank\">here</a>.)"
fi
valPrint trh "OK = URL seems to be working"
valPrint trh "NG = URL no longer seems to work"
valPrint trh "RD = URL is redirecting to this new URL"
valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup"
valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup"
valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $WIKI_HTTP)"
valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$WIKI_HTTP\"}}{\fldrslt here}} for code reference)"
valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$WIKI_HTTP\" target=\"_blank\">here</a> for code reference)"
valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $WIKI_CURL)"
valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$WIKI_CURL\"}}{\fldrslt here}} for code reference)"
valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$WIKI_CURL\" target=\"_blank\">here</a> for code reference)"
valPrint trh "IA suggests = Last available snapshot returned by the Internet Archive"
valPrint trh "Try browsing = The Archive failed to return a snapshot URL, so check for a snapshot manually using this link"
valPrint trh ""


### MAIN LOOP ###
valPrint t "Links:"
valPrint r "\b1 Links \b0"
valPrint hn "<h3>Links</h3>"
# Remember when the run began (seconds since the epoch)
START_RUN=$(date +%s)
# Process each line of the .csv in LINKS_FILE
for LINE in `cat "$LINKS_FILE"`; do
   # Note when work on this line began (used further down to pace Internet Archive queries) and
   # advance the running line counter
   START_LINK=$(date +%s)
   (( LINK_NUM += 1 ))

   # First line is the column header row for the CSV, so let's verify that the format hasn't changed.
   # LINE is quoted so that an unexpected header containing glob characters cannot be expanded by the
   # shell and break the comparison.
   if (( SKIPPED_HEADER_ROW == 0 )); then
      if [[ "$LINE" == "namespace,title,target" ]]; then
         SKIPPED_HEADER_ROW=1
         LINK_NUM=0 # this line is not a link, so reset the link counter
         valPrint hn "<table>"
         continue
      else
         valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
         wrapupAndExit
      fi
   fi

   # Skip this link if we are not at URL_START yet
   if (( LINK_NUM < URL_START )); then
      continue
   fi

   # Stop if we are at the limit declared for testing purposes
   if (( URL_LIMIT > 0 && LINK_NUM > URL_LIMIT )); then
      FINISHED_LIST="limit"
      wrapupAndExit
   fi
   
   # Parse line into namespace ID number, containing wiki page, and external link URL. The pieces
   # already extracted are quoted when used as prefixes to strip, so that glob characters in a page
   # name (e.g. '?' or '*') are matched literally rather than as patterns.
   NS_ID=${LINE%%,*}
   PAGE_NAME=${LINE#"$NS_ID",}
   PAGE_NAME=${PAGE_NAME%%,*} # a comma in the page name will break this
   URL=${LINE#"$NS_ID","$PAGE_NAME",} # commas can be in this
   if [ -z "$NS_ID" ] || [ -z "$PAGE_NAME" ] || [ -z "$URL" ]; then
      valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace, wiki page or link URL could not be read."
      (( SKIP_PARSE_FAIL += 1 ))
      continue
   fi

   # Skip any link that isn't "http://" or "https://". The pattern spells out the full scheme; a
   # looser pattern such as "^http*" would accept any string that merely starts with "htt".
   if [[ ! $URL =~ ^https?:// ]]; then
      valPrint trs "Skipping line $LINK_NUM ('$LINE') because the protocol isn't 'http://' or 'https://'."
      (( SKIP_UNK_PROT += 1 ))
      continue
   fi

   # Show progress on the console, overwriting the message left by the previous link
   if (( LINK_NUM > 1 )); then
      printf "\e[1A\n" # erase previous progress message so that new one appears in its place
   fi
   valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."

   # Translate the namespace number into its name by walking NS_IDS and NS_NAMES in parallel; the
   # walk ends at the first empty slot in NS_IDS. A "NULL" namespace is never looked up.
   NS_NAME=""
   a=0
   if [[ $NS_ID != "NULL" ]]; then
      while [ -n "${NS_IDS[$a]}" ]; do
         if [ "$NS_ID" -eq "${NS_IDS[$a]}" ]; then
            NS_NAME="${NS_NAMES[$a]}"
            break
         fi
         (( a += 1 ))
      done
   fi

   # No name means we cannot build the page's URL, so report why and move on to the next line
   if [[ -z "$NS_NAME" ]]; then
      if [[ $NS_ID != "NULL" ]]; then
         valPrint trs "Skipping line $LINK_NUM ('$LINE') because I could not find a name for namespace ID $NS_ID."
      else
         valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace (and probably the page too) is 'NULL'. Probably the link is no longer in existence on the wiki."
      fi
      (( SKIP_UNK_NS += 1 ))
      (( PAGE_LINKS += 1 ))
      continue
   fi

   # Build longer wiki page URLs from namespace and page names. Namespace "Main:" (ID 0) cannot be a
   # part of the path; it's an implicit namespace, and naming it explicitly breaks the link.
   if [ "$NS_ID" -eq 0 ]; then
      FULL_PAGE_PATH="https://$WIKI_PATH/$PAGE_NAME"
      LOCAL_PAGE_PATH="$PAGE_NAME"
   else
      FULL_PAGE_PATH="https://$WIKI_PATH/$NS_NAME:$PAGE_NAME"
      LOCAL_PAGE_PATH="$NS_NAME:$PAGE_NAME"
   fi

   # We don't want to consider wiki pages ending in .js, as the MW parser cannot reliably isolate URLS
   # in JavaScript code, so it returns erroneous links. The page name itself is tested for a ".js"
   # ending: comparing only the text after the last period would also flag a page named just "js",
   # and would produce an empty operand for a page name that ends in a period.
   PAGE_NAME_SUFFIX=${PAGE_NAME##*.}
   if [[ "$PAGE_NAME" == *.js ]]; then
      valPrint trs "Skipping URL '$URL' on line $LINK_NUM because it was found on JavaScript page '$LOCAL_PAGE_PATH'."
      (( SKIP_JS_PAGE += 1 ))
      (( PAGE_LINKS += 1 ))
      continue
   fi

   # Reject any URL that contains one of the characters listed in ILLEGAL_CHARS
   if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
      valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because it contains characters illegal in a URL."
      (( SKIP_BAD_URL += 1 ))
      (( PAGE_LINKS += 1 ))
      continue
   fi

   # When archive checking is off, pass over links to web.archive.org, archive.is and archive.ph
   if (( CHECK_ARCHIVE_LINKS == 0 )); then
      if [[ $URL == *web.archive.org* || $URL == *archive.is* || $URL == *archive.ph* ]]; then
         valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to check archive links."
         (( SKIP_ARCHIVES += 1 ))
         (( PAGE_LINKS += 1 ))
         continue
      fi
   fi

   # Now we need to know if the URL is for a file or a web page. First step is to determine if the
   # URL ends in a suffix
   HAS_SUFFIX=0

   # If the URL ends in a query string like ".php?foo=bar", strip everything from the '?' onward
   CLEAN_URL=${URL%%\?*}

   # If the URL ends in an anchor link like "#section_15", strip everything from the '#' onward
   CLEAN_URL=${CLEAN_URL%%\#*}

   # Non-ASCII URLs are not analyzed; skip them and make the reader check them
   if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
      valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I cannot handle non-ASCII characters."
      (( SKIP_NON_ASCII += 1 ))
      (( PAGE_LINKS += 1 ))
      continue
   fi

   # Isolate the characters after the last period and after the last slash. Shell parameter
   # expansion is used so that no external process is started for each link and the URL text is
   # never re-interpreted by another tool (a missing '.' or '/' leaves the whole string in place).
   POST_DOT=${CLEAN_URL##*.}
   POST_SLASH=${CLEAN_URL##*/}

   # If the last period comes after the last slash, then the URL ends in a suffix
   POST_DOT_LENGTH=${#POST_DOT}
   POST_SLASH_LENGTH=${#POST_SLASH}
   if (( POST_DOT_LENGTH < POST_SLASH_LENGTH )); then
      HAS_SUFFIX=1
   else
      HAS_SUFFIX=0
   fi

   # Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
   # known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES. IS_FILE ends up as 1 (file),
   # 0 (page) or -1 (suffix not recognized).
   IS_FILE=-1
   if [ "$HAS_SUFFIX" -eq 0 ]; then
      IS_FILE=0
   else
      # Turn off case sensitivity while we compare suffixes
      shopt -s nocasematch

      # Special case: URLs ending in something like "productID.304297400" are pages, not files, so if
      # the URL's suffix is all numbers, we are looking at the end of a web page URL
      if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
         IS_FILE=0
      # Special case: URLs ending in a parens, e.g. "ms537113(v=vs.85)", are pages
      elif [[ $POST_DOT =~ ^.*[\(\)]$ ]]; then
         IS_FILE=0
      # Special case: URLs containing a '%', e.g. "10.1007%2FBF00329055", are pages
      elif [[ $POST_DOT == *%* ]]; then
         IS_FILE=0
      fi

      # If we did not identify this URL as a web page above, we need to compare the suffix against known
      # file extensions. The suffix is quoted so it is compared as plain text (a suffix such as "*"
      # would otherwise act as a wildcard); double brackets still respect the nocasematch setting.
      if (( IS_FILE == -1 )); then
         for EXTENSION in "${HTTP_FILES[@]}"; do
            if [[ $EXTENSION == "$POST_DOT" ]]; then
               IS_FILE=1
               break
            fi
         done
      fi

      # If we did not identify this URL as a file above, we need to compare the suffix against known
      # pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
      # needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
      if (( IS_FILE == -1 )); then
         for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
            if [[ $PAGE_OR_TLD == "$POST_DOT" ]]; then
               IS_FILE=0
               break
            fi
         done
      fi

      # Turn case sensitivity back on in Bash
      shopt -u nocasematch
   fi

   # If this suffix escaped identification as either a file, page or TLD, inform the reader;
   # otherwise label the link and count it under the matching total
   STR_TYPE=""
   if (( IS_FILE == -1 )); then
      valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I encountered the unknown URL suffix '$POST_DOT'. Please add this suffix to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
      (( SKIP_UNK_SUFFIX += 1 ))
      continue
   elif (( IS_FILE == 1 )); then
      STR_TYPE="file"
      (( FILE_LINKS += 1 ))
   else
      STR_TYPE="page"
      (( PAGE_LINKS += 1 ))
   fi

   # Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
   # issue with sites that require HTTPS. AGENT must be in double quotes so that its value is sent
   # as the user agent (single quotes would send the literal text "$AGENT"), and the URL is quoted
   # so the shell never treats characters such as '?' or '*' in it as a file name pattern.
   CURL_CODE=$(curl -o /dev/null --silent --insecure --compressed --head --user-agent "$AGENT" --max-time "$TIMEOUT" --retry 2 --write-out '%{http_code}\n' "$URL")
   CURL_ERR=$? # exit status of the 'curl' call above
   CURL_RESULT=$CURL_CODE

   # Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
   if [ "$CURL_CODE" == "000" ]; then
      CURL_RESULT="$CURL_RESULT-$CURL_ERR"
   fi

   # Begin to determine our status code for this URL (EI, IW, OK, RD, or NG); "??" means that the
   # status has not been decided yet
   STATUS="??"
   NEW_URL=""
   INTERWIKI_INDEX=-1

   # First make sure that this isn't an "external internal" link to our own wiki that can be replaced
   # by "[[page_name]]". If it uses a special access URL beginning with "/w/", let it pass, as it
   # probably cannot be replaced by "[[ ]]" markup
   if [[ $URL == *$WIKI_PATH* ]] && [[ $URL != *$WIKI_PATH/w/* ]]; then
      STATUS="EI"
      (( EI_LINKS += 1 ))
   fi

   # If it's not, check if this is a link to a domain that we have an interwiki prefix for (also make
   # sure that it's not an archive.org link to a page from an interwiki domain). STATUS is quoted
   # because its undecided value "??" is also a file name pattern: left unquoted, the shell would
   # replace it with any two-character file names in the current directory and break the test.
   if [ "$STATUS" == "??" ] && [[ $URL != *web.archive.org* ]]; then
      for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
         if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]] && [[ $URL != *${INTERWIKI_DOMAINS[$i]}/w/* ]]; then
            STATUS="IW"
            (( IW_LINKS += 1 ))
            INTERWIKI_INDEX=$i
            break
         fi
      done
   fi

   # If we didn't match an interwiki domain, see if the status code is in our "OK" codes list.
   # STATUS is quoted because its undecided value "??" would otherwise be treated by the shell as a
   # file name pattern.
   if [ "$STATUS" == "??" ]; then
      for CODE in "${OK_CODES[@]}"; do
         # When only 200 counts as OK, ignore every other entry in OK_CODES
         if [ "$ONLY_200_OK" -eq 1 ] && [ "$CODE" -ne 200 ]; then
            continue
         fi

         if [[ $CODE == "$CURL_CODE" ]]; then
            STATUS="OK"
            (( OK_LINKS += 1 ))

            # If this is a YouTube link, we have to look at the actual page source to know if the video
            # is good or not; override the link's info if it's actually NG. Also see RD_CODES section
            # below for duplicative code.
            if [[ $URL == *www.youtube.com* ]]; then
               PAGE_TEXT=$(curl --silent --insecure --user-agent "$AGENT" --max-time "$TIMEOUT" "$URL")
               CURL_ERR=$?
               if [ "$CURL_ERR" != "0" ]; then
                  STATUS="NG"
                  CURL_RESULT="000-$CURL_ERR"
                  (( OK_LINKS -= 1 ))
                  (( NG_LINKS += 1 ))
               # Either marker means the video is gone. The two tests must be joined with a logical
               # OR; joining them with a pipe would discard the result of the first test.
               elif [[ "$PAGE_TEXT" =~ "simpleText\":\"Video unavailable" ]] || [[ "$PAGE_TEXT" =~ "simpleText\":\"Private video" ]]; then
                  STATUS="NG"
                  CURL_CODE="404"
                  CURL_RESULT=$CURL_CODE
                  (( OK_LINKS -= 1 ))
                  (( NG_LINKS += 1 ))
               fi
            fi

            # If this is a OneDrive link, we have to look at the actual page source to know if the file
            # is really still at this URL; override the link's info if it's actually NG or RD
            if [[ $URL == *skydrive.live.com* ]]; then
               PAGE_TEXT=$(curl --silent --insecure --user-agent "$AGENT" --max-time "$TIMEOUT" "$URL")
               CURL_ERR=$?
               if [ "$CURL_ERR" != "0" ]; then
                  STATUS="NG"
                  CURL_RESULT="000-$CURL_ERR"
                  (( OK_LINKS -= 1 ))
                  (( NG_LINKS += 1 ))
               elif [[ "$PAGE_TEXT" =~ "<h1>Sorry, something went wrong" ]]; then
                  STATUS="NG"
                  CURL_CODE="404"
                  CURL_RESULT=$CURL_CODE
                  (( OK_LINKS -= 1 ))
                  (( NG_LINKS += 1 ))
               elif [[ "$PAGE_TEXT" =~ "<h2>Object moved to" ]]; then
                  STATUS="??" # have to send the code through the next block to treat the redirect properly
                  CURL_CODE="301"
                  CURL_RESULT=$CURL_CODE
                  (( OK_LINKS -= 1 ))
               fi
            fi

            break
         fi
      done
   fi

   # If we didn't get a match with the "OK" codes, check it against the "RD" codes. STATUS is quoted
   # because its undecided value "??" would otherwise be treated by the shell as a file name pattern.
   if [ "$STATUS" == "??" ]; then
      for CODE in "${RD_CODES[@]}"; do
         if [[ $CODE == "$CURL_CODE" ]]; then
            # Get URL header again in order to retrieve the URL we are being redirected to, but if this
            # is a OneDrive link, we already have the new URL in $PAGE_TEXT
            if [[ $URL == *skydrive.live.com* ]]; then
               NEW_URL=${PAGE_TEXT##*href=\"}
               NEW_URL=${NEW_URL%\">here*}
            else
               NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time "$TIMEOUT" --write-out '%{redirect_url}\n' "$URL")
            fi

            # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
            # those changes out if the user didn't ask for them
            URL_HTTP=${URL/#https:/http:}
            NEW_URL_HTTP=${NEW_URL/#https:/http:}

            # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
            NEW_URL_LENGTH=${#NEW_URL_HTTP}
            if (( NEW_URL_LENGTH < MIN_URL_LENGTH )); then
               NEW_URL_HTTP="[new URL not retrieved]"
            fi

            # Remove slash at end of new URL, if present, so we can filter out the redirects that
            # merely add an ending slash if the user didn't ask for them
            NEW_URL_NO_SLASH=${NEW_URL_HTTP%/}

            # Detect if this is a youtu.be link simply being expanded by YouTube to the full
            # youtube.com address
            YOUTU_BE=0
            if [[ $URL_HTTP == http://youtu.be* ]] && [[ $NEW_URL_HTTP == http://www.youtube.com* ]]; then
               YOUTU_BE=1
            fi

            # If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
            # wants those to be reported). Both sides are quoted: the URLs, and the bracketed
            # placeholder used when no new URL was retrieved, contain characters that the shell would
            # otherwise try to expand as file name patterns.
            if [ "$SHOW_HTTPS" -eq 0 ] && [ "$URL_HTTP" == "$NEW_URL_HTTP" ]; then
               valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show http->https upgrades, and I was redirected to '$NEW_URL'."
               STATUS="OK"
               (( OK_LINKS += 1 ))
               (( SKIP_HTTPS_UP += 1 ))
            # If the URLs match besides an added ending slash, then the link is OK (unless user wants
            # those to be reported)
            elif [ "$SHOW_SLASH" -eq 0 ] && [ "$URL_HTTP" == "$NEW_URL_NO_SLASH" ]; then
               valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show added trailing slashes, and I was redirected to '$NEW_URL'."
               STATUS="OK"
               (( OK_LINKS += 1 ))
               (( SKIP_SLASH_ADD += 1 ))
            elif (( YOUTU_BE == 1 )); then
               # We have to look at the actual page source to know if a YouTube video is good or not.
               # Also see OK_CODES section above for duplicative code.
               PAGE_TEXT=$(curl --silent --insecure --user-agent "$AGENT" --max-time "$TIMEOUT" "$NEW_URL" | grep -e "simpleText\":\"Video unavailable" -e "simpleText\":\"Private video")
               if [ -n "$PAGE_TEXT" ]; then
                  STATUS="NG"
                  (( NG_LINKS += 1 ))
               else
                  if [ "$SHOW_YT_RD" -eq 0 ]; then
                     valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'."
                     STATUS="OK"
                     (( OK_LINKS += 1 ))
                     (( SKIP_YOUTU_BE += 1 ))
                  else
                     STATUS="RD"
                     (( RD_LINKS += 1 ))
                  fi
               fi
            else
               STATUS="RD"
               (( RD_LINKS += 1 ))
            fi
            break
         fi
      done
   fi

   # If we didn't get a match with the "RD" codes, check it against the "NG" codes. STATUS is quoted
   # in every test here because its undecided value "??" is also a file name pattern; unquoted, the
   # shell would replace it with any two-character file names found in the current directory.
   if [ "$STATUS" == "??" ]; then
      for CODE in "${NG_CODES[@]}"; do
         if [[ $CODE == "$CURL_CODE" ]]; then
            STATUS="NG"
            (( NG_LINKS += 1 ))
            break
         fi
      done
      # Also check it against the "OK" codes besides 200 if the --only-200-ok argument was received
      if [ "$ONLY_200_OK" -eq 1 ]; then
         for CODE in "${OK_CODES[@]}"; do
            if [ "$CODE" -eq 200 ]; then
               continue
            fi
            if [[ $CODE == "$CURL_CODE" ]]; then
               STATUS="NG"
               (( NG_LINKS += 1 ))
               break
            fi
         done
      fi
   fi

   # If we didn't match a known status code, advise the reader
   if [ "$STATUS" == "??" ]; then
      valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I encountered the unknown response code $CURL_CODE."
      (( SKIP_UNK_CODE += 1 ))
      continue
   fi

   # Check problem links against exceptions list before proceeding. This only happens when an
   # exceptions URL was supplied (EXCEPT_URL is non-empty).
   FOUND_EXCEPT=0
   if [ "$STATUS" != "OK" ] && [ -n "$EXCEPT_URL" ]; then
      # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
      EXPECT_CODE="$CURL_RESULT"
      if [ "$STATUS" == "EI" ] || [ "$STATUS" == "IW" ]; then
         EXPECT_CODE="$STATUS"
      fi

      # Look for link in exceptions list and make sure the listed result code and wiki page also match.
      # Each exception line is read as "code,URL,page".
      for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
         EXCEPT_LINE="${EXCEPT_ARRAY[$i]}"

         # Undo any HTML-encoding from the wiki page; for now we just worry about the ampersand, as most
         # other HTML-encoded characters are not found in URLs
         EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/\&amp;/\&/g')

         # Check for URL match. The exception's URL is kept in its own variable; storing it in
         # EXCEPT_URL would overwrite the setting that holds the location of the exceptions list.
         EXCEPT_ENTRY_URL="${EXCEPT_LINE#*,}"
         EXCEPT_ENTRY_URL="${EXCEPT_ENTRY_URL%,*}"
         if [[ "$EXCEPT_ENTRY_URL" =~ \* ]]; then # if this exception URL contains the '*' wildcard, use pattern-matching with it
            if [[ ! "$URL" == $EXCEPT_ENTRY_URL ]]; then
               continue
            fi
         else
            if [ "$EXCEPT_ENTRY_URL" != "$URL" ]; then # otherwise just use a straight string comparison
               continue
            fi
         fi

         # Check for page name match; a page of "*" in the exception matches any page
         EXCEPT_PAGE="${EXCEPT_LINE##*,}"
         EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
         if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == "$LOCAL_PAGE_PATH" ]; then
            (( EXCEPT_FOUND[i] += 1 ))
            valPrint trs "Found exception '$URL' on page '$LOCAL_PAGE_PATH'."

            # Check for result code match
            EXCEPT_CODE=${EXCEPT_LINE%%,*}
            if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
               FOUND_EXCEPT=1
               (( EXCEPT_USED[i] += 1 ))
               valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because its expected result, '$EXPECT_CODE', is in the exceptions list."

               # Count the skip under the total that matches this link's status
               if [ "$STATUS" == "EI" ]; then
                  (( SKIP_EXPECT_EI += 1 ))
               elif [ "$STATUS" == "IW" ]; then
                  (( SKIP_EXPECT_IW += 1 ))
               elif [ "$STATUS" == "RD" ]; then
                  (( SKIP_EXPECT_RD += 1 ))
               else
                  (( SKIP_EXPECT_NG += 1 ))
               fi

               break
            fi
         fi
      done
   fi
   if (( FOUND_EXCEPT == 1 )); then
      continue
   fi

   # If appropriate, record this link to the log, with clickable URLs when possible. Links that are
   # not "OK" are always recorded; "OK" links only when RECORD_OK_LINKS is 1.
   if [ "$STATUS" != "OK" ] || [ "$RECORD_OK_LINKS" -eq 1 ]; then
      # Prepare 'curl' result in parentheses to print after status code, unless this is an "IW" or "EI"
      # link, in which case showing the status code doesn't make sense. Adjust spacing after string to
      # ensure TXT and RTF reports have aligned columns of results.
      CURL_STR_H=" ($CURL_RESULT)"
      CURL_STR_T="$CURL_STR_H"
      CURL_STR_R="$CURL_STR_H	"
      if [ "$STATUS" == "IW" ] || [ "$STATUS" == "EI" ]; then
         CURL_STR_H=""
         CURL_STR_T="      "
         CURL_STR_R="			"
      fi

      # Record link and its wiki page in TXT, RTF, and HTML markup
      valPrint t "${STATUS}${CURL_STR_T} $STR_TYPE $URL"
      valPrint t "  linked from $FULL_PAGE_PATH"
      valPrint r "${STATUS}${CURL_STR_R}${RTF_TABS}$STR_TYPE	{\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
      valPrint r "		linked from	{\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
      valPrint hn "<tr><td style=\"white-space:nowrap\">${STATUS}${CURL_STR_H}</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
      valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"

      # Place vertical space here since we won't be printing anything more about this link
      if [ "$STATUS" == "OK" ] && [ "$SUGGEST_SNAPSHOTS_OK" -eq 0 ]; then
         valPrint tr ""
         valPrint hs ""
      fi

      # Record redirect URL if one was given by a 3xx response page
      if [ "$STATUS" == "RD" ]; then
         valPrint ts "  Server suggests $NEW_URL"
         valPrint rs "	Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
         valPrint hs "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
      fi

      # Notify reader if we can use an intrawiki link for this URL
      if [ "$STATUS" == "EI" ]; then
         # Everything after the host name's slash is taken as the page to link to
         INTRA_PAGE=${URL#*://*/}
         # If INTRA_PAGE starts with Category:, File: or Image:, prefix it with a ':' to make it a wikilink
         if [[ $INTRA_PAGE == Category:* ]] || [[ $INTRA_PAGE == File:* ]] || [[ $INTRA_PAGE == Image:* ]]; then
            INTRA_PAGE=:${INTRA_PAGE}
         fi
         valPrint ts "  Just use [[$INTRA_PAGE]]"
         valPrint rs "		Just use [[$INTRA_PAGE]]"
         valPrint hs "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
      fi

      # Notify reader if we can use an interwiki prefix for this URL
      if [ "$STATUS" == "IW" ]; then
         # The page name is whatever follows the last slash of the URL
         INTER_PAGE=${URL##*/}
         valPrint ts "  You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
         valPrint rs "		You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
         valPrint hs "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
      fi

      # Query Internet Archive for latest "OK" snapshot for "NG" page (or for an "OK" page, if the
      # user asked for that)
      if [[ ( $STATUS == "NG" && $SUGGEST_SNAPSHOTS_NG -eq 1 ) || ( $STATUS == "OK" && $SUGGEST_SNAPSHOTS_OK -eq 1 ) ]]; then

         # We need to watch out for the rate limit or we'll get locked out; look at how much time has
         # elapsed and then wait the remainder between that and how long of a wait we think is needed
         # to avoid the dreaded "Too Many Requests" response. 5 seconds is just a guess.
         CUR_TIME=$(date +%s)
         WAIT_REMAINDER=$((5 - CUR_TIME + START_LINK))
         if (( WAIT_REMAINDER > 0 )); then
            valPrint t "Waiting $WAIT_REMAINDER second(s) to conform to Archive.org rate limit."
            sleep "$WAIT_REMAINDER"
         fi

         # Issue query to the API
         ARCHIVE_QUERY=$(curl --silent --max-time "$TIMEOUT" "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")

         # Notify reader if we hit the rate limit and just keep going. The asterisks must stay outside
         # the quotes to act as wildcards; inside the quotes they are literal characters and the
         # comparison can never succeed.
         if [[ "$ARCHIVE_QUERY" == *"Too Many Requests"* ]]; then
            valPrint t "  IA has rate-limited us!"
            valPrint r "		IA has rate-limited us!"
            valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td>(hit the API rate limit!)</td></tr>"
         # If a "closest" snapshot was received, inform reader
         elif [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
            # In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
            ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')

            # ...isolate "url" property in the response that follows the "closest" tag
            SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
            SNAPSHOT_URL=${SNAPSHOT_URL#*\"url\": \"} # everything after '"url": "'
            SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'

            # Remove the port 80 part that IA often adds to the URL, as it's superfluous (only the
            # first ":80" found is removed)
            SNAPSHOT_URL=${SNAPSHOT_URL/:80/}

            # Inform the reader of the snapshot URL
            valPrint ts "  IA suggests $SNAPSHOT_URL"
            valPrint rs "		IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
            valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
         else # Otherwise give a generic Wayback Machine link for this URL, which might work
            valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
            valPrint rs "		Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
            valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
         fi
      fi
   fi
   
   # If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
   if [ "$IS_FILE" -eq 0 ] && [ "$TAKE_PAGE_SHOT" -eq 1 ] && [ "$STATUS" == "OK" ]; then
      # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
      SHOT_NAME=$(echo "$URL" | sed -e 's/https*\:\/\///' -e 'y/:\//__/')
      SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"

      # Don't take screenshot if we already encountered this page and screenshotted it
      if [ ! -f "$SHOT_FILE" ]; then
         # The URL is quoted so that it reaches Chrome exactly as written; unquoted, characters such
         # as '?' or '*' in it could be expanded by the shell into file names
         "$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 "$URL" > /dev/null 2>&1
         # The screenshot is looked for under WORKING_DIR and moved to its final name; -n keeps 'mv'
         # from overwriting an existing file
         if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
            mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
         else
            valPrint trhs "Screenshot of URL $URL seems to have failed!"
         fi
      else
         valPrint trhs "Skipping screenshot of URL '$URL' because file '$SHOT_FILE' already exists."
      fi
   fi
done
# Reaching this point means the loop above ran through every line of the links file without
# stopping early (the URL_LIMIT path sets FINISHED_LIST to "limit" instead); record that and hand
# control to wrapupAndExit
FINISHED_LIST="yes"
wrapupAndExit