source: Validate External Links/validate_external_links.sh@ 1182

Last change on this file since 1182 was 1182, checked in by iritscen, 19 months ago

ValExtLinks: Added special code for checking OneDrive file links. Added some safety code around 'curl' usage on YT and OneDrive link checks. Fix logic error with handling the wildcard in a URL.

File size: 59.5 KB
RevLine 
#!/bin/bash

# Validate External Links by Iritscen (iritscen@yahoo.com)
#
# Validates a list of external links in CSV format. The resulting logs are produced in three formats:
# - TXT (for easy diffing with an earlier log)
# - RTF (for reading as a local file with clickable links)
# - HTML (for reading as a web page)
# Call script with "--help" argument for documentation. Also see Read Me.rtf for critical notes.
#
# Recommended rule:
# |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|
#
# Table of contents (sections of script in order of appearance, not execution):
# • Globals
# • Help Output
# • Setup
# • Utility Functions
# • Summary Output
# • Initialization
# • Data Sourcing
# • Config Output
# • Legend Output
# • Main Loop

# Set separator token to newline only, so that unquoted expansions throughout this
# script split solely on newlines (several later expansions rely on this)
IFS="
"

### GLOBALS ###
# Settings -- these will be changed from their defaults by the arguments passed in to the script
LINKS_URL="" # download external link CSV from this location (can use "file://" protocol)
EXCEPT_URL="" # location of wiki page with a list of exceptions for NG results
OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES
SHOW_SLASH=0 # record issue when a slash is added to the end of a URL
SHOW_HTTPS=0 # record issue when "http" is upgraded to "https"
SHOW_YT_RD=0 # record redirection for a youtu.be URL expanding to the full URL
SUGGEST_SNAPSHOTS_NG=0 # query the Internet Archive for a possible snapshot URL for each NG page
SUGGEST_SNAPSHOTS_OK=0 # query the Internet Archive for an existing snapshot of each OK page
CHECK_ARCHIVE_LINKS=0 # check URLs on archive.org and archive.is
TAKE_PAGE_SHOT=0 # take a screenshot of each OK page
TIMEOUT=10 # time to wait for a response when querying a site
CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature
URL_START=1 # start at this URL in LINKS_FILE
URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE
UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report

# Fixed strings -- see the occurrences of these variables to learn their purpose
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
ARCHIVE_API="http://archive.org/wayback/available"
ARCHIVE_GENERIC="https://web.archive.org/web/*"
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
CHROME_SCREENSHOT="screenshot.png"
EXCEPT_FILE_NAME="exceptions.txt"
EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
WIKI_CURL="https://wiki.oni2.net/Validate_External_Links/Curl_codes"
WIKI_HTTP="https://wiki.oni2.net/Validate_External_Links/HTTP_codes"
WIKI_MAIN="https://wiki.oni2.net/Validate_External_Links"
WIKI_ME="http://iritscen.oni2.net"
THIS_DIR=$(cd $(dirname $0); pwd) # directory this script resides in
WORKING_DIR=$(pwd)
WIKI_PATH="wiki.oni2.net"

# These are parallel arrays of the IDs and names of OniGalore's current namespaces
declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")

# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png psd py rar tga TRMA txt vbs wav wmv xaf xcf xlsx xml zip)
declare -a HTTP_TLDS_AND_PAGES=(abstract action ars asp aspx cfm cgi com css de do full htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x)

# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
# are NG (no good). Pages that return OK codes will be screenshotted when screenshots are asked for.
# Remember to update http_codes.txt if you add a new code.
declare -a OK_CODES=(200 401 405 406 418 501)
declare -a RD_CODES=(301 302 303 307 308)
declare -a NG_CODES=(000 400 403 404 410 429 500 502 503 504 520 530)

# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
# transcluded text, and if the transclusion fails, then the braces show up in the URL
ILLEGAL_CHARS="{ }"

# The shortest URL possible, used for sanity-checking some URLs: http://a.co
MIN_URL_LENGTH=11

# These are parallel arrays giving the prefixes that can be used in place of normal external links to
# some wikis and other sites; based on https://wiki.oni2.net/Special:Interwiki
declare -a INTERWIKI_PREFIXES=(commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies wikiversity wikivoyage wikt wp)
declare -a INTERWIKI_DOMAINS=(commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org wikiversity.org wikivoyage.org wiktionary.org wikipedia.org)

# Variables for keeping track of main loop progress and findings
LINK_NUM=0
EI_LINKS=0
IW_LINKS=0
OK_LINKS=0
RD_LINKS=0
NG_LINKS=0
SKIP_PARSE_FAIL=0
SKIP_UNK_PROT=0
SKIP_UNK_NS=0
SKIP_JS_PAGE=0
SKIP_BAD_URL=0
SKIP_NON_ASCII=0
SKIP_UNK_SUFFIX=0
SKIP_UNK_CODE=0
SKIP_EXPECT_NG=0
SKIP_EXPECT_RD=0
SKIP_EXPECT_EI=0
SKIP_EXPECT_IW=0
SKIP_HTTPS_UP=0
SKIP_SLASH_ADD=0
SKIP_YOUTU_BE=0
SKIP_ARCHIVES=0
FILE_LINKS=0
PAGE_LINKS=0
SKIPPED_HEADER_ROW=0
FINISHED_LIST="no" # set to "yes" when the main loop completes; checked by wrapupAndExit
START_RUN=0
END_RUN=0
123
[1141]124### HELP OUTPUT ###
# A pseudo-man page. Here is the 80-character rule for the page text:
# 345678901234567890123456789012345678901234567890123456789012345678901234567890
function printHelp()
{
	# Quote the here-doc delimiter so the help text is emitted literally, with no
	# parameter or command expansion applied to its contents
	cat << 'EOF'

NAME
   Validate External Links

SYNOPSIS
   validate_external_links.sh --help
   validate_external_links.sh --links URL --output DIR [--exceptions URL]
      [--record-ok-links] [--show-added-slashes] [--show-https-upgrades]
      [--show-yt-redirects] [--suggest-snapshots-ng] [--suggest-snapshots-ok]
      [--check-archive-links] [--take-screenshots FILE] [--timeout NUM]
      [--start-url NUM] [--end-url NUM] [--upload FILE]

DESCRIPTION
   This script parses a list of external links found in the OniGalore wiki
   (which is dumped by the Oni2.net server periodically in a particular
   format), validates them using the Unix tool 'curl', and produces a report
   of which links were "OK" (responded positively to an HTTP query), which
   were "RD" (responded with a 3xx redirect code), which could be "IW"
   (interwiki) links, which are "EI" (external internal) links and could be
   intrawiki links, and which were "NG" (no good; a negative response to the
   query). This report can then be automatically uploaded to the location of
   your choice. The script can also suggest Internet Archive snapshots for
   "NG" links, and take screenshots of "OK" links for visual verification by
   the reader that the page in question is the one intended to be displayed.

   You must pass this script the URL at which the list of links is found
   (--links) and the path where the directory of logs should be outputted
   (--output). All other arguments are optional.

OPTIONS
   --help                  Show this page.
   --links URL             (required) URL from which to download the CSV
                           file with external links. Note that this URL can
                           be a local file if you supply a file:// path.
   --output DIR            (required) Unix path to directory in which Val
                           should place its reports.
   --exceptions URL        In order to remove links from the report which
                           Val finds an issue with but which you regard as
                           OK, list those desired exceptions on a wiki page.
                           See the sample file "exceptions.pdf" for the
                           required format of the page. Note that this URL
                           can point to a local file if you supply a path
                           beginning with "file://".
   --record-ok-links       Log a link in the report even if its response
                           code is "OK".
   --show-added-slashes    Report on redirects that simply add a '/' to the
                           end of the URL.
   --show-https-upgrades   Report on redirects that simply upgrade a
                           "http://" URL to a "https://" URL.
   --show-yt-redirects     Report on redirects that expand a youtu.be URL.
   --suggest-snapshots-ng  Query the Internet Archive for a possible
                           snapshot URL for each "NG" page.
   --suggest-snapshots-ok  Query the Internet Archive for a snapshot of each
                           "OK" page just to make sure it's available. Note
                           that this will add a tremendous amount of time to
                           the script execution because there is a rate
                           limit to the Archive API. Note that this option
                           does nothing unless you also use the
                           --record-ok-links argument.
   --check-archive-links   Check links that are already pointing to a page
                           on the Internet Archive or archive.is (AKA
                           archive.today). In theory these links should be
                           totally stable and not need validation.
   --take-screenshots FILE Call the Google Chrome binary at this path to
                           take screenshots of each "OK" page.
   --timeout NUM           Wait this many seconds for a site to respond. The
                           default is 10. Important note: Val will attempt
                           to reach each URL three times, so the time taken
                           to ping an unresponsive site will be three times
                           this setting.
   --start-url NUM         Start at this link in the links CSV file.
   --end-url NUM           Stop at this link in the links CSV file.
   --upload FILE           Upload report using the credentials and path
                           given in this local text file. See sftp_login.txt
                           for template.

BUGS
   The script cannot properly parse any line in the external links file
   which contains a comma in the name of the wiki page containing a link.
   Commas in the link itself are not an issue.
EOF
}
212
213
### SETUP ###
# If first argument is a help request, or if nothing was passed in at all, print help page and quit
if [ "$#" -eq 0 ] || [ "$1" == "--help" ]; then
	printHelp | less
	exit 0
fi

# Parse arguments as long as there are more arguments to process
while (( "$#" )); do
	case "$1" in
		--links )                LINKS_URL="$2";                     shift 2;;
		--exceptions )           EXCEPT_URL="$2";                    shift 2;;
		--output )               OUTPUT_DIR="$2";                    shift 2;;
		--record-ok-links )      RECORD_OK_LINKS=1;                  shift;;
		--show-added-slashes )   SHOW_SLASH=1;                       shift;;
		--show-https-upgrades )  SHOW_HTTPS=1;                       shift;;
		--show-yt-redirects )    SHOW_YT_RD=1;                       shift;;
		--suggest-snapshots-ng ) SUGGEST_SNAPSHOTS_NG=1;             shift;;
		--suggest-snapshots-ok ) SUGGEST_SNAPSHOTS_OK=1;             shift;;
		--check-archive-links )  CHECK_ARCHIVE_LINKS=1;              shift;;
		--take-screenshots )     TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
		--timeout )              TIMEOUT="$2";                       shift 2;;
		--start-url )            URL_START="$2";                     shift 2;;
		--end-url )              URL_LIMIT="$2";                     shift 2;;
		--upload )               UPLOAD_INFO="$2";                   shift 2;;
		* )                      echo "Invalid argument '$1' detected. Aborting."; exit 1;;
	esac
done

# If the required arguments were not supplied, print an error and quit
# (the variables are quoted so the '-z' tests behave predictably when empty)
if [ -z "$LINKS_URL" ] || [ -z "$OUTPUT_DIR" ]; then
	echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
	exit 2
fi

# If user wants screenshots, make sure path to Chrome was passed in and is valid
if [ $TAKE_PAGE_SHOT -eq 1 ]; then
	if [ ! -f "$CHROME_PATH" ]; then
		echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
		exit 3
	fi
fi

# Check that UPLOAD_INFO exists, if this argument was supplied
if [ ! -z "$UPLOAD_INFO" ] && [ ! -f "$UPLOAD_INFO" ]; then
	echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
	exit 4
fi

# Check that OUTPUT_DIR is a directory
if [ ! -d "$OUTPUT_DIR" ]; then
	echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
	exit 5
fi

# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
SHOT_PATH="$OUTPUT_PATH/Screenshots"
LOG_NAME="ValExtLinks report"
LOG_NAME_TXT="$LOG_NAME.txt"
LOG_NAME_RTF="$LOG_NAME.rtf"
LOG_NAME_HTM="$LOG_NAME.htm"
LOG_PATH="$OUTPUT_PATH/$LOG_NAME"
LOG_PATH_TXT="$LOG_PATH.txt"
LOG_PATH_RTF="$LOG_PATH.rtf"
LOG_PATH_HTM="$LOG_PATH.htm"
mkdir "$OUTPUT_PATH"
if [ $TAKE_PAGE_SHOT -eq 1 ]; then
	mkdir "$SHOT_PATH"
fi

# Check that 'mkdir' succeeded. The report folder is created inside OUTPUT_DIR,
# so name that directory in the message (the old text wrongly named OUTPUT_PATH,
# which is the very folder we failed to create)
if [ ! -d "$OUTPUT_PATH" ]; then
	echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_DIR. Aborting."
	exit 6
fi

# Get date on the file at LINKS_URL and print to log
LINKS_DATE=$(curl --silent --head "$LINKS_URL" | grep "Last-Modified")
if [ -z "$LINKS_DATE" ]; then
	echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
	exit 7
fi
LINKS_DATE=${LINKS_DATE#Last-Modified: }
301
302
303### UTILITY FUNCTIONS ###
# Writes the plain-text report banner (title, generation time, data timestamp,
# author contact) to the TXT log, followed by one blank separator line
function printTXTheader()
{
	local -a banner_lines=(
		"Validate External Links report"
		"generated $NICE_TIME"
		"from data of $LINKS_DATE"
		"script by Iritscen (contact: $WIKI_ME)"
		""
	)
	local banner_line
	for banner_line in "${banner_lines[@]}"; do
		valPrint t "$banner_line"
	done
}
313
# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified
function printRTFheader()
{
	# The single literal below is raw RTF markup: document preamble (font table,
	# color tables, margins and tab stops), a centered ('\qc') title block with
	# the generation time, data timestamp and a clickable HYPERLINK field for
	# contacting the author, then a final '\pard' that returns the paragraph
	# style to left-justified for the report body. NICE_TIME, LINKS_DATE and
	# WIKI_ME are expanded into the markup by the shell.
	valPrint r "{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww12600\viewh12100\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0

\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
generated $NICE_TIME\\
from data of $LINKS_DATE\\
script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$WIKI_ME\"}}{\fldrslt contact}})
\\
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
\cf0 "
}
332
# Closes the RTF markup of the RTF log file by emitting the document's final brace
function printRTFfooter()
{
	local closing_brace='}'
	valPrint r "$closing_brace"
}
338
# Writes the HTML header to HTML log file
function printHTMheader()
{
	# One literal containing the page skeleton plus the report title block;
	# NICE_TIME, LINKS_DATE and WIKI_ME are expanded into the markup by the
	# shell. The <body> element is left open here; printHTMfooter closes it.
	valPrint h "<html>
<head>
<title>Validate External Links report</title>
</head>
<body>
<h2>Validate External Links report</h2>
<h3>generated $NICE_TIME<br />
from data of $LINKS_DATE<br />
script by Iritscen (<a href=\"$WIKI_ME\" target=\"_blank\">contact</a>)</h3>"
}
352
# Closes the HTML markup of the HTML log file (the <body> and <html> elements
# opened by printHTMheader)
function printHTMfooter()
{
	local closing_markup='</body>
</html>'
	valPrint h "$closing_markup"
}
359
# The central logging function. The first parameter is a string composed of one or more characters that
# indicate which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
# 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 's' means "Print an
# extra newline at the end." 'w' means "Don't pass console output through 'fmt'" ("fmt" fits the output
# to an 80-column CLI but can break special formatting and the 'n' option).
# The second parameter is the message itself. Multiple destination characters may
# be combined in one call, e.g. "ctrh" writes everywhere at once.
function valPrint()
{
	# Console output: 'n' suppresses the trailing newline, 'w' prints verbatim,
	# 's' appends a blank line, default wraps the text to 80 columns with 'fmt'
	if [[ "$1" == *c* ]]; then
		if [[ "$1" == *n* ]]; then
			echo -n "$2"
		elif [[ "$1" == *w* ]]; then
			echo "$2"
		elif [[ "$1" == *s* ]]; then
			echo -e "$2\n"
		else
			echo "$2" | fmt -w 80
		fi
	fi
	# TXT log: plain text with the same 'n'/'s' modifiers as the console
	if [[ "$1" == *t* ]]; then
		if [[ "$1" == *n* ]]; then
			echo -n "$2" >> "$LOG_PATH_TXT"
		elif [[ "$1" == *s* ]]; then
			echo -e "$2\n" >> "$LOG_PATH_TXT"
		else
			echo "$2" >> "$LOG_PATH_TXT"
		fi
	fi
	# RTF log: a line break is the literal RTF control word '\line' (bash 'echo'
	# without -e leaves the backslash untouched); 'n' omits it, 's' doubles it
	if [[ "$1" == *r* ]]; then
		if [[ "$1" == *n* ]]; then
			echo "$2" >> "$LOG_PATH_RTF"
		elif [[ "$1" == *s* ]]; then
			echo "$2\line\line" >> "$LOG_PATH_RTF"
		else
			echo "$2\line" >> "$LOG_PATH_RTF"
		fi
	fi
	# HTML log: 's' appends an empty table row as the extra spacing (the link
	# report is built as a table), 'n' omits the trailing <br />
	if [[ "$1" == *h* ]]; then
		if [[ "$1" == *s* ]]; then
			echo "$2<tr><td>&nbsp;</td></tr>" >> "$LOG_PATH_HTM"
		elif [[ "$1" == *n* ]]; then
			echo "$2" >> "$LOG_PATH_HTM"
		else
			echo "$2<br />" >> "$LOG_PATH_HTM"
		fi
	fi
}
406
# Pluralize the string in parameter 1 if the number in parameter 2 is not 1.
# Nouns ending in "x" get "es" appended; all other nouns get "s".
function pluralCheckNoun()
{
	local noun=$1
	local count=$2

	if [ $count -eq 1 ]; then
		echo $noun
		return
	fi

	case $noun in
		*x ) echo ${noun}es;;
		*  ) echo ${noun}s;;
	esac
}
420
# Output "is" if parameter 1 is 1, otherwise "are"
function pluralCheckIs()
{
	if [ "$1" -eq 1 ]; then
		echo "is"
	else
		echo "are"
	fi
}
430
# Output "was" if parameter 1 is 1, otherwise "were"
function pluralCheckWas()
{
	if [ "$1" -eq 1 ]; then
		echo "was"
	else
		echo "were"
	fi
}
440
# Output "a " (with trailing space) if parameter 1 is 1, otherwise nothing
function pluralCheckA()
{
	if [ "$1" -ne 1 ]; then
		return
	fi
	echo "a "
}
448
# Output "an " (with trailing space) if parameter 1 is 1, otherwise nothing
function pluralCheckAn()
{
	if [ "$1" -ne 1 ]; then
		return
	fi
	echo "an "
}
456
# Upload the reports using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
# reports being saved to disk have already been closed. Reads the SFTP credentials
# (user, password, port, remote path) from the UPLOAD_INFO file, each on a line
# beginning with its marker, then runs the expect script once per report format.
function uploadReport()
{
	valPrint c "Uploading reports..."

	SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
	SFTP_USER_NAME_MARKER="user:"
	SFTP_PASSWORD_MARKER="pw:"
	SFTP_PORT_MARKER="port:"
	SFTP_PATH_MARKER="path:"
	# Pull each credential line out of the info file, then strip off its marker
	SFTP_USER_NAME=$(grep $SFTP_USER_NAME_MARKER $UPLOAD_INFO)
	SFTP_USER_NAME=${SFTP_USER_NAME#$SFTP_USER_NAME_MARKER}
	SFTP_PASSWORD=$(grep $SFTP_PASSWORD_MARKER $UPLOAD_INFO)
	SFTP_PASSWORD=${SFTP_PASSWORD#$SFTP_PASSWORD_MARKER}
	SFTP_PORT=$(grep $SFTP_PORT_MARKER $UPLOAD_INFO)
	SFTP_PORT=${SFTP_PORT#$SFTP_PORT_MARKER}
	SFTP_PATH=$(grep $SFTP_PATH_MARKER $UPLOAD_INFO)
	SFTP_PATH=${SFTP_PATH#$SFTP_PATH_MARKER}

	for SUFFIX in htm rtf txt; do
		expect "$SCRIPT_PATH" "$LOG_PATH.$SUFFIX" $SFTP_USER_NAME $SFTP_PASSWORD $SFTP_PORT $SFTP_PATH "$LOG_NAME.$SUFFIX"

		# Capture the exit status immediately; '$?' inside the 'then' branch below
		# would be the status of the '[' test itself (always 0), which is why the
		# old code always reported "Error 0"
		EXPECT_STATUS=$?
		if [ $EXPECT_STATUS -ne 0 ]; then
			valPrint c "Error $EXPECT_STATUS occurred when attempting to upload $LOG_NAME.$SUFFIX!"
		else
			# Quote the 'tr' character classes so the shell cannot glob them
			valPrint c "Report in $(echo $SUFFIX | tr '[:lower:]' '[:upper:]') format was uploaded."
		fi
	done
}
487
# Prints session summary when script is done, audits the exceptions list, closes
# the log files' markup, optionally uploads the reports, and exits. Also installed
# as the INT (Ctrl-C) handler, so it must cope with a partially-finished run.
function wrapupAndExit()
{
	# Get off progress line on console, drop down a line from last link in log, and close HTML table
	valPrint ctr ""
	valPrint h "</table><br />"

	# If we didn't finish processing the last URL, then the iterator is one too high
	if [ $FINISHED_LIST != "yes" ]; then
		let LINK_NUM-=1
		if [ $FINISHED_LIST == "no" ]; then
			valPrint ctrh "The session was canceled by the user."
		fi
	fi

	# Generate string with elapsed time
	END_RUN=$(date +%s)
	ELAPSED=$(echo $(($END_RUN - $START_RUN)) | awk '{printf "%d min. %d sec. elapsed", int($1/60), int($1%60)}')

	# Do some math on results of session. The *_TOTAL figure counts every flagged
	# link; the per-category LINK_PROBLEMS_* figures subtract the excepted links,
	# and *_NET is their sum (i.e. what actually appears in the HTML report)
	LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
	TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
	LINK_ERRORS=$((SKIP_PARSE_FAIL+SKIP_UNK_PROT+SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
	LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_RD+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
	LINK_PROBLEMS_TOTAL=$((NG_LINKS+RD_LINKS+EI_LINKS+IW_LINKS))
	LINK_PROBLEMS_NG=$((NG_LINKS-SKIP_EXPECT_NG))
	LINK_PROBLEMS_RD=$((RD_LINKS-SKIP_EXPECT_RD))
	LINK_PROBLEMS_EI=$((EI_LINKS-SKIP_EXPECT_EI))
	LINK_PROBLEMS_IW=$((IW_LINKS-SKIP_EXPECT_IW))
	LINK_PROBLEMS_NET=$((LINK_PROBLEMS_NG+LINK_PROBLEMS_RD+LINK_PROBLEMS_EI+LINK_PROBLEMS_IW))

	# Print something in the Links section if no link issues were printed
	if [ $LINK_PROBLEMS_NET -eq 0 ]; then
		valPrint h "<i>No link problems to report! See the <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for a list of links with issues that were not reported.</i>"
	fi
	if [ $LINK_PROBLEMS_TOTAL -eq 0 ]; then
		valPrint t "No link problems to report!"
		valPrint r "\i1 No link problems to report! \i0"
	fi

	## SUMMARY OUTPUT ##
	valPrint ct "Summary ($ELAPSED):"
	valPrint r "\b1 Summary \b0 ($ELAPSED)"
	valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
	valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT) (there $(pluralCheckWas $FILE_LINKS) $FILE_LINKS file $(pluralCheckNoun link $FILE_LINKS) and $PAGE_LINKS page $(pluralCheckNoun link $PAGE_LINKS))."

	# Print processed link totals
	if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
	if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
	if [ $SKIP_ARCHIVES -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVES archive.org/archive.is $(pluralCheckNoun link $SKIP_ARCHIVES) $(pluralCheckWas $SKIP_ARCHIVES) not checked"; fi
	if [ $LINK_PROBLEMS_TOTAL -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_TOTAL processed $(pluralCheckNoun link $LINK_PROBLEMS_TOTAL) had $(pluralCheckAn $LINK_PROBLEMS_TOTAL)$(pluralCheckNoun issue $LINK_PROBLEMS_TOTAL)"; fi
	if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED link $(pluralCheckNoun issue $LINKS_EXCEPTED) from report)"; valPrint h "&nbsp;&nbsp;(excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
	if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi
	if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h "&nbsp;&nbsp;(counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi

	# Print errored link totals
	if [ $LINK_ERRORS -gt 0 ]; then
		valPrint c "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"
		valPrint h "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
		valPrint rt "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS):"
	fi
	if [ $SKIP_PARSE_FAIL -gt 0 ]; then valPrint ctrh "- $SKIP_PARSE_FAIL line-parsing $(pluralCheckNoun failure $SKIP_PARSE_FAIL)"; fi
	if [ $SKIP_UNK_PROT -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_PROT unknown $(pluralCheckNoun protocol $SKIP_UNK_PROT)"; fi
	if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
	if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
	if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
	if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
	if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
	if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi

	# Print excepted link totals
	if [ $LINKS_EXCEPTED -gt 0 ]; then
		valPrint c "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"
		valPrint h "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
		valPrint rt "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted:"
	fi
	if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
	if [ $SKIP_EXPECT_RD -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_RD/$RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
	if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
	if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi

	# Perform exceptions audit: report exception-list entries that never matched a
	# link at all (EXCEPT_FOUND still 0) or matched one that did not return the
	# listed error code (EXCEPT_USED still 0)
	EXCEPTION_ISSUES=0
	valPrint ctrh "Exceptions list audit:"
	for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
		EXCEPT_LINE="${EXCEPT_ARRAY[$i]}"
		EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/\&amp;/\&/g') # copied from exception-matching code

		if [ ${EXCEPT_FOUND[$i]} -eq 0 ]; then
			# Exception lines are "code,URL,page"; carve out the middle and last fields
			EXCEPT_URL="${EXCEPT_LINE#*,}"
			EXCEPT_URL="${EXCEPT_URL%,*}"
			EXCEPT_PAGE="${EXCEPT_LINE##*,}"
			EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
			if [ "$EXCEPT_PAGE" == "*" ]; then
				valPrint tr "- The link '$EXCEPT_URL' did not return an error or is not on any page."
			else
				valPrint tr "- The link '$EXCEPT_URL' did not return an error or is not on page '$EXCEPT_PAGE'."
			fi
			let EXCEPTION_ISSUES+=1
		elif [ ${EXCEPT_USED[$i]} -eq 0 ]; then
			EXCEPT_URL="${EXCEPT_LINE#*,}"
			EXCEPT_URL="${EXCEPT_URL%,*}"
			EXCEPT_CODE=${EXCEPT_LINE%%,*}
			valPrint tr "- The link '$EXCEPT_URL' did not return error code $EXCEPT_CODE."
			let EXCEPTION_ISSUES+=1
		fi
	done
	if [ $EXCEPTION_ISSUES -eq 0 ]; then
		valPrint ctrh "- No issues found."
	else
		valPrint c "$EXCEPTION_ISSUES exception list $(pluralCheckNoun issue $EXCEPTION_ISSUES) detected (see RTF or TXT report for details)."
		valPrint h "$EXCEPTION_ISSUES exception list $(pluralCheckNoun issue $EXCEPTION_ISSUES) detected (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for details)."
	fi

	# Print checked link totals
	if [ $LINK_PROBLEMS_NET -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS_NET link $(pluralCheckNoun issue $LINK_PROBLEMS_NET):"; fi
	if [ $LINK_PROBLEMS_NG -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_NG NG $(pluralCheckNoun link $LINK_PROBLEMS_NG)"; fi
	if [ $LINK_PROBLEMS_RD -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_RD $(pluralCheckNoun redirection $LINK_PROBLEMS_RD)"; fi
	if [ $LINK_PROBLEMS_EI -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_EI $(pluralCheckNoun link $LINK_PROBLEMS_EI) that could be intrawiki"; fi
	if [ $LINK_PROBLEMS_IW -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_IW $(pluralCheckNoun link $LINK_PROBLEMS_IW) that could be interwiki"; fi

	# Close the log files' markup
	valPrint trh "ValExtLinks says goodbye."
	printRTFfooter
	printHTMfooter

	# Upload report if this was requested
	if [ ! -z $UPLOAD_INFO ] && [ $FINISHED_LIST != "no" ]; then
		uploadReport
	fi

	# Really quit now
	valPrint c "ValExtLinks says goodbye."
	exit 0
}
# Run the wrapup-and-exit path when the user cancels the session with Ctrl-C
trap wrapupAndExit INT
624
625
### INITIALIZATION ###
# Print opening message to console and log files
valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
printTXTheader
printRTFheader
printHTMheader

## DATA SOURCING ##
valPrint t "Startup:"
valPrint r "\b1 Startup \b0"
valPrint hn "<h3>Startup</h3>"

# Attempt to download file at LINKS_URL, then check that it succeeded
# NOTE(review): 'curl -o' can leave an empty file behind on a failed transfer, in
# which case the '-f' test below would still pass -- confirm against the server
valPrint cwtrhn "Downloading list of external links from $LINKS_URL..."
LINKS_FILE_NAME=$(echo "$LINKS_URL" | sed 's/.*\///') # everything after the last '/'
LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
curl --silent -o "$LINKS_FILE" $LINKS_URL
if [ ! -f "$LINKS_FILE" ]; then
	echo -e "\nThe download of $LINKS_URL appears to have failed. Aborting."
	wrapupAndExit
else
	valPrint ctrh " success."
fi

# Attempt to download file at EXCEPT_URL, then check that it succeeded
if [ ! -z $EXCEPT_URL ]; then
	valPrint cwtrhn "Downloading list of reporting exceptions from $EXCEPT_URL..."
	EXCEPT_DATA=$(curl --silent $EXCEPT_URL)
	if [ -z "$EXCEPT_DATA" ]; then
		echo -e "\nThe download of the exceptions data from '$EXCEPT_URL' appears to have failed. Aborting."
		wrapupAndExit
	else
		valPrint ctrh " success."
	fi
	# Trim the page text down to the content between the "BEGIN LIST" and
	# "END LIST" markers on the wiki page
	EXCEPT_DATA=${EXCEPT_DATA%END LIST*}
	EXCEPT_DATA=${EXCEPT_DATA#*BEGIN LIST}
	EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"

	# Store on disk for debugging purposes
	echo "$EXCEPT_DATA" > "$EXCEPT_FILE"

	# Transfer to array for easy searching later (the unquoted expansion splits
	# on newlines only, because IFS was set to newline at the top of the script)
	declare -a EXCEPT_ARRAY=($(echo "$EXCEPT_DATA"))

	# Create parallel arrays for marking which exceptions get used later
	declare -a EXCEPT_USED=()
	declare -a EXCEPT_FOUND=()
	for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
		EXCEPT_USED+=(0)
		EXCEPT_FOUND+=(0)
	done
fi

# Pipe 'cat' to 'wc' because passing LINKS_FILE to 'wc' would print the file name after the line count
LINK_COUNT_STRING=$(cat "$LINKS_FILE" | wc -l)

# Number of URLs is number of lines minus one (first line is column header row for the CSV)
LINK_COUNT=$(echo "${LINK_COUNT_STRING}" | tr -d '[:space:]')
let LINK_COUNT-=1
valPrint ctrh "Found $LINK_COUNT links to process."
valPrint trh ""

## CONFIG OUTPUT ##
# Echo the effective settings for this run into each report
valPrint t "Config:"
valPrint r "\b1 Config \b0"
valPrint hn "<h3>Config</h3>"

valPrint ctrhn "Links to consider: "
if [ $URL_LIMIT -ne 0 ] && [ $URL_LIMIT -lt $LINK_COUNT ]; then
	valPrint ctrh "$((URL_LIMIT-URL_START+1)), from link $URL_START to $URL_LIMIT"
elif [ $URL_START -ne 1 ]; then
	valPrint ctrh "$((LINK_COUNT-URL_START+1)), from link $URL_START to $LINK_COUNT"
else
	valPrint ctrh "$LINK_COUNT"
fi

valPrint ctrh "Site query timeout: $TIMEOUT seconds"

valPrint ctrhn "Show OK links: "
if [ $RECORD_OK_LINKS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi

valPrint ctrhn "Take screenshots: "
if [ $TAKE_PAGE_SHOT -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi

valPrint ctrhn "Suggest archive.org snapshots for NG pages: "
if [ $SUGGEST_SNAPSHOTS_NG -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi

valPrint ctrhn "Suggest archive.org snapshots for OK pages: "
if [ $SUGGEST_SNAPSHOTS_OK -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi

valPrint ctrhn "Ignore slash-adding redirects: "
if [ $SHOW_SLASH -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi

valPrint ctrhn "Ignore HTTPS-upgrading redirects: "
if [ $SHOW_HTTPS -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi

valPrint ctrhn "Ignore youtu.be redirects: "
if [ $SHOW_YT_RD -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi

valPrint ctrhn "Check archive.org and archive.is links: "
if [ $CHECK_ARCHIVE_LINKS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi

valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
valPrint trh ""
[1141]732## LEGEND OUTPUT ##
[1064]733valPrint t "Legend:"
734valPrint r "\b1 Legend \b0"
735valPrint hn "<h3>Legend</h3>"
[1175]736valPrint t "(For guidance in fixing these links, see $WIKI_MAIN. The exceptions list is at $EXCEPT_URL.)"
737valPrint r "(For guidance in fixing these links, see {\field{\*\fldinst{HYPERLINK \"$WIKI_MAIN\"}}{\fldrslt here}}. The exceptions list is {\field{\*\fldinst{HYPERLINK \"$EXCEPT_URL\"}}{\fldrslt here}}.)"
738valPrint h "(For guidance in fixing these links, see <a href=\"$WIKI_MAIN\" target=\"_blank\">here</a>. The exceptions list is <a href=\"$EXCEPT_URL\" target=\"_blank\">here</a>.)"
[1141]739valPrint trh "OK = URL seems to be working"
740valPrint trh "NG = URL no longer seems to work"
741valPrint trh "RD = URL is redirecting to this new URL"
742valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup"
743valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup"
744valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $WIKI_HTTP)"
745valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$WIKI_HTTP\"}}{\fldrslt here}} for code reference)"
746valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$WIKI_HTTP\" target=\"_blank\">here</a> for code reference)"
747valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $WIKI_CURL)"
748valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$WIKI_CURL\"}}{\fldrslt here}} for code reference)"
749valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$WIKI_CURL\" target=\"_blank\">here</a> for code reference)"
750valPrint trh "IA suggests = Last available snapshot returned by the Internet Archive"
751valPrint trh "Try browsing = The Archive failed to return a snapshot URL, so check for a snapshot manually using this link"
[1064]752valPrint trh ""
753
754
### MAIN LOOP ###
valPrint t "Links:"
valPrint r "\b1 Links \b0"
valPrint hn "<h3>Links</h3>"
START_RUN=$(date +%s)
# Process each line of the .csv in LINKS_FILE. IFS was set to newline at the top of
# the script, so this unquoted expansion iterates line-by-line, not word-by-word.
for LINE in `cat "$LINKS_FILE"`; do
	START_LINK=$(date +%s) # per-link timestamp, used later for Archive.org rate limiting
	let LINK_NUM+=1

	# First line is the column header row for the CSV, so let's verify that the format hasn't changed
	if [ $SKIPPED_HEADER_ROW -eq 0 ]; then
		if [ $LINE == "namespace,title,target" ]; then
			SKIPPED_HEADER_ROW=1
			LINK_NUM=0 # this line is not a link, so reset the link counter
			valPrint hn "<table>"
			continue
		else
			valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
			wrapupAndExit
		fi
	fi

	# Skip this link if we are not at URL_START yet
	if [ $LINK_NUM -lt $URL_START ]; then
		continue
	fi

	# Stop if we are at the limit declared for testing purposes
	if [ $URL_LIMIT -gt 0 ] && [ $LINK_NUM -gt $URL_LIMIT ]; then
		FINISHED_LIST="limit"
		wrapupAndExit
	fi

	# Parse line into namespace ID number, containing wiki page, and external link URL.
	# Parsing is positional on the first two commas, hence the caveats below.
	NS_ID=${LINE%%,*}
	PAGE_NAME=${LINE#$NS_ID,}
	PAGE_NAME=${PAGE_NAME%%,*} # a comma in the page name will break this
	URL=${LINE#$NS_ID,$PAGE_NAME,} # commas can be in this
	if [ -z "$NS_ID" ] || [ -z "$PAGE_NAME" ] || [ -z "$URL" ]; then
		valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace, wiki page or link URL could not be read."
		let SKIP_PARSE_FAIL+=1
		continue
	fi
799
800 # Skip any link that isn't "http://" or "https://"
801 if [[ ! $URL =~ ^http* ]]; then
802 valPrint trs "Skipping line $LINK_NUM ('$LINE') because the protocol isn't 'http://' or 'https://'."
803 let SKIP_UNK_PROT+=1
804 continue
805 fi
[1064]806
807 # Print progress to screen
808 if [ $LINK_NUM -gt 1 ]; then
809 printf "\e[1A\n" # erase previous progress message so that new one appears in its place
810 fi
811 valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
812
813 # Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
814 NS_NAME=""
815 a=0
[1069]816 while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
[1118]817 if [ $NS_ID == "NULL" ]; then
818 break
819 elif [ $NS_ID -eq ${NS_IDS[$a]} ]; then
[1064]820 NS_NAME="${NS_NAMES[$a]}"
821 break
822 fi
823 let a+=1
824 done
[1118]825 if [ "$NS_NAME" == "" ]; then
826 if [ $NS_ID == "NULL" ]; then
[1123]827 valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace (and probably the page too) is 'NULL'. Probably the link is no longer in existence on the wiki."
[1118]828 else
[1123]829 valPrint trs "Skipping line $LINK_NUM ('$LINE') because I could not find a name for namespace ID $NS_ID."
[1118]830 fi
[1064]831 let SKIP_UNK_NS+=1
[1148]832 let PAGE_LINKS+=1
[1064]833 continue
834 fi
835
[1070]836 # Build longer wiki page URLs from namespace and page names
[1122]837 FULL_PAGE_PATH=https://$WIKI_PATH/$NS_NAME:$PAGE_NAME
[1070]838 LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
839 # Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it
840 # explicitly breaks the link
841 if [ $NS_ID -eq 0 ]; then
[1122]842 FULL_PAGE_PATH=https://$WIKI_PATH/$PAGE_NAME
[1070]843 LOCAL_PAGE_PATH=$PAGE_NAME
844 fi
845
[1149]846 # We don't want to consider wiki pages ending in .js, as the MW parser cannot reliably isolate URLS
847 # in JavaScript code, so it returns erroneous links
848 PAGE_NAME_SUFFIX=$(echo $PAGE_NAME | sed 's/.*\.//')
849 if [ $PAGE_NAME_SUFFIX == "js" ]; then
850 valPrint trs "Skipping URL '${LINE#$NS_ID,$PAGE_NAME,}' on line $LINK_NUM because it was found on JavaScript page '$LOCAL_PAGE_PATH'."
851 let SKIP_JS_PAGE+=1
852 let PAGE_LINKS+=1
853 continue
854 fi
855
[1064]856 # Scan for illegal characters
857 if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
[1149]858 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because it contains characters illegal in a URL."
[1064]859 let SKIP_BAD_URL+=1
[1148]860 let PAGE_LINKS+=1
[1064]861 continue
862 fi
863
[1158]864 # If we're skipping archive links, see if this is one
865 if [ $CHECK_ARCHIVE_LINKS -eq 0 ] && [[ ( $URL == *web.archive.org* || $URL == *archive.is* ) ]]; then
866 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to check archive links."
867 let SKIP_ARCHIVES+=1
[1148]868 let PAGE_LINKS+=1
[1135]869 continue
870 fi
871
[1064]872 # Now we need to know if the URL is for a file or a web page. First step is to determine if the
873 # URL ends in a suffix
874 HAS_SUFFIX=0
875
876 # If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
[1070]877 CLEAN_URL=${URL%%\?*}
[1064]878
879 # If the URL ends in something like "#section_15", strip everything from the '#' onward
[1070]880 CLEAN_URL=${CLEAN_URL%%\#*}
[1064]881
[1175]882 # 'sed' cannot handle Unicode in my Bash shell, so skip non-ASCII URL and make reader check it
[1070]883 if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
[1149]884 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I cannot handle non-ASCII characters."
[1064]885 let SKIP_NON_ASCII+=1
[1148]886 let PAGE_LINKS+=1
[1064]887 continue
888 fi
889
890 # Isolate the characters after the last period and after the last slash
[1070]891 POST_DOT=$(echo "$CLEAN_URL" | sed 's/.*\.//')
892 POST_SLASH=$(echo "$CLEAN_URL" | sed 's/.*\///')
[1064]893
894 # If the last period comes after the last slash, then the URL ends in a suffix
895 POST_DOT_LENGTH=$(echo | awk -v input=$POST_DOT '{print length(input)}')
896 POST_SLASH_LENGTH=$(echo | awk -v input=$POST_SLASH '{print length(input)}')
897 if [ $POST_DOT_LENGTH -lt $POST_SLASH_LENGTH ]; then
898 HAS_SUFFIX=1
899 else
900 HAS_SUFFIX=0
901 fi
902
903 # Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
904 # known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES
905 IS_FILE=-1
906 if [ $HAS_SUFFIX -eq 0 ]; then
907 IS_FILE=0
908 else
909 # Turn off case sensitivity while we compare suffixes
910 shopt -s nocasematch
911
[1127]912 # Special case: URLs ending in something like "productID.304297400" are pages, not files, so if
[1064]913 # the URL's suffix is all numbers, we are looking at the end of a web page URL
914 if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
915 IS_FILE=0
916 fi
[1127]917
918 # Special case: URLs ending in a parens, e.g. "ms537113(v=vs.85)", are pages
919 if [[ $POST_DOT =~ ^.*[\(\)]$ ]]; then
920 IS_FILE=0
921 fi
922
923 # Special case: URLs containing a '%', e.g. "10.1007%2FBF00329055", are pages
924 if [[ $POST_DOT == *%* ]]; then
925 IS_FILE=0
926 fi
[1064]927
928 # If we did not identify this URL as a web page above, we need to compare the suffix against known
929 # file extensions
930 if [ $IS_FILE -eq -1 ]; then
931 for EXTENSION in "${HTTP_FILES[@]}"; do
932 if [[ $EXTENSION == $POST_DOT ]]; then # double brackets respect the nocasematch setting
933 IS_FILE=1
934 break
935 fi
936 done
937 fi
938
939 # If we did not identify this URL as a file above, we need to compare the suffix against known
940 # pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
941 # needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
942 if [ $IS_FILE -eq -1 ]; then
943 for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
944 if [[ $PAGE_OR_TLD == $POST_DOT ]]; then
945 IS_FILE=0
946 break
947 fi
948 done
949 fi
950
951 # Turn case sensitivity back on in Bash
952 shopt -u nocasematch
953 fi
954
[1175]955 # If this suffix escaped identification as either a file, page or TLD, inform the reader
[1064]956 STR_TYPE=""
957 if [ $IS_FILE -eq -1 ]; then
[1160]958 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I encountered the unknown URL suffix '$POST_DOT'. Please add this suffix to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
[1064]959 let SKIP_UNK_SUFFIX+=1
960 continue
961 elif [ $IS_FILE -eq 1 ]; then
962 STR_TYPE="file"
963 let FILE_LINKS+=1
[1148]964 else
[1064]965 STR_TYPE="page"
966 let PAGE_LINKS+=1
967 fi
968
969 # Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
970 # issue with sites that require HTTPS
[1158]971 CURL_CODE=$(curl -o /dev/null --silent --insecure --compressed --head --user-agent '$AGENT' --max-time $TIMEOUT --retry 2 --write-out '%{http_code}\n' $URL)
[1064]972 CURL_ERR=$(echo $?)
973 CURL_RESULT=$CURL_CODE
974
975 # Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
976 if [ $CURL_CODE == "000" ]; then
977 CURL_RESULT="$CURL_RESULT-$CURL_ERR"
978 fi
979
[1070]980 # Begin to determine our status code for this URL (EI, IW, OK, RD, or NG)
[1064]981 STATUS="??"
[1067]982 NEW_URL=""
[1064]983 INTERWIKI_INDEX=-1
984
[1070]985 # First make sure that this isn't an "external internal" link to our own wiki that can be replaced
986 # by "[[page_name]]". If it uses a special access URL beginning with "/w/", let it pass, as it
987 # probably cannot be replaced by "[[ ]]" markup
988 if [[ $URL == *$WIKI_PATH* ]] && [[ $URL != *$WIKI_PATH/w/* ]]; then
989 STATUS="EI"
990 let EI_LINKS+=1
991 fi
992
[1144]993 # If it's not, check if this is a link to a domain that we have an interwiki prefix for (also make
994 # sure that it's not an archive.org link to a page from an interwiki domain)
995 if [ $STATUS == "??" ] && [[ $URL != *web.archive.org* ]]; then
[1070]996 for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
997 if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]] && [[ $URL != *${INTERWIKI_DOMAINS[$i]}/w/* ]]; then
998 STATUS="IW"
999 let IW_LINKS+=1
1000 INTERWIKI_INDEX=$i
1001 break
1002 fi
1003 done
1004 fi
1005
[1069]1006 # If we didn't match an interwiki domain, see if the status code is in our "OK" codes list
1007 if [ $STATUS == "??" ]; then
1008 for CODE in "${OK_CODES[@]}"; do
1009 if [[ $CODE == $CURL_CODE ]]; then
1010 STATUS="OK"
1011 let OK_LINKS+=1
[1148]1012
1013 # If this is a YouTube link, we have to look at the actual page source to know if the video
[1157]1014 # is good or not; override the link's info if it's actually NG
[1148]1015 if [[ $URL == *www.youtube.com* ]]; then
[1182]1016 PAGE_TEXT=$(curl --silent --insecure --user-agent '$AGENT' --max-time $TIMEOUT $URL)
1017 CURL_ERR=$(echo $?)
1018 if [ "$CURL_ERR" != "0" ]; then
[1148]1019 STATUS="NG"
[1182]1020 CURL_RESULT="000-$CURL_ERR"
[1148]1021 let OK_LINKS-=1
1022 let NG_LINKS+=1
[1182]1023 elif [[ "$PAGE_TEXT" =~ "simpleText\":\"Video unavailable" ]]; then
1024 STATUS="NG"
1025 CURL_CODE="404"
1026 CURL_RESULT=$CURL_CODE
1027 let OK_LINKS-=1
1028 let NG_LINKS+=1
[1148]1029 fi
1030 fi
[1182]1031
1032 # If this is a OneDrive link, we have to look at the actual page source to know if the file
1033 # is really still at this URL; override the link's info if it's actually NG or RD
1034 if [[ $URL == *skydrive.live.com* ]]; then
1035 PAGE_TEXT=$(curl --silent --insecure --user-agent '$AGENT' --max-time $TIMEOUT $URL)
1036 CURL_ERR=$(echo $?)
1037 if [ "$CURL_ERR" != "0" ]; then
1038 STATUS="NG"
1039 CURL_RESULT="000-$CURL_ERR"
1040 let OK_LINKS-=1
1041 let NG_LINKS+=1
1042 elif [[ "$PAGE_TEXT" =~ "<h1>Sorry, something went wrong" ]]; then
1043 STATUS="NG"
1044 CURL_CODE="404"
1045 CURL_RESULT=$CURL_CODE
1046 let OK_LINKS-=1
1047 let NG_LINKS+=1
1048 elif [[ "$PAGE_TEXT" =~ "<h2>Object moved to" ]]; then
1049 STATUS="??" # have to send the code through the next block to treat the redirect properly
1050 CURL_CODE="301"
1051 CURL_RESULT=$CURL_CODE
1052 let OK_LINKS-=1
1053 fi
1054 fi
1055
[1069]1056 break
1057 fi
1058 done
1059 fi
1060
[1067]1061 # If we didn't get a match with the "OK" codes, check it against the "RD" codes
[1064]1062 if [ $STATUS == "??" ]; then
[1067]1063 for CODE in "${RD_CODES[@]}"; do
1064 if [[ $CODE == $CURL_CODE ]]; then
[1182]1065 # Get URL header again in order to retrieve the URL we are being redirected to, but if this
1066 # is a OneDrive link, we already have the new URL in $PAGE_TEXT
1067 if [[ $URL == *skydrive.live.com* ]]; then
1068 NEW_URL=${PAGE_TEXT##*href=\"}
1069 NEW_URL=${NEW_URL%\">here*}
1070 else
1071 NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent '$AGENT' --max-time $TIMEOUT --write-out '%{redirect_url}\n' $URL)
1072 fi
[1067]1073
[1122]1074 # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
1075 # those changes out if the user didn't ask for them
1076 URL_HTTP=$(echo $URL | sed -E 's/^https:/http:/')
1077 NEW_URL_HTTP=$(echo $NEW_URL | sed -E 's/^https:/http:/')
[1070]1078
1079 # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
[1122]1080 NEW_URL_LENGTH=$(echo | awk -v input=$NEW_URL_HTTP '{print length(input)}')
[1070]1081 if [ $NEW_URL_LENGTH -lt $MIN_URL_LENGTH ]; then
[1122]1082 NEW_URL_HTTP="[new URL not retrieved]"
[1070]1083 fi
1084
[1122]1085 # Remove slash at end of new URL, if present, so we can filter out the redirects that
1086 # merely add an ending slash if the user didn't ask for them
1087 NEW_URL_NO_SLASH=$(echo $NEW_URL_HTTP | sed -E 's:/$::')
1088
[1127]1089 # Detect if this is a youtu.be link simply being expanded by YouTube to the full
1090 # youtube.com address
1091 YOUTU_BE=0
1092 if [[ $URL_HTTP == http://youtu.be* ]] && [[ $NEW_URL_HTTP == http://www.youtube.com* ]]; then
1093 YOUTU_BE=1
1094 fi
1095
[1122]1096 # If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
1097 # wants those to be reported)
1098 if [ $SHOW_HTTPS -eq 0 ] && [ $URL_HTTP == $NEW_URL_HTTP ]; then
[1149]1099 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show http->https upgrades, and I was redirected to '$NEW_URL'."
[1069]1100 STATUS="OK"
1101 let OK_LINKS+=1
[1122]1102 let SKIP_HTTPS_UP+=1
1103 # If the URLs match besides an added ending slash, then the link is OK (unless user wants
1104 # those to be reported)
1105 elif [ $SHOW_SLASH -eq 0 ] && [ $URL_HTTP == $NEW_URL_NO_SLASH ]; then
[1149]1106 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show added trailing slashes, and I was redirected to '$NEW_URL'."
[1122]1107 STATUS="OK"
1108 let OK_LINKS+=1
1109 let SKIP_SLASH_ADD+=1
[1148]1110 elif [ $YOUTU_BE -eq 1 ]; then
1111 # We have to look at the actual page source to know if a YouTube video is good or not
1112 PAGE_TEXT=$(curl --silent --insecure --user-agent '$AGENT' --max-time $TIMEOUT $NEW_URL | grep "\"simpleText\":\"Video unavailable\"")
1113 if [ ! -z "$PAGE_TEXT" ]; then
1114 STATUS="NG"
1115 let NG_LINKS+=1
1116 else
1117 if [ $SHOW_YT_RD -eq 0 ]; then
[1149]1118 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'."
[1148]1119 STATUS="OK"
1120 let OK_LINKS+=1
1121 let SKIP_YOUTU_BE+=1
1122 else
1123 STATUS="RD"
1124 let RD_LINKS+=1
1125 fi
1126 fi
[1069]1127 else
1128 STATUS="RD"
1129 let RD_LINKS+=1
1130 fi
[1067]1131 break
1132 fi
1133 done
1134 fi
1135
	# If we didn't get a match with the "RD" codes, check it against the "NG" codes
	if [ $STATUS == "??" ]; then
		for CODE in "${NG_CODES[@]}"; do
			if [[ $CODE == $CURL_CODE ]]; then
				STATUS="NG"
				let NG_LINKS+=1
				break
			fi
		done
	fi

	# If we didn't match a known status code at all (not OK, RD, or NG), advise the reader
	# and move on — the code may need to be added to one of the *_CODES arrays
	if [ $STATUS == "??" ]; then
		valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I encountered the unknown response code $CURL_CODE."
		let SKIP_UNK_CODE+=1
		continue
	fi
1153
[1136]1154 # Check problem links against exceptions list before proceeding
1155 FOUND_EXCEPT=0
[1175]1156 if [ $STATUS != "OK" ] && [ ! -z "$EXCEPT_URL" ]; then
[1070]1157 # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
1158 EXPECT_CODE="$CURL_RESULT"
1159 if [ $STATUS == "EI" ]; then
1160 EXPECT_CODE="EI"
1161 elif [ $STATUS == "IW" ]; then
1162 EXPECT_CODE="IW"
1163 fi
1164
[1136]1165 # Look for link in exceptions list and make sure the listed result code and wiki page also match
1166 for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
1167 {
1168 EXCEPT_LINE="${EXCEPT_ARRAY[$i]}"
[1182]1169
[1142]1170 # Undo any HTML-encoding from the wiki page; for now we just worry about the ampersand, as most
1171 # other HTML-encoded characters are not found in URLs
[1146]1172 EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/\&amp;/\&/g')
[1142]1173
[1175]1174 # Check for URL match
[1136]1175 EXCEPT_URL="${EXCEPT_LINE#*,}"
1176 EXCEPT_URL="${EXCEPT_URL%,*}"
[1178]1177 if [[ "$EXCEPT_URL" =~ \* ]]; then # if this exception URL contains the '*' wildcard, use pattern-matching with it
[1182]1178 if [[ ! "$URL" == $EXCEPT_URL ]]; then
[1178]1179 continue
1180 fi
1181 else
1182 if [ "$EXCEPT_URL" != "$URL" ]; then # otherwise just use a straight string comparison
1183 continue
1184 fi
[1070]1185 fi
[1136]1186
[1175]1187 # Check for page name match
[1136]1188 EXCEPT_PAGE="${EXCEPT_LINE##*,}"
1189 EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
[1175]1190 if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == "$LOCAL_PAGE_PATH" ]; then
1191 let EXCEPT_FOUND[$i]+=1
1192 valPrint trs "Found exception '$URL' on page '$LOCAL_PAGE_PATH'."
1193
1194 # Check for result code match
[1136]1195 EXCEPT_CODE=${EXCEPT_LINE%%,*}
1196 if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
[1175]1197 FOUND_EXCEPT=1
1198 let EXCEPT_USED[$i]+=1
[1149]1199 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because its expected result, '$EXPECT_CODE', is in the exceptions list."
[1175]1200
[1136]1201 if [ $STATUS == "EI" ]; then
1202 let SKIP_EXPECT_EI+=1
1203 elif [ $STATUS == "IW" ]; then
1204 let SKIP_EXPECT_IW+=1
[1142]1205 elif [ $STATUS == "RD" ]; then
1206 let SKIP_EXPECT_RD+=1
[1136]1207 else
1208 let SKIP_EXPECT_NG+=1
1209 fi
[1175]1210
[1136]1211 break
1212 fi
1213 fi
1214 } done
[1064]1215 fi
[1136]1216 if [ $FOUND_EXCEPT -eq 1 ]; then
1217 continue
1218 fi
[1064]1219
	# If appropriate, record this link to the log, with clickable URLs when possible
	if [ $STATUS != "OK" ] || [ $RECORD_OK_LINKS -eq 1 ]; then
		# Prepare 'curl' result in parentheses to print after status code, unless this is an "IW" or "EI"
		# link, in which case showing the status code doesn't make sense. Adjust spacing after string to
		# ensure TXT and RTF reports have aligned columns of results.
		CURL_STR_H=" ($CURL_RESULT)"
		CURL_STR_T="$CURL_STR_H"
		CURL_STR_R="$CURL_STR_H "
		if [ $STATUS == "IW" ] || [ $STATUS == "EI" ]; then
			CURL_STR_H=""
			CURL_STR_T=" "
			CURL_STR_R=" "
		fi

		# Record link and its wiki page in TXT, RTF, and HTML markup
		valPrint t "${STATUS}${CURL_STR_T} $STR_TYPE $URL"
		valPrint t " linked from $FULL_PAGE_PATH"
		valPrint r "${STATUS}${CURL_STR_R}${RTF_TABS}$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
		valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
		valPrint hn "<tr><td style=\"white-space:nowrap\">${STATUS}${CURL_STR_H}</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
		valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"

		# Place vertical space here since we won't be printing anything more about this link
		if [ $STATUS == "OK" ] && [ $SUGGEST_SNAPSHOTS_OK -eq 0 ]; then valPrint tr ""; valPrint hs ""; fi

		# Record redirect URL if one was given by a 3xx response page
		if [ $STATUS == "RD" ]; then
			valPrint ts " Server suggests $NEW_URL"
			valPrint rs " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
			valPrint hs "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
		fi

		# Notify reader if we can use an intrawiki link for this URL
		# (strips scheme and host, leaving just the wiki page path)
		if [ $STATUS == "EI" ]; then
			INTRA_PAGE=${URL#*://*/}
			valPrint ts " Just use [[$INTRA_PAGE]]"
			valPrint rs " Just use [[$INTRA_PAGE]]"
			valPrint hs "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
		fi

		# Notify reader if we can use an interwiki prefix for this URL
		# (INTERWIKI_INDEX was set when the domain matched in the IW check above)
		if [ $STATUS == "IW" ]; then
			INTER_PAGE=$(echo "$URL" | sed 's/.*\///')
			valPrint ts " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
			valPrint rs " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
			valPrint hs "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
		fi
1268 # Query Internet Archive for latest "OK" snapshot for "NG" page
[1147]1269 if [[ ( $STATUS == "NG" && $SUGGEST_SNAPSHOTS_NG -eq 1 ) || ( $STATUS == "OK" && $SUGGEST_SNAPSHOTS_OK -eq 1 ) ]]; then
1270
1271 # We need to watch out for the rate limit or we'll get locked out; look at how much time has
1272 # elapsed and then wait the remainder between that and how long of a wait we think is needed
1273 # to avoid the dreaded "Too Many Requests" response. 5 seconds is just a guess.
1274 CUR_TIME=$(date +%s)
1275 WAIT_REMAINDER=$((5 - $CUR_TIME + $START_LINK))
1276 if [ $WAIT_REMAINDER -gt 0 ]; then
1277 valPrint t "Waiting $WAIT_REMAINDER second(s) to conform to Archive.org rate limit."
1278 sleep $WAIT_REMAINDER
1279 fi
1280
1281 # Issue query to the API
[1141]1282 ARCHIVE_QUERY=$(curl --silent --max-time $TIMEOUT "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")
[1064]1283
[1175]1284 # Notify reader if we hit the rate limit and just keep going
[1147]1285 if [[ "$ARCHIVE_QUERY" == "*Too Many Requests*" ]]; then
1286 valPrint t " IA has rate-limited us!"
1287 valPrint r " IA has rate-limited us!"
1288 valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td>(hit the API rate limit!)</td></tr>"
[1175]1289 # If a "closest" snapshot was received, inform reader
[1147]1290 elif [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
[1118]1291 # In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
1292 ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')
1293
1294 # ...isolate "url" property in the response that follows the "closest" tag
1295 SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
[1119]1296 SNAPSHOT_URL=${SNAPSHOT_URL#*\"url\": \"} # everything after '"url": "'
[1118]1297 SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'
1298
[1124]1299 # Remove the port 80 part that IA often adds to the URL, as it's superfluous
1300 SNAPSHOT_URL=$(echo $SNAPSHOT_URL | sed 's/:80//')
1301
[1175]1302 # Inform the reader of the snapshot URL
[1119]1303 valPrint ts " IA suggests $SNAPSHOT_URL"
1304 valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
1305 valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
[1147]1306 else # Otherwise give a generic Wayback Machine link for this URL, which might work
[1119]1307 valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
1308 valPrint rs " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
1309 valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
[1064]1310 fi
1311 fi
1312 fi
1313
	# If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
	if [ $IS_FILE -eq 0 ] && [ $TAKE_PAGE_SHOT -eq 1 ] && [ $STATUS == "OK" ]; then
		# Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
		SHOT_NAME=$(echo "$URL" | sed 's/https*\:\/\///' | sed 'y/:\//__/')
		SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"

		# Don't take screenshot if we already encountered this page and screenshotted it
		if [ ! -f "$SHOT_FILE" ]; then
			# Chrome writes its screenshot into the working directory under a fixed name,
			# so move it to the per-URL file name afterwards ('-n' avoids overwriting)
			"$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 $URL > /dev/null 2>&1
			if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
				mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
			else
				valPrint trhs "Screenshot of URL $URL seems to have failed!"
			fi
		else
			valPrint trhs "Skipping screenshot of URL '$URL' because file '$SHOT_FILE' already exists."
		fi
	fi
done
# We fell off the end of the list normally (not via URL_LIMIT or an abort)
FINISHED_LIST="yes"
wrapupAndExit
Note: See TracBrowser for help on using the repository browser.