#!/bin/bash

# Validate External Links by Iritscen (iritscen@yahoo.com)
#
# Validates a list of external links in CSV format. The resulting logs are produced in three formats:
# - TXT (for easy diffing with an earlier log)
# - RTF (for reading as a local file with clickable links)
# - HTML (for reading as a web page)
# Call script with "--help" argument for documentation. Also see Read Me.rtf for critical notes.
#
# Recommended rule:
# |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|
#
# Table of contents (sections of script in order of appearance, not execution):
# • Globals
# • Help Output
# • Setup
# • Utility Functions
# • Summary Output
# • Initialization
# • Data Sourcing
# • Config Output
# • Legend Output
# • Main Loop

# Set separator token to newline
IFS="
"

### GLOBALS ###
# Settings -- these will be changed from their defaults by the arguments passed in to the script
LINKS_URL="" # download external link CSV from this location (can use "file://" protocol)
EXCEPT_URL="" # location of wiki page with a list of exceptions for NG results
OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES
ONLY_200_OK=0 # only treat code 200 as "OK" and not any other code in OK_CODES
SHOW_SLASH=0 # record issue when a slash is added to the end of a URL
SHOW_HTTPS=0 # record issue when "http" is upgraded to "https"
SHOW_YT_RD=0 # record redirection for a youtu.be URL expanding to the full URL
SUGGEST_SNAPSHOTS_NG=0 # query the Internet Archive for a possible snapshot URL for each NG page
SUGGEST_SNAPSHOTS_OK=0 # query the Internet Archive for an existing snapshot of each OK page
CHECK_ARCHIVE_LINKS=0 # check URLs on archive.org and archive.is
TAKE_PAGE_SHOT=0 # take a screenshot of each OK page
TIMEOUT=10 # time to wait for a response when querying a site
CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature
URL_START=1 # start at this URL in LINKS_FILE
URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE
UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report

# Fixed strings -- see the occurrences of these variables to learn their purpose
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
ARCHIVE_API="http://archive.org/wayback/available"
ARCHIVE_GENERIC="https://web.archive.org/web/*"
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
CHROME_SCREENSHOT="screenshot.png"
EXCEPT_FILE_NAME="exceptions.txt"
EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
WIKI_CURL="https://wiki.oni2.net/Validate_External_Links/Curl_codes"
WIKI_HTTP="https://wiki.oni2.net/Validate_External_Links/HTTP_codes"
WIKI_MAIN="https://wiki.oni2.net/Validate_External_Links"
WIKI_ME="http://iritscen.oni2.net"
THIS_DIR=$(cd "$(dirname "$0")"; pwd) # quoted because this script's own directory name contains spaces
WORKING_DIR=$(pwd)
WIKI_PATH="wiki.oni2.net"

# These are parallel arrays of the IDs and names of OniGalore's current namespaces
declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")

# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
# This determines whether the script tries to take a screenshot of the URL (when screenshots are
# requested).
declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png psd py rar tga TRMA txt vbs wav wmv xaf xcf xlsx xml zip)
declare -a HTTP_TLDS_AND_PAGES=(abstract action ars asp aspx cfm cgi com css de do full htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x)

# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
# are NG (no good). Pages that return OK codes will be screenshotted when screenshots are asked for.
# Remember to update http_codes.txt if you add a new code.
declare -a OK_CODES=(200 401 405 406 418 501)
declare -a RD_CODES=(301 302 303 307 308)
declare -a NG_CODES=(000 400 403 404 410 429 500 502 503 504 520 530)

# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
# transcluded text, and if the transclusion fails, then the braces show up in the URL
ILLEGAL_CHARS="{ }"

# The shortest URL possible, used for sanity-checking some URLs: http://a.co
MIN_URL_LENGTH=11

# These are parallel arrays giving the prefixes that can be used in place of normal external links to
# some wikis and other sites; based on https://wiki.oni2.net/Special:Interwiki
declare -a INTERWIKI_PREFIXES=(commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies wikiversity wikivoyage wikt wp)
declare -a INTERWIKI_DOMAINS=(commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org wikiversity.org wikivoyage.org wiktionary.org wikipedia.org)
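# Illustrative example of how these parallel arrays pair up: the prefix "wp" corresponds to
# wikipedia.org, so a link like https://en.wikipedia.org/wiki/Oni could be replaced with the
# interwiki markup [[wp:Oni]] (the exact suggestion is assembled in the main loop below).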

# Variables for keeping track of main loop progress and findings
LINK_NUM=0
EI_LINKS=0
IW_LINKS=0
OK_LINKS=0
RD_LINKS=0
NG_LINKS=0
SKIP_PARSE_FAIL=0
SKIP_UNK_PROT=0
SKIP_UNK_NS=0
SKIP_JS_PAGE=0
SKIP_BAD_URL=0
SKIP_NON_ASCII=0
SKIP_UNK_SUFFIX=0
SKIP_UNK_CODE=0
SKIP_EXPECT_NG=0
SKIP_EXPECT_RD=0
SKIP_EXPECT_EI=0
SKIP_EXPECT_IW=0
SKIP_HTTPS_UP=0
SKIP_SLASH_ADD=0
SKIP_YOUTU_BE=0
SKIP_ARCHIVES=0
FILE_LINKS=0
PAGE_LINKS=0
SKIPPED_HEADER_ROW=0
FINISHED_LIST="no"
START_RUN=0
END_RUN=0


### HELP OUTPUT ###
# A pseudo-man page. Here is the 80-character rule for the page text:
# 345678901234567890123456789012345678901234567890123456789012345678901234567890
function printHelp()
{
  cat << EOF

NAME
  Validate External Links

SYNOPSIS
  validate_external_links.sh --help
  validate_external_links.sh --links URL --output DIR [--exceptions URL]
     [--record-ok-links] [--only-200-ok] [--show-added-slashes]
     [--show-https-upgrades] [--show-yt-redirects] [--suggest-snapshots-ng]
     [--suggest-snapshots-ok] [--check-archive-links] [--take-screenshots FILE]
     [--timeout NUM] [--start-url NUM] [--end-url NUM] [--upload FILE]

DESCRIPTION
  This script parses a list of external links found in the OniGalore wiki
  (which is dumped by the Oni2.net server periodically in a particular
  format), validates them using the Unix tool 'curl', and produces a report
  of which links were "OK" (responded positively to an HTTP query), which
  were "RD" (responded with a 3xx redirect code), which could be "IW"
  (interwiki) links, which are "EI" (external internal) links that could be
  intrawiki links, and which were "NG" (no good; a negative response to the
  query). This report can then be automatically uploaded to the location of
  your choice. The script can also suggest Internet Archive snapshots for
  "NG" links, and take screenshots of "OK" links for visual verification by
  the reader that the page in question is the one intended to be displayed.

  You must pass this script the URL at which the list of links is found
  (--links) and the path where the directory of logs should be placed
  (--output). All other arguments are optional.

OPTIONS
  --help                  Show this page.
  --links URL             (required) URL from which to download the CSV
                          file with external links. Note that this URL can
                          be a local file if you supply a file:// path.
  --output DIR            (required) Unix path to directory in which Val
                          should place its reports.
  --exceptions URL        In order to remove links from the report which
                          Val finds an issue with but which you regard as
                          OK, list those desired exceptions on a wiki page.
                          See the sample file "exceptions.pdf" for the
                          required format of the page. Note that this URL
                          can point to a local file if you supply a path
                          beginning with "file://".
  --record-ok-links       Log a link in the report even if its response
                          code is "OK".
  --only-200-ok           Only treat response code 200 as "OK". Normally
                          several additional codes are treated as "OK" (see
                          the array OK_CODES in script) because they are
                          typically not an indicator of a bad link.
  --show-added-slashes    Report on redirects that simply add a '/' to the
                          end of the URL.
  --show-https-upgrades   Report on redirects that simply upgrade an
                          "http://" URL to an "https://" URL.
  --show-yt-redirects     Report on redirects that expand a youtu.be URL.
  --suggest-snapshots-ng  Query the Internet Archive for a possible
                          snapshot URL for each "NG" page.
  --suggest-snapshots-ok  Query the Internet Archive for a snapshot of each
                          "OK" page just to make sure it's available. Note
                          that this will add a tremendous amount of time to
                          the script execution because there is a rate
                          limit on the Archive API. Note that this option
                          does nothing unless you also use the
                          --record-ok-links argument.
  --check-archive-links   Check links that are already pointing to a page
                          on the Internet Archive or archive.is (AKA
                          archive.today). In theory these links should be
                          totally stable and not need validation.
  --take-screenshots FILE Call the Google Chrome binary at this path to
                          take screenshots of each "OK" page.
  --timeout NUM           Wait this many seconds for a site to respond. The
                          default is 10. Important note: Val will attempt
                          to reach each URL three times, so the time taken
                          to ping an unresponsive site will be three times
                          this setting.
  --start-url NUM         Start at this link in the links CSV file.
  --end-url NUM           Stop at this link in the links CSV file.
  --upload FILE           Upload report using the credentials and path
                          given in this local text file. See sftp_login.txt
                          for a template.

BUGS
  The script cannot properly parse any line in the external links file
  which contains a comma in the name of the wiki page containing a link.
  Commas in the link itself are not an issue.
EOF
}


### SETUP ###
# If first argument is a help request, or if nothing was passed in at all, print help page and quit
if [ "$#" -eq 0 ] || [ "$1" == "--help" ]; then
  printHelp | less
  exit 0
fi

# Parse arguments as long as there are more arguments to process
while (( "$#" )); do
  case "$1" in
    --links )                LINKS_URL="$2";                     shift 2;;
    --exceptions )           EXCEPT_URL="$2";                    shift 2;;
    --output )               OUTPUT_DIR="$2";                    shift 2;;
    --record-ok-links )      RECORD_OK_LINKS=1;                  shift;;
    --only-200-ok )          ONLY_200_OK=1;                      shift;;
    --show-added-slashes )   SHOW_SLASH=1;                       shift;;
    --show-https-upgrades )  SHOW_HTTPS=1;                       shift;;
    --show-yt-redirects )    SHOW_YT_RD=1;                       shift;;
    --suggest-snapshots-ng ) SUGGEST_SNAPSHOTS_NG=1;             shift;;
    --suggest-snapshots-ok ) SUGGEST_SNAPSHOTS_OK=1;             shift;;
    --check-archive-links )  CHECK_ARCHIVE_LINKS=1;              shift;;
    --take-screenshots )     TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
    --timeout )              TIMEOUT=$2;                         shift 2;;
    --start-url )            URL_START=$2;                       shift 2;;
    --end-url )              URL_LIMIT=$2;                       shift 2;;
    --upload )               UPLOAD_INFO=$2;                     shift 2;;
    * )                      echo "Invalid argument '$1' detected. Aborting."; exit 1;;
  esac
done
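
# Illustrative example of a full invocation (the URLs and paths here are hypothetical):
#   ./validate_external_links.sh --links "http://example.com/wiki_extlinks.csv" \
#     --output ~/val_reports --exceptions "http://example.com/Val_Exceptions" --timeout 20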

# If the required arguments were not supplied, print help page and quit
if [ -z "$LINKS_URL" ] || [ -z "$OUTPUT_DIR" ]; then
  echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
  exit 2
fi

# If user wants screenshots, make sure path to Chrome was passed in and is valid
if [ $TAKE_PAGE_SHOT -eq 1 ]; then
  if [ ! -f "$CHROME_PATH" ]; then
    echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
    exit 3
  fi
fi

# Check that UPLOAD_INFO exists, if this argument was supplied
if [ ! -z "$UPLOAD_INFO" ] && [ ! -f "$UPLOAD_INFO" ]; then
  echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
  exit 4
fi

# Check that OUTPUT_DIR is a directory
if [ ! -d "$OUTPUT_DIR" ]; then
  echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
  exit 5
fi

# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
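# Illustrative example of the two formats: SAFE_TIME might be "2023-05-14--09-30-55" while
# NICE_TIME would be "Sun, 14 May 2023  9:30:55 GMT" (the %k hour is space-padded).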
OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
SHOT_PATH="$OUTPUT_PATH/Screenshots"
LOG_NAME="ValExtLinks report"
LOG_NAME_TXT="$LOG_NAME.txt"
LOG_NAME_RTF="$LOG_NAME.rtf"
LOG_NAME_HTM="$LOG_NAME.htm"
LOG_PATH="$OUTPUT_PATH/$LOG_NAME"
LOG_PATH_TXT="$LOG_PATH.txt"
LOG_PATH_RTF="$LOG_PATH.rtf"
LOG_PATH_HTM="$LOG_PATH.htm"
mkdir "$OUTPUT_PATH"
if [ $TAKE_PAGE_SHOT -eq 1 ]; then
  mkdir "$SHOT_PATH"
fi

# Check that 'mkdir' succeeded
if [ ! -d "$OUTPUT_PATH" ]; then
  echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_DIR. Aborting."
  exit 6
fi

# Get date on the file at LINKS_URL and print to log
LINKS_DATE=$(curl --silent --head "$LINKS_URL" | grep "Last-Modified")
if [ -z "$LINKS_DATE" ]; then
  echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
  exit 7
fi
LINKS_DATE=${LINKS_DATE#Last-Modified: }


### UTILITY FUNCTIONS ###
# Writes a plain-text header to TXT log file
function printTXTheader()
{
  valPrint t "Validate External Links report"
  valPrint t "generated $NICE_TIME"
  valPrint t "from data of $LINKS_DATE"
  valPrint t "script by Iritscen (contact: $WIKI_ME)"
  valPrint t ""
}

# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified
function printRTFheader()
{
  valPrint r "{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww12600\viewh12100\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0

\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
generated $NICE_TIME\\
from data of $LINKS_DATE\\
script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$WIKI_ME\"}}{\fldrslt contact}})
\\
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
\cf0 "
}

# Closes the RTF markup of the RTF log file
function printRTFfooter()
{
  valPrint r "}"
}

# Writes the HTML header to HTML log file
function printHTMheader()
{
  valPrint h "<html>
<head>
<title>Validate External Links report</title>
</head>
<body>
<h2>Validate External Links report</h2>
<h3>generated $NICE_TIME<br />
from data of $LINKS_DATE<br />
script by Iritscen (<a href=\"$WIKI_ME\" target=\"_blank\">contact</a>)</h3>"
}

# Closes the HTML markup of the HTML log file
function printHTMfooter()
{
  valPrint h "</body>
</html>"
}

# The central logging function. The first parameter is a string composed of one or more characters that
# indicate which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
# 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 's' means "Print an
# extra newline at the end." 'w' means "Don't pass console output through 'fmt'" ("fmt" fits the output
# to an 80-column CLI but can break special formatting and the 'n' option).
function valPrint()
{
  if [[ "$1" == *c* ]]; then
    if [[ "$1" == *n* ]]; then
      echo -n "$2"
    elif [[ "$1" == *w* ]]; then
      echo "$2"
    elif [[ "$1" == *s* ]]; then
      echo -e "$2\n"
    else
      echo "$2" | fmt -w 80
    fi
  fi
  if [[ "$1" == *t* ]]; then
    if [[ "$1" == *n* ]]; then
      echo -n "$2" >> "$LOG_PATH_TXT"
    elif [[ "$1" == *s* ]]; then
      echo -e "$2\n" >> "$LOG_PATH_TXT"
    else
      echo "$2" >> "$LOG_PATH_TXT"
    fi
  fi
  if [[ "$1" == *r* ]]; then
    if [[ "$1" == *n* ]]; then
      echo "$2" >> "$LOG_PATH_RTF"
    elif [[ "$1" == *s* ]]; then
      echo "$2\line\line" >> "$LOG_PATH_RTF"
    else
      echo "$2\line" >> "$LOG_PATH_RTF"
    fi
  fi
  if [[ "$1" == *h* ]]; then
    if [[ "$1" == *s* ]]; then
      echo "$2<tr><td>&nbsp;</td></tr>" >> "$LOG_PATH_HTM"
    elif [[ "$1" == *n* ]]; then
      echo "$2" >> "$LOG_PATH_HTM"
    else
      echo "$2<br />" >> "$LOG_PATH_HTM"
    fi
  fi
}
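
# Illustrative usage of valPrint: the first argument mixes destinations and modifiers, so
#   valPrint ctrh "Found $LINK_COUNT links to process."
# writes the same line to the console and to all three logs, while "valPrint hn" would write
# raw HTML to the HTML log without appending the usual <br />.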

# Pluralize the string in parameter 1 if the number in parameter 2 is not 1
function pluralCheckNoun()
{
  if [ $2 -ne 1 ]; then
    if [[ $1 =~ x$ ]]; then
      echo $1es
    else
      echo $1s
    fi
  else
    echo $1
  fi
}
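
# Illustrative examples: "$(pluralCheckNoun link 2)" yields "links", "$(pluralCheckNoun suffix 2)"
# yields "suffixes" (the x-ending case), and "$(pluralCheckNoun link 1)" yields "link".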

# Output "is" if parameter 1 is 1, otherwise "are"
function pluralCheckIs()
{
  if [ $1 -ne 1 ]; then
    echo "are"
  else
    echo "is"
  fi
}

# Output "was" if parameter 1 is 1, otherwise "were"
function pluralCheckWas()
{
  if [ $1 -ne 1 ]; then
    echo "were"
  else
    echo "was"
  fi
}

# Output "a " if parameter 1 is 1, otherwise nothing
function pluralCheckA()
{
  if [ $1 -eq 1 ]; then
    echo "a "
  fi
}

# Output "an " if parameter 1 is 1, otherwise nothing
function pluralCheckAn()
{
  if [ $1 -eq 1 ]; then
    echo "an "
  fi
}

# Upload the reports using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
# reports being saved to disk have already been closed.
function uploadReport()
{
  valPrint c "Uploading reports..."

  SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
  SFTP_USER_NAME_MARKER="user:"
  SFTP_PASSWORD_MARKER="pw:"
  SFTP_PORT_MARKER="port:"
  SFTP_PATH_MARKER="path:"
  SFTP_USER_NAME=$(grep $SFTP_USER_NAME_MARKER "$UPLOAD_INFO")
  SFTP_USER_NAME=${SFTP_USER_NAME#$SFTP_USER_NAME_MARKER}
  SFTP_PASSWORD=$(grep $SFTP_PASSWORD_MARKER "$UPLOAD_INFO")
  SFTP_PASSWORD=${SFTP_PASSWORD#$SFTP_PASSWORD_MARKER}
  SFTP_PORT=$(grep $SFTP_PORT_MARKER "$UPLOAD_INFO")
  SFTP_PORT=${SFTP_PORT#$SFTP_PORT_MARKER}
  SFTP_PATH=$(grep $SFTP_PATH_MARKER "$UPLOAD_INFO")
  SFTP_PATH=${SFTP_PATH#$SFTP_PATH_MARKER}

  for SUFFIX in htm rtf txt; do
    expect "$SCRIPT_PATH" "$LOG_PATH.$SUFFIX" $SFTP_USER_NAME $SFTP_PASSWORD $SFTP_PORT $SFTP_PATH "$LOG_NAME.$SUFFIX"
    EXPECT_ERR=$? # save this immediately; the 'if' test below would otherwise overwrite $?

    if [ $EXPECT_ERR -ne 0 ]; then
      valPrint c "Error $EXPECT_ERR occurred when attempting to upload $LOG_NAME.$SUFFIX!"
    else
      valPrint c "Report in `echo $SUFFIX | tr '[:lower:]' '[:upper:]'` format was uploaded."
    fi
  done
}
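
# Based on the markers grepped for above, the login file named by --upload is expected to contain one
# marker per line; illustrative values only:
#   user:myusername
#   pw:mypassword
#   port:22
#   path:reports/val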

# Prints session summary when script is done
function wrapupAndExit()
{
  # Get off progress line on console, drop down a line from last link in log, and close HTML table
  valPrint ctr ""
  valPrint h "</table><br />"

  # If we didn't finish processing the last URL, then the iterator is one too high
  if [ $FINISHED_LIST != "yes" ]; then
    let LINK_NUM-=1
    if [ $FINISHED_LIST == "no" ]; then
      valPrint ctrh "The session was canceled by the user."
    fi
  fi

  # Generate string with elapsed time
  END_RUN=$(date +%s)
  ELAPSED=$(echo $(($END_RUN - $START_RUN)) | awk '{printf "%d min. %d sec. elapsed", int($1/60), int($1%60)}')

  # Do some math on results of session
  LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
  TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
  LINK_ERRORS=$((SKIP_PARSE_FAIL+SKIP_UNK_PROT+SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
  LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_RD+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
  LINK_PROBLEMS_TOTAL=$((NG_LINKS+RD_LINKS+EI_LINKS+IW_LINKS))
  LINK_PROBLEMS_NG=$((NG_LINKS-SKIP_EXPECT_NG))
  LINK_PROBLEMS_RD=$((RD_LINKS-SKIP_EXPECT_RD))
  LINK_PROBLEMS_EI=$((EI_LINKS-SKIP_EXPECT_EI))
  LINK_PROBLEMS_IW=$((IW_LINKS-SKIP_EXPECT_IW))
  LINK_PROBLEMS_NET=$((LINK_PROBLEMS_NG+LINK_PROBLEMS_RD+LINK_PROBLEMS_EI+LINK_PROBLEMS_IW))

  # Print something in the Links section if no link issues were printed
  if [ $LINK_PROBLEMS_NET -eq 0 ]; then
    valPrint h "<i>No link problems to report! See the <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for a list of links with issues that were not reported.</i>"
  fi
  if [ $LINK_PROBLEMS_TOTAL -eq 0 ]; then
    valPrint t "No link problems to report!"
    valPrint r "\i1 No link problems to report! \i0"
  fi

  ## SUMMARY OUTPUT ##
  valPrint ct "Summary ($ELAPSED):"
  valPrint r "\b1 Summary \b0 ($ELAPSED)"
  valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
  valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT) (there $(pluralCheckWas $FILE_LINKS) $FILE_LINKS file $(pluralCheckNoun link $FILE_LINKS) and $PAGE_LINKS page $(pluralCheckNoun link $PAGE_LINKS))."

  # Print processed link totals
  if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
  if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
  if [ $SKIP_ARCHIVES -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVES archive.org/archive.is $(pluralCheckNoun link $SKIP_ARCHIVES) $(pluralCheckWas $SKIP_ARCHIVES) not checked"; fi
  if [ $LINK_PROBLEMS_TOTAL -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_TOTAL processed $(pluralCheckNoun link $LINK_PROBLEMS_TOTAL) had $(pluralCheckAn $LINK_PROBLEMS_TOTAL)$(pluralCheckNoun issue $LINK_PROBLEMS_TOTAL)"; fi
  if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED link $(pluralCheckNoun issue $LINKS_EXCEPTED) from report)"; valPrint h "&nbsp;&nbsp;(excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
  if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi
  if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h "&nbsp;&nbsp;(counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi

  # Print errored link totals
  if [ $LINK_ERRORS -gt 0 ]; then
    valPrint c "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"
    valPrint h "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
    valPrint rt "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS):"
  fi
  if [ $SKIP_PARSE_FAIL -gt 0 ]; then valPrint ctrh "- $SKIP_PARSE_FAIL line-parsing $(pluralCheckNoun failure $SKIP_PARSE_FAIL)"; fi
  if [ $SKIP_UNK_PROT -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_PROT unknown $(pluralCheckNoun protocol $SKIP_UNK_PROT)"; fi
  if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
  if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
  if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
  if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
  if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
  if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi

  # Print excepted link totals
  if [ $LINKS_EXCEPTED -gt 0 ]; then
    valPrint c "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"
    valPrint h "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
    valPrint rt "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted:"
  fi
  if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
  if [ $SKIP_EXPECT_RD -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_RD/$RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
  if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
  if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi

  # Perform exceptions audit
  EXCEPTION_ISSUES=0
  valPrint ctrh "Exceptions list audit:"
  for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
    EXCEPT_LINE="${EXCEPT_ARRAY[$i]}"
    EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/\&amp;/\&/g') # copied from exception-matching code

    if [ ${EXCEPT_FOUND[$i]} -eq 0 ]; then
      EXCEPT_URL="${EXCEPT_LINE#*,}"
      EXCEPT_URL="${EXCEPT_URL%,*}"
      EXCEPT_PAGE="${EXCEPT_LINE##*,}"
      EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
      if [ "$EXCEPT_PAGE" == "*" ]; then
        valPrint tr "- The link '$EXCEPT_URL' did not return an error or is not on any page."
      else
        valPrint tr "- The link '$EXCEPT_URL' did not return an error or is not on page '$EXCEPT_PAGE'."
      fi
      let EXCEPTION_ISSUES+=1
    elif [ ${EXCEPT_USED[$i]} -eq 0 ]; then
      EXCEPT_URL="${EXCEPT_LINE#*,}"
      EXCEPT_URL="${EXCEPT_URL%,*}"
      EXCEPT_CODE=${EXCEPT_LINE%%,*}
      valPrint tr "- The link '$EXCEPT_URL' did not return error code $EXCEPT_CODE."
      let EXCEPTION_ISSUES+=1
    fi
  done
  if [ $EXCEPTION_ISSUES -eq 0 ]; then
    valPrint ctrh "- No issues found."
  else
    valPrint c "$EXCEPTION_ISSUES exception list $(pluralCheckNoun issue $EXCEPTION_ISSUES) detected (see RTF or TXT report for details)."
    valPrint h "$EXCEPTION_ISSUES exception list $(pluralCheckNoun issue $EXCEPTION_ISSUES) detected (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for details)."
  fi

  # Print checked link totals
  if [ $LINK_PROBLEMS_NET -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS_NET link $(pluralCheckNoun issue $LINK_PROBLEMS_NET):"; fi
  if [ $LINK_PROBLEMS_NG -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_NG NG $(pluralCheckNoun link $LINK_PROBLEMS_NG)"; fi
  if [ $LINK_PROBLEMS_RD -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_RD $(pluralCheckNoun redirection $LINK_PROBLEMS_RD)"; fi
  if [ $LINK_PROBLEMS_EI -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_EI $(pluralCheckNoun link $LINK_PROBLEMS_EI) that could be intrawiki"; fi
  if [ $LINK_PROBLEMS_IW -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_IW $(pluralCheckNoun link $LINK_PROBLEMS_IW) that could be interwiki"; fi

  # Close the log files' markup
  valPrint trh "ValExtLinks says goodbye."
  printRTFfooter
  printHTMfooter

  # Upload report if this was requested
  if [ ! -z "$UPLOAD_INFO" ] && [ $FINISHED_LIST != "no" ]; then
    uploadReport
  fi

  # Really quit now
  valPrint c "ValExtLinks says goodbye."
  exit 0
}
trap wrapupAndExit INT


### INITIALIZATION ###
# Print opening message to console and log files
valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
printTXTheader
printRTFheader
printHTMheader

## DATA SOURCING ##
valPrint t "Startup:"
valPrint r "\b1 Startup \b0"
valPrint hn "<h3>Startup</h3>"

# Attempt to download file at LINKS_URL, then check that it succeeded
valPrint cwtrhn "Downloading list of external links from $LINKS_URL..."
LINKS_FILE_NAME=$(echo "$LINKS_URL" | sed 's/.*\///')
LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
curl --silent -o "$LINKS_FILE" "$LINKS_URL"
if [ ! -f "$LINKS_FILE" ]; then
  echo -e "\nThe download of $LINKS_URL appears to have failed. Aborting."
  wrapupAndExit
else
  valPrint ctrh " success."
fi

# Attempt to download file at EXCEPT_URL, then check that it succeeded
if [ ! -z "$EXCEPT_URL" ]; then
  valPrint cwtrhn "Downloading list of reporting exceptions from $EXCEPT_URL..."
  EXCEPT_DATA=$(curl --silent "$EXCEPT_URL")
  if [ -z "$EXCEPT_DATA" ]; then
    echo -e "\nThe download of the exceptions data from '$EXCEPT_URL' appears to have failed. Aborting."
    wrapupAndExit
  else
    valPrint ctrh " success."
  fi
  EXCEPT_DATA=${EXCEPT_DATA%END LIST*}
  EXCEPT_DATA=${EXCEPT_DATA#*BEGIN LIST}
  EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"

  # Store on disk for debugging purposes
  echo "$EXCEPT_DATA" > "$EXCEPT_FILE"

  # Transfer to array for easy searching later
  declare -a EXCEPT_ARRAY=($(echo "$EXCEPT_DATA"))
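
  # Each array entry is one line found between "BEGIN LIST" and "END LIST" on the exceptions page.
  # Based on the parsing done in the main loop, a line has the form "code,URL,page", e.g.
  # (illustrative values only):
  #   404,http://example.com/old_page,Main_Page
  # where the page name may be "*" to match any page, and the URL may contain a '*' wildcard.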

  # Create parallel arrays for marking which exceptions get used later
  declare -a EXCEPT_USED=()
  declare -a EXCEPT_FOUND=()
  for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
    EXCEPT_USED+=(0)
    EXCEPT_FOUND+=(0)
  done
fi

# Pipe 'cat' to 'wc' because passing LINKS_FILE to 'wc' would print the file name after the line count
LINK_COUNT_STRING=$(cat "$LINKS_FILE" | wc -l)

# Number of URLs is number of lines minus one (first line is column header row for the CSV)
LINK_COUNT=$(echo "${LINK_COUNT_STRING}" | tr -d '[:space:]')
let LINK_COUNT-=1
valPrint ctrh "Found $LINK_COUNT links to process."
valPrint trh ""

## CONFIG OUTPUT ##
valPrint t "Config:"
valPrint r "\b1 Config \b0"
valPrint hn "<h3>Config</h3>"

valPrint ctrhn "Links to consider: "
if [ $URL_LIMIT -ne 0 ] && [ $URL_LIMIT -lt $LINK_COUNT ]; then
  valPrint ctrh "$((URL_LIMIT-URL_START+1)), from link $URL_START to $URL_LIMIT"
elif [ $URL_START -ne 1 ]; then
  valPrint ctrh "$((LINK_COUNT-URL_START+1)), from link $URL_START to $LINK_COUNT"
else
  valPrint ctrh "$LINK_COUNT"
fi

valPrint ctrh "Site query timeout: $TIMEOUT seconds"

valPrint ctrhn "Show OK links: "
if [ $RECORD_OK_LINKS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi

valPrint ctrhn "Treat these response codes as OK: "
if [ $ONLY_200_OK -eq 1 ]; then valPrint ctrh "200"; else valPrint ctrh "${OK_CODES[*]}"; fi

valPrint ctrhn "Take screenshots: "
if [ $TAKE_PAGE_SHOT -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi

valPrint ctrhn "Suggest archive.org snapshots for NG pages: "
if [ $SUGGEST_SNAPSHOTS_NG -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi

valPrint ctrhn "Suggest archive.org snapshots for OK pages: "
if [ $SUGGEST_SNAPSHOTS_OK -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi

valPrint ctrhn "Ignore slash-adding redirects: "
if [ $SHOW_SLASH -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi

valPrint ctrhn "Ignore HTTPS-upgrading redirects: "
if [ $SHOW_HTTPS -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi

valPrint ctrhn "Ignore youtu.be redirects: "
if [ $SHOW_YT_RD -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi

valPrint ctrhn "Check archive.org and archive.is links: "
if [ $CHECK_ARCHIVE_LINKS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi

valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
valPrint trh ""

## LEGEND OUTPUT ##
valPrint t "Legend:"
valPrint r "\b1 Legend \b0"
valPrint hn "<h3>Legend</h3>"
valPrint t "(For guidance in fixing these links, see $WIKI_MAIN. The exceptions list is at $EXCEPT_URL.)"
valPrint r "(For guidance in fixing these links, see {\field{\*\fldinst{HYPERLINK \"$WIKI_MAIN\"}}{\fldrslt here}}. The exceptions list is {\field{\*\fldinst{HYPERLINK \"$EXCEPT_URL\"}}{\fldrslt here}}.)"
valPrint h "(For guidance in fixing these links, see <a href=\"$WIKI_MAIN\" target=\"_blank\">here</a>. The exceptions list is <a href=\"$EXCEPT_URL\" target=\"_blank\">here</a>.)"
valPrint trh "OK = URL seems to be working"
valPrint trh "NG = URL no longer seems to work"
valPrint trh "RD = URL is redirecting to this new URL"
valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup"
valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup"
valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $WIKI_HTTP)"
valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$WIKI_HTTP\"}}{\fldrslt here}} for code reference)"
valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$WIKI_HTTP\" target=\"_blank\">here</a> for code reference)"
valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $WIKI_CURL)"
valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$WIKI_CURL\"}}{\fldrslt here}} for code reference)"
valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$WIKI_CURL\" target=\"_blank\">here</a> for code reference)"
valPrint trh "IA suggests = Last available snapshot returned by the Internet Archive"
valPrint trh "Try browsing = The Archive failed to return a snapshot URL, so check for a snapshot manually using this link"
valPrint trh ""


### MAIN LOOP ###
valPrint t "Links:"
valPrint r "\b1 Links \b0"
valPrint hn "<h3>Links</h3>"
START_RUN=$(date +%s)
# Process each line of the .csv in LINKS_FILE
for LINE in `cat "$LINKS_FILE"`; do
  START_LINK=$(date +%s)
  let LINK_NUM+=1

  # First line is the column header row for the CSV, so let's verify that the format hasn't changed
  if [ $SKIPPED_HEADER_ROW -eq 0 ]; then
    if [ "$LINE" == "namespace,title,target" ]; then
      SKIPPED_HEADER_ROW=1
      LINK_NUM=0 # this line is not a link, so reset the link counter
      valPrint hn "<table>"
      continue
    else
      valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
      wrapupAndExit
    fi
  fi

  # Skip this link if we are not at URL_START yet
  if [ $LINK_NUM -lt $URL_START ]; then
    continue
  fi

  # Stop if we are at the limit declared for testing purposes
  if [ $URL_LIMIT -gt 0 ] && [ $LINK_NUM -gt $URL_LIMIT ]; then
    FINISHED_LIST="limit"
    wrapupAndExit
  fi

  # Parse line into namespace ID number, containing wiki page, and external link URL
  NS_ID=${LINE%%,*}
  PAGE_NAME=${LINE#$NS_ID,}
  PAGE_NAME=${PAGE_NAME%%,*} # a comma in the page name will break this
  URL=${LINE#$NS_ID,$PAGE_NAME,} # commas can be in this
  if [ -z "$NS_ID" ] || [ -z "$PAGE_NAME" ] || [ -z "$URL" ]; then
    valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace, wiki page or link URL could not be read."
    let SKIP_PARSE_FAIL+=1
    continue
  fi
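
  # Illustrative example of a CSV line and how it parses (the values are hypothetical):
  #   "0,Some_Page,http://example.com/info" yields NS_ID=0, PAGE_NAME=Some_Page,
  #   and URL=http://example.com/info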

  # Skip any link that isn't "http://" or "https://"
  if [[ ! $URL =~ ^https?:// ]]; then
    valPrint trs "Skipping line $LINK_NUM ('$LINE') because the protocol isn't 'http://' or 'https://'."
    let SKIP_UNK_PROT+=1
    continue
  fi

  # Print progress to screen
  if [ $LINK_NUM -gt 1 ]; then
    printf "\e[1A\n" # erase previous progress message so that new one appears in its place
  fi
  valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."

  # Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
  NS_NAME=""
  a=0
  while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
    if [ $NS_ID == "NULL" ]; then
      break
    elif [ $NS_ID -eq ${NS_IDS[$a]} ]; then
      NS_NAME="${NS_NAMES[$a]}"
      break
    fi
    let a+=1
  done
  if [ "$NS_NAME" == "" ]; then
    if [ $NS_ID == "NULL" ]; then
      valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace (and probably the page too) is 'NULL'. Probably the link is no longer in existence on the wiki."
    else
      valPrint trs "Skipping line $LINK_NUM ('$LINE') because I could not find a name for namespace ID $NS_ID."
    fi
    let SKIP_UNK_NS+=1
    let PAGE_LINKS+=1
    continue
  fi

  # Build longer wiki page URLs from namespace and page names
  FULL_PAGE_PATH=https://$WIKI_PATH/$NS_NAME:$PAGE_NAME
  LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
  # Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it
  # explicitly breaks the link
  if [ $NS_ID -eq 0 ]; then
    FULL_PAGE_PATH=https://$WIKI_PATH/$PAGE_NAME
    LOCAL_PAGE_PATH=$PAGE_NAME
  fi
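
  # Illustrative example: namespace ID 102 maps to "OBD" in the arrays above, so a page titled "Foo"
  # yields FULL_PAGE_PATH=https://wiki.oni2.net/OBD:Foo and LOCAL_PAGE_PATH=OBD:Foo, while a
  # main-namespace (ID 0) page yields https://wiki.oni2.net/Foo.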

  # We don't want to consider wiki pages ending in .js, as the MW parser cannot reliably isolate URLs
  # in JavaScript code, so it returns erroneous links
  PAGE_NAME_SUFFIX=$(echo $PAGE_NAME | sed 's/.*\.//')
  if [ $PAGE_NAME_SUFFIX == "js" ]; then
    valPrint trs "Skipping URL '${LINE#$NS_ID,$PAGE_NAME,}' on line $LINK_NUM because it was found on JavaScript page '$LOCAL_PAGE_PATH'."
    let SKIP_JS_PAGE+=1
    let PAGE_LINKS+=1
    continue
  fi

  # Scan for illegal characters
  if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
    valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because it contains characters illegal in a URL."
    let SKIP_BAD_URL+=1
    let PAGE_LINKS+=1
    continue
  fi

  # If we're skipping archive links, see if this is one
  if [ $CHECK_ARCHIVE_LINKS -eq 0 ] && [[ ( $URL == *web.archive.org* || $URL == *archive.is* ) ]]; then
    valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to check archive links."
    let SKIP_ARCHIVES+=1
    let PAGE_LINKS+=1
    continue
  fi

  # Now we need to know if the URL is for a file or a web page. First step is to determine if the
  # URL ends in a suffix
  HAS_SUFFIX=0

  # If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
  CLEAN_URL=${URL%%\?*}

  # If the URL ends in something like "#section_15", strip everything from the '#' onward
  CLEAN_URL=${CLEAN_URL%%\#*}

  # 'sed' cannot handle Unicode in my Bash shell, so skip non-ASCII URL and make reader check it
  if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
    valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I cannot handle non-ASCII characters."
    let SKIP_NON_ASCII+=1
    let PAGE_LINKS+=1
    continue
  fi

  # Isolate the characters after the last period and after the last slash
  POST_DOT=$(echo "$CLEAN_URL" | sed 's/.*\.//')
  POST_SLASH=$(echo "$CLEAN_URL" | sed 's/.*\///')

  # If the last period comes after the last slash, then the URL ends in a suffix
  POST_DOT_LENGTH=$(echo | awk -v input=$POST_DOT '{print length(input)}')
  POST_SLASH_LENGTH=$(echo | awk -v input=$POST_SLASH '{print length(input)}')
  if [ $POST_DOT_LENGTH -lt $POST_SLASH_LENGTH ]; then
    HAS_SUFFIX=1
  else
    HAS_SUFFIX=0
  fi
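
  # Illustrative example: for "http://example.com/docs/readme.txt", POST_DOT is "txt" (length 3) and
  # POST_SLASH is "readme.txt" (length 10), so the URL ends in a suffix; for
  # "http://example.com/docs.v2/readme" the last period precedes the last slash, so it does not.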

  # Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
  # known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES
  IS_FILE=-1
  if [ $HAS_SUFFIX -eq 0 ]; then
    IS_FILE=0
  else
    # Turn off case sensitivity while we compare suffixes
    shopt -s nocasematch

    # Special case: URLs ending in something like "productID.304297400" are pages, not files, so if
    # the URL's suffix is all numbers, we are looking at the end of a web page URL
    if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
      IS_FILE=0
    fi

    # Special case: URLs ending in parentheses, e.g. "ms537113(v=vs.85)", are pages
    if [[ $POST_DOT =~ ^.*[\(\)]$ ]]; then
      IS_FILE=0
    fi

    # Special case: URLs containing a '%', e.g. "10.1007%2FBF00329055", are pages
    if [[ $POST_DOT == *%* ]]; then
      IS_FILE=0
    fi

    # If we did not identify this URL as a web page above, we need to compare the suffix against known
    # file extensions
    if [ $IS_FILE -eq -1 ]; then
      for EXTENSION in "${HTTP_FILES[@]}"; do
        if [[ $EXTENSION == $POST_DOT ]]; then # double brackets respect the nocasematch setting
          IS_FILE=1
          break
        fi
      done
    fi

    # If we did not identify this URL as a file above, we need to compare the suffix against known
    # pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
    # needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
    if [ $IS_FILE -eq -1 ]; then
      for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
        if [[ $PAGE_OR_TLD == $POST_DOT ]]; then
          IS_FILE=0
          break
        fi
      done
    fi

    # Turn case sensitivity back on in Bash
    shopt -u nocasematch
  fi

  # If this suffix escaped identification as either a file, page or TLD, inform the reader
  STR_TYPE=""
  if [ $IS_FILE -eq -1 ]; then
    valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I encountered the unknown URL suffix '$POST_DOT'. Please add this suffix to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
    let SKIP_UNK_SUFFIX+=1
    continue
  elif [ $IS_FILE -eq 1 ]; then
    STR_TYPE="file"
    let FILE_LINKS+=1
  else
    STR_TYPE="page"
    let PAGE_LINKS+=1
  fi

  # Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
  # issue with sites that require HTTPS
  CURL_CODE=$(curl -o /dev/null --silent --insecure --compressed --head --user-agent "$AGENT" --max-time $TIMEOUT --retry 2 --write-out '%{http_code}\n' $URL)
  CURL_ERR=$(echo $?)
  CURL_RESULT=$CURL_CODE

  # Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
  if [ $CURL_CODE == "000" ]; then
    CURL_RESULT="$CURL_RESULT-$CURL_ERR"
  fi
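
  # Illustrative example: if a site never responds, CURL_CODE is "000" and, assuming curl's standard
  # exit codes, a timeout would set CURL_ERR to 28, making CURL_RESULT "000-28".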

  # Begin to determine our status code for this URL (EI, IW, OK, RD, or NG)
  STATUS="??"
  NEW_URL=""
  INTERWIKI_INDEX=-1

  # First make sure that this isn't an "external internal" link to our own wiki that can be replaced
  # by "[[page_name]]". If it uses a special access URL beginning with "/w/", let it pass, as it
  # probably cannot be replaced by "[[ ]]" markup
  if [[ $URL == *$WIKI_PATH* ]] && [[ $URL != *$WIKI_PATH/w/* ]]; then
    STATUS="EI"
    let EI_LINKS+=1
  fi

  # If it's not, check if this is a link to a domain that we have an interwiki prefix for (also make
  # sure that it's not an archive.org link to a page from an interwiki domain)
  if [ $STATUS == "??" ] && [[ $URL != *web.archive.org* ]]; then
    for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
      if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]] && [[ $URL != *${INTERWIKI_DOMAINS[$i]}/w/* ]]; then
        STATUS="IW"
        let IW_LINKS+=1
        INTERWIKI_INDEX=$i
        break
      fi
    done
  fi

  # If we didn't match an interwiki domain, see if the status code is in our "OK" codes list
  if [ $STATUS == "??" ]; then
    for CODE in "${OK_CODES[@]}"; do
      if [ $ONLY_200_OK -eq 1 ] && [ $CODE -ne 200 ]; then
        continue
      fi

      if [[ $CODE == $CURL_CODE ]]; then
        STATUS="OK"
        let OK_LINKS+=1

        # If this is a YouTube link, we have to look at the actual page source to know if the video
        # is good or not; override the link's info if it's actually NG
        if [[ $URL == *www.youtube.com* ]]; then
          PAGE_TEXT=$(curl --silent --insecure --user-agent "$AGENT" --max-time $TIMEOUT $URL)
          CURL_ERR=$(echo $?)
          if [ "$CURL_ERR" != "0" ]; then
            STATUS="NG"
            CURL_RESULT="000-$CURL_ERR"
            let OK_LINKS-=1
            let NG_LINKS+=1
          elif [[ "$PAGE_TEXT" =~ "simpleText\":\"Video unavailable" ]]; then
            STATUS="NG"
            CURL_CODE="404"
            CURL_RESULT=$CURL_CODE
            let OK_LINKS-=1
            let NG_LINKS+=1
          fi
        fi

        # If this is a OneDrive link, we have to look at the actual page source to know if the file
        # is really still at this URL; override the link's info if it's actually NG or RD
        if [[ $URL == *skydrive.live.com* ]]; then
          PAGE_TEXT=$(curl --silent --insecure --user-agent "$AGENT" --max-time $TIMEOUT $URL)
          CURL_ERR=$(echo $?)
          if [ "$CURL_ERR" != "0" ]; then
            STATUS="NG"
            CURL_RESULT="000-$CURL_ERR"
            let OK_LINKS-=1
            let NG_LINKS+=1
          elif [[ "$PAGE_TEXT" =~ "<h1>Sorry, something went wrong" ]]; then
            STATUS="NG"
            CURL_CODE="404"
            CURL_RESULT=$CURL_CODE
            let OK_LINKS-=1
            let NG_LINKS+=1
          elif [[ "$PAGE_TEXT" =~ "<h2>Object moved to" ]]; then
            STATUS="??" # have to send the code through the next block to treat the redirect properly
            CURL_CODE="301"
            CURL_RESULT=$CURL_CODE
            let OK_LINKS-=1
          fi
        fi

        break
      fi
    done
  fi

  # If we didn't get a match with the "OK" codes, check it against the "RD" codes
  if [ $STATUS == "??" ]; then
    for CODE in "${RD_CODES[@]}"; do
      if [[ $CODE == $CURL_CODE ]]; then
        # Get URL header again in order to retrieve the URL we are being redirected to, but if this
        # is a OneDrive link, we already have the new URL in $PAGE_TEXT
        if [[ $URL == *skydrive.live.com* ]]; then
          NEW_URL=${PAGE_TEXT##*href=\"}
          NEW_URL=${NEW_URL%\">here*}
        else
          NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time $TIMEOUT --write-out '%{redirect_url}\n' $URL)
        fi

        # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
        # those changes out if the user didn't ask for them
        URL_HTTP=$(echo $URL | sed -E 's/^https:/http:/')
        NEW_URL_HTTP=$(echo $NEW_URL | sed -E 's/^https:/http:/')

        # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
        NEW_URL_LENGTH=$(echo | awk -v input=$NEW_URL_HTTP '{print length(input)}')
        if [ $NEW_URL_LENGTH -lt $MIN_URL_LENGTH ]; then
          NEW_URL_HTTP="[new URL not retrieved]"
        fi

        # Remove slash at end of new URL, if present, so we can filter out the redirects that
        # merely add an ending slash if the user didn't ask for them
        NEW_URL_NO_SLASH=$(echo $NEW_URL_HTTP | sed -E 's:/$::')

        # Detect if this is a youtu.be link simply being expanded by YouTube to the full
        # youtube.com address
        YOUTU_BE=0
        if [[ $URL_HTTP == http://youtu.be* ]] && [[ $NEW_URL_HTTP == http://www.youtube.com* ]]; then
          YOUTU_BE=1
        fi

        # If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
        # wants those to be reported)
        if [ $SHOW_HTTPS -eq 0 ] && [ "$URL_HTTP" == "$NEW_URL_HTTP" ]; then
          valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show http->https upgrades, and I was redirected to '$NEW_URL'."
          STATUS="OK"
          let OK_LINKS+=1
          let SKIP_HTTPS_UP+=1
        # If the URLs match besides an added ending slash, then the link is OK (unless user wants
        # those to be reported)
        elif [ $SHOW_SLASH -eq 0 ] && [ "$URL_HTTP" == "$NEW_URL_NO_SLASH" ]; then
          valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show added trailing slashes, and I was redirected to '$NEW_URL'."
          STATUS="OK"
          let OK_LINKS+=1
          let SKIP_SLASH_ADD+=1
        elif [ $YOUTU_BE -eq 1 ]; then
          # We have to look at the actual page source to know if a YouTube video is good or not
          PAGE_TEXT=$(curl --silent --insecure --user-agent "$AGENT" --max-time $TIMEOUT $NEW_URL | grep "\"simpleText\":\"Video unavailable\"")
          if [ ! -z "$PAGE_TEXT" ]; then
            STATUS="NG"
            let NG_LINKS+=1
          else
            if [ $SHOW_YT_RD -eq 0 ]; then
              valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'."
              STATUS="OK"
              let OK_LINKS+=1
              let SKIP_YOUTU_BE+=1
            else
              STATUS="RD"
              let RD_LINKS+=1
            fi
          fi
        else
          STATUS="RD"
          let RD_LINKS+=1
        fi
        break
      fi
    done
  fi

  # If we didn't get a match with the "RD" codes, check it against the "NG" codes
  if [ $STATUS == "??" ]; then
    for CODE in "${NG_CODES[@]}"; do
      if [[ $CODE == $CURL_CODE ]]; then
        STATUS="NG"
        let NG_LINKS+=1
        break
      fi
    done
    # Also check it against the "OK" codes besides 200 if the --only-200-ok argument was received
    if [ $ONLY_200_OK -eq 1 ]; then
      for CODE in "${OK_CODES[@]}"; do
        if [ $CODE -eq 200 ]; then
          continue
        fi
        if [[ $CODE == $CURL_CODE ]]; then
          STATUS="NG"
          let NG_LINKS+=1
          break
        fi
      done
    fi
  fi

  # If we didn't match a known status code, advise the reader
  if [ $STATUS == "??" ]; then
    valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I encountered the unknown response code $CURL_CODE."
    let SKIP_UNK_CODE+=1
    continue
  fi

  # Check problem links against exceptions list before proceeding
  FOUND_EXCEPT=0
  if [ $STATUS != "OK" ] && [ ! -z "$EXCEPT_URL" ]; then
    # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
    EXPECT_CODE="$CURL_RESULT"
    if [ $STATUS == "EI" ]; then
      EXPECT_CODE="EI"
    elif [ $STATUS == "IW" ]; then
      EXPECT_CODE="IW"
    fi

    # Look for link in exceptions list and make sure the listed result code and wiki page also match
    for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
    {
      EXCEPT_LINE="${EXCEPT_ARRAY[$i]}"

      # Undo any HTML-encoding from the wiki page; for now we just worry about the ampersand, as most
      # other HTML-encoded characters are not found in URLs
      EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/\&amp;/\&/g')

      # Check for URL match
      EXCEPT_URL="${EXCEPT_LINE#*,}"
      EXCEPT_URL="${EXCEPT_URL%,*}"
      if [[ "$EXCEPT_URL" =~ \* ]]; then # if this exception URL contains the '*' wildcard, use pattern-matching with it
        if [[ ! "$URL" == $EXCEPT_URL ]]; then
          continue
        fi
      else
        if [ "$EXCEPT_URL" != "$URL" ]; then # otherwise just use a straight string comparison
          continue
        fi
      fi

      # Check for page name match
      EXCEPT_PAGE="${EXCEPT_LINE##*,}"
      EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
      if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == "$LOCAL_PAGE_PATH" ]; then
        let EXCEPT_FOUND[$i]+=1
        valPrint trs "Found exception '$URL' on page '$LOCAL_PAGE_PATH'."

        # Check for result code match
        EXCEPT_CODE=${EXCEPT_LINE%%,*}
        if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
          FOUND_EXCEPT=1
          let EXCEPT_USED[$i]+=1
          valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because its expected result, '$EXPECT_CODE', is in the exceptions list."

          if [ $STATUS == "EI" ]; then
            let SKIP_EXPECT_EI+=1
          elif [ $STATUS == "IW" ]; then
            let SKIP_EXPECT_IW+=1
          elif [ $STATUS == "RD" ]; then
            let SKIP_EXPECT_RD+=1
          else
            let SKIP_EXPECT_NG+=1
          fi

          break
        fi
      fi
    } done
  fi
  if [ $FOUND_EXCEPT -eq 1 ]; then
    continue
  fi

  # If appropriate, record this link to the log, with clickable URLs when possible
  if [ $STATUS != "OK" ] || [ $RECORD_OK_LINKS -eq 1 ]; then
    # Prepare 'curl' result in parentheses to print after status code, unless this is an "IW" or "EI"
    # link, in which case showing the status code doesn't make sense. Adjust spacing after string to
    # ensure TXT and RTF reports have aligned columns of results.
    CURL_STR_H=" ($CURL_RESULT)"
    CURL_STR_T="$CURL_STR_H"
    CURL_STR_R="$CURL_STR_H "
    if [ $STATUS == "IW" ] || [ $STATUS == "EI" ]; then
      CURL_STR_H=""
      CURL_STR_T=" "
      CURL_STR_R=" "
    fi

    # Record link and its wiki page in TXT, RTF, and HTML markup
    valPrint t "${STATUS}${CURL_STR_T} $STR_TYPE $URL"
    valPrint t " linked from $FULL_PAGE_PATH"
    valPrint r "${STATUS}${CURL_STR_R}${RTF_TABS}$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
    valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
    valPrint hn "<tr><td style=\"white-space:nowrap\">${STATUS}${CURL_STR_H}</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
    valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"

    # Place vertical space here since we won't be printing anything more about this link
    if [ $STATUS == "OK" ] && [ $SUGGEST_SNAPSHOTS_OK -eq 0 ]; then valPrint tr ""; valPrint hs ""; fi

    # Record redirect URL if one was given by a 3xx response page
    if [ $STATUS == "RD" ]; then
      valPrint ts " Server suggests $NEW_URL"
      valPrint rs " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
      valPrint hs "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
    fi

    # Notify reader if we can use an intrawiki link for this URL
    if [ $STATUS == "EI" ]; then
      INTRA_PAGE=${URL#*://*/}
      # If INTRA_PAGE starts with Category:, File: or Image:, prefix it with a ':' to make it a wikilink
      if [[ $INTRA_PAGE == Category:* ]] || [[ $INTRA_PAGE == File:* ]] || [[ $INTRA_PAGE == Image:* ]]; then
        INTRA_PAGE=:${INTRA_PAGE}
      fi
      valPrint ts " Just use [[$INTRA_PAGE]]"
      valPrint rs " Just use [[$INTRA_PAGE]]"
      valPrint hs "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
    fi

    # Notify reader if we can use an interwiki prefix for this URL
    if [ $STATUS == "IW" ]; then
      INTER_PAGE=$(echo "$URL" | sed 's/.*\///')
      valPrint ts " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
      valPrint rs " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
      valPrint hs "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
    fi

    # Query Internet Archive for latest "OK" snapshot for "NG" page
    if [[ ( $STATUS == "NG" && $SUGGEST_SNAPSHOTS_NG -eq 1 ) || ( $STATUS == "OK" && $SUGGEST_SNAPSHOTS_OK -eq 1 ) ]]; then

      # We need to watch out for the rate limit or we'll get locked out; look at how much time has
      # elapsed and then wait the remainder between that and how long of a wait we think is needed
      # to avoid the dreaded "Too Many Requests" response. 5 seconds is just a guess.
      CUR_TIME=$(date +%s)
      WAIT_REMAINDER=$((5 - $CUR_TIME + $START_LINK))
      if [ $WAIT_REMAINDER -gt 0 ]; then
        valPrint t "Waiting $WAIT_REMAINDER second(s) to conform to Archive.org rate limit."
        sleep $WAIT_REMAINDER
      fi

      # Issue query to the API
      ARCHIVE_QUERY=$(curl --silent --max-time $TIMEOUT "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")
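
      # A successful response resembles the following JSON (illustrative and abbreviated):
      #   {"archived_snapshots": {"closest": {"available": true, "url":
      #   "http://web.archive.org/web/20130919044612/http://example.com/", ...}}}
      # which is why the parsing below looks for the "closest" tag and then its "url" property.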

      # Notify reader if we hit the rate limit and just keep going
      if [[ "$ARCHIVE_QUERY" == *"Too Many Requests"* ]]; then
        valPrint t " IA has rate-limited us!"
        valPrint r " IA has rate-limited us!"
        valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td>(hit the API rate limit!)</td></tr>"
      # If a "closest" snapshot was received, inform reader
      elif [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
        # In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
        ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')

        # ...isolate "url" property in the response that follows the "closest" tag
        SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
        SNAPSHOT_URL=${SNAPSHOT_URL#*\"url\": \"} # everything after '"url": "'
        SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'

        # Remove the port 80 part that IA often adds to the URL, as it's superfluous
        SNAPSHOT_URL=$(echo $SNAPSHOT_URL | sed 's/:80//')

        # Inform the reader of the snapshot URL
        valPrint ts " IA suggests $SNAPSHOT_URL"
        valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
        valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
      else # Otherwise give a generic Wayback Machine link for this URL, which might work
        valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
        valPrint rs " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
        valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
      fi
    fi
  fi

  # If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
  if [ $IS_FILE -eq 0 ] && [ $TAKE_PAGE_SHOT -eq 1 ] && [ $STATUS == "OK" ]; then
    # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
    SHOT_NAME=$(echo "$URL" | sed 's/https*\:\/\///' | sed 'y/:\//__/')
    SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"
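
    # Illustrative example: "https://example.com/a/b" becomes SHOT_NAME "example.com_a_b", so the
    # screenshot is saved as "example.com_a_b.png" in the Screenshots folder.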

    # Don't take screenshot if we already encountered this page and screenshotted it
    if [ ! -f "$SHOT_FILE" ]; then
      "$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 $URL > /dev/null 2>&1
      if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
        mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
      else
        valPrint trhs "Screenshot of URL $URL seems to have failed!"
      fi
    else
      valPrint trhs "Skipping screenshot of URL '$URL' because file '$SHOT_FILE' already exists."
    fi
  fi
done
FINISHED_LIST="yes"
wrapupAndExit