Context Navigation

source: Validate External Links/validate_external_links.sh@ 1142

Last change on this file since 1142 was 1142, checked in by iritscen, 4 years ago
Val now tries each URL three times. This has proven more effective than giving Val a long timeout and trying each URL once. The summary report has been refined a bit; the most notable change is that the final number and breakdown of link issues leaves out the excepted links. Also stopped Val from getting confused by HTML-encoded '&'s in the exceptions list.
File size: 49.7 KB

Rev	Line
[1064]	1	#!/bin/bash
	2
	3	# Validate External Links by Iritscen
[1141]	4	#
	5	# Validates a list of external links in CSV format. The resulting logs are produced in three formats:
	6	# - TXT (for easy diffing with an earlier log)
	7	# - RTF (for reading as a local file with clickable links)
	8	# - HTML (for uploading as a web page).
[1142]	9	# Call script with "--help" argument for documentation. Also see Read Me.rtf for critical notes.
[1141]	10	#
[1064]	11	# Recommended rule:
[1118]	12	# \|----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----\|
[1141]	13	#
	14	# Table of contents (sections of script in order of appearance, not execution):
	15	# • Globals
	16	# • Help Output
	17	# • Setup
	18	# • Utility Functions
	19	# • Summary Output
	20	# • Initialization
	21	# • Data Sourcing
	22	# • Config Output
	23	# • Legend Output
	24	# • Main Loop
[1064]	25
	26	# Set separator token to newline
	27	IFS="
	28	"
	29
	30	### GLOBALS ###
	31	# Settings -- these will be changed from their defaults by the arguments passed in to the script
[1135]	32	LINKS_URL="" # use 'curl' to download file with links from this location (can be file://)
[1136]	33	EXCEPT_URL="" # 'curl' will access this wiki page with a list of exceptions for NG results
[1135]	34	OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
	35	RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES
	36	SHOW_SLASH=0 # record response code to the log when a slash is added to the end of a URL
	37	SHOW_HTTPS=0 # record response code to the log when "http" is upgraded to "https"
	38	SHOW_YT_RD=0 # record response code to the log when a youtu.be URL is expanded to the full URL
	39	SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
	40	SKIP_ARCHIVE_LINKS=0 # don't check URLs under the archive.org domain
	41	TAKE_PAGE_SHOT=0 # take a screenshot of each OK page
[1141]	42	TIMEOUT=10 # time to wait for a response when querying a site
[1135]	43	CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature
[1142]	44	URL_START=1 # start at this URL in LINKS_FILE
[1135]	45	URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE
	46	UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report
[1064]	47
	48	# Fixed strings -- see the occurrences of these variables to learn their purpose
[1142]	49	AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 OPR/70.0.3728.154"
[1064]	50	ARCHIVE_API="http://archive.org/wayback/available"
	51	ARCHIVE_GENERIC="https://web.archive.org/web/*"
	52	ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
	53	CHROME_SCREENSHOT="screenshot.png"
[1136]	54	EXCEPT_FILE_NAME="exceptions.txt"
[1064]	55	EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
[1141]	56	WIKI_CURL="https://wiki.oni2.net/Validate_External_Links/Curl_codes"
	57	WIKI_HTTP="https://wiki.oni2.net/Validate_External_Links/HTTP_codes"
	58	WIKI_MAIN="https://wiki.oni2.net/Validate_External_Links"
	59	WIKI_ME="http://iritscen.oni2.net"
[1064]	60	THIS_DIR=$(cd $(dirname $0); pwd)
	61	WORKING_DIR=$(pwd)
	62	WIKI_PATH="wiki.oni2.net"
	63
	64	# These are parallel arrays of the IDs and names of OniGalore's current namespaces
	65	declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
	66	declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")
	67
	68	# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
[1070]	69	# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
[1127]	70	declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png py rar tga TRMA txt vbs wav wmv xaf xml zip)
[1137]	71	declare -a HTTP_TLDS_AND_PAGES=(action ars asp aspx cfm cgi com css de full htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x)
[1064]	72
[1067]	73	# These arrays tells us which HTTP response codes are OK (good), which are RD (redirections), and which
	74	# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
	75	# if you add a new code.
[1127]	76	declare -a OK_CODES=(200 401 405 406 418 501)
[1067]	77	declare -a RD_CODES=(301 302 303 307 308)
[1127]	78	declare -a NG_CODES=(000 400 403 404 410 500 502 503 530)
[1064]	79
	80	# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
	81	# transcluded text, and if the transclusion fails, then the braces show up in the URL
	82	ILLEGAL_CHARS="{ }"
	83
[1070]	84	# The shortest URL possible, used for sanity-checking some URLs: http://a.co
	85	MIN_URL_LENGTH=11
	86
[1064]	87	# These are parallel arrays giving the prefixes that can be used in place of normal external links to
	88	# some wikis and other sites
[1070]	89	declare -a INTERWIKI_PREFIXES=(commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies wikiversity wikivoyage wikt wp)
	90	declare -a INTERWIKI_DOMAINS=(commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org wikiversity.org wikivoyage.org wiktionary.org wikipedia.org)
[1064]	91
	92	# Variables for keeping track of main loop progress and findings
	93	LINK_NUM=0
[1070]	94	EI_LINKS=0
	95	IW_LINKS=0
[1064]	96	OK_LINKS=0
[1067]	97	RD_LINKS=0
[1064]	98	NG_LINKS=0
	99	SKIP_UNK_NS=0
	100	SKIP_JS_PAGE=0
	101	SKIP_BAD_URL=0
	102	SKIP_NON_ASCII=0
	103	SKIP_UNK_SUFFIX=0
	104	SKIP_UNK_CODE=0
[1070]	105	SKIP_EXPECT_NG=0
[1142]	106	SKIP_EXPECT_RD=0
[1070]	107	SKIP_EXPECT_EI=0
	108	SKIP_EXPECT_IW=0
[1122]	109	SKIP_HTTPS_UP=0
	110	SKIP_SLASH_ADD=0
[1127]	111	SKIP_YOUTU_BE=0
[1135]	112	SKIP_ARCHIVE_ORG=0
[1064]	113	FILE_LINKS=0
	114	PAGE_LINKS=0
	115	SKIPPED_HEADER_ROW=0
	116	FINISHED_LIST="no"
[1118]	117	START_RUN=0
	118	END_RUN=0
[1064]	119
	120
### HELP OUTPUT ###
# A pseudo-man page. Here is the 80-character rule for the page text:
# 234567890123456789012345678901234567890123456789012345678901234567890123456789
#
# Prints the help text to stdout; takes no arguments. The heredoc delimiter is quoted so that the
# page is emitted literally (it contains no variables), and the section bodies are indented
# man-page style so that the SYNOPSIS continuation lines and OPTIONS columns line up.
function printHelp()
{
  cat << 'EOF'

NAME
       Validate External Links

SYNOPSIS
       validate_external_links.sh --help
       validate_external_links.sh --links URL --output DIR [--exceptions URL]
          [--record-ok-links] [--show-added-slashes] [--show-https-upgrades]
          [--show-yt-redirects] [--suggest-snapshots] [--skip-archive-links]
          [--take-screenshots FILE] [--timeout NUM] [--start-url NUM]
          [--end-url NUM] [--upload FILE]

DESCRIPTION
       This script parses a list of external links found in the OniGalore wiki
       (which is dumped by the Oni2.net domain periodically in a particular
       format), validates them using the Unix tool 'curl', and produces a report
       of which links were "OK" (responded positively to an HTTP query), which
       were "RD" (responded with a 3xx redirect code), which could be "IW"
       (interwiki) links, which are "EI" (external internal) links and could be
       intrawiki links, and which were "NG" (no good; a negative response to the
       query). This report can then be automatically uploaded to the location of
       your choice. The script can also suggest Internet Archive snapshots for
       "NG" links, and take screenshots of "OK" links for visual verification by
       the reader that the page in question is the one intended to be displayed.

       You must pass this script the URL at which the list of links is found
       (--links) and the path where the directory of logs should be outputted
       (--output). All other arguments are optional.

OPTIONS
       --help                  Show this page.
       --links URL             (required) URL from which to download the CSV
                               file with external links. Note that this URL can
                               be a local file if you supply a file:// path.
       --output DIR            (required) Unix path to directory in which Val
                               should place its reports.
       --exceptions URL        In order to remove links from the report which
                               Val finds an issue with but which you regard as
                               OK, list those desired exceptions on a wiki page.
                               See the sample file "exceptions.pdf" for the
                               required format of the page. Note that this URL
                               can point to a local file if you supply a path
                               beginning with "file://".
       --record-ok-links       Log a link in the report even if its response
                               code is "OK".
       --show-added-slashes    Report on redirects that simply add a '/' to the
                               end of the URL.
       --show-https-upgrades   Report on redirects that simply upgrade a
                               "http://" URL to a "https://" URL.
       --show-yt-redirects     Report on redirects that expand a youtu.be URL.
       --suggest-snapshots     Query the Internet Archive for a possible
                               snapshot URL for each "NG" page.
       --skip-archive-links    Don't check links that are already pointing to
                               a page on the Internet Archive.
       --take-screenshots FILE Call the Google Chrome binary at this path to
                               take screenshots of each "OK" page.
       --timeout NUM           Wait this many seconds for a site to respond. The
                               default is 10. Important note: Val will attempt
                               to reach each URL three times, so the time taken
                               to ping an unresponsive site will be three times
                               this setting.
       --start-url NUM         Start at this link in the links CSV file.
       --end-url NUM           Stop at this link in the links CSV file.
       --upload FILE           Upload report using the credentials and path
                               given in this local text file. See sftp_login.txt
                               for template.

BUGS
       The script cannot properly parse any line in the external links file
       which contains a comma in the name of the wiki page containing a link.
       Commas in the link itself are not an issue.
EOF
}
	200
	201
	202	### SETUP ###
	203	# If first argument is a help request, or if nothing was passed in at all, print help page and quit
	204	if [ "$#" -eq 0 ] \|\| [ "$1" == "--help" ]; then
	205	printHelp \| less
	206	exit 0
	207	fi
	208
	209	# Parse arguments as long as there are more arguments to process
	210	while (( "$#" )); do
	211	case "$1" in
[1127]	212	--links ) LINKS_URL="$2"; shift 2;;
	213	--exceptions ) EXCEPT_URL="$2"; shift 2;;
	214	--output ) OUTPUT_DIR="$2"; shift 2;;
	215	--record-ok-links ) RECORD_OK_LINKS=1; shift;;
	216	--show-added-slashes ) SHOW_SLASH=1; shift;;
	217	--show-https-upgrades ) SHOW_HTTPS=1; shift;;
	218	--show-yt-redirects ) SHOW_YT_RD=1; shift;;
	219	--suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;;
[1135]	220	--skip-archive-links ) SKIP_ARCHIVE_LINKS=1; shift;;
[1127]	221	--take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
[1141]	222	--timeout ) TIMEOUT=$2; shift 2;;
[1127]	223	--start-url ) URL_START=$2; shift 2;;
	224	--end-url ) URL_LIMIT=$2; shift 2;;
	225	--upload ) UPLOAD_INFO=$2; shift 2;;
	226	* ) echo "Invalid argument $1 detected. Aborting."; exit 1;;
[1064]	227	esac
	228	done
	229
	230	# If the required arguments were not supplied, print help page and quit
	231	if [ -z $LINKS_URL ] \|\| [ -z $OUTPUT_DIR ]; then
[1070]	232	echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
[1064]	233	exit 2
	234	fi
	235
[1070]	236	# If user wants screenshots, make sure path to Chrome was passed in and is valid
	237	if [ $TAKE_PAGE_SHOT -eq 1 ]; then
	238	if [ ! -f "$CHROME_PATH" ]; then
	239	echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
	240	exit 3
	241	fi
	242	fi
	243
[1064]	244	# Check that UPLOAD_INFO exists, if this argument was supplied
	245	if [ ! -z $UPLOAD_INFO ] && [ ! -f "$UPLOAD_INFO" ]; then
	246	echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
[1070]	247	exit 4
[1064]	248	fi
	249
	250	# Check that OUTPUT_DIR is a directory
	251	if [ ! -d "$OUTPUT_DIR" ]; then
	252	echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
[1070]	253	exit 5
[1064]	254	fi
	255
	256	# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
	257	SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
	258	NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
	259	OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
	260	OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
	261	SHOT_PATH="$OUTPUT_PATH/Screenshots"
	262	LOG_NAME="ValExtLinks report"
	263	LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt"
	264	LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf"
	265	LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm"
	266	mkdir "$OUTPUT_PATH"
	267	if [ $TAKE_PAGE_SHOT -eq 1 ]; then
	268	mkdir "$SHOT_PATH"
	269	fi
	270
	271	# Check that 'mkdir' succeeded
	272	if [ ! -d "$OUTPUT_PATH" ]; then
	273	echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_PATH. Aborting."
[1070]	274	exit 6
[1064]	275	fi
	276
	277	# Get date on the file at LINKS_URL and print to log
	278	LINKS_DATE=$(curl --silent --head $LINKS_URL \| grep "Last-Modified")
	279	if [ -z "$LINKS_DATE" ]; then
	280	echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
[1070]	281	exit 7
[1064]	282	fi
	283	LINKS_DATE=${LINKS_DATE#Last-Modified: }
	284
	285
	286	### UTILITY FUNCTIONS ###
	287	# Writes a plain-text header to TXT log file
	288	function printTXTheader()
	289	{
	290	valPrint t "Validate External Links report"
	291	valPrint t "generated $NICE_TIME"
	292	valPrint t "from data of $LINKS_DATE"
[1141]	293	valPrint t "script by Iritscen (contact: $WIKI_ME)"
[1064]	294	valPrint t ""
	295	}
	296
	297	# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified
	298	function printRTFheader()
	299	{
	300	valPrint r "{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
	301	{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
	302	{\colortbl;\red255\green255\blue255;}
	303	{\*\expandedcolortbl;;}
	304	\margl1440\margr1440\vieww12600\viewh12100\viewkind0
	305	\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0
	306
	307	\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
	308	generated $NICE_TIME\\
	309	from data of $LINKS_DATE\\
[1141]	310	script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$WIKI_ME\"}}{\fldrslt contact}})
[1064]	311	\\
	312	\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
	313	\cf0 "
	314	}
	315
	316	# Closes the RTF markup of the RTF log file
	317	function printRTFfooter()
	318	{
	319	valPrint r "}"
	320	}
	321
	322	# Writes the HTML header to HTML log file
	323	function printHTMheader()
	324	{
	325	valPrint h "<html>
	326	<head>
	327	<title>Validate External Links report</title>
	328	</head>
	329	<body>
	330	<h2>Validate External Links report</h2>
	331	<h3>generated $NICE_TIME<br />
	332	from data of $LINKS_DATE<br />
[1141]	333	script by Iritscen (<a href=\"$WIKI_ME\" target=\"_blank\">contact</a>)</h3>"
[1064]	334	}
	335
	336	# Closes the HTML markup of the HTML log file
	337	function printHTMfooter()
	338	{
	339	valPrint h "</body>
	340	</html>"
	341	}
	342
	343	# The central logging function. The first parameter is a string composed of one or more characters that
[1070]	344	# indicate which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
[1141]	345	# 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 's' means "Print an
	346	# extra newline at the end." 'w' means "Don't pass console output through 'fmt'" ("fmt" fits the output
[1119]	347	# to an 80-column CLI but can break special formatting and the 'n' option).
[1064]	348	function valPrint()
	349	{
	350	if [[ "$1" == c ]]; then
	351	if [[ "$1" == n ]]; then
	352	echo -n "$2"
	353	elif [[ "$1" == w ]]; then
	354	echo "$2"
[1119]	355	elif [[ "$1" == s ]]; then
	356	echo -e "$2\n"
[1064]	357	else
	358	echo "$2" \| fmt -w 80
	359	fi
	360	fi
	361	if [[ "$1" == t ]]; then
	362	if [[ "$1" == n ]]; then
	363	echo -n "$2" >> "$LOG_TXT"
[1119]	364	elif [[ "$1" == s ]]; then
	365	echo -e "$2\n" >> "$LOG_TXT"
[1064]	366	else
	367	echo "$2" >> "$LOG_TXT"
	368	fi
	369	fi
	370	if [[ "$1" == r ]]; then
	371	if [[ "$1" == n ]]; then
	372	echo "$2" >> "$LOG_RTF"
[1119]	373	elif [[ "$1" == s ]]; then
	374	echo "$2\line\line" >> "$LOG_RTF"
[1064]	375	else
[1119]	376	echo "$2\line" >> "$LOG_RTF"
[1064]	377	fi
	378	fi
	379	if [[ "$1" == h ]]; then
[1119]	380	if [[ "$1" == s ]]; then
	381	echo "$2<tr><td> </td></tr>" >> "$LOG_HTM"
	382	elif [[ "$1" == n ]]; then
[1064]	383	echo "$2" >> "$LOG_HTM"
	384	else
	385	echo "$2<br />" >> "$LOG_HTM"
	386	fi
	387	fi
	388	}
	389
	390	# Pluralize the string in parameter 1 if the number in parameter 2 is not 1
	391	function pluralCheckNoun()
	392	{
	393	if [ $2 -ne 1 ]; then
	394	if [[ $1 =~ x$ ]]; then
	395	echo $1es
	396	else
	397	echo $1s
	398	fi
	399	else
	400	echo $1
	401	fi
	402	}
	403
# Output "is" if parameter 1 is 1, otherwise "are".
#   $1 - count (quoted in the test so an odd value cannot break the test's syntax)
function pluralCheckIs()
{
  if [ "$1" -ne 1 ]; then
    echo "are"
  else
    echo "is"
  fi
}
	413
# Output "was" if parameter 1 is 1, otherwise "were".
#   $1 - count (quoted in the test so an odd value cannot break the test's syntax)
function pluralCheckWas()
{
  if [ "$1" -ne 1 ]; then
    echo "were"
  else
    echo "was"
  fi
}
	423
# Output "a " (with trailing space) if parameter 1 is 1, otherwise nothing.
#   $1 - count (quoted in the test so an odd value cannot break the test's syntax)
function pluralCheckA()
{
  if [ "$1" -eq 1 ]; then
    echo "a "
  fi
}
	431
	432	# Output "an " if parameter 1 is 1, otherwise nothing
	433	function pluralCheckAn()
	434	{
	435	if [ $1 -eq 1 ]; then
	436	echo "an "
	437	fi
	438	}
	439
# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
# reports being saved to disk have already been closed.
#
# Reads four "marker:value" lines (user:, pw:, port:, path:) from the file at $UPLOAD_INFO and hands
# them, with the report path and name, to the 'expect' script stored beside this script.
# Each marker is matched only at the start of a line and only the first match is used, which is the
# same position the "${VAR#marker}" stripping below requires.
# NOTE(review): the password is passed to 'expect' as a command-line argument, so it is visible in
# the process list while the upload runs -- confirm that this is acceptable.
function uploadReport()
{
  valPrint c "Uploading HTML report..."

  SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
  SFTP_USER_NAME_MARKER="user:"
  SFTP_PASSWORD_MARKER="pw:"
  SFTP_PORT_MARKER="port:"
  SFTP_PATH_MARKER="path:"
  SFTP_USER_NAME=$(grep -m 1 -- "^$SFTP_USER_NAME_MARKER" "$UPLOAD_INFO")
  SFTP_USER_NAME=${SFTP_USER_NAME#"$SFTP_USER_NAME_MARKER"}
  SFTP_PASSWORD=$(grep -m 1 -- "^$SFTP_PASSWORD_MARKER" "$UPLOAD_INFO")
  SFTP_PASSWORD=${SFTP_PASSWORD#"$SFTP_PASSWORD_MARKER"}
  SFTP_PORT=$(grep -m 1 -- "^$SFTP_PORT_MARKER" "$UPLOAD_INFO")
  SFTP_PORT=${SFTP_PORT#"$SFTP_PORT_MARKER"}
  SFTP_PATH=$(grep -m 1 -- "^$SFTP_PATH_MARKER" "$UPLOAD_INFO")
  SFTP_PATH=${SFTP_PATH#"$SFTP_PATH_MARKER"}

  # Every argument is quoted so that an empty field still occupies its position in the list
  expect "$SCRIPT_PATH" "$LOG_HTM" "$SFTP_USER_NAME" "$SFTP_PASSWORD" "$SFTP_PORT" "$SFTP_PATH" "$LOG_NAME.htm"

  valPrint c "Report was uploaded, unless an error message appears above."
}
	464
	465	# Prints session summary when script is done
	466	function wrapupAndExit()
	467	{
	468	# Get off progress line on console, drop down a line from last link in log, and close HTML table
	469	valPrint ctr ""
	470	valPrint h "</table><br />"
	471
	472	# If we didn't finish processing the last URL, then the iterator is one too high
	473	if [ $FINISHED_LIST != "yes" ]; then
	474	let LINK_NUM-=1
	475	if [ $FINISHED_LIST == "no" ]; then
	476	valPrint ctrh "The session was canceled by the user."
	477	fi
	478	fi
	479
[1118]	480	# Generate string with elapsed time
	481	END_RUN=$(date +%s)
	482	ELAPSED=$(echo $(($END_RUN - $START_RUN)) \| awk '{printf "%d min. %d sec. elapsed", int($1/60), int($1%60)}')
	483
[1122]	484	# Do some math on results of session
[1064]	485	LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
[1142]	486	TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
[1122]	487	LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
[1142]	488	LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_RD+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
	489	LINK_PROBLEMS_TOTAL=$((NG_LINKS+RD_LINKS+EI_LINKS+IW_LINKS))
	490	LINK_PROBLEMS_NG=$((NG_LINKS-SKIP_EXPECT_NG))
	491	LINK_PROBLEMS_RD=$((RD_LINKS-SKIP_EXPECT_RD))
	492	LINK_PROBLEMS_EI=$((EI_LINKS-SKIP_EXPECT_EI))
	493	LINK_PROBLEMS_IW=$((IW_LINKS-SKIP_EXPECT_IW))
	494	LINK_PROBLEMS_NET=$((LINK_PROBLEMS_NG+LINK_PROBLEMS_RD+LINK_PROBLEMS_EI+LINK_PROBLEMS_IW))
[1122]	495
[1141]	496	## SUMMARY OUTPUT ##
[1118]	497	valPrint ct "Summary ($ELAPSED):"
	498	valPrint r "\b1 Summary \b0 ($ELAPSED)"
	499	valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
[1123]	500	valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT) (there $(pluralCheckWas $FILE_LINKS) $FILE_LINKS file $(pluralCheckNoun link $FILE_LINKS) and $PAGE_LINKS page $(pluralCheckNoun link $PAGE_LINKS))."
[1122]	501
	502	# Print processed link totals
	503	if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
	504	if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
[1135]	505	if [ $SKIP_ARCHIVE_ORG -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVE_ORG Archive.org $(pluralCheckNoun link $SKIP_ARCHIVE_ORG) were not checked"; fi
[1142]	506	if [ $LINK_PROBLEMS_TOTAL -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_TOTAL processed $(pluralCheckNoun link $LINK_PROBLEMS_TOTAL) had $(pluralCheckAn $LINK_PROBLEMS_TOTAL)$(pluralCheckNoun issue $LINK_PROBLEMS_TOTAL)"; fi
	507	if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED link $(pluralCheckNoun issue $LINKS_EXCEPTED) from report)"; valPrint h "  (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
[1123]	508	if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi
	509	if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h "  (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi
[1122]	510
	511	# Print errored link totals
	512	if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"; fi
	513	if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
[1070]	514	if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
[1064]	515	if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
	516	if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
	517	if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
	518	if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
[1122]	519
[1142]	520	# Print excepted link totals
	521	if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi
	522	if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
	523	if [ $SKIP_EXPECT_RD -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_RD/$RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
	524	if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
	525	if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi
	526
[1122]	527	# Print checked link totals
[1142]	528	if [ $LINK_PROBLEMS_NET -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS_NET link $(pluralCheckNoun issue $LINK_PROBLEMS_NET):"; fi
	529	if [ $LINK_PROBLEMS_NG -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_NG NG $(pluralCheckNoun link $LINK_PROBLEMS_NG)"; fi
	530	if [ $LINK_PROBLEMS_RD -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_RD $(pluralCheckNoun redirection $LINK_PROBLEMS_RD)"; fi
	531	if [ $LINK_PROBLEMS_EI -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_EI $(pluralCheckNoun link $LINK_PROBLEMS_EI) that could be intrawiki"; fi
	532	if [ $LINK_PROBLEMS_IW -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_IW $(pluralCheckNoun link $LINK_PROBLEMS_IW) that could be interwiki"; fi
[1122]	533
	534	# Close the log files' markup
[1070]	535	valPrint trh "ValExtLinks says goodbye."
[1064]	536	printRTFfooter
	537	printHTMfooter
	538
	539	# Upload report if this was requested
	540	if [ ! -z $UPLOAD_INFO ] && [ $FINISHED_LIST != "no" ]; then
	541	uploadReport
	542	fi
	543
	544	# Really quit now
	545	valPrint c "ValExtLinks says goodbye."
	546	exit 0
	547	}
	548	trap wrapupAndExit INT
	549
	550
	551	### INITIALIZATION ###
	552	# Print opening message to console and log files
	553	valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
	554	printTXTheader
	555	printRTFheader
	556	printHTMheader
	557
[1141]	558	## DATA SOURCING ##
	559	valPrint t "Startup:"
	560	valPrint r "\b1 Startup \b0"
	561	valPrint hn "<h3>Startup</h3>"
	562
[1064]	563	# Attempt to download file at LINKS_URL, then check that it succeeded
[1141]	564	valPrint cwtrhn "Downloading list of external links from $LINKS_URL..."
[1064]	565	LINKS_FILE_NAME=$(echo "$LINKS_URL" \| sed 's/.*\///')
	566	LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
	567	curl --silent -o "$LINKS_FILE" $LINKS_URL
	568	if [ ! -f "$LINKS_FILE" ]; then
[1141]	569	echo -e "\nThe download of $LINKS_URL appears to have failed. Aborting."
[1064]	570	wrapupAndExit
[1141]	571	else
	572	valPrint ctrh " success."
[1064]	573	fi
	574
	575	# Attempt to download file at EXCEPT_URL, then check that it succeeded
	576	if [ ! -z $EXCEPT_URL ]; then
[1141]	577	valPrint cwtrhn "Downloading list of reporting exceptions from $EXCEPT_URL..."
[1136]	578	EXCEPT_DATA=$(curl --silent $EXCEPT_URL)
	579	if [ -z "$EXCEPT_DATA" ]; then
[1141]	580	echo -e "\nThe download of the exceptions data from '$EXCEPT_URL' appears to have failed. Aborting."
[1064]	581	wrapupAndExit
[1141]	582	else
	583	valPrint ctrh " success."
[1064]	584	fi
[1136]	585	EXCEPT_DATA=${EXCEPT_DATA%END LIST*}
	586	EXCEPT_DATA=${EXCEPT_DATA#*BEGIN LIST}
	587	EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
	588
	589	# Store on disk for debugging purposes
	590	echo "$EXCEPT_DATA" > "$EXCEPT_FILE"
	591
	592	# Transfer to array for easy searching later
	593	declare -a EXCEPT_ARRAY=($(echo "$EXCEPT_DATA"))
[1064]	594	fi
	595
	596	# Pipe 'cat' to 'wc' because passing LINKS_FILE to 'wc' would print the file name after the line count
	597	LINK_COUNT_STRING=$(cat "$LINKS_FILE" \| wc -l)
	598
	599	# Number of URLs is number of lines minus one (first line is column header row for the CSV)
	600	LINK_COUNT=$(echo "${LINK_COUNT_STRING}" \| tr -d '[:space:]')
	601	let LINK_COUNT-=1
[1141]	602	valPrint ctrh "Found $LINK_COUNT links to process."
	603	valPrint trh ""
[1064]	604
## CONFIG OUTPUT ##
# Finishes a "Label: " line started with 'valPrint ctrhn' by printing one of two answers.
#   $1 - setting value; $2 - answer printed when the setting is 1; $3 - answer printed otherwise
function printConfigAnswer()
{
  if [ $1 -eq 1 ]; then
    valPrint ctrh "$2"
  else
    valPrint ctrh "$3"
  fi
}

valPrint t "Config:"
valPrint r "\b1 Config \b0"
valPrint hn "<h3>Config</h3>"

# Range of links that will be looked at, which depends on --start-url and --end-url
valPrint ctrhn "Links to consider: "
if [[ $URL_LIMIT -ne 0 && $URL_LIMIT -lt $LINK_COUNT ]]; then
  valPrint ctrh "$((URL_LIMIT-URL_START+1)), from link $URL_START to $URL_LIMIT"
elif [[ $URL_START -ne 1 ]]; then
  valPrint ctrh "$((LINK_COUNT-URL_START+1)), from link $URL_START to $LINK_COUNT"
else
  valPrint ctrh "$LINK_COUNT"
fi

valPrint ctrh "Site query timeout: $TIMEOUT seconds"

# Settings where a value of 1 means "Yes"
valPrint ctrhn "Show OK links: "
printConfigAnswer $RECORD_OK_LINKS "Yes" "No"

valPrint ctrhn "Take screenshots: "
printConfigAnswer $TAKE_PAGE_SHOT "Yes" "No"

valPrint ctrhn "Suggest archive.org snapshots: "
printConfigAnswer $SUGGEST_SNAPSHOTS "Yes" "No"

# Settings whose label is phrased the other way around, so a value of 1 means "No"
valPrint ctrhn "Ignore slash-adding redirects: "
printConfigAnswer $SHOW_SLASH "No" "Yes"

valPrint ctrhn "Ignore HTTPS-upgrading redirects: "
printConfigAnswer $SHOW_HTTPS "No" "Yes"

valPrint ctrhn "Ignore youtu.be redirects: "
printConfigAnswer $SHOW_YT_RD "No" "Yes"

valPrint ctrhn "Check archive.org links: "
printConfigAnswer $SKIP_ARCHIVE_LINKS "No" "Yes"

valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
valPrint trh ""
	645
## LEGEND OUTPUT ##
# Prints one legend entry that refers the reader to a web page. The TXT log gets a ready-made plain
# line; the RTF and HTML logs get the same sentence with the word "here" turned into a hyperlink.
#   $1 - complete line for the TXT log
#   $2 - text before the link (RTF/HTML)
#   $3 - URL to link to
#   $4 - text after the link (RTF/HTML)
function printLegendLinked()
{
  valPrint t "${1}"
  valPrint r "${2}{\field{\*\fldinst{HYPERLINK \"${3}\"}}{\fldrslt here}}${4}"
  valPrint h "${2}<a href=\"${3}\" target=\"_blank\">here</a>${4}"
}

valPrint t "Legend:"
valPrint r "\b1 Legend \b0"
valPrint hn "<h3>Legend</h3>"
printLegendLinked "(For guidance in fixing these links, see $WIKI_MAIN.)" \
  "(For guidance in fixing these links, see " "$WIKI_MAIN" ".)"
valPrint trh "OK = URL seems to be working"
valPrint trh "NG = URL no longer seems to work"
valPrint trh "RD = URL is redirecting to this new URL"
valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup"
valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup"
printLegendLinked "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $WIKI_HTTP)" \
  "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see " "$WIKI_HTTP" " for code reference)"
printLegendLinked "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $WIKI_CURL)" \
  "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see " "$WIKI_CURL" " for code reference)"
valPrint trh "IA suggests = Last available snapshot returned by the Internet Archive"
valPrint trh "Try browsing = The Archive failed to return a snapshot URL, so check for a snapshot manually using this link"
valPrint trh ""
	667
	668
	669	### MAIN LOOP ###
# Print the "Links" heading that precedes the per-link results: the 't' call carries plain text,
# the 'r' call carries RTF markup and the 'hn' call carries HTML markup
[1120]	670	valPrint t "Links:"
	671	valPrint r "\b1 Links \b0"
	672	valPrint hn "<h3>Links</h3>"
# Remember when the run began, as seconds since the epoch ('date +%s')
[1118]	673	START_RUN=$(date +%s)
[1064]	674	# Process each line of the .csv in LINKS_FILE
	675	for LINE in `cat "$LINKS_FILE"`; do
	676	let LINK_NUM+=1
	677
	678	# First line is the column header row for the CSV, so let's verify that the format hasn't changed
	679	if [ $SKIPPED_HEADER_ROW -eq 0 ]; then
	680	if [ $LINE == "namespace,title,target" ]; then
	681	SKIPPED_HEADER_ROW=1
	682	LINK_NUM=0 # this line is it's not a link, so reset the link counter
	683	valPrint hn "<table>"
	684	continue
	685	else
	686	valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
	687	wrapupAndExit
	688	fi
	689	fi
	690
	691	# Skip this link if we are not at URL_START yet
	692	if [ $LINK_NUM -lt $URL_START ]; then
	693	continue
	694	fi
	695
	696	# Stop if we are at the limit declared for testing purposes
	697	if [ $URL_LIMIT -gt 0 ] && [ $LINK_NUM -gt $URL_LIMIT ]; then
	698	FINISHED_LIST="limit"
	699	wrapupAndExit
	700	fi
	701
	702	# Print progress to screen
	703	if [ $LINK_NUM -gt 1 ]; then
	704	printf "\e[1A\n" # erase previous progress message so that new one appears in its place
	705	fi
	706	valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
	707
	708	# The number of the namespace is the element before the first comma on the line
	709	NS_ID=${LINE%%,*}
	710
	711	# Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
	712	NS_NAME=""
	713	a=0
[1069]	714	while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
[1118]	715	if [ $NS_ID == "NULL" ]; then
	716	break
	717	elif [ $NS_ID -eq ${NS_IDS[$a]} ]; then
[1064]	718	NS_NAME="${NS_NAMES[$a]}"
	719	break
	720	fi
	721	let a+=1
	722	done
[1118]	723	if [ "$NS_NAME" == "" ]; then
	724	if [ $NS_ID == "NULL" ]; then
[1123]	725	valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace (and probably the page too) is 'NULL'. Probably the link is no longer in existence on the wiki."
[1118]	726	else
[1123]	727	valPrint trs "Skipping line $LINK_NUM ('$LINE') because I could not find a name for namespace ID $NS_ID."
[1118]	728	fi
[1064]	729	let SKIP_UNK_NS+=1
	730	continue
	731	fi
	732
	733	# The name of the page is everything between the namespace ID and the next comma on the line (commas
	734	# in page names will break this)
	735	PAGE_NAME=${LINE#$NS_ID,}
	736	PAGE_NAME=${PAGE_NAME%%,*}
	737
[1135]	738	# We don't want to consider wiki pages ending in .js, as the MW parser cannot reliably isolate URLS
	739	# in JavaScript code, so it returns erroneous links
[1064]	740	PAGE_NAME_SUFFIX=$(echo $PAGE_NAME \| sed 's/.*\.//')
	741	if [ $PAGE_NAME_SUFFIX == "js" ]; then
[1123]	742	valPrint trs "Skipping URL '${LINE#$NS_ID,$PAGE_NAME,}' on line $LINK_NUM because it was found on JavaScript page '$PAGE_NAME'."
[1064]	743	let SKIP_JS_PAGE+=1
	744	continue
	745	fi
	746
# Build the wiki page's paths from the namespace and page names. Namespace "Main:" (ID 0)
# cannot be a part of the path; it's an implicit namespace, and naming it explicitly breaks
# the link.
if [ "$NS_ID" -eq 0 ]; then
  LOCAL_PAGE_PATH=$PAGE_NAME
else
  LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
fi
FULL_PAGE_PATH=https://$WIKI_PATH/$LOCAL_PAGE_PATH

# The URL being linked to is everything after the previous two fields (this allows commas to
# be in the URLs, but a comma in the previous field, the page name, will break this). The
# prefix is quoted so that characters such as '?', '*' or '[' in a page name are removed as
# literal text instead of being treated as glob syntax.
URL=${LINE#"$NS_ID,$PAGE_NAME,"}
	760
	761	# Scan for illegal characters
	762	if [[ $URL == [$ILLEGAL_CHARS] ]]; then
[1123]	763	valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because it contains characters illegal in a URL."
[1064]	764	let SKIP_BAD_URL+=1
	765	continue
	766	fi
	767
[1135]	768	# If we're skipping Archive.org links, check if this is one
	769	if [ $SKIP_ARCHIVE_LINKS -eq 1 ] && [[ $URL == web.archive.org ]]; then
	770	valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have been asked not to check Wayback Machine links."
	771	let SKIP_ARCHIVE_ORG+=1
	772	continue
	773	fi
	774
# Now we need to know if the URL is for a file or a web page. First step is to determine if
# the URL ends in a suffix
HAS_SUFFIX=0

# If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
CLEAN_URL=${URL%%\?*}

# If the URL ends in something like "#section_15", strip everything from the '#' onward
CLEAN_URL=${CLEAN_URL%%\#*}

# URLs containing any non-ASCII character are skipped so the user can check them by hand
# (historically because 'sed' could not handle Unicode here). The skip is kept so that results
# and the skip counter stay the same; the wildcards make the pattern test the whole URL.
if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
  valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I cannot handle non-ASCII characters."
  let SKIP_NON_ASCII+=1
  continue
fi

# Isolate the characters after the last period and after the last slash
POST_DOT=${CLEAN_URL##*.}
POST_SLASH=${CLEAN_URL##*/}

# If the last period comes after the last slash, then the text following the period is shorter
# than the text following the slash, and the URL ends in a suffix
POST_DOT_LENGTH=${#POST_DOT}
POST_SLASH_LENGTH=${#POST_SLASH}
if [ $POST_DOT_LENGTH -lt $POST_SLASH_LENGTH ]; then
  HAS_SUFFIX=1
else
  HAS_SUFFIX=0
fi
	804
	805	# Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
	806	# known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES
	807	IS_FILE=-1
	808	if [ $HAS_SUFFIX -eq 0 ]; then
	809	IS_FILE=0
	810	else
	811	# Turn off case sensitivity while we compare suffixes
	812	shopt -s nocasematch
	813
[1127]	814	# Special case: URLs ending in something like "productID.304297400" are pages, not files, so if
[1064]	815	# the URL's suffix is all numbers, we are looking at the end of a web page URL
	816	if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
	817	IS_FILE=0
	818	fi
[1127]	819
	820	# Special case: URLs ending in a parens, e.g. "ms537113(v=vs.85)", are pages
	821	if [[ $POST_DOT =~ ^.*[\(\)]$ ]]; then
	822	IS_FILE=0
	823	fi
	824
	825	# Special case: URLs containing a '%', e.g. "10.1007%2FBF00329055", are pages
	826	if [[ $POST_DOT == % ]]; then
	827	IS_FILE=0
	828	fi
[1064]	829
	830	# If we did not identify this URL as a web page above, we need to compare the suffix against known
	831	# file extensions
	832	if [ $IS_FILE -eq -1 ]; then
	833	for EXTENSION in "${HTTP_FILES[@]}"; do
	834	if [[ $EXTENSION == $POST_DOT ]]; then # double brackets respect the nocasematch setting
	835	IS_FILE=1
	836	break
	837	fi
	838	done
	839	fi
	840
	841	# If we did not identify this URL as a file above, we need to compare the suffix against known
	842	# pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
	843	# needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
	844	if [ $IS_FILE -eq -1 ]; then
	845	for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
	846	if [[ $PAGE_OR_TLD == $POST_DOT ]]; then
	847	IS_FILE=0
	848	break
	849	fi
	850	done
	851	fi
	852
	853	# Turn case sensitivity back on in Bash
	854	shopt -u nocasematch
	855	fi
	856
	857	# If this suffix escaped identification as either a file, page or TLD, inform the user
	858	STR_TYPE=""
	859	if [ $IS_FILE -eq -1 ]; then
[1123]	860	valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown URL ending '$POST_DOT'. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
[1064]	861	let SKIP_UNK_SUFFIX+=1
	862	continue
	863	elif [ $IS_FILE -eq 1 ]; then
	864	STR_TYPE="file"
	865	let FILE_LINKS+=1
	866	elif [ $IS_FILE -eq 0 ]; then
	867	STR_TYPE="page"
	868	let PAGE_LINKS+=1
	869	fi
	870
	871	# Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
	872	# issue with sites that require HTTPS
[1142]	873	CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent '$AGENT' --max-time $TIMEOUT --retry 2 --write-out '%{http_code}\n' $URL)
[1064]	874	CURL_ERR=$(echo $?)
	875	CURL_RESULT=$CURL_CODE
	876
	877	# Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
	878	if [ $CURL_CODE == "000" ]; then
	879	CURL_RESULT="$CURL_RESULT-$CURL_ERR"
	880	fi
	881
# Begin to determine our status code for this URL (EI, IW, OK, RD, or NG)
STATUS="??"
NEW_URL=""
INTERWIKI_INDEX=-1

# First make sure that this isn't an "external internal" link to our own wiki that can be
# replaced by "[[page_name]]". If it uses a special access URL beginning with "/w/", let it
# pass, as it probably cannot be replaced by "[[ ]]" markup. The wildcards turn these into
# "contains" tests, and the quotes make the domain match as literal text.
if [[ $URL == *"$WIKI_PATH"* ]] && [[ $URL != *"$WIKI_PATH/w/"* ]]; then
  STATUS="EI"
  let EI_LINKS+=1
fi

# If it's not, check if this is a link to a domain that we have an interwiki prefix for, and
# remember which one so the matching prefix can be suggested in the report
if [ "$STATUS" == "??" ]; then
  for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
    if [[ $URL == *"${INTERWIKI_DOMAINS[$i]}"* ]] && [[ $URL != *"${INTERWIKI_DOMAINS[$i]}/w/"* ]]; then
      STATUS="IW"
      let IW_LINKS+=1
      INTERWIKI_INDEX=$i
      break
    fi
  done
fi
	906
# If we didn't match an interwiki domain, see if the status code is in our "OK" codes list
if [ "$STATUS" == "??" ]; then
  for CODE in "${OK_CODES[@]}"; do
    if [[ $CODE == "$CURL_CODE" ]]; then
      STATUS="OK"
      let OK_LINKS+=1
      break
    fi
  done
fi

# If we didn't get a match with the "OK" codes, check it against the "RD" codes
if [ "$STATUS" == "??" ]; then
  for CODE in "${RD_CODES[@]}"; do
    if [[ $CODE == "$CURL_CODE" ]]; then
      # Get URL header again in order to retrieve the URL we are being redirected to. AGENT is
      # in double quotes so that its value, not the literal text '$AGENT', is sent.
      NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time "$TIMEOUT" --write-out '%{redirect_url}\n' "$URL")

      # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
      # those changes out if the user didn't ask for them
      URL_HTTP=${URL/#https:/http:}
      NEW_URL_HTTP=${NEW_URL/#https:/http:}

      # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
      NEW_URL_LENGTH=${#NEW_URL_HTTP}
      if [ $NEW_URL_LENGTH -lt $MIN_URL_LENGTH ]; then
        NEW_URL_HTTP="[new URL not retrieved]"
      fi

      # Remove slash at end of new URL, if present, so we can filter out the redirects that
      # merely add an ending slash if the user didn't ask for them
      NEW_URL_NO_SLASH=${NEW_URL_HTTP%/}

      # Detect if this is a youtu.be link simply being expanded by YouTube to the full
      # youtube.com address
      YOUTU_BE=0
      if [[ $URL_HTTP == http://youtu.be* ]] && [[ $NEW_URL_HTTP == http://www.youtube.com* ]]; then
        YOUTU_BE=1
      fi

      # Decide whether this redirect is worth reporting. The URLs are quoted in the comparisons
      # because the "[new URL not retrieved]" placeholder would otherwise be read as a glob.
      if [ $SHOW_HTTPS -eq 0 ] && [ "$URL_HTTP" == "$NEW_URL_HTTP" ]; then
        # If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless
        # user wants those to be reported)
        valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show http->https upgrades, and I was redirected to '$NEW_URL'."
        STATUS="OK"
        let OK_LINKS+=1
        let SKIP_HTTPS_UP+=1
      elif [ $SHOW_SLASH -eq 0 ] && [ "$URL_HTTP" == "$NEW_URL_NO_SLASH" ]; then
        # If the URLs match besides an added ending slash, then the link is OK (unless user
        # wants those to be reported)
        valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show added trailing slashes, and I was redirected to '$NEW_URL'."
        STATUS="OK"
        let OK_LINKS+=1
        let SKIP_SLASH_ADD+=1
      elif [ $SHOW_YT_RD -eq 0 ] && [ $YOUTU_BE -eq 1 ]; then
        # A youtu.be short link being expanded is also OK unless the user asked to see those
        valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'."
        STATUS="OK"
        let OK_LINKS+=1
        let SKIP_YOUTU_BE+=1
      else
        STATUS="RD"
        let RD_LINKS+=1
      fi
      break
    fi
  done
fi
	974
	975	# If we didn't get a match with the "RD" codes, check it against the "NG" codes
	976	if [ $STATUS == "??" ]; then
[1064]	977	for CODE in "${NG_CODES[@]}"; do
	978	if [[ $CODE == $CURL_CODE ]]; then
	979	STATUS="NG"
	980	let NG_LINKS+=1
	981	break
	982	fi
	983	done
	984	fi
	985
	986	# If we didn't match a known status code, advise the reader
	987	if [ $STATUS == "??" ]; then
[1127]	988	valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown response code $CURL_CODE."
[1064]	989	let SKIP_UNK_CODE+=1
	990	continue
	991	fi
	992
# Check problem links against exceptions list before proceeding. EXCEPT_URL is the setting
# holding the location of the exceptions page; it is only read here. (The URL taken from each
# exception line is kept in EXCEPT_LINK so that the setting is not overwritten.)
FOUND_EXCEPT=0
if [ "$STATUS" != "OK" ] && [ -n "$EXCEPT_URL" ]; then
  # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
  EXPECT_CODE="$CURL_RESULT"
  if [ "$STATUS" == "EI" ] || [ "$STATUS" == "IW" ]; then
    EXPECT_CODE="$STATUS"
  fi

  # Look for link in exceptions list and make sure the listed result code and wiki page also
  # match. Each line is read as "code,URL,page": the code is everything before the first comma,
  # the page is everything after the last comma (up to the first space), and the URL is what
  # lies in between, so the URL itself may contain commas.
  for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
    # Undo any HTML-encoding from the wiki page; for now we just worry about the ampersand, as
    # most other HTML-encoded characters are not found in URLs. Every "&amp;" on the line is
    # turned back into "&".
    EXCEPT_LINE=$(printf '%s\n' "${EXCEPT_ARRAY[$i]}" | sed 's/&amp;/\&/g')

    # Match URL
    EXCEPT_LINK="${EXCEPT_LINE#*,}"
    EXCEPT_LINK="${EXCEPT_LINK%,*}"
    if [ "$EXCEPT_LINK" != "$URL" ]; then
      continue
    fi

    # Match containing page's name; "*" in the exceptions list stands for any page
    EXCEPT_PAGE="${EXCEPT_LINE##*,}"
    EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
    if [ "$EXCEPT_PAGE" != "*" ] && [ "$EXCEPT_PAGE" != "$LOCAL_PAGE_PATH" ]; then
      continue
    fi

    # Match result code
    EXCEPT_CODE=${EXCEPT_LINE%%,*}
    if [ "$EXCEPT_CODE" != "$EXPECT_CODE" ]; then
      continue
    fi

    # All three fields agree, so this result is an expected one: report and count the skip
    valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because its expected result, '$EXPECT_CODE', is in the exceptions list."
    case "$STATUS" in
      EI) let SKIP_EXPECT_EI+=1 ;;
      IW) let SKIP_EXPECT_IW+=1 ;;
      RD) let SKIP_EXPECT_RD+=1 ;;
      *) let SKIP_EXPECT_NG+=1 ;;
    esac
    FOUND_EXCEPT=1
    break
  done
fi
if [ $FOUND_EXCEPT -eq 1 ]; then
  continue
fi
[1064]	1046
	1047	# If appropriate, record this link to the log, with clickable URLs when possible
	1048	if [ $STATUS != "OK" ] \|\| [ $RECORD_OK_LINKS -eq 1 ]; then
[1125]	1049	# Prepare 'curl' result in parentheses to print after status code, unless this is an "IW" or "EI"
	1050	# link, in which case showing the status code doesn't make sense. Adjust spacing after string to
	1051	# ensure TXT and RTF reports have aligned columns of results.
	1052	CURL_STR_H=" ($CURL_RESULT)"
	1053	CURL_STR_T="$CURL_STR_H"
	1054	CURL_STR_R="$CURL_STR_H "
[1070]	1055	if [ $STATUS == "IW" ] \|\| [ $STATUS == "EI" ]; then
[1125]	1056	CURL_STR_H=""
	1057	CURL_STR_T=" "
	1058	CURL_STR_R=" "
[1064]	1059	fi
	1060
	1061	# Record link and its wiki page in TXT, RTF, and HTML markup
[1125]	1062	valPrint t "${STATUS}${CURL_STR_T} $STR_TYPE $URL"
[1064]	1063	valPrint t " linked from $FULL_PAGE_PATH"
[1125]	1064	valPrint r "${STATUS}${CURL_STR_R}${RTF_TABS}$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
[1064]	1065	valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
[1125]	1066	valPrint hn "<tr><td style=\"white-space:nowrap\">${STATUS}${CURL_STR_H}</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
[1064]	1067	valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"
	1068
[1123]	1069	# Place vertical space here since we won't be printing anything more about this link
[1125]	1070	if [ $STATUS == "OK" ]; then valPrint tr ""; valPrint hs ""; fi
[1123]	1071
[1067]	1072	# Record redirect URL if one was given by a 3xx response page
	1073	if [ $STATUS == "RD" ]; then
[1119]	1074	valPrint ts " Server suggests $NEW_URL"
	1075	valPrint rs " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
	1076	valPrint hs "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
[1067]	1077	fi
	1078
[1070]	1079	# Notify reader if we can use an intrawiki link for this URL
	1080	if [ $STATUS == "EI" ]; then
[1075]	1081	INTRA_PAGE=${URL#:///}
[1119]	1082	valPrint ts " Just use [[$INTRA_PAGE]]"
	1083	valPrint rs " Just use [[$INTRA_PAGE]]"
	1084	valPrint hs "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
[1070]	1085	fi
	1086
[1064]	1087	# Notify reader if we can use an interwiki prefix for this URL
	1088	if [ $STATUS == "IW" ]; then
[1075]	1089	INTER_PAGE=$(echo "$URL" \| sed 's/.*\///')
[1119]	1090	valPrint ts " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
	1091	valPrint rs " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
	1092	valPrint hs "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
[1064]	1093	fi
	1094
	1095	# Query Internet Archive for latest "OK" snapshot for "NG" page
	1096	if [ $STATUS == "NG" ] && [ $SUGGEST_SNAPSHOTS -eq 1 ]; then
[1141]	1097	ARCHIVE_QUERY=$(curl --silent --max-time $TIMEOUT "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")
[1064]	1098
[1118]	1099	# If a "closest" snapshot was received...
[1066]	1100	if [[ "$ARCHIVE_QUERY" == \"closest\": ]]; then
[1118]	1101	# In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
	1102	ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" \| sed 's/#!/#\\!/')
	1103
	1104	# ...isolate "url" property in the response that follows the "closest" tag
	1105	SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
[1119]	1106	SNAPSHOT_URL=${SNAPSHOT_URL#*\"url\": \"} # everything after '"url": "'
[1118]	1107	SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'
	1108
[1124]	1109	# Remove the port 80 part that IA often adds to the URL, as it's superfluous
	1110	SNAPSHOT_URL=$(echo $SNAPSHOT_URL \| sed 's/:80//')
	1111
[1118]	1112	# Inform the user of the snapshot URL
[1119]	1113	valPrint ts " IA suggests $SNAPSHOT_URL"
	1114	valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
	1115	valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
[1064]	1116	else # ...otherwise give generic Wayback Machine link for this URL
[1119]	1117	valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
	1118	valPrint rs " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
	1119	valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
[1064]	1120	fi
	1121	fi
	1122	fi
	1123
	1124	# If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
	1125	if [ $IS_FILE -eq 0 ] && [ $TAKE_PAGE_SHOT -eq 1 ] && [ $STATUS == "OK" ]; then
	1126	# Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
	1127	SHOT_NAME=$(echo "$URL" \| sed 's/https*\:\/\///' \| sed 'y/:\//__/')
	1128	SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"
	1129
	1130	# Don't take screenshot if we already encountered this page and screenshotted it
	1131	if [ ! -f "$SHOT_FILE" ]; then
[1070]	1132	"$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 $URL > /dev/null 2>&1
[1064]	1133	if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
	1134	mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
	1135	else
[1119]	1136	valPrint trhs "Screenshot of URL $URL seems to have failed!"
[1064]	1137	fi
	1138	else
[1123]	1139	valPrint trhs "Skipping screenshot of URL '$URL' because file '$SHOT_FILE' already exists."
[1064]	1140	fi
	1141	fi
	1142	done
	1143	FINISHED_LIST="yes"
# The loop above has run through every line of LINKS_FILE without exiting early (compare the
# FINISHED_LIST="limit" exit taken when URL_LIMIT is exceeded), so hand off to wrapupAndExit,
# which is defined elsewhere in the script
	1144	wrapupAndExit

Note: See TracBrowser for help on using the repository browser.

Download in other formats: