Context Navigation

source: Validate External Links/validate_external_links.sh@ 1141

Last change on this file since 1141 was 1141, checked in by iritscen, 4 years ago
Committing the changes to Val which I meant to commit over a week ago. I committed everything but the updated script itself. See last Val commit message for list of changes.
File size: 48.7 KB

Rev	Line
[1064]	1	#!/bin/bash
	2
	3	# Validate External Links by Iritscen
[1141]	4	#
	5	# Validates a list of external links in CSV format. The resulting logs are produced in three formats:
	6	# - TXT (for easy diffing with an earlier log)
	7	# - RTF (for reading as a local file with clickable links)
	8	# - HTML (for uploading as a web page).
	9	# Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
	10	#
[1064]	11	# Recommended rule:
[1118]	12	# |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|
[1141]	13	#
	14	# Table of contents (sections of script in order of appearance, not execution):
	15	# • Globals
	16	# • Help Output
	17	# • Setup
	18	# • Utility Functions
	19	# • Summary Output
	20	# • Initialization
	21	# • Data Sourcing
	22	# • Config Output
	23	# • Legend Output
	24	# • Main Loop
[1064]	25
	26	# Set separator token to newline
	27	IFS="
	28	"
	29
	30	### GLOBALS ###
	31	# Settings -- these will be changed from their defaults by the arguments passed in to the script
[1135]	32	LINKS_URL="" # use 'curl' to download file with links from this location (can be file://)
[1136]	33	EXCEPT_URL="" # 'curl' will access this wiki page with a list of exceptions for NG results
[1135]	34	OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
	35	RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES
	36	SHOW_SLASH=0 # record response code to the log when a slash is added to the end of a URL
	37	SHOW_HTTPS=0 # record response code to the log when "http" is upgraded to "https"
	38	SHOW_YT_RD=0 # record response code to the log when a youtu.be URL is expanded to the full URL
	39	SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
	40	SKIP_ARCHIVE_LINKS=0 # don't check URLs under the archive.org domain
	41	TAKE_PAGE_SHOT=0 # take a screenshot of each OK page
[1141]	42	TIMEOUT=10 # time to wait for a response when querying a site
[1135]	43	CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature
	44	URL_START=1 # start at this URL in LINKS_FILE (1 by default)
	45	URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE
	46	UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report
[1064]	47
	48	# Fixed strings -- see the occurrences of these variables to learn their purpose
[1136]	49	AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36 OPR/69.0.3686.77"
[1064]	50	ARCHIVE_API="http://archive.org/wayback/available"
	51	ARCHIVE_GENERIC="https://web.archive.org/web/*"
	52	ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
	53	CHROME_SCREENSHOT="screenshot.png"
[1136]	54	EXCEPT_FILE_NAME="exceptions.txt"
[1064]	55	EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
[1141]	56	WIKI_CURL="https://wiki.oni2.net/Validate_External_Links/Curl_codes"
	57	WIKI_HTTP="https://wiki.oni2.net/Validate_External_Links/HTTP_codes"
	58	WIKI_MAIN="https://wiki.oni2.net/Validate_External_Links"
	59	WIKI_ME="http://iritscen.oni2.net"
[1064]	60	THIS_DIR=$(cd $(dirname $0); pwd)
	61	WORKING_DIR=$(pwd)
	62	WIKI_PATH="wiki.oni2.net"
	63
	64	# These are parallel arrays of the IDs and names of OniGalore's current namespaces
	65	declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
	66	declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")
	67
	68	# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
[1070]	69	# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
[1127]	70	declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png py rar tga TRMA txt vbs wav wmv xaf xml zip)
[1137]	71	declare -a HTTP_TLDS_AND_PAGES=(action ars asp aspx cfm cgi com css de full htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x)
[1064]	72
[1067]	73	# These arrays tells us which HTTP response codes are OK (good), which are RD (redirections), and which
	74	# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
	75	# if you add a new code.
[1127]	76	declare -a OK_CODES=(200 401 405 406 418 501)
[1067]	77	declare -a RD_CODES=(301 302 303 307 308)
[1127]	78	declare -a NG_CODES=(000 400 403 404 410 500 502 503 530)
[1064]	79
	80	# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
	81	# transcluded text, and if the transclusion fails, then the braces show up in the URL
	82	ILLEGAL_CHARS="{ }"
	83
[1070]	84	# The shortest URL possible, used for sanity-checking some URLs: http://a.co
	85	MIN_URL_LENGTH=11
	86
[1064]	87	# These are parallel arrays giving the prefixes that can be used in place of normal external links to
	88	# some wikis and other sites
[1070]	89	declare -a INTERWIKI_PREFIXES=(commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies wikiversity wikivoyage wikt wp)
	90	declare -a INTERWIKI_DOMAINS=(commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org wikiversity.org wikivoyage.org wiktionary.org wikipedia.org)
[1064]	91
	92	# Variables for keeping track of main loop progress and findings
	93	LINK_NUM=0
[1070]	94	EI_LINKS=0
	95	IW_LINKS=0
[1064]	96	OK_LINKS=0
[1067]	97	RD_LINKS=0
[1064]	98	NG_LINKS=0
	99	SKIP_UNK_NS=0
	100	SKIP_JS_PAGE=0
	101	SKIP_BAD_URL=0
	102	SKIP_NON_ASCII=0
	103	SKIP_UNK_SUFFIX=0
	104	SKIP_UNK_CODE=0
[1070]	105	SKIP_EXPECT_NG=0
	106	SKIP_EXPECT_EI=0
	107	SKIP_EXPECT_IW=0
[1122]	108	SKIP_HTTPS_UP=0
	109	SKIP_SLASH_ADD=0
[1127]	110	SKIP_YOUTU_BE=0
[1135]	111	SKIP_ARCHIVE_ORG=0
[1064]	112	FILE_LINKS=0
	113	PAGE_LINKS=0
	114	SKIPPED_HEADER_ROW=0
	115	FINISHED_LIST="no"
[1118]	116	START_RUN=0
	117	END_RUN=0
[1064]	118
	119
### HELP OUTPUT ###
# A pseudo-man page, written to stdout. Section bodies are indented by 7 columns and option
# descriptions start at column 32, so that no line of the page is wider than 80 characters. Here is
# the 80-character rule for the page text:
# 234567890123456789012345678901234567890123456789012345678901234567890123456789
function printHelp()
{
  # The delimiter is quoted so that the page is printed literally, with no expansion
  cat << 'EOF'

NAME
       Validate External Links

SYNOPSIS
       validate_external_links.sh --help
       validate_external_links.sh --links URL --output DIR [--exceptions URL]
          [--record-ok-links] [--show-added-slashes] [--show-https-upgrades]
          [--show-yt-redirects] [--suggest-snapshots] [--skip-archive-links]
          [--take-screenshots FILE] [--timeout NUM] [--start-url NUM]
          [--end-url NUM] [--upload FILE]

DESCRIPTION
       This script parses a list of external links found in the OniGalore wiki
       (which is dumped by the Oni2.net domain periodically in a particular
       format), validates them using the Unix tool 'curl', and produces a report
       of which links were "OK" (responded positively to an HTTP query), which
       were "RD" (responded with a 3xx redirect code), which could be "IW"
       (interwiki) links, which are "EI" (external internal) links and could be
       intrawiki links, and which were "NG" (no good; a negative response to the
       query). This report can then be automatically uploaded to the location of
       your choice. The script can also suggest Internet Archive snapshots for
       "NG" links, and take screenshots of "OK" links for visual verification by
       the reader that the page in question is the one intended to be displayed.

       You must pass this script the URL at which the list of links is found
       (--links) and the path where the directory of logs should be outputted
       (--output). All other arguments are optional.

OPTIONS
       --help                  Show this page.
       --links URL             (required) URL from which to download the CSV
                               file with external links. Note that this URL can
                               be a local file if you supply a file:// path.
       --output DIR            (required) Unix path to directory in which Val
                               should place its reports.
       --exceptions URL        In order to remove links from the report which
                               Val finds an issue with but which you regard as
                               OK, list those desired exceptions on a wiki page.
                               See the sample file "exceptions.pdf" for the
                               required format of the page. Note that this URL
                               can point to a local file if you supply a path
                               beginning with "file://".
       --record-ok-links       Log a link in the report even if its response
                               code is "OK".
       --show-added-slashes    Report on redirects that simply add a '/' to the
                               end of the URL.
       --show-https-upgrades   Report on redirects that simply upgrade a
                               "http://" URL to a "https://" URL.
       --show-yt-redirects     Report on redirects that expand a youtu.be URL.
       --suggest-snapshots     Query the Internet Archive for a possible
                               snapshot URL for each "NG" page.
       --skip-archive-links    Don't check links that are already pointing to
                               a page on the Internet Archive.
       --take-screenshots FILE Call the Google Chrome binary at this path to
                               take screenshots of each "OK" page.
       --timeout NUM           Wait this many seconds for a site to respond. The
                               default is 10.
       --start-url NUM         Start at this link in the links CSV file.
       --end-url NUM           Stop at this link in the links CSV file.
       --upload FILE           Upload report using the credentials and path
                               given in this local text file. See sftp_login.txt
                               for template.

BUGS
       The script cannot properly parse any line in the external links file
       which contains a comma in the name of the wiki page containing a link.
       Commas in the link itself are not an issue.
EOF
}
	196
	197
	198	### SETUP ###
	199	# If first argument is a help request, or if nothing was passed in at all, print help page and quit
	200	if [ "$#" -eq 0 ] \|\| [ "$1" == "--help" ]; then
	201	printHelp \| less
	202	exit 0
	203	fi
	204
	205	# Parse arguments as long as there are more arguments to process
	206	while (( "$#" )); do
	207	case "$1" in
[1127]	208	--links ) LINKS_URL="$2"; shift 2;;
	209	--exceptions ) EXCEPT_URL="$2"; shift 2;;
	210	--output ) OUTPUT_DIR="$2"; shift 2;;
	211	--record-ok-links ) RECORD_OK_LINKS=1; shift;;
	212	--show-added-slashes ) SHOW_SLASH=1; shift;;
	213	--show-https-upgrades ) SHOW_HTTPS=1; shift;;
	214	--show-yt-redirects ) SHOW_YT_RD=1; shift;;
	215	--suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;;
[1135]	216	--skip-archive-links ) SKIP_ARCHIVE_LINKS=1; shift;;
[1127]	217	--take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
[1141]	218	--timeout ) TIMEOUT=$2; shift 2;;
[1127]	219	--start-url ) URL_START=$2; shift 2;;
	220	--end-url ) URL_LIMIT=$2; shift 2;;
	221	--upload ) UPLOAD_INFO=$2; shift 2;;
	222	* ) echo "Invalid argument $1 detected. Aborting."; exit 1;;
[1064]	223	esac
	224	done
	225
	226	# If the required arguments were not supplied, print help page and quit
	227	if [ -z $LINKS_URL ] \|\| [ -z $OUTPUT_DIR ]; then
[1070]	228	echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
[1064]	229	exit 2
	230	fi
	231
[1070]	232	# If user wants screenshots, make sure path to Chrome was passed in and is valid
	233	if [ $TAKE_PAGE_SHOT -eq 1 ]; then
	234	if [ ! -f "$CHROME_PATH" ]; then
	235	echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
	236	exit 3
	237	fi
	238	fi
	239
[1064]	240	# Check that UPLOAD_INFO exists, if this argument was supplied
	241	if [ ! -z $UPLOAD_INFO ] && [ ! -f "$UPLOAD_INFO" ]; then
	242	echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
[1070]	243	exit 4
[1064]	244	fi
	245
	246	# Check that OUTPUT_DIR is a directory
	247	if [ ! -d "$OUTPUT_DIR" ]; then
	248	echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
[1070]	249	exit 5
[1064]	250	fi
	251
	252	# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
	253	SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
	254	NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
	255	OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
	256	OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
	257	SHOT_PATH="$OUTPUT_PATH/Screenshots"
	258	LOG_NAME="ValExtLinks report"
	259	LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt"
	260	LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf"
	261	LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm"
	262	mkdir "$OUTPUT_PATH"
	263	if [ $TAKE_PAGE_SHOT -eq 1 ]; then
	264	mkdir "$SHOT_PATH"
	265	fi
	266
	267	# Check that 'mkdir' succeeded
	268	if [ ! -d "$OUTPUT_PATH" ]; then
	269	echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_PATH. Aborting."
[1070]	270	exit 6
[1064]	271	fi
	272
	273	# Get date on the file at LINKS_URL and print to log
	274	LINKS_DATE=$(curl --silent --head $LINKS_URL \| grep "Last-Modified")
	275	if [ -z "$LINKS_DATE" ]; then
	276	echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
[1070]	277	exit 7
[1064]	278	fi
	279	LINKS_DATE=${LINKS_DATE#Last-Modified: }
	280
	281
	282	### UTILITY FUNCTIONS ###
	283	# Writes a plain-text header to TXT log file
	284	function printTXTheader()
	285	{
	286	valPrint t "Validate External Links report"
	287	valPrint t "generated $NICE_TIME"
	288	valPrint t "from data of $LINKS_DATE"
[1141]	289	valPrint t "script by Iritscen (contact: $WIKI_ME)"
[1064]	290	valPrint t ""
	291	}
	292
	293	# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified
	294	function printRTFheader()
	295	{
	296	valPrint r "{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
	297	{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
	298	{\colortbl;\red255\green255\blue255;}
	299	{\*\expandedcolortbl;;}
	300	\margl1440\margr1440\vieww12600\viewh12100\viewkind0
	301	\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0
	302
	303	\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
	304	generated $NICE_TIME\\
	305	from data of $LINKS_DATE\\
[1141]	306	script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$WIKI_ME\"}}{\fldrslt contact}})
[1064]	307	\\
	308	\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
	309	\cf0 "
	310	}
	311
	312	# Closes the RTF markup of the RTF log file
	313	function printRTFfooter()
	314	{
	315	valPrint r "}"
	316	}
	317
	318	# Writes the HTML header to HTML log file
	319	function printHTMheader()
	320	{
	321	valPrint h "<html>
	322	<head>
	323	<title>Validate External Links report</title>
	324	</head>
	325	<body>
	326	<h2>Validate External Links report</h2>
	327	<h3>generated $NICE_TIME<br />
	328	from data of $LINKS_DATE<br />
[1141]	329	script by Iritscen (<a href=\"$WIKI_ME\" target=\"_blank\">contact</a>)</h3>"
[1064]	330	}
	331
	332	# Closes the HTML markup of the HTML log file
	333	function printHTMfooter()
	334	{
	335	valPrint h "</body>
	336	</html>"
	337	}
	338
	339	# The central logging function. The first parameter is a string composed of one or more characters that
[1070]	340	# indicate which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
[1141]	341	# 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 's' means "Print an
	342	# extra newline at the end." 'w' means "Don't pass console output through 'fmt'" ("fmt" fits the output
[1119]	343	# to an 80-column CLI but can break special formatting and the 'n' option).
[1064]	344	function valPrint()
	345	{
	346	if [[ "$1" == c ]]; then
	347	if [[ "$1" == n ]]; then
	348	echo -n "$2"
	349	elif [[ "$1" == w ]]; then
	350	echo "$2"
[1119]	351	elif [[ "$1" == s ]]; then
	352	echo -e "$2\n"
[1064]	353	else
	354	echo "$2" \| fmt -w 80
	355	fi
	356	fi
	357	if [[ "$1" == t ]]; then
	358	if [[ "$1" == n ]]; then
	359	echo -n "$2" >> "$LOG_TXT"
[1119]	360	elif [[ "$1" == s ]]; then
	361	echo -e "$2\n" >> "$LOG_TXT"
[1064]	362	else
	363	echo "$2" >> "$LOG_TXT"
	364	fi
	365	fi
	366	if [[ "$1" == r ]]; then
	367	if [[ "$1" == n ]]; then
	368	echo "$2" >> "$LOG_RTF"
[1119]	369	elif [[ "$1" == s ]]; then
	370	echo "$2\line\line" >> "$LOG_RTF"
[1064]	371	else
[1119]	372	echo "$2\line" >> "$LOG_RTF"
[1064]	373	fi
	374	fi
	375	if [[ "$1" == h ]]; then
[1119]	376	if [[ "$1" == s ]]; then
	377	echo "$2<tr><td> </td></tr>" >> "$LOG_HTM"
	378	elif [[ "$1" == n ]]; then
[1064]	379	echo "$2" >> "$LOG_HTM"
	380	else
	381	echo "$2<br />" >> "$LOG_HTM"
	382	fi
	383	fi
	384	}
	385
	386	# Pluralize the string in parameter 1 if the number in parameter 2 is not 1
	387	function pluralCheckNoun()
	388	{
	389	if [ $2 -ne 1 ]; then
	390	if [[ $1 =~ x$ ]]; then
	391	echo $1es
	392	else
	393	echo $1s
	394	fi
	395	else
	396	echo $1
	397	fi
	398	}
	399
# Output "is" if parameter 1 is 1, otherwise "are". A missing number is treated as 0.
function pluralCheckIs()
{
  local count="${1:-0}"

  if [[ "$count" -ne 1 ]]; then
    echo "are"
  else
    echo "is"
  fi
}
	409
# Output "was" if parameter 1 is 1, otherwise "were". A missing number is treated as 0.
function pluralCheckWas()
{
  local count="${1:-0}"

  if [[ "$count" -ne 1 ]]; then
    echo "were"
  else
    echo "was"
  fi
}
	419
# Output "a " (with the trailing space) if parameter 1 is 1, otherwise nothing. A missing number is
# treated as 0.
function pluralCheckA()
{
  local count="${1:-0}"

  if [[ "$count" -eq 1 ]]; then
    echo "a "
  fi
}
	427
	428	# Output "an " if parameter 1 is 1, otherwise nothing
	429	function pluralCheckAn()
	430	{
	431	if [ $1 -eq 1 ]; then
	432	echo "an "
	433	fi
	434	}
	435
# Outputs the value of one field of the login file at UPLOAD_INFO. Parameter 1 is the marker that the
# field's line starts with (e.g. "user:"). Only the first line starting with the marker is used, and a
# carriage return left by a Windows-style line ending is removed.
function readUploadField()
{
  local field_line
  field_line=$(grep -m 1 -- "^$1" "$UPLOAD_INFO" | tr -d '\r')
  field_line=${field_line#"$1"}
  printf '%s\n' "$field_line"
}

# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
# reports being saved to disk have already been closed. Returns 1 without uploading if the login file
# lacks any of the four fields.
# NOTE(review): the password is handed to 'expect' as a command-line argument, where it is visible to
# other local users in the process list; changing that requires changing the expect script too.
function uploadReport()
{
  valPrint c "Uploading HTML report..."

  SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
  SFTP_USER_NAME_MARKER="user:"
  SFTP_PASSWORD_MARKER="pw:"
  SFTP_PORT_MARKER="port:"
  SFTP_PATH_MARKER="path:"
  SFTP_USER_NAME=$(readUploadField "$SFTP_USER_NAME_MARKER")
  SFTP_PASSWORD=$(readUploadField "$SFTP_PASSWORD_MARKER")
  SFTP_PORT=$(readUploadField "$SFTP_PORT_MARKER")
  SFTP_PATH=$(readUploadField "$SFTP_PATH_MARKER")

  # An empty field would shift all later arguments of the expect script, so stop here instead
  if [[ -z "$SFTP_USER_NAME" || -z "$SFTP_PASSWORD" || -z "$SFTP_PORT" || -z "$SFTP_PATH" ]]; then
    valPrint c "Error: The file $UPLOAD_INFO is missing one or more of the fields 'user:', 'pw:', 'port:' and 'path:'. The report was not uploaded."
    return 1
  fi

  expect "$SCRIPT_PATH" "$LOG_HTM" "$SFTP_USER_NAME" "$SFTP_PASSWORD" "$SFTP_PORT" "$SFTP_PATH" "$LOG_NAME.htm"

  valPrint c "Report was uploaded, unless an error message appears above."
}
	460
	461	# Prints session summary when script is done
	462	function wrapupAndExit()
	463	{
	464	# Get off progress line on console, drop down a line from last link in log, and close HTML table
	465	valPrint ctr ""
	466	valPrint h "</table><br />"
	467
	468	# If we didn't finish processing the last URL, then the iterator is one too high
	469	if [ $FINISHED_LIST != "yes" ]; then
	470	let LINK_NUM-=1
	471	if [ $FINISHED_LIST == "no" ]; then
	472	valPrint ctrh "The session was canceled by the user."
	473	fi
	474	fi
	475
[1118]	476	# Generate string with elapsed time
	477	END_RUN=$(date +%s)
	478	ELAPSED=$(echo $(($END_RUN - $START_RUN)) \| awk '{printf "%d min. %d sec. elapsed", int($1/60), int($1%60)}')
	479
[1122]	480	# Do some math on results of session
[1064]	481	LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
[1122]	482	LINK_PROBLEMS=$((EI_LINKS+IW_LINKS+RD_LINKS+NG_LINKS))
	483	LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
	484	LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
[1127]	485	TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
[1122]	486	LINKS_CHECKED=$((LINKS_PROCESSED-LINK_ERRORS))
	487
[1141]	488	## SUMMARY OUTPUT ##
[1118]	489	valPrint ct "Summary ($ELAPSED):"
	490	valPrint r "\b1 Summary \b0 ($ELAPSED)"
	491	valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
[1123]	492	valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT) (there $(pluralCheckWas $FILE_LINKS) $FILE_LINKS file $(pluralCheckNoun link $FILE_LINKS) and $PAGE_LINKS page $(pluralCheckNoun link $PAGE_LINKS))."
[1122]	493
	494	# Print processed link totals
	495	if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
	496	if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
[1135]	497	if [ $SKIP_ARCHIVE_ORG -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVE_ORG Archive.org $(pluralCheckNoun link $SKIP_ARCHIVE_ORG) were not checked"; fi
[1123]	498	if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS processed $(pluralCheckNoun link $LINK_PROBLEMS) had $(pluralCheckAn $LINK_PROBLEMS)$(pluralCheckNoun issue $LINK_PROBLEMS)"; fi
[1127]	499	if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; valPrint h "  (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
[1123]	500	if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi
	501	if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h "  (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi
[1122]	502
	503	# Print excepted link totals
	504	if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi
	505	if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
	506	if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
	507	if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi
	508
	509	# Print errored link totals
	510	if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"; fi
	511	if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
[1070]	512	if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
[1064]	513	if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
	514	if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
	515	if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
	516	if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
[1122]	517
	518	# Print checked link totals
[1123]	519	if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS link $(pluralCheckNoun issue $LINK_PROBLEMS):"; fi
[1122]	520	if [ $NG_LINKS -gt 0 ]; then valPrint ctrh "- $NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
	521	if [ $RD_LINKS -gt 0 ]; then valPrint ctrh "- $RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
	522	if [ $EI_LINKS -gt 0 ]; then valPrint ctrh "- $EI_LINKS $(pluralCheckNoun link $EI_LINKS) that could be intrawiki"; fi
	523	if [ $IW_LINKS -gt 0 ]; then valPrint ctrh "- $IW_LINKS $(pluralCheckNoun link $IW_LINKS) that could be interwiki"; fi
	524
	525	# Close the log files' markup
[1070]	526	valPrint trh "ValExtLinks says goodbye."
[1064]	527	printRTFfooter
	528	printHTMfooter
	529
	530	# Upload report if this was requested
	531	if [ ! -z $UPLOAD_INFO ] && [ $FINISHED_LIST != "no" ]; then
	532	uploadReport
	533	fi
	534
	535	# Really quit now
	536	valPrint c "ValExtLinks says goodbye."
	537	exit 0
	538	}
	539	trap wrapupAndExit INT
	540
	541
	542	### INITIALIZATION ###
	543	# Print opening message to console and log files
	544	valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
	545	printTXTheader
	546	printRTFheader
	547	printHTMheader
	548
[1141]	549	## DATA SOURCING ##
	550	valPrint t "Startup:"
	551	valPrint r "\b1 Startup \b0"
	552	valPrint hn "<h3>Startup</h3>"
	553
[1064]	554	# Attempt to download file at LINKS_URL, then check that it succeeded
[1141]	555	valPrint cwtrhn "Downloading list of external links from $LINKS_URL..."
[1064]	556	LINKS_FILE_NAME=$(echo "$LINKS_URL" \| sed 's/.*\///')
	557	LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
	558	curl --silent -o "$LINKS_FILE" $LINKS_URL
	559	if [ ! -f "$LINKS_FILE" ]; then
[1141]	560	echo -e "\nThe download of $LINKS_URL appears to have failed. Aborting."
[1064]	561	wrapupAndExit
[1141]	562	else
	563	valPrint ctrh " success."
[1064]	564	fi
	565
	566	# Attempt to download file at EXCEPT_URL, then check that it succeeded
	567	if [ ! -z $EXCEPT_URL ]; then
[1141]	568	valPrint cwtrhn "Downloading list of reporting exceptions from $EXCEPT_URL..."
[1136]	569	EXCEPT_DATA=$(curl --silent $EXCEPT_URL)
	570	if [ -z "$EXCEPT_DATA" ]; then
[1141]	571	echo -e "\nThe download of the exceptions data from '$EXCEPT_URL' appears to have failed. Aborting."
[1064]	572	wrapupAndExit
[1141]	573	else
	574	valPrint ctrh " success."
[1064]	575	fi
[1136]	576	EXCEPT_DATA=${EXCEPT_DATA%END LIST*}
	577	EXCEPT_DATA=${EXCEPT_DATA#*BEGIN LIST}
	578	EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
	579
	580	# Store on disk for debugging purposes
	581	echo "$EXCEPT_DATA" > "$EXCEPT_FILE"
	582
	583	# Transfer to array for easy searching later
	584	declare -a EXCEPT_ARRAY=($(echo "$EXCEPT_DATA"))
[1064]	585	fi
	586
	587	# Pipe 'cat' to 'wc' because passing LINKS_FILE to 'wc' would print the file name after the line count
	588	LINK_COUNT_STRING=$(cat "$LINKS_FILE" \| wc -l)
	589
	590	# Number of URLs is number of lines minus one (first line is column header row for the CSV)
	591	LINK_COUNT=$(echo "${LINK_COUNT_STRING}" \| tr -d '[:space:]')
	592	let LINK_COUNT-=1
[1141]	593	valPrint ctrh "Found $LINK_COUNT links to process."
	594	valPrint trh ""
[1064]	595
## CONFIG OUTPUT ##
# Prints one on/off setting to the console and all logs. Parameter 1 is the label, parameter 2 is the
# setting's value, parameter 3 is the answer to print when the value is 1, and parameter 4 is the
# answer to print otherwise.
function printConfigToggle()
{
  valPrint ctrhn "$1"
  if [[ "$2" -eq 1 ]]; then
    valPrint ctrh "$3"
  else
    valPrint ctrh "$4"
  fi
}

valPrint t "Config:"
valPrint r "\b1 Config \b0"
valPrint hn "<h3>Config</h3>"

# Work out the range of links that this session covers, then print it
if (( URL_LIMIT != 0 && URL_LIMIT < LINK_COUNT )); then
  LINKS_TO_CONSIDER="$((URL_LIMIT-URL_START+1)), from link $URL_START to $URL_LIMIT"
elif (( URL_START != 1 )); then
  LINKS_TO_CONSIDER="$((LINK_COUNT-URL_START+1)), from link $URL_START to $LINK_COUNT"
else
  LINKS_TO_CONSIDER="$LINK_COUNT"
fi
valPrint ctrhn "Links to consider: "
valPrint ctrh "$LINKS_TO_CONSIDER"
unset LINKS_TO_CONSIDER

valPrint ctrh "Site query timeout: $TIMEOUT seconds"

# The "show" and "skip" settings are phrased the other way around in the report, hence the swapped
# answers for them
printConfigToggle "Show OK links: " "$RECORD_OK_LINKS" "Yes" "No"
printConfigToggle "Take screenshots: " "$TAKE_PAGE_SHOT" "Yes" "No"
printConfigToggle "Suggest Archive.org snapshots: " "$SUGGEST_SNAPSHOTS" "Yes" "No"
printConfigToggle "Ignore slash-adding redirects: " "$SHOW_SLASH" "No" "Yes"
printConfigToggle "Ignore HTTPS-upgrading redirects: " "$SHOW_HTTPS" "No" "Yes"
printConfigToggle "Ignore youtu.be redirects: " "$SHOW_YT_RD" "No" "Yes"
printConfigToggle "Check archive.org links: " "$SKIP_ARCHIVE_LINKS" "No" "Yes"

valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
valPrint trh ""

## LEGEND OUTPUT ##
# Prints a legend entry that refers the reader to a page of code explanations. Parameter 1 is the
# entry's text and parameter 2 is the URL of the page. The TXT log spells the URL out, whereas the RTF
# and HTML logs turn the word "here" into a link.
function printLegendCodeEntry()
{
  valPrint t "$1 (see here for code reference: $2)"
  valPrint r "$1 (see {\field{\*\fldinst{HYPERLINK \"$2\"}}{\fldrslt here}} for code reference)"
  valPrint h "$1 (see <a href=\"$2\" target=\"_blank\">here</a> for code reference)"
}

valPrint t "Legend:"
valPrint r "\b1 Legend \b0"
valPrint hn "<h3>Legend</h3>"
valPrint t "(For guidance in fixing these links, see $WIKI_MAIN.)"
valPrint r "(For guidance in fixing these links, see {\field{\*\fldinst{HYPERLINK \"$WIKI_MAIN\"}}{\fldrslt here}}.)"
valPrint h "(For guidance in fixing these links, see <a href=\"$WIKI_MAIN\" target=\"_blank\">here</a>.)"
for LEGEND_LINE in \
  "OK = URL seems to be working" \
  "NG = URL no longer seems to work" \
  "RD = URL is redirecting to this new URL" \
  "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup" \
  "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup"; do
  valPrint trh "$LEGEND_LINE"
done
printLegendCodeEntry "(xxx) = Unix tool 'curl' obtained this HTTP response status code" "$WIKI_HTTP"
printLegendCodeEntry "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code" "$WIKI_CURL"
for LEGEND_LINE in \
  "IA suggests = Last available snapshot returned by the Internet Archive" \
  "Try browsing = The Archive failed to return a snapshot URL, so check for a snapshot manually using this link" \
  ""; do
  valPrint trh "$LEGEND_LINE"
done
unset LEGEND_LINE
	658
	659
	660	### MAIN LOOP ###
[1120]	661	valPrint t "Links:"
	662	valPrint r "\b1 Links \b0"
	663	valPrint hn "<h3>Links</h3>"
[1118]	664	START_RUN=$(date +%s)
[1064]	665	# Process each line of the .csv in LINKS_FILE
	666	for LINE in `cat "$LINKS_FILE"`; do
	# Count this line of the links file; the count doubles as the number of the link being evaluated
	LINK_NUM=$((LINK_NUM + 1))

	# First line is the column header row for the CSV, so let's verify that the format hasn't changed.
	# LINE is quoted so that a line containing glob characters is compared as text instead of being
	# expanded against file names in the current directory
	if [[ "$SKIPPED_HEADER_ROW" -eq 0 ]]; then
		if [[ "$LINE" == "namespace,title,target" ]]; then
			SKIPPED_HEADER_ROW=1
			LINK_NUM=0 # the header row is not a link, so reset the link counter
			valPrint hn "<table>"
			continue
		else
			valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
			wrapupAndExit
		fi
	fi
	681
	# Links numbered below URL_START are passed over without being evaluated
	if [[ "$LINK_NUM" -lt "$URL_START" ]]; then
		continue
	fi

	# A positive URL_LIMIT caps the run for testing purposes; wrap up once we have gone past it
	if [[ "$URL_LIMIT" -gt 0 && "$LINK_NUM" -gt "$URL_LIMIT" ]]; then
		FINISHED_LIST="limit"
		wrapupAndExit
	fi

	# Show progress on screen. From the second link onward, first move the cursor back up so that
	# the new progress message appears in place of the previous one
	[[ "$LINK_NUM" -gt 1 ]] && printf "\e[1A\n"
	valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
	698
	# The number of the namespace is the element before the first comma on the line
	NS_ID=${LINE%%,*}

	# Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES (the two
	# arrays are indexed in parallel). The search is only attempted for an integer ID: "NULL" or
	# any other non-numeric value cannot match, and feeding it to '-eq' would only print an error
	# on every pass through the loop
	NS_NAME=""
	a=0
	if [[ "$NS_ID" =~ ^-?[0-9]+$ ]]; then
		while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
			if [ "$NS_ID" -eq "${NS_IDS[$a]}" ]; then
				NS_NAME="${NS_NAMES[$a]}"
				break
			fi
			a=$((a + 1))
		done
	fi

	# Without a namespace name we cannot build the page's address, so skip the line and say why
	if [[ -z "$NS_NAME" ]]; then
		if [[ "$NS_ID" == "NULL" ]]; then
			valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace (and probably the page too) is 'NULL'. Probably the link is no longer in existence on the wiki."
		else
			valPrint trs "Skipping line $LINK_NUM ('$LINE') because I could not find a name for namespace ID $NS_ID."
		fi
		SKIP_UNK_NS=$((SKIP_UNK_NS + 1))
		continue
	fi
	723
	# The name of the page is everything between the namespace ID and the next comma on the line
	# (commas in page names will break this). The ID is quoted so that it is stripped as literal
	# text rather than being interpreted as a pattern
	PAGE_NAME=${LINE#"$NS_ID",}
	PAGE_NAME=${PAGE_NAME%%,*}

	# We don't want to consider wiki pages ending in .js, as the MW parser cannot reliably isolate
	# URLs in JavaScript code, so it returns erroneous links. The suffix is whatever follows the
	# last period; a name with no period has no suffix, so a page literally named "js" is kept
	PAGE_NAME_SUFFIX=""
	if [[ "$PAGE_NAME" == *.* ]]; then
		PAGE_NAME_SUFFIX=${PAGE_NAME##*.}
	fi
	if [[ "$PAGE_NAME_SUFFIX" == "js" ]]; then
		valPrint trs "Skipping URL '${LINE#"$NS_ID,$PAGE_NAME,"}' on line $LINK_NUM because it was found on JavaScript page '$PAGE_NAME'."
		SKIP_JS_PAGE=$((SKIP_JS_PAGE + 1))
		continue
	fi
	# Build longer wiki page URLs from namespace and page names
	FULL_PAGE_PATH=https://$WIKI_PATH/$NS_NAME:$PAGE_NAME
	LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
	# Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it
	# explicitly breaks the link
	if [[ "$NS_ID" -eq 0 ]]; then
		FULL_PAGE_PATH=https://$WIKI_PATH/$PAGE_NAME
		LOCAL_PAGE_PATH=$PAGE_NAME
	fi

	# The URL being linked to is everything after the previous two fields (this allows commas to be
	# in the URLs, but a comma in the previous field, the page name, will break this). The prefix
	# is quoted so that glob characters in a page name are stripped literally
	URL=${LINE#"$NS_ID,$PAGE_NAME,"}

	# Scan for illegal characters. The surrounding '*' wildcards make this a test for any one of
	# ILLEGAL_CHARS appearing anywhere in the URL; without them only a one-character URL could match
	if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
		valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because it contains characters illegal in a URL."
		SKIP_BAD_URL=$((SKIP_BAD_URL + 1))
		continue
	fi

	# If we're skipping Archive.org links, check if this is one by looking for the Wayback Machine's
	# host name anywhere inside the URL
	if [[ "$SKIP_ARCHIVE_LINKS" -eq 1 && $URL == *web.archive.org* ]]; then
		valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have been asked not to check Wayback Machine links."
		SKIP_ARCHIVE_ORG=$((SKIP_ARCHIVE_ORG + 1))
		continue
	fi
	# Now we need to know if the URL is for a file or a web page. First step is to determine if the
	# URL ends in a suffix
	HAS_SUFFIX=0

	# If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
	CLEAN_URL=${URL%%\?*}

	# If the URL ends in something like "#section_15", strip everything from the '#' onward
	CLEAN_URL=${CLEAN_URL%%\#*}

	# The script's author reports that 'sed' cannot handle Unicode in their Bash shell, so skip any
	# URL with a non-ASCII character in it and make the user check it. The '*' wildcards are what
	# make this match such a character anywhere in the URL
	if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
		valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I cannot handle non-ASCII characters."
		SKIP_NON_ASCII=$((SKIP_NON_ASCII + 1))
		continue
	fi

	# Isolate the characters after the last period and after the last slash. When the URL has no
	# period (or no slash), the corresponding result is the whole cleaned URL
	POST_DOT=${CLEAN_URL##*.}
	POST_SLASH=${CLEAN_URL##*/}

	# If the last period comes after the last slash, then the URL ends in a suffix; in that case
	# the text after the period is shorter than the text after the slash
	POST_DOT_LENGTH=${#POST_DOT}
	POST_SLASH_LENGTH=${#POST_SLASH}
	if [[ "$POST_DOT_LENGTH" -lt "$POST_SLASH_LENGTH" ]]; then
		HAS_SUFFIX=1
	else
		HAS_SUFFIX=0
	fi
	795
	# Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against
	# the known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES. IS_FILE is -1 while
	# undecided, 0 for a page and 1 for a file
	IS_FILE=-1
	if [[ "$HAS_SUFFIX" -eq 0 ]]; then
		IS_FILE=0
	else
		# Turn off case sensitivity while we compare suffixes
		shopt -s nocasematch

		# Special case: URLs ending in something like "productID.304297400" are pages, not files, so
		# if the URL's suffix is all numbers, we are looking at the end of a web page URL
		if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
			IS_FILE=0
		fi

		# Special case: URLs ending in a parens, e.g. "ms537113(v=vs.85)", are pages
		if [[ $POST_DOT =~ ^.*[\(\)]$ ]]; then
			IS_FILE=0
		fi

		# Special case: URLs containing a '%', e.g. "10.1007%2FBF00329055", are pages. The wildcards
		# let the '%' sit anywhere in the suffix instead of requiring the suffix to be exactly "%"
		if [[ $POST_DOT == *%* ]]; then
			IS_FILE=0
		fi

		# If we did not identify this URL as a web page above, we need to compare the suffix against
		# known file extensions. Double brackets respect the nocasematch setting; the suffix is
		# quoted so that it is compared as plain text, not as a pattern
		if [[ "$IS_FILE" -eq -1 ]]; then
			for EXTENSION in "${HTTP_FILES[@]}"; do
				if [[ "$EXTENSION" == "$POST_DOT" ]]; then
					IS_FILE=1
					break
				fi
			done
		fi

		# If we did not identify this URL as a file above, we need to compare the suffix against
		# known pages and TLDs in order to rule out that this is an unknown file suffix or TLD that
		# the user needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
		if [[ "$IS_FILE" -eq -1 ]]; then
			for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
				if [[ "$PAGE_OR_TLD" == "$POST_DOT" ]]; then
					IS_FILE=0
					break
				fi
			done
		fi

		# Turn case sensitivity back on in Bash
		shopt -u nocasematch
	fi

	# If this suffix escaped identification as either a file, page or TLD, inform the user;
	# otherwise remember the link's type for the report and count it
	STR_TYPE=""
	if [[ "$IS_FILE" -eq -1 ]]; then
		valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown URL ending '$POST_DOT'. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
		SKIP_UNK_SUFFIX=$((SKIP_UNK_SUFFIX + 1))
		continue
	elif [[ "$IS_FILE" -eq 1 ]]; then
		STR_TYPE="file"
		FILE_LINKS=$((FILE_LINKS + 1))
	elif [[ "$IS_FILE" -eq 0 ]]; then
		STR_TYPE="page"
		PAGE_LINKS=$((PAGE_LINKS + 1))
	fi
	861
	# Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
	# issue with sites that require HTTPS. AGENT is double-quoted so that the variable's value is
	# sent as the user agent (single quotes would send the literal text "$AGENT"), and URL is
	# quoted so that characters such as '?' in it are never treated as a file name pattern
	CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time "$TIMEOUT" --write-out '%{http_code}\n' "$URL")
	CURL_ERR=$? # exit status of 'curl' itself, captured before any other command can replace it
	CURL_RESULT=$CURL_CODE

	# Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
	if [[ "$CURL_CODE" == "000" ]]; then
		CURL_RESULT="$CURL_RESULT-$CURL_ERR"
	fi
	# Begin to determine our status code for this URL (EI, IW, OK, RD, or NG)
	STATUS="??"
	NEW_URL=""
	INTERWIKI_INDEX=-1

	# First make sure that this isn't an "external internal" link to our own wiki that can be
	# replaced by "[[page_name]]". If it uses a special access URL beginning with "/w/", let it
	# pass, as it probably cannot be replaced by "[[ ]]" markup. The '*' wildcards make these
	# substring tests; the quoted variable in between is matched as literal text
	if [[ $URL == *"$WIKI_PATH"* && $URL != *"$WIKI_PATH/w/"* ]]; then
		STATUS="EI"
		EI_LINKS=$((EI_LINKS + 1))
	fi

	# If it's not, check if this is a link to a domain that we have an interwiki prefix for. The
	# index of the matching domain is kept so the prefix at the same index can be suggested later
	if [[ "$STATUS" == "??" ]]; then
		for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
			if [[ $URL == *"${INTERWIKI_DOMAINS[$i]}"* && $URL != *"${INTERWIKI_DOMAINS[$i]}/w/"* ]]; then
				STATUS="IW"
				IW_LINKS=$((IW_LINKS + 1))
				INTERWIKI_INDEX=$i
				break
			fi
		done
	fi
	# A link that is neither "EI" nor "IW" is good if its response code appears in the "OK" list
	if [[ "$STATUS" == "??" ]]; then
		for CODE in "${OK_CODES[@]}"; do
			[[ $CODE == $CURL_CODE ]] || continue
			STATUS="OK"
			OK_LINKS=$((OK_LINKS + 1))
			break
		done
	fi
	# If we didn't get a match with the "OK" codes, check it against the "RD" codes
	if [[ "$STATUS" == "??" ]]; then
		for CODE in "${RD_CODES[@]}"; do
			if [[ "$CODE" != "$CURL_CODE" ]]; then
				continue
			fi

			# Get URL header again in order to retrieve the URL we are being redirected to. AGENT is
			# double-quoted so that its value, not the literal text "$AGENT", is sent
			NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time "$TIMEOUT" --write-out '%{redirect_url}\n' "$URL")

			# Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
			# those changes out if the user didn't ask for them
			URL_HTTP=${URL/#https:/http:}
			NEW_URL_HTTP=${NEW_URL/#https:/http:}

			# Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
			NEW_URL_LENGTH=${#NEW_URL_HTTP}
			if [[ "$NEW_URL_LENGTH" -lt "$MIN_URL_LENGTH" ]]; then
				NEW_URL_HTTP="[new URL not retrieved]"
			fi

			# Remove slash at end of new URL, if present, so we can filter out the redirects that
			# merely add an ending slash if the user didn't ask for them
			NEW_URL_NO_SLASH=${NEW_URL_HTTP%/}

			# Detect if this is a youtu.be link simply being expanded by YouTube to the full
			# youtube.com address
			YOUTU_BE=0
			if [[ $URL_HTTP == http://youtu.be* && $NEW_URL_HTTP == http://www.youtube.com* ]]; then
				YOUTU_BE=1
			fi

			# The comparisons below quote both URLs so they are compared as plain text; unquoted,
			# the "[new URL not retrieved]" placeholder would be read as a file name pattern
			if [[ "$SHOW_HTTPS" -eq 0 && "$URL_HTTP" == "$NEW_URL_HTTP" ]]; then
				# If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless
				# user wants those to be reported)
				valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show http->https upgrades, and I was redirected to '$NEW_URL'."
				STATUS="OK"
				OK_LINKS=$((OK_LINKS + 1))
				SKIP_HTTPS_UP=$((SKIP_HTTPS_UP + 1))
			elif [[ "$SHOW_SLASH" -eq 0 && "$URL_HTTP" == "$NEW_URL_NO_SLASH" ]]; then
				# If the URLs match besides an added ending slash, then the link is OK (unless user
				# wants those to be reported)
				valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show added trailing slashes, and I was redirected to '$NEW_URL'."
				STATUS="OK"
				OK_LINKS=$((OK_LINKS + 1))
				SKIP_SLASH_ADD=$((SKIP_SLASH_ADD + 1))
			elif [[ "$SHOW_YT_RD" -eq 0 && "$YOUTU_BE" -eq 1 ]]; then
				# A youtu.be short link expanding to youtube.com is likewise OK unless the user asked
				# to see those redirects
				valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'."
				STATUS="OK"
				OK_LINKS=$((OK_LINKS + 1))
				SKIP_YOUTU_BE=$((SKIP_YOUTU_BE + 1))
			else
				# Any other redirect is reported to the reader as such
				STATUS="RD"
				RD_LINKS=$((RD_LINKS + 1))
			fi
			break
		done
	fi
	965
	# A link still unclassified after the "RD" check is compared against the "NG" codes
	if [[ "$STATUS" == "??" ]]; then
		for CODE in "${NG_CODES[@]}"; do
			[[ $CODE == $CURL_CODE ]] || continue
			STATUS="NG"
			NG_LINKS=$((NG_LINKS + 1))
			break
		done
	fi

	# A response code found in none of the lists cannot be classified, so advise the reader and
	# move on to the next link
	if [[ "$STATUS" == "??" ]]; then
		valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown response code $CURL_CODE."
		SKIP_UNK_CODE=$((SKIP_UNK_CODE + 1))
		continue
	fi
	# Check problem links against exceptions list before proceeding. EXCEPT_URL is the setting that
	# names the exceptions page; it is only read here. The URL taken from each exception line goes
	# in EXCEPT_LINK, so that the setting is never overwritten with per-line data
	FOUND_EXCEPT=0
	if [[ "$STATUS" != "OK" && -n "$EXCEPT_URL" ]]; then
		# The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
		EXPECT_CODE="$CURL_RESULT"
		if [[ "$STATUS" == "EI" || "$STATUS" == "IW" ]]; then
			EXPECT_CODE="$STATUS"
		fi

		# Look for link in exceptions list and make sure the listed result code and wiki page also
		# match. Each entry is read as "code,URL,page": the code is everything before the first
		# comma and the page everything after the last, so the URL in between may contain commas
		for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
			EXCEPT_LINE="${EXCEPT_ARRAY[$i]}"

			# Match URL
			EXCEPT_LINK="${EXCEPT_LINE#*,}"
			EXCEPT_LINK="${EXCEPT_LINK%,*}"
			if [[ "$EXCEPT_LINK" != "$URL" ]]; then
				continue
			fi

			# Match containing page's name (anything from the first space onward is dropped); a
			# page of "*" applies the exception to every page
			EXCEPT_PAGE="${EXCEPT_LINE##*,}"
			EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
			if [[ "$EXCEPT_PAGE" != "*" && "$EXCEPT_PAGE" != "$LOCAL_PAGE_PATH" ]]; then
				continue
			fi

			# Match result code
			EXCEPT_CODE=${EXCEPT_LINE%%,*}
			if [[ "$EXCEPT_CODE" != "$EXPECT_CODE" ]]; then
				continue
			fi

			valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because its expected result, '$EXPECT_CODE', is in the exceptions list."
			if [[ "$STATUS" == "EI" ]]; then
				SKIP_EXPECT_EI=$((SKIP_EXPECT_EI + 1))
			elif [[ "$STATUS" == "IW" ]]; then
				SKIP_EXPECT_IW=$((SKIP_EXPECT_IW + 1))
			else
				SKIP_EXPECT_NG=$((SKIP_EXPECT_NG + 1))
			fi
			FOUND_EXCEPT=1
			break
		done
	fi
	if [[ "$FOUND_EXCEPT" -eq 1 ]]; then
		continue
	fi
[1064]	1031
	# If appropriate, record this link to the log, with clickable URLs when possible
	if [[ "$STATUS" != "OK" || "$RECORD_OK_LINKS" -eq 1 ]]; then
		# Prepare 'curl' result in parentheses to print after status code, unless this is an "IW" or
		# "EI" link, in which case showing the status code doesn't make sense. Adjust spacing after
		# string to ensure TXT and RTF reports have aligned columns of results.
		CURL_STR_H=" ($CURL_RESULT)"
		CURL_STR_T="$CURL_STR_H"
		CURL_STR_R="$CURL_STR_H "
		if [[ "$STATUS" == "IW" || "$STATUS" == "EI" ]]; then
			CURL_STR_H=""
			CURL_STR_T=" "
			CURL_STR_R=" "
		fi

		# Record link and its wiki page in TXT, RTF, and HTML markup
		valPrint t "${STATUS}${CURL_STR_T} $STR_TYPE $URL"
		valPrint t " linked from $FULL_PAGE_PATH"
		valPrint r "${STATUS}${CURL_STR_R}${RTF_TABS}$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
		valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
		valPrint hn "<tr><td style=\"white-space:nowrap\">${STATUS}${CURL_STR_H}</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
		valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"

		# Place vertical space here since we won't be printing anything more about this link
		if [[ "$STATUS" == "OK" ]]; then
			valPrint tr ""
			valPrint hs ""
		fi

		# Record redirect URL if one was given by a 3xx response page
		if [[ "$STATUS" == "RD" ]]; then
			valPrint ts " Server suggests $NEW_URL"
			valPrint rs " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
			valPrint hs "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
		fi

		# Notify reader if we can use an intrawiki link for this URL. The page is what remains after
		# removing the scheme, the host and the first slash that follows the host
		if [[ "$STATUS" == "EI" ]]; then
			INTRA_PAGE=${URL#*://*/}
			valPrint ts " Just use [[$INTRA_PAGE]]"
			valPrint rs " Just use [[$INTRA_PAGE]]"
			valPrint hs "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
		fi

		# Notify reader if we can use an interwiki prefix for this URL. The page is everything
		# after the last slash of the URL
		if [[ "$STATUS" == "IW" ]]; then
			INTER_PAGE=${URL##*/}
			valPrint ts " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
			valPrint rs " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
			valPrint hs "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
		fi

		# Query Internet Archive for latest "OK" snapshot for "NG" page
		if [[ "$STATUS" == "NG" && "$SUGGEST_SNAPSHOTS" -eq 1 ]]; then
			ARCHIVE_QUERY=$(curl --silent --max-time "$TIMEOUT" "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")

			# If a "closest" snapshot was received (the tag can appear anywhere in the response)...
			if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
				# In case the URL has a shebang in it (like mega.nz links do), escape the '!' to
				# break it
				ARCHIVE_QUERY=$(printf '%s\n' "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')

				# ...isolate "url" property in the response that follows the "closest" tag
				SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
				SNAPSHOT_URL=${SNAPSHOT_URL#*\"url\": \"} # everything after '"url": "'
				SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'

				# Remove the port 80 part that IA often adds to the URL, as it's superfluous. Only a
				# ":80" that is followed by a slash or ends the URL is removed, so that other ports
				# which merely begin with "80" (e.g. ":8080") are left intact
				SNAPSHOT_URL=$(printf '%s\n' "$SNAPSHOT_URL" | sed -E 's#:80(/|$)#\1#')

				# Inform the user of the snapshot URL
				valPrint ts " IA suggests $SNAPSHOT_URL"
				valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
				valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
			else # ...otherwise give generic Wayback Machine link for this URL
				valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
				valPrint rs " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
				valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
			fi
		fi
	fi
	1108
	# If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
	if [[ "$IS_FILE" -eq 0 && "$TAKE_PAGE_SHOT" -eq 1 && "$STATUS" == "OK" ]]; then
		# Sanitize URL by removing "http(s)://" and converting any other colons or slashes to
		# underscores; the result names the screenshot file
		SHOT_NAME=$(printf '%s\n' "$URL" | sed 's/https*\:\/\///' | sed 'y/:\//__/')
		SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"

		# Don't take screenshot if we already encountered this page and screenshotted it
		if [[ -f "$SHOT_FILE" ]]; then
			valPrint trhs "Skipping screenshot of URL '$URL' because file '$SHOT_FILE' already exists."
		else
			# URL is quoted so that characters such as '?' in it reach Chrome untouched
			"$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 "$URL" > /dev/null 2>&1

			# The capture is looked for at $WORKING_DIR/$CHROME_SCREENSHOT; if it is there, move it
			# to its final name ('mv -n' never overwrites an existing file), else report the failure
			if [[ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]]; then
				mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
			else
				valPrint trhs "Screenshot of URL $URL seems to have failed!"
			fi
		fi
	fi
	1127	done
	1128	FINISHED_LIST="yes"
	1129	wrapupAndExit

Note: See TracBrowser for help on using the repository browser.

Download in other formats: