Context Navigation

source: Validate External Links/validate_external_links.sh@ 1121

Last change on this file since 1121 was 1120, checked in by iritscen, 5 years ago
Val's reports now print section headers for the init/config stage and for the link results themselves.
File size: 42.1 KB

Rev	Line
#!/bin/bash

# Validate External Links by Iritscen
# Provided with a list of external links found in the OniGalore wiki, this script validates them.
# The resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF
# (for reading as a local file with clickable links), and HTML (for uploading as a web page).
# Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
# Recommended rule:
# |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|

# Set separator token to newline. Unquoted expansions are then split on line breaks only, never on
# spaces or tabs (pathname expansion still applies to them, so quote anything that may hold * or ?)
IFS="
"

### GLOBALS ###
# Settings -- these will be changed from their defaults by the arguments passed in to the script
LINKS_URL=""        # use 'curl' to download file with links from this location (can be file://)
EXCEPT_URL=""       # ditto above for file with exceptions to NG results
OUTPUT_DIR=""       # place reports and all other output in a folder inside this existing folder
RECORD_OK_LINKS=0   # record response code to the log even when it's a value in OK_CODES
SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
TAKE_PAGE_SHOT=0    # take a screenshot of each OK page
CHROME_PATH=""      # path to a copy of Google Chrome that has the command-line screenshot feature
URL_START=1         # start at this URL in LINKS_FILE (1 by default)
URL_LIMIT=0         # if non-zero, stop at this URL in LINKS_FILE
UPLOAD_INFO=""      # path to a file on your hard drive with the login info needed to upload a report

# Fixed strings -- see the occurrences of these variables to learn their purpose
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36 OPR/67.0.3575.53"
ARCHIVE_API="http://archive.org/wayback/available"
ARCHIVE_GENERIC="https://web.archive.org/web/*"
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
CHROME_SCREENSHOT="screenshot.png"
CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt"
EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt"
MY_WIKI_PAGE="http://wiki.oni2.net/User:Iritscen"
# Quote both expansions so a script path containing glob characters or line breaks is handled, and
# only report a directory if the 'cd' actually succeeded
THIS_DIR=$(cd "$(dirname "$0")" && pwd)
WORKING_DIR=$(pwd)
WIKI_PATH="wiki.oni2.net"

# These are parallel arrays of the IDs and names of OniGalore's current namespaces
declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")

# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
declare -a HTTP_FILES=(txt zip wmv jpg png m4a bsl rar oni mp3 mov ONWC vbs TRMA mp4 doc avi log gif pdf dmg exe cpp tga 7z wav east BINA xml dll dae xaf fbx 3ds blend flv csv)
declare -a HTTP_TLDS_AND_PAGES=(com net org uk ru de it htm html php pl asp aspx shtml pgi cgi php3 x jsp phtml cfm css action stm js)

# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
# if you add a new code.
declare -a OK_CODES=(200 401 405 406 501)
declare -a RD_CODES=(301 302 303 307 308)
declare -a NG_CODES=(000 403 404 410 500 503)

# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
# transcluded text, and if the transclusion fails, then the braces show up in the URL
ILLEGAL_CHARS="{ }"

# The shortest URL possible, used for sanity-checking some URLs: http://a.co
MIN_URL_LENGTH=11

# These are parallel arrays giving the prefixes that can be used in place of normal external links to
# some wikis and other sites
declare -a INTERWIKI_PREFIXES=(commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies wikiversity wikivoyage wikt wp)
declare -a INTERWIKI_DOMAINS=(commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org wikiversity.org wikivoyage.org wiktionary.org wikipedia.org)

# Variables for keeping track of main loop progress and findings
LINK_NUM=0
EI_LINKS=0
IW_LINKS=0
OK_LINKS=0
RD_LINKS=0
NG_LINKS=0
SKIP_UNK_NS=0
SKIP_JS_PAGE=0
SKIP_BAD_URL=0
SKIP_NON_ASCII=0
SKIP_UNK_SUFFIX=0
SKIP_UNK_CODE=0
SKIP_EXPECT_NG=0
SKIP_EXPECT_EI=0
SKIP_EXPECT_IW=0
FILE_LINKS=0
PAGE_LINKS=0
SKIPPED_HEADER_ROW=0
FINISHED_LIST="no" # "no" until the run ends; set to "limit" when the --end-url limit is reached
START_RUN=0        # epoch seconds at which the main loop started (0 = not started yet)
END_RUN=0          # epoch seconds at which the summary was generated
	93
	94	### HELP ###
	95	# A pseudo-man page. Here is the 80-character rule for the page text:
	96	# 234567890123456789012345678901234567890123456789012345678901234567890123456789
	97	function printHelp()
	98	{
	99	cat << EOF
	100
	101	NAME
	102	Validate External Links
	103
	104	SYNOPSIS
	105	validate_external_links.sh --help
[1070]	106	validate_external_links.sh --links URL --output DIR [--exceptions URL]
[1075]	107	[--record-ok-links] [--suggest-snapshots] [--take-screenshots FILE]
[1070]	108	[--start-url NUM] [--end-url NUM] [--upload FILE]
[1064]	109
	110	DESCRIPTION
	111	This script parses a list of external links found in the OniGalore wiki
	112	(which is dumped by the Oni2.net domain periodically in a particular
	113	format), validates them using the Unix tool 'curl', and produces a report
[1070]	114	of which links were "OK" (responded positively to an HTTP query), which
	115	were "RD" (responded with a 3xx redirect code), which could be "IW"
	116	(interwiki) links, which are "EI" (external internal) links and could be
	117	intrawiki links, and which were "NG" (no good; a negative response to the
[1069]	118	query). This report can then be automatically uploaded to the location of
[1064]	119	your choice. The script can also suggest Internet Archive snapshots for
[1070]	120	"NG" links, and take screenshots of "OK" links for visual verification by
	121	the reader that the page in question is the one intended to be displayed.
[1064]	122
	123	You must pass this script the URL at which the list of links is found
[1070]	124	(--links) and the path where the directory of logs should be outputted
	125	(--output). All other arguments are optional.
[1064]	126
	127	OPTIONS
[1075]	128	--help Show this page.
	129	--links URL (required) URL from which to download the CSV
	130	file with external links. Note that this URL can
	131	be a local file if you supply a file:// path.
	132	--output DIR (required) Unix path to directory in which Val
	133	should place its reports.
	134	--exceptions URL In order to remove links from the report which
	135	Val finds an issue with, but which you regard as
	136	OK, list those desired exceptions in this file.
	137	See the sample file exceptions.txt for details.
	138	Note that this URL can point to a local file if
	139	you supply a file:// path.
	140	--record-ok-links Log a link in the report even if its response
	141	code is "OK".
	142	--suggest-snapshots Query the Internet Archive for a possible
	143	snapshot URL for each "NG" page.
	144	--take-screenshots FILE Call the Google Chrome binary at this path to
	145	take screenshots of each "OK" page.
	146	--start-url NUM Start at this link in the links CSV file.
	147	--end-url NUM Stop at this link in the links CSV file.
	148	--upload FILE Upload report using the credentials and path
	149	given in this local text file. See sftp_login.txt
	150	for template.
[1064]	151
	152	BUGS
	153	The script cannot properly parse any line in the external links file
	154	which contains a comma in the name of the wiki page containing a link.
	155	Commas in the link itself are not an issue.
	156	EOF
	157	}
	158
	159
	160	### SETUP ###
	161	# If first argument is a help request, or if nothing was passed in at all, print help page and quit
	162	if [ "$#" -eq 0 ] \|\| [ "$1" == "--help" ]; then
	163	printHelp \| less
	164	exit 0
	165	fi
	166
	167	# Parse arguments as long as there are more arguments to process
	168	while (( "$#" )); do
	169	case "$1" in
[1070]	170	--links ) LINKS_URL="$2"; shift 2;;
	171	--exceptions ) EXCEPT_URL="$2"; shift 2;;
	172	--output ) OUTPUT_DIR="$2"; shift 2;;
	173	--record-ok-links ) RECORD_OK_LINKS=1; shift;;
	174	--suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;;
	175	--take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
	176	--start-url ) URL_START=$2; shift 2;;
	177	--end-url ) URL_LIMIT=$2; shift 2;;
	178	--upload ) UPLOAD_INFO=$2; shift 2;;
[1064]	179	* ) echo "Invalid argument $1 detected. Aborting."; exit 1;;
	180	esac
	181	done
	182
	183	# If the required arguments were not supplied, print help page and quit
	184	if [ -z $LINKS_URL ] \|\| [ -z $OUTPUT_DIR ]; then
[1070]	185	echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
[1064]	186	exit 2
	187	fi
	188
[1070]	189	# If user wants screenshots, make sure path to Chrome was passed in and is valid
	190	if [ $TAKE_PAGE_SHOT -eq 1 ]; then
	191	if [ ! -f "$CHROME_PATH" ]; then
	192	echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
	193	exit 3
	194	fi
	195	fi
	196
[1064]	197	# Check that UPLOAD_INFO exists, if this argument was supplied
	198	if [ ! -z $UPLOAD_INFO ] && [ ! -f "$UPLOAD_INFO" ]; then
	199	echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
[1070]	200	exit 4
[1064]	201	fi
	202
	203	# Check that OUTPUT_DIR is a directory
	204	if [ ! -d "$OUTPUT_DIR" ]; then
	205	echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
[1070]	206	exit 5
[1064]	207	fi
	208
	209	# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
	210	SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
	211	NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
	212	OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
	213	OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
	214	SHOT_PATH="$OUTPUT_PATH/Screenshots"
	215	LOG_NAME="ValExtLinks report"
	216	LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt"
	217	LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf"
	218	LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm"
	219	mkdir "$OUTPUT_PATH"
	220	if [ $TAKE_PAGE_SHOT -eq 1 ]; then
	221	mkdir "$SHOT_PATH"
	222	fi
	223
	224	# Check that 'mkdir' succeeded
	225	if [ ! -d "$OUTPUT_PATH" ]; then
	226	echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_PATH. Aborting."
[1070]	227	exit 6
[1064]	228	fi
	229
	230	# Get date on the file at LINKS_URL and print to log
	231	LINKS_DATE=$(curl --silent --head $LINKS_URL \| grep "Last-Modified")
	232	if [ -z "$LINKS_DATE" ]; then
	233	echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
[1070]	234	exit 7
[1064]	235	fi
	236	LINKS_DATE=${LINKS_DATE#Last-Modified: }
	237
	238
	239	### UTILITY FUNCTIONS ###
	240	# Writes a plain-text header to TXT log file
	241	function printTXTheader()
	242	{
	243	valPrint t "Validate External Links report"
	244	valPrint t "generated $NICE_TIME"
	245	valPrint t "from data of $LINKS_DATE"
	246	valPrint t "script by Iritscen (contact: $MY_WIKI_PAGE)"
	247	valPrint t ""
	248	}
	249
	250	# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified
	251	function printRTFheader()
	252	{
	253	valPrint r "{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
	254	{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
	255	{\colortbl;\red255\green255\blue255;}
	256	{\*\expandedcolortbl;;}
	257	\margl1440\margr1440\vieww12600\viewh12100\viewkind0
	258	\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0
	259
	260	\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
	261	generated $NICE_TIME\\
	262	from data of $LINKS_DATE\\
	263	script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$MY_WIKI_PAGE\"}}{\fldrslt contact}})
	264	\\
	265	\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
	266	\cf0 "
	267	}
	268
	269	# Closes the RTF markup of the RTF log file
	270	function printRTFfooter()
	271	{
	272	valPrint r "}"
	273	}
	274
	275	# Writes the HTML header to HTML log file
	276	function printHTMheader()
	277	{
	278	valPrint h "<html>
	279	<head>
	280	<title>Validate External Links report</title>
	281	</head>
	282	<body>
	283	<h2>Validate External Links report</h2>
	284	<h3>generated $NICE_TIME<br />
	285	from data of $LINKS_DATE<br />
	286	script by Iritscen (<a href=\"$MY_WIKI_PAGE\" target=\"_blank\">contact</a>)</h3>"
	287	}
	288
	289	# Closes the HTML markup of the HTML log file
	290	function printHTMfooter()
	291	{
	292	valPrint h "</body>
	293	</html>"
	294	}
	295
	296	# The central logging function. The first parameter is a string composed of one or more characters that
[1070]	297	# indicate which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
[1119]	298	# 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 's' means "Print an extra newline at the end." 'w' means "Don't pass console output through 'fmt'" ("fmt" fits the output
	299	# to an 80-column CLI but can break special formatting and the 'n' option).
[1064]	300	function valPrint()
	301	{
	302	if [[ "$1" == c ]]; then
	303	if [[ "$1" == n ]]; then
	304	echo -n "$2"
	305	elif [[ "$1" == w ]]; then
	306	echo "$2"
[1119]	307	elif [[ "$1" == s ]]; then
	308	echo -e "$2\n"
[1064]	309	else
	310	echo "$2" \| fmt -w 80
	311	fi
	312	fi
	313	if [[ "$1" == t ]]; then
	314	if [[ "$1" == n ]]; then
	315	echo -n "$2" >> "$LOG_TXT"
[1119]	316	elif [[ "$1" == s ]]; then
	317	echo -e "$2\n" >> "$LOG_TXT"
[1064]	318	else
	319	echo "$2" >> "$LOG_TXT"
	320	fi
	321	fi
	322	if [[ "$1" == r ]]; then
	323	if [[ "$1" == n ]]; then
	324	echo "$2" >> "$LOG_RTF"
[1119]	325	elif [[ "$1" == s ]]; then
	326	echo "$2\line\line" >> "$LOG_RTF"
[1064]	327	else
[1119]	328	echo "$2\line" >> "$LOG_RTF"
[1064]	329	fi
	330	fi
	331	if [[ "$1" == h ]]; then
[1119]	332	if [[ "$1" == s ]]; then
	333	echo "$2<tr><td> </td></tr>" >> "$LOG_HTM"
	334	elif [[ "$1" == n ]]; then
[1064]	335	echo "$2" >> "$LOG_HTM"
	336	else
	337	echo "$2<br />" >> "$LOG_HTM"
	338	fi
	339	fi
	340	}
	341
	342	# Pluralize the string in parameter 1 if the number in parameter 2 is not 1
	343	function pluralCheckNoun()
	344	{
	345	if [ $2 -ne 1 ]; then
	346	if [[ $1 =~ x$ ]]; then
	347	echo $1es
	348	else
	349	echo $1s
	350	fi
	351	else
	352	echo $1
	353	fi
	354	}
	355
# Prints "is" if the number in parameter 1 is 1, otherwise "are"
function pluralCheckIs()
{
  local COUNT="$1"

  if [ "$COUNT" -ne 1 ]; then
    echo "are"
  else
    echo "is"
  fi
}
	365
# Prints "was" if the number in parameter 1 is 1, otherwise "were"
function pluralCheckWas()
{
  local COUNT="$1"

  if [ "$COUNT" -ne 1 ]; then
    echo "were"
  else
    echo "was"
  fi
}
	375
# Prints "a " (with the trailing space) if the number in parameter 1 is 1, otherwise prints nothing
function pluralCheckA()
{
  local COUNT="$1"

  if [ "$COUNT" -eq 1 ]; then
    echo "a "
  fi
}
	383
	384	# Output "an " if parameter 1 is 1, otherwise nothing
	385	function pluralCheckAn()
	386	{
	387	if [ $1 -eq 1 ]; then
	388	echo "an "
	389	fi
	390	}
	391
# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
# reports being saved to disk have already been closed.
# Reads the user name, password, port and remote path from the file at UPLOAD_INFO, where each is
# expected on its own line after the marker "user:", "pw:", "port:" or "path:", then hands them to
# the 'expect' script at THIS_DIR/EXPECT_SCRIPT_NAME along with the local path and the name of the
# HTML report.
# NOTE(review): the password is passed to 'expect' as a command-line argument, so it is visible to
# other local users in the process list for the duration of the upload.
function uploadReport()
{
  valPrint c "Uploading HTML report..."

  SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
  SFTP_USER_NAME_MARKER="user:"
  SFTP_PASSWORD_MARKER="pw:"
  SFTP_PORT_MARKER="port:"
  SFTP_PATH_MARKER="path:"

  # Anchor each search to the start of the line and take only the first hit, so that a value which
  # happens to contain another marker (say, a password containing "path:") cannot be picked up as
  # that other field. The marker is then stripped from the front of the matching line.
  SFTP_USER_NAME=$(grep -m 1 -- "^$SFTP_USER_NAME_MARKER" "$UPLOAD_INFO")
  SFTP_USER_NAME=${SFTP_USER_NAME#"$SFTP_USER_NAME_MARKER"}
  SFTP_PASSWORD=$(grep -m 1 -- "^$SFTP_PASSWORD_MARKER" "$UPLOAD_INFO")
  SFTP_PASSWORD=${SFTP_PASSWORD#"$SFTP_PASSWORD_MARKER"}
  SFTP_PORT=$(grep -m 1 -- "^$SFTP_PORT_MARKER" "$UPLOAD_INFO")
  SFTP_PORT=${SFTP_PORT#"$SFTP_PORT_MARKER"}
  SFTP_PATH=$(grep -m 1 -- "^$SFTP_PATH_MARKER" "$UPLOAD_INFO")
  SFTP_PATH=${SFTP_PATH#"$SFTP_PATH_MARKER"}

  # Every argument is quoted: an unquoted value containing * or ? would be expanded against the
  # files in the current directory, and an empty value would vanish and shift the later arguments
  expect "$SCRIPT_PATH" "$LOG_HTM" "$SFTP_USER_NAME" "$SFTP_PASSWORD" "$SFTP_PORT" "$SFTP_PATH" "$LOG_NAME.htm"

  valPrint c "Report was uploaded, unless an error message appears above."
}
	416
	417	# Prints session summary when script is done
	418	function wrapupAndExit()
	419	{
	420	# Get off progress line on console, drop down a line from last link in log, and close HTML table
	421	valPrint ctr ""
	422	valPrint h "</table><br />"
	423
	424	# If we didn't finish processing the last URL, then the iterator is one too high
	425	if [ $FINISHED_LIST != "yes" ]; then
	426	let LINK_NUM-=1
	427	if [ $FINISHED_LIST == "no" ]; then
	428	valPrint ctrh "The session was canceled by the user."
	429	fi
	430	fi
	431
[1118]	432	# Generate string with elapsed time
	433	END_RUN=$(date +%s)
	434	ELAPSED=$(echo $(($END_RUN - $START_RUN)) \| awk '{printf "%d min. %d sec. elapsed", int($1/60), int($1%60)}')
	435
[1064]	436	# Output results of session and close the log file's markup
	437	LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
	438	LINKS_SKIPPED=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
	439	LINKS_CHECKED=$((LINKS_PROCESSED-LINKS_SKIPPED))
[1118]	440	valPrint ct "Summary ($ELAPSED):"
	441	valPrint r "\b1 Summary \b0 ($ELAPSED)"
	442	valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
[1064]	443	valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT)."
	444	valPrint ctrh "I skipped $LINKS_SKIPPED $(pluralCheckNoun link $LINKS_SKIPPED), and found $FILE_LINKS $(pluralCheckNoun file $FILE_LINKS) and $PAGE_LINKS $(pluralCheckNoun page $PAGE_LINKS)."
	445	if [ $LINKS_SKIPPED -gt 0 ]; then valPrint ctrh "Skip breakdown: "; fi
	446	if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
[1070]	447	if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
[1064]	448	if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
	449	if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
	450	if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
	451	if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
[1070]	452	valPrint ctrh "Out of the $LINKS_CHECKED links checked, $EI_LINKS could be $(pluralCheckAn $EI_LINKS)intrawiki $(pluralCheckNoun link $EI_LINKS), $IW_LINKS could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS), $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG."
	453	if [ $SKIP_EXPECT_NG -gt 0 ]; then
	454	valPrint ctrh "$SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS) went unlisted due to being found in the exceptions file."
[1064]	455	fi
[1070]	456	if [ $SKIP_EXPECT_EI -gt 0 ]; then
	457	valPrint ctrh "$SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS) went unlisted due to being found in the exceptions file."
	458	fi
	459	if [ $SKIP_EXPECT_IW -gt 0 ]; then
	460	valPrint ctrh "$SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS) went unlisted due to being found in the exceptions file."
	461	fi
	462	valPrint trh "ValExtLinks says goodbye."
[1064]	463	printRTFfooter
	464	printHTMfooter
	465
	466	# Upload report if this was requested
	467	if [ ! -z $UPLOAD_INFO ] && [ $FINISHED_LIST != "no" ]; then
	468	uploadReport
	469	fi
	470
	471	# Really quit now
	472	valPrint c "ValExtLinks says goodbye."
	473	exit 0
	474	}
	475	trap wrapupAndExit INT
	476
	477
	478	### INITIALIZATION ###
	479	# Print opening message to console and log files
	480	valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
	481	printTXTheader
	482	printRTFheader
	483	printHTMheader
	484
	485	# Attempt to download file at LINKS_URL, then check that it succeeded
[1120]	486	valPrint t "Config:"
	487	valPrint r "\b1 Config \b0"
	488	valPrint hn "<h3>Config</h3>"
[1069]	489	valPrint cwtrh "Downloading list of external links from $LINKS_URL."
[1064]	490	LINKS_FILE_NAME=$(echo "$LINKS_URL" \| sed 's/.*\///')
	491	LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
	492	curl --silent -o "$LINKS_FILE" $LINKS_URL
	493	if [ ! -f "$LINKS_FILE" ]; then
	494	echo "The download of $LINKS_URL appears to have failed. Aborting."
	495	wrapupAndExit
	496	fi
	497
	498	# Attempt to download file at EXCEPT_URL, then check that it succeeded
	499	if [ ! -z $EXCEPT_URL ]; then
[1070]	500	valPrint cwtrh "Downloading list of reporting exceptions from $EXCEPT_URL."
[1064]	501	EXCEPT_FILE_NAME=$(echo "$EXCEPT_URL" \| sed 's/.*\///')
	502	EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
	503	curl --silent -o "$EXCEPT_FILE" $EXCEPT_URL
	504	if [ ! -f "$EXCEPT_FILE" ]; then
	505	echo "The download of $EXCEPT_URL appears to have failed. Aborting."
	506	wrapupAndExit
	507	fi
	508	fi
	509
	510	# Pipe 'cat' to 'wc' because passing LINKS_FILE to 'wc' would print the file name after the line count
	511	LINK_COUNT_STRING=$(cat "$LINKS_FILE" \| wc -l)
	512
	513	# Number of URLs is number of lines minus one (first line is column header row for the CSV)
	514	LINK_COUNT=$(echo "${LINK_COUNT_STRING}" \| tr -d '[:space:]')
	515	let LINK_COUNT-=1
	516
	517	# Calculate number of URLs to consider
	518	if [ $URL_LIMIT -ne 0 ] && [ $URL_LIMIT -lt $LINK_COUNT ]; then
	519	valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((URL_LIMIT-URL_START+1)) of them, from link $URL_START to $URL_LIMIT."
	520	elif [ $URL_START -ne 1 ]; then
	521	valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((LINK_COUNT-URL_START+1)) of them, from link $URL_START to $LINK_COUNT."
	522	else
	523	valPrint ctrh "Found $LINK_COUNT links to process. I will be considering all of them."
	524	fi
	525
	526	# Print settings to console and log
[1070]	527	declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not report links that are listed in the exceptions file.")
[1064]	528	if [ $TAKE_PAGE_SHOT -eq 0 ]; then SETTINGS_MSG[10]="but will not"; fi
	529	if [ $RECORD_OK_LINKS -eq 0 ]; then SETTINGS_MSG[22]="not"; fi
	530	if [ $SUGGEST_SNAPSHOTS -eq 0 ]; then SETTINGS_MSG[26]="will not"; fi
	531	if [ -z $EXCEPT_URL ] \|\| [ $RECORD_OK_LINKS -eq 1 ]; then SETTINGS_MSG[40]=""; fi
	532	SETTINGS_STR=${SETTINGS_MSG[@]}
	533	valPrint ctrh "$SETTINGS_STR"
	534	valPrint tr "A summary of my findings will be found at the bottom of the report."
	535	valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
	536	valPrint trh ""
	537
	538	# Print legend to logs
	539	valPrint t "Legend:"
	540	valPrint r "\b1 Legend \b0"
	541	valPrint hn "<h3>Legend</h3>"
	542	valPrint trh "OK = URL seems to be working."
[1067]	543	valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it, because false negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to Iritscen. An NG link should be followed by a link to the Internet Archive's Wayback Machine which may help you repair the link. If the link cannot be repaired, you can disable it on the wiki (which prevents it from showing up in future ValExtLinks reports) by wrapping it in nowiki tags."
	544	valPrint trh "RD = The server responding to this URL is saying that the page moved and you should instead use the supplied new URL. Some RD links represent minor adjustments in the organization of a web site, and some are soft 404s (the file/page has been removed and you are being redirected to something like the main page of the web site). You will have to look at the new URL yourself to determine if it represents an OK link and the link on the wiki should be updated to this one, or if the desired file/page is actually gone and we need to replace the wiki link with an Internet Archive snapshot link -- or disable the URL if it has not been archived."
[1070]	545	valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup."
	546	valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup."
[1064]	547	valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $HTTP_CODES)."
	548	valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$HTTP_CODES\"}}{\fldrslt here}} for code reference)."
	549	valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$HTTP_CODES\" target=\"_blank\">here</a> for code reference)."
	550	valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $CURL_CODES)."
	551	valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$CURL_CODES\"}}{\fldrslt here}} for code reference)."
	552	valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$CURL_CODES\" target=\"_blank\">here</a> for code reference)."
	553	valPrint trh "IA suggests = Last available snapshot suggested by the Internet Archive."
[1070]	554	valPrint trh "Try browsing = The Archive occasionally fails to return a snapshot URL even when one exists, so you will need to check for a snapshot manually using this link to the Wayback Machine before concluding that a site has not been archived."
[1064]	555	valPrint trh ""
	556
	557
	558	### MAIN LOOP ###
[1120]	559	valPrint t "Links:"
	560	valPrint r "\b1 Links \b0"
	561	valPrint hn "<h3>Links</h3>"
[1118]	562	START_RUN=$(date +%s)
[1064]	563	# Process each line of the .csv in LINKS_FILE
	564	for LINE in `cat "$LINKS_FILE"`; do
	565	let LINK_NUM+=1
	566
	567	# First line is the column header row for the CSV, so let's verify that the format hasn't changed
	568	if [ $SKIPPED_HEADER_ROW -eq 0 ]; then
	569	if [ $LINE == "namespace,title,target" ]; then
	570	SKIPPED_HEADER_ROW=1
	571	LINK_NUM=0 # this line is not a link, so reset the link counter
	572	valPrint hn "<table>"
	573	continue
	574	else
	575	valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
	576	wrapupAndExit
	577	fi
	578	fi
	579
	580	# Skip this link if we are not at URL_START yet
	581	if [ $LINK_NUM -lt $URL_START ]; then
	582	continue
	583	fi
	584
	585	# Stop if we are at the limit declared for testing purposes
	586	if [ $URL_LIMIT -gt 0 ] && [ $LINK_NUM -gt $URL_LIMIT ]; then
	587	FINISHED_LIST="limit"
	588	wrapupAndExit
	589	fi
	590
	591	# Print progress to screen
	592	if [ $LINK_NUM -gt 1 ]; then
	593	printf "\e[1A\n" # erase previous progress message so that new one appears in its place
	594	fi
	595	valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
	596
	597	# The number of the namespace is the element before the first comma on the line
	598	NS_ID=${LINE%%,*}
	599
	600	# Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
	601	NS_NAME=""
	602	a=0
[1069]	603	while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
[1118]	604	if [ $NS_ID == "NULL" ]; then
	605	break
	606	elif [ $NS_ID -eq ${NS_IDS[$a]} ]; then
[1064]	607	NS_NAME="${NS_NAMES[$a]}"
	608	break
	609	fi
	610	let a+=1
	611	done
[1118]	612	if [ "$NS_NAME" == "" ]; then
	613	if [ $NS_ID == "NULL" ]; then
[1119]	614	valPrint trs "Skipping URL on line $LINK_NUM because the namespace (and probably the page too) is \"NULL\". Probably the link is no longer in existence on the wiki."
[1118]	615	else
[1119]	616	valPrint trs "Skipping URL on line $LINK_NUM found on page $PAGE_NAME because I could not find a name for namespace ID $NS_ID."
[1118]	617	fi
[1064]	618	let SKIP_UNK_NS+=1
	619	continue
	620	fi
	621
	622	# The name of the page is everything between the namespace ID and the next comma on the line (commas
	623	# in page names will break this)
	624	PAGE_NAME=${LINE#$NS_ID,}
	625	PAGE_NAME=${PAGE_NAME%%,*}
	626
	627	# We don't want to consider wiki pages ending in .js, as the parser cannot reliably isolate URLS in
	628	# JavaScript code, so it will return erroneous links
	629	PAGE_NAME_SUFFIX=$(echo $PAGE_NAME \| sed 's/.*\.//')
	630	if [ $PAGE_NAME_SUFFIX == "js" ]; then
[1119]	631	valPrint trs "Skipping URL on line $LINK_NUM because it was found on JavaScript page $PAGE_NAME."
[1064]	632	let SKIP_JS_PAGE+=1
	633	continue
	634	fi
	635
[1070]	636	# Build longer wiki page URLs from namespace and page names
	637	FULL_PAGE_PATH=http://$WIKI_PATH/$NS_NAME:$PAGE_NAME
	638	LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
	639	# Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it
	640	# explicitly breaks the link
	641	if [ $NS_ID -eq 0 ]; then
	642	FULL_PAGE_PATH=http://$WIKI_PATH/$PAGE_NAME
	643	LOCAL_PAGE_PATH=$PAGE_NAME
	644	fi
	645
[1064]	646	# The URL being linked to is everything after the previous two fields (this allows commas to be in
	647	# the URLs, but a comma in the previous field, the page name, will break this)
	648	URL=${LINE#$NS_ID,$PAGE_NAME,}
	649
	650	# Scan for illegal characters
	651	if [[ $URL == [$ILLEGAL_CHARS] ]]; then
[1119]	652	valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because it contains characters illegal in a URL."
[1064]	653	let SKIP_BAD_URL+=1
	654	continue
	655	fi
	656
	# Now we need to know if the URL is for a file or a web page. First step is to determine if the
	# URL ends in a suffix
	HAS_SUFFIX=0

	# If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
	CLEAN_URL=${URL%%\?*}

	# If the URL ends in something like "#section_15", strip everything from the '#' onward
	CLEAN_URL=${CLEAN_URL%%\#*}

	# URLs containing any non-ASCII character are not processed; skip them so the user can check
	# them by hand. The '*' wildcards make this match a non-ASCII character at any position.
	if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
		valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because I cannot handle non-ASCII characters."
		let SKIP_NON_ASCII+=1
		continue
	fi

	# Isolate the characters after the last period and after the last slash. If the URL has no
	# period (or no slash), the corresponding value is the whole cleaned URL.
	POST_DOT=${CLEAN_URL##*.}
	POST_SLASH=${CLEAN_URL##*/}

	# If the last period comes after the last slash, then the text after the period is shorter than
	# the text after the slash, and the URL ends in a suffix
	POST_DOT_LENGTH=${#POST_DOT}
	POST_SLASH_LENGTH=${#POST_SLASH}
	if [ $POST_DOT_LENGTH -lt $POST_SLASH_LENGTH ]; then
		HAS_SUFFIX=1
	else
		HAS_SUFFIX=0
	fi
	686
	# Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
	# known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES.
	# IS_FILE ends up as 1 (file), 0 (page) or -1 (suffix not recognized).
	IS_FILE=-1
	if [ $HAS_SUFFIX -eq 0 ]; then
		# No suffix at all, so this is treated as a web page
		IS_FILE=0
	else
		# Turn off case sensitivity while we compare suffixes
		shopt -s nocasematch

		# Special case: URLs ending in something like "/productID.304297400" are pages, not files, so if
		# the URL's suffix is all numbers, we are looking at the end of a web page URL
		if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
			IS_FILE=0
		fi

		# If we did not identify this URL as a web page above, we need to compare the suffix against known
		# file extensions. The suffix is quoted so that it is compared as literal text and never
		# interpreted as a glob pattern; double brackets still respect the nocasematch setting.
		if [ $IS_FILE -eq -1 ]; then
			for EXTENSION in "${HTTP_FILES[@]}"; do
				if [[ $EXTENSION == "$POST_DOT" ]]; then
					IS_FILE=1
					break
				fi
			done
		fi

		# If we did not identify this URL as a file above, we need to compare the suffix against known
		# pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
		# needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
		if [ $IS_FILE -eq -1 ]; then
			for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
				if [[ $PAGE_OR_TLD == "$POST_DOT" ]]; then
					IS_FILE=0
					break
				fi
			done
		fi

		# Turn case sensitivity back on in Bash
		shopt -u nocasematch
	fi
	728
	729	# If this suffix escaped identification as either a file, page or TLD, inform the user
	730	STR_TYPE=""
	731	if [ $IS_FILE -eq -1 ]; then
[1119]	732	valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown URL ending $POST_DOT. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
[1064]	733	let SKIP_UNK_SUFFIX+=1
	734	continue
	735	elif [ $IS_FILE -eq 1 ]; then
	736	STR_TYPE="file"
	737	let FILE_LINKS+=1
	738	elif [ $IS_FILE -eq 0 ]; then
	739	STR_TYPE="page"
	740	let PAGE_LINKS+=1
	741	fi
	742
	# Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
	# issue with sites that require HTTPS. Only the headers are requested (--head) and the body is
	# discarded, so CURL_CODE receives just the HTTP status printed by --write-out.
	# AGENT (set outside this section) must be in double quotes so that its value, not the literal
	# text "$AGENT", is sent as the user agent. The URL is quoted so the shell passes it through
	# untouched even if it contains glob characters such as '?' or '*'.
	CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{http_code}\n' "$URL")
	# Exit status of the 'curl' command that ran inside the command substitution above
	CURL_ERR=$?
	CURL_RESULT=$CURL_CODE

	# Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
	if [[ $CURL_CODE == "000" ]]; then
		CURL_RESULT="$CURL_RESULT-$CURL_ERR"
	fi
	753
[1070]	754	# Begin to determine our status code for this URL (EI, IW, OK, RD, or NG)
[1064]	755	STATUS="??"
[1067]	756	NEW_URL=""
[1064]	757	INTERWIKI_INDEX=-1
	758
[1070]	759	# First make sure that this isn't an "external internal" link to our own wiki that can be replaced
	760	# by "[[page_name]]". If it uses a special access URL beginning with "/w/", let it pass, as it
	761	# probably cannot be replaced by "[[ ]]" markup
	# The '*' wildcards turn these into substring tests: the URL must contain the wiki's path but
	# must not contain that path followed by "/w/". Without them the URL would have to be identical
	# to WIKI_PATH, and the "/w/" exclusion could never apply. The variable parts are quoted so they
	# are matched as literal text.
	if [[ $URL == *"$WIKI_PATH"* ]] && [[ $URL != *"$WIKI_PATH/w/"* ]]; then
		STATUS="EI"
		let EI_LINKS+=1
	fi

	# If it's not, check if this is a link to a domain that we have an interwiki prefix for. The
	# index of the matching domain is saved in INTERWIKI_INDEX.
	if [ $STATUS == "??" ]; then
		for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
			if [[ $URL == *"${INTERWIKI_DOMAINS[$i]}"* ]] && [[ $URL != *"${INTERWIKI_DOMAINS[$i]}/w/"* ]]; then
				STATUS="IW"
				let IW_LINKS+=1
				INTERWIKI_INDEX=$i
				break
			fi
		done
	fi
	778
[1069]	779	# If we didn't match an interwiki domain, see if the status code is in our "OK" codes list
	780	if [ $STATUS == "??" ]; then
	781	for CODE in "${OK_CODES[@]}"; do
	782	if [[ $CODE == $CURL_CODE ]]; then
	783	STATUS="OK"
	784	let OK_LINKS+=1
	785	break
	786	fi
	787	done
	788	fi
	789
# If we didn't get a match with the "OK" codes, check it against the "RD" codes. A redirect that
# only upgrades http:// to https:// and/or adds a trailing '/' is counted as "OK"; any other
# redirect is counted as "RD" and its target is kept in NEW_URL.
if [ $STATUS == "??" ]; then
	for CODE in "${RD_CODES[@]}"; do
		if [[ $CODE == "$CURL_CODE" ]]; then
			# Get URL header again in order to retrieve the URL we are being redirected to. AGENT must
			# be in double quotes so its value, not the literal text "$AGENT", is sent.
			NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{redirect_url}\n' "$URL")

			# Filter out cases where the redirect URL is just the original URL with https:// instead of
			# http://, or with an added '/' at the end. These corrections happen a lot and are not
			# important to us.
			URL_NO_PROTOCOL=${URL#http://}
			URL_NO_PROTOCOL=${URL_NO_PROTOCOL%/}
			NEW_URL_NO_PROTOCOL=${NEW_URL#https://}
			NEW_URL_NO_PROTOCOL=${NEW_URL_NO_PROTOCOL%/}

			# Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
			NEW_URL_LENGTH=${#NEW_URL_NO_PROTOCOL}
			if [ $NEW_URL_LENGTH -lt $MIN_URL_LENGTH ]; then
				NEW_URL_NO_PROTOCOL="[new URL not retrieved]"
			fi

			# If the URLs match after the above filters were applied, then the link is OK. Both sides
			# are quoted so they are compared as plain text (the placeholder above would otherwise be
			# read as a bracket glob).
			if [[ "$URL_NO_PROTOCOL" == "$NEW_URL_NO_PROTOCOL" ]]; then
				STATUS="OK"
				let OK_LINKS+=1
			else
				STATUS="RD"
				let RD_LINKS+=1
			fi
			break
		fi
	done
fi
	823
	824	# If we didn't get a match with the "RD" codes, check it against the "NG" codes
	825	if [ $STATUS == "??" ]; then
[1064]	826	for CODE in "${NG_CODES[@]}"; do
	827	if [[ $CODE == $CURL_CODE ]]; then
	828	STATUS="NG"
	829	let NG_LINKS+=1
	830	break
	831	fi
	832	done
	833	fi
	834
	835	# If we didn't match a known status code, advise the reader
	836	if [ $STATUS == "??" ]; then
[1119]	837	valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown return code $CURL_CODE."
[1064]	838	let SKIP_UNK_CODE+=1
	839	continue
	840	fi
	841
[1070]	842	# Check problem links against exceptions file before proceeding
	# Only consult the exceptions file for links that are not "OK", and only when an exceptions URL
	# was supplied. A link is skipped when its first matching line names the same result code and
	# either this wiki page or "*".
	if [[ $STATUS != "OK" && -n $EXCEPT_URL ]]; then
		# The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
		EXPECT_CODE="$CURL_RESULT"
		if [ $STATUS == "EI" ]; then
			EXPECT_CODE="EI"
		elif [ $STATUS == "IW" ]; then
			EXPECT_CODE="IW"
		fi

		# Look for link in exceptions file and make sure its listed result code and wiki page also match.
		# -F makes 'grep' search for the URL as literal text; otherwise characters such as '.' and '['
		# in the URL would be interpreted as regular-expression syntax. The result code is the text
		# before the first comma of the matching line and the page is the text after the last comma.
		GREP_RESULT=$(grep --max-count=1 -F -- "$URL" "$EXCEPT_FILE")
		EXCEPT_PAGE=${GREP_RESULT##*,}
		if [[ "$EXCEPT_PAGE" == "*" || "$EXCEPT_PAGE" == "$LOCAL_PAGE_PATH" ]]; then
			EXCEPT_CODE=${GREP_RESULT%%,*}
			if [[ "$EXCEPT_CODE" == "$EXPECT_CODE" ]]; then
				valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because its expected result, $EXPECT_CODE, is listed in the exceptions file."
				if [ $STATUS == "EI" ]; then
					let SKIP_EXPECT_EI+=1
				elif [ $STATUS == "IW" ]; then
					let SKIP_EXPECT_IW+=1
				else
					let SKIP_EXPECT_NG+=1
				fi
				continue
			fi
		fi
	fi
	870
	871	# If appropriate, record this link to the log, with clickable URLs when possible
	872	if [ $STATUS != "OK" ] \|\| [ $RECORD_OK_LINKS -eq 1 ]; then
[1070]	873	# Stupid hack since the strings "IW" and "EI" are narrower than "OK", "RD", or "NG" and it takes
	874	# an extra tab to get to the desired level of indentation in the RTF log
[1064]	875	RTF_TABS=" "
[1070]	876	if [ $STATUS == "IW" ] \|\| [ $STATUS == "EI" ]; then
[1064]	877	RTF_TABS=" "
	878	fi
	879
	880	# Record link and its wiki page in TXT, RTF, and HTML markup
	881	valPrint t "$STATUS ($CURL_RESULT) $STR_TYPE $URL"
	882	valPrint t " linked from $FULL_PAGE_PATH"
	883	valPrint r "$STATUS ($CURL_RESULT)$RTF_TABS$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
	884	valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
	885	valPrint hn "<tr><td style=\"white-space:nowrap\">$STATUS ($CURL_RESULT)</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
	886	valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"
	887
[1067]	888	# Record redirect URL if one was given by a 3xx response page
	889	if [ $STATUS == "RD" ]; then
[1119]	890	valPrint ts " Server suggests $NEW_URL"
	891	valPrint rs " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
	892	valPrint hs "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
[1067]	893	fi
	894
[1070]	895	# Notify reader if we can use an intrawiki link for this URL
	if [ $STATUS == "EI" ]; then
		# Strip the leading "protocol://host/" so that only the page part of the URL is left for the
		# suggested [[ ]] markup. A URL with no '/' after the host is left unchanged.
		INTRA_PAGE=${URL#*://*/}
		valPrint ts " Just use [[$INTRA_PAGE]]"
		valPrint rs " Just use [[$INTRA_PAGE]]"
		valPrint hs "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
	fi
	902
[1064]	903	# Notify reader if we can use an interwiki prefix for this URL
	904	if [ $STATUS == "IW" ]; then
[1075]	905	INTER_PAGE=$(echo "$URL" \| sed 's/.*\///')
[1119]	906	valPrint ts " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
	907	valPrint rs " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
	908	valPrint hs "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
[1064]	909	fi
	910
	# Query Internet Archive for latest "OK" snapshot for "NG" page. If the response names a
	# "closest" snapshot, its URL is reported; otherwise a generic Wayback Machine link is reported.
	if [ $STATUS == "NG" ] && [ $SUGGEST_SNAPSHOTS -eq 1 ]; then
		ARCHIVE_QUERY=$(curl --silent --max-time 10 "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")

		# If a "closest" snapshot was received... (the '*' wildcards let the tag appear anywhere in
		# the response instead of requiring the response to consist of the tag alone)
		if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
			# In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
			ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')

			# ...isolate "url" property in the response that follows the "closest" tag
			SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
			SNAPSHOT_URL=${SNAPSHOT_URL#*\"url\": \"} # everything after '"url": "'
			SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'

			# Inform the user of the snapshot URL
			valPrint ts " IA suggests $SNAPSHOT_URL"
			valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
			valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
		else # ...otherwise give generic Wayback Machine link for this URL
			valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
			valPrint rs " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
			valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
		fi
	fi
	935	fi
	936
	937	# If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
	938	if [ $IS_FILE -eq 0 ] && [ $TAKE_PAGE_SHOT -eq 1 ] && [ $STATUS == "OK" ]; then
	939	# Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
	940	SHOT_NAME=$(echo "$URL" \| sed 's/https*\:\/\///' \| sed 'y/:\//__/')
	941	SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"
	942
	943	# Don't take screenshot if we already encountered this page and screenshotted it
	944	if [ ! -f "$SHOT_FILE" ]; then
[1070]	945	"$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 $URL > /dev/null 2>&1
[1064]	946	if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
	947	mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
	948	else
[1119]	949	valPrint trhs "Screenshot of URL $URL seems to have failed!"
[1064]	950	fi
	951	else
[1119]	952	valPrint trhs "Skipping screenshot of URL $URL because $SHOT_FILE already exists."
[1064]	953	fi
	954	fi
	955	done
	956	FINISHED_LIST="yes"
	957	wrapupAndExit

Note: See TracBrowser for help on using the repository browser.

Download in other formats: