Context Navigation

source: Validate External Links/validate_external_links.sh@ 1151

Last change on this file since 1151 was 1149, checked in by iritscen, 4 years ago
ValExtLinks: The messages about skipping URLs now show the wiki page's namespace. Added 504 to known response codes.
File size: 54.5 KB

Rev	Line
[1064]	1	#!/bin/bash
	2
	3	# Validate External Links by Iritscen
[1141]	4	#
	5	# Validates a list of external links in CSV format. The resulting logs are produced in three formats:
	6	# - TXT (for easy diffing with an earlier log)
	7	# - RTF (for reading as a local file with clickable links)
[1144]	8	# - HTML (for reading as a web page)
[1142]	9	# Call script with "--help" argument for documentation. Also see Read Me.rtf for critical notes.
[1141]	10	#
[1064]	11	# Recommended rule:
[1118]	12	# \|----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----\|
[1141]	13	#
	14	# Table of contents (sections of script in order of appearance, not execution):
	15	# • Globals
	16	# • Help Output
	17	# • Setup
	18	# • Utility Functions
	19	# • Summary Output
	20	# • Initialization
	21	# • Data Sourcing
	22	# • Config Output
	23	# • Legend Output
	24	# • Main Loop
[1064]	25
	26	# Set separator token to newline
	27	IFS="
	28	"
	29
	30	### GLOBALS ###
	31	# Settings -- these will be changed from their defaults by the arguments passed in to the script
[1147]	32	LINKS_URL="" # use 'curl' to download file with links from this location (can be file://)
	33	EXCEPT_URL="" # 'curl' will access this wiki page with a list of exceptions for NG results
	34	OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
	35	RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES
	36	SHOW_SLASH=0 # record issue when a slash is added to the end of a URL
	37	SHOW_HTTPS=0 # record issue when "http" is upgraded to "https"
	38	SHOW_YT_RD=0 # record redirection for a youtu.be URL expanding to the full URL
	39	SUGGEST_SNAPSHOTS_NG=0 # query the Internet Archive for a possible snapshot URL for each NG page
	40	SUGGEST_SNAPSHOTS_OK=0 # query the Internet Archive for an existing snapshot of each OK page
	41	CHECK_ARCHIVE_LINKS=0 # check URLs under the archive.org domain
	42	TAKE_PAGE_SHOT=0 # take a screenshot of each OK page
	43	TIMEOUT=10 # time to wait for a response when querying a site
	44	CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature
	45	URL_START=1 # start at this URL in LINKS_FILE
	46	URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE
	47	UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report
[1064]	48
	49	# Fixed strings -- see the occurrences of these variables to learn their purpose
[1148]	50	AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/88.0.4324.146 Safari/537.36"
[1064]	51	ARCHIVE_API="http://archive.org/wayback/available"
	52	ARCHIVE_GENERIC="https://web.archive.org/web/*"
	53	ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
	54	CHROME_SCREENSHOT="screenshot.png"
[1136]	55	EXCEPT_FILE_NAME="exceptions.txt"
[1064]	56	EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
[1141]	57	WIKI_CURL="https://wiki.oni2.net/Validate_External_Links/Curl_codes"
	58	WIKI_HTTP="https://wiki.oni2.net/Validate_External_Links/HTTP_codes"
	59	WIKI_MAIN="https://wiki.oni2.net/Validate_External_Links"
	60	WIKI_ME="http://iritscen.oni2.net"
[1064]	61	THIS_DIR=$(cd $(dirname $0); pwd)
	62	WORKING_DIR=$(pwd)
	63	WIKI_PATH="wiki.oni2.net"
	64
	65	# These are parallel arrays of the IDs and names of OniGalore's current namespaces
	66	declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
	67	declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")
	68
	69	# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
[1070]	70	# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
[1127]	71	declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png py rar tga TRMA txt vbs wav wmv xaf xml zip)
[1145]	72	declare -a HTTP_TLDS_AND_PAGES=(action ars asp aspx cfm cgi com css de do full htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x)
[1064]	73
[1067]	74	# These arrays tells us which HTTP response codes are OK (good), which are RD (redirections), and which
	75	# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
	76	# if you add a new code.
[1127]	77	declare -a OK_CODES=(200 401 405 406 418 501)
[1067]	78	declare -a RD_CODES=(301 302 303 307 308)
[1149]	79	declare -a NG_CODES=(000 400 403 404 410 429 500 502 503 504 530)
[1064]	80
	81	# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
	82	# transcluded text, and if the transclusion fails, then the braces show up in the URL
	83	ILLEGAL_CHARS="{ }"
	84
[1070]	85	# The shortest URL possible, used for sanity-checking some URLs: http://a.co
	86	MIN_URL_LENGTH=11
	87
[1064]	88	# These are parallel arrays giving the prefixes that can be used in place of normal external links to
	89	# some wikis and other sites
[1070]	90	declare -a INTERWIKI_PREFIXES=(commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies wikiversity wikivoyage wikt wp)
	91	declare -a INTERWIKI_DOMAINS=(commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org wikiversity.org wikivoyage.org wiktionary.org wikipedia.org)
[1064]	92
	93	# Variables for keeping track of main loop progress and findings
	94	LINK_NUM=0
[1070]	95	EI_LINKS=0
	96	IW_LINKS=0
[1064]	97	OK_LINKS=0
[1067]	98	RD_LINKS=0
[1064]	99	NG_LINKS=0
	100	SKIP_UNK_NS=0
	101	SKIP_JS_PAGE=0
	102	SKIP_BAD_URL=0
	103	SKIP_NON_ASCII=0
	104	SKIP_UNK_SUFFIX=0
	105	SKIP_UNK_CODE=0
[1070]	106	SKIP_EXPECT_NG=0
[1142]	107	SKIP_EXPECT_RD=0
[1070]	108	SKIP_EXPECT_EI=0
	109	SKIP_EXPECT_IW=0
[1122]	110	SKIP_HTTPS_UP=0
	111	SKIP_SLASH_ADD=0
[1127]	112	SKIP_YOUTU_BE=0
[1135]	113	SKIP_ARCHIVE_ORG=0
[1064]	114	FILE_LINKS=0
	115	PAGE_LINKS=0
	116	SKIPPED_HEADER_ROW=0
	117	FINISHED_LIST="no"
[1118]	118	START_RUN=0
	119	END_RUN=0
[1064]	120
	121
### HELP OUTPUT ###
# A pseudo-man page, written to stdout. The page text is kept within 80 columns. The here-document
# delimiter is quoted so that the page is printed literally, with no expansion of its contents.
# The SYNOPSIS lists --suggest-snapshots-ng and --suggest-snapshots-ok, which are the spellings that
# the argument parser accepts; a bare "--suggest-snapshots" is rejected as an invalid argument.
function printHelp()
{
  cat << 'EOF'

NAME
      Validate External Links

SYNOPSIS
      validate_external_links.sh --help
      validate_external_links.sh --links URL --output DIR [--exceptions URL]
         [--record-ok-links] [--show-added-slashes] [--show-https-upgrades]
         [--show-yt-redirects] [--suggest-snapshots-ng]
         [--suggest-snapshots-ok] [--check-archive-links]
         [--take-screenshots FILE] [--timeout NUM] [--start-url NUM]
         [--end-url NUM] [--upload FILE]

DESCRIPTION
      This script parses a list of external links found in the OniGalore wiki
      (which is dumped by the Oni2.net server periodically in a particular
      format), validates them using the Unix tool 'curl', and produces a report
      of which links were "OK" (responded positively to an HTTP query), which
      were "RD" (responded with a 3xx redirect code), which could be "IW"
      (interwiki) links, which are "EI" (external internal) links and could be
      intrawiki links, and which were "NG" (no good; a negative response to the
      query). This report can then be automatically uploaded to the location of
      your choice. The script can also suggest Internet Archive snapshots for
      "NG" links, and take screenshots of "OK" links for visual verification by
      the reader that the page in question is the one intended to be displayed.

      You must pass this script the URL at which the list of links is found
      (--links) and the path where the directory of logs should be outputted
      (--output). All other arguments are optional.

OPTIONS
      --help                  Show this page.
      --links URL             (required) URL from which to download the CSV
                              file with external links. Note that this URL can
                              be a local file if you supply a file:// path.
      --output DIR            (required) Unix path to directory in which Val
                              should place its reports.
      --exceptions URL        In order to remove links from the report which
                              Val finds an issue with but which you regard as
                              OK, list those desired exceptions on a wiki page.
                              See the sample file "exceptions.pdf" for the
                              required format of the page. Note that this URL
                              can point to a local file if you supply a path
                              beginning with "file://".
      --record-ok-links       Log a link in the report even if its response
                              code is "OK".
      --show-added-slashes    Report on redirects that simply add a '/' to the
                              end of the URL.
      --show-https-upgrades   Report on redirects that simply upgrade a
                              "http://" URL to a "https://" URL.
      --show-yt-redirects     Report on redirects that expand a youtu.be URL.
      --suggest-snapshots-ng  Query the Internet Archive for a possible
                              snapshot URL for each "NG" page.
      --suggest-snapshots-ok  Query the Internet Archive for a snapshot of each
                              "OK" page just to make sure it's available. Note
                              that this will add a tremendous amount of time to
                              the script execution because there is a rate
                              limit to the Archive API. Note that this option
                              does nothing unless you also use the
                              --record-ok-links argument.
      --check-archive-links   Check links that are already pointing to a page
                              on the Internet Archive. In theory these links
                              should be totally stable and not need validation.
      --take-screenshots FILE Call the Google Chrome binary at this path to
                              take screenshots of each "OK" page.
      --timeout NUM           Wait this many seconds for a site to respond. The
                              default is 10. Important note: Val will attempt
                              to reach each URL three times, so the time taken
                              to ping an unresponsive site will be three times
                              this setting.
      --start-url NUM         Start at this link in the links CSV file.
      --end-url NUM           Stop at this link in the links CSV file.
      --upload FILE           Upload report using the credentials and path
                              given in this local text file. See sftp_login.txt
                              for template.

BUGS
      The script cannot properly parse any line in the external links file
      which contains a comma in the name of the wiki page containing a link.
      Commas in the link itself are not an issue.
EOF
}
	209
	210
	211	### SETUP ###
	212	# If first argument is a help request, or if nothing was passed in at all, print help page and quit
	213	if [ "$#" -eq 0 ] \|\| [ "$1" == "--help" ]; then
	214	printHelp \| less
	215	exit 0
	216	fi
	217
	218	# Parse arguments as long as there are more arguments to process
	219	while (( "$#" )); do
	220	case "$1" in
[1147]	221	--links ) LINKS_URL="$2"; shift 2;;
	222	--exceptions ) EXCEPT_URL="$2"; shift 2;;
	223	--output ) OUTPUT_DIR="$2"; shift 2;;
	224	--record-ok-links ) RECORD_OK_LINKS=1; shift;;
	225	--show-added-slashes ) SHOW_SLASH=1; shift;;
	226	--show-https-upgrades ) SHOW_HTTPS=1; shift;;
	227	--show-yt-redirects ) SHOW_YT_RD=1; shift;;
	228	--suggest-snapshots-ng ) SUGGEST_SNAPSHOTS_NG=1; shift;;
	229	--suggest-snapshots-ok ) SUGGEST_SNAPSHOTS_OK=1; shift;;
	230	--check-archive-links ) CHECK_ARCHIVE_LINKS=1; shift;;
	231	--take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
	232	--timeout ) TIMEOUT=$2; shift 2;;
	233	--start-url ) URL_START=$2; shift 2;;
	234	--end-url ) URL_LIMIT=$2; shift 2;;
	235	--upload ) UPLOAD_INFO=$2; shift 2;;
	236	* ) echo "Invalid argument $1 detected. Aborting."; exit 1;;
[1064]	237	esac
	238	done
	239
	240	# If the required arguments were not supplied, print help page and quit
	241	if [ -z $LINKS_URL ] \|\| [ -z $OUTPUT_DIR ]; then
[1070]	242	echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
[1064]	243	exit 2
	244	fi
	245
[1070]	246	# If user wants screenshots, make sure path to Chrome was passed in and is valid
	247	if [ $TAKE_PAGE_SHOT -eq 1 ]; then
	248	if [ ! -f "$CHROME_PATH" ]; then
	249	echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
	250	exit 3
	251	fi
	252	fi
	253
[1064]	254	# Check that UPLOAD_INFO exists, if this argument was supplied
	255	if [ ! -z $UPLOAD_INFO ] && [ ! -f "$UPLOAD_INFO" ]; then
	256	echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
[1070]	257	exit 4
[1064]	258	fi
	259
	260	# Check that OUTPUT_DIR is a directory
	261	if [ ! -d "$OUTPUT_DIR" ]; then
	262	echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
[1070]	263	exit 5
[1064]	264	fi
	265
	266	# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
	267	SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
	268	NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
	269	OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
	270	OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
	271	SHOT_PATH="$OUTPUT_PATH/Screenshots"
	272	LOG_NAME="ValExtLinks report"
[1144]	273	LOG_NAME_TXT="$LOG_NAME.txt"
	274	LOG_NAME_RTF="$LOG_NAME.rtf"
	275	LOG_NAME_HTM="$LOG_NAME.htm"
	276	LOG_PATH="$OUTPUT_PATH/$LOG_NAME"
	277	LOG_PATH_TXT="$LOG_PATH.txt"
	278	LOG_PATH_RTF="$LOG_PATH.rtf"
	279	LOG_PATH_HTM="$LOG_PATH.htm"
[1064]	280	mkdir "$OUTPUT_PATH"
	281	if [ $TAKE_PAGE_SHOT -eq 1 ]; then
	282	mkdir "$SHOT_PATH"
	283	fi
	284
	285	# Check that 'mkdir' succeeded
	286	if [ ! -d "$OUTPUT_PATH" ]; then
	287	echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_PATH. Aborting."
[1070]	288	exit 6
[1064]	289	fi
	290
	291	# Get date on the file at LINKS_URL and print to log
	292	LINKS_DATE=$(curl --silent --head $LINKS_URL \| grep "Last-Modified")
	293	if [ -z "$LINKS_DATE" ]; then
	294	echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
[1070]	295	exit 7
[1064]	296	fi
	297	LINKS_DATE=${LINKS_DATE#Last-Modified: }
	298
	299
	300	### UTILITY FUNCTIONS ###
	301	# Writes a plain-text header to TXT log file
	302	function printTXTheader()
	303	{
	304	valPrint t "Validate External Links report"
	305	valPrint t "generated $NICE_TIME"
	306	valPrint t "from data of $LINKS_DATE"
[1141]	307	valPrint t "script by Iritscen (contact: $WIKI_ME)"
[1064]	308	valPrint t ""
	309	}
	310
	311	# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified
	312	function printRTFheader()
	313	{
	314	valPrint r "{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
	315	{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
	316	{\colortbl;\red255\green255\blue255;}
	317	{\*\expandedcolortbl;;}
	318	\margl1440\margr1440\vieww12600\viewh12100\viewkind0
	319	\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0
	320
	321	\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
	322	generated $NICE_TIME\\
	323	from data of $LINKS_DATE\\
[1141]	324	script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$WIKI_ME\"}}{\fldrslt contact}})
[1064]	325	\\
	326	\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
	327	\cf0 "
	328	}
	329
	330	# Closes the RTF markup of the RTF log file
	331	function printRTFfooter()
	332	{
	333	valPrint r "}"
	334	}
	335
	336	# Writes the HTML header to HTML log file
	337	function printHTMheader()
	338	{
	339	valPrint h "<html>
	340	<head>
	341	<title>Validate External Links report</title>
	342	</head>
	343	<body>
	344	<h2>Validate External Links report</h2>
	345	<h3>generated $NICE_TIME<br />
	346	from data of $LINKS_DATE<br />
[1141]	347	script by Iritscen (<a href=\"$WIKI_ME\" target=\"_blank\">contact</a>)</h3>"
[1064]	348	}
	349
	350	# Closes the HTML markup of the HTML log file
	351	function printHTMfooter()
	352	{
	353	valPrint h "</body>
	354	</html>"
	355	}
	356
	357	# The central logging function. The first parameter is a string composed of one or more characters that
[1070]	358	# indicate which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
[1141]	359	# 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 's' means "Print an
	360	# extra newline at the end." 'w' means "Don't pass console output through 'fmt'" ("fmt" fits the output
[1119]	361	# to an 80-column CLI but can break special formatting and the 'n' option).
[1064]	362	function valPrint()
	363	{
	364	if [[ "$1" == c ]]; then
	365	if [[ "$1" == n ]]; then
	366	echo -n "$2"
	367	elif [[ "$1" == w ]]; then
	368	echo "$2"
[1119]	369	elif [[ "$1" == s ]]; then
	370	echo -e "$2\n"
[1064]	371	else
	372	echo "$2" \| fmt -w 80
	373	fi
	374	fi
	375	if [[ "$1" == t ]]; then
	376	if [[ "$1" == n ]]; then
[1144]	377	echo -n "$2" >> "$LOG_PATH_TXT"
[1119]	378	elif [[ "$1" == s ]]; then
[1144]	379	echo -e "$2\n" >> "$LOG_PATH_TXT"
[1064]	380	else
[1144]	381	echo "$2" >> "$LOG_PATH_TXT"
[1064]	382	fi
	383	fi
	384	if [[ "$1" == r ]]; then
	385	if [[ "$1" == n ]]; then
[1144]	386	echo "$2" >> "$LOG_PATH_RTF"
[1119]	387	elif [[ "$1" == s ]]; then
[1144]	388	echo "$2\line\line" >> "$LOG_PATH_RTF"
[1064]	389	else
[1144]	390	echo "$2\line" >> "$LOG_PATH_RTF"
[1064]	391	fi
	392	fi
	393	if [[ "$1" == h ]]; then
[1119]	394	if [[ "$1" == s ]]; then
[1144]	395	echo "$2<tr><td> </td></tr>" >> "$LOG_PATH_HTM"
[1119]	396	elif [[ "$1" == n ]]; then
[1144]	397	echo "$2" >> "$LOG_PATH_HTM"
[1064]	398	else
[1144]	399	echo "$2<br />" >> "$LOG_PATH_HTM"
[1064]	400	fi
	401	fi
	402	}
	403
	404	# Pluralize the string in parameter 1 if the number in parameter 2 is not 1
	405	function pluralCheckNoun()
	406	{
	407	if [ $2 -ne 1 ]; then
	408	if [[ $1 =~ x$ ]]; then
	409	echo $1es
	410	else
	411	echo $1s
	412	fi
	413	else
	414	echo $1
	415	fi
	416	}
	417
# Prints the present-tense verb that agrees with the count in parameter 1: "is" for exactly 1,
# "are" for any other number
function pluralCheckIs()
{
  local VERB="is"
  [ $1 -ne 1 ] && VERB="are"
  echo "$VERB"
}
	427
# Prints the past-tense verb that agrees with the count in parameter 1: "was" for exactly 1,
# "were" for any other number
function pluralCheckWas()
{
  if ! [ $1 -ne 1 ]; then
    echo "was"
    return
  fi
  echo "were"
}
	437
# Prints the article "a " (with its trailing space) when the count in parameter 1 is exactly 1;
# prints nothing for any other count
function pluralCheckA()
{
  if ! [ $1 -eq 1 ]; then
    return 0
  fi
  echo "a "
}
	445
	446	# Output "an " if parameter 1 is 1, otherwise nothing
	447	function pluralCheckAn()
	448	{
	449	if [ $1 -eq 1 ]; then
	450	echo "an "
	451	fi
	452	}
	453
# Upload the reports using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
# reports being saved to disk have already been closed.
# Reads the user name, password, port and remote path from the lines of UPLOAD_INFO that contain the
# markers "user:", "pw:", "port:" and "path:", then runs the 'expect' script found next to this script
# once for each report format (htm, rtf, txt). A success or failure message is printed per report.
# NOTE(review): the password is handed to 'expect' as a command-line argument, so it may be visible in
# the process list while the upload runs.
function uploadReport()
{
  valPrint c "Uploading reports..."

  SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
  SFTP_USER_NAME_MARKER="user:"
  SFTP_PASSWORD_MARKER="pw:"
  SFTP_PORT_MARKER="port:"
  SFTP_PATH_MARKER="path:"
  SFTP_USER_NAME=$(grep -- "$SFTP_USER_NAME_MARKER" "$UPLOAD_INFO")
  SFTP_USER_NAME=${SFTP_USER_NAME#$SFTP_USER_NAME_MARKER}
  SFTP_PASSWORD=$(grep -- "$SFTP_PASSWORD_MARKER" "$UPLOAD_INFO")
  SFTP_PASSWORD=${SFTP_PASSWORD#$SFTP_PASSWORD_MARKER}
  SFTP_PORT=$(grep -- "$SFTP_PORT_MARKER" "$UPLOAD_INFO")
  SFTP_PORT=${SFTP_PORT#$SFTP_PORT_MARKER}
  SFTP_PATH=$(grep -- "$SFTP_PATH_MARKER" "$UPLOAD_INFO")
  SFTP_PATH=${SFTP_PATH#$SFTP_PATH_MARKER}

  local UPLOAD_STATUS
  for SUFFIX in htm rtf txt; do
    # Every argument is quoted so that an empty value keeps its position in the argument list
    expect "$SCRIPT_PATH" "$LOG_PATH.$SUFFIX" "$SFTP_USER_NAME" "$SFTP_PASSWORD" "$SFTP_PORT" "$SFTP_PATH" "$LOG_NAME.$SUFFIX"

    # Save the exit status right away; any later command, including the test below, overwrites $?
    UPLOAD_STATUS=$?
    if [ "$UPLOAD_STATUS" -ne 0 ]; then
      valPrint c "Error $UPLOAD_STATUS occurred when attempting to upload $LOG_NAME.$SUFFIX!"
    else
      valPrint c "Report in $(echo "$SUFFIX" | tr '[:lower:]' '[:upper:]') format was uploaded."
    fi
  done
}
	484
	485	# Prints session summary when script is done
	486	function wrapupAndExit()
	487	{
	488	# Get off progress line on console, drop down a line from last link in log, and close HTML table
	489	valPrint ctr ""
	490	valPrint h "</table><br />"
	491
	492	# If we didn't finish processing the last URL, then the iterator is one too high
	493	if [ $FINISHED_LIST != "yes" ]; then
	494	let LINK_NUM-=1
	495	if [ $FINISHED_LIST == "no" ]; then
	496	valPrint ctrh "The session was canceled by the user."
	497	fi
	498	fi
	499
[1118]	500	# Generate string with elapsed time
	501	END_RUN=$(date +%s)
	502	ELAPSED=$(echo $(($END_RUN - $START_RUN)) \| awk '{printf "%d min. %d sec. elapsed", int($1/60), int($1%60)}')
	503
[1122]	504	# Do some math on results of session
[1064]	505	LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
[1142]	506	TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
[1122]	507	LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
[1142]	508	LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_RD+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
	509	LINK_PROBLEMS_TOTAL=$((NG_LINKS+RD_LINKS+EI_LINKS+IW_LINKS))
	510	LINK_PROBLEMS_NG=$((NG_LINKS-SKIP_EXPECT_NG))
	511	LINK_PROBLEMS_RD=$((RD_LINKS-SKIP_EXPECT_RD))
	512	LINK_PROBLEMS_EI=$((EI_LINKS-SKIP_EXPECT_EI))
	513	LINK_PROBLEMS_IW=$((IW_LINKS-SKIP_EXPECT_IW))
	514	LINK_PROBLEMS_NET=$((LINK_PROBLEMS_NG+LINK_PROBLEMS_RD+LINK_PROBLEMS_EI+LINK_PROBLEMS_IW))
[1122]	515
[1144]	516	# Print something in the Links section if no link issues were printed
	517	if [ $LINK_PROBLEMS_NET -eq 0 ]; then
	518	valPrint h "<i>No link problems to report! See the <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for a list of links with issues that were not reported.</i>"
	519	fi
	520	if [ $LINK_PROBLEMS_TOTAL -eq 0 ]; then
	521	valPrint t "No link problems to report!"
	522	valPrint r "\i1 No link problems to report! \i0"
	523	fi
	524
[1141]	525	## SUMMARY OUTPUT ##
[1118]	526	valPrint ct "Summary ($ELAPSED):"
	527	valPrint r "\b1 Summary \b0 ($ELAPSED)"
	528	valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
[1123]	529	valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT) (there $(pluralCheckWas $FILE_LINKS) $FILE_LINKS file $(pluralCheckNoun link $FILE_LINKS) and $PAGE_LINKS page $(pluralCheckNoun link $PAGE_LINKS))."
[1122]	530
	531	# Print processed link totals
	532	if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
	533	if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
[1135]	534	if [ $SKIP_ARCHIVE_ORG -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVE_ORG Archive.org $(pluralCheckNoun link $SKIP_ARCHIVE_ORG) were not checked"; fi
[1142]	535	if [ $LINK_PROBLEMS_TOTAL -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_TOTAL processed $(pluralCheckNoun link $LINK_PROBLEMS_TOTAL) had $(pluralCheckAn $LINK_PROBLEMS_TOTAL)$(pluralCheckNoun issue $LINK_PROBLEMS_TOTAL)"; fi
	536	if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED link $(pluralCheckNoun issue $LINKS_EXCEPTED) from report)"; valPrint h "  (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
[1123]	537	if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi
	538	if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h "  (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi
[1122]	539
	540	# Print errored link totals
[1144]	541	if [ $LINK_ERRORS -gt 0 ]; then
	542	valPrint c "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"
	543	valPrint h "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
	544	valPrint rt "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS):"
	545	fi
[1122]	546	if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
[1070]	547	if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
[1064]	548	if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
	549	if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
	550	if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
	551	if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
[1122]	552
[1142]	553	# Print excepted link totals
[1144]	554	if [ $LINKS_EXCEPTED -gt 0 ]; then
	555	valPrint c "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"
	556	valPrint h "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
	557	valPrint rt "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted:"
	558	fi
[1142]	559	if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
	560	if [ $SKIP_EXPECT_RD -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_RD/$RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
	561	if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
	562	if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi
	563
[1122]	564	# Print checked link totals
[1142]	565	if [ $LINK_PROBLEMS_NET -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS_NET link $(pluralCheckNoun issue $LINK_PROBLEMS_NET):"; fi
	566	if [ $LINK_PROBLEMS_NG -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_NG NG $(pluralCheckNoun link $LINK_PROBLEMS_NG)"; fi
	567	if [ $LINK_PROBLEMS_RD -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_RD $(pluralCheckNoun redirection $LINK_PROBLEMS_RD)"; fi
	568	if [ $LINK_PROBLEMS_EI -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_EI $(pluralCheckNoun link $LINK_PROBLEMS_EI) that could be intrawiki"; fi
	569	if [ $LINK_PROBLEMS_IW -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_IW $(pluralCheckNoun link $LINK_PROBLEMS_IW) that could be interwiki"; fi
[1122]	570
	571	# Close the log files' markup
[1070]	572	valPrint trh "ValExtLinks says goodbye."
[1064]	573	printRTFfooter
	574	printHTMfooter
	575
	576	# Upload report if this was requested
	577	if [ ! -z $UPLOAD_INFO ] && [ $FINISHED_LIST != "no" ]; then
	578	uploadReport
	579	fi
	580
	581	# Really quit now
	582	valPrint c "ValExtLinks says goodbye."
	583	exit 0
	584	}
	585	trap wrapupAndExit INT
	586
	587
	588	### INITIALIZATION ###
	589	# Print opening message to console and log files
	590	valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
	591	printTXTheader
	592	printRTFheader
	593	printHTMheader
	594
## DATA SOURCING ##
valPrint t "Startup:"
valPrint r "\b1 Startup \b0"
valPrint hn "<h3>Startup</h3>"

# Attempt to download file at LINKS_URL, then check that it succeeded. The local file is named after
# the part of the URL that follows its last slash.
valPrint cwtrhn "Downloading list of external links from $LINKS_URL..."
LINKS_FILE_NAME=${LINKS_URL##*/}
LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
curl --silent -o "$LINKS_FILE" "$LINKS_URL"
if [ ! -f "$LINKS_FILE" ]; then
  echo -e "\nThe download of $LINKS_URL appears to have failed. Aborting."
  wrapupAndExit
else
  valPrint ctrh " success."
fi

# Attempt to download file at EXCEPT_URL, then check that it succeeded
if [ -n "$EXCEPT_URL" ]; then
  valPrint cwtrhn "Downloading list of reporting exceptions from $EXCEPT_URL..."
  EXCEPT_DATA=$(curl --silent "$EXCEPT_URL")
  if [ -z "$EXCEPT_DATA" ]; then
    echo -e "\nThe download of the exceptions data from '$EXCEPT_URL' appears to have failed. Aborting."
    wrapupAndExit
  else
    valPrint ctrh " success."
  fi

  # Keep only the text between the "BEGIN LIST" and "END LIST" markers of the page
  EXCEPT_DATA=${EXCEPT_DATA%END LIST*}
  EXCEPT_DATA=${EXCEPT_DATA#*BEGIN LIST}
  EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"

  # Store on disk for debugging purposes
  echo "$EXCEPT_DATA" > "$EXCEPT_FILE"

  # Transfer to array for easy searching later, one element per non-empty line. The lines are read
  # one at a time rather than by expanding the data unquoted, because an unquoted expansion would also
  # attempt pathname expansion on entries that contain '?' or '*', as URLs often do.
  declare -a EXCEPT_ARRAY=()
  while IFS= read -r EXCEPT_LINE; do
    if [ -n "$EXCEPT_LINE" ]; then
      EXCEPT_ARRAY+=("$EXCEPT_LINE")
    fi
  done <<< "$EXCEPT_DATA"
fi

# Redirect the file into 'wc' because passing LINKS_FILE to 'wc' as an argument would print the file
# name after the line count
LINK_COUNT_STRING=$(wc -l < "$LINKS_FILE")

# Number of URLs is number of lines minus one (first line is column header row for the CSV)
LINK_COUNT=$(echo "${LINK_COUNT_STRING}" | tr -d '[:space:]')
LINK_COUNT=$((LINK_COUNT - 1))
valPrint ctrh "Found $LINK_COUNT links to process."
valPrint trh ""
[1064]	641
## CONFIG OUTPUT ##
# Section heading, written separately for each report format
valPrint t "Config:"
valPrint r "\b1 Config \b0"
valPrint hn "<h3>Config</h3>"

# Prints one on/off setting on a single line to the console and all reports. Parameter 1 is the label,
# parameter 2 is the setting's value, parameter 3 is the answer to print when the value is 1, and
# parameter 4 is the answer to print otherwise.
function printConfigAnswer()
{
  valPrint ctrhn "$1"
  if [ $2 -eq 1 ]; then
    valPrint ctrh "$3"
  else
    valPrint ctrh "$4"
  fi
}

# Report the range of links that will be looked at. The range ends at URL_LIMIT only when a limit was
# given and it falls before the end of the list; otherwise it ends at the last link.
valPrint ctrhn "Links to consider: "
CONSIDER_END=$LINK_COUNT
if [ $URL_LIMIT -ne 0 ] && [ $URL_LIMIT -lt $LINK_COUNT ]; then
  CONSIDER_END=$URL_LIMIT
fi
if [ $CONSIDER_END -ne $LINK_COUNT ] || [ $URL_START -ne 1 ]; then
  valPrint ctrh "$((CONSIDER_END-URL_START+1)), from link $URL_START to $CONSIDER_END"
else
  valPrint ctrh "$LINK_COUNT"
fi

valPrint ctrh "Site query timeout: $TIMEOUT seconds"

# Settings that are reported as they are
printConfigAnswer "Show OK links: " "$RECORD_OK_LINKS" "Yes" "No"
printConfigAnswer "Take screenshots: " "$TAKE_PAGE_SHOT" "Yes" "No"
printConfigAnswer "Suggest archive.org snapshots for NG pages: " "$SUGGEST_SNAPSHOTS_NG" "Yes" "No"
printConfigAnswer "Suggest archive.org snapshots for OK pages: " "$SUGGEST_SNAPSHOTS_OK" "Yes" "No"

# The "show" settings are worded as "ignore" questions here, so their answers are inverted
printConfigAnswer "Ignore slash-adding redirects: " "$SHOW_SLASH" "No" "Yes"
printConfigAnswer "Ignore HTTPS-upgrading redirects: " "$SHOW_HTTPS" "No" "Yes"
printConfigAnswer "Ignore youtu.be redirects: " "$SHOW_YT_RD" "No" "Yes"

printConfigAnswer "Check archive.org links: " "$CHECK_ARCHIVE_LINKS" "Yes" "No"

# Tell the reader where the summary is; the HTML report links to it
valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
valPrint trh ""
	685
## LEGEND OUTPUT ##
# Section heading, marked up separately for each report format
valPrint t "Legend:"
valPrint r "\b1 Legend \b0"
valPrint hn "<h3>Legend</h3>"

# Pointer to the guidance page: spelled out in the text report, a "here" hyperlink in RTF and HTML
valPrint t "(For guidance in fixing these links, see $WIKI_MAIN.)"
valPrint r "(For guidance in fixing these links, see {\field{\*\fldinst{HYPERLINK \"$WIKI_MAIN\"}}{\fldrslt here}}.)"
valPrint h "(For guidance in fixing these links, see <a href=\"$WIKI_MAIN\" target=\"_blank\">here</a>.)"

# Status codes, which read the same in every report format
for LEGEND_ENTRY in \
  "OK = URL seems to be working" \
  "NG = URL no longer seems to work" \
  "RD = URL is redirecting to this new URL" \
  "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup" \
  "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup"
do
  valPrint trh "$LEGEND_ENTRY"
done

# Response-code notations, each with a reference link formatted per report type
valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $WIKI_HTTP)"
valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$WIKI_HTTP\"}}{\fldrslt here}} for code reference)"
valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$WIKI_HTTP\" target=\"_blank\">here</a> for code reference)"
valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $WIKI_CURL)"
valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$WIKI_CURL\"}}{\fldrslt here}} for code reference)"
valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$WIKI_CURL\" target=\"_blank\">here</a> for code reference)"

# Internet Archive notations, followed by a blank line to close the section
for LEGEND_ENTRY in \
  "IA suggests = Last available snapshot returned by the Internet Archive" \
  "Try browsing = The Archive failed to return a snapshot URL, so check for a snapshot manually using this link" \
  ""
do
  valPrint trh "$LEGEND_ENTRY"
done


### MAIN LOOP ###
# Heading for the list of links, then note the time at which the run begins
valPrint t "Links:"
valPrint r "\b1 Links \b0"
valPrint hn "<h3>Links</h3>"
START_RUN=$(date +%s)
[1064]	714	# Process each line of the .csv in LINKS_FILE
	715	for LINE in `cat "$LINKS_FILE"`; do
# Note when we began working on this line (used further down to pace the Archive.org queries) and
# advance the link counter
START_LINK=$(date +%s)
LINK_NUM=$((LINK_NUM + 1))

# First line is the column header row for the CSV, so let's verify that the format hasn't changed.
# LINE is quoted so that glob characters in it are compared literally instead of being expanded.
if [ "$SKIPPED_HEADER_ROW" -eq 0 ]; then
  if [ "$LINE" != "namespace,title,target" ]; then
    valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
    wrapupAndExit
  else
    SKIPPED_HEADER_ROW=1
    LINK_NUM=0 # this line is not a link, so reset the link counter
    valPrint hn "<table>"
    continue
  fi
fi

# Skip this link if we are not at URL_START yet
if [ "$LINK_NUM" -lt "$URL_START" ]; then
  continue
fi

# Stop if we are past the limit declared for testing purposes
if [ "$URL_LIMIT" -gt 0 ] && [ "$LINK_NUM" -gt "$URL_LIMIT" ]; then
  FINISHED_LIST="limit"
  wrapupAndExit
fi

# Print progress to screen
if [ "$LINK_NUM" -gt 1 ]; then
  printf "\e[1A\n" # erase previous progress message so that new one appears in its place
fi
valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
	748
	749	# The number of the namespace is the element before the first comma on the line
	750	NS_ID=${LINE%%,*}
	751
	752	# Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
	753	NS_NAME=""
	754	a=0
[1069]	755	while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
[1118]	756	if [ $NS_ID == "NULL" ]; then
	757	break
	758	elif [ $NS_ID -eq ${NS_IDS[$a]} ]; then
[1064]	759	NS_NAME="${NS_NAMES[$a]}"
	760	break
	761	fi
	762	let a+=1
	763	done
[1118]	764	if [ "$NS_NAME" == "" ]; then
	765	if [ $NS_ID == "NULL" ]; then
[1123]	766	valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace (and probably the page too) is 'NULL'. Probably the link is no longer in existence on the wiki."
[1118]	767	else
[1123]	768	valPrint trs "Skipping line $LINK_NUM ('$LINE') because I could not find a name for namespace ID $NS_ID."
[1118]	769	fi
[1064]	770	let SKIP_UNK_NS+=1
[1148]	771	let PAGE_LINKS+=1
[1064]	772	continue
	773	fi
	774
	775	# The name of the page is everything between the namespace ID and the next comma on the line (commas
	776	# in page names will break this)
	777	PAGE_NAME=${LINE#$NS_ID,}
	778	PAGE_NAME=${PAGE_NAME%%,*}
	779
[1070]	780	# Build longer wiki page URLs from namespace and page names
[1122]	781	FULL_PAGE_PATH=https://$WIKI_PATH/$NS_NAME:$PAGE_NAME
[1070]	782	LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
	783	# Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it
	784	# explicitly breaks the link
	785	if [ $NS_ID -eq 0 ]; then
[1122]	786	FULL_PAGE_PATH=https://$WIKI_PATH/$PAGE_NAME
[1070]	787	LOCAL_PAGE_PATH=$PAGE_NAME
	788	fi
	789
[1149]	790	# We don't want to consider wiki pages ending in .js, as the MW parser cannot reliably isolate URLS
	791	# in JavaScript code, so it returns erroneous links
	792	PAGE_NAME_SUFFIX=$(echo $PAGE_NAME \| sed 's/.*\.//')
	793	if [ $PAGE_NAME_SUFFIX == "js" ]; then
	794	valPrint trs "Skipping URL '${LINE#$NS_ID,$PAGE_NAME,}' on line $LINK_NUM because it was found on JavaScript page '$LOCAL_PAGE_PATH'."
	795	let SKIP_JS_PAGE+=1
	796	let PAGE_LINKS+=1
	797	continue
	798	fi
	799
# The URL being linked to is everything after the previous two fields (this allows commas to be in
# the URLs, but a comma in the previous field, the page name, will break this). The prefix is
# quoted so that glob characters in the page name are matched literally.
URL=${LINE#"$NS_ID,$PAGE_NAME,"}

# Scan for illegal characters; the wildcards on either side let the character be found anywhere in
# the URL
if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
  valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because it contains characters illegal in a URL."
  SKIP_BAD_URL=$((SKIP_BAD_URL + 1))
  PAGE_LINKS=$((PAGE_LINKS + 1))
  continue
fi

# If we're skipping Archive.org links, see if this is one (the domain may appear anywhere in the
# URL, so it is matched as a substring)
if [ "$CHECK_ARCHIVE_LINKS" -eq 0 ] && [[ $URL == *web.archive.org* ]]; then
  valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to check Wayback Machine links."
  SKIP_ARCHIVE_ORG=$((SKIP_ARCHIVE_ORG + 1))
  PAGE_LINKS=$((PAGE_LINKS + 1))
  continue
fi
	819
# Now we need to know if the URL is for a file or a web page. First step is to determine if the
# URL ends in a suffix
HAS_SUFFIX=0

# If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
CLEAN_URL=${URL%%\?*}

# If the URL ends in something like "#section_15", strip everything from the '#' onward
CLEAN_URL=${CLEAN_URL%%\#*}

# Non-ASCII URLs are not handled by this script, so skip any URL that contains a non-ASCII
# character anywhere in it and make the user check it
if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
  valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I cannot handle non-ASCII characters."
  SKIP_NON_ASCII=$((SKIP_NON_ASCII + 1))
  PAGE_LINKS=$((PAGE_LINKS + 1))
  continue
fi

# Isolate the characters after the last period and after the last slash (if the character is not
# present, the whole string is kept)
POST_DOT=${CLEAN_URL##*.}
POST_SLASH=${CLEAN_URL##*/}

# If the last period comes after the last slash, then the URL ends in a suffix
POST_DOT_LENGTH=${#POST_DOT}
POST_SLASH_LENGTH=${#POST_SLASH}
if [ "$POST_DOT_LENGTH" -lt "$POST_SLASH_LENGTH" ]; then
  HAS_SUFFIX=1
else
  HAS_SUFFIX=0
fi
	850
	851	# Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
	852	# known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES
	853	IS_FILE=-1
	854	if [ $HAS_SUFFIX -eq 0 ]; then
	855	IS_FILE=0
	856	else
	857	# Turn off case sensitivity while we compare suffixes
	858	shopt -s nocasematch
	859
[1127]	860	# Special case: URLs ending in something like "productID.304297400" are pages, not files, so if
[1064]	861	# the URL's suffix is all numbers, we are looking at the end of a web page URL
	862	if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
	863	IS_FILE=0
	864	fi
[1127]	865
	866	# Special case: URLs ending in a parens, e.g. "ms537113(v=vs.85)", are pages
	867	if [[ $POST_DOT =~ ^.*[\(\)]$ ]]; then
	868	IS_FILE=0
	869	fi
	870
	871	# Special case: URLs containing a '%', e.g. "10.1007%2FBF00329055", are pages
	872	if [[ $POST_DOT == % ]]; then
	873	IS_FILE=0
	874	fi
[1064]	875
	876	# If we did not identify this URL as a web page above, we need to compare the suffix against known
	877	# file extensions
	878	if [ $IS_FILE -eq -1 ]; then
	879	for EXTENSION in "${HTTP_FILES[@]}"; do
	880	if [[ $EXTENSION == $POST_DOT ]]; then # double brackets respect the nocasematch setting
	881	IS_FILE=1
	882	break
	883	fi
	884	done
	885	fi
	886
	887	# If we did not identify this URL as a file above, we need to compare the suffix against known
	888	# pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
	889	# needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
	890	if [ $IS_FILE -eq -1 ]; then
	891	for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
	892	if [[ $PAGE_OR_TLD == $POST_DOT ]]; then
	893	IS_FILE=0
	894	break
	895	fi
	896	done
	897	fi
	898
	899	# Turn case sensitivity back on in Bash
	900	shopt -u nocasematch
	901	fi
	902
	903	# If this suffix escaped identification as either a file, page or TLD, inform the user
	904	STR_TYPE=""
	905	if [ $IS_FILE -eq -1 ]; then
[1149]	906	valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I encountered the unknown URL ending '$POST_DOT'. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
[1064]	907	let SKIP_UNK_SUFFIX+=1
	908	continue
	909	elif [ $IS_FILE -eq 1 ]; then
	910	STR_TYPE="file"
	911	let FILE_LINKS+=1
[1148]	912	else
[1064]	913	STR_TYPE="page"
	914	let PAGE_LINKS+=1
	915	fi
	916
	917	# Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
	918	# issue with sites that require HTTPS
[1142]	919	CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent '$AGENT' --max-time $TIMEOUT --retry 2 --write-out '%{http_code}\n' $URL)
[1064]	920	CURL_ERR=$(echo $?)
	921	CURL_RESULT=$CURL_CODE
	922
	923	# Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
	924	if [ $CURL_CODE == "000" ]; then
	925	CURL_RESULT="$CURL_RESULT-$CURL_ERR"
	926	fi
	927
[1070]	928	# Begin to determine our status code for this URL (EI, IW, OK, RD, or NG)
[1064]	929	STATUS="??"
[1067]	930	NEW_URL=""
[1064]	931	INTERWIKI_INDEX=-1
	932
# First make sure that this isn't an "external internal" link to our own wiki that can be replaced
# by "[[page_name]]". If it uses a special access URL containing "/w/", let it pass, as it
# probably cannot be replaced by "[[ ]]" markup. The wiki's address is looked for anywhere in the
# URL, hence the wildcards; the quotes make the address match as plain text.
if [[ $URL == *"$WIKI_PATH"* ]] && [[ $URL != *"$WIKI_PATH/w/"* ]]; then
  STATUS="EI"
  EI_LINKS=$((EI_LINKS + 1))
fi

# If it's not, check if this is a link to a domain that we have an interwiki prefix for (also make
# sure that it's not an archive.org link to a page from an interwiki domain). STATUS is quoted
# because an unquoted "??" would be treated as a file name pattern by the shell.
if [ "$STATUS" == "??" ] && [[ $URL != *web.archive.org* ]]; then
  for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
    if [[ $URL == *"${INTERWIKI_DOMAINS[$i]}"* ]] && [[ $URL != *"${INTERWIKI_DOMAINS[$i]}/w/"* ]]; then
      STATUS="IW"
      IW_LINKS=$((IW_LINKS + 1))
      INTERWIKI_INDEX=$i # remembered so the matching interwiki prefix can be suggested later
      break
    fi
  done
fi
	953
# If we didn't match an interwiki domain, see if the status code is in our "OK" codes list.
# STATUS is quoted because an unquoted "??" would be treated as a file name pattern by the shell.
if [ "$STATUS" == "??" ]; then
  for CODE in "${OK_CODES[@]}"; do
    if [[ $CODE == "$CURL_CODE" ]]; then
      STATUS="OK"
      OK_LINKS=$((OK_LINKS + 1))

      # If this is a YouTube link, we have to look at the actual page source to know if the video
      # is good or not: the page is downloaded and searched for YouTube's "Video unavailable"
      # marker. AGENT is double-quoted so that its value, not the literal text "$AGENT", is sent.
      if [[ $URL == *www.youtube.com* ]]; then
        PAGE_TEXT=$(curl --silent --insecure --user-agent "$AGENT" --max-time "$TIMEOUT" "$URL" | grep "\"simpleText\":\"Video unavailable\"")
        if [ -n "$PAGE_TEXT" ]; then
          # The marker was found, so move this link from the OK tally to the NG tally
          STATUS="NG"
          OK_LINKS=$((OK_LINKS - 1))
          NG_LINKS=$((NG_LINKS + 1))
        fi
      fi
      break
    fi
  done
fi
	975
# If we didn't get a match with the "OK" codes, check it against the "RD" codes. STATUS is quoted
# because an unquoted "??" would be treated as a file name pattern by the shell.
if [ "$STATUS" == "??" ]; then
  for CODE in "${RD_CODES[@]}"; do
    if [[ $CODE == "$CURL_CODE" ]]; then
      # Get URL header again in order to retrieve the URL we are being redirected to. AGENT is
      # double-quoted so that its value, not the literal text "$AGENT", is sent.
      NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time "$TIMEOUT" --write-out '%{redirect_url}\n' "$URL")

      # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
      # those changes out if the user didn't ask for them
      URL_HTTP=${URL/#https:/http:}
      NEW_URL_HTTP=${NEW_URL/#https:/http:}

      # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
      NEW_URL_LENGTH=${#NEW_URL_HTTP}
      if [ "$NEW_URL_LENGTH" -lt "$MIN_URL_LENGTH" ]; then
        NEW_URL_HTTP="[new URL not retrieved]"
      fi

      # Remove slash at end of new URL, if present, so we can filter out the redirects that
      # merely add an ending slash if the user didn't ask for them
      NEW_URL_NO_SLASH=${NEW_URL_HTTP%/}

      # Detect if this is a youtu.be link simply being expanded by YouTube to the full
      # youtube.com address
      YOUTU_BE=0
      if [[ $URL_HTTP == http://youtu.be* ]] && [[ $NEW_URL_HTTP == http://www.youtube.com* ]]; then
        YOUTU_BE=1
      fi

      # If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
      # wants those to be reported)
      if [ "$SHOW_HTTPS" -eq 0 ] && [ "$URL_HTTP" == "$NEW_URL_HTTP" ]; then
        valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show http->https upgrades, and I was redirected to '$NEW_URL'."
        STATUS="OK"
        OK_LINKS=$((OK_LINKS + 1))
        SKIP_HTTPS_UP=$((SKIP_HTTPS_UP + 1))
      # If the URLs match besides an added ending slash, then the link is OK (unless user wants
      # those to be reported)
      elif [ "$SHOW_SLASH" -eq 0 ] && [ "$URL_HTTP" == "$NEW_URL_NO_SLASH" ]; then
        valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show added trailing slashes, and I was redirected to '$NEW_URL'."
        STATUS="OK"
        OK_LINKS=$((OK_LINKS + 1))
        SKIP_SLASH_ADD=$((SKIP_SLASH_ADD + 1))
      elif [ "$YOUTU_BE" -eq 1 ]; then
        # We have to look at the actual page source to know if a YouTube video is good or not
        PAGE_TEXT=$(curl --silent --insecure --user-agent "$AGENT" --max-time "$TIMEOUT" "$NEW_URL" | grep "\"simpleText\":\"Video unavailable\"")
        if [ -n "$PAGE_TEXT" ]; then
          STATUS="NG"
          NG_LINKS=$((NG_LINKS + 1))
        elif [ "$SHOW_YT_RD" -eq 0 ]; then
          valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'."
          STATUS="OK"
          OK_LINKS=$((OK_LINKS + 1))
          SKIP_YOUTU_BE=$((SKIP_YOUTU_BE + 1))
        else
          STATUS="RD"
          RD_LINKS=$((RD_LINKS + 1))
        fi
      else
        STATUS="RD"
        RD_LINKS=$((RD_LINKS + 1))
      fi
      break
    fi
  done
fi
	1044
	1045	# If we didn't get a match with the "RD" codes, check it against the "NG" codes
	1046	if [ $STATUS == "??" ]; then
[1064]	1047	for CODE in "${NG_CODES[@]}"; do
	1048	if [[ $CODE == $CURL_CODE ]]; then
	1049	STATUS="NG"
	1050	let NG_LINKS+=1
	1051	break
	1052	fi
	1053	done
	1054	fi
	1055
	1056	# If we didn't match a known status code, advise the reader
	1057	if [ $STATUS == "??" ]; then
[1149]	1058	valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I encountered the unknown response code $CURL_CODE."
[1064]	1059	let SKIP_UNK_CODE+=1
	1060	continue
	1061	fi
	1062
# Check problem links against exceptions list before proceeding. This is only done if the user
# supplied an exceptions page (EXCEPT_URL).
FOUND_EXCEPT=0
if [ "$STATUS" != "OK" ] && [ -n "$EXCEPT_URL" ]; then
  # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
  EXPECT_CODE="$CURL_RESULT"
  if [ "$STATUS" == "EI" ] || [ "$STATUS" == "IW" ]; then
    EXPECT_CODE="$STATUS"
  fi

  # Look for link in exceptions list and make sure the listed result code and wiki page also match.
  # Each line is read as "code,URL,page": the code is the text before the first comma, the page is
  # the text after the last comma, and the URL is everything in between.
  for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
    # Undo any HTML-encoding from the wiki page; for now we just worry about the ampersand, as most
    # other HTML-encoded characters are not found in URLs
    EXCEPT_LINE=$(printf '%s\n' "${EXCEPT_ARRAY[$i]}" | sed 's/&amp;/\&/g')

    # Match URL. This is kept in its own variable so that the EXCEPT_URL setting, which is tested
    # above for every link, is not overwritten.
    EXCEPT_LINK="${EXCEPT_LINE#*,}"
    EXCEPT_LINK="${EXCEPT_LINK%,*}"
    if [ "$EXCEPT_LINK" != "$URL" ]; then
      continue
    fi

    # Match containing page's name ("*" stands for any page); anything after the first space in
    # this field is ignored
    EXCEPT_PAGE="${EXCEPT_LINE##*,}"
    EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
    if [ "$EXCEPT_PAGE" != "*" ] && [ "$EXCEPT_PAGE" != "$LOCAL_PAGE_PATH" ]; then
      continue
    fi

    # Match result code
    EXCEPT_CODE=${EXCEPT_LINE%%,*}
    if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
      valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because its expected result, '$EXPECT_CODE', is in the exceptions list."
      case "$STATUS" in
        "EI") SKIP_EXPECT_EI=$((SKIP_EXPECT_EI + 1)) ;;
        "IW") SKIP_EXPECT_IW=$((SKIP_EXPECT_IW + 1)) ;;
        "RD") SKIP_EXPECT_RD=$((SKIP_EXPECT_RD + 1)) ;;
        *) SKIP_EXPECT_NG=$((SKIP_EXPECT_NG + 1)) ;;
      esac
      FOUND_EXCEPT=1
      break
    fi
  done
fi
if [ "$FOUND_EXCEPT" -eq 1 ]; then
  continue
fi
[1064]	1116
	1117	# If appropriate, record this link to the log, with clickable URLs when possible
	1118	if [ $STATUS != "OK" ] \|\| [ $RECORD_OK_LINKS -eq 1 ]; then
[1125]	1119	# Prepare 'curl' result in parentheses to print after status code, unless this is an "IW" or "EI"
	1120	# link, in which case showing the status code doesn't make sense. Adjust spacing after string to
	1121	# ensure TXT and RTF reports have aligned columns of results.
	1122	CURL_STR_H=" ($CURL_RESULT)"
	1123	CURL_STR_T="$CURL_STR_H"
	1124	CURL_STR_R="$CURL_STR_H "
[1070]	1125	if [ $STATUS == "IW" ] \|\| [ $STATUS == "EI" ]; then
[1125]	1126	CURL_STR_H=""
	1127	CURL_STR_T=" "
	1128	CURL_STR_R=" "
[1064]	1129	fi
	1130
	1131	# Record link and its wiki page in TXT, RTF, and HTML markup
[1125]	1132	valPrint t "${STATUS}${CURL_STR_T} $STR_TYPE $URL"
[1064]	1133	valPrint t " linked from $FULL_PAGE_PATH"
[1125]	1134	valPrint r "${STATUS}${CURL_STR_R}${RTF_TABS}$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
[1064]	1135	valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
[1125]	1136	valPrint hn "<tr><td style=\"white-space:nowrap\">${STATUS}${CURL_STR_H}</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
[1064]	1137	valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"
	1138
[1123]	1139	# Place vertical space here since we won't be printing anything more about this link
[1147]	1140	if [ $STATUS == "OK" ] && [ $SUGGEST_SNAPSHOTS_OK -eq 0 ]; then valPrint tr ""; valPrint hs ""; fi
[1123]	1141
[1067]	1142	# Record redirect URL if one was given by a 3xx response page
	1143	if [ $STATUS == "RD" ]; then
[1119]	1144	valPrint ts " Server suggests $NEW_URL"
	1145	valPrint rs " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
	1146	valPrint hs "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
[1067]	1147	fi
	1148
[1070]	1149	# Notify reader if we can use an intrawiki link for this URL
	1150	if [ $STATUS == "EI" ]; then
[1075]	1151	INTRA_PAGE=${URL#:///}
[1119]	1152	valPrint ts " Just use [[$INTRA_PAGE]]"
	1153	valPrint rs " Just use [[$INTRA_PAGE]]"
	1154	valPrint hs "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
[1070]	1155	fi
	1156
[1064]	1157	# Notify reader if we can use an interwiki prefix for this URL
	1158	if [ $STATUS == "IW" ]; then
[1075]	1159	INTER_PAGE=$(echo "$URL" \| sed 's/.*\///')
[1119]	1160	valPrint ts " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
	1161	valPrint rs " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
	1162	valPrint hs "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
[1064]	1163	fi
	1164
	1165	# Query Internet Archive for latest "OK" snapshot for "NG" page
[1147]	1166	if [[ ( $STATUS == "NG" && $SUGGEST_SNAPSHOTS_NG -eq 1 ) \|\| ( $STATUS == "OK" && $SUGGEST_SNAPSHOTS_OK -eq 1 ) ]]; then
	1167
	1168	# We need to watch out for the rate limit or we'll get locked out; look at how much time has
	1169	# elapsed and then wait the remainder between that and how long of a wait we think is needed
	1170	# to avoid the dreaded "Too Many Requests" response. 5 seconds is just a guess.
	1171	CUR_TIME=$(date +%s)
	1172	WAIT_REMAINDER=$((5 - $CUR_TIME + $START_LINK))
	1173	if [ $WAIT_REMAINDER -gt 0 ]; then
	1174	valPrint t "Waiting $WAIT_REMAINDER second(s) to conform to Archive.org rate limit."
	1175	sleep $WAIT_REMAINDER
	1176	fi
	1177
	1178	# Issue query to the API
[1141]	1179	ARCHIVE_QUERY=$(curl --silent --max-time $TIMEOUT "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")
[1064]	1180
[1147]	1181	# Notify user if we hit the rate limit and just keep going
	1182	if [[ "$ARCHIVE_QUERY" == "Too Many Requests" ]]; then
	1183	valPrint t " IA has rate-limited us!"
	1184	valPrint r " IA has rate-limited us!"
	1185	valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td>(hit the API rate limit!)</td></tr>"
	1186	# If a "closest" snapshot was received, inform user
	1187	elif [[ "$ARCHIVE_QUERY" == \"closest\": ]]; then
[1118]	1188	# In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
	1189	ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" \| sed 's/#!/#\\!/')
	1190
	1191	# ...isolate "url" property in the response that follows the "closest" tag
	1192	SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
[1119]	1193	SNAPSHOT_URL=${SNAPSHOT_URL#*\"url\": \"} # everything after '"url": "'
[1118]	1194	SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'
	1195
[1124]	1196	# Remove the port 80 part that IA often adds to the URL, as it's superfluous
	1197	SNAPSHOT_URL=$(echo $SNAPSHOT_URL \| sed 's/:80//')
	1198
[1118]	1199	# Inform the user of the snapshot URL
[1119]	1200	valPrint ts " IA suggests $SNAPSHOT_URL"
	1201	valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
	1202	valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
[1147]	1203	else # Otherwise give a generic Wayback Machine link for this URL, which might work
[1119]	1204	valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
	1205	valPrint rs " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
	1206	valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
[1064]	1207	fi
	1208	fi
	1209	fi
	1210
	1211	# If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
	1212	if [ $IS_FILE -eq 0 ] && [ $TAKE_PAGE_SHOT -eq 1 ] && [ $STATUS == "OK" ]; then
	1213	# Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
	1214	SHOT_NAME=$(echo "$URL" \| sed 's/https*\:\/\///' \| sed 'y/:\//__/')
	1215	SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"
	1216
	1217	# Don't take screenshot if we already encountered this page and screenshotted it
	1218	if [ ! -f "$SHOT_FILE" ]; then
[1070]	1219	"$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 $URL > /dev/null 2>&1
[1064]	1220	if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
	1221	mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
	1222	else
[1119]	1223	valPrint trhs "Screenshot of URL $URL seems to have failed!"
[1064]	1224	fi
	1225	else
[1123]	1226	valPrint trhs "Skipping screenshot of URL '$URL' because file '$SHOT_FILE' already exists."
[1064]	1227	fi
	1228	fi
	1229	done
	1230	FINISHED_LIST="yes"
	1231	wrapupAndExit

Note: See TracBrowser for help on using the repository browser.

Download in other formats: