Context Navigation

source: Validate External Links/validate_external_links.sh@ 1176

Last change on this file since 1176 was 1175, checked in by iritscen, 2 years ago
ValExtLinks: Added audit feature which tells the user if there are items in the exception list which are no longer present on the wiki or no longer return the given error code.
File size: 56.9 KB

Rev	Line
[1064]	1	#!/bin/bash
	2
	3	# Validate External Links by Iritscen
[1141]	4	#
	5	# Validates a list of external links in CSV format. The resulting logs are produced in three formats:
	6	# - TXT (for easy diffing with an earlier log)
	7	# - RTF (for reading as a local file with clickable links)
[1144]	8	# - HTML (for reading as a web page)
[1142]	9	# Call script with "--help" argument for documentation. Also see Read Me.rtf for critical notes.
[1141]	10	#
[1064]	11	# Recommended rule:
[1118]	12	# |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|
[1141]	13	#
	14	# Table of contents (sections of script in order of appearance, not execution):
	15	# • Globals
	16	# • Help Output
	17	# • Setup
	18	# • Utility Functions
	19	# • Summary Output
	20	# • Initialization
	21	# • Data Sourcing
	22	# • Config Output
	23	# • Legend Output
	24	# • Main Loop
[1064]	25
	26	# Set separator token to newline
	27	IFS="
	28	"
	29
	30	### GLOBALS ###
	31	# Settings -- these will be changed from their defaults by the arguments passed in to the script
[1175]	32	LINKS_URL="" # download external link CSV from this location (can use "file://" protocol)
	33	EXCEPT_URL="" # location of wiki page with a list of exceptions for NG results
[1147]	34	OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
	35	RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES
	36	SHOW_SLASH=0 # record issue when a slash is added to the end of a URL
	37	SHOW_HTTPS=0 # record issue when "http" is upgraded to "https"
	38	SHOW_YT_RD=0 # record redirection for a youtu.be URL expanding to the full URL
	39	SUGGEST_SNAPSHOTS_NG=0 # query the Internet Archive for a possible snapshot URL for each NG page
	40	SUGGEST_SNAPSHOTS_OK=0 # query the Internet Archive for an existing snapshot of each OK page
[1158]	41	CHECK_ARCHIVE_LINKS=0 # check URLs on archive.org and archive.is
[1147]	42	TAKE_PAGE_SHOT=0 # take a screenshot of each OK page
	43	TIMEOUT=10 # time to wait for a response when querying a site
	44	CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature
	45	URL_START=1 # start at this URL in LINKS_FILE
	46	URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE
	47	UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report
[1064]	48
	49	# Fixed strings -- see the occurrences of these variables to learn their purpose
[1175]	50	AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.134 Safari/537.36"
[1064]	51	ARCHIVE_API="http://archive.org/wayback/available"
	52	ARCHIVE_GENERIC="https://web.archive.org/web/*"
	53	ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
	54	CHROME_SCREENSHOT="screenshot.png"
[1136]	55	EXCEPT_FILE_NAME="exceptions.txt"
[1064]	56	EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
[1141]	57	WIKI_CURL="https://wiki.oni2.net/Validate_External_Links/Curl_codes"
	58	WIKI_HTTP="https://wiki.oni2.net/Validate_External_Links/HTTP_codes"
	59	WIKI_MAIN="https://wiki.oni2.net/Validate_External_Links"
	60	WIKI_ME="http://iritscen.oni2.net"
[1064]	61	THIS_DIR=$(cd $(dirname $0); pwd)
	62	WORKING_DIR=$(pwd)
	63	WIKI_PATH="wiki.oni2.net"
	64
	65	# These are parallel arrays of the IDs and names of OniGalore's current namespaces
	66	declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
	67	declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")
	68
	69	# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
[1070]	70	# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
[1175]	71	declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png psd py rar tga TRMA txt vbs wav wmv xaf xcf xlsx xml zip)
[1160]	72	declare -a HTTP_TLDS_AND_PAGES=(abstract action ars asp aspx cfm cgi com css de do full htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x)
[1064]	73
[1067]	74	# These arrays tells us which HTTP response codes are OK (good), which are RD (redirections), and which
	75	# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
	76	# if you add a new code.
[1127]	77	declare -a OK_CODES=(200 401 405 406 418 501)
[1067]	78	declare -a RD_CODES=(301 302 303 307 308)
[1149]	79	declare -a NG_CODES=(000 400 403 404 410 429 500 502 503 504 530)
[1064]	80
	81	# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
	82	# transcluded text, and if the transclusion fails, then the braces show up in the URL
	83	ILLEGAL_CHARS="{ }"
	84
[1070]	85	# The shortest URL possible, used for sanity-checking some URLs: http://a.co
	86	MIN_URL_LENGTH=11
	87
[1064]	88	# These are parallel arrays giving the prefixes that can be used in place of normal external links to
[1157]	89	# some wikis and other sites; based on https://wiki.oni2.net/Special:Interwiki
[1070]	90	declare -a INTERWIKI_PREFIXES=(commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies wikiversity wikivoyage wikt wp)
	91	declare -a INTERWIKI_DOMAINS=(commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org wikiversity.org wikivoyage.org wiktionary.org wikipedia.org)
[1064]	92
	93	# Variables for keeping track of main loop progress and findings
	94	LINK_NUM=0
[1070]	95	EI_LINKS=0
	96	IW_LINKS=0
[1064]	97	OK_LINKS=0
[1067]	98	RD_LINKS=0
[1064]	99	NG_LINKS=0
	100	SKIP_UNK_NS=0
	101	SKIP_JS_PAGE=0
	102	SKIP_BAD_URL=0
	103	SKIP_NON_ASCII=0
	104	SKIP_UNK_SUFFIX=0
	105	SKIP_UNK_CODE=0
[1070]	106	SKIP_EXPECT_NG=0
[1142]	107	SKIP_EXPECT_RD=0
[1070]	108	SKIP_EXPECT_EI=0
	109	SKIP_EXPECT_IW=0
[1122]	110	SKIP_HTTPS_UP=0
	111	SKIP_SLASH_ADD=0
[1127]	112	SKIP_YOUTU_BE=0
[1158]	113	SKIP_ARCHIVES=0
[1064]	114	FILE_LINKS=0
	115	PAGE_LINKS=0
	116	SKIPPED_HEADER_ROW=0
	117	FINISHED_LIST="no"
[1118]	118	START_RUN=0
	119	END_RUN=0
[1064]	120
	121
### HELP OUTPUT ###
# A pseudo-man page. Here is the 80-character rule for the page text:
# 234567890123456789012345678901234567890123456789012345678901234567890123456789
#
# Writes the help text to stdout. The SYNOPSIS lists the options exactly as the argument parser
# accepts them (--suggest-snapshots-ng and --suggest-snapshots-ok; there is no plain
# --suggest-snapshots). The heredoc delimiter is quoted so the page is printed literally.
function printHelp()
{
  cat << 'EOF'

NAME
       Validate External Links

SYNOPSIS
       validate_external_links.sh --help
       validate_external_links.sh --links URL --output DIR [--exceptions URL]
          [--record-ok-links] [--show-added-slashes] [--show-https-upgrades]
          [--show-yt-redirects] [--suggest-snapshots-ng]
          [--suggest-snapshots-ok] [--check-archive-links]
          [--take-screenshots FILE] [--timeout NUM] [--start-url NUM]
          [--end-url NUM] [--upload FILE]

DESCRIPTION
       This script parses a list of external links found in the OniGalore wiki
       (which is dumped by the Oni2.net server periodically in a particular
       format), validates them using the Unix tool 'curl', and produces a report
       of which links were "OK" (responded positively to an HTTP query), which
       were "RD" (responded with a 3xx redirect code), which could be "IW"
       (interwiki) links, which are "EI" (external internal) links and could be
       intrawiki links, and which were "NG" (no good; a negative response to the
       query). This report can then be automatically uploaded to the location of
       your choice. The script can also suggest Internet Archive snapshots for
       "NG" links, and take screenshots of "OK" links for visual verification by
       the reader that the page in question is the one intended to be displayed.

       You must pass this script the URL at which the list of links is found
       (--links) and the path where the directory of logs should be outputted
       (--output). All other arguments are optional.

OPTIONS
       --help                  Show this page.
       --links URL             (required) URL from which to download the CSV
                               file with external links. Note that this URL can
                               be a local file if you supply a file:// path.
       --output DIR            (required) Unix path to directory in which Val
                               should place its reports.
       --exceptions URL        In order to remove links from the report which
                               Val finds an issue with but which you regard as
                               OK, list those desired exceptions on a wiki page.
                               See the sample file "exceptions.pdf" for the
                               required format of the page. Note that this URL
                               can point to a local file if you supply a path
                               beginning with "file://".
       --record-ok-links       Log a link in the report even if its response
                               code is "OK".
       --show-added-slashes    Report on redirects that simply add a '/' to the
                               end of the URL.
       --show-https-upgrades   Report on redirects that simply upgrade a
                               "http://" URL to a "https://" URL.
       --show-yt-redirects     Report on redirects that expand a youtu.be URL.
       --suggest-snapshots-ng  Query the Internet Archive for a possible
                               snapshot URL for each "NG" page.
       --suggest-snapshots-ok  Query the Internet Archive for a snapshot of each
                               "OK" page just to make sure it's available. Note
                               that this will add a tremendous amount of time to
                               the script execution because there is a rate
                               limit to the Archive API. Note that this option
                               does nothing unless you also use the
                               --record-ok-links argument.
       --check-archive-links   Check links that are already pointing to a page
                               on the Internet Archive or archive.is (AKA
                               archive.today). In theory these links should be
                               totally stable and not need validation.
       --take-screenshots FILE Call the Google Chrome binary at this path to
                               take screenshots of each "OK" page.
       --timeout NUM           Wait this many seconds for a site to respond. The
                               default is 10. Important note: Val will attempt
                               to reach each URL three times, so the time taken
                               to ping an unresponsive site will be three times
                               this setting.
       --start-url NUM         Start at this link in the links CSV file.
       --end-url NUM           Stop at this link in the links CSV file.
       --upload FILE           Upload report using the credentials and path
                               given in this local text file. See sftp_login.txt
                               for template.

BUGS
       The script cannot properly parse any line in the external links file
       which contains a comma in the name of the wiki page containing a link.
       Commas in the link itself are not an issue.
EOF
}
	210
	211
	212	### SETUP ###
	213	# If first argument is a help request, or if nothing was passed in at all, print help page and quit
	214	if [ "$#" -eq 0 ] \|\| [ "$1" == "--help" ]; then
	215	printHelp \| less
	216	exit 0
	217	fi
	218
	219	# Parse arguments as long as there are more arguments to process
	220	while (( "$#" )); do
	221	case "$1" in
[1147]	222	--links ) LINKS_URL="$2"; shift 2;;
	223	--exceptions ) EXCEPT_URL="$2"; shift 2;;
	224	--output ) OUTPUT_DIR="$2"; shift 2;;
	225	--record-ok-links ) RECORD_OK_LINKS=1; shift;;
	226	--show-added-slashes ) SHOW_SLASH=1; shift;;
	227	--show-https-upgrades ) SHOW_HTTPS=1; shift;;
	228	--show-yt-redirects ) SHOW_YT_RD=1; shift;;
	229	--suggest-snapshots-ng ) SUGGEST_SNAPSHOTS_NG=1; shift;;
	230	--suggest-snapshots-ok ) SUGGEST_SNAPSHOTS_OK=1; shift;;
	231	--check-archive-links ) CHECK_ARCHIVE_LINKS=1; shift;;
	232	--take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
	233	--timeout ) TIMEOUT=$2; shift 2;;
	234	--start-url ) URL_START=$2; shift 2;;
	235	--end-url ) URL_LIMIT=$2; shift 2;;
	236	--upload ) UPLOAD_INFO=$2; shift 2;;
[1157]	237	* ) echo "Invalid argument '$1' detected. Aborting."; exit 1;;
[1064]	238	esac
	239	done
	240
	241	# If the required arguments were not supplied, print help page and quit
	242	if [ -z $LINKS_URL ] \|\| [ -z $OUTPUT_DIR ]; then
[1070]	243	echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
[1064]	244	exit 2
	245	fi
	246
[1070]	247	# If user wants screenshots, make sure path to Chrome was passed in and is valid
	248	if [ $TAKE_PAGE_SHOT -eq 1 ]; then
	249	if [ ! -f "$CHROME_PATH" ]; then
	250	echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
	251	exit 3
	252	fi
	253	fi
	254
[1064]	255	# Check that UPLOAD_INFO exists, if this argument was supplied
	256	if [ ! -z $UPLOAD_INFO ] && [ ! -f "$UPLOAD_INFO" ]; then
	257	echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
[1070]	258	exit 4
[1064]	259	fi
	260
	261	# Check that OUTPUT_DIR is a directory
	262	if [ ! -d "$OUTPUT_DIR" ]; then
	263	echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
[1070]	264	exit 5
[1064]	265	fi
	266
	267	# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
	268	SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
	269	NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
	270	OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
	271	OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
	272	SHOT_PATH="$OUTPUT_PATH/Screenshots"
	273	LOG_NAME="ValExtLinks report"
[1144]	274	LOG_NAME_TXT="$LOG_NAME.txt"
	275	LOG_NAME_RTF="$LOG_NAME.rtf"
	276	LOG_NAME_HTM="$LOG_NAME.htm"
	277	LOG_PATH="$OUTPUT_PATH/$LOG_NAME"
	278	LOG_PATH_TXT="$LOG_PATH.txt"
	279	LOG_PATH_RTF="$LOG_PATH.rtf"
	280	LOG_PATH_HTM="$LOG_PATH.htm"
[1064]	281	mkdir "$OUTPUT_PATH"
	282	if [ $TAKE_PAGE_SHOT -eq 1 ]; then
	283	mkdir "$SHOT_PATH"
	284	fi
	285
	286	# Check that 'mkdir' succeeded
	287	if [ ! -d "$OUTPUT_PATH" ]; then
	288	echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_PATH. Aborting."
[1070]	289	exit 6
[1064]	290	fi
	291
	292	# Get date on the file at LINKS_URL and print to log
	293	LINKS_DATE=$(curl --silent --head $LINKS_URL \| grep "Last-Modified")
	294	if [ -z "$LINKS_DATE" ]; then
	295	echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
[1070]	296	exit 7
[1064]	297	fi
	298	LINKS_DATE=${LINKS_DATE#Last-Modified: }
	299
	300
	301	### UTILITY FUNCTIONS ###
	302	# Writes a plain-text header to TXT log file
	303	function printTXTheader()
	304	{
	305	valPrint t "Validate External Links report"
	306	valPrint t "generated $NICE_TIME"
	307	valPrint t "from data of $LINKS_DATE"
[1141]	308	valPrint t "script by Iritscen (contact: $WIKI_ME)"
[1064]	309	valPrint t ""
	310	}
	311
	312	# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified
	313	function printRTFheader()
	314	{
	315	valPrint r "{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
	316	{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
	317	{\colortbl;\red255\green255\blue255;}
	318	{\*\expandedcolortbl;;}
	319	\margl1440\margr1440\vieww12600\viewh12100\viewkind0
	320	\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0
	321
	322	\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
	323	generated $NICE_TIME\\
	324	from data of $LINKS_DATE\\
[1141]	325	script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$WIKI_ME\"}}{\fldrslt contact}})
[1064]	326	\\
	327	\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
	328	\cf0 "
	329	}
	330
	331	# Closes the RTF markup of the RTF log file
	332	function printRTFfooter()
	333	{
	334	valPrint r "}"
	335	}
	336
	337	# Writes the HTML header to HTML log file
	338	function printHTMheader()
	339	{
	340	valPrint h "<html>
	341	<head>
	342	<title>Validate External Links report</title>
	343	</head>
	344	<body>
	345	<h2>Validate External Links report</h2>
	346	<h3>generated $NICE_TIME<br />
	347	from data of $LINKS_DATE<br />
[1141]	348	script by Iritscen (<a href=\"$WIKI_ME\" target=\"_blank\">contact</a>)</h3>"
[1064]	349	}
	350
	351	# Closes the HTML markup of the HTML log file
	352	function printHTMfooter()
	353	{
	354	valPrint h "</body>
	355	</html>"
	356	}
	357
	358	# The central logging function. The first parameter is a string composed of one or more characters that
[1070]	359	# indicate which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
[1141]	360	# 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 's' means "Print an
	361	# extra newline at the end." 'w' means "Don't pass console output through 'fmt'" ("fmt" fits the output
[1119]	362	# to an 80-column CLI but can break special formatting and the 'n' option).
[1064]	363	function valPrint()
	364	{
	365	if [[ "$1" == c ]]; then
	366	if [[ "$1" == n ]]; then
	367	echo -n "$2"
	368	elif [[ "$1" == w ]]; then
	369	echo "$2"
[1119]	370	elif [[ "$1" == s ]]; then
	371	echo -e "$2\n"
[1064]	372	else
	373	echo "$2" \| fmt -w 80
	374	fi
	375	fi
	376	if [[ "$1" == t ]]; then
	377	if [[ "$1" == n ]]; then
[1144]	378	echo -n "$2" >> "$LOG_PATH_TXT"
[1119]	379	elif [[ "$1" == s ]]; then
[1144]	380	echo -e "$2\n" >> "$LOG_PATH_TXT"
[1064]	381	else
[1144]	382	echo "$2" >> "$LOG_PATH_TXT"
[1064]	383	fi
	384	fi
	385	if [[ "$1" == r ]]; then
	386	if [[ "$1" == n ]]; then
[1144]	387	echo "$2" >> "$LOG_PATH_RTF"
[1119]	388	elif [[ "$1" == s ]]; then
[1144]	389	echo "$2\line\line" >> "$LOG_PATH_RTF"
[1064]	390	else
[1144]	391	echo "$2\line" >> "$LOG_PATH_RTF"
[1064]	392	fi
	393	fi
	394	if [[ "$1" == h ]]; then
[1119]	395	if [[ "$1" == s ]]; then
[1144]	396	echo "$2<tr><td> </td></tr>" >> "$LOG_PATH_HTM"
[1119]	397	elif [[ "$1" == n ]]; then
[1144]	398	echo "$2" >> "$LOG_PATH_HTM"
[1064]	399	else
[1144]	400	echo "$2<br />" >> "$LOG_PATH_HTM"
[1064]	401	fi
	402	fi
	403	}
	404
	405	# Pluralize the string in parameter 1 if the number in parameter 2 is not 1
	406	function pluralCheckNoun()
	407	{
	408	if [ $2 -ne 1 ]; then
	409	if [[ $1 =~ x$ ]]; then
	410	echo $1es
	411	else
	412	echo $1s
	413	fi
	414	else
	415	echo $1
	416	fi
	417	}
	418
# Prints the present-tense verb that agrees with the count in parameter 1: "is" for exactly 1,
# "are" for anything else
pluralCheckIs() {
  local verb="is"
  if [ $1 -ne 1 ]; then
    verb="are"
  fi
  echo "$verb"
}
	428
# Prints the past-tense verb that agrees with the count in parameter 1: "was" for exactly 1,
# "were" for anything else
pluralCheckWas() {
  local verb="was"
  if [ $1 -ne 1 ]; then
    verb="were"
  fi
  echo "$verb"
}
	438
# Prints the article "a " (with its trailing space) when parameter 1 is 1; prints nothing for
# any other count
pluralCheckA() {
  [ $1 -eq 1 ] || return 0
  echo "a "
}
	446
	447	# Output "an " if parameter 1 is 1, otherwise nothing
	448	function pluralCheckAn()
	449	{
	450	if [ $1 -eq 1 ]; then
	451	echo "an "
	452	fi
	453	}
	454
# Upload the reports using info specified in the --upload argument. ONLY USE "valPrint c" here, as
# the reports being saved to disk have already been closed.
#
# Reads the "user:", "pw:", "port:" and "path:" lines of the file at UPLOAD_INFO, then runs the
# 'expect' script found next to this script once for each of the HTM, RTF and TXT reports,
# printing the outcome of each upload to the console.
function uploadReport()
{
  local upload_status

  valPrint c "Uploading reports..."

  SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
  SFTP_USER_NAME_MARKER="user:"
  SFTP_PASSWORD_MARKER="pw:"
  SFTP_PORT_MARKER="port:"
  SFTP_PATH_MARKER="path:"
  SFTP_USER_NAME=$(grep -- "$SFTP_USER_NAME_MARKER" "$UPLOAD_INFO")
  SFTP_USER_NAME=${SFTP_USER_NAME#"$SFTP_USER_NAME_MARKER"}
  SFTP_PASSWORD=$(grep -- "$SFTP_PASSWORD_MARKER" "$UPLOAD_INFO")
  SFTP_PASSWORD=${SFTP_PASSWORD#"$SFTP_PASSWORD_MARKER"}
  SFTP_PORT=$(grep -- "$SFTP_PORT_MARKER" "$UPLOAD_INFO")
  SFTP_PORT=${SFTP_PORT#"$SFTP_PORT_MARKER"}
  SFTP_PATH=$(grep -- "$SFTP_PATH_MARKER" "$UPLOAD_INFO")
  SFTP_PATH=${SFTP_PATH#"$SFTP_PATH_MARKER"}

  for SUFFIX in htm rtf txt; do
    # Every argument is quoted so that an empty field stays in its position instead of shifting
    # the arguments that follow it
    expect "$SCRIPT_PATH" "$LOG_PATH.$SUFFIX" "$SFTP_USER_NAME" "$SFTP_PASSWORD" "$SFTP_PORT" "$SFTP_PATH" "$LOG_NAME.$SUFFIX"
    # Save the exit status right away; running any other command, including the test below,
    # overwrites $?
    upload_status=$?

    if [ "$upload_status" -ne 0 ]; then
      valPrint c "Error $upload_status occurred when attempting to upload $LOG_NAME.$SUFFIX!"
    else
      valPrint c "Report in $(echo "$SUFFIX" | tr '[:lower:]' '[:upper:]') format was uploaded."
    fi
  done
}
	485
	486	# Prints session summary when script is done
	487	function wrapupAndExit()
	488	{
	489	# Get off progress line on console, drop down a line from last link in log, and close HTML table
	490	valPrint ctr ""
	491	valPrint h "</table><br />"
	492
	493	# If we didn't finish processing the last URL, then the iterator is one too high
	494	if [ $FINISHED_LIST != "yes" ]; then
	495	let LINK_NUM-=1
	496	if [ $FINISHED_LIST == "no" ]; then
	497	valPrint ctrh "The session was canceled by the user."
	498	fi
	499	fi
	500
[1118]	501	# Generate string with elapsed time
	502	END_RUN=$(date +%s)
	503	ELAPSED=$(echo $(($END_RUN - $START_RUN)) \| awk '{printf "%d min. %d sec. elapsed", int($1/60), int($1%60)}')
	504
[1122]	505	# Do some math on results of session
[1064]	506	LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
[1142]	507	TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
[1122]	508	LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
[1142]	509	LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_RD+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
	510	LINK_PROBLEMS_TOTAL=$((NG_LINKS+RD_LINKS+EI_LINKS+IW_LINKS))
	511	LINK_PROBLEMS_NG=$((NG_LINKS-SKIP_EXPECT_NG))
	512	LINK_PROBLEMS_RD=$((RD_LINKS-SKIP_EXPECT_RD))
	513	LINK_PROBLEMS_EI=$((EI_LINKS-SKIP_EXPECT_EI))
	514	LINK_PROBLEMS_IW=$((IW_LINKS-SKIP_EXPECT_IW))
	515	LINK_PROBLEMS_NET=$((LINK_PROBLEMS_NG+LINK_PROBLEMS_RD+LINK_PROBLEMS_EI+LINK_PROBLEMS_IW))
[1122]	516
[1144]	517	# Print something in the Links section if no link issues were printed
	518	if [ $LINK_PROBLEMS_NET -eq 0 ]; then
	519	valPrint h "<i>No link problems to report! See the <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for a list of links with issues that were not reported.</i>"
	520	fi
	521	if [ $LINK_PROBLEMS_TOTAL -eq 0 ]; then
	522	valPrint t "No link problems to report!"
	523	valPrint r "\i1 No link problems to report! \i0"
	524	fi
	525
[1141]	526	## SUMMARY OUTPUT ##
[1118]	527	valPrint ct "Summary ($ELAPSED):"
	528	valPrint r "\b1 Summary \b0 ($ELAPSED)"
	529	valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
[1123]	530	valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT) (there $(pluralCheckWas $FILE_LINKS) $FILE_LINKS file $(pluralCheckNoun link $FILE_LINKS) and $PAGE_LINKS page $(pluralCheckNoun link $PAGE_LINKS))."
[1122]	531
	532	# Print processed link totals
	533	if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
	534	if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
[1158]	535	if [ $SKIP_ARCHIVES -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVES archive.org/archive.is $(pluralCheckNoun link $SKIP_ARCHIVES) were not checked"; fi
[1142]	536	if [ $LINK_PROBLEMS_TOTAL -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_TOTAL processed $(pluralCheckNoun link $LINK_PROBLEMS_TOTAL) had $(pluralCheckAn $LINK_PROBLEMS_TOTAL)$(pluralCheckNoun issue $LINK_PROBLEMS_TOTAL)"; fi
	537	if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED link $(pluralCheckNoun issue $LINKS_EXCEPTED) from report)"; valPrint h "  (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
[1123]	538	if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi
	539	if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h "  (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi
[1122]	540
	541	# Print errored link totals
[1144]	542	if [ $LINK_ERRORS -gt 0 ]; then
	543	valPrint c "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"
	544	valPrint h "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
	545	valPrint rt "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS):"
	546	fi
[1122]	547	if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
[1070]	548	if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
[1064]	549	if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
	550	if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
	551	if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
	552	if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
[1122]	553
[1142]	554	# Print excepted link totals
[1144]	555	if [ $LINKS_EXCEPTED -gt 0 ]; then
	556	valPrint c "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"
	557	valPrint h "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
	558	valPrint rt "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted:"
	559	fi
[1142]	560	if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
	561	if [ $SKIP_EXPECT_RD -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_RD/$RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
	562	if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
	563	if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi
	564
[1175]	565	# Perform exceptions audit
	566	EXCEPTION_ISSUES=0
	567	valPrint ctrh "Exceptions list audit:"
	568	for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
	569	EXCEPT_LINE="${EXCEPT_ARRAY[$i]}"
	570	EXCEPT_LINE=$(echo "$EXCEPT_LINE" \| sed 's/\&/\&/g') # copied from exception-matching code
	571
	572	if [ ${EXCEPT_FOUND[$i]} -eq 0 ]; then
	573	EXCEPT_URL="${EXCEPT_LINE#*,}"
	574	EXCEPT_URL="${EXCEPT_URL%,*}"
	575	EXCEPT_PAGE="${EXCEPT_LINE##*,}"
	576	EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
	577	if [ "$EXCEPT_PAGE" == "*" ]; then
	578	valPrint tr "- The link '$EXCEPT_URL' did not return an error or is not on any page."
	579	else
	580	valPrint tr "- The link '$EXCEPT_URL' did not return an error or is not on page '$EXCEPT_PAGE'."
	581	fi
	582	let EXCEPTION_ISSUES+=1
	583	elif [ ${EXCEPT_USED[$i]} -eq 0 ]; then
	584	EXCEPT_URL="${EXCEPT_LINE#*,}"
	585	EXCEPT_URL="${EXCEPT_URL%,*}"
	586	EXCEPT_CODE=${EXCEPT_LINE%%,*}
	587	valPrint tr "- The link '$EXCEPT_URL' did not return error code $EXCEPT_CODE."
	588	let EXCEPTION_ISSUES+=1
	589	fi
	590	done
	591	if [ $EXCEPTION_ISSUES -eq 0 ]; then
	592	valPrint ctrh "- No issues found."
	593	else
	594	valPrint c "$EXCEPTION_ISSUES exception list $(pluralCheckNoun issue $EXCEPTION_ISSUES) detected (see RTF or TXT report for details)."
	595	valPrint h "$EXCEPTION_ISSUES exception list $(pluralCheckNoun issue $EXCEPTION_ISSUES) detected (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for details)."
	596	fi
	597
[1122]	598	# Print checked link totals
[1142]	599	if [ $LINK_PROBLEMS_NET -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS_NET link $(pluralCheckNoun issue $LINK_PROBLEMS_NET):"; fi
	600	if [ $LINK_PROBLEMS_NG -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_NG NG $(pluralCheckNoun link $LINK_PROBLEMS_NG)"; fi
	601	if [ $LINK_PROBLEMS_RD -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_RD $(pluralCheckNoun redirection $LINK_PROBLEMS_RD)"; fi
	602	if [ $LINK_PROBLEMS_EI -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_EI $(pluralCheckNoun link $LINK_PROBLEMS_EI) that could be intrawiki"; fi
	603	if [ $LINK_PROBLEMS_IW -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_IW $(pluralCheckNoun link $LINK_PROBLEMS_IW) that could be interwiki"; fi
[1122]	604
	605	# Close the log files' markup
[1070]	606	valPrint trh "ValExtLinks says goodbye."
[1064]	607	printRTFfooter
	608	printHTMfooter
	609
	610	# Upload report if this was requested
	611	if [ ! -z $UPLOAD_INFO ] && [ $FINISHED_LIST != "no" ]; then
	612	uploadReport
	613	fi
	614
	615	# Really quit now
	616	valPrint c "ValExtLinks says goodbye."
	617	exit 0
	618	}
	619	trap wrapupAndExit INT
	620
	621
	622	### INITIALIZATION ###
	623	# Print opening message to console and log files
	624	valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
	625	printTXTheader
	626	printRTFheader
	627	printHTMheader
	628
## DATA SOURCING ##
valPrint t "Startup:"
valPrint r "\b1 Startup \b0"
valPrint hn "<h3>Startup</h3>"

# Attempt to download file at LINKS_URL, then check that it succeeded. The local copy is named after
# whatever follows the last slash in the URL. Both the exit status of 'curl' and the presence of the
# output file are checked, so a failed transfer is not mistaken for success.
valPrint cwtrhn "Downloading list of external links from $LINKS_URL..."
LINKS_FILE_NAME=${LINKS_URL##*/}
LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
if ! curl --silent -o "$LINKS_FILE" "$LINKS_URL" || [[ ! -f "$LINKS_FILE" ]]; then
   echo -e "\nThe download of $LINKS_URL appears to have failed. Aborting."
   wrapupAndExit
else
   valPrint ctrh " success."
fi

# Attempt to download file at EXCEPT_URL, then check that it succeeded
if [[ -n "$EXCEPT_URL" ]]; then
   valPrint cwtrhn "Downloading list of reporting exceptions from $EXCEPT_URL..."
   EXCEPT_DATA=$(curl --silent "$EXCEPT_URL")
   if [[ -z "$EXCEPT_DATA" ]]; then
      echo -e "\nThe download of the exceptions data from '$EXCEPT_URL' appears to have failed. Aborting."
      wrapupAndExit
   else
      valPrint ctrh " success."
   fi

   # Keep only the text between the "BEGIN LIST" and "END LIST" markers on the page
   EXCEPT_DATA=${EXCEPT_DATA%END LIST*}
   EXCEPT_DATA=${EXCEPT_DATA#*BEGIN LIST}
   EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"

   # Store on disk for debugging purposes
   printf '%s\n' "$EXCEPT_DATA" > "$EXCEPT_FILE"

   # Transfer to array for easy searching later, one non-empty line per element, and create parallel
   # arrays for marking which exceptions get used later. Reading line by line (rather than letting
   # the shell word-split the data) keeps the '*' and '?' characters found in exception entries
   # from being treated as filename globs.
   declare -a EXCEPT_ARRAY=()
   declare -a EXCEPT_USED=()
   declare -a EXCEPT_FOUND=()
   while IFS= read -r EXCEPT_ENTRY || [[ -n "$EXCEPT_ENTRY" ]]; do
      if [[ -z "$EXCEPT_ENTRY" ]]; then
         continue
      fi
      EXCEPT_ARRAY+=("$EXCEPT_ENTRY")
      EXCEPT_USED+=(0)
      EXCEPT_FOUND+=(0)
   done <<< "$EXCEPT_DATA"
fi

# Feed LINKS_FILE to 'wc' on stdin because passing it as an argument would print the file name
# after the line count
LINK_COUNT_STRING=$(wc -l < "$LINKS_FILE")

# Number of URLs is number of lines minus one (first line is column header row for the CSV); 'wc'
# may pad its output with spaces, so strip all whitespace first
LINK_COUNT=${LINK_COUNT_STRING//[[:space:]]/}
LINK_COUNT=$((LINK_COUNT - 1))
valPrint ctrh "Found $LINK_COUNT links to process."
valPrint trh ""

## CONFIG OUTPUT ##
# Echo the effective settings for this run to the console and all three reports
valPrint t "Config:"
valPrint r "\b1 Config \b0"
valPrint hn "<h3>Config</h3>"

# Describe the range of links that will be considered, which depends on URL_START and URL_LIMIT
valPrint ctrhn "Links to consider: "
if (( URL_LIMIT != 0 && URL_LIMIT < LINK_COUNT )); then
   valPrint ctrh "$((URL_LIMIT-URL_START+1)), from link $URL_START to $URL_LIMIT"
elif (( URL_START != 1 )); then
   valPrint ctrh "$((LINK_COUNT-URL_START+1)), from link $URL_START to $LINK_COUNT"
else
   valPrint ctrh "$LINK_COUNT"
fi

valPrint ctrh "Site query timeout: $TIMEOUT seconds"

valPrint ctrhn "Show OK links: "
if (( RECORD_OK_LINKS == 1 )); then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi

valPrint ctrhn "Take screenshots: "
if (( TAKE_PAGE_SHOT == 1 )); then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi

valPrint ctrhn "Suggest archive.org snapshots for NG pages: "
if (( SUGGEST_SNAPSHOTS_NG == 1 )); then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi

valPrint ctrhn "Suggest archive.org snapshots for OK pages: "
if (( SUGGEST_SNAPSHOTS_OK == 1 )); then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi

# The next three settings are "show" flags but are reported as "ignore", hence the inverted answers
valPrint ctrhn "Ignore slash-adding redirects: "
if (( SHOW_SLASH == 1 )); then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi

valPrint ctrhn "Ignore HTTPS-upgrading redirects: "
if (( SHOW_HTTPS == 1 )); then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi

valPrint ctrhn "Ignore youtu.be redirects: "
if (( SHOW_YT_RD == 1 )); then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi

valPrint ctrhn "Check archive.org and archive.is links: "
if (( CHECK_ARCHIVE_LINKS == 1 )); then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi

valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
valPrint trh ""

## LEGEND OUTPUT ##
# Explain the status codes used in the "Links" section. Lines that contain a hyperlink are printed
# separately for TXT, RTF and HTML because each format needs its own link markup.
valPrint t "Legend:"
valPrint r "\b1 Legend \b0"
valPrint hn "<h3>Legend</h3>"
valPrint t "(For guidance in fixing these links, see $WIKI_MAIN. The exceptions list is at $EXCEPT_URL.)"
valPrint r "(For guidance in fixing these links, see {\field{\*\fldinst{HYPERLINK \"$WIKI_MAIN\"}}{\fldrslt here}}. The exceptions list is {\field{\*\fldinst{HYPERLINK \"$EXCEPT_URL\"}}{\fldrslt here}}.)"
valPrint h "(For guidance in fixing these links, see <a href=\"$WIKI_MAIN\" target=\"_blank\">here</a>. The exceptions list is <a href=\"$EXCEPT_URL\" target=\"_blank\">here</a>.)"
valPrint trh "OK = URL seems to be working"
valPrint trh "NG = URL no longer seems to work"
valPrint trh "RD = URL is redirecting to this new URL"
valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup"
valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup"
valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $WIKI_HTTP)"
valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$WIKI_HTTP\"}}{\fldrslt here}} for code reference)"
valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$WIKI_HTTP\" target=\"_blank\">here</a> for code reference)"
valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $WIKI_CURL)"
valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$WIKI_CURL\"}}{\fldrslt here}} for code reference)"
valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$WIKI_CURL\" target=\"_blank\">here</a> for code reference)"
valPrint trh "IA suggests = Last available snapshot returned by the Internet Archive"
valPrint trh "Try browsing = The Archive failed to return a snapshot URL, so check for a snapshot manually using this link"
valPrint trh ""


	751	### MAIN LOOP ###
[1120]	752	valPrint t "Links:"
	753	valPrint r "\b1 Links \b0"
	754	valPrint hn "<h3>Links</h3>"
[1118]	755	START_RUN=$(date +%s)
[1064]	756	# Process each line of the .csv in LINKS_FILE
	757	for LINE in `cat "$LINKS_FILE"`; do
[1147]	758	START_LINK=$(date +%s)
[1064]	759	let LINK_NUM+=1
	760
	761	# First line is the column header row for the CSV, so let's verify that the format hasn't changed
	762	if [ $SKIPPED_HEADER_ROW -eq 0 ]; then
	763	if [ $LINE == "namespace,title,target" ]; then
	764	SKIPPED_HEADER_ROW=1
[1148]	765	LINK_NUM=0 # this line is not a link, so reset the link counter
[1064]	766	valPrint hn "<table>"
	767	continue
	768	else
	769	valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
	770	wrapupAndExit
	771	fi
	772	fi
	773
	774	# Skip this link if we are not at URL_START yet
	775	if [ $LINK_NUM -lt $URL_START ]; then
	776	continue
	777	fi
	778
	779	# Stop if we are at the limit declared for testing purposes
	780	if [ $URL_LIMIT -gt 0 ] && [ $LINK_NUM -gt $URL_LIMIT ]; then
	781	FINISHED_LIST="limit"
	782	wrapupAndExit
	783	fi
	784
	785	# Print progress to screen
	786	if [ $LINK_NUM -gt 1 ]; then
	787	printf "\e[1A\n" # erase previous progress message so that new one appears in its place
	788	fi
	789	valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
	790
	791	# The number of the namespace is the element before the first comma on the line
	792	NS_ID=${LINE%%,*}
	793
	794	# Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
	795	NS_NAME=""
	796	a=0
[1069]	797	while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
[1118]	798	if [ $NS_ID == "NULL" ]; then
	799	break
	800	elif [ $NS_ID -eq ${NS_IDS[$a]} ]; then
[1064]	801	NS_NAME="${NS_NAMES[$a]}"
	802	break
	803	fi
	804	let a+=1
	805	done
[1118]	806	if [ "$NS_NAME" == "" ]; then
	807	if [ $NS_ID == "NULL" ]; then
[1123]	808	valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace (and probably the page too) is 'NULL'. Probably the link is no longer in existence on the wiki."
[1118]	809	else
[1123]	810	valPrint trs "Skipping line $LINK_NUM ('$LINE') because I could not find a name for namespace ID $NS_ID."
[1118]	811	fi
[1064]	812	let SKIP_UNK_NS+=1
[1148]	813	let PAGE_LINKS+=1
[1064]	814	continue
	815	fi
	816
	817	# The name of the page is everything between the namespace ID and the next comma on the line (commas
	818	# in page names will break this)
	819	PAGE_NAME=${LINE#$NS_ID,}
	820	PAGE_NAME=${PAGE_NAME%%,*}
	821
[1070]	822	# Build longer wiki page URLs from namespace and page names
[1122]	823	FULL_PAGE_PATH=https://$WIKI_PATH/$NS_NAME:$PAGE_NAME
[1070]	824	LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
	825	# Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it
	826	# explicitly breaks the link
	827	if [ $NS_ID -eq 0 ]; then
[1122]	828	FULL_PAGE_PATH=https://$WIKI_PATH/$PAGE_NAME
[1070]	829	LOCAL_PAGE_PATH=$PAGE_NAME
	830	fi
	831
[1149]	832	# We don't want to consider wiki pages ending in .js, as the MW parser cannot reliably isolate URLS
	833	# in JavaScript code, so it returns erroneous links
	834	PAGE_NAME_SUFFIX=$(echo $PAGE_NAME \| sed 's/.*\.//')
	835	if [ $PAGE_NAME_SUFFIX == "js" ]; then
	836	valPrint trs "Skipping URL '${LINE#$NS_ID,$PAGE_NAME,}' on line $LINK_NUM because it was found on JavaScript page '$LOCAL_PAGE_PATH'."
	837	let SKIP_JS_PAGE+=1
	838	let PAGE_LINKS+=1
	839	continue
	840	fi
	841
[1064]	842	# The URL being linked to is everything after the previous two fields (this allows commas to be in
	843	# the URLs, but a comma in the previous field, the page name, will break this)
	844	URL=${LINE#$NS_ID,$PAGE_NAME,}
	845
	846	# Scan for illegal characters
	847	if [[ $URL == [$ILLEGAL_CHARS] ]]; then
[1149]	848	valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because it contains characters illegal in a URL."
[1064]	849	let SKIP_BAD_URL+=1
[1148]	850	let PAGE_LINKS+=1
[1064]	851	continue
	852	fi
	853
[1158]	854	# If we're skipping archive links, see if this is one
	855	if [ $CHECK_ARCHIVE_LINKS -eq 0 ] && [[ ( $URL == web.archive.org \|\| $URL == archive.is ) ]]; then
	856	valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to check archive links."
	857	let SKIP_ARCHIVES+=1
[1148]	858	let PAGE_LINKS+=1
[1135]	859	continue
	860	fi
	861
[1064]	862	# Now we need to know if the URL is for a file or a web page. First step is to determine if the
	863	# URL ends in a suffix
	864	HAS_SUFFIX=0
	865
	866	# If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
[1070]	867	CLEAN_URL=${URL%%\?*}
[1064]	868
	869	# If the URL ends in something like "#section_15", strip everything from the '#' onward
[1070]	870	CLEAN_URL=${CLEAN_URL%%\#*}
[1064]	871
[1175]	872	# 'sed' cannot handle Unicode in my Bash shell, so skip non-ASCII URL and make reader check it
[1070]	873	if [[ $CLEAN_URL == [![:ascii:]] ]]; then
[1149]	874	valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I cannot handle non-ASCII characters."
[1064]	875	let SKIP_NON_ASCII+=1
[1148]	876	let PAGE_LINKS+=1
[1064]	877	continue
	878	fi
	879
	880	# Isolate the characters after the last period and after the last slash
[1070]	881	POST_DOT=$(echo "$CLEAN_URL" \| sed 's/.*\.//')
	882	POST_SLASH=$(echo "$CLEAN_URL" \| sed 's/.*\///')
[1064]	883
	884	# If the last period comes after the last slash, then the URL ends in a suffix
	885	POST_DOT_LENGTH=$(echo \| awk -v input=$POST_DOT '{print length(input)}')
	886	POST_SLASH_LENGTH=$(echo \| awk -v input=$POST_SLASH '{print length(input)}')
	887	if [ $POST_DOT_LENGTH -lt $POST_SLASH_LENGTH ]; then
	888	HAS_SUFFIX=1
	889	else
	890	HAS_SUFFIX=0
	891	fi
	892
	893	# Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
	894	# known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES
	895	IS_FILE=-1
	896	if [ $HAS_SUFFIX -eq 0 ]; then
	897	IS_FILE=0
	898	else
	899	# Turn off case sensitivity while we compare suffixes
	900	shopt -s nocasematch
	901
[1127]	902	# Special case: URLs ending in something like "productID.304297400" are pages, not files, so if
[1064]	903	# the URL's suffix is all numbers, we are looking at the end of a web page URL
	904	if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
	905	IS_FILE=0
	906	fi
[1127]	907
	908	# Special case: URLs ending in a parens, e.g. "ms537113(v=vs.85)", are pages
	909	if [[ $POST_DOT =~ ^.*[\(\)]$ ]]; then
	910	IS_FILE=0
	911	fi
	912
	913	# Special case: URLs containing a '%', e.g. "10.1007%2FBF00329055", are pages
	914	if [[ $POST_DOT == % ]]; then
	915	IS_FILE=0
	916	fi
[1064]	917
	918	# If we did not identify this URL as a web page above, we need to compare the suffix against known
	919	# file extensions
	920	if [ $IS_FILE -eq -1 ]; then
	921	for EXTENSION in "${HTTP_FILES[@]}"; do
	922	if [[ $EXTENSION == $POST_DOT ]]; then # double brackets respect the nocasematch setting
	923	IS_FILE=1
	924	break
	925	fi
	926	done
	927	fi
	928
	929	# If we did not identify this URL as a file above, we need to compare the suffix against known
	930	# pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
	931	# needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
	932	if [ $IS_FILE -eq -1 ]; then
	933	for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
	934	if [[ $PAGE_OR_TLD == $POST_DOT ]]; then
	935	IS_FILE=0
	936	break
	937	fi
	938	done
	939	fi
	940
	941	# Turn case sensitivity back on in Bash
	942	shopt -u nocasematch
	943	fi
	944
[1175]	945	# If this suffix escaped identification as either a file, page or TLD, inform the reader
[1064]	946	STR_TYPE=""
	947	if [ $IS_FILE -eq -1 ]; then
[1160]	948	valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I encountered the unknown URL suffix '$POST_DOT'. Please add this suffix to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
[1064]	949	let SKIP_UNK_SUFFIX+=1
	950	continue
	951	elif [ $IS_FILE -eq 1 ]; then
	952	STR_TYPE="file"
	953	let FILE_LINKS+=1
[1148]	954	else
[1064]	955	STR_TYPE="page"
	956	let PAGE_LINKS+=1
	957	fi
	958
	959	# Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
	960	# issue with sites that require HTTPS
[1158]	961	CURL_CODE=$(curl -o /dev/null --silent --insecure --compressed --head --user-agent '$AGENT' --max-time $TIMEOUT --retry 2 --write-out '%{http_code}\n' $URL)
[1064]	962	CURL_ERR=$(echo $?)
	963	CURL_RESULT=$CURL_CODE
	964
	965	# Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
	966	if [ $CURL_CODE == "000" ]; then
	967	CURL_RESULT="$CURL_RESULT-$CURL_ERR"
	968	fi
	969
[1070]	970	# Begin to determine our status code for this URL (EI, IW, OK, RD, or NG)
[1064]	971	STATUS="??"
[1067]	972	NEW_URL=""
[1064]	973	INTERWIKI_INDEX=-1
	974
[1070]	975	# First make sure that this isn't an "external internal" link to our own wiki that can be replaced
	976	# by "[[page_name]]". If it uses a special access URL beginning with "/w/", let it pass, as it
	977	# probably cannot be replaced by "[[ ]]" markup
	978	if [[ $URL == $WIKI_PATH ]] && [[ $URL != $WIKI_PATH/w/ ]]; then
	979	STATUS="EI"
	980	let EI_LINKS+=1
	981	fi
	982
[1144]	983	# If it's not, check if this is a link to a domain that we have an interwiki prefix for (also make
	984	# sure that it's not an archive.org link to a page from an interwiki domain)
	985	if [ $STATUS == "??" ] && [[ $URL != web.archive.org ]]; then
[1070]	986	for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
	987	if [[ $URL == ${INTERWIKI_DOMAINS[$i]} ]] && [[ $URL != ${INTERWIKI_DOMAINS[$i]}/w/ ]]; then
	988	STATUS="IW"
	989	let IW_LINKS+=1
	990	INTERWIKI_INDEX=$i
	991	break
	992	fi
	993	done
	994	fi
	995
[1069]	996	# If we didn't match an interwiki domain, see if the status code is in our "OK" codes list
	997	if [ $STATUS == "??" ]; then
	998	for CODE in "${OK_CODES[@]}"; do
	999	if [[ $CODE == $CURL_CODE ]]; then
	1000	STATUS="OK"
	1001	let OK_LINKS+=1
[1148]	1002
	1003	# If this is a YouTube link, we have to look at the actual page source to know if the video
[1157]	1004	# is good or not; override the link's info if it's actually NG
[1148]	1005	if [[ $URL == www.youtube.com ]]; then
	1006	PAGE_TEXT=$(curl --silent --insecure --user-agent '$AGENT' --max-time $TIMEOUT $URL \| grep "\"simpleText\":\"Video unavailable\"")
	1007	if [ ! -z "$PAGE_TEXT" ]; then
	1008	STATUS="NG"
[1157]	1009	CURL_RESULT=404
[1148]	1010	let OK_LINKS-=1
	1011	let NG_LINKS+=1
	1012	fi
	1013	fi
[1069]	1014	break
	1015	fi
	1016	done
	1017	fi
	1018
[1067]	1019	# If we didn't get a match with the "OK" codes, check it against the "RD" codes
[1064]	1020	if [ $STATUS == "??" ]; then
[1067]	1021	for CODE in "${RD_CODES[@]}"; do
	1022	if [[ $CODE == $CURL_CODE ]]; then
	1023	# Get URL header again in order to retrieve the URL we are being redirected to
[1141]	1024	NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent '$AGENT' --max-time $TIMEOUT --write-out '%{redirect_url}\n' $URL)
[1067]	1025
[1122]	1026	# Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
	1027	# those changes out if the user didn't ask for them
	1028	URL_HTTP=$(echo $URL \| sed -E 's/^https:/http:/')
	1029	NEW_URL_HTTP=$(echo $NEW_URL \| sed -E 's/^https:/http:/')
[1070]	1030
	1031	# Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
[1122]	1032	NEW_URL_LENGTH=$(echo \| awk -v input=$NEW_URL_HTTP '{print length(input)}')
[1070]	1033	if [ $NEW_URL_LENGTH -lt $MIN_URL_LENGTH ]; then
[1122]	1034	NEW_URL_HTTP="[new URL not retrieved]"
[1070]	1035	fi
	1036
[1122]	1037	# Remove slash at end of new URL, if present, so we can filter out the redirects that
	1038	# merely add an ending slash if the user didn't ask for them
	1039	NEW_URL_NO_SLASH=$(echo $NEW_URL_HTTP \| sed -E 's:/$::')
	1040
[1127]	1041	# Detect if this is a youtu.be link simply being expanded by YouTube to the full
	1042	# youtube.com address
	1043	YOUTU_BE=0
	1044	if [[ $URL_HTTP == http://youtu.be* ]] && [[ $NEW_URL_HTTP == http://www.youtube.com* ]]; then
	1045	YOUTU_BE=1
	1046	fi
	1047
[1122]	1048	# If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
	1049	# wants those to be reported)
	1050	if [ $SHOW_HTTPS -eq 0 ] && [ $URL_HTTP == $NEW_URL_HTTP ]; then
[1149]	1051	valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show http->https upgrades, and I was redirected to '$NEW_URL'."
[1069]	1052	STATUS="OK"
	1053	let OK_LINKS+=1
[1122]	1054	let SKIP_HTTPS_UP+=1
	1055	# If the URLs match besides an added ending slash, then the link is OK (unless user wants
	1056	# those to be reported)
	1057	elif [ $SHOW_SLASH -eq 0 ] && [ $URL_HTTP == $NEW_URL_NO_SLASH ]; then
[1149]	1058	valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show added trailing slashes, and I was redirected to '$NEW_URL'."
[1122]	1059	STATUS="OK"
	1060	let OK_LINKS+=1
	1061	let SKIP_SLASH_ADD+=1
[1148]	1062	elif [ $YOUTU_BE -eq 1 ]; then
	1063	# We have to look at the actual page source to know if a YouTube video is good or not
	1064	PAGE_TEXT=$(curl --silent --insecure --user-agent '$AGENT' --max-time $TIMEOUT $NEW_URL \| grep "\"simpleText\":\"Video unavailable\"")
	1065	if [ ! -z "$PAGE_TEXT" ]; then
	1066	STATUS="NG"
	1067	let NG_LINKS+=1
	1068	else
	1069	if [ $SHOW_YT_RD -eq 0 ]; then
[1149]	1070	valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'."
[1148]	1071	STATUS="OK"
	1072	let OK_LINKS+=1
	1073	let SKIP_YOUTU_BE+=1
	1074	else
	1075	STATUS="RD"
	1076	let RD_LINKS+=1
	1077	fi
	1078	fi
[1069]	1079	else
	1080	STATUS="RD"
	1081	let RD_LINKS+=1
	1082	fi
[1067]	1083	break
	1084	fi
	1085	done
	1086	fi
	1087
	1088	# If we didn't get a match with the "RD" codes, check it against the "NG" codes
	1089	if [ $STATUS == "??" ]; then
[1064]	1090	for CODE in "${NG_CODES[@]}"; do
	1091	if [[ $CODE == $CURL_CODE ]]; then
	1092	STATUS="NG"
	1093	let NG_LINKS+=1
	1094	break
	1095	fi
	1096	done
	1097	fi
	1098
	1099	# If we didn't match a known status code, advise the reader
	1100	if [ $STATUS == "??" ]; then
[1149]	1101	valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I encountered the unknown response code $CURL_CODE."
[1064]	1102	let SKIP_UNK_CODE+=1
	1103	continue
	1104	fi
	1105
[1136]	1106	# Check problem links against exceptions list before proceeding
	1107	FOUND_EXCEPT=0
[1175]	1108	if [ $STATUS != "OK" ] && [ ! -z "$EXCEPT_URL" ]; then
[1070]	1109	# The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
	1110	EXPECT_CODE="$CURL_RESULT"
	1111	if [ $STATUS == "EI" ]; then
	1112	EXPECT_CODE="EI"
	1113	elif [ $STATUS == "IW" ]; then
	1114	EXPECT_CODE="IW"
	1115	fi
	1116
[1136]	1117	# Look for link in exceptions list and make sure the listed result code and wiki page also match
	1118	for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
	1119	{
	1120	EXCEPT_LINE="${EXCEPT_ARRAY[$i]}"
	1121
[1142]	1122	# Undo any HTML-encoding from the wiki page; for now we just worry about the ampersand, as most
	1123	# other HTML-encoded characters are not found in URLs
[1146]	1124	EXCEPT_LINE=$(echo "$EXCEPT_LINE" \| sed 's/\&/\&/g')
[1142]	1125
[1175]	1126	# Check for URL match
[1136]	1127	EXCEPT_URL="${EXCEPT_LINE#*,}"
	1128	EXCEPT_URL="${EXCEPT_URL%,*}"
	1129	if [ "$EXCEPT_URL" != "$URL" ]; then
[1070]	1130	continue
	1131	fi
[1136]	1132
[1175]	1133	# Check for page name match
[1136]	1134	EXCEPT_PAGE="${EXCEPT_LINE##*,}"
	1135	EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
[1175]	1136	if [ "$EXCEPT_PAGE" == "*" ] \|\| [ "$EXCEPT_PAGE" == "$LOCAL_PAGE_PATH" ]; then
	1137	let EXCEPT_FOUND[$i]+=1
	1138	valPrint trs "Found exception '$URL' on page '$LOCAL_PAGE_PATH'."
	1139
	1140	# Check for result code match
[1136]	1141	EXCEPT_CODE=${EXCEPT_LINE%%,*}
	1142	if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
[1175]	1143	FOUND_EXCEPT=1
	1144	let EXCEPT_USED[$i]+=1
[1149]	1145	valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because its expected result, '$EXPECT_CODE', is in the exceptions list."
[1175]	1146
[1136]	1147	if [ $STATUS == "EI" ]; then
	1148	let SKIP_EXPECT_EI+=1
	1149	elif [ $STATUS == "IW" ]; then
	1150	let SKIP_EXPECT_IW+=1
[1142]	1151	elif [ $STATUS == "RD" ]; then
	1152	let SKIP_EXPECT_RD+=1
[1136]	1153	else
	1154	let SKIP_EXPECT_NG+=1
	1155	fi
[1175]	1156
[1136]	1157	break
	1158	fi
	1159	fi
	1160	} done
[1064]	1161	fi
[1136]	1162	if [ $FOUND_EXCEPT -eq 1 ]; then
	1163	continue
	1164	fi
[1064]	1165
	1166	# If appropriate, record this link to the log, with clickable URLs when possible
	1167	if [ $STATUS != "OK" ] \|\| [ $RECORD_OK_LINKS -eq 1 ]; then
[1125]	1168	# Prepare 'curl' result in parentheses to print after status code, unless this is an "IW" or "EI"
	1169	# link, in which case showing the status code doesn't make sense. Adjust spacing after string to
	1170	# ensure TXT and RTF reports have aligned columns of results.
	1171	CURL_STR_H=" ($CURL_RESULT)"
	1172	CURL_STR_T="$CURL_STR_H"
	1173	CURL_STR_R="$CURL_STR_H "
[1070]	1174	if [ $STATUS == "IW" ] \|\| [ $STATUS == "EI" ]; then
[1125]	1175	CURL_STR_H=""
	1176	CURL_STR_T=" "
	1177	CURL_STR_R=" "
[1064]	1178	fi
	1179
	1180	# Record link and its wiki page in TXT, RTF, and HTML markup
[1125]	1181	valPrint t "${STATUS}${CURL_STR_T} $STR_TYPE $URL"
[1064]	1182	valPrint t " linked from $FULL_PAGE_PATH"
[1125]	1183	valPrint r "${STATUS}${CURL_STR_R}${RTF_TABS}$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
[1064]	1184	valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
[1125]	1185	valPrint hn "<tr><td style=\"white-space:nowrap\">${STATUS}${CURL_STR_H}</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
[1064]	1186	valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"
	1187
[1123]	1188	# Place vertical space here since we won't be printing anything more about this link
[1147]	1189	if [ $STATUS == "OK" ] && [ $SUGGEST_SNAPSHOTS_OK -eq 0 ]; then valPrint tr ""; valPrint hs ""; fi
[1123]	1190
[1067]	1191	# Record redirect URL if one was given by a 3xx response page
	1192	if [ $STATUS == "RD" ]; then
[1119]	1193	valPrint ts " Server suggests $NEW_URL"
	1194	valPrint rs " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
	1195	valPrint hs "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
[1067]	1196	fi
	1197
[1070]	1198	# Notify reader if we can use an intrawiki link for this URL
	1199	if [ $STATUS == "EI" ]; then
[1075]	1200	INTRA_PAGE=${URL#:///}
[1119]	1201	valPrint ts " Just use [[$INTRA_PAGE]]"
	1202	valPrint rs " Just use [[$INTRA_PAGE]]"
	1203	valPrint hs "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
[1070]	1204	fi
	1205
[1064]	1206	# Notify reader if we can use an interwiki prefix for this URL
	1207	if [ $STATUS == "IW" ]; then
[1075]	1208	INTER_PAGE=$(echo "$URL" \| sed 's/.*\///')
[1119]	1209	valPrint ts " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
	1210	valPrint rs " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
	1211	valPrint hs "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
[1064]	1212	fi
	1213
	1214	# Query Internet Archive for latest "OK" snapshot for "NG" page
[1147]	1215	if [[ ( $STATUS == "NG" && $SUGGEST_SNAPSHOTS_NG -eq 1 ) \|\| ( $STATUS == "OK" && $SUGGEST_SNAPSHOTS_OK -eq 1 ) ]]; then
	1216
	1217	# We need to watch out for the rate limit or we'll get locked out; look at how much time has
	1218	# elapsed and then wait the remainder between that and how long of a wait we think is needed
	1219	# to avoid the dreaded "Too Many Requests" response. 5 seconds is just a guess.
	1220	CUR_TIME=$(date +%s)
	1221	WAIT_REMAINDER=$((5 - $CUR_TIME + $START_LINK))
	1222	if [ $WAIT_REMAINDER -gt 0 ]; then
	1223	valPrint t "Waiting $WAIT_REMAINDER second(s) to conform to Archive.org rate limit."
	1224	sleep $WAIT_REMAINDER
	1225	fi
	1226
	1227	# Issue query to the API
[1141]	1228	ARCHIVE_QUERY=$(curl --silent --max-time $TIMEOUT "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")
[1064]	1229
[1175]	1230	# Notify reader if we hit the rate limit and just keep going
[1147]	1231	if [[ "$ARCHIVE_QUERY" == "Too Many Requests" ]]; then
	1232	valPrint t " IA has rate-limited us!"
	1233	valPrint r " IA has rate-limited us!"
	1234	valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td>(hit the API rate limit!)</td></tr>"
[1175]	1235	# If a "closest" snapshot was received, inform reader
[1147]	1236	elif [[ "$ARCHIVE_QUERY" == \"closest\": ]]; then
[1118]	1237	# In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
	1238	ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" \| sed 's/#!/#\\!/')
	1239
	1240	# ...isolate "url" property in the response that follows the "closest" tag
	1241	SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
[1119]	1242	SNAPSHOT_URL=${SNAPSHOT_URL#*\"url\": \"} # everything after '"url": "'
[1118]	1243	SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'
	1244
[1124]	1245	# Remove the port 80 part that IA often adds to the URL, as it's superfluous
	1246	SNAPSHOT_URL=$(echo $SNAPSHOT_URL \| sed 's/:80//')
	1247
[1175]	1248	# Inform the reader of the snapshot URL
[1119]	1249	valPrint ts " IA suggests $SNAPSHOT_URL"
	1250	valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
	1251	valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
[1147]	1252	else # Otherwise give a generic Wayback Machine link for this URL, which might work
[1119]	1253	valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
	1254	valPrint rs " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
	1255	valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
[1064]	1256	fi
	1257	fi
	1258	fi
	1259
	1260	# If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
	1261	if [ $IS_FILE -eq 0 ] && [ $TAKE_PAGE_SHOT -eq 1 ] && [ $STATUS == "OK" ]; then
	1262	# Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
	1263	SHOT_NAME=$(echo "$URL" \| sed 's/https*\:\/\///' \| sed 'y/:\//__/')
	1264	SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"
	1265
	1266	# Don't take screenshot if we already encountered this page and screenshotted it
	1267	if [ ! -f "$SHOT_FILE" ]; then
[1070]	1268	"$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 $URL > /dev/null 2>&1
[1064]	1269	if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
	1270	mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
	1271	else
[1119]	1272	valPrint trhs "Screenshot of URL $URL seems to have failed!"
[1064]	1273	fi
	1274	else
[1123]	1275	valPrint trhs "Skipping screenshot of URL '$URL' because file '$SHOT_FILE' already exists."
[1064]	1276	fi
	1277	fi
	1278	done
	1279	FINISHED_LIST="yes"
	1280	wrapupAndExit

Note: See TracBrowser for help on using the repository browser.

Download in other formats: