Context Navigation

source: Validate External Links/validate_external_links.sh@ 1122

Last change on this file since 1122 was 1122, checked in by iritscen, 5 years ago
Val now links to wiki pages using HTTPS instead of HTTP. Fixed code that exempts minor forms of redirects from being listed. New arguments --show-added-slashes and --show-https-upgrade allow one to turn off these exemptions. Reworked summary section extensively to be more readable.
File size: 44.8 KB

Rev	Line
[1064]	1	#!/bin/bash
	2
	3	# Validate External Links by Iritscen
	4	# Provided with a list of external links found in the OniGalore wiki, this script validates them.
	5	# The resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF
	6	# (for reading as a local file with clickable links), and HTML (for uploading as a web page).
	7	# Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
	8	# Recommended rule:
[1118]	9	# \|----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----\|
[1064]	10
	11	# Set separator token to newline
	12	IFS="
	13	"
	14
	15	### GLOBALS ###
	16	# Settings -- these will be changed from their defaults by the arguments passed in to the script
	17	LINKS_URL="" # use 'curl' to download file with links from this location (can be file://)
	18	EXCEPT_URL="" # ditto above for file with exceptions to NG results
	19	OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
[1070]	20	RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES
[1122]	21	SHOW_SLASH=0 # record response code to the log when a slash is added to the end of a URL
	22	SHOW_HTTPS=0 # record response code to the log when "http" is upgraded to "https"
[1064]	23	SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
	24	TAKE_PAGE_SHOT=0 # take a screenshot of each OK page
[1070]	25	CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature
[1064]	26	URL_START=1 # start at this URL in LINKS_FILE (1 by default)
	27	URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE
	28	UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report
	29
	30	# Fixed strings -- see the occurrences of these variables to learn their purpose
[1118]	31	AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36 OPR/67.0.3575.53"
[1064]	32	ARCHIVE_API="http://archive.org/wayback/available"
	33	ARCHIVE_GENERIC="https://web.archive.org/web/*"
	34	ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
	35	CHROME_SCREENSHOT="screenshot.png"
[1066]	36	CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt"
[1064]	37	EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
[1066]	38	HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt"
[1122]	39	MY_WIKI_PAGE="https://wiki.oni2.net/User:Iritscen"
[1064]	40	THIS_DIR=$(cd $(dirname $0); pwd)
	41	WORKING_DIR=$(pwd)
	42	WIKI_PATH="wiki.oni2.net"
	43
	44	# These are parallel arrays of the IDs and names of OniGalore's current namespaces
	45	declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
	46	declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")
	47
	48	# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
[1070]	49	# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
[1064]	50	declare -a HTTP_FILES=(txt zip wmv jpg png m4a bsl rar oni mp3 mov ONWC vbs TRMA mp4 doc avi log gif pdf dmg exe cpp tga 7z wav east BINA xml dll dae xaf fbx 3ds blend flv csv)
	51	declare -a HTTP_TLDS_AND_PAGES=(com net org uk ru de it htm html php pl asp aspx shtml pgi cgi php3 x jsp phtml cfm css action stm js)
	52
[1067]	53	# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
	54	# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
	55	# if you add a new code.
	56	declare -a OK_CODES=(200 401 405 406 501)
	57	declare -a RD_CODES=(301 302 303 307 308)
	58	declare -a NG_CODES=(000 403 404 410 500 503)
[1064]	59
	60	# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
	61	# transcluded text, and if the transclusion fails, then the braces show up in the URL
	62	ILLEGAL_CHARS="{ }"
	63
[1070]	64	# The shortest URL possible, used for sanity-checking some URLs: http://a.co
	65	MIN_URL_LENGTH=11
	66
[1064]	67	# These are parallel arrays giving the prefixes that can be used in place of normal external links to
	68	# some wikis and other sites
[1070]	69	declare -a INTERWIKI_PREFIXES=(commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies wikiversity wikivoyage wikt wp)
	70	declare -a INTERWIKI_DOMAINS=(commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org wikiversity.org wikivoyage.org wiktionary.org wikipedia.org)
[1064]	71
	72	# Variables for keeping track of main loop progress and findings
	73	LINK_NUM=0
[1070]	74	EI_LINKS=0
	75	IW_LINKS=0
[1064]	76	OK_LINKS=0
[1067]	77	RD_LINKS=0
[1064]	78	NG_LINKS=0
	79	SKIP_UNK_NS=0
	80	SKIP_JS_PAGE=0
	81	SKIP_BAD_URL=0
	82	SKIP_NON_ASCII=0
	83	SKIP_UNK_SUFFIX=0
	84	SKIP_UNK_CODE=0
[1070]	85	SKIP_EXPECT_NG=0
	86	SKIP_EXPECT_EI=0
	87	SKIP_EXPECT_IW=0
[1122]	88	SKIP_HTTPS_UP=0
	89	SKIP_SLASH_ADD=0
[1064]	90	FILE_LINKS=0
	91	PAGE_LINKS=0
	92	SKIPPED_HEADER_ROW=0
	93	FINISHED_LIST="no"
[1118]	94	START_RUN=0
	95	END_RUN=0
[1064]	96
	97
	98	### HELP ###
	99	# A pseudo-man page. Here is the 80-character rule for the page text:
	100	# 234567890123456789012345678901234567890123456789012345678901234567890123456789
	101	function printHelp()
	102	{
	103	cat << EOF
	104
	105	NAME
	106	Validate External Links
	107
	108	SYNOPSIS
	109	validate_external_links.sh --help
[1070]	110	validate_external_links.sh --links URL --output DIR [--exceptions URL]
[1075]	111	[--record-ok-links] [--suggest-snapshots] [--take-screenshots FILE]
[1070]	112	[--start-url NUM] [--end-url NUM] [--upload FILE]
[1064]	113
	114	DESCRIPTION
	115	This script parses a list of external links found in the OniGalore wiki
	116	(which is dumped by the Oni2.net domain periodically in a particular
	117	format), validates them using the Unix tool 'curl', and produces a report
[1070]	118	of which links were "OK" (responded positively to an HTTP query), which
	119	were "RD" (responded with a 3xx redirect code), which could be "IW"
	120	(interwiki) links, which are "EI" (external internal) links and could be
	121	intrawiki links, and which were "NG" (no good; a negative response to the
[1069]	122	query). This report can then be automatically uploaded to the location of
[1064]	123	your choice. The script can also suggest Internet Archive snapshots for
[1070]	124	"NG" links, and take screenshots of "OK" links for visual verification by
	125	the reader that the page in question is the one intended to be displayed.
[1064]	126
	127	You must pass this script the URL at which the list of links is found
[1070]	128	(--links) and the path where the directory of logs should be outputted
	129	(--output). All other arguments are optional.
[1064]	130
	131	OPTIONS
[1075]	132	--help Show this page.
	133	--links URL (required) URL from which to download the CSV
	134	file with external links. Note that this URL can
	135	be a local file if you supply a file:// path.
	136	--output DIR (required) Unix path to directory in which Val
	137	should place its reports.
	138	--exceptions URL In order to remove links from the report which
	139	Val finds an issue with, but which you regard as
	140	OK, list those desired exceptions in this file.
	141	See the sample file exceptions.txt for details.
	142	Note that this URL can point to a local file if
	143	you supply a file:// path.
	144	--record-ok-links Log a link in the report even if its response
	145	code is "OK".
[1122]	146	--show-added-slashes Report on redirects that simply add a '/' to the
	147	end of the URL.
	148	--show-https-upgrade Report on redirects that simply upgrade a
	149	"http://" URL to a "https://" URL.
[1075]	150	--suggest-snapshots Query the Internet Archive for a possible
	151	snapshot URL for each "NG" page.
	152	--take-screenshots FILE Call the Google Chrome binary at this path to
	153	take screenshots of each "OK" page.
	154	--start-url NUM Start at this link in the links CSV file.
	155	--end-url NUM Stop at this link in the links CSV file.
	156	--upload FILE Upload report using the credentials and path
	157	given in this local text file. See sftp_login.txt
	158	for template.
[1064]	159
	160	BUGS
	161	The script cannot properly parse any line in the external links file
	162	which contains a comma in the name of the wiki page containing a link.
	163	Commas in the link itself are not an issue.
	164	EOF
	165	}
	166
	167
	168	### SETUP ###
	169	# If first argument is a help request, or if nothing was passed in at all, print help page and quit
	170	if [ "$#" -eq 0 ] \|\| [ "$1" == "--help" ]; then
	171	printHelp \| less
	172	exit 0
	173	fi
	174
	175	# Parse arguments as long as there are more arguments to process
	176	while (( "$#" )); do
	177	case "$1" in
[1122]	178	--links ) LINKS_URL="$2"; shift 2;;
	179	--exceptions ) EXCEPT_URL="$2"; shift 2;;
	180	--output ) OUTPUT_DIR="$2"; shift 2;;
	181	--record-ok-links ) RECORD_OK_LINKS=1; shift;;
	182	--show-added-slashes ) SHOW_SLASH=1; shift;;
	183	--show-https-upgrade ) SHOW_HTTPS=1; shift;;
	184	--suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;;
	185	--take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
	186	--start-url ) URL_START=$2; shift 2;;
	187	--end-url ) URL_LIMIT=$2; shift 2;;
	188	--upload ) UPLOAD_INFO=$2; shift 2;;
	189	* ) echo "Invalid argument $1 detected. Aborting."; exit 1;;
[1064]	190	esac
	191	done
	192
	193	# If the required arguments were not supplied, print help page and quit
	194	if [ -z $LINKS_URL ] \|\| [ -z $OUTPUT_DIR ]; then
[1070]	195	echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
[1064]	196	exit 2
	197	fi
	198
[1070]	199	# If user wants screenshots, make sure path to Chrome was passed in and is valid
	200	if [ $TAKE_PAGE_SHOT -eq 1 ]; then
	201	if [ ! -f "$CHROME_PATH" ]; then
	202	echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
	203	exit 3
	204	fi
	205	fi
	206
[1064]	207	# Check that UPLOAD_INFO exists, if this argument was supplied
	208	if [ ! -z $UPLOAD_INFO ] && [ ! -f "$UPLOAD_INFO" ]; then
	209	echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
[1070]	210	exit 4
[1064]	211	fi
	212
	213	# Check that OUTPUT_DIR is a directory
	214	if [ ! -d "$OUTPUT_DIR" ]; then
	215	echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
[1070]	216	exit 5
[1064]	217	fi
	218
	219	# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
	220	SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
	221	NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
	222	OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
	223	OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
	224	SHOT_PATH="$OUTPUT_PATH/Screenshots"
	225	LOG_NAME="ValExtLinks report"
	226	LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt"
	227	LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf"
	228	LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm"
	229	mkdir "$OUTPUT_PATH"
	230	if [ $TAKE_PAGE_SHOT -eq 1 ]; then
	231	mkdir "$SHOT_PATH"
	232	fi
	233
	234	# Check that 'mkdir' succeeded
	235	if [ ! -d "$OUTPUT_PATH" ]; then
	236	echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_PATH. Aborting."
[1070]	237	exit 6
[1064]	238	fi
	239
	240	# Get date on the file at LINKS_URL and print to log
	241	LINKS_DATE=$(curl --silent --head $LINKS_URL \| grep "Last-Modified")
	242	if [ -z "$LINKS_DATE" ]; then
	243	echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
[1070]	244	exit 7
[1064]	245	fi
	246	LINKS_DATE=${LINKS_DATE#Last-Modified: }
	247
	248
	249	### UTILITY FUNCTIONS ###
	250	# Writes a plain-text header to TXT log file
	251	function printTXTheader()
	252	{
	253	valPrint t "Validate External Links report"
	254	valPrint t "generated $NICE_TIME"
	255	valPrint t "from data of $LINKS_DATE"
	256	valPrint t "script by Iritscen (contact: $MY_WIKI_PAGE)"
	257	valPrint t ""
	258	}
	259
	260	# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified
	261	function printRTFheader()
	262	{
	263	valPrint r "{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
	264	{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
	265	{\colortbl;\red255\green255\blue255;}
	266	{\*\expandedcolortbl;;}
	267	\margl1440\margr1440\vieww12600\viewh12100\viewkind0
	268	\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0
	269
	270	\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
	271	generated $NICE_TIME\\
	272	from data of $LINKS_DATE\\
	273	script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$MY_WIKI_PAGE\"}}{\fldrslt contact}})
	274	\\
	275	\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
	276	\cf0 "
	277	}
	278
	279	# Closes the RTF markup of the RTF log file
	280	function printRTFfooter()
	281	{
	282	valPrint r "}"
	283	}
	284
	285	# Writes the HTML header to HTML log file
	286	function printHTMheader()
	287	{
	288	valPrint h "<html>
	289	<head>
	290	<title>Validate External Links report</title>
	291	</head>
	292	<body>
	293	<h2>Validate External Links report</h2>
	294	<h3>generated $NICE_TIME<br />
	295	from data of $LINKS_DATE<br />
	296	script by Iritscen (<a href=\"$MY_WIKI_PAGE\" target=\"_blank\">contact</a>)</h3>"
	297	}
	298
	299	# Closes the HTML markup of the HTML log file
	300	function printHTMfooter()
	301	{
	302	valPrint h "</body>
	303	</html>"
	304	}
	305
	306	# The central logging function. The first parameter is a string composed of one or more characters that
[1070]	307	# indicate which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
[1119]	308	# 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 's' means "Print an extra newline at the end." 'w' means "Don't pass console output through 'fmt'" ("fmt" fits the output
	309	# to an 80-column CLI but can break special formatting and the 'n' option).
function valPrint()
{
  # $1 - string of one-letter flags, in any order:
  #        c = console, t = TXT log, r = RTF log, h = HTML log (outputs to write to)
  #        n = no line-ending markup, s = extra blank line, w = skip 'fmt' (console only)
  # $2 - the text to print
  #
  # Every flag is looked for anywhere inside $1 with a glob pattern. Comparing $1 against a bare
  # letter would only ever match single-letter calls, so combined flags such as "ctrh" or "trs"
  # would print nothing. printf '%s' is used so that the text is written verbatim: backslashes in
  # it (RTF markup, URLs) are never interpreted as escape sequences.
  local flags="$1"
  local text="$2"

  # Console: 'n' omits the newline, 'w' prints as-is, 's' adds a blank line, otherwise the text
  # is wrapped to 80 columns by 'fmt'
  if [[ "$flags" == *c* ]]; then
    if [[ "$flags" == *n* ]]; then
      printf '%s' "$text"
    elif [[ "$flags" == *w* ]]; then
      printf '%s\n' "$text"
    elif [[ "$flags" == *s* ]]; then
      printf '%s\n\n' "$text"
    else
      printf '%s\n' "$text" | fmt -w 80
    fi
  fi

  # TXT log: 'n' omits the newline, 's' adds a blank line
  if [[ "$flags" == *t* ]]; then
    if [[ "$flags" == *n* ]]; then
      printf '%s' "$text" >> "$LOG_TXT"
    elif [[ "$flags" == *s* ]]; then
      printf '%s\n\n' "$text" >> "$LOG_TXT"
    else
      printf '%s\n' "$text" >> "$LOG_TXT"
    fi
  fi

  # RTF log: a line break is the "\line" control word, so 'n' omits it and 's' doubles it. The
  # file itself always gets a newline after the text.
  if [[ "$flags" == *r* ]]; then
    if [[ "$flags" == *n* ]]; then
      printf '%s\n' "$text" >> "$LOG_RTF"
    elif [[ "$flags" == *s* ]]; then
      printf '%s\\line\\line\n' "$text" >> "$LOG_RTF"
    else
      printf '%s\\line\n' "$text" >> "$LOG_RTF"
    fi
  fi

  # HTML log: 's' appends an empty table row, 'n' appends nothing, otherwise a <br /> is added.
  # 's' is tested before 'n' here, unlike in the other outputs.
  if [[ "$flags" == *h* ]]; then
    if [[ "$flags" == *s* ]]; then
      printf '%s<tr><td> </td></tr>\n' "$text" >> "$LOG_HTM"
    elif [[ "$flags" == *n* ]]; then
      printf '%s\n' "$text" >> "$LOG_HTM"
    else
      printf '%s<br />\n' "$text" >> "$LOG_HTM"
    fi
  fi
}
	351
	352	# Pluralize the string in parameter 1 if the number in parameter 2 is not 1
	353	function pluralCheckNoun()
	354	{
	355	if [ $2 -ne 1 ]; then
	356	if [[ $1 =~ x$ ]]; then
	357	echo $1es
	358	else
	359	echo $1s
	360	fi
	361	else
	362	echo $1
	363	fi
	364	}
	365
[1067]	366	# Output "is" if parameter 1 is 1, otherwise "are"
	367	function pluralCheckIs()
	368	{
	369	if [ $1 -ne 1 ]; then
	370	echo "are"
	371	else
	372	echo "is"
	373	fi
	374	}
	375
[1064]	376	# Output "was" if parameter 1 is 1, otherwise "were"
	377	function pluralCheckWas()
	378	{
	379	if [ $1 -ne 1 ]; then
	380	echo "were"
	381	else
	382	echo "was"
	383	fi
	384	}
	385
[1067]	386	# Output "a " if parameter 1 is 1, otherwise nothing
	387	function pluralCheckA()
	388	{
	389	if [ $1 -eq 1 ]; then
	390	echo "a "
	391	fi
	392	}
	393
	394	# Output "an " if parameter 1 is 1, otherwise nothing
	395	function pluralCheckAn()
	396	{
	397	if [ $1 -eq 1 ]; then
	398	echo "an "
	399	fi
	400	}
	401
[1064]	402	# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
	403	# reports being saved to disk have already been closed.
	404	function uploadReport()
	405	{
	406	valPrint c "Uploading HTML report..."
	407
	408	SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
	409	SFTP_USER_NAME_MARKER="user:"
	410	SFTP_PASSWORD_MARKER="pw:"
	411	SFTP_PORT_MARKER="port:"
	412	SFTP_PATH_MARKER="path:"
	413	SFTP_USER_NAME=$(grep $SFTP_USER_NAME_MARKER $UPLOAD_INFO)
	414	SFTP_USER_NAME=${SFTP_USER_NAME#$SFTP_USER_NAME_MARKER}
	415	SFTP_PASSWORD=$(grep $SFTP_PASSWORD_MARKER $UPLOAD_INFO)
	416	SFTP_PASSWORD=${SFTP_PASSWORD#$SFTP_PASSWORD_MARKER}
	417	SFTP_PORT=$(grep $SFTP_PORT_MARKER $UPLOAD_INFO)
	418	SFTP_PORT=${SFTP_PORT#$SFTP_PORT_MARKER}
	419	SFTP_PATH=$(grep $SFTP_PATH_MARKER $UPLOAD_INFO)
	420	SFTP_PATH=${SFTP_PATH#$SFTP_PATH_MARKER}
	421
	422	expect "$SCRIPT_PATH" "$LOG_HTM" $SFTP_USER_NAME $SFTP_PASSWORD $SFTP_PORT $SFTP_PATH "$LOG_NAME.htm"
	423
	424	valPrint c "Report was uploaded, unless an error message appears above."
	425	}
	426
	427	# Prints session summary when script is done
	428	function wrapupAndExit()
	429	{
	430	# Get off progress line on console, drop down a line from last link in log, and close HTML table
	431	valPrint ctr ""
	432	valPrint h "</table><br />"
	433
	434	# If we didn't finish processing the last URL, then the iterator is one too high
	435	if [ $FINISHED_LIST != "yes" ]; then
	436	let LINK_NUM-=1
	437	if [ $FINISHED_LIST == "no" ]; then
	438	valPrint ctrh "The session was canceled by the user."
	439	fi
	440	fi
	441
[1118]	442	# Generate string with elapsed time
	443	END_RUN=$(date +%s)
	444	ELAPSED=$(echo $(($END_RUN - $START_RUN)) \| awk '{printf "%d min. %d sec. elapsed", int($1/60), int($1%60)}')
	445
[1122]	446	# Do some math on results of session
[1064]	447	LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
[1122]	448	LINK_PROBLEMS=$((EI_LINKS+IW_LINKS+RD_LINKS+NG_LINKS))
	449	LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
	450	LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
	451	TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP))
	452	LINKS_CHECKED=$((LINKS_PROCESSED-LINK_ERRORS))
	453
	454	# Print summary header
[1118]	455	valPrint ct "Summary ($ELAPSED):"
	456	valPrint r "\b1 Summary \b0 ($ELAPSED)"
	457	valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
[1122]	458	valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT) (there were $FILE_LINKS file $(pluralCheckNoun link $FILE_LINKS) and $PAGE_LINKS page $(pluralCheckNoun link $PAGE_LINKS))."
	459
	460	# Print processed link totals
	461	if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
	462	if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
	463	if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS processed $(pluralCheckNoun link $LINK_PROBLEMS) had issues"; fi
	464	if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh " (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
	465	if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) were OK"; fi
	466	if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctrh " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi
	467
	468	# Print excepted link totals
	469	if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi
	470	if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
	471	if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
	472	if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi
	473
	474	# Print errored link totals
	475	if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"; fi
	476	if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
[1070]	477	if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
[1064]	478	if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
	479	if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
	480	if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
	481	if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
[1122]	482
	483	# Print checked link totals
	484	if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS link $(pluralCheckNoun issues $LINKS_CHECKED):"; fi
	485	if [ $NG_LINKS -gt 0 ]; then valPrint ctrh "- $NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
	486	if [ $RD_LINKS -gt 0 ]; then valPrint ctrh "- $RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
	487	if [ $EI_LINKS -gt 0 ]; then valPrint ctrh "- $EI_LINKS $(pluralCheckNoun link $EI_LINKS) that could be intrawiki"; fi
	488	if [ $IW_LINKS -gt 0 ]; then valPrint ctrh "- $IW_LINKS $(pluralCheckNoun link $IW_LINKS) that could be interwiki"; fi
	489
	490	# Close the log files' markup
[1070]	491	valPrint trh "ValExtLinks says goodbye."
[1064]	492	printRTFfooter
	493	printHTMfooter
	494
	495	# Upload report if this was requested
	496	if [ ! -z $UPLOAD_INFO ] && [ $FINISHED_LIST != "no" ]; then
	497	uploadReport
	498	fi
	499
	500	# Really quit now
	501	valPrint c "ValExtLinks says goodbye."
	502	exit 0
	503	}
	504	trap wrapupAndExit INT
	505
	506
	507	### INITIALIZATION ###
	508	# Print opening message to console and log files
	509	valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
	510	printTXTheader
	511	printRTFheader
	512	printHTMheader
	513
	514	# Attempt to download file at LINKS_URL, then check that it succeeded
valPrint t "Config:"
valPrint r "\b1 Config \b0"
valPrint hn "<h3>Config</h3>"
valPrint cwtrh "Downloading list of external links from $LINKS_URL."
# The local file name is whatever follows the last '/' in the URL
LINKS_FILE_NAME=${LINKS_URL##*/}
LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
# The URL is quoted so that characters such as '?', '*' or '[' in it are never treated as a glob
curl --silent -o "$LINKS_FILE" "$LINKS_URL"
if [ ! -f "$LINKS_FILE" ]; then
  echo "The download of $LINKS_URL appears to have failed. Aborting."
  wrapupAndExit
fi
	526
	527	# Attempt to download file at EXCEPT_URL, then check that it succeeded
	528	if [ ! -z $EXCEPT_URL ]; then
[1070]	529	valPrint cwtrh "Downloading list of reporting exceptions from $EXCEPT_URL."
[1064]	530	EXCEPT_FILE_NAME=$(echo "$EXCEPT_URL" \| sed 's/.*\///')
	531	EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
	532	curl --silent -o "$EXCEPT_FILE" $EXCEPT_URL
	533	if [ ! -f "$EXCEPT_FILE" ]; then
	534	echo "The download of $EXCEPT_URL appears to have failed. Aborting."
	535	wrapupAndExit
	536	fi
	537	fi
	538
	539	# Pipe 'cat' to 'wc' because passing LINKS_FILE to 'wc' would print the file name after the line count
	540	LINK_COUNT_STRING=$(cat "$LINKS_FILE" \| wc -l)
	541
	542	# Number of URLs is number of lines minus one (first line is column header row for the CSV)
	543	LINK_COUNT=$(echo "${LINK_COUNT_STRING}" \| tr -d '[:space:]')
	544	let LINK_COUNT-=1
	545
	546	# Calculate number of URLs to consider
	547	if [ $URL_LIMIT -ne 0 ] && [ $URL_LIMIT -lt $LINK_COUNT ]; then
	548	valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((URL_LIMIT-URL_START+1)) of them, from link $URL_START to $URL_LIMIT."
	549	elif [ $URL_START -ne 1 ]; then
	550	valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((LINK_COUNT-URL_START+1)) of them, from link $URL_START to $LINK_COUNT."
	551	else
	552	valPrint ctrh "Found $LINK_COUNT links to process. I will be considering all of them."
	553	fi
	554
	555	# Print settings to console and log
[1070]	556	declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not report links that are listed in the exceptions file.")
[1064]	557	if [ $TAKE_PAGE_SHOT -eq 0 ]; then SETTINGS_MSG[10]="but will not"; fi
	558	if [ $RECORD_OK_LINKS -eq 0 ]; then SETTINGS_MSG[22]="not"; fi
	559	if [ $SUGGEST_SNAPSHOTS -eq 0 ]; then SETTINGS_MSG[26]="will not"; fi
	560	if [ -z $EXCEPT_URL ] \|\| [ $RECORD_OK_LINKS -eq 1 ]; then SETTINGS_MSG[40]=""; fi
	561	SETTINGS_STR=${SETTINGS_MSG[@]}
	562	valPrint ctrh "$SETTINGS_STR"
	563	valPrint tr "A summary of my findings will be found at the bottom of the report."
	564	valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
	565	valPrint trh ""
	566
	567	# Print legend to logs
	568	valPrint t "Legend:"
	569	valPrint r "\b1 Legend \b0"
	570	valPrint hn "<h3>Legend</h3>"
	571	valPrint trh "OK = URL seems to be working."
[1067]	572	valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it, because false negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to Iritscen. An NG link should be followed by a link to the Internet Archive's Wayback Machine which may help you repair the link. If the link cannot be repaired, you can disable it on the wiki (which prevents it from showing up in future ValExtLinks reports) by wrapping it in nowiki tags."
	573	valPrint trh "RD = The server responding to this URL is saying that the page moved and you should instead use the supplied new URL. Some RD links represent minor adjustments in the organization of a web site, and some are soft 404s (the file/page has been removed and you are being redirected to something like the main page of the web site). You will have to look at the new URL yourself to determine if it represents an OK link and the link on the wiki should be updated to this one, or if the desired file/page is actually gone and we need to replace the wiki link with an Internet Archive snapshot link -- or disable the URL if it has not been archived."
[1070]	574	valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup."
	575	valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup."
[1064]	576	valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $HTTP_CODES)."
	577	valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$HTTP_CODES\"}}{\fldrslt here}} for code reference)."
	578	valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$HTTP_CODES\" target=\"_blank\">here</a> for code reference)."
	579	valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $CURL_CODES)."
	580	valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$CURL_CODES\"}}{\fldrslt here}} for code reference)."
	581	valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$CURL_CODES\" target=\"_blank\">here</a> for code reference)."
	582	valPrint trh "IA suggests = Last available snapshot suggested by the Internet Archive."
[1070]	583	valPrint trh "Try browsing = The Archive occasionally fails to return a snapshot URL even when one exists, so you will need to check for a snapshot manually using this link to the Wayback Machine before concluding that a site has not been archived."
[1064]	584	valPrint trh ""
	585
	586
	587	### MAIN LOOP ###
[1120]	588	valPrint t "Links:"
	589	valPrint r "\b1 Links \b0"
	590	valPrint hn "<h3>Links</h3>"
[1118]	591	START_RUN=$(date +%s)
[1064]	592	# Process each line of the .csv in LINKS_FILE
	593	for LINE in `cat "$LINKS_FILE"`; do
	594	let LINK_NUM+=1
	595
	596	# First line is the column header row for the CSV, so let's verify that the format hasn't changed
	597	if [ $SKIPPED_HEADER_ROW -eq 0 ]; then
	598	if [ $LINE == "namespace,title,target" ]; then
	599	SKIPPED_HEADER_ROW=1
	600	LINK_NUM=0 # this line is it's not a link, so reset the link counter
	601	valPrint hn "<table>"
	602	continue
	603	else
	604	valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
	605	wrapupAndExit
	606	fi
	607	fi
	608
	609	# Skip this link if we are not at URL_START yet
	610	if [ $LINK_NUM -lt $URL_START ]; then
	611	continue
	612	fi
	613
	614	# Stop if we are at the limit declared for testing purposes
	615	if [ $URL_LIMIT -gt 0 ] && [ $LINK_NUM -gt $URL_LIMIT ]; then
	616	FINISHED_LIST="limit"
	617	wrapupAndExit
	618	fi
	619
	620	# Print progress to screen
	621	if [ $LINK_NUM -gt 1 ]; then
	622	printf "\e[1A\n" # erase previous progress message so that new one appears in its place
	623	fi
	624	valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
	625
	626	# The number of the namespace is the element before the first comma on the line
	627	NS_ID=${LINE%%,*}
	628
	629	# Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
	630	NS_NAME=""
	631	a=0
[1069]	632	while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
[1118]	633	if [ $NS_ID == "NULL" ]; then
	634	break
	635	elif [ $NS_ID -eq ${NS_IDS[$a]} ]; then
[1064]	636	NS_NAME="${NS_NAMES[$a]}"
	637	break
	638	fi
	639	let a+=1
	640	done
[1118]	641	if [ "$NS_NAME" == "" ]; then
	642	if [ $NS_ID == "NULL" ]; then
[1119]	643	valPrint trs "Skipping URL on line $LINK_NUM because the namespace (and probably the page too) is \"NULL\". Probably the link is no longer in existence on the wiki."
[1118]	644	else
[1119]	645	valPrint trs "Skipping URL on line $LINK_NUM found on page $PAGE_NAME because I could not find a name for namespace ID $NS_ID."
[1118]	646	fi
[1064]	647	let SKIP_UNK_NS+=1
	648	continue
	649	fi
	650
	651	# The name of the page is everything between the namespace ID and the next comma on the line (commas
	652	# in page names will break this)
	653	PAGE_NAME=${LINE#$NS_ID,}
	654	PAGE_NAME=${PAGE_NAME%%,*}
	655
	656	# We don't want to consider wiki pages ending in .js, as the parser cannot reliably isolate URLs in
	657	# JavaScript code, so it will return erroneous links
	658	PAGE_NAME_SUFFIX=$(echo $PAGE_NAME \| sed 's/.*\.//')
	659	if [ $PAGE_NAME_SUFFIX == "js" ]; then
[1119]	660	valPrint trs "Skipping URL on line $LINK_NUM because it was found on JavaScript page $PAGE_NAME."
[1064]	661	let SKIP_JS_PAGE+=1
	662	continue
	663	fi
	664
[1070]	665	# Build longer wiki page URLs from namespace and page names
[1122]	666	FULL_PAGE_PATH=https://$WIKI_PATH/$NS_NAME:$PAGE_NAME
[1070]	667	LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
	668	# Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it
	669	# explicitly breaks the link
	670	if [ $NS_ID -eq 0 ]; then
[1122]	671	FULL_PAGE_PATH=https://$WIKI_PATH/$PAGE_NAME
[1070]	672	LOCAL_PAGE_PATH=$PAGE_NAME
	673	fi
	674
[1064]	675	# The URL being linked to is everything after the previous two fields (this allows commas to be in
	676	# the URLs, but a comma in the previous field, the page name, will break this)
	677	URL=${LINE#$NS_ID,$PAGE_NAME,}
	678
	679	# Scan for illegal characters
	680	if [[ $URL == [$ILLEGAL_CHARS] ]]; then
[1119]	681	valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because it contains characters illegal in a URL."
[1064]	682	let SKIP_BAD_URL+=1
	683	continue
	684	fi
	685
	686	# Now we need to know if the URL is for a file or a web page. First step is to determine if the
	687	# URL ends in a suffix
	688	HAS_SUFFIX=0
	689
	690	# If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
[1070]	691	CLEAN_URL=${URL%%\?*}
[1064]	692
	693	# If the URL ends in something like "#section_15", strip everything from the '#' onward
[1070]	694	CLEAN_URL=${CLEAN_URL%%\#*}
[1064]	695
	696	# 'sed' cannot handle Unicode in my Bash shell, so skip this URL and make user check it
[1070]	697	if [[ $CLEAN_URL == [![:ascii:]] ]]; then
[1119]	698	valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because I cannot handle non-ASCII characters."
[1064]	699	let SKIP_NON_ASCII+=1
	700	continue
	701	fi
	702
	703	# Isolate the characters after the last period and after the last slash
[1070]	704	POST_DOT=$(echo "$CLEAN_URL" \| sed 's/.*\.//')
	705	POST_SLASH=$(echo "$CLEAN_URL" \| sed 's/.*\///')
[1064]	706
	707	# If the last period comes after the last slash, then the URL ends in a suffix
	708	POST_DOT_LENGTH=$(echo \| awk -v input=$POST_DOT '{print length(input)}')
	709	POST_SLASH_LENGTH=$(echo \| awk -v input=$POST_SLASH '{print length(input)}')
	710	if [ $POST_DOT_LENGTH -lt $POST_SLASH_LENGTH ]; then
	711	HAS_SUFFIX=1
	712	else
	713	HAS_SUFFIX=0
	714	fi
	715
	716	# Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
	717	# known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES
	718	IS_FILE=-1
	719	if [ $HAS_SUFFIX -eq 0 ]; then
	720	IS_FILE=0
	721	else
	722	# Turn off case sensitivity while we compare suffixes
	723	shopt -s nocasematch
	724
	725	# Special case: URLs ending in something like "/productID.304297400" are pages, not files, so if
	726	# the URL's suffix is all numbers, we are looking at the end of a web page URL
	727	if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
	728	IS_FILE=0
	729	fi
	730
	731	# If we did not identify this URL as a web page above, we need to compare the suffix against known
	732	# file extensions
	733	if [ $IS_FILE -eq -1 ]; then
	734	for EXTENSION in "${HTTP_FILES[@]}"; do
	735	if [[ $EXTENSION == $POST_DOT ]]; then # double brackets respect the nocasematch setting
	736	IS_FILE=1
	737	break
	738	fi
	739	done
	740	fi
	741
	742	# If we did not identify this URL as a file above, we need to compare the suffix against known
	743	# pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
	744	# needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
	745	if [ $IS_FILE -eq -1 ]; then
	746	for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
	747	if [[ $PAGE_OR_TLD == $POST_DOT ]]; then
	748	IS_FILE=0
	749	break
	750	fi
	751	done
	752	fi
	753
	754	# Turn case sensitivity back on in Bash
	755	shopt -u nocasematch
	756	fi
	757
	758	# If this suffix escaped identification as either a file, page or TLD, inform the user
	759	STR_TYPE=""
	760	if [ $IS_FILE -eq -1 ]; then
[1119]	761	valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown URL ending $POST_DOT. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
[1064]	762	let SKIP_UNK_SUFFIX+=1
	763	continue
	764	elif [ $IS_FILE -eq 1 ]; then
	765	STR_TYPE="file"
	766	let FILE_LINKS+=1
	767	elif [ $IS_FILE -eq 0 ]; then
	768	STR_TYPE="page"
	769	let PAGE_LINKS+=1
	770	fi
	771
	772	# Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
	773	# issue with sites that require HTTPS
	774	CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent '"$AGENT"' --max-time 10 --write-out '%{http_code}\n' $URL)
	775	CURL_ERR=$(echo $?)
	776	CURL_RESULT=$CURL_CODE
	777
	778	# Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
	779	if [ $CURL_CODE == "000" ]; then
	780	CURL_RESULT="$CURL_RESULT-$CURL_ERR"
	781	fi
	782
[1070]	783	# Begin to determine our status code for this URL (EI, IW, OK, RD, or NG)
[1064]	784	STATUS="??"
[1067]	785	NEW_URL=""
[1064]	786	INTERWIKI_INDEX=-1
	787
[1070]	788	# First make sure that this isn't an "external internal" link to our own wiki that can be replaced
	789	# by "[[page_name]]". If it uses a special access URL beginning with "/w/", let it pass, as it
	790	# probably cannot be replaced by "[[ ]]" markup
	791	if [[ $URL == $WIKI_PATH ]] && [[ $URL != $WIKI_PATH/w/ ]]; then
	792	STATUS="EI"
	793	let EI_LINKS+=1
	794	fi
	795
	796	# If it's not, check if this is a link to a domain that we have an interwiki prefix for
	797	if [ $STATUS == "??" ]; then
	798	for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
	799	if [[ $URL == ${INTERWIKI_DOMAINS[$i]} ]] && [[ $URL != ${INTERWIKI_DOMAINS[$i]}/w/ ]]; then
	800	STATUS="IW"
	801	let IW_LINKS+=1
	802	INTERWIKI_INDEX=$i
	803	break
	804	fi
	805	done
	806	fi
	807
[1069]	808	# If we didn't match an interwiki domain, see if the status code is in our "OK" codes list
	809	if [ $STATUS == "??" ]; then
	810	for CODE in "${OK_CODES[@]}"; do
	811	if [[ $CODE == $CURL_CODE ]]; then
	812	STATUS="OK"
	813	let OK_LINKS+=1
	814	break
	815	fi
	816	done
	817	fi
	818
[1067]	819	# If we didn't get a match with the "OK" codes, check it against the "RD" codes
[1064]	820	if [ $STATUS == "??" ]; then
[1067]	821	for CODE in "${RD_CODES[@]}"; do
	822	if [[ $CODE == $CURL_CODE ]]; then
	823	# Get URL header again in order to retrieve the URL we are being redirected to
	824	NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent '"$AGENT"' --max-time 10 --write-out '%{redirect_url}\n' $URL)
	825
[1122]	826	# Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
	827	# those changes out if the user didn't ask for them
	828	URL_HTTP=$(echo $URL \| sed -E 's/^https:/http:/')
	829	NEW_URL_HTTP=$(echo $NEW_URL \| sed -E 's/^https:/http:/')
[1070]	830
	831	# Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
[1122]	832	NEW_URL_LENGTH=$(echo \| awk -v input=$NEW_URL_HTTP '{print length(input)}')
[1070]	833	if [ $NEW_URL_LENGTH -lt $MIN_URL_LENGTH ]; then
[1122]	834	NEW_URL_HTTP="[new URL not retrieved]"
[1070]	835	fi
	836
[1122]	837	# Remove slash at end of new URL, if present, so we can filter out the redirects that
	838	# merely add an ending slash if the user didn't ask for them
	839	NEW_URL_NO_SLASH=$(echo $NEW_URL_HTTP \| sed -E 's:/$::')
	840
	841	# If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
	842	# wants those to be reported)
	843	if [ $SHOW_HTTPS -eq 0 ] && [ $URL_HTTP == $NEW_URL_HTTP ]; then
	844	valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because we have not been asked to show http->https upgrades, and we were redirected to $NEW_URL."
[1069]	845	STATUS="OK"
	846	let OK_LINKS+=1
[1122]	847	let SKIP_HTTPS_UP+=1
	848	# If the URLs match besides an added ending slash, then the link is OK (unless user wants
	849	# those to be reported)
	850	elif [ $SHOW_SLASH -eq 0 ] && [ $URL_HTTP == $NEW_URL_NO_SLASH ]; then
	851	valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because we have not been asked to show added trailing slashes, and we were redirected to $NEW_URL."
	852	STATUS="OK"
	853	let OK_LINKS+=1
	854	let SKIP_SLASH_ADD+=1
[1069]	855	else
	856	STATUS="RD"
	857	let RD_LINKS+=1
	858	fi
[1067]	859	break
	860	fi
	861	done
	862	fi
	863
	864	# If we didn't get a match with the "RD" codes, check it against the "NG" codes
	865	if [ $STATUS == "??" ]; then
[1064]	866	for CODE in "${NG_CODES[@]}"; do
	867	if [[ $CODE == $CURL_CODE ]]; then
	868	STATUS="NG"
	869	let NG_LINKS+=1
	870	break
	871	fi
	872	done
	873	fi
	874
	875	# If we didn't match a known status code, advise the reader
	876	if [ $STATUS == "??" ]; then
[1119]	877	valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown return code $CURL_CODE."
[1064]	878	let SKIP_UNK_CODE+=1
	879	continue
	880	fi
	881
[1070]	882	# Check problem links against exceptions file before proceeding
	883	if [ $STATUS != "OK" ] && [ ! -z $EXCEPT_URL ]; then
	884	# The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
	885	EXPECT_CODE="$CURL_RESULT"
	886	if [ $STATUS == "EI" ]; then
	887	EXPECT_CODE="EI"
	888	elif [ $STATUS == "IW" ]; then
	889	EXPECT_CODE="IW"
	890	fi
	891
	892	# Look for link in exceptions file and make sure its listed result code and wiki page also match
[1064]	893	GREP_RESULT=$(grep --max-count=1 "$URL" "$EXCEPT_FILE")
[1070]	894	EXCEPT_PAGE=${GREP_RESULT##*,}
	895	if [ "$EXCEPT_PAGE" == "*" ] \|\| [ "$EXCEPT_PAGE" == $LOCAL_PAGE_PATH ]; then
	896	EXCEPT_CODE=${GREP_RESULT%%,*}
	897	if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
[1119]	898	valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because its expected result, $EXPECT_CODE, is listed in the exceptions file."
[1070]	899	if [ $STATUS == "EI" ]; then
	900	let SKIP_EXPECT_EI+=1
	901	elif [ $STATUS == "IW" ]; then
	902	let SKIP_EXPECT_IW+=1
	903	else
	904	let SKIP_EXPECT_NG+=1
	905	fi
	906	continue
	907	fi
[1064]	908	fi
	909	fi
	910
	911	# If appropriate, record this link to the log, with clickable URLs when possible
	912	if [ $STATUS != "OK" ] \|\| [ $RECORD_OK_LINKS -eq 1 ]; then
[1070]	913	# Stupid hack since the strings "IW" and "EI" are narrower than "OK", "RD", or "NG" and it takes
	914	# an extra tab to get to the desired level of indentation in the RTF log
[1064]	915	RTF_TABS=" "
[1070]	916	if [ $STATUS == "IW" ] \|\| [ $STATUS == "EI" ]; then
[1064]	917	RTF_TABS=" "
	918	fi
	919
	920	# Record link and its wiki page in TXT, RTF, and HTML markup
	921	valPrint t "$STATUS ($CURL_RESULT) $STR_TYPE $URL"
	922	valPrint t " linked from $FULL_PAGE_PATH"
	923	valPrint r "$STATUS ($CURL_RESULT)$RTF_TABS$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
	924	valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
	925	valPrint hn "<tr><td style=\"white-space:nowrap\">$STATUS ($CURL_RESULT)</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
	926	valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"
	927
[1067]	928	# Record redirect URL if one was given by a 3xx response page
	929	if [ $STATUS == "RD" ]; then
[1119]	930	valPrint ts " Server suggests $NEW_URL"
	931	valPrint rs " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
	932	valPrint hs "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
[1067]	933	fi
	934
[1070]	935	# Notify reader if we can use an intrawiki link for this URL
	936	if [ $STATUS == "EI" ]; then
[1075]	937	INTRA_PAGE=${URL#:///}
[1119]	938	valPrint ts " Just use [[$INTRA_PAGE]]"
	939	valPrint rs " Just use [[$INTRA_PAGE]]"
	940	valPrint hs "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
[1070]	941	fi
	942
[1064]	943	# Notify reader if we can use an interwiki prefix for this URL
	944	if [ $STATUS == "IW" ]; then
[1075]	945	INTER_PAGE=$(echo "$URL" \| sed 's/.*\///')
[1119]	946	valPrint ts " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
	947	valPrint rs " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
	948	valPrint hs "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
[1064]	949	fi
	950
	951	# Query Internet Archive for latest "OK" snapshot for "NG" page
	952	if [ $STATUS == "NG" ] && [ $SUGGEST_SNAPSHOTS -eq 1 ]; then
	953	ARCHIVE_QUERY=$(curl --silent --max-time 10 "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")
	954
[1118]	955	# If a "closest" snapshot was received...
[1066]	956	if [[ "$ARCHIVE_QUERY" == \"closest\": ]]; then
[1118]	957	# In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
	958	ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" \| sed 's/#!/#\\!/')
	959
	960	# ...isolate "url" property in the response that follows the "closest" tag
	961	SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
[1119]	962	SNAPSHOT_URL=${SNAPSHOT_URL#*\"url\": \"} # everything after '"url": "'
[1118]	963	SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'
	964
	965	# Inform the user of the snapshot URL
[1119]	966	valPrint ts " IA suggests $SNAPSHOT_URL"
	967	valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
	968	valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
[1064]	969	else # ...otherwise give generic Wayback Machine link for this URL
[1119]	970	valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
	971	valPrint rs " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
	972	valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
[1064]	973	fi
	974	fi
	975	fi
	976
	977	# If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
	978	if [ $IS_FILE -eq 0 ] && [ $TAKE_PAGE_SHOT -eq 1 ] && [ $STATUS == "OK" ]; then
	979	# Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
	980	SHOT_NAME=$(echo "$URL" \| sed 's/https*\:\/\///' \| sed 'y/:\//__/')
	981	SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"
	982
	983	# Don't take screenshot if we already encountered this page and screenshotted it
	984	if [ ! -f "$SHOT_FILE" ]; then
[1070]	985	"$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 $URL > /dev/null 2>&1
[1064]	986	if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
	987	mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
	988	else
[1119]	989	valPrint trhs "Screenshot of URL $URL seems to have failed!"
[1064]	990	fi
	991	else
[1119]	992	valPrint trhs "Skipping screenshot of URL $URL because $SHOT_FILE already exists."
[1064]	993	fi
	994	fi
	995	done
	996	FINISHED_LIST="yes"
	997	wrapupAndExit

Note: See TracBrowser for help on using the repository browser.

Download in other formats: