source: Validate External Links/validate_external_links.sh@ 1068

Last change on this file since 1068 was 1067, checked in by iritscen, 7 years ago

Val now understands HTTP redirect responses and will report the URL we're redirected to. Also now tallies IW links.

File size: 35.9 KB
#!/bin/bash

# Validate External Links by Iritscen
# Provided with a list of external links found in the OniGalore wiki, this script validates them.
# The resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF
# (for reading as a local file with clickable links), and HTML (for uploading as a web page).
# Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
# Recommended rule:
# ------------------------------------------------------------------------------------------------------

# Set separator token to newline
IFS="
"

### GLOBALS ###
# Settings -- these will be changed from their defaults by the arguments passed in to the script
LINKS_URL="" # use 'curl' to download file with links from this location (can be file://)
EXCEPT_URL="" # ditto above for file with exceptions to NG results
OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
RECORD_OK_LINKS=0 # record response code to the log whether it's a value in OK_CODES or NG_CODES
SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
TAKE_PAGE_SHOT=0 # take a screenshot of each OK page
URL_START=1 # start at this URL in LINKS_FILE (1 by default)
URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE
UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report

# Fixed strings -- see the occurrences of these variables to learn their purpose
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:54.0) Gecko/20100101 Firefox/54.0"
ARCHIVE_API="http://archive.org/wayback/available"
ARCHIVE_GENERIC="https://web.archive.org/web/*"
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
CHROME="/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary"
CHROME_SCREENSHOT="screenshot.png"
CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt"
EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt"
MY_WIKI_PAGE="http://wiki.oni2.net/User:Iritscen"
THIS_DIR=$(cd $(dirname $0); pwd)
WORKING_DIR=$(pwd)
WIKI_PATH="wiki.oni2.net"

# These are parallel arrays of the IDs and names of OniGalore's current namespaces
declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")

# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
# This determines whether the script tries to take a screenshot of the page or just gets its HTTP code.
declare -a HTTP_FILES=(txt zip wmv jpg png m4a bsl rar oni mp3 mov ONWC vbs TRMA mp4 doc avi log gif pdf dmg exe cpp tga 7z wav east BINA xml dll dae xaf fbx 3ds blend flv csv)
declare -a HTTP_TLDS_AND_PAGES=(com net org uk ru de it htm html php pl asp aspx shtml pgi cgi php3 x jsp phtml cfm css action stm js)
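# As an illustration (these sample URLs are hypothetical, not from the wiki): "http://example.com/patch.zip"
# ends in "zip", which is in HTTP_FILES, so it would be treated as a file and only have its HTTP code
# checked; "http://example.com/index.php" ends in "php", which is in HTTP_TLDS_AND_PAGES, so it would be
# treated as a page and also be eligible for a screenshot.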

# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
# if you add a new code.
declare -a OK_CODES=(200 401 405 406 501)
declare -a RD_CODES=(301 302 303 307 308)
declare -a NG_CODES=(000 403 404 410 500 503)

# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
# transcluded text, and if the transclusion fails, then the braces show up in the URL
ILLEGAL_CHARS="{ }"

# These are parallel arrays giving the prefixes that can be used in place of normal external links to
# some wikis and other sites
declare -a INTERWIKI_PREFIXES=(metawikipedia wikipedia wikiquote wiktionary)
declare -a INTERWIKI_DOMAINS=(meta.wikipedia.org wikipedia.org wikiquote.org wiktionary.org)
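# For instance (a hypothetical link, not from the wiki), a URL such as "https://en.wikipedia.org/wiki/Oni"
# contains "wikipedia.org", so the script would flag it as "IW" and suggest the interwiki markup
# [[wikipedia:Oni]], built from the matching prefix and the text after the URL's last slash.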

# Variables for keeping track of main loop progress and findings
LINK_NUM=0
OK_LINKS=0
RD_LINKS=0
IW_LINKS=0
NG_LINKS=0
SKIP_UNK_NS=0
SKIP_JS_PAGE=0
SKIP_BAD_URL=0
SKIP_NON_ASCII=0
SKIP_UNK_SUFFIX=0
SKIP_UNK_CODE=0
SKIP_EXCEPT=0
FILE_LINKS=0
PAGE_LINKS=0
SKIPPED_HEADER_ROW=0
FINISHED_LIST="no"


### HELP ###
# A pseudo-man page. Here is the 80-character rule for the page text:
# 234567890123456789012345678901234567890123456789012345678901234567890123456789
function printHelp()
{
 cat << EOF

NAME
 Validate External Links

SYNOPSIS
 validate_external_links.sh --help
 validate_external_links.sh --links URL --output PATH [--exceptions URL]
 [--record-ok-links] [--suggest-snapshots] [--take-screenshots]
 [--start-url NUM] [--end-url NUM] [--upload PATH]

DESCRIPTION
 This script parses a list of external links found in the OniGalore wiki
 (which is dumped by the Oni2.net domain periodically in a particular
 format), validates them using the Unix tool 'curl', and produces a report
 of which links were OK (responded to an HTTP query) and which were NG (no
 good). This report can then be automatically uploaded to the location of
 your choice. The script can also suggest Internet Archive snapshots for
 NG links, and take screenshots of OK links for visual verification by the
 reader that the page in question is the one intended to be displayed.

 You must pass this script the URL at which the list of links is found
 (--links) and the path where logs should be written (--output). All
 other arguments are optional.

OPTIONS
 --help Show this page
 --links URL URL from which to download file with external links
 (note that this can be a local file if you use the
 file:// protocol) (required)
 --output DIR Place the folder which will contain the reports and
 optional screenshots at this path (required)
 --exceptions URL Don't log an NG link if it is listed in the file
 found at this URL, as long as the response code is
 the same as the one associated with the link
 --record-ok-links Log a link in the report whether its response code is
 in the OK_CODES or the NG_CODES array
 --suggest-snapshots Query the Internet Archive for a possible snapshot
 URL for each NG page
 --take-screenshots Save screenshots of each OK page (requires Google
 Chrome to be found at the path in CHROME)
 --start-url NUM Start at this link in the links file
 --end-url NUM Stop at this link in the links file
 --upload FILE Upload report using info in this local file

BUGS
 The script cannot properly parse any line in the external links file
 which contains a comma in the name of the wiki page containing a link.
 Commas in the link itself are not an issue.
EOF
}
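
# Example invocation (the links URL and output path below are illustrative, not real locations):
#   ./validate_external_links.sh --links http://example.com/wiki_extlinks.csv --output ~/Desktop --suggest-snapshots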


### SETUP ###
# If first argument is a help request, or if nothing was passed in at all, print help page and quit
if [ "$#" -eq 0 ] || [ "$1" == "--help" ]; then
 printHelp | less
 exit 0
fi

# Parse arguments as long as there are more arguments to process
while (( "$#" )); do
 case "$1" in
 --links ) LINKS_URL="$2"; shift 2;;
 --exceptions ) EXCEPT_URL="$2"; shift 2;;
 --output ) OUTPUT_DIR="$2"; shift 2;;
 --record-ok-links ) RECORD_OK_LINKS=1; shift;;
 --suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;;
 --take-screenshots ) TAKE_PAGE_SHOT=1; shift;;
 --start-url ) URL_START=$2; shift 2;;
 --end-url ) URL_LIMIT=$2; shift 2;;
 --upload ) UPLOAD_INFO=$2; shift 2;;
 * ) echo "Invalid argument $1 detected. Aborting."; exit 1;;
 esac
done

# If the required arguments were not supplied, print help page and quit
if [ -z $LINKS_URL ] || [ -z $OUTPUT_DIR ]; then
 printHelp
 echo "Error: I did not receive one or both required arguments."
 exit 2
fi

# Check that UPLOAD_INFO exists, if this argument was supplied
if [ ! -z $UPLOAD_INFO ] && [ ! -f "$UPLOAD_INFO" ]; then
 echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
 exit 3
fi

# Check that OUTPUT_DIR is a directory
if [ ! -d "$OUTPUT_DIR" ]; then
 echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
 exit 4
fi

# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
SHOT_PATH="$OUTPUT_PATH/Screenshots"
LOG_NAME="ValExtLinks report"
LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt"
LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf"
LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm"
mkdir "$OUTPUT_PATH"
if [ $TAKE_PAGE_SHOT -eq 1 ]; then
 mkdir "$SHOT_PATH"
fi

# Check that 'mkdir' succeeded
if [ ! -d "$OUTPUT_PATH" ]; then
203 echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_PATH. Aborting."
 exit 5
fi

# Get date on the file at LINKS_URL and print to log
LINKS_DATE=$(curl --silent --head $LINKS_URL | grep "Last-Modified")
if [ -z "$LINKS_DATE" ]; then
 echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
 exit 6
fi
LINKS_DATE=${LINKS_DATE#Last-Modified: }
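# e.g. if curl returned the (illustrative) header "Last-Modified: Sat, 23 Sep 2017 02:00:00 GMT",
# LINKS_DATE is now "Sat, 23 Sep 2017 02:00:00 GMT", which matches the format of NICE_TIME above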


### UTILITY FUNCTIONS ###
# Writes a plain-text header to TXT log file
function printTXTheader()
{
 valPrint t "Validate External Links report"
 valPrint t "generated $NICE_TIME"
 valPrint t "from data of $LINKS_DATE"
 valPrint t "script by Iritscen (contact: $MY_WIKI_PAGE)"
 valPrint t ""
}

# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified
function printRTFheader()
{
 valPrint r "{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww12600\viewh12100\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0

\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
generated $NICE_TIME\\
from data of $LINKS_DATE\\
script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$MY_WIKI_PAGE\"}}{\fldrslt contact}})
\\
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
\cf0 "
}

# Closes the RTF markup of the RTF log file
function printRTFfooter()
{
 valPrint r "}"
}

# Writes the HTML header to HTML log file
function printHTMheader()
{
 valPrint h "<html>
<head>
<title>Validate External Links report</title>
</head>
<body>
<h2>Validate External Links report</h2>
<h3>generated $NICE_TIME<br />
from data of $LINKS_DATE<br />
script by Iritscen (<a href=\"$MY_WIKI_PAGE\" target=\"_blank\">contact</a>)</h3>"
}

# Closes the HTML markup of the HTML log file
function printHTMfooter()
{
 valPrint h "</body>
</html>"
}

# The central logging function. The first parameter is a string composed of one or more characters that
# indicates which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
# 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 'w' means "Don't
# pass console output through 'fmt'" ("fmt" fits the output to an 80-column CLI but can break special
# formatting and the 'n' option).
function valPrint()
{
 if [[ "$1" == *c* ]]; then
 if [[ "$1" == *n* ]]; then
 echo -n "$2"
 elif [[ "$1" == *w* ]]; then
 echo "$2"
 else
 echo "$2" | fmt -w 80
 fi
 fi
 if [[ "$1" == *t* ]]; then
 if [[ "$1" == *n* ]]; then
 echo -n "$2" >> "$LOG_TXT"
 else
 echo "$2" >> "$LOG_TXT"
 fi
 fi
 if [[ "$1" == *r* ]]; then
 if [[ "$1" == *n* ]]; then
 echo "$2" >> "$LOG_RTF"
 else
 echo "$2\\" >> "$LOG_RTF"
 fi
 fi
 if [[ "$1" == *h* ]]; then
 if [[ "$1" == *n* ]]; then
 echo "$2" >> "$LOG_HTM"
 else
 echo "$2<br />" >> "$LOG_HTM"
 fi
 fi
}
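# For instance, "valPrint ctrh" writes the same line to the console and all three logs, while
# "valPrint hn" writes only to the HTML log and suppresses the "<br />" that would otherwise be
# appended (see the calls throughout the rest of the script).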

# Pluralize the string in parameter 1 if the number in parameter 2 is not 1
function pluralCheckNoun()
{
 if [ $2 -ne 1 ]; then
 if [[ $1 =~ x$ ]]; then
 echo $1es
 else
 echo $1s
 fi
 else
 echo $1
 fi
}

# Output "is" if parameter 1 is 1, otherwise "are"
function pluralCheckIs()
{
 if [ $1 -ne 1 ]; then
 echo "are"
 else
 echo "is"
 fi
}

# Output "was" if parameter 1 is 1, otherwise "were"
function pluralCheckWas()
{
 if [ $1 -ne 1 ]; then
 echo "were"
 else
 echo "was"
 fi
}

# Output "a " if parameter 1 is 1, otherwise nothing
function pluralCheckA()
{
 if [ $1 -eq 1 ]; then
 echo "a "
 fi
}

# Output "an " if parameter 1 is 1, otherwise nothing
function pluralCheckAn()
{
 if [ $1 -eq 1 ]; then
 echo "an "
 fi
}

# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
# reports being saved to disk have already been closed.
function uploadReport()
{
 valPrint c "Uploading HTML report..."

 SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
 SFTP_USER_NAME_MARKER="user:"
 SFTP_PASSWORD_MARKER="pw:"
 SFTP_PORT_MARKER="port:"
 SFTP_PATH_MARKER="path:"
 SFTP_USER_NAME=$(grep $SFTP_USER_NAME_MARKER $UPLOAD_INFO)
 SFTP_USER_NAME=${SFTP_USER_NAME#$SFTP_USER_NAME_MARKER}
 SFTP_PASSWORD=$(grep $SFTP_PASSWORD_MARKER $UPLOAD_INFO)
 SFTP_PASSWORD=${SFTP_PASSWORD#$SFTP_PASSWORD_MARKER}
 SFTP_PORT=$(grep $SFTP_PORT_MARKER $UPLOAD_INFO)
 SFTP_PORT=${SFTP_PORT#$SFTP_PORT_MARKER}
 SFTP_PATH=$(grep $SFTP_PATH_MARKER $UPLOAD_INFO)
 SFTP_PATH=${SFTP_PATH#$SFTP_PATH_MARKER}

 expect "$SCRIPT_PATH" "$LOG_HTM" $SFTP_USER_NAME $SFTP_PASSWORD $SFTP_PORT $SFTP_PATH "$LOG_NAME.htm"

 valPrint c "Report was uploaded, unless an error message appears above."
}

# Prints session summary when script is done
function wrapupAndExit()
{
 # Get off progress line on console, drop down a line from last link in log, and close HTML table
 valPrint ctr ""
 valPrint h "</table><br />"

 # If we didn't finish processing the last URL, then the iterator is one too high
 if [ $FINISHED_LIST != "yes" ]; then
 let LINK_NUM-=1
 if [ $FINISHED_LIST == "no" ]; then
 valPrint ctrh "The session was canceled by the user."
 fi
 fi

 # Output results of session and close the log file's markup
 LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
 LINKS_SKIPPED=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
 LINKS_CHECKED=$((LINKS_PROCESSED-LINKS_SKIPPED))
 valPrint ct "Summary:"
 valPrint r "\b1 Summary \b0"
 valPrint hn "<h3><span id=\"summary\">Summary</span></h3>"
 valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT)."
 valPrint ctrh "I skipped $LINKS_SKIPPED $(pluralCheckNoun link $LINKS_SKIPPED), and found $FILE_LINKS $(pluralCheckNoun file $FILE_LINKS) and $PAGE_LINKS $(pluralCheckNoun page $PAGE_LINKS)."
 if [ $LINKS_SKIPPED -gt 0 ]; then valPrint ctrh "Skip breakdown: "; fi
 if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
 if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE links on JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
 if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
 if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
 if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
 if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
 valPrint ctrh "Out of the $LINKS_CHECKED links checked, $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG."
 if [ $IW_LINKS -gt 0 ]; then
 valPrint ctrh "$IW_LINKS/$OK_LINKS OK $(pluralCheckNoun link $OK_LINKS) $(pluralCheckIs $IW_LINKS) $(pluralCheckAn $IW_LINKS)external $(pluralCheckNoun link $IW_LINKS) that could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS)."
 fi
 if [ $SKIP_EXCEPT -gt 0 ]; then
 valPrint ctrh "$SKIP_EXCEPT/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS) went unlisted due to being found in the exceptions file."
 fi
 printRTFfooter
 printHTMfooter

 # Upload report if this was requested
 if [ ! -z $UPLOAD_INFO ] && [ $FINISHED_LIST != "no" ]; then
 uploadReport
 fi

 # Really quit now
 valPrint c "ValExtLinks says goodbye."
 exit 0
}
trap wrapupAndExit INT


### INITIALIZATION ###
# Print opening message to console and log files
valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
printTXTheader
printRTFheader
printHTMheader

# Attempt to download file at LINKS_URL, then check that it succeeded
valPrint ctrh "Downloading list of external links from $LINKS_URL."
LINKS_FILE_NAME=$(echo "$LINKS_URL" | sed 's/.*\///')
LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
curl --silent -o "$LINKS_FILE" $LINKS_URL
if [ ! -f "$LINKS_FILE" ]; then
 echo "The download of $LINKS_URL appears to have failed. Aborting."
 wrapupAndExit
fi

# Attempt to download file at EXCEPT_URL, then check that it succeeded
if [ ! -z $EXCEPT_URL ]; then
 valPrint ctrh "Downloading list of NG exceptions from $EXCEPT_URL."
 EXCEPT_FILE_NAME=$(echo "$EXCEPT_URL" | sed 's/.*\///')
 EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
 curl --silent -o "$EXCEPT_FILE" $EXCEPT_URL
 if [ ! -f "$EXCEPT_FILE" ]; then
 echo "The download of $EXCEPT_URL appears to have failed. Aborting."
 wrapupAndExit
 fi
fi

# Pipe 'cat' to 'wc' because passing LINKS_FILE to 'wc' would print the file name after the line count
LINK_COUNT_STRING=$(cat "$LINKS_FILE" | wc -l)

# Number of URLs is number of lines minus one (first line is column header row for the CSV)
LINK_COUNT=$(echo "${LINK_COUNT_STRING}" | tr -d '[:space:]')
let LINK_COUNT-=1

# Calculate number of URLs to consider
if [ $URL_LIMIT -ne 0 ] && [ $URL_LIMIT -lt $LINK_COUNT ]; then
 valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((URL_LIMIT-URL_START+1)) of them, from link $URL_START to $URL_LIMIT."
elif [ $URL_START -ne 1 ]; then
 valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((LINK_COUNT-URL_START+1)) of them, from link $URL_START to $LINK_COUNT."
else
 valPrint ctrh "Found $LINK_COUNT links to process. I will be considering all of them."
fi

# Print settings to console and log
declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not print NG links that are listed in the exceptions file.")
if [ $TAKE_PAGE_SHOT -eq 0 ]; then SETTINGS_MSG[10]="but will not"; fi
if [ $RECORD_OK_LINKS -eq 0 ]; then SETTINGS_MSG[22]="not"; fi
if [ $SUGGEST_SNAPSHOTS -eq 0 ]; then SETTINGS_MSG[26]="will not"; fi
if [ -z $EXCEPT_URL ] || [ $RECORD_OK_LINKS -eq 1 ]; then SETTINGS_MSG[40]=""; fi
SETTINGS_STR=${SETTINGS_MSG[@]}
valPrint ctrh "$SETTINGS_STR"
valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
valPrint trh ""

# Print legend to logs
valPrint t "Legend:"
valPrint r "\b1 Legend \b0"
valPrint hn "<h3>Legend</h3>"
valPrint trh "OK = URL seems to be working."
valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it, because false negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to Iritscen. An NG link should be followed by a link to the Internet Archive's Wayback Machine which may help you repair the link. If the link cannot be repaired, you can disable it on the wiki (which prevents it from showing up in future ValExtLinks reports) by wrapping it in nowiki tags."
valPrint trh "RD = The server responding to this URL is saying that the page moved and you should instead use the supplied new URL. Some RD links represent minor adjustments in the organization of a web site, and some are soft 404s (the file/page has been removed and you are being redirected to something like the main page of the web site). You will have to look at the new URL yourself to determine if it represents an OK link and the link on the wiki should be updated to this one, or if the desired file/page is actually gone and we need to replace the wiki link with an Internet Archive snapshot link -- or disable the URL if it has not been archived."
valPrint trh "IW = URL is working but should be converted to interwiki link using the suggested markup."
valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $HTTP_CODES)."
valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$HTTP_CODES\"}}{\fldrslt here}} for code reference)."
valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$HTTP_CODES\" target=\"_blank\">here</a> for code reference)."
valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $CURL_CODES)."
valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$CURL_CODES\"}}{\fldrslt here}} for code reference)."
valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$CURL_CODES\" target=\"_blank\">here</a> for code reference)."
valPrint trh "IA suggests = Last available snapshot suggested by the Internet Archive."
valPrint trh "Try browsing = The Archive occasionally fails to return a snapshot URL even when one exists, so you will need to check for a snapshot manually using the Wayback Machine before concluding that a site has not been archived."
valPrint trh ""


### MAIN LOOP ###
# Process each line of the .csv in LINKS_FILE
for LINE in `cat "$LINKS_FILE"`; do
 let LINK_NUM+=1

 # First line is the column header row for the CSV, so let's verify that the format hasn't changed
 if [ $SKIPPED_HEADER_ROW -eq 0 ]; then
 if [ $LINE == "namespace,title,target" ]; then
 SKIPPED_HEADER_ROW=1
 LINK_NUM=0 # this line is not a link, so reset the link counter
 valPrint hn "<table>"
 continue
 else
 valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
 wrapupAndExit
 fi
 fi

 # Skip this link if we are not at URL_START yet
 if [ $LINK_NUM -lt $URL_START ]; then
 continue
 fi

 # Stop if we are at the limit declared for testing purposes
 if [ $URL_LIMIT -gt 0 ] && [ $LINK_NUM -gt $URL_LIMIT ]; then
 FINISHED_LIST="limit"
 wrapupAndExit
 fi

 # Print progress to screen
 if [ $LINK_NUM -gt 1 ]; then
 printf "\e[1A\n" # erase previous progress message so that new one appears in its place
 fi
 valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."

 # The number of the namespace is the element before the first comma on the line
 NS_ID=${LINE%%,*}

 # Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
 NS_NAME=""
 a=0
 while [ "x${NS_IDS[$a]}" != "x" ] # once this evaluates to "x", the array is done
 do
 if [ $NS_ID -eq ${NS_IDS[$a]} ]; then
 NS_NAME="${NS_NAMES[$a]}"
 break
 fi
 let a+=1
 done
 if [ -z "$NS_NAME" ]; then
566 valPrint tr "Skipping URL found on page $PAGE_NAME because I could not find a name for namespace ID $NS_ID."
 let SKIP_UNK_NS+=1
 continue
 fi

 # The name of the page is everything between the namespace ID and the next comma on the line (commas
 # in page names will break this)
 PAGE_NAME=${LINE#$NS_ID,}
 PAGE_NAME=${PAGE_NAME%%,*}

 # We don't want to consider wiki pages ending in .js, as the parser cannot reliably isolate URLs in
 # JavaScript code, so it will return erroneous links
 PAGE_NAME_SUFFIX=$(echo $PAGE_NAME | sed 's/.*\.//')
 if [ $PAGE_NAME_SUFFIX == "js" ]; then
 valPrint tr "Skipping URL found on JavaScript page $PAGE_NAME."
 let SKIP_JS_PAGE+=1
 continue
 fi

 # The URL being linked to is everything after the previous two fields (this allows commas to be in
 # the URLs, but a comma in the previous field, the page name, will break this)
 URL=${LINE#$NS_ID,$PAGE_NAME,}
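 # As a worked (hypothetical) example, the line "0,Some page,http://example.com/file.zip" yields
 # NS_ID "0", PAGE_NAME "Some page" and URL "http://example.com/file.zip"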

 # Scan for illegal characters
 if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
 valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because it contains characters illegal in a URL."
 let SKIP_BAD_URL+=1
 continue
 fi

 # Now we need to know if the URL is for a file or a web page. First step is to determine if the
 # URL ends in a suffix
 HAS_SUFFIX=0

 # If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
 SAN_URL=${URL%%\?*}

 # If the URL ends in something like "#section_15", strip everything from the '#' onward
 SAN_URL=${SAN_URL%%\#*}

 # 'sed' cannot handle Unicode in my Bash shell, so skip this URL and make the user check it
 if [[ $SAN_URL == *[![:ascii:]]* ]]; then
 valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I cannot handle non-ASCII characters."
 let SKIP_NON_ASCII+=1
 continue
 fi

 # Isolate the characters after the last period and after the last slash
 POST_DOT=$(echo "$SAN_URL" | sed 's/.*\.//')
 POST_SLASH=$(echo "$SAN_URL" | sed 's/.*\///')

 # If the last period comes after the last slash, then the URL ends in a suffix
 POST_DOT_LENGTH=$(echo | awk -v input=$POST_DOT '{print length(input)}')
 POST_SLASH_LENGTH=$(echo | awk -v input=$POST_SLASH '{print length(input)}')
 if [ $POST_DOT_LENGTH -lt $POST_SLASH_LENGTH ]; then
 HAS_SUFFIX=1
 else
 HAS_SUFFIX=0
 fi
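 # e.g. for the hypothetical URL "http://example.com/files/patch.zip", POST_DOT is "zip" and
 # POST_SLASH is "patch.zip", so HAS_SUFFIX becomes 1; for "http://example.com/wiki/Main_Page" the
 # last period precedes the last slash, so HAS_SUFFIX stays 0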

 # Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
 # known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES
 IS_FILE=-1
 if [ $HAS_SUFFIX -eq 0 ]; then
 IS_FILE=0
 else
 # Turn off case sensitivity while we compare suffixes
 shopt -s nocasematch

 # Special case: URLs ending in something like "/productID.304297400" are pages, not files, so if
 # the URL's suffix is all numbers, we are looking at the end of a web page URL
 if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
 IS_FILE=0
 fi

 # If we did not identify this URL as a web page above, we need to compare the suffix against known
 # file extensions
 if [ $IS_FILE -eq -1 ]; then
 for EXTENSION in "${HTTP_FILES[@]}"; do
 if [[ $EXTENSION == $POST_DOT ]]; then # double brackets respect the nocasematch setting
 IS_FILE=1
 break
 fi
 done
 fi

 # If we did not identify this URL as a file above, we need to compare the suffix against known
 # pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
 # needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
 if [ $IS_FILE -eq -1 ]; then
 for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
 if [[ $PAGE_OR_TLD == $POST_DOT ]]; then
 IS_FILE=0
 break
 fi
 done
 fi

 # Turn case sensitivity back on in Bash
 shopt -u nocasematch
 fi

 # If this suffix escaped identification as either a file, page or TLD, inform the user
 STR_TYPE=""
 if [ $IS_FILE -eq -1 ]; then
 valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown URL ending $POST_DOT. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
 let SKIP_UNK_SUFFIX+=1
 continue
 elif [ $IS_FILE -eq 1 ]; then
 STR_TYPE="file"
 let FILE_LINKS+=1
 elif [ $IS_FILE -eq 0 ]; then
 STR_TYPE="page"
 let PAGE_LINKS+=1
 fi

 # Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
 # issue with sites that require HTTPS
 CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{http_code}\n' $URL)
 CURL_ERR=$(echo $?)
 CURL_RESULT=$CURL_CODE

 # Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
 if [ $CURL_CODE == "000" ]; then
 CURL_RESULT="$CURL_RESULT-$CURL_ERR"
 fi
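 # e.g. if a host cannot be resolved (an illustrative case), 'curl' prints "000" and exits with
 # code 6, so CURL_RESULT becomes "000-6"; the meanings of these exit codes are listed at $CURL_CODES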

 # Determine if this code is in our "OK" list
 STATUS="??"
 NEW_URL=""
 INTERWIKI_INDEX=-1
 for CODE in "${OK_CODES[@]}"; do
 if [[ $CODE == $CURL_CODE ]]; then
 let OK_LINKS+=1

 # Determine if this is a link to a domain that we have an interwiki prefix for
 for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
 if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]]; then
 STATUS="IW"
 let IW_LINKS+=1
 INTERWIKI_INDEX=$i
 break
 fi
 done

 # If this link is OK and no interwiki advisory is needed, just mark as "OK"
 if [ $INTERWIKI_INDEX == -1 ]; then
 STATUS="OK"
 fi
 break
 fi
 done

 # If we didn't get a match with the "OK" codes, check it against the "RD" codes
 if [ $STATUS == "??" ]; then
 for CODE in "${RD_CODES[@]}"; do
 if [[ $CODE == $CURL_CODE ]]; then
 STATUS="RD"
 let RD_LINKS+=1

 # Get URL header again in order to retrieve the URL we are being redirected to
 NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{redirect_url}\n' $URL)

 break
 fi
 done
 fi

 # If we didn't get a match with the "RD" codes, check it against the "NG" codes
 if [ $STATUS == "??" ]; then
 for CODE in "${NG_CODES[@]}"; do
 if [[ $CODE == $CURL_CODE ]]; then
 STATUS="NG"
 let NG_LINKS+=1
 break
 fi
 done
 fi

 # If we didn't match a known status code, advise the reader
 if [ $STATUS == "??" ]; then
 valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown return code $CURL_CODE."
 let SKIP_UNK_CODE+=1
 continue
 fi

 # If link is "NG" and there is an exceptions file, compare URL against the list before logging it
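 # (Judging from the parsing below, each line of the exceptions file is expected to begin with the
 # response code, followed by a comma and the URL, e.g. the hypothetical entry "404,http://example.com/gone.html".)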
753 if [ $STATUS == "NG" ] && [ ! -z $EXCEPT_URL ]; then
754 GREP_RESULT=$(grep --max-count=1 "$URL" "$EXCEPT_FILE")
755 EXCEPT_CODE=${GREP_RESULT%%,*}
756 if [ "$EXCEPT_CODE" == $CURL_RESULT ]; then
757 valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because its status code, $CURL_RESULT, is listed in the exceptions file."
758 let SKIP_EXCEPT+=1
759 continue
760 fi
761 fi
762
763 # If appropriate, record this link to the log, with clickable URLs when possible
764 if [ $STATUS != "OK" ] || [ $RECORD_OK_LINKS -eq 1 ]; then
765 FULL_PAGE_PATH=http://$WIKI_PATH/$NS_NAME:$PAGE_NAME
766 LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
767 # Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it explicitly breaks the link
768 if [ $NS_ID -eq 0 ]; then
769 FULL_PAGE_PATH=http://$WIKI_PATH/$PAGE_NAME
770 LOCAL_PAGE_PATH=$PAGE_NAME
771 fi
772
[1067]773 # Stupid hack since the text "IW" is narrower than "OK", "RD", or "NG" and it takes an extra tab
774 # to get to the desired level of indentation in the RTF log
[1064]775 RTF_TABS=" "
776 if [ $STATUS == "IW" ]; then
777 RTF_TABS=" "
778 fi
779
780 # Record link and its wiki page in TXT, RTF, and HTML markup
781 valPrint t "$STATUS ($CURL_RESULT) $STR_TYPE $URL"
782 valPrint t " linked from $FULL_PAGE_PATH"
783 valPrint r "$STATUS ($CURL_RESULT)$RTF_TABS$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
784 valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
785 valPrint hn "<tr><td style=\"white-space:nowrap\">$STATUS ($CURL_RESULT)</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
786 valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"
787
[1067]788 # Record redirect URL if one was given by a 3xx response page
789 if [ $STATUS == "RD" ]; then
790 valPrint t " Server suggests $NEW_URL"
791 valPrint r " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
792 valPrint hn "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
793 fi
794
[1064]795 # Notify reader if we can use an interwiki prefix for this URL
796 if [ $STATUS == "IW" ]; then
797 valPrint t " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_SLASH]]"
798 valPrint r " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_SLASH]]"
799 valPrint hn "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_SLASH]]</td></tr>"
800 fi
801
802 # Query Internet Archive for latest "OK" snapshot for "NG" page
803 if [ $STATUS == "NG" ] && [ $SUGGEST_SNAPSHOTS -eq 1 ]; then
804 ARCHIVE_QUERY=$(curl --silent --max-time 10 "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")
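 # A successful query is expected to return JSON containing a "closest" snapshot object, along the
 # lines of (illustrative): {"archived_snapshots": {"closest": {"available": true, "url": "https://web.archive.org/web/...", "timestamp": "...", "status": "200"}}}
 # The parsing below takes the text after the last '"url": "' up to the following '", "timestamp'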

 # Isolate "url" property in response and log it if a "closest" snapshot was received...
 if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
 SNAPSHOT_URL=${ARCHIVE_QUERY##*\"url\": \"}
 SNAPSHOT_URL=${SNAPSHOT_URL%\", \"timestamp*}
 valPrint t " IA suggests $SNAPSHOT_URL"
 valPrint r " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
 valPrint hn "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
 else # ...otherwise give generic Wayback Machine link for this URL
 valPrint t " Try browsing $ARCHIVE_GENERIC/$URL"
 valPrint r " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
 valPrint hn "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
 fi
 fi
 fi

 # If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
 if [ $IS_FILE -eq 0 ] && [ $TAKE_PAGE_SHOT -eq 1 ] && [ $STATUS == "OK" ]; then
 # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
 SHOT_NAME=$(echo "$URL" | sed 's/https*\:\/\///' | sed 'y/:\//__/')
 SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"

 # Don't take screenshot if we already encountered this page and screenshotted it
 if [ ! -f "$SHOT_FILE" ]; then
 "$CHROME" --headless --disable-gpu --screenshot --window-size=1500,900 $URL > /dev/null 2>&1
 if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
 mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
 else
 valPrint trh "Screenshot of URL $URL seems to have failed!"
 fi
 else
 valPrint trh "Skipping screenshot of URL $URL because $SHOT_FILE already exists."
 fi
 fi
done
FINISHED_LIST="yes"
wrapupAndExit