source: Validate External Links/validate_external_links.sh@ 1133

Last change on this file since 1133 was 1127, checked in by iritscen, 5 years ago

Val now counts redirects from youtu.be to youtube.com as OK links. These links will be reported on if the argument --show-yt-redirects is used. Renamed --show-https-upgrade to --show-https-upgrades for consistency. Also sorted the file and page suffix arrays and added some more items to them. Now handling status codes 400, 418, 502 and 530. Fixed incorrect nbsps in HTML report. Val is no longer confused by URLs ending in '(' or ')', or which contain a '%' towards the end.

File size: 46.9 KB
RevLine 
#!/bin/bash

# Validate External Links by Iritscen
# Provided with a list of external links found in the OniGalore wiki, this script validates them.
# The resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF
# (for reading as a local file with clickable links), and HTML (for uploading as a web page).
# Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
# Recommended rule:
# |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|

# Set separator token to newline so unquoted expansions (e.g. the main 'for LINE in `cat ...`' loop)
# split on lines rather than on spaces
IFS="
"

### GLOBALS ###
# Settings -- these will be changed from their defaults by the arguments passed in to the script
LINKS_URL="" # use 'curl' to download file with links from this location (can be file://)
EXCEPT_URL="" # ditto above for file with exceptions to NG results
OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES
SHOW_SLASH=0 # record response code to the log when a slash is added to the end of a URL
SHOW_HTTPS=0 # record response code to the log when "http" is upgraded to "https"
SHOW_YT_RD=0 # record response code to the log when a youtu.be URL is expanded to the full URL
SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
TAKE_PAGE_SHOT=0 # take a screenshot of each OK page
CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature
URL_START=1 # start at this URL in LINKS_FILE (1 by default)
URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE
UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report

# Fixed strings -- see the occurrences of these variables to learn their purpose
# (AGENT is presumably passed to 'curl' as a user-agent string further down in the script -- the
# occurrences are outside this section)
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36 OPR/67.0.3575.53"
ARCHIVE_API="http://archive.org/wayback/available" # Wayback Machine availability-query endpoint
ARCHIVE_GENERIC="https://web.archive.org/web/*" # base URL for browsing the Wayback Machine manually
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206" # snapshot codes we accept from the API
CHROME_SCREENSHOT="screenshot.png"
CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt" # reference list for 'curl' exit codes (linked in report legend)
EXPECT_SCRIPT_NAME="val_expect_sftp.txt" # 'expect' script invoked by uploadReport()
HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt" # reference list for HTTP status codes (linked in report legend)
MY_WIKI_PAGE="https://wiki.oni2.net/User:Iritscen" # contact link printed in the report headers
# Fixed paths. Quote $0 and the dirname substitution so the script still resolves its own
# directory correctly when it is run from a path containing spaces (the old unquoted form
# word-split the path and 'cd' failed silently).
THIS_DIR=$(cd "$(dirname "$0")"; pwd) # absolute path of the directory containing this script
WORKING_DIR=$(pwd)                    # directory the user invoked the script from
WIKI_PATH="wiki.oni2.net"             # domain of the wiki whose links we are validating
44
# These are parallel arrays of the IDs and names of OniGalore's current namespaces
declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")

# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png py rar tga TRMA txt vbs wav wmv xaf xml zip)
declare -a HTTP_TLDS_AND_PAGES=(action ars asp aspx cfm cgi com css de htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x)

# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
# if you add a new code.
declare -a OK_CODES=(200 401 405 406 418 501)
declare -a RD_CODES=(301 302 303 307 308)
declare -a NG_CODES=(000 400 403 404 410 500 502 503 530)

# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
# transcluded text, and if the transclusion fails, then the braces show up in the URL
ILLEGAL_CHARS="{ }"

# The shortest URL possible, used for sanity-checking some URLs: http://a.co
MIN_URL_LENGTH=11

# These are parallel arrays giving the prefixes that can be used in place of normal external links to
# some wikis and other sites
declare -a INTERWIKI_PREFIXES=(commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies wikiversity wikivoyage wikt wp)
declare -a INTERWIKI_DOMAINS=(commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org wikiversity.org wikivoyage.org wiktionary.org wikipedia.org)

# Variables for keeping track of main loop progress and findings
LINK_NUM=0        # index of the link currently being processed
EI_LINKS=0        # "external internal" links (external URLs that point at our own wiki)
IW_LINKS=0        # links that could be rewritten as interwiki links
OK_LINKS=0
RD_LINKS=0        # redirections (3xx responses)
NG_LINKS=0        # "no good" links
SKIP_UNK_NS=0     # lines skipped because the namespace ID was missing or unknown
SKIP_JS_PAGE=0    # URLs skipped because they were found on a JavaScript page
SKIP_BAD_URL=0    # URLs skipped because they contain ILLEGAL_CHARS
SKIP_NON_ASCII=0
SKIP_UNK_SUFFIX=0
SKIP_UNK_CODE=0
SKIP_EXPECT_NG=0  # NG links suppressed because they are listed in the exceptions file
SKIP_EXPECT_EI=0
SKIP_EXPECT_IW=0
SKIP_HTTPS_UP=0   # trivial redirects: http -> https upgrade
SKIP_SLASH_ADD=0  # trivial redirects: slash appended
SKIP_YOUTU_BE=0   # trivial redirects: youtu.be expanded to youtube.com
FILE_LINKS=0
PAGE_LINKS=0
SKIPPED_HEADER_ROW=0 # set once the CSV's column-header row has been consumed
FINISHED_LIST="no"   # "no" while running; "limit" when URL_LIMIT is hit; presumably set to "yes" on normal completion past this chunk -- see wrapupAndExit()
START_RUN=0          # epoch seconds at start/end of the main loop, for the elapsed-time summary
END_RUN=0
[1064]98
99
### HELP ###
# A pseudo-man page. Here is the 80-character rule for the page text:
# 234567890123456789012345678901234567890123456789012345678901234567890123456789
# Prints the documentation to stdout. The heredoc contains no '$' or backtick
# characters, so leaving the delimiter unquoted is safe.
function printHelp()
{
  cat << EOF

NAME
       Validate External Links

SYNOPSIS
       validate_external_links.sh --help
       validate_external_links.sh --links URL --output DIR [--exceptions URL]
          [--record-ok-links] [--suggest-snapshots] [--take-screenshots FILE]
          [--start-url NUM] [--end-url NUM] [--upload FILE]

DESCRIPTION
       This script parses a list of external links found in the OniGalore wiki
       (which is dumped by the Oni2.net domain periodically in a particular
       format), validates them using the Unix tool 'curl', and produces a report
       of which links were "OK" (responded positively to an HTTP query), which
       were "RD" (responded with a 3xx redirect code), which could be "IW"
       (interwiki) links, which are "EI" (external internal) links and could be
       intrawiki links, and which were "NG" (no good; a negative response to the
       query). This report can then be automatically uploaded to the location of
       your choice. The script can also suggest Internet Archive snapshots for
       "NG" links, and take screenshots of "OK" links for visual verification by
       the reader that the page in question is the one intended to be displayed.

       You must pass this script the URL at which the list of links is found
       (--links) and the path where the directory of logs should be outputted
       (--output). All other arguments are optional.

OPTIONS
       --help                  Show this page.
       --links URL             (required) URL from which to download the CSV
                               file with external links. Note that this URL can
                               be a local file if you supply a file:// path.
       --output DIR            (required) Unix path to directory in which Val
                               should place its reports.
       --exceptions URL        In order to remove links from the report which
                               Val finds an issue with, but which you regard as
                               OK, list those desired exceptions in this file.
                               See the sample file exceptions.txt for details.
                               Note that this URL can point to a local file if
                               you supply a file:// path.
       --record-ok-links       Log a link in the report even if its response
                               code is "OK".
       --show-added-slashes    Report on redirects that simply add a '/' to the
                               end of the URL.
       --show-https-upgrades   Report on redirects that simply upgrade a
                               "http://" URL to a "https://" URL.
       --show-yt-redirects     Report on redirects that expand a youtu.be URL.
       --suggest-snapshots     Query the Internet Archive for a possible
                               snapshot URL for each "NG" page.
       --take-screenshots FILE Call the Google Chrome binary at this path to
                               take screenshots of each "OK" page.
       --start-url NUM         Start at this link in the links CSV file.
       --end-url NUM           Stop at this link in the links CSV file.
       --upload FILE           Upload report using the credentials and path
                               given in this local text file. See sftp_login.txt
                               for template.

BUGS
       The script cannot properly parse any line in the external links file
       which contains a comma in the name of the wiki page containing a link.
       Commas in the link itself are not an issue.
EOF
}
169
170
171### SETUP ###
# Show the documentation and quit when the user asked for help, or supplied no arguments at all
if (( $# == 0 )) || [[ "$1" == "--help" ]]; then
  printHelp | less
  exit 0
fi
177
# Consume command-line arguments until none remain; any unrecognized flag aborts the run
while [ "$#" -gt 0 ]; do
  case "$1" in
    --links )               LINKS_URL="$2"; shift 2;;
    --exceptions )          EXCEPT_URL="$2"; shift 2;;
    --output )              OUTPUT_DIR="$2"; shift 2;;
    --record-ok-links )     RECORD_OK_LINKS=1; shift;;
    --show-added-slashes )  SHOW_SLASH=1; shift;;
    --show-https-upgrades ) SHOW_HTTPS=1; shift;;
    --show-yt-redirects )   SHOW_YT_RD=1; shift;;
    --suggest-snapshots )   SUGGEST_SNAPSHOTS=1; shift;;
    --take-screenshots )    TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
    --start-url )           URL_START="$2"; shift 2;;
    --end-url )             URL_LIMIT="$2"; shift 2;;
    --upload )              UPLOAD_INFO="$2"; shift 2;;
    * ) echo "Invalid argument $1 detected. Aborting."; exit 1;;
  esac
done
196
# If the required arguments were not supplied, print an error and quit.
# All variable tests below are quoted: the old unquoted forms ('[ -z $LINKS_URL ]',
# '[ ! -z $UPLOAD_INFO ]') only worked by accident for empty values and misbehave
# for values containing IFS characters.
if [ -z "$LINKS_URL" ] || [ -z "$OUTPUT_DIR" ]; then
  echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
  exit 2
fi

# If user wants screenshots, make sure path to Chrome was passed in and is valid
if [ $TAKE_PAGE_SHOT -eq 1 ]; then
  if [ ! -f "$CHROME_PATH" ]; then
    echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
    exit 3
  fi
fi

# Check that UPLOAD_INFO exists, if this argument was supplied
if [ -n "$UPLOAD_INFO" ] && [ ! -f "$UPLOAD_INFO" ]; then
  echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
  exit 4
fi

# Check that OUTPUT_DIR is a directory
if [ ! -d "$OUTPUT_DIR" ]; then
  echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
  exit 5
fi
222
# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
SHOT_PATH="$OUTPUT_PATH/Screenshots"
LOG_NAME="ValExtLinks report"
LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt"
LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf"
LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm"
mkdir "$OUTPUT_PATH"
if [ $TAKE_PAGE_SHOT -eq 1 ]; then
  mkdir "$SHOT_PATH"
fi

# Check that 'mkdir' succeeded
if [ ! -d "$OUTPUT_PATH" ]; then
  # Fixed error message: the folder is created inside $OUTPUT_DIR; the old message named
  # $OUTPUT_PATH (the full path of the new folder itself) as the parent directory
  echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_DIR. Aborting."
  exit 6
fi
243
# Get date on the file at LINKS_URL and print to log.
# Match the header name case-insensitively: HTTP header names are not case-sensitive, and
# HTTP/2 responses always deliver them in lowercase ("last-modified"), which the old
# case-sensitive grep missed.
LINKS_DATE=$(curl --silent --head $LINKS_URL | grep -i "Last-Modified")
if [ -z "$LINKS_DATE" ]; then
  echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
  exit 7
fi
# Strip the header name and separator regardless of its case, leaving just the date string
LINKS_DATE=$(echo "$LINKS_DATE" | sed 's/^[^:]*: *//')
251
252
253### UTILITY FUNCTIONS ###
# Writes a plain-text header to the TXT log file, one line at a time
function printTXTheader()
{
  local txt_line
  for txt_line in \
      "Validate External Links report" \
      "generated $NICE_TIME" \
      "from data of $LINKS_DATE" \
      "script by Iritscen (contact: $MY_WIKI_PAGE)" \
      ""; do
    valPrint t "$txt_line"
  done
}
263
# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified.
# The string below is literal RTF 1.x markup: it opens the top-level "{\rtf1...}" group (closed
# later by printRTFfooter), declares the font and color tables, and emits the centered ("\qc")
# title block before switching back to default justification for the report body.
function printRTFheader()
{
  valPrint r "{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww12600\viewh12100\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0

\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
generated $NICE_TIME\\
from data of $LINKS_DATE\\
script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$MY_WIKI_PAGE\"}}{\fldrslt contact}})
\\
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
\cf0 "
}
282
# Closes the RTF markup of the RTF log file.
# A single closing brace ends the top-level "{\rtf1..." group opened by printRTFheader().
function printRTFfooter()
{
  valPrint r "}"
}
288
# Writes the HTML header to HTML log file. The <body> and <html> tags opened here are closed
# by printHTMfooter(); the report rows are written in between by the main loop.
function printHTMheader()
{
  valPrint h "<html>
<head>
<title>Validate External Links report</title>
</head>
<body>
<h2>Validate External Links report</h2>
<h3>generated $NICE_TIME<br />
from data of $LINKS_DATE<br />
script by Iritscen (<a href=\"$MY_WIKI_PAGE\" target=\"_blank\">contact</a>)</h3>"
}
302
# Closes the HTML markup of the HTML log file (the tags opened by printHTMheader())
function printHTMfooter()
{
  local closing_tags="</body>
</html>"
  valPrint h "$closing_tags"
}
309
# The central logging function. The first parameter is a string composed of one or more characters
# that indicate which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF
# log, and 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line."
# 's' means "Print an extra newline at the end." 'w' means "Don't pass console output through
# 'fmt'" ("fmt" fits the output to an 80-column CLI but can break special formatting and the
# 'n' option).
function valPrint()
{
  local dests="$1"
  local msg="$2"

  # Console output ('n' wins over 'w', which wins over 's')
  if [[ "$dests" == *c* ]]; then
    if [[ "$dests" == *n* ]]; then
      echo -n "$msg"
    elif [[ "$dests" == *w* ]]; then
      echo "$msg"
    elif [[ "$dests" == *s* ]]; then
      echo -e "$msg\n"
    else
      echo "$msg" | fmt -w 80
    fi
  fi

  # TXT log: plain text, so 's' is just an extra blank line
  if [[ "$dests" == *t* ]]; then
    if [[ "$dests" == *n* ]]; then
      echo -n "$msg" >> "$LOG_TXT"
    elif [[ "$dests" == *s* ]]; then
      echo -e "$msg\n" >> "$LOG_TXT"
    else
      echo "$msg" >> "$LOG_TXT"
    fi
  fi

  # RTF log: line breaks must be spelled as literal "\line" markup (raw newlines are not
  # rendered by RTF readers, so 'n' simply omits the markup)
  if [[ "$dests" == *r* ]]; then
    if [[ "$dests" == *n* ]]; then
      echo "$msg" >> "$LOG_RTF"
    elif [[ "$dests" == *s* ]]; then
      echo "$msg\line\line" >> "$LOG_RTF"
    else
      echo "$msg\line" >> "$LOG_RTF"
    fi
  fi

  # HTML log: note that 's' takes priority over 'n' here, emitting a blank table row for spacing
  if [[ "$dests" == *h* ]]; then
    if [[ "$dests" == *s* ]]; then
      echo "$msg<tr><td>&nbsp;</td></tr>" >> "$LOG_HTM"
    elif [[ "$dests" == *n* ]]; then
      echo "$msg" >> "$LOG_HTM"
    else
      echo "$msg<br />" >> "$LOG_HTM"
    fi
  fi
}
355
# Pluralize the string in parameter 1 if the number in parameter 2 is not 1
function pluralCheckNoun()
{
  local noun="$1"
  local count="$2"

  if (( count == 1 )); then
    echo "$noun"
    return
  fi

  # Nouns ending in 'x' take "-es" ("box" -> "boxes"); everything else just gets "-s"
  if [[ "$noun" == *x ]]; then
    echo "${noun}es"
  else
    echo "${noun}s"
  fi
}
369
# Output "is" if parameter 1 is 1, otherwise "are"
function pluralCheckIs()
{
  if (( $1 == 1 )); then
    echo "is"
  else
    echo "are"
  fi
}
379
# Output "was" if parameter 1 is 1, otherwise "were"
function pluralCheckWas()
{
  if (( $1 == 1 )); then
    echo "was"
  else
    echo "were"
  fi
}
389
# Output "a " (with trailing space) if parameter 1 is 1, otherwise nothing
function pluralCheckA()
{
  (( $1 == 1 )) && echo "a "
  return 0
}
397
# Output "an " (with trailing space) if parameter 1 is 1, otherwise nothing
function pluralCheckAn()
{
  (( $1 == 1 )) && echo "an "
  return 0
}
405
# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
# reports being saved to disk have already been closed.
function uploadReport()
{
  valPrint c "Uploading HTML report..."

  SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
  # Each credential lives on its own line of UPLOAD_INFO behind one of these marker prefixes
  SFTP_USER_NAME_MARKER="user:"
  SFTP_PASSWORD_MARKER="pw:"
  SFTP_PORT_MARKER="port:"
  SFTP_PATH_MARKER="path:"
  # Pull each line out by its marker, then strip the marker prefix to leave the bare value
  SFTP_USER_NAME=$(grep $SFTP_USER_NAME_MARKER $UPLOAD_INFO)
  SFTP_USER_NAME=${SFTP_USER_NAME#$SFTP_USER_NAME_MARKER}
  SFTP_PASSWORD=$(grep $SFTP_PASSWORD_MARKER $UPLOAD_INFO)
  SFTP_PASSWORD=${SFTP_PASSWORD#$SFTP_PASSWORD_MARKER}
  SFTP_PORT=$(grep $SFTP_PORT_MARKER $UPLOAD_INFO)
  SFTP_PORT=${SFTP_PORT#$SFTP_PORT_MARKER}
  SFTP_PATH=$(grep $SFTP_PATH_MARKER $UPLOAD_INFO)
  SFTP_PATH=${SFTP_PATH#$SFTP_PATH_MARKER}

  # Hand everything to the 'expect' script, which drives the interactive sftp session
  # (argument order is presumably dictated by val_expect_sftp.txt -- verify there)
  expect "$SCRIPT_PATH" "$LOG_HTM" $SFTP_USER_NAME $SFTP_PASSWORD $SFTP_PORT $SFTP_PATH "$LOG_NAME.htm"

  # NOTE(review): the exit status of 'expect' is not checked, hence the hedged wording below
  valPrint c "Report was uploaded, unless an error message appears above."
}
430
# Prints session summary when script is done, closes the logs' markup, optionally uploads the
# report, and exits. Also installed below as the INT (ctrl-C) handler, so it must work mid-run.
function wrapupAndExit()
{
  # Get off progress line on console, drop down a line from last link in log, and close HTML table
  valPrint ctr ""
  valPrint h "</table><br />"

  # If we didn't finish processing the last URL, then the iterator is one too high
  if [ $FINISHED_LIST != "yes" ]; then
    let LINK_NUM-=1
    if [ $FINISHED_LIST == "no" ]; then
      valPrint ctrh "The session was canceled by the user."
    fi
  fi

  # Generate string with elapsed time
  END_RUN=$(date +%s)
  ELAPSED=$(echo $(($END_RUN - $START_RUN)) | awk '{printf "%d min. %d sec. elapsed", int($1/60), int($1%60)}')

  # Do some math on results of session
  LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
  LINK_PROBLEMS=$((EI_LINKS+IW_LINKS+RD_LINKS+NG_LINKS))
  LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
  LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
  TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
  LINKS_CHECKED=$((LINKS_PROCESSED-LINK_ERRORS))

  # Print summary header
  valPrint ct "Summary ($ELAPSED):"
  valPrint r "\b1 Summary \b0 ($ELAPSED)"
  valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
  valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT) (there $(pluralCheckWas $FILE_LINKS) $FILE_LINKS file $(pluralCheckNoun link $FILE_LINKS) and $PAGE_LINKS page $(pluralCheckNoun link $PAGE_LINKS))."

  # Print processed link totals (each line is suppressed when its count is zero)
  if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
  if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
  if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS processed $(pluralCheckNoun link $LINK_PROBLEMS) had $(pluralCheckAn $LINK_PROBLEMS)$(pluralCheckNoun issue $LINK_PROBLEMS)"; fi
  if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr "  (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; valPrint h "&nbsp;&nbsp;(excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
  if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi
  if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr "  (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h "&nbsp;&nbsp;(counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi

  # Print excepted link totals
  if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi
  if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
  if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
  if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi

  # Print errored link totals
  if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"; fi
  if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
  if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
  if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
  if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
  if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
  if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi

  # Print checked link totals
  if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS link $(pluralCheckNoun issue $LINK_PROBLEMS):"; fi
  if [ $NG_LINKS -gt 0 ]; then valPrint ctrh "- $NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
  if [ $RD_LINKS -gt 0 ]; then valPrint ctrh "- $RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
  if [ $EI_LINKS -gt 0 ]; then valPrint ctrh "- $EI_LINKS $(pluralCheckNoun link $EI_LINKS) that could be intrawiki"; fi
  if [ $IW_LINKS -gt 0 ]; then valPrint ctrh "- $IW_LINKS $(pluralCheckNoun link $IW_LINKS) that could be interwiki"; fi

  # Close the log files' markup
  valPrint trh "ValExtLinks says goodbye."
  printRTFfooter
  printHTMfooter

  # Upload report if this was requested (skipped when the user canceled the run)
  if [ ! -z $UPLOAD_INFO ] && [ $FINISHED_LIST != "no" ]; then
    uploadReport
  fi

  # Really quit now
  valPrint c "ValExtLinks says goodbye."
  exit 0
}
trap wrapupAndExit INT
509
510
### INITIALIZATION ###
# Print opening message to console and log files
valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
printTXTheader
printRTFheader
printHTMheader

# Attempt to download file at LINKS_URL, then check that it succeeded
valPrint t "Config:"
valPrint r "\b1 Config \b0"
valPrint hn "<h3>Config</h3>"
valPrint cwtrh "Downloading list of external links from $LINKS_URL."
LINKS_FILE_NAME=$(echo "$LINKS_URL" | sed 's/.*\///') # keep only the part after the last '/'
LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
curl --silent -o "$LINKS_FILE" $LINKS_URL
if [ ! -f "$LINKS_FILE" ]; then
  echo "The download of $LINKS_URL appears to have failed. Aborting."
  wrapupAndExit
fi

# Attempt to download file at EXCEPT_URL, then check that it succeeded
if [ ! -z $EXCEPT_URL ]; then
  valPrint cwtrh "Downloading list of reporting exceptions from $EXCEPT_URL."
  EXCEPT_FILE_NAME=$(echo "$EXCEPT_URL" | sed 's/.*\///')
  EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
  curl --silent -o "$EXCEPT_FILE" $EXCEPT_URL
  if [ ! -f "$EXCEPT_FILE" ]; then
    echo "The download of $EXCEPT_URL appears to have failed. Aborting."
    wrapupAndExit
  fi
fi

# Pipe 'cat' to 'wc' because passing LINKS_FILE to 'wc' would print the file name after the line count
LINK_COUNT_STRING=$(cat "$LINKS_FILE" | wc -l)

# Number of URLs is number of lines minus one (first line is column header row for the CSV)
LINK_COUNT=$(echo "${LINK_COUNT_STRING}" | tr -d '[:space:]') # 'wc' pads its output with spaces
let LINK_COUNT-=1

# Calculate number of URLs to consider
if [ $URL_LIMIT -ne 0 ] && [ $URL_LIMIT -lt $LINK_COUNT ]; then
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((URL_LIMIT-URL_START+1)) of them, from link $URL_START to $URL_LIMIT."
elif [ $URL_START -ne 1 ]; then
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((LINK_COUNT-URL_START+1)) of them, from link $URL_START to $LINK_COUNT."
else
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering all of them."
fi

# Print settings to console and log.
# SETTINGS_MSG is deliberately declared word-by-word so that specific words can be swapped out
# below by array index (10, 22, 26, 40) according to the active settings; the multi-word quoted
# elements are the ones that may be replaced.
declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not report links that are listed in the exceptions file.")
if [ $TAKE_PAGE_SHOT -eq 0 ]; then SETTINGS_MSG[10]="but will not"; fi
if [ $RECORD_OK_LINKS -eq 0 ]; then SETTINGS_MSG[22]="not"; fi
if [ $SUGGEST_SNAPSHOTS -eq 0 ]; then SETTINGS_MSG[26]="will not"; fi
if [ -z $EXCEPT_URL ] || [ $RECORD_OK_LINKS -eq 1 ]; then SETTINGS_MSG[40]=""; fi
SETTINGS_STR=${SETTINGS_MSG[@]}
valPrint ctrh "$SETTINGS_STR"
valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
valPrint trh ""

# Print legend to logs
valPrint t "Legend:"
valPrint r "\b1 Legend \b0"
valPrint hn "<h3>Legend</h3>"
valPrint trh "OK = URL seems to be working."
valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it, because false negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to Iritscen. An NG link should be followed by a link to the Internet Archive's Wayback Machine which may help you repair the link. If the link cannot be repaired, you can disable it on the wiki (which prevents it from showing up in future ValExtLinks reports) by wrapping it in nowiki tags."
valPrint trh "RD = The server responding to this URL is saying that the page moved and you should instead use the supplied new URL. Some RD links represent minor adjustments in the organization of a web site, and some are soft 404s (the file/page has been removed and you are being redirected to something like the main page of the web site). You will have to look at the new URL yourself to determine if it represents an OK link and the link on the wiki should be updated to this one, or if the desired file/page is actually gone and we need to replace the wiki link with an Internet Archive snapshot link -- or disable the URL if it has not been archived."
valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup."
valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup."
valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $HTTP_CODES)."
valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$HTTP_CODES\"}}{\fldrslt here}} for code reference)."
valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$HTTP_CODES\" target=\"_blank\">here</a> for code reference)."
valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $CURL_CODES)."
valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$CURL_CODES\"}}{\fldrslt here}} for code reference)."
valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$CURL_CODES\" target=\"_blank\">here</a> for code reference)."
valPrint trh "IA suggests = Last available snapshot suggested by the Internet Archive."
valPrint trh "Try browsing = The Archive occasionally fails to return a snapshot URL even when one exists, so you will need to check for a snapshot manually using this link to the Wayback Machine before concluding that a site has not been archived."
valPrint trh ""


### MAIN LOOP ###
valPrint t "Links:"
valPrint r "\b1 Links \b0"
valPrint hn "<h3>Links</h3>"
START_RUN=$(date +%s) # main-loop start time, used by wrapupAndExit() for the elapsed-time summary
[1064]596# Process each line of the .csv in LINKS_FILE
597for LINE in `cat "$LINKS_FILE"`; do
598 let LINK_NUM+=1
599
600 # First line is the column header row for the CSV, so let's verify that the format hasn't changed
601 if [ $SKIPPED_HEADER_ROW -eq 0 ]; then
602 if [ $LINE == "namespace,title,target" ]; then
603 SKIPPED_HEADER_ROW=1
604 LINK_NUM=0 # this line is it's not a link, so reset the link counter
605 valPrint hn "<table>"
606 continue
607 else
608 valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
609 wrapupAndExit
610 fi
611 fi
612
613 # Skip this link if we are not at URL_START yet
614 if [ $LINK_NUM -lt $URL_START ]; then
615 continue
616 fi
617
618 # Stop if we are at the limit declared for testing purposes
619 if [ $URL_LIMIT -gt 0 ] && [ $LINK_NUM -gt $URL_LIMIT ]; then
620 FINISHED_LIST="limit"
621 wrapupAndExit
622 fi
623
624 # Print progress to screen
625 if [ $LINK_NUM -gt 1 ]; then
626 printf "\e[1A\n" # erase previous progress message so that new one appears in its place
627 fi
628 valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
629
# The number of the namespace is the element before the first comma on the line
NS_ID=${LINE%%,*}

# Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
# (NS_IDS and NS_NAMES are used as parallel arrays — same index 'a' addresses both)
NS_NAME=""
a=0
while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
	if [ $NS_ID == "NULL" ]; then
		break # "NULL" namespace means the link probably no longer exists on the wiki; skip lookup
	elif [ $NS_ID -eq ${NS_IDS[$a]} ]; then
		NS_NAME="${NS_NAMES[$a]}"
		break
	fi
	let a+=1
done
# An empty NS_NAME means either the "NULL" case above or an ID missing from NS_IDS
if [ "$NS_NAME" == "" ]; then
	if [ $NS_ID == "NULL" ]; then
		valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace (and probably the page too) is 'NULL'. Probably the link is no longer in existence on the wiki."
	else
		valPrint trs "Skipping line $LINK_NUM ('$LINE') because I could not find a name for namespace ID $NS_ID."
	fi
	let SKIP_UNK_NS+=1
	continue
fi

# The name of the page is everything between the namespace ID and the next comma on the line (commas
# in page names will break this)
PAGE_NAME=${LINE#$NS_ID,}
PAGE_NAME=${PAGE_NAME%%,*}

# We don't want to consider wiki pages ending in .js, as the parser cannot reliably isolate URLS in
# JavaScript code, so it will return erroneous links
PAGE_NAME_SUFFIX=$(echo $PAGE_NAME | sed 's/.*\.//')
if [ $PAGE_NAME_SUFFIX == "js" ]; then
	valPrint trs "Skipping URL '${LINE#$NS_ID,$PAGE_NAME,}' on line $LINK_NUM because it was found on JavaScript page '$PAGE_NAME'."
	let SKIP_JS_PAGE+=1
	continue
fi
668
# Build longer wiki page URLs from namespace and page names
FULL_PAGE_PATH=https://$WIKI_PATH/$NS_NAME:$PAGE_NAME
LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
# Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it
# explicitly breaks the link
if [ $NS_ID -eq 0 ]; then
	FULL_PAGE_PATH=https://$WIKI_PATH/$PAGE_NAME
	LOCAL_PAGE_PATH=$PAGE_NAME
fi

# The URL being linked to is everything after the previous two fields (this allows commas to be in
# the URLs, but a comma in the previous field, the page name, will break this)
URL=${LINE#$NS_ID,$PAGE_NAME,}

# Scan for illegal characters (ILLEGAL_CHARS is defined earlier in the script)
if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
	valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because it contains characters illegal in a URL."
	let SKIP_BAD_URL+=1
	continue
fi

# Now we need to know if the URL is for a file or a web page. First step is to determine if the
# URL ends in a suffix
HAS_SUFFIX=0

# If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
CLEAN_URL=${URL%%\?*}

# If the URL ends in something like "#section_15", strip everything from the '#' onward
CLEAN_URL=${CLEAN_URL%%\#*}

# 'sed' cannot handle Unicode in my Bash shell, so skip this URL and make user check it
if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
	valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I cannot handle non-ASCII characters."
	let SKIP_NON_ASCII+=1
	continue
fi

# Isolate the characters after the last period and after the last slash
POST_DOT=$(echo "$CLEAN_URL" | sed 's/.*\.//')
POST_SLASH=$(echo "$CLEAN_URL" | sed 's/.*\///')

# If the last period comes after the last slash, then the URL ends in a suffix
# (the text after the last dot is shorter than the text after the last slash exactly when
# the dot sits inside the final path component)
POST_DOT_LENGTH=$(echo | awk -v input=$POST_DOT '{print length(input)}')
POST_SLASH_LENGTH=$(echo | awk -v input=$POST_SLASH '{print length(input)}')
if [ $POST_DOT_LENGTH -lt $POST_SLASH_LENGTH ]; then
	HAS_SUFFIX=1
else
	HAS_SUFFIX=0
fi
719
# Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
# known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES.
# IS_FILE is a tri-state: -1 = undetermined, 0 = web page, 1 = file.
IS_FILE=-1
if [ $HAS_SUFFIX -eq 0 ]; then
	IS_FILE=0
else
	# Turn off case sensitivity while we compare suffixes
	shopt -s nocasematch

	# Special case: URLs ending in something like "productID.304297400" are pages, not files, so if
	# the URL's suffix is all numbers, we are looking at the end of a web page URL
	if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
		IS_FILE=0
	fi

	# Special case: URLs ending in a parens, e.g. "ms537113(v=vs.85)", are pages
	if [[ $POST_DOT =~ ^.*[\(\)]$ ]]; then
		IS_FILE=0
	fi

	# Special case: URLs containing a '%', e.g. "10.1007%2FBF00329055", are pages
	if [[ $POST_DOT == *%* ]]; then
		IS_FILE=0
	fi

	# If we did not identify this URL as a web page above, we need to compare the suffix against known
	# file extensions
	if [ $IS_FILE -eq -1 ]; then
		for EXTENSION in "${HTTP_FILES[@]}"; do
			if [[ $EXTENSION == $POST_DOT ]]; then # double brackets respect the nocasematch setting
				IS_FILE=1
				break
			fi
		done
	fi

	# If we did not identify this URL as a file above, we need to compare the suffix against known
	# pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
	# needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
	if [ $IS_FILE -eq -1 ]; then
		for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
			if [[ $PAGE_OR_TLD == $POST_DOT ]]; then
				IS_FILE=0
				break
			fi
		done
	fi

	# Turn case sensitivity back on in Bash
	shopt -u nocasematch
fi

# If this suffix escaped identification as either a file, page or TLD, inform the user
STR_TYPE=""
if [ $IS_FILE -eq -1 ]; then
	valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown URL ending '$POST_DOT'. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
	let SKIP_UNK_SUFFIX+=1
	continue
elif [ $IS_FILE -eq 1 ]; then
	STR_TYPE="file"
	let FILE_LINKS+=1
elif [ $IS_FILE -eq 0 ]; then
	STR_TYPE="page"
	let PAGE_LINKS+=1
fi
785
# Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
# issue with sites that require HTTPS.
# BUG FIX: $AGENT was previously wrapped in single quotes, which suppressed expansion and sent
# the literal five characters "$AGENT" as the User-Agent header; double quotes expand the
# configured user-agent string while still protecting any spaces in it.
CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{http_code}\n' $URL)
CURL_ERR=$? # capture curl's exit status directly (no need for a subshell echo)
CURL_RESULT=$CURL_CODE

# Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
if [ $CURL_CODE == "000" ]; then
	CURL_RESULT="$CURL_RESULT-$CURL_ERR"
fi
796
# Begin to determine our status code for this URL (EI, IW, OK, RD, or NG)
STATUS="??"
NEW_URL=""
INTERWIKI_INDEX=-1

# First make sure that this isn't an "external internal" link to our own wiki that can be replaced
# by "[[page_name]]". If it uses a special access URL beginning with "/w/", let it pass, as it
# probably cannot be replaced by "[[ ]]" markup
if [[ $URL == *$WIKI_PATH* ]] && [[ $URL != *$WIKI_PATH/w/* ]]; then
	STATUS="EI"
	let EI_LINKS+=1
fi

# If it's not, check if this is a link to a domain that we have an interwiki prefix for
# (INTERWIKI_INDEX remembers which domain matched so the report can print its prefix later)
if [ $STATUS == "??" ]; then
	for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
		if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]] && [[ $URL != *${INTERWIKI_DOMAINS[$i]}/w/* ]]; then
			STATUS="IW"
			let IW_LINKS+=1
			INTERWIKI_INDEX=$i
			break
		fi
	done
fi

# If we didn't match an interwiki domain, see if the status code is in our "OK" codes list
if [ $STATUS == "??" ]; then
	for CODE in "${OK_CODES[@]}"; do
		if [[ $CODE == $CURL_CODE ]]; then
			STATUS="OK"
			let OK_LINKS+=1
			break
		fi
	done
fi
832
# If we didn't get a match with the "OK" codes, check it against the "RD" codes
if [ $STATUS == "??" ]; then
	for CODE in "${RD_CODES[@]}"; do
		if [[ $CODE == $CURL_CODE ]]; then
			# Get URL header again in order to retrieve the URL we are being redirected to.
			# BUG FIX: "$AGENT" must be double-quoted (not single-quoted) so the configured
			# user-agent string is actually sent rather than the literal text "$AGENT".
			NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{redirect_url}\n' $URL)

			# Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
			# those changes out if the user didn't ask for them
			URL_HTTP=$(echo $URL | sed -E 's/^https:/http:/')
			NEW_URL_HTTP=$(echo $NEW_URL | sed -E 's/^https:/http:/')

			# Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
			NEW_URL_LENGTH=$(echo | awk -v input=$NEW_URL_HTTP '{print length(input)}')
			if [ $NEW_URL_LENGTH -lt $MIN_URL_LENGTH ]; then
				NEW_URL_HTTP="[new URL not retrieved]"
			fi

			# Remove slash at end of new URL, if present, so we can filter out the redirects that
			# merely add an ending slash if the user didn't ask for them
			NEW_URL_NO_SLASH=$(echo $NEW_URL_HTTP | sed -E 's:/$::')

			# Detect if this is a youtu.be link simply being expanded by YouTube to the full
			# youtube.com address
			YOUTU_BE=0
			if [[ $URL_HTTP == http://youtu.be* ]] && [[ $NEW_URL_HTTP == http://www.youtube.com* ]]; then
				YOUTU_BE=1
			fi

			# If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
			# wants those to be reported)
			if [ $SHOW_HTTPS -eq 0 ] && [ $URL_HTTP == $NEW_URL_HTTP ]; then
				valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show http->https upgrades, and I was redirected to '$NEW_URL'."
				STATUS="OK"
				let OK_LINKS+=1
				let SKIP_HTTPS_UP+=1
			# If the URLs match besides an added ending slash, then the link is OK (unless user wants
			# those to be reported)
			elif [ $SHOW_SLASH -eq 0 ] && [ $URL_HTTP == $NEW_URL_NO_SLASH ]; then
				valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show added trailing slashes, and I was redirected to '$NEW_URL'."
				STATUS="OK"
				let OK_LINKS+=1
				let SKIP_SLASH_ADD+=1
			# A youtu.be link that merely expanded to its youtube.com equivalent is OK too (unless
			# user wants those to be reported)
			elif [ $SHOW_YT_RD -eq 0 ] && [ $YOUTU_BE -eq 1 ]; then
				valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'."
				STATUS="OK"
				let OK_LINKS+=1
				let SKIP_YOUTU_BE+=1
			else
				STATUS="RD"
				let RD_LINKS+=1
			fi
			break
		fi
	done
fi
889
# If we didn't get a match with the "RD" codes, check it against the "NG" codes
if [ $STATUS == "??" ]; then
	for CODE in "${NG_CODES[@]}"; do
		if [[ $CODE == $CURL_CODE ]]; then
			STATUS="NG"
			let NG_LINKS+=1
			break
		fi
	done
fi

# If we didn't match a known status code, advise the reader
if [ $STATUS == "??" ]; then
	valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown response code $CURL_CODE."
	let SKIP_UNK_CODE+=1
	continue
fi

# Check problem links against exceptions file before proceeding
# (note: when $EXCEPT_URL is empty, the unquoted '[ ! -z $EXCEPT_URL ]' collapses to '[ ! -z ]',
# which happens to evaluate false, so the section is correctly skipped)
if [ $STATUS != "OK" ] && [ ! -z $EXCEPT_URL ]; then
	# The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
	EXPECT_CODE="$CURL_RESULT"
	if [ $STATUS == "EI" ]; then
		EXPECT_CODE="EI"
	elif [ $STATUS == "IW" ]; then
		EXPECT_CODE="IW"
	fi

	# Look for link in exceptions file and make sure its listed result code and wiki page also match.
	# The parameter expansions below imply an exceptions line format of "code,...,page", where a
	# page of "*" acts as a wildcard matching any wiki page.
	GREP_RESULT=$(grep --max-count=1 "$URL" "$EXCEPT_FILE")
	EXCEPT_PAGE=${GREP_RESULT##*,}
	if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == $LOCAL_PAGE_PATH ]; then
		EXCEPT_CODE=${GREP_RESULT%%,*}
		if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
			valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because its expected result, $EXPECT_CODE, is listed in the exceptions file."
			if [ $STATUS == "EI" ]; then
				let SKIP_EXPECT_EI+=1
			elif [ $STATUS == "IW" ]; then
				let SKIP_EXPECT_IW+=1
			else
				let SKIP_EXPECT_NG+=1
			fi
			continue
		fi
	fi
fi
936
# If appropriate, record this link to the log, with clickable URLs when possible.
# (valPrint's first argument appears to select output targets/spacing — t=TXT, r=RTF, h=HTML,
# plus modifier letters — defined earlier in the script; TODO confirm against its definition.)
if [ $STATUS != "OK" ] || [ $RECORD_OK_LINKS -eq 1 ]; then
	# Prepare 'curl' result in parentheses to print after status code, unless this is an "IW" or "EI"
	# link, in which case showing the status code doesn't make sense. Adjust spacing after string to
	# ensure TXT and RTF reports have aligned columns of results.
	CURL_STR_H=" ($CURL_RESULT)"
	CURL_STR_T="$CURL_STR_H"
	CURL_STR_R="$CURL_STR_H "
	if [ $STATUS == "IW" ] || [ $STATUS == "EI" ]; then
		CURL_STR_H=""
		CURL_STR_T=" "
		CURL_STR_R=" "
	fi

	# Record link and its wiki page in TXT, RTF, and HTML markup
	valPrint t "${STATUS}${CURL_STR_T} $STR_TYPE $URL"
	valPrint t " linked from $FULL_PAGE_PATH"
	valPrint r "${STATUS}${CURL_STR_R}${RTF_TABS}$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
	valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
	valPrint hn "<tr><td style=\"white-space:nowrap\">${STATUS}${CURL_STR_H}</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
	valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"

	# Place vertical space here since we won't be printing anything more about this link
	if [ $STATUS == "OK" ]; then valPrint tr ""; valPrint hs ""; fi

	# Record redirect URL if one was given by a 3xx response page
	if [ $STATUS == "RD" ]; then
		valPrint ts " Server suggests $NEW_URL"
		valPrint rs " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
		valPrint hs "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
	fi

	# Notify reader if we can use an intrawiki link for this URL
	if [ $STATUS == "EI" ]; then
		INTRA_PAGE=${URL#*://*/} # strip scheme and host, leaving just the wiki page path
		valPrint ts " Just use [[$INTRA_PAGE]]"
		valPrint rs " Just use [[$INTRA_PAGE]]"
		valPrint hs "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
	fi

	# Notify reader if we can use an interwiki prefix for this URL
	if [ $STATUS == "IW" ]; then
		INTER_PAGE=$(echo "$URL" | sed 's/.*\///') # everything after the last slash
		valPrint ts " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
		valPrint rs " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
		valPrint hs "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
	fi

	# Query Internet Archive for latest "OK" snapshot for "NG" page
	if [ $STATUS == "NG" ] && [ $SUGGEST_SNAPSHOTS -eq 1 ]; then
		ARCHIVE_QUERY=$(curl --silent --max-time 10 "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")

		# If a "closest" snapshot was received...
		if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
			# In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
			ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')

			# ...isolate "url" property in the response that follows the "closest" tag
			# (hand-rolled JSON extraction via parameter expansion; fragile if the API format changes)
			SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
			SNAPSHOT_URL=${SNAPSHOT_URL#*\"url\": \"} # everything after '"url": "'
			SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'

			# Remove the port 80 part that IA often adds to the URL, as it's superfluous
			SNAPSHOT_URL=$(echo $SNAPSHOT_URL | sed 's/:80//')

			# Inform the user of the snapshot URL
			valPrint ts " IA suggests $SNAPSHOT_URL"
			valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
			valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
		else # ...otherwise give generic Wayback Machine link for this URL
			valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
			valPrint rs " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
			valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
		fi
	fi
fi
1013
# If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
if [ $IS_FILE -eq 0 ] && [ $TAKE_PAGE_SHOT -eq 1 ] && [ $STATUS == "OK" ]; then
	# Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
	SHOT_NAME=$(echo "$URL" | sed 's/https*\:\/\///' | sed 'y/:\//__/')
	SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"

	# Don't take screenshot if we already encountered this page and screenshotted it
	if [ ! -f "$SHOT_FILE" ]; then
		# Chrome drops its screenshot file into the working directory; move it into place
		# afterward ('mv -n' refuses to overwrite an existing destination)
		"$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 $URL > /dev/null 2>&1
		if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
			mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
		else
			valPrint trhs "Screenshot of URL $URL seems to have failed!"
		fi
	else
		valPrint trhs "Skipping screenshot of URL '$URL' because file '$SHOT_FILE' already exists."
	fi
fi
done
# End of the main link-processing loop (the loop header is earlier in the script);
# reaching this point means every line of the links file was processed
FINISHED_LIST="yes"
wrapupAndExit
Note: See TracBrowser for help on using the repository browser.