source: Validate External Links/validate_external_links.sh@ 1147

Last change on this file since 1147 was 1147, checked in by iritscen, 4 years ago

ValExtLinks: Changed --suggest-snapshots to --suggest-snapshots-ng and added --suggest-snapshots-ok for getting snapshot URLs for all good links. This can be used to confirm that sites are backed up in case they die in the future, but note that this argument will take hours to run due to the API rate limit. Added awareness of API rate limit so Archive.org will not start blocking script.

File size: 53.3 KB
Line 
#!/bin/bash

# Validate External Links by Iritscen
#
# Validates a list of external links in CSV format. The resulting logs are produced in three formats:
# - TXT (for easy diffing with an earlier log)
# - RTF (for reading as a local file with clickable links)
# - HTML (for reading as a web page)
# Call script with "--help" argument for documentation. Also see Read Me.rtf for critical notes.
#
# Recommended rule:
# |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|
#
# Table of contents (sections of script in order of appearance, not execution):
# • Globals
# • Help Output
# • Setup
# • Utility Functions
# • Summary Output
# • Initialization
# • Data Sourcing
# • Config Output
# • Legend Output
# • Main Loop

# Set separator token to newline so that 'for LINE in `cat file`' loops iterate over whole
# lines instead of space-separated words (URLs and page titles may contain spaces)
IFS="
"
29
### GLOBALS ###
# Settings -- these will be changed from their defaults by the arguments passed in to the script
LINKS_URL="" # use 'curl' to download file with links from this location (can be file://)
EXCEPT_URL="" # 'curl' will access this wiki page with a list of exceptions for NG results
OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES
SHOW_SLASH=0 # record issue when a slash is added to the end of a URL
SHOW_HTTPS=0 # record issue when "http" is upgraded to "https"
SHOW_YT_RD=0 # record redirection for a youtu.be URL expanding to the full URL
SUGGEST_SNAPSHOTS_NG=0 # query the Internet Archive for a possible snapshot URL for each NG page
SUGGEST_SNAPSHOTS_OK=0 # query the Internet Archive for an existing snapshot of each OK page
CHECK_ARCHIVE_LINKS=0 # check URLs under the archive.org domain
TAKE_PAGE_SHOT=0 # take a screenshot of each OK page
TIMEOUT=10 # time to wait for a response when querying a site
CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature
URL_START=1 # start at this URL in LINKS_FILE
URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE
UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report

# Fixed strings -- see the occurrences of these variables to learn their purpose
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 OPR/70.0.3728.154"
ARCHIVE_API="http://archive.org/wayback/available"
ARCHIVE_GENERIC="https://web.archive.org/web/*"
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
CHROME_SCREENSHOT="screenshot.png"
EXCEPT_FILE_NAME="exceptions.txt"
EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
WIKI_CURL="https://wiki.oni2.net/Validate_External_Links/Curl_codes"
WIKI_HTTP="https://wiki.oni2.net/Validate_External_Links/HTTP_codes"
WIKI_MAIN="https://wiki.oni2.net/Validate_External_Links"
WIKI_ME="http://iritscen.oni2.net"
THIS_DIR=$(cd $(dirname $0); pwd) # absolute directory this script lives in
WORKING_DIR=$(pwd) # directory the user invoked the script from
WIKI_PATH="wiki.oni2.net"

# These are parallel arrays of the IDs and names of OniGalore's current namespaces.
# The two arrays must be kept in sync: NS_NAMES[i] is the name for NS_IDS[i].
declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")

# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png py rar tga TRMA txt vbs wav wmv xaf xml zip)
declare -a HTTP_TLDS_AND_PAGES=(action ars asp aspx cfm cgi com css de do full htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x)

# These arrays tells us which HTTP response codes are OK (good), which are RD (redirections), and which
# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
# if you add a new code.
# NOTE(review): some 4xx/5xx codes (401, 405, 406, 418, 501) are listed as OK — presumably
# because any response at all proves the site is alive; confirm before changing.
declare -a OK_CODES=(200 401 405 406 418 501)
declare -a RD_CODES=(301 302 303 307 308)
declare -a NG_CODES=(000 400 403 404 410 500 502 503 530)

# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
# transcluded text, and if the transclusion fails, then the braces show up in the URL
ILLEGAL_CHARS="{ }"

# The shortest URL possible, used for sanity-checking some URLs: http://a.co
MIN_URL_LENGTH=11

# These are parallel arrays giving the prefixes that can be used in place of normal external links to
# some wikis and other sites; INTERWIKI_DOMAINS[i] is the domain for INTERWIKI_PREFIXES[i]
declare -a INTERWIKI_PREFIXES=(commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies wikiversity wikivoyage wikt wp)
declare -a INTERWIKI_DOMAINS=(commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org wikiversity.org wikivoyage.org wiktionary.org wikipedia.org)

# Variables for keeping track of main loop progress and findings
LINK_NUM=0 # iterator over rows of the links CSV
EI_LINKS=0 # "external internal" links found
IW_LINKS=0 # potential interwiki links found
OK_LINKS=0 # links that responded with an OK_CODES code
RD_LINKS=0 # links that responded with an RD_CODES code
NG_LINKS=0 # links that responded with an NG_CODES code
SKIP_UNK_NS=0 # rows with a namespace not in NS_IDS
SKIP_JS_PAGE=0 # links found on JavaScript pages
SKIP_BAD_URL=0 # URLs containing ILLEGAL_CHARS or otherwise malformed
SKIP_NON_ASCII=0 # URLs containing non-ASCII characters
SKIP_UNK_SUFFIX=0 # URLs whose suffix is in neither HTTP_FILES nor HTTP_TLDS_AND_PAGES
SKIP_UNK_CODE=0 # responses not in OK_CODES, RD_CODES or NG_CODES
SKIP_EXPECT_NG=0 # NG results suppressed by the exceptions list
SKIP_EXPECT_RD=0 # RD results suppressed by the exceptions list
SKIP_EXPECT_EI=0 # EI results suppressed by the exceptions list
SKIP_EXPECT_IW=0 # IW results suppressed by the exceptions list
SKIP_HTTPS_UP=0 # trivial http->https redirects (hidden unless SHOW_HTTPS)
SKIP_SLASH_ADD=0 # trivial slash-adding redirects (hidden unless SHOW_SLASH)
SKIP_YOUTU_BE=0 # trivial youtu.be expansions (hidden unless SHOW_YT_RD)
SKIP_ARCHIVE_ORG=0 # archive.org links not checked (unless CHECK_ARCHIVE_LINKS)
FILE_LINKS=0 # links whose suffix marks them as files
PAGE_LINKS=0 # links whose suffix marks them as pages
SKIPPED_HEADER_ROW=0 # set to 1 once the CSV's header row has been consumed
FINISHED_LIST="no" # "no" = canceled, "limit" = hit URL_LIMIT; other value means list completed
START_RUN=0 # epoch seconds at start of main loop
END_RUN=0 # epoch seconds at end of run
120
121
### HELP OUTPUT ###
# A pseudo-man page, paged through 'less' when --help is given (see SETUP below).
# Here is the 80-character rule for the page text:
# 234567890123456789012345678901234567890123456789012345678901234567890123456789
function printHelp()
{
 cat << EOF

NAME
 Validate External Links

SYNOPSIS
 validate_external_links.sh --help
 validate_external_links.sh --links URL --output DIR [--exceptions URL]
  [--record-ok-links] [--show-added-slashes] [--show-https-upgrades]
  [--show-yt-redirects] [--suggest-snapshots-ng] [--suggest-snapshots-ok]
  [--check-archive-links] [--take-screenshots FILE] [--timeout NUM]
  [--start-url NUM] [--end-url NUM] [--upload FILE]

DESCRIPTION
 This script parses a list of external links found in the OniGalore wiki
 (which is dumped by the Oni2.net server periodically in a particular
 format), validates them using the Unix tool 'curl', and produces a report
 of which links were "OK" (responded positively to an HTTP query), which
 were "RD" (responded with a 3xx redirect code), which could be "IW"
 (interwiki) links, which are "EI" (external internal) links and could be
 intrawiki links, and which were "NG" (no good; a negative response to the
 query). This report can then be automatically uploaded to the location of
 your choice. The script can also suggest Internet Archive snapshots for
 "NG" links, and take screenshots of "OK" links for visual verification by
 the reader that the page in question is the one intended to be displayed.

 You must pass this script the URL at which the list of links is found
 (--links) and the path where the directory of logs should be outputted
 (--output). All other arguments are optional.

OPTIONS
 --help                  Show this page.
 --links URL             (required) URL from which to download the CSV
                         file with external links. Note that this URL can
                         be a local file if you supply a file:// path.
 --output DIR            (required) Unix path to directory in which Val
                         should place its reports.
 --exceptions URL        In order to remove links from the report which
                         Val finds an issue with but which you regard as
                         OK, list those desired exceptions on a wiki page.
                         See the sample file "exceptions.pdf" for the
                         required format of the page. Note that this URL
                         can point to a local file if you supply a path
                         beginning with "file://".
 --record-ok-links       Log a link in the report even if its response
                         code is "OK".
 --show-added-slashes    Report on redirects that simply add a '/' to the
                         end of the URL.
 --show-https-upgrades   Report on redirects that simply upgrade a
                         "http://" URL to a "https://" URL.
 --show-yt-redirects     Report on redirects that expand a youtu.be URL.
 --suggest-snapshots-ng  Query the Internet Archive for a possible
                         snapshot URL for each "NG" page.
 --suggest-snapshots-ok  Query the Internet Archive for a snapshot of each
                         "OK" page just to make sure it's available. Note
                         that this will add a tremendous amount of time to
                         the script execution because there is a rate
                         limit to the Archive API. Note that this option
                         does nothing unless you also use the
                         --record-ok-links argument.
 --check-archive-links   Check links that are already pointing to a page
                         on the Internet Archive. In theory these links
                         should be totally stable and not need validation.
 --take-screenshots FILE Call the Google Chrome binary at this path to
                         take screenshots of each "OK" page.
 --timeout NUM           Wait this many seconds for a site to respond. The
                         default is 10. Important note: Val will attempt
                         to reach each URL three times, so the time taken
                         to ping an unresponsive site will be three times
                         this setting.
 --start-url NUM         Start at this link in the links CSV file.
 --end-url NUM           Stop at this link in the links CSV file.
 --upload FILE           Upload report using the credentials and path
                         given in this local text file. See sftp_login.txt
                         for template.

BUGS
 The script cannot properly parse any line in the external links file
 which contains a comma in the name of the wiki page containing a link.
 Commas in the link itself are not an issue.
EOF
}
209
210
### SETUP ###
# If first argument is a help request, or if nothing was passed in at all, print help page and quit
if [ "$#" -eq 0 ] || [ "$1" == "--help" ]; then
 printHelp | less
 exit 0
fi

# Parse arguments as long as there are more arguments to process
while (( "$#" )); do
 case "$1" in
  --links ) LINKS_URL="$2"; shift 2;;
  --exceptions ) EXCEPT_URL="$2"; shift 2;;
  --output ) OUTPUT_DIR="$2"; shift 2;;
  --record-ok-links ) RECORD_OK_LINKS=1; shift;;
  --show-added-slashes ) SHOW_SLASH=1; shift;;
  --show-https-upgrades ) SHOW_HTTPS=1; shift;;
  --show-yt-redirects ) SHOW_YT_RD=1; shift;;
  --suggest-snapshots-ng ) SUGGEST_SNAPSHOTS_NG=1; shift;;
  --suggest-snapshots-ok ) SUGGEST_SNAPSHOTS_OK=1; shift;;
  --check-archive-links ) CHECK_ARCHIVE_LINKS=1; shift;;
  --take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
  --timeout ) TIMEOUT=$2; shift 2;;
  --start-url ) URL_START=$2; shift 2;;
  --end-url ) URL_LIMIT=$2; shift 2;;
  --upload ) UPLOAD_INFO=$2; shift 2;;
  * ) echo "Invalid argument $1 detected. Aborting."; exit 1;;
 esac
done

# If the required arguments were not supplied, print help page and quit
# NOTE(review): $LINKS_URL and $OUTPUT_DIR are unquoted inside '[ -z ]'; this works
# only while the values contain no whitespace — consider quoting
if [ -z $LINKS_URL ] || [ -z $OUTPUT_DIR ]; then
 echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
 exit 2
fi

# If user wants screenshots, make sure path to Chrome was passed in and is valid
if [ $TAKE_PAGE_SHOT -eq 1 ]; then
 if [ ! -f "$CHROME_PATH" ]; then
  echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
  exit 3
 fi
fi

# Check that UPLOAD_INFO exists, if this argument was supplied
if [ ! -z $UPLOAD_INFO ] && [ ! -f "$UPLOAD_INFO" ]; then
 echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
 exit 4
fi

# Check that OUTPUT_DIR is a directory
if [ ! -d "$OUTPUT_DIR" ]; then
 echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
 exit 5
fi

# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
SHOT_PATH="$OUTPUT_PATH/Screenshots"
LOG_NAME="ValExtLinks report"
LOG_NAME_TXT="$LOG_NAME.txt"
LOG_NAME_RTF="$LOG_NAME.rtf"
LOG_NAME_HTM="$LOG_NAME.htm"
LOG_PATH="$OUTPUT_PATH/$LOG_NAME"
LOG_PATH_TXT="$LOG_PATH.txt"
LOG_PATH_RTF="$LOG_PATH.rtf"
LOG_PATH_HTM="$LOG_PATH.htm"
mkdir "$OUTPUT_PATH"
if [ $TAKE_PAGE_SHOT -eq 1 ]; then
 mkdir "$SHOT_PATH"
fi

# Check that 'mkdir' succeeded
if [ ! -d "$OUTPUT_PATH" ]; then
 echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_PATH. Aborting."
 exit 6
fi

# Get date on the file at LINKS_URL and print to log
LINKS_DATE=$(curl --silent --head $LINKS_URL | grep "Last-Modified")
if [ -z "$LINKS_DATE" ]; then
 echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
 exit 7
fi
# Strip the header name, leaving just the timestamp value
LINKS_DATE=${LINKS_DATE#Last-Modified: }
298
299
### UTILITY FUNCTIONS ###
# Writes the plain-text report header (title, timestamps, author credit) to the TXT log
function printTXTheader()
{
 local TXT_LINE
 for TXT_LINE in "Validate External Links report" \
  "generated $NICE_TIME" \
  "from data of $LINKS_DATE" \
  "script by Iritscen (contact: $WIKI_ME)" \
  ""; do
  valPrint t "$TXT_LINE"
 done
}
310
# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified.
# The string below is raw RTF 1.x markup (font table, color table, margins); the first \pard sets
# centered alignment (\qc) for the title block, the second \pard returns to default alignment.
# Do not reflow or re-indent it — the literal newlines are part of the RTF source.
function printRTFheader()
{
 valPrint r "{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww12600\viewh12100\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0

\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
generated $NICE_TIME\\
from data of $LINKS_DATE\\
script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$WIKI_ME\"}}{\fldrslt contact}})
\\
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
\cf0 "
}
329
# Emits the closing brace that terminates the RTF document opened by printRTFheader
function printRTFfooter()
{
 local RTF_CLOSING="}"
 valPrint r "$RTF_CLOSING"
}
335
# Writes the HTML header to HTML log file: opens <html>/<body>, prints the report title and the
# generation/source timestamps, and links the author's contact page ($WIKI_ME).
# The literal newlines inside the string are intentional and appear in the output file.
function printHTMheader()
{
 valPrint h "<html>
<head>
<title>Validate External Links report</title>
</head>
<body>
<h2>Validate External Links report</h2>
<h3>generated $NICE_TIME<br />
from data of $LINKS_DATE<br />
script by Iritscen (<a href=\"$WIKI_ME\" target=\"_blank\">contact</a>)</h3>"
}
349
# Emits the closing tags that terminate the HTML document opened by printHTMheader
function printHTMfooter()
{
 local HTM_CLOSING="</body>
</html>"
 valPrint h "$HTM_CLOSING"
}
356
# The central logging function. The first parameter is a string composed of one or more characters
# that select the destinations: 'c' = console, 't' = TXT log, 'r' = RTF log, 'h' = HTML log.
# Modifiers: 'n' = suppress the (visual) newline at the end of the line; 's' = add an extra blank
# line after it; 'w' = send console output raw instead of through 'fmt' ('fmt' wraps to an
# 80-column CLI but can break special formatting and the 'n' option).
# Each log format expresses "newline" its own way: the TXT log uses real newlines, the RTF log
# uses \line, and the HTML log uses <br /> (or an empty table row for 's').
function valPrint()
{
 local TARGETS=$1
 local TEXT=$2

 # Console
 if [[ $TARGETS == *c* ]]; then
  if [[ $TARGETS == *n* ]]; then
   echo -n "$TEXT"
  elif [[ $TARGETS == *w* ]]; then
   echo "$TEXT"
  elif [[ $TARGETS == *s* ]]; then
   echo -e "$TEXT\n"
  else
   echo "$TEXT" | fmt -w 80
  fi
 fi

 # TXT log
 if [[ $TARGETS == *t* ]]; then
  if [[ $TARGETS == *n* ]]; then
   echo -n "$TEXT" >> "$LOG_PATH_TXT"
  elif [[ $TARGETS == *s* ]]; then
   echo -e "$TEXT\n" >> "$LOG_PATH_TXT"
  else
   echo "$TEXT" >> "$LOG_PATH_TXT"
  fi
 fi

 # RTF log ('n' omits the \line control word; the file newline itself is invisible in RTF)
 if [[ $TARGETS == *r* ]]; then
  if [[ $TARGETS == *n* ]]; then
   echo "$TEXT" >> "$LOG_PATH_RTF"
  elif [[ $TARGETS == *s* ]]; then
   echo "$TEXT\line\line" >> "$LOG_PATH_RTF"
  else
   echo "$TEXT\line" >> "$LOG_PATH_RTF"
  fi
 fi

 # HTML log (note: 's' takes precedence over 'n' here)
 if [[ $TARGETS == *h* ]]; then
  if [[ $TARGETS == *s* ]]; then
   echo "$TEXT<tr><td>&nbsp;</td></tr>" >> "$LOG_PATH_HTM"
  elif [[ $TARGETS == *n* ]]; then
   echo "$TEXT" >> "$LOG_PATH_HTM"
  else
   echo "$TEXT<br />" >> "$LOG_PATH_HTM"
  fi
 fi
}
403
# Pluralize the string in parameter 1 if the number in parameter 2 is not 1.
# Nouns ending in 'x' (e.g. "suffix") take "-es"; everything else takes "-s".
# Fix: the expansions are now quoted and braced — the original 'echo $1es' relied on
# unquoted expansion, which word-splits and globs nouns containing spaces or wildcards.
function pluralCheckNoun()
{
 if [ "$2" -ne 1 ]; then
  if [[ "$1" =~ x$ ]]; then
   echo "${1}es"
  else
   echo "${1}s"
  fi
 else
  echo "$1"
 fi
}
417
# Choose the verb form: "is" for a count of exactly 1, "are" otherwise
function pluralCheckIs()
{
 if (( $1 == 1 )); then
  echo "is"
 else
  echo "are"
 fi
}
427
# Choose the verb form: "was" for a count of exactly 1, "were" otherwise
function pluralCheckWas()
{
 if (( $1 == 1 )); then
  echo "was"
 else
  echo "were"
 fi
}
437
# Emit the article "a " (with trailing space) when the count is exactly 1, nothing otherwise
function pluralCheckA()
{
 if (( $1 == 1 )); then
  echo "a "
 fi
}
445
# Emit the article "an " (with trailing space) when the count is exactly 1, nothing otherwise
function pluralCheckAn()
{
 if (( $1 == 1 )); then
  echo "an "
 fi
}
453
# Upload the reports using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
# reports being saved to disk have already been closed.
# Reads credentials from UPLOAD_INFO, whose lines are prefixed "user:", "pw:", "port:" and "path:",
# then drives an sftp session via the 'expect' script EXPECT_SCRIPT_NAME for each report format.
function uploadReport()
{
 valPrint c "Uploading reports..."

 SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
 SFTP_USER_NAME_MARKER="user:"
 SFTP_PASSWORD_MARKER="pw:"
 SFTP_PORT_MARKER="port:"
 SFTP_PATH_MARKER="path:"
 # Pull each credential line out of UPLOAD_INFO and strip its marker prefix
 SFTP_USER_NAME=$(grep $SFTP_USER_NAME_MARKER $UPLOAD_INFO)
 SFTP_USER_NAME=${SFTP_USER_NAME#$SFTP_USER_NAME_MARKER}
 SFTP_PASSWORD=$(grep $SFTP_PASSWORD_MARKER $UPLOAD_INFO)
 SFTP_PASSWORD=${SFTP_PASSWORD#$SFTP_PASSWORD_MARKER}
 SFTP_PORT=$(grep $SFTP_PORT_MARKER $UPLOAD_INFO)
 SFTP_PORT=${SFTP_PORT#$SFTP_PORT_MARKER}
 SFTP_PATH=$(grep $SFTP_PATH_MARKER $UPLOAD_INFO)
 SFTP_PATH=${SFTP_PATH#$SFTP_PATH_MARKER}

 local SUFFIX
 local UPLOAD_RESULT
 for SUFFIX in htm rtf txt; do
  expect "$SCRIPT_PATH" "$LOG_PATH.$SUFFIX" $SFTP_USER_NAME $SFTP_PASSWORD $SFTP_PORT $SFTP_PATH "$LOG_NAME.$SUFFIX"
  # Capture 'expect's exit status immediately. The original code tested and printed "$?"
  # separately, so the error message always reported the status of the '[' test (0)
  # instead of the actual failure code.
  UPLOAD_RESULT=$?

  if [ $UPLOAD_RESULT -ne 0 ]; then
   valPrint c "Error $UPLOAD_RESULT occurred when attempting to upload $LOG_NAME.$SUFFIX!"
  else
   # Quote the character classes so the shell cannot glob them against files named like "l"
   valPrint c "Report in $(echo $SUFFIX | tr '[:lower:]' '[:upper:]') format was uploaded."
  fi
 done
}
484
# Prints session summary when script is done. Also installed as the INT trap handler (below),
# so it runs both on normal completion and when the user presses Ctrl-C mid-run.
# Reads the global counters maintained by the main loop, prints totals to all four outputs,
# closes the RTF/HTML markup, optionally uploads the reports, and exits 0.
function wrapupAndExit()
{
 # Get off progress line on console, drop down a line from last link in log, and close HTML table
 valPrint ctr ""
 valPrint h "</table><br />"

 # If we didn't finish processing the last URL, then the iterator is one too high
 # NOTE(review): FINISHED_LIST is "no" (canceled) or "limit" (hit --end-url) here; the
 # value that marks normal completion is set outside the visible part of this file
 if [ $FINISHED_LIST != "yes" ]; then
  let LINK_NUM-=1
  if [ $FINISHED_LIST == "no" ]; then
   valPrint ctrh "The session was canceled by the user."
  fi
 fi

 # Generate string with elapsed time
 END_RUN=$(date +%s)
 ELAPSED=$(echo $(($END_RUN - $START_RUN)) | awk '{printf "%d min. %d sec. elapsed", int($1/60), int($1%60)}')

 # Do some math on results of session
 LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
 TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE)) # redirects hidden by default
 LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
 LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_RD+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
 LINK_PROBLEMS_TOTAL=$((NG_LINKS+RD_LINKS+EI_LINKS+IW_LINKS))
 # "NET" totals are the problems remaining after subtracting the excepted ones
 LINK_PROBLEMS_NG=$((NG_LINKS-SKIP_EXPECT_NG))
 LINK_PROBLEMS_RD=$((RD_LINKS-SKIP_EXPECT_RD))
 LINK_PROBLEMS_EI=$((EI_LINKS-SKIP_EXPECT_EI))
 LINK_PROBLEMS_IW=$((IW_LINKS-SKIP_EXPECT_IW))
 LINK_PROBLEMS_NET=$((LINK_PROBLEMS_NG+LINK_PROBLEMS_RD+LINK_PROBLEMS_EI+LINK_PROBLEMS_IW))

 # Print something in the Links section if no link issues were printed
 if [ $LINK_PROBLEMS_NET -eq 0 ]; then
  valPrint h "<i>No link problems to report! See the <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for a list of links with issues that were not reported.</i>"
 fi
 if [ $LINK_PROBLEMS_TOTAL -eq 0 ]; then
  valPrint t "No link problems to report!"
  valPrint r "\i1 No link problems to report! \i0"
 fi

 ## SUMMARY OUTPUT ##
 valPrint ct "Summary ($ELAPSED):"
 valPrint r "\b1 Summary \b0 ($ELAPSED)"
 valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
 valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT) (there $(pluralCheckWas $FILE_LINKS) $FILE_LINKS file $(pluralCheckNoun link $FILE_LINKS) and $PAGE_LINKS page $(pluralCheckNoun link $PAGE_LINKS))."

 # Print processed link totals
 if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
 if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
 if [ $SKIP_ARCHIVE_ORG -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVE_ORG Archive.org $(pluralCheckNoun link $SKIP_ARCHIVE_ORG) were not checked"; fi
 if [ $LINK_PROBLEMS_TOTAL -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_TOTAL processed $(pluralCheckNoun link $LINK_PROBLEMS_TOTAL) had $(pluralCheckAn $LINK_PROBLEMS_TOTAL)$(pluralCheckNoun issue $LINK_PROBLEMS_TOTAL)"; fi
 if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED link $(pluralCheckNoun issue $LINKS_EXCEPTED) from report)"; valPrint h "&nbsp;&nbsp;(excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
 if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi
 if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h "&nbsp;&nbsp;(counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi

 # Print errored link totals
 if [ $LINK_ERRORS -gt 0 ]; then
  valPrint c "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"
  valPrint h "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
  valPrint rt "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS):"
 fi
 if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
 if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
 if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
 if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
 if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
 if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi

 # Print excepted link totals
 if [ $LINKS_EXCEPTED -gt 0 ]; then
  valPrint c "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"
  valPrint h "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
  valPrint rt "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted:"
 fi
 if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
 if [ $SKIP_EXPECT_RD -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_RD/$RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
 if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
 if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi

 # Print checked link totals
 if [ $LINK_PROBLEMS_NET -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS_NET link $(pluralCheckNoun issue $LINK_PROBLEMS_NET):"; fi
 if [ $LINK_PROBLEMS_NG -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_NG NG $(pluralCheckNoun link $LINK_PROBLEMS_NG)"; fi
 if [ $LINK_PROBLEMS_RD -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_RD $(pluralCheckNoun redirection $LINK_PROBLEMS_RD)"; fi
 if [ $LINK_PROBLEMS_EI -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_EI $(pluralCheckNoun link $LINK_PROBLEMS_EI) that could be intrawiki"; fi
 if [ $LINK_PROBLEMS_IW -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_IW $(pluralCheckNoun link $LINK_PROBLEMS_IW) that could be interwiki"; fi

 # Close the log files' markup
 valPrint trh "ValExtLinks says goodbye."
 printRTFfooter
 printHTMfooter

 # Upload report if this was requested (skipped when the run was canceled)
 if [ ! -z $UPLOAD_INFO ] && [ $FINISHED_LIST != "no" ]; then
  uploadReport
 fi

 # Really quit now
 valPrint c "ValExtLinks says goodbye."
 exit 0
}
trap wrapupAndExit INT
586
587
### INITIALIZATION ###
# Print opening message to console and log files
valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
printTXTheader
printRTFheader
printHTMheader

## DATA SOURCING ##
valPrint t "Startup:"
valPrint r "\b1 Startup \b0"
valPrint hn "<h3>Startup</h3>"

# Attempt to download file at LINKS_URL, then check that it succeeded
valPrint cwtrhn "Downloading list of external links from $LINKS_URL..."
LINKS_FILE_NAME=$(echo "$LINKS_URL" | sed 's/.*\///') # basename of the URL
LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
curl --silent -o "$LINKS_FILE" $LINKS_URL
if [ ! -f "$LINKS_FILE" ]; then
 echo -e "\nThe download of $LINKS_URL appears to have failed. Aborting."
 wrapupAndExit
else
 valPrint ctrh " success."
fi

# Attempt to download file at EXCEPT_URL, then check that it succeeded
if [ ! -z $EXCEPT_URL ]; then
 valPrint cwtrhn "Downloading list of reporting exceptions from $EXCEPT_URL..."
 EXCEPT_DATA=$(curl --silent $EXCEPT_URL)
 if [ -z "$EXCEPT_DATA" ]; then
  echo -e "\nThe download of the exceptions data from '$EXCEPT_URL' appears to have failed. Aborting."
  wrapupAndExit
 else
  valPrint ctrh " success."
 fi
 # Keep only the text between the "BEGIN LIST" and "END LIST" markers on the wiki page
 EXCEPT_DATA=${EXCEPT_DATA%END LIST*}
 EXCEPT_DATA=${EXCEPT_DATA#*BEGIN LIST}
 EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"

 # Store on disk for debugging purposes
 echo "$EXCEPT_DATA" > "$EXCEPT_FILE"

 # Transfer to array for easy searching later (IFS is newline, so one element per line)
 declare -a EXCEPT_ARRAY=($(echo "$EXCEPT_DATA"))
fi

# Pipe 'cat' to 'wc' because passing LINKS_FILE to 'wc' would print the file name after the line count
LINK_COUNT_STRING=$(cat "$LINKS_FILE" | wc -l)

# Number of URLs is number of lines minus one (first line is column header row for the CSV)
LINK_COUNT=$(echo "${LINK_COUNT_STRING}" | tr -d '[:space:]') # strip 'wc's padding
let LINK_COUNT-=1
valPrint ctrh "Found $LINK_COUNT links to process."
valPrint trh ""
641
## CONFIG OUTPUT ##
# Echo the effective settings for this run into the reports
valPrint t "Config:"
valPrint r "\b1 Config \b0"
valPrint hn "<h3>Config</h3>"

# Report how many links will actually be considered given --start-url/--end-url
valPrint ctrhn "Links to consider: "
if [ $URL_LIMIT -ne 0 ] && [ $URL_LIMIT -lt $LINK_COUNT ]; then
 valPrint ctrh "$((URL_LIMIT-URL_START+1)), from link $URL_START to $URL_LIMIT"
elif [ $URL_START -ne 1 ]; then
 valPrint ctrh "$((LINK_COUNT-URL_START+1)), from link $URL_START to $LINK_COUNT"
else
 valPrint ctrh "$LINK_COUNT"
fi

valPrint ctrh "Site query timeout: $TIMEOUT seconds"

valPrint ctrhn "Show OK links: "
if [ $RECORD_OK_LINKS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi

valPrint ctrhn "Take screenshots: "
if [ $TAKE_PAGE_SHOT -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi

valPrint ctrhn "Suggest archive.org snapshots for NG pages: "
if [ $SUGGEST_SNAPSHOTS_NG -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi

valPrint ctrhn "Suggest archive.org snapshots for OK pages: "
if [ $SUGGEST_SNAPSHOTS_OK -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi

# The "show" flags are reported inverted, as "Ignore ...": showing = not ignoring
valPrint ctrhn "Ignore slash-adding redirects: "
if [ $SHOW_SLASH -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi

valPrint ctrhn "Ignore HTTPS-upgrading redirects: "
if [ $SHOW_HTTPS -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi

valPrint ctrhn "Ignore youtu.be redirects: "
if [ $SHOW_YT_RD -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi

valPrint ctrhn "Check archive.org links: "
if [ $CHECK_ARCHIVE_LINKS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi

valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
valPrint trh ""

## LEGEND OUTPUT ##
# Explain the result abbreviations used throughout the Links section
valPrint t "Legend:"
valPrint r "\b1 Legend \b0"
valPrint hn "<h3>Legend</h3>"
valPrint t "(For guidance in fixing these links, see $WIKI_MAIN.)"
valPrint r "(For guidance in fixing these links, see {\field{\*\fldinst{HYPERLINK \"$WIKI_MAIN\"}}{\fldrslt here}}.)"
valPrint h "(For guidance in fixing these links, see <a href=\"$WIKI_MAIN\" target=\"_blank\">here</a>.)"
valPrint trh "OK = URL seems to be working"
valPrint trh "NG = URL no longer seems to work"
valPrint trh "RD = URL is redirecting to this new URL"
valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup"
valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup"
valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $WIKI_HTTP)"
valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$WIKI_HTTP\"}}{\fldrslt here}} for code reference)"
valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$WIKI_HTTP\" target=\"_blank\">here</a> for code reference)"
valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $WIKI_CURL)"
valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$WIKI_CURL\"}}{\fldrslt here}} for code reference)"
valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$WIKI_CURL\" target=\"_blank\">here</a> for code reference)"
valPrint trh "IA suggests = Last available snapshot returned by the Internet Archive"
valPrint trh "Try browsing = The Archive failed to return a snapshot URL, so check for a snapshot manually using this link"
valPrint trh ""
707
708
### MAIN LOOP ###
# Walk every row of the links CSV, classify each URL (EI/IW/OK/RD/NG), apply the
# exceptions list, log results in TXT/RTF/HTML, and optionally query the Internet
# Archive for snapshots and screenshot "OK" pages with headless Chrome.
valPrint t "Links:"
valPrint r "\b1 Links \b0"
valPrint hn "<h3>Links</h3>"
START_RUN=$(date +%s)
# Process each line of the .csv in LINKS_FILE (IFS is newline, so this iterates by line)
for LINE in $(cat "$LINKS_FILE"); do
	START_LINK=$(date +%s)
	let LINK_NUM+=1

	# First line is the column header row for the CSV, so let's verify that the format hasn't changed
	if [ $SKIPPED_HEADER_ROW -eq 0 ]; then
		if [ $LINE == "namespace,title,target" ]; then
			SKIPPED_HEADER_ROW=1
			LINK_NUM=0 # the header row is not a link, so reset the link counter
			valPrint hn "<table>"
			continue
		else
			valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
			wrapupAndExit
		fi
	fi

	# Skip this link if we are not at URL_START yet
	if [ $LINK_NUM -lt $URL_START ]; then
		continue
	fi

	# Stop if we are at the limit declared for testing purposes
	if [ $URL_LIMIT -gt 0 ] && [ $LINK_NUM -gt $URL_LIMIT ]; then
		FINISHED_LIST="limit"
		wrapupAndExit
	fi

	# Print progress to screen
	if [ $LINK_NUM -gt 1 ]; then
		printf "\e[1A\n" # erase previous progress message so that new one appears in its place
	fi
	valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."

	# The number of the namespace is the element before the first comma on the line
	NS_ID=${LINE%%,*}

	# Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
	NS_NAME=""
	a=0
	while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
		if [ $NS_ID == "NULL" ]; then
			break
		elif [ $NS_ID -eq ${NS_IDS[$a]} ]; then
			NS_NAME="${NS_NAMES[$a]}"
			break
		fi
		let a+=1
	done
	if [ "$NS_NAME" == "" ]; then
		if [ $NS_ID == "NULL" ]; then
			valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace (and probably the page too) is 'NULL'. Probably the link is no longer in existence on the wiki."
		else
			valPrint trs "Skipping line $LINK_NUM ('$LINE') because I could not find a name for namespace ID $NS_ID."
		fi
		let SKIP_UNK_NS+=1
		continue
	fi

	# The name of the page is everything between the namespace ID and the next comma on the line (commas
	# in page names will break this)
	PAGE_NAME=${LINE#$NS_ID,}
	PAGE_NAME=${PAGE_NAME%%,*}

	# We don't want to consider wiki pages ending in .js, as the MW parser cannot reliably isolate URLS
	# in JavaScript code, so it returns erroneous links
	PAGE_NAME_SUFFIX=$(echo $PAGE_NAME | sed 's/.*\.//')
	if [ $PAGE_NAME_SUFFIX == "js" ]; then
		valPrint trs "Skipping URL '${LINE#$NS_ID,$PAGE_NAME,}' on line $LINK_NUM because it was found on JavaScript page '$PAGE_NAME'."
		let SKIP_JS_PAGE+=1
		continue
	fi

	# Build longer wiki page URLs from namespace and page names
	FULL_PAGE_PATH=https://$WIKI_PATH/$NS_NAME:$PAGE_NAME
	LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
	# Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it
	# explicitly breaks the link
	if [ $NS_ID -eq 0 ]; then
		FULL_PAGE_PATH=https://$WIKI_PATH/$PAGE_NAME
		LOCAL_PAGE_PATH=$PAGE_NAME
	fi

	# The URL being linked to is everything after the previous two fields (this allows commas to be in
	# the URLs, but a comma in the previous field, the page name, will break this)
	URL=${LINE#$NS_ID,$PAGE_NAME,}

	# Scan for illegal characters
	if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
		valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because it contains characters illegal in a URL."
		let SKIP_BAD_URL+=1
		continue
	fi

	# If we're skipping Archive.org links, see if this is one
	if [ $CHECK_ARCHIVE_LINKS -eq 0 ] && [[ $URL == *web.archive.org* ]]; then
		valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to check Wayback Machine links."
		let SKIP_ARCHIVE_ORG+=1
		continue
	fi

	# Now we need to know if the URL is for a file or a web page. First step is to determine if the
	# URL ends in a suffix
	HAS_SUFFIX=0

	# If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
	CLEAN_URL=${URL%%\?*}

	# If the URL ends in something like "#section_15", strip everything from the '#' onward
	CLEAN_URL=${CLEAN_URL%%\#*}

	# 'sed' cannot handle Unicode in my Bash shell, so skip non-ASCII URL and make user check it
	if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
		valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I cannot handle non-ASCII characters."
		let SKIP_NON_ASCII+=1
		continue
	fi

	# Isolate the characters after the last period and after the last slash
	POST_DOT=$(echo "$CLEAN_URL" | sed 's/.*\.//')
	POST_SLASH=$(echo "$CLEAN_URL" | sed 's/.*\///')

	# If the last period comes after the last slash, then the URL ends in a suffix
	POST_DOT_LENGTH=$(echo | awk -v input=$POST_DOT '{print length(input)}')
	POST_SLASH_LENGTH=$(echo | awk -v input=$POST_SLASH '{print length(input)}')
	if [ $POST_DOT_LENGTH -lt $POST_SLASH_LENGTH ]; then
		HAS_SUFFIX=1
	else
		HAS_SUFFIX=0
	fi

	# Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
	# known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES
	IS_FILE=-1
	if [ $HAS_SUFFIX -eq 0 ]; then
		IS_FILE=0
	else
		# Turn off case sensitivity while we compare suffixes
		shopt -s nocasematch

		# Special case: URLs ending in something like "productID.304297400" are pages, not files, so if
		# the URL's suffix is all numbers, we are looking at the end of a web page URL
		if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
			IS_FILE=0
		fi

		# Special case: URLs ending in a parens, e.g. "ms537113(v=vs.85)", are pages
		if [[ $POST_DOT =~ ^.*[\(\)]$ ]]; then
			IS_FILE=0
		fi

		# Special case: URLs containing a '%', e.g. "10.1007%2FBF00329055", are pages
		if [[ $POST_DOT == *%* ]]; then
			IS_FILE=0
		fi

		# If we did not identify this URL as a web page above, we need to compare the suffix against known
		# file extensions
		if [ $IS_FILE -eq -1 ]; then
			for EXTENSION in "${HTTP_FILES[@]}"; do
				if [[ $EXTENSION == $POST_DOT ]]; then # double brackets respect the nocasematch setting
					IS_FILE=1
					break
				fi
			done
		fi

		# If we did not identify this URL as a file above, we need to compare the suffix against known
		# pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
		# needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
		if [ $IS_FILE -eq -1 ]; then
			for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
				if [[ $PAGE_OR_TLD == $POST_DOT ]]; then
					IS_FILE=0
					break
				fi
			done
		fi

		# Turn case sensitivity back on in Bash
		shopt -u nocasematch
	fi

	# If this suffix escaped identification as either a file, page or TLD, inform the user
	STR_TYPE=""
	if [ $IS_FILE -eq -1 ]; then
		valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown URL ending '$POST_DOT'. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
		let SKIP_UNK_SUFFIX+=1
		continue
	elif [ $IS_FILE -eq 1 ]; then
		STR_TYPE="file"
		let FILE_LINKS+=1
	elif [ $IS_FILE -eq 0 ]; then
		STR_TYPE="page"
		let PAGE_LINKS+=1
	fi

	# Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
	# issue with sites that require HTTPS
	# (fixed: "$AGENT" must be double-quoted so the configured user agent is expanded; single
	# quotes sent the literal text '$AGENT' to the server)
	CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time $TIMEOUT --retry 2 --write-out '%{http_code}\n' $URL)
	CURL_ERR=$(echo $?)
	CURL_RESULT=$CURL_CODE

	# Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
	if [ $CURL_CODE == "000" ]; then
		CURL_RESULT="$CURL_RESULT-$CURL_ERR"
	fi

	# Begin to determine our status code for this URL (EI, IW, OK, RD, or NG)
	STATUS="??"
	NEW_URL=""
	INTERWIKI_INDEX=-1

	# First make sure that this isn't an "external internal" link to our own wiki that can be replaced
	# by "[[page_name]]". If it uses a special access URL beginning with "/w/", let it pass, as it
	# probably cannot be replaced by "[[ ]]" markup
	if [[ $URL == *$WIKI_PATH* ]] && [[ $URL != *$WIKI_PATH/w/* ]]; then
		STATUS="EI"
		let EI_LINKS+=1
	fi

	# If it's not, check if this is a link to a domain that we have an interwiki prefix for (also make
	# sure that it's not an archive.org link to a page from an interwiki domain)
	if [ $STATUS == "??" ] && [[ $URL != *web.archive.org* ]]; then
		for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
			if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]] && [[ $URL != *${INTERWIKI_DOMAINS[$i]}/w/* ]]; then
				STATUS="IW"
				let IW_LINKS+=1
				INTERWIKI_INDEX=$i
				break
			fi
		done
	fi

	# If we didn't match an interwiki domain, see if the status code is in our "OK" codes list
	if [ $STATUS == "??" ]; then
		for CODE in "${OK_CODES[@]}"; do
			if [[ $CODE == $CURL_CODE ]]; then
				STATUS="OK"
				let OK_LINKS+=1
				break
			fi
		done
	fi

	# If we didn't get a match with the "OK" codes, check it against the "RD" codes
	if [ $STATUS == "??" ]; then
		for CODE in "${RD_CODES[@]}"; do
			if [[ $CODE == $CURL_CODE ]]; then
				# Get URL header again in order to retrieve the URL we are being redirected to
				# (fixed: "$AGENT" double-quoted here too, for the same reason as above)
				NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time $TIMEOUT --write-out '%{redirect_url}\n' $URL)

				# Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
				# those changes out if the user didn't ask for them
				URL_HTTP=$(echo $URL | sed -E 's/^https:/http:/')
				NEW_URL_HTTP=$(echo $NEW_URL | sed -E 's/^https:/http:/')

				# Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
				NEW_URL_LENGTH=$(echo | awk -v input=$NEW_URL_HTTP '{print length(input)}')
				if [ $NEW_URL_LENGTH -lt $MIN_URL_LENGTH ]; then
					NEW_URL_HTTP="[new URL not retrieved]"
				fi

				# Remove slash at end of new URL, if present, so we can filter out the redirects that
				# merely add an ending slash if the user didn't ask for them
				NEW_URL_NO_SLASH=$(echo $NEW_URL_HTTP | sed -E 's:/$::')

				# Detect if this is a youtu.be link simply being expanded by YouTube to the full
				# youtube.com address
				YOUTU_BE=0
				if [[ $URL_HTTP == http://youtu.be* ]] && [[ $NEW_URL_HTTP == http://www.youtube.com* ]]; then
					YOUTU_BE=1
				fi

				# If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
				# wants those to be reported)
				if [ $SHOW_HTTPS -eq 0 ] && [ $URL_HTTP == $NEW_URL_HTTP ]; then
					valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show http->https upgrades, and I was redirected to '$NEW_URL'."
					STATUS="OK"
					let OK_LINKS+=1
					let SKIP_HTTPS_UP+=1
				# If the URLs match besides an added ending slash, then the link is OK (unless user wants
				# those to be reported)
				elif [ $SHOW_SLASH -eq 0 ] && [ $URL_HTTP == $NEW_URL_NO_SLASH ]; then
					valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show added trailing slashes, and I was redirected to '$NEW_URL'."
					STATUS="OK"
					let OK_LINKS+=1
					let SKIP_SLASH_ADD+=1
				elif [ $SHOW_YT_RD -eq 0 ] && [ $YOUTU_BE -eq 1 ]; then
					valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'."
					STATUS="OK"
					let OK_LINKS+=1
					let SKIP_YOUTU_BE+=1
				else
					STATUS="RD"
					let RD_LINKS+=1
				fi
				break
			fi
		done
	fi

	# If we didn't get a match with the "RD" codes, check it against the "NG" codes
	if [ $STATUS == "??" ]; then
		for CODE in "${NG_CODES[@]}"; do
			if [[ $CODE == $CURL_CODE ]]; then
				STATUS="NG"
				let NG_LINKS+=1
				break
			fi
		done
	fi

	# If we didn't match a known status code, advise the reader
	if [ $STATUS == "??" ]; then
		valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown response code $CURL_CODE."
		let SKIP_UNK_CODE+=1
		continue
	fi

	# Check problem links against exceptions list before proceeding
	FOUND_EXCEPT=0
	if [ $STATUS != "OK" ] && [ ! -z "$EXCEPT_URL" ]; then
		# The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
		EXPECT_CODE="$CURL_RESULT"
		if [ $STATUS == "EI" ]; then
			EXPECT_CODE="EI"
		elif [ $STATUS == "IW" ]; then
			EXPECT_CODE="IW"
		fi

		# Look for link in exceptions list and make sure the listed result code and wiki page also match
		for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
			EXCEPT_LINE="${EXCEPT_ARRAY[$i]}"

			# Undo any HTML-encoding from the wiki page; for now we just worry about the ampersand, as most
			# other HTML-encoded characters are not found in URLs
			EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/\&amp;/\&/g')

			# Match URL (fixed: use a scratch variable instead of clobbering the global EXCEPT_URL
			# setting, which is the exceptions-list location tested at the top of this section)
			EXCEPT_TARGET="${EXCEPT_LINE#*,}"
			EXCEPT_TARGET="${EXCEPT_TARGET%,*}"
			if [ "$EXCEPT_TARGET" != "$URL" ]; then
				continue
			fi

			# Match containing page's name
			EXCEPT_PAGE="${EXCEPT_LINE##*,}"
			EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
			if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == $LOCAL_PAGE_PATH ]; then
				# Match result code
				EXCEPT_CODE=${EXCEPT_LINE%%,*}
				if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
					valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because its expected result, '$EXPECT_CODE', is in the exceptions list."
					if [ $STATUS == "EI" ]; then
						let SKIP_EXPECT_EI+=1
					elif [ $STATUS == "IW" ]; then
						let SKIP_EXPECT_IW+=1
					elif [ $STATUS == "RD" ]; then
						let SKIP_EXPECT_RD+=1
					else
						let SKIP_EXPECT_NG+=1
					fi
					FOUND_EXCEPT=1
					break
				fi
			fi
		done
	fi
	if [ $FOUND_EXCEPT -eq 1 ]; then
		continue
	fi

	# If appropriate, record this link to the log, with clickable URLs when possible
	if [ $STATUS != "OK" ] || [ $RECORD_OK_LINKS -eq 1 ]; then
		# Prepare 'curl' result in parentheses to print after status code, unless this is an "IW" or "EI"
		# link, in which case showing the status code doesn't make sense. Adjust spacing after string to
		# ensure TXT and RTF reports have aligned columns of results.
		CURL_STR_H=" ($CURL_RESULT)"
		CURL_STR_T="$CURL_STR_H"
		CURL_STR_R="$CURL_STR_H    "
		if [ $STATUS == "IW" ] || [ $STATUS == "EI" ]; then
			CURL_STR_H=""
			CURL_STR_T="       "
			CURL_STR_R="       "
		fi

		# Record link and its wiki page in TXT, RTF, and HTML markup
		valPrint t "${STATUS}${CURL_STR_T} $STR_TYPE $URL"
		valPrint t "  linked from $FULL_PAGE_PATH"
		valPrint r "${STATUS}${CURL_STR_R}${RTF_TABS}$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
		valPrint r "   linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
		valPrint hn "<tr><td style=\"white-space:nowrap\">${STATUS}${CURL_STR_H}</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
		valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"

		# Place vertical space here since we won't be printing anything more about this link
		if [ $STATUS == "OK" ] && [ $SUGGEST_SNAPSHOTS_OK -eq 0 ]; then valPrint tr ""; valPrint hs ""; fi

		# Record redirect URL if one was given by a 3xx response page
		if [ $STATUS == "RD" ]; then
			valPrint ts "  Server suggests $NEW_URL"
			valPrint rs "  Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
			valPrint hs "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
		fi

		# Notify reader if we can use an intrawiki link for this URL
		if [ $STATUS == "EI" ]; then
			INTRA_PAGE=${URL#*://*/}
			valPrint ts "  Just use [[$INTRA_PAGE]]"
			valPrint rs "  Just use [[$INTRA_PAGE]]"
			valPrint hs "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
		fi

		# Notify reader if we can use an interwiki prefix for this URL
		if [ $STATUS == "IW" ]; then
			INTER_PAGE=$(echo "$URL" | sed 's/.*\///')
			valPrint ts "  You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
			valPrint rs "  You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
			valPrint hs "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
		fi

		# Query Internet Archive for latest "OK" snapshot for "NG" page
		if [[ ( $STATUS == "NG" && $SUGGEST_SNAPSHOTS_NG -eq 1 ) || ( $STATUS == "OK" && $SUGGEST_SNAPSHOTS_OK -eq 1 ) ]]; then

			# We need to watch out for the rate limit or we'll get locked out; look at how much time has
			# elapsed and then wait the remainder between that and how long of a wait we think is needed
			# to avoid the dreaded "Too Many Requests" response. 5 seconds is just a guess.
			CUR_TIME=$(date +%s)
			WAIT_REMAINDER=$((5 - $CUR_TIME + $START_LINK))
			if [ $WAIT_REMAINDER -gt 0 ]; then
				valPrint t "Waiting $WAIT_REMAINDER second(s) to conform to Archive.org rate limit."
				sleep $WAIT_REMAINDER
			fi

			# Issue query to the API
			ARCHIVE_QUERY=$(curl --silent --max-time $TIMEOUT "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")

			# Notify user if we hit the rate limit and just keep going
			# (fixed: the glob pattern must be unquoted to match; quoting it made this test compare
			# against the literal string "*Too Many Requests*", so the rate limit was never detected)
			if [[ "$ARCHIVE_QUERY" == *"Too Many Requests"* ]]; then
				valPrint t "  IA has rate-limited us!"
				valPrint r "  IA has rate-limited us!"
				valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td>(hit the API rate limit!)</td></tr>"
			# If a "closest" snapshot was received, inform user
			elif [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
				# In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
				ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')

				# ...isolate "url" property in the response that follows the "closest" tag
				SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
				SNAPSHOT_URL=${SNAPSHOT_URL#*\"url\": \"} # everything after '"url": "'
				SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'

				# Remove the port 80 part that IA often adds to the URL, as it's superfluous
				SNAPSHOT_URL=$(echo $SNAPSHOT_URL | sed 's/:80//')

				# Inform the user of the snapshot URL
				valPrint ts "  IA suggests $SNAPSHOT_URL"
				valPrint rs "  IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
				valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
			else # Otherwise give a generic Wayback Machine link for this URL, which might work
				valPrint ts "  Try browsing $ARCHIVE_GENERIC/$URL"
				valPrint rs "  Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
				valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
			fi
		fi
	fi

	# If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
	if [ $IS_FILE -eq 0 ] && [ $TAKE_PAGE_SHOT -eq 1 ] && [ $STATUS == "OK" ]; then
		# Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
		SHOT_NAME=$(echo "$URL" | sed 's/https*\:\/\///' | sed 'y/:\//__/')
		SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"

		# Don't take screenshot if we already encountered this page and screenshotted it
		if [ ! -f "$SHOT_FILE" ]; then
			"$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 $URL > /dev/null 2>&1
			if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
				mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
			else
				valPrint trhs "Screenshot of URL $URL seems to have failed!"
			fi
		else
			valPrint trhs "Skipping screenshot of URL '$URL' because file '$SHOT_FILE' already exists."
		fi
	fi
done
FINISHED_LIST="yes"
wrapupAndExit
# Note: See TracBrowser for help on using the repository browser.