Context Navigation

source: Validate External Links/validate_external_links.sh@ 1118

Last change on this file since 1118 was 1118, checked in by iritscen, 5 years ago
Fixed ValExtLinks' reading of Archive API replies. Fix for reading links that happen to have a shebang in them. Now knows how to handle NULL namespace links. Now prints elapsed time.
File size: 41.6 KB

Line
#!/bin/bash

# Validate External Links by Iritscen
# Provided with a list of external links found in the OniGalore wiki, this script validates them.
# The resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF
# (for reading as a local file with clickable links), and HTML (for uploading as a web page).
# Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
# Recommended rule:
# |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|

# Set separator token to newline, so that unquoted expansions are only ever split on line breaks
IFS="
"

### GLOBALS ###
# Settings -- these will be changed from their defaults by the arguments passed in to the script
LINKS_URL=""        # use 'curl' to download file with links from this location (can be file://)
EXCEPT_URL=""       # ditto above for file with exceptions to NG results
OUTPUT_DIR=""       # place reports and all other output in a folder inside this existing folder
RECORD_OK_LINKS=0   # record response code to the log even when it's a value in OK_CODES
SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
TAKE_PAGE_SHOT=0    # take a screenshot of each OK page
CHROME_PATH=""      # path to a copy of Google Chrome that has the command-line screenshot feature
URL_START=1         # start at this URL in LINKS_FILE (1 by default)
URL_LIMIT=0         # if non-zero, stop at this URL in LINKS_FILE
UPLOAD_INFO=""      # path to a file on your hard drive with the login info needed to upload a report

# Fixed strings -- see the occurrences of these variables to learn their purpose
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36 OPR/67.0.3575.53"
ARCHIVE_API="http://archive.org/wayback/available"
ARCHIVE_GENERIC="https://web.archive.org/web/*"
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
CHROME_SCREENSHOT="screenshot.png"
CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt"
EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt"
MY_WIKI_PAGE="http://wiki.oni2.net/User:Iritscen"
# Quoted so that glob characters in the script's path cannot be expanded by the shell
THIS_DIR=$(cd "$(dirname "$0")" && pwd)
WORKING_DIR=$(pwd)
WIKI_PATH="wiki.oni2.net"

# These are parallel arrays of the IDs and names of OniGalore's current namespaces
declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")

# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
declare -a HTTP_FILES=(txt zip wmv jpg png m4a bsl rar oni mp3 mov ONWC vbs TRMA mp4 doc avi log gif pdf dmg exe cpp tga 7z wav east BINA xml dll dae xaf fbx 3ds blend flv csv)
declare -a HTTP_TLDS_AND_PAGES=(com net org uk ru de it htm html php pl asp aspx shtml pgi cgi php3 x jsp phtml cfm css action stm js)

# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
# if you add a new code.
declare -a OK_CODES=(200 401 405 406 501)
declare -a RD_CODES=(301 302 303 307 308)
declare -a NG_CODES=(000 403 404 410 500 503)

# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
# transcluded text, and if the transclusion fails, then the braces show up in the URL
ILLEGAL_CHARS="{ }"

# The shortest URL possible, used for sanity-checking some URLs: http://a.co
MIN_URL_LENGTH=11

# These are parallel arrays giving the prefixes that can be used in place of normal external links to
# some wikis and other sites
declare -a INTERWIKI_PREFIXES=(commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies wikiversity wikivoyage wikt wp)
declare -a INTERWIKI_DOMAINS=(commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org wikiversity.org wikivoyage.org wiktionary.org wikipedia.org)

# Variables for keeping track of main loop progress and findings
LINK_NUM=0
EI_LINKS=0
IW_LINKS=0
OK_LINKS=0
RD_LINKS=0
NG_LINKS=0
SKIP_UNK_NS=0
SKIP_JS_PAGE=0
SKIP_BAD_URL=0
SKIP_NON_ASCII=0
SKIP_UNK_SUFFIX=0
SKIP_UNK_CODE=0
SKIP_EXPECT_NG=0
SKIP_EXPECT_EI=0
SKIP_EXPECT_IW=0
FILE_LINKS=0
PAGE_LINKS=0
SKIPPED_HEADER_ROW=0
FINISHED_LIST="no" # "no" until the main loop says otherwise; also set to "limit" when URL_LIMIT is hit
START_RUN=0        # epoch seconds at which the main loop began
END_RUN=0          # epoch seconds at which the session was wrapped up
92
93
94	### HELP ###
95	# A pseudo-man page. Here is the 80-character rule for the page text:
96	# 234567890123456789012345678901234567890123456789012345678901234567890123456789
function printHelp()
{
  # Prints the pseudo-man page to stdout. The heredoc delimiter is unquoted, so the page text must
  # stay free of '$' and backticks. Text is laid out to fit within an 80-column terminal.
  cat << EOF

NAME
       Validate External Links

SYNOPSIS
       validate_external_links.sh --help
       validate_external_links.sh --links URL --output DIR [--exceptions URL]
          [--record-ok-links] [--suggest-snapshots] [--take-screenshots FILE]
          [--start-url NUM] [--end-url NUM] [--upload FILE]

DESCRIPTION
       This script parses a list of external links found in the OniGalore wiki
       (which is dumped by the Oni2.net domain periodically in a particular
       format), validates them using the Unix tool 'curl', and produces a report
       of which links were "OK" (responded positively to an HTTP query), which
       were "RD" (responded with a 3xx redirect code), which could be "IW"
       (interwiki) links, which are "EI" (external internal) links and could be
       intrawiki links, and which were "NG" (no good; a negative response to the
       query). This report can then be automatically uploaded to the location of
       your choice. The script can also suggest Internet Archive snapshots for
       "NG" links, and take screenshots of "OK" links for visual verification by
       the reader that the page in question is the one intended to be displayed.

       You must pass this script the URL at which the list of links is found
       (--links) and the path where the directory of logs should be outputted
       (--output). All other arguments are optional.

OPTIONS
       --help                  Show this page.
       --links URL             (required) URL from which to download the CSV
                               file with external links. Note that this URL can
                               be a local file if you supply a file:// path.
       --output DIR            (required) Unix path to directory in which Val
                               should place its reports.
       --exceptions URL        In order to remove links from the report which
                               Val finds an issue with, but which you regard as
                               OK, list those desired exceptions in this file.
                               See the sample file exceptions.txt for details.
                               Note that this URL can point to a local file if
                               you supply a file:// path.
       --record-ok-links       Log a link in the report even if its response
                               code is "OK".
       --suggest-snapshots     Query the Internet Archive for a possible
                               snapshot URL for each "NG" page.
       --take-screenshots FILE Call the Google Chrome binary at this path to
                               take screenshots of each "OK" page.
       --start-url NUM         Start at this link in the links CSV file.
       --end-url NUM           Stop at this link in the links CSV file.
       --upload FILE           Upload report using the credentials and path
                               given in this local text file. See sftp_login.txt
                               for template.

BUGS
       The script cannot properly parse any line in the external links file
       which contains a comma in the name of the wiki page containing a link.
       Commas in the link itself are not an issue.
EOF
}
158
159
160	### SETUP ###
161	# If first argument is a help request, or if nothing was passed in at all, print help page and quit
if [ "$#" -eq 0 ] || [ "$1" == "--help" ]; then
  printHelp | less
  exit 0
fi

# Parse arguments as long as there are more arguments to process
while (( "$#" )); do
  # Options that take a value must actually be followed by one; otherwise 'shift 2' fails without
  # consuming anything and this loop would never terminate
  case "$1" in
    --links | --exceptions | --output | --take-screenshots | --start-url | --end-url | --upload )
      if [ "$#" -lt 2 ]; then
        echo "Error: The argument $1 requires a value. Aborting."
        exit 1
      fi;;
  esac

  case "$1" in
    --links )             LINKS_URL="$2"; shift 2;;
    --exceptions )        EXCEPT_URL="$2"; shift 2;;
    --output )            OUTPUT_DIR="$2"; shift 2;;
    --record-ok-links )   RECORD_OK_LINKS=1; shift;;
    --suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;;
    --take-screenshots )  TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
    --start-url )         URL_START="$2"; shift 2;;
    --end-url )           URL_LIMIT="$2"; shift 2;;
    --upload )            UPLOAD_INFO="$2"; shift 2;;
    * )                   echo "Invalid argument $1 detected. Aborting."; exit 1;;
  esac
done

# If the required arguments were not supplied, print an error and quit
if [ -z "$LINKS_URL" ] || [ -z "$OUTPUT_DIR" ]; then
  echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
  exit 2
fi

# If user wants screenshots, make sure path to Chrome was passed in and is valid
if [ "$TAKE_PAGE_SHOT" -eq 1 ]; then
  if [ ! -f "$CHROME_PATH" ]; then
    echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
    exit 3
  fi
fi

# Check that UPLOAD_INFO exists, if this argument was supplied
if [ -n "$UPLOAD_INFO" ] && [ ! -f "$UPLOAD_INFO" ]; then
  echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
  exit 4
fi

# Check that OUTPUT_DIR is a directory
if [ ! -d "$OUTPUT_DIR" ]; then
  echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
  exit 5
fi

# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
SHOT_PATH="$OUTPUT_PATH/Screenshots"
LOG_NAME="ValExtLinks report"
LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt"
LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf"
LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm"
mkdir "$OUTPUT_PATH"

# Check that 'mkdir' succeeded before trying to create anything inside the new folder
if [ ! -d "$OUTPUT_PATH" ]; then
  echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_DIR. Aborting."
  exit 6
fi
if [ "$TAKE_PAGE_SHOT" -eq 1 ]; then
  mkdir "$SHOT_PATH"
fi

# Get date on the file at LINKS_URL and print to log. Header names are case-insensitive (servers
# may send "last-modified"), and header lines end in CR+LF, so match loosely and drop the CR.
LINKS_DATE=$(curl --silent --head "$LINKS_URL" | grep -i -m 1 "^Last-Modified:")
if [ -z "$LINKS_DATE" ]; then
  echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
  exit 7
fi
LINKS_DATE=${LINKS_DATE#*: }
LINKS_DATE=${LINKS_DATE%$'\r'}
237
238
239	### UTILITY FUNCTIONS ###
240	# Writes a plain-text header to TXT log file
function printTXTheader()
{
  # Sends the report title, generation time, data timestamp and author contact to the TXT log
  # (valPrint target 't'), followed by one empty line.
  # Reads globals: NICE_TIME, LINKS_DATE, MY_WIKI_PAGE.
  valPrint t "Validate External Links report"
  valPrint t "generated $NICE_TIME"
  valPrint t "from data of $LINKS_DATE"
  valPrint t "script by Iritscen (contact: $MY_WIKI_PAGE)"
  valPrint t ""
}
249
250	# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified
function printRTFheader()
{
  # Sends the RTF preamble, a centered bold title block, and a switch back to left-justified text
  # to the RTF log (valPrint target 'r') as one multi-line string. The lines of the string must
  # start in column 0, since everything inside the quotes is written to the log verbatim.
  # Reads globals: NICE_TIME, LINKS_DATE, MY_WIKI_PAGE.
  valPrint r "{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww12600\viewh12100\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0

\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
generated $NICE_TIME\\
from data of $LINKS_DATE\\
script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$MY_WIKI_PAGE\"}}{\fldrslt contact}})
\\
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
\cf0 "
}
268
269	# Closes the RTF markup of the RTF log file
function printRTFfooter()
{
  # Sends the closing brace of the RTF document to the RTF log (valPrint target 'r')
  valPrint r "}"
}
274
275	# Writes the HTML header to HTML log file
function printHTMheader()
{
  # Sends the opening HTML markup, page title and report heading to the HTML log (valPrint target
  # 'h') as one multi-line string. The lines of the string must start in column 0, since everything
  # inside the quotes is written to the log verbatim.
  # Reads globals: NICE_TIME, LINKS_DATE, MY_WIKI_PAGE.
  valPrint h "<html>
<head>
<title>Validate External Links report</title>
</head>
<body>
<h2>Validate External Links report</h2>
<h3>generated $NICE_TIME<br />
from data of $LINKS_DATE<br />
script by Iritscen (<a href=\"$MY_WIKI_PAGE\" target=\"_blank\">contact</a>)</h3>"
}
288
289	# Closes the HTML markup of the HTML log file
function printHTMfooter()
{
  # Sends the closing </body> and </html> tags to the HTML log (valPrint target 'h'). The second
  # line of the string must start in column 0, since it is written to the log verbatim.
  valPrint h "</body>
</html>"
}
295
296	# The central logging function. The first parameter is a string composed of one or more characters that
297	# indicate which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
298	# 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 'w' means "Don't
299	# pass console output through 'fmt'" ("fmt" fits the output to an 80-column CLI but can break special
300	# formatting and the 'n' option).
function valPrint()
{
  # The central logging function.
  #   $1 - string of one or more flag characters selecting outputs and options:
  #        'c' console, 't' TXT log, 'r' RTF log, 'h' HTML log,
  #        'n' no line terminator (console/TXT: no newline; RTF/HTML: no "\" or "<br />" suffix),
  #        'w' don't pass console output through 'fmt'
  #   $2 - the text to print
  # Reads globals: LOG_TXT, LOG_RTF, LOG_HTM.
  # The flags are tested with glob patterns (*c* etc.) because callers combine them, as in "ctrh";
  # an exact comparison would only ever match a single-letter flag string.
  local targets=$1
  local message=$2

  if [[ "$targets" == *c* ]]; then
    if [[ "$targets" == *n* ]]; then
      printf '%s' "$message"
    elif [[ "$targets" == *w* ]]; then
      printf '%s\n' "$message"
    else
      printf '%s\n' "$message" | fmt -w 80
    fi
  fi

  if [[ "$targets" == *t* ]]; then
    if [[ "$targets" == *n* ]]; then
      printf '%s' "$message" >> "$LOG_TXT"
    else
      printf '%s\n' "$message" >> "$LOG_TXT"
    fi
  fi

  if [[ "$targets" == *r* ]]; then
    if [[ "$targets" == *n* ]]; then
      printf '%s\n' "$message" >> "$LOG_RTF"
    else
      # A trailing backslash is RTF's line break
      printf '%s\\\n' "$message" >> "$LOG_RTF"
    fi
  fi

  if [[ "$targets" == *h* ]]; then
    if [[ "$targets" == *n* ]]; then
      printf '%s\n' "$message" >> "$LOG_HTM"
    else
      printf '%s<br />\n' "$message" >> "$LOG_HTM"
    fi
  fi
}
334
335	# Pluralize the string in parameter 1 if the number in parameter 2 is not 1
function pluralCheckNoun()
{
  # Echoes the noun in $1, pluralized unless the count in $2 is exactly 1. Nouns ending in 'x'
  # take "es" (suffix -> suffixes); all others take "s".
  if [ "$2" -ne 1 ]; then
    if [[ $1 =~ x$ ]]; then
      echo "${1}es"
    else
      echo "${1}s"
    fi
  else
    echo "$1"
  fi
}
348
349	# Output "is" if parameter 1 is 1, otherwise "are"
function pluralCheckIs()
{
  # Echoes "is" when the count in $1 is exactly 1, otherwise "are"
  if [ "$1" -ne 1 ]; then
    echo "are"
  else
    echo "is"
  fi
}
358
359	# Output "was" if parameter 1 is 1, otherwise "were"
function pluralCheckWas()
{
  # Echoes "was" when the count in $1 is exactly 1, otherwise "were"
  if [ "$1" -ne 1 ]; then
    echo "were"
  else
    echo "was"
  fi
}
368
369	# Output "a " if parameter 1 is 1, otherwise nothing
function pluralCheckA()
{
  # Echoes "a " (with its trailing space) when the count in $1 is exactly 1; otherwise prints
  # nothing, so the caller can splice the result directly in front of a noun
  if [ "$1" -eq 1 ]; then
    echo "a "
  fi
}
376
377	# Output "an " if parameter 1 is 1, otherwise nothing
function pluralCheckAn()
{
  # Echoes "an " (with its trailing space) when the count in $1 is exactly 1; otherwise prints
  # nothing, so the caller can splice the result directly in front of a noun
  if [ "$1" -eq 1 ]; then
    echo "an "
  fi
}
384
385	# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
386	# reports being saved to disk have already been closed.
function uploadReport()
{
  # Uploads the HTML report by handing the 'expect' script in THIS_DIR the login details read from
  # the file named by UPLOAD_INFO. Each detail is the line of that file containing its marker
  # ("user:", "pw:", "port:", "path:"), with the marker stripped from the front.
  # Reads globals: THIS_DIR, EXPECT_SCRIPT_NAME, UPLOAD_INFO, LOG_HTM, LOG_NAME.
  # ONLY USE "valPrint c" here, as the reports being saved to disk have already been closed.
  valPrint c "Uploading HTML report..."

  local script_path="$THIS_DIR/$EXPECT_SCRIPT_NAME"
  local user_marker="user:"
  local password_marker="pw:"
  local port_marker="port:"
  local path_marker="path:"
  local sftp_user sftp_password sftp_port sftp_path

  sftp_user=$(grep -- "$user_marker" "$UPLOAD_INFO")
  sftp_user=${sftp_user#"$user_marker"}
  sftp_password=$(grep -- "$password_marker" "$UPLOAD_INFO")
  sftp_password=${sftp_password#"$password_marker"}
  sftp_port=$(grep -- "$port_marker" "$UPLOAD_INFO")
  sftp_port=${sftp_port#"$port_marker"}
  sftp_path=$(grep -- "$path_marker" "$UPLOAD_INFO")
  sftp_path=${sftp_path#"$path_marker"}

  # Quoted so that each value stays in its own argument position even if it is empty
  expect "$script_path" "$LOG_HTM" "$sftp_user" "$sftp_password" "$sftp_port" "$sftp_path" "$LOG_NAME.htm"

  valPrint c "Report was uploaded, unless an error message appears above."
}
409
410	# Prints session summary when script is done
function wrapupAndExit()
{
  # Prints the session summary to the console and all logs, closes the RTF and HTML markup, uploads
  # the report if requested (and the run was not canceled), then exits the script with status 0.
  # Also installed as the SIGINT handler below, so it may run at any point in the session.
  # Reads the progress counters and settings globals; writes LINK_NUM, END_RUN, ELAPSED,
  # LINKS_PROCESSED, LINKS_SKIPPED and LINKS_CHECKED.

  # Get off progress line on console, drop down a line from last link in log, and close HTML table
  valPrint ctr ""
  valPrint h "</table><br />"

  # If we didn't finish processing the last URL, then the iterator is one too high
  if [ "$FINISHED_LIST" != "yes" ]; then
    LINK_NUM=$((LINK_NUM - 1))
    if [ "$FINISHED_LIST" == "no" ]; then
      valPrint ctrh "The session was canceled by the user."
    fi
  fi

  # Generate string with elapsed time. START_RUN is still 0 if we are bailing out before the main
  # loop began, in which case no time has elapsed (rather than the whole span since the epoch).
  END_RUN=$(date +%s)
  local elapsed_sec=0
  if [ "$START_RUN" -gt 0 ]; then
    elapsed_sec=$((END_RUN - START_RUN))
  fi
  ELAPSED="$((elapsed_sec / 60)) min. $((elapsed_sec % 60)) sec. elapsed"

  # LINK_COUNT is not set yet if the links file could not be downloaded
  local link_count=${LINK_COUNT:-0}

  # Output results of session and close the log file's markup
  LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
  LINKS_SKIPPED=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
  LINKS_CHECKED=$((LINKS_PROCESSED-LINKS_SKIPPED))
  valPrint ct "Summary ($ELAPSED):"
  valPrint r "\b1 Summary \b0 ($ELAPSED)"
  valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
  valPrint ctrh "I finished processing $LINKS_PROCESSED of $link_count $(pluralCheckNoun link "$link_count")."
  valPrint ctrh "I skipped $LINKS_SKIPPED $(pluralCheckNoun link "$LINKS_SKIPPED"), and found $FILE_LINKS $(pluralCheckNoun file "$FILE_LINKS") and $PAGE_LINKS $(pluralCheckNoun page "$PAGE_LINKS")."
  if [ "$LINKS_SKIPPED" -gt 0 ]; then valPrint ctrh "Skip breakdown: "; fi
  if [ "$SKIP_UNK_NS" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS unknown $(pluralCheckNoun namespace "$SKIP_UNK_NS")"; fi
  if [ "$SKIP_JS_PAGE" -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link "$SKIP_JS_PAGE") on $(pluralCheckA "$SKIP_JS_PAGE")JavaScript $(pluralCheckNoun page "$SKIP_JS_PAGE")"; fi
  if [ "$SKIP_BAD_URL" -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL "$SKIP_BAD_URL")"; fi
  if [ "$SKIP_NON_ASCII" -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL "$SKIP_NON_ASCII")"; fi
  if [ "$SKIP_UNK_SUFFIX" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix "$SKIP_UNK_SUFFIX")"; fi
  if [ "$SKIP_UNK_CODE" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code "$SKIP_UNK_CODE")"; fi
  valPrint ctrh "Out of the $LINKS_CHECKED links checked, $EI_LINKS could be $(pluralCheckAn "$EI_LINKS")intrawiki $(pluralCheckNoun link "$EI_LINKS"), $IW_LINKS could be $(pluralCheckAn "$IW_LINKS")interwiki $(pluralCheckNoun link "$IW_LINKS"), $OK_LINKS $(pluralCheckWas "$OK_LINKS") OK, $RD_LINKS $(pluralCheckWas "$RD_LINKS") $(pluralCheckA "$RD_LINKS")redirection $(pluralCheckNoun notice "$RD_LINKS"), and $NG_LINKS $(pluralCheckWas "$NG_LINKS") NG."
  if [ "$SKIP_EXPECT_NG" -gt 0 ]; then
    valPrint ctrh "$SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link "$NG_LINKS") went unlisted due to being found in the exceptions file."
  fi
  if [ "$SKIP_EXPECT_EI" -gt 0 ]; then
    valPrint ctrh "$SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link "$EI_LINKS") went unlisted due to being found in the exceptions file."
  fi
  if [ "$SKIP_EXPECT_IW" -gt 0 ]; then
    # IW links are interwiki links (intrawiki ones are the EI links reported just above)
    valPrint ctrh "$SKIP_EXPECT_IW/$IW_LINKS potential interwiki $(pluralCheckNoun link "$IW_LINKS") went unlisted due to being found in the exceptions file."
  fi
  valPrint trh "ValExtLinks says goodbye."
  printRTFfooter
  printHTMfooter

  # Upload report if this was requested, but not when the session was canceled
  if [ -n "$UPLOAD_INFO" ] && [ "$FINISHED_LIST" != "no" ]; then
    uploadReport
  fi

  # Really quit now
  valPrint c "ValExtLinks says goodbye."
  exit 0
}
trap wrapupAndExit INT
469
470
471	### INITIALIZATION ###
472	# Print opening message to console and log files
valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
printTXTheader
printRTFheader
printHTMheader

# Attempt to download file at LINKS_URL, then check that it succeeded. The local file takes the
# name found after the last slash of the URL.
valPrint cwtrh "Downloading list of external links from $LINKS_URL."
LINKS_FILE_NAME=${LINKS_URL##*/}
LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
curl --silent -o "$LINKS_FILE" "$LINKS_URL"
if [ ! -f "$LINKS_FILE" ]; then
  echo "The download of $LINKS_URL appears to have failed. Aborting."
  wrapupAndExit
fi

# Attempt to download file at EXCEPT_URL, then check that it succeeded
if [ -n "$EXCEPT_URL" ]; then
  valPrint cwtrh "Downloading list of reporting exceptions from $EXCEPT_URL."
  EXCEPT_FILE_NAME=${EXCEPT_URL##*/}
  EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
  curl --silent -o "$EXCEPT_FILE" "$EXCEPT_URL"
  if [ ! -f "$EXCEPT_FILE" ]; then
    echo "The download of $EXCEPT_URL appears to have failed. Aborting."
    wrapupAndExit
  fi
fi

# Feed LINKS_FILE to 'wc' on stdin because passing it as an argument would print the file name
# after the line count
LINK_COUNT_STRING=$(wc -l < "$LINKS_FILE")

# Number of URLs is number of lines minus one (first line is column header row for the CSV)
LINK_COUNT=$(echo "${LINK_COUNT_STRING}" | tr -d '[:space:]')
LINK_COUNT=$((LINK_COUNT - 1))

# Calculate number of URLs to consider
if [ "$URL_LIMIT" -ne 0 ] && [ "$URL_LIMIT" -lt "$LINK_COUNT" ]; then
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((URL_LIMIT-URL_START+1)) of them, from link $URL_START to $URL_LIMIT."
elif [ "$URL_START" -ne 1 ]; then
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((LINK_COUNT-URL_START+1)) of them, from link $URL_START to $LINK_COUNT."
else
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering all of them."
fi

# Print settings to console and log. The message is stored one word (or quoted phrase) per array
# element so that the elements at indices 10, 22, 26 and 40 can be swapped out to match the settings.
declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not report links that are listed in the exceptions file.")
if [ "$TAKE_PAGE_SHOT" -eq 0 ]; then SETTINGS_MSG[10]="but will not"; fi
if [ "$RECORD_OK_LINKS" -eq 0 ]; then SETTINGS_MSG[22]="not"; fi
if [ "$SUGGEST_SNAPSHOTS" -eq 0 ]; then SETTINGS_MSG[26]="will not"; fi
if [ -z "$EXCEPT_URL" ] || [ "$RECORD_OK_LINKS" -eq 1 ]; then SETTINGS_MSG[40]=""; fi
SETTINGS_STR=${SETTINGS_MSG[@]}
valPrint ctrh "$SETTINGS_STR"
valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
valPrint trh ""

# Print legend to logs
valPrint t "Legend:"
valPrint r "\b1 Legend \b0"
valPrint hn "<h3>Legend</h3>"
valPrint trh "OK = URL seems to be working."
valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it, because false negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to Iritscen. An NG link should be followed by a link to the Internet Archive's Wayback Machine which may help you repair the link. If the link cannot be repaired, you can disable it on the wiki (which prevents it from showing up in future ValExtLinks reports) by wrapping it in nowiki tags."
valPrint trh "RD = The server responding to this URL is saying that the page moved and you should instead use the supplied new URL. Some RD links represent minor adjustments in the organization of a web site, and some are soft 404s (the file/page has been removed and you are being redirected to something like the main page of the web site). You will have to look at the new URL yourself to determine if it represents an OK link and the link on the wiki should be updated to this one, or if the desired file/page is actually gone and we need to replace the wiki link with an Internet Archive snapshot link -- or disable the URL if it has not been archived."
valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup."
valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup."
valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $HTTP_CODES)."
valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$HTTP_CODES\"}}{\fldrslt here}} for code reference)."
valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$HTTP_CODES\" target=\"_blank\">here</a> for code reference)."
valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $CURL_CODES)."
valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$CURL_CODES\"}}{\fldrslt here}} for code reference)."
valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$CURL_CODES\" target=\"_blank\">here</a> for code reference)."
valPrint trh "IA suggests = Last available snapshot suggested by the Internet Archive."
valPrint trh "Try browsing = The Archive occasionally fails to return a snapshot URL even when one exists, so you will need to check for a snapshot manually using this link to the Wayback Machine before concluding that a site has not been archived."
valPrint trh ""
546
547
548	### MAIN LOOP ###
549	START_RUN=$(date +%s)
550	# Process each line of the .csv in LINKS_FILE
551	for LINE in `cat "$LINKS_FILE"`; do
552	let LINK_NUM+=1
553
554	# First line is the column header row for the CSV, so let's verify that the format hasn't changed
555	if [ $SKIPPED_HEADER_ROW -eq 0 ]; then
556	if [ $LINE == "namespace,title,target" ]; then
557	SKIPPED_HEADER_ROW=1
558	LINK_NUM=0 # this line is not a link, so reset the link counter
559	valPrint hn "<table>"
560	continue
561	else
562	valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
563	wrapupAndExit
564	fi
565	fi
566
567	# Skip this link if we are not at URL_START yet
568	if [ $LINK_NUM -lt $URL_START ]; then
569	continue
570	fi
571
572	# Stop if we are at the limit declared for testing purposes
573	if [ $URL_LIMIT -gt 0 ] && [ $LINK_NUM -gt $URL_LIMIT ]; then
574	FINISHED_LIST="limit"
575	wrapupAndExit
576	fi
577
578	# Print progress to screen
579	if [ $LINK_NUM -gt 1 ]; then
580	printf "\e[1A\n" # erase previous progress message so that new one appears in its place
581	fi
582	valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
583
584	# The number of the namespace is the element before the first comma on the line
585	NS_ID=${LINE%%,*}
586
587	# Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
588	NS_NAME=""
589	a=0
590	while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
591	if [ $NS_ID == "NULL" ]; then
592	break
593	elif [ $NS_ID -eq ${NS_IDS[$a]} ]; then
594	NS_NAME="${NS_NAMES[$a]}"
595	break
596	fi
597	let a+=1
598	done
599	if [ "$NS_NAME" == "" ]; then
600	if [ $NS_ID == "NULL" ]; then
601	valPrint tr "Skipping URL on line $LINK_NUM because the namespace (and probably the page too) is \"NULL\". Probably the link is no longer in existence on the wiki."
602	else
603	valPrint tr "Skipping URL on line $LINK_NUM found on page $PAGE_NAME because I could not find a name for namespace ID $NS_ID."
604	fi
605	let SKIP_UNK_NS+=1
606	continue
607	fi
608
609	# The name of the page is everything between the namespace ID and the next comma on the line (commas
610	# in page names will break this)
611	PAGE_NAME=${LINE#$NS_ID,}
612	PAGE_NAME=${PAGE_NAME%%,*}
613
614	# We don't want to consider wiki pages ending in .js, as the parser cannot reliably isolate URLS in
615	# JavaScript code, so it will return erroneous links
616	PAGE_NAME_SUFFIX=$(echo $PAGE_NAME \| sed 's/.*\.//')
617	if [ $PAGE_NAME_SUFFIX == "js" ]; then
618	valPrint tr "Skipping URL on line $LINK_NUM because it was found on JavaScript page $PAGE_NAME."
619	let SKIP_JS_PAGE+=1
620	continue
621	fi
622
623	# Build longer wiki page URLs from namespace and page names
624	FULL_PAGE_PATH=http://$WIKI_PATH/$NS_NAME:$PAGE_NAME
625	LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
626	# Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it
627	# explicitly breaks the link
628	if [ $NS_ID -eq 0 ]; then
629	FULL_PAGE_PATH=http://$WIKI_PATH/$PAGE_NAME
630	LOCAL_PAGE_PATH=$PAGE_NAME
631	fi
632
633	# The URL being linked to is everything after the previous two fields (this allows commas to be in
634	# the URLs, but a comma in the previous field, the page name, will break this)
635	URL=${LINE#$NS_ID,$PAGE_NAME,}
636
637	# Scan for illegal characters
638	if [[ $URL == [$ILLEGAL_CHARS] ]]; then
639	valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because it contains characters illegal in a URL."
640	let SKIP_BAD_URL+=1
641	continue
642	fi
643
644	# Now we need to know if the URL is for a file or a web page. First step is to determine if the
645	# URL ends in a suffix
646	HAS_SUFFIX=0
647
648	# If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
649	CLEAN_URL=${URL%%\?*}
650
651	# If the URL ends in something like "#section_15", strip everything from the '#' onward
652	CLEAN_URL=${CLEAN_URL%%\#*}
653
654	# 'sed' cannot handle Unicode in my Bash shell, so skip this URL and make user check it
655	if [[ $CLEAN_URL == [![:ascii:]] ]]; then
656	valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I cannot handle non-ASCII characters."
657	let SKIP_NON_ASCII+=1
658	continue
659	fi
660
661	# Isolate the characters after the last period and after the last slash
662	POST_DOT=$(echo "$CLEAN_URL" \| sed 's/.*\.//')
663	POST_SLASH=$(echo "$CLEAN_URL" \| sed 's/.*\///')
664
665	# If the last period comes after the last slash, then the URL ends in a suffix
666	POST_DOT_LENGTH=$(echo \| awk -v input=$POST_DOT '{print length(input)}')
667	POST_SLASH_LENGTH=$(echo \| awk -v input=$POST_SLASH '{print length(input)}')
668	if [ $POST_DOT_LENGTH -lt $POST_SLASH_LENGTH ]; then
669	HAS_SUFFIX=1
670	else
671	HAS_SUFFIX=0
672	fi
673
# Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
# known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES. IS_FILE ends up as 1 (file),
# 0 (page) or -1 (suffix not recognized).
IS_FILE=-1
if [[ $HAS_SUFFIX -eq 0 ]]; then
  IS_FILE=0
else
  # Turn off case sensitivity while we compare suffixes
  shopt -s nocasematch

  # Special case: URLs ending in something like "/productID.304297400" are pages, not files, so if
  # the URL's suffix is all numbers, we are looking at the end of a web page URL
  if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
    IS_FILE=0
  fi

  # If we did not identify this URL as a web page above, we need to compare the suffix against known
  # file extensions. The suffix is quoted on the right-hand side so that it is compared as a literal
  # string; unquoted, characters such as '*', '?' or '[' in the URL would act as a glob pattern.
  if [[ $IS_FILE -eq -1 ]]; then
    for EXTENSION in "${HTTP_FILES[@]}"; do
      if [[ $EXTENSION == "$POST_DOT" ]]; then # double brackets respect the nocasematch setting
        IS_FILE=1
        break
      fi
    done
  fi

  # If we did not identify this URL as a file above, we need to compare the suffix against known
  # pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
  # needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
  if [[ $IS_FILE -eq -1 ]]; then
    for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
      if [[ $PAGE_OR_TLD == "$POST_DOT" ]]; then
        IS_FILE=0
        break
      fi
    done
  fi

  # Turn case sensitivity back on in Bash
  shopt -u nocasematch
fi
715
# Label the link as a file or a page and count it; a suffix that was recognized as neither is
# reported to the user and the URL is skipped
STR_TYPE=""
case $IS_FILE in
  -1)
    valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown URL ending $POST_DOT. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
    SKIP_UNK_SUFFIX=$((SKIP_UNK_SUFFIX + 1))
    continue
    ;;
  1)
    STR_TYPE="file"
    FILE_LINKS=$((FILE_LINKS + 1))
    ;;
  0)
    STR_TYPE="page"
    PAGE_LINKS=$((PAGE_LINKS + 1))
    ;;
esac
729
# Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
# issue with sites that require HTTPS. AGENT is passed in double quotes so that its value is sent
# as the user agent (inside single quotes the server received the literal text "$AGENT"), and the
# URL is quoted so that characters like '?', '*' and '[' are never treated as a filename pattern.
CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{http_code}\n' "$URL")
CURL_ERR=$? # exit status of 'curl', read before any other command can overwrite it
CURL_RESULT=$CURL_CODE

# Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
if [[ "$CURL_CODE" == "000" ]]; then
  CURL_RESULT="$CURL_RESULT-$CURL_ERR"
fi
740
# Begin to determine our status code for this URL (EI, IW, OK, RD, or NG)
STATUS="??"
NEW_URL=""
INTERWIKI_INDEX=-1

# First make sure that this isn't an "external internal" link to our own wiki that can be replaced
# by "[[page_name]]". If it uses a special access URL beginning with "/w/", let it pass, as it
# probably cannot be replaced by "[[ ]]" markup. The '*' wildcards make these substring tests;
# without them the URL would have to equal the wiki path exactly and the "/w/" exclusion could
# never take effect.
if [[ "$URL" == *"$WIKI_PATH"* && "$URL" != *"$WIKI_PATH/w/"* ]]; then
  STATUS="EI"
  let EI_LINKS+=1
fi

# If it's not, check if this is a link to a domain that we have an interwiki prefix for.
# STATUS is tested inside [[ ]] with quotes because its placeholder value "??" is a valid glob
# pattern; unquoted inside [ ] it would expand to any two-character file names in the current
# directory.
if [[ "$STATUS" == "??" ]]; then
  for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
    if [[ "$URL" == *"${INTERWIKI_DOMAINS[$i]}"* && "$URL" != *"${INTERWIKI_DOMAINS[$i]}/w/"* ]]; then
      STATUS="IW"
      let IW_LINKS+=1
      INTERWIKI_INDEX=$i
      break
    fi
  done
fi
765
# If we didn't match an interwiki domain, see if the status code is in our "OK" codes list.
# STATUS is quoted inside [[ ]] because the placeholder "??" is itself a glob pattern and would
# otherwise be replaced by any two-character file names found in the current directory.
if [[ "$STATUS" == "??" ]]; then
  for CODE in "${OK_CODES[@]}"; do
    if [[ "$CODE" == "$CURL_CODE" ]]; then
      STATUS="OK"
      let OK_LINKS+=1
      break
    fi
  done
fi
776
# If we didn't get a match with the "OK" codes, check it against the "RD" codes.
# STATUS is quoted inside [[ ]] because the placeholder "??" is itself a glob pattern.
if [[ "$STATUS" == "??" ]]; then
  for CODE in "${RD_CODES[@]}"; do
    if [[ "$CODE" == "$CURL_CODE" ]]; then
      # Get URL header again in order to retrieve the URL we are being redirected to. AGENT is in
      # double quotes so its value (not the literal text "$AGENT") is sent as the user agent.
      NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{redirect_url}\n' "$URL")

      # Filter out cases where the redirect URL is just the original URL with https:// instead of
      # http://, or with an added '/' at the end. These corrections happen a lot and are not
      # important to us.
      # NOTE(review): only "http://" is stripped from the original and only "https://" from the
      # redirect, so an https:// link that merely gains a trailing slash is still reported as RD;
      # confirm whether that is intended.
      URL_NO_PROTOCOL=${URL#http://}
      URL_NO_PROTOCOL=${URL_NO_PROTOCOL%/}
      NEW_URL_NO_PROTOCOL=${NEW_URL#https://}
      NEW_URL_NO_PROTOCOL=${NEW_URL_NO_PROTOCOL%/}

      # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
      NEW_URL_LENGTH=${#NEW_URL_NO_PROTOCOL}
      if [[ $NEW_URL_LENGTH -lt $MIN_URL_LENGTH ]]; then
        NEW_URL_NO_PROTOCOL="[new URL not retrieved]"
      fi

      # If the URLs match after the above filters were applied, then the link is OK. Both sides are
      # quoted: the "[new URL not retrieved]" placeholder is a bracket glob when left unquoted, and
      # an empty operand would otherwise make the test itself fail.
      if [[ "$URL_NO_PROTOCOL" == "$NEW_URL_NO_PROTOCOL" ]]; then
        STATUS="OK"
        let OK_LINKS+=1
      else
        STATUS="RD"
        let RD_LINKS+=1
      fi
      break
    fi
  done
fi
810
# If we didn't get a match with the "RD" codes, check it against the "NG" codes.
# STATUS is quoted inside [[ ]] because the placeholder "??" is itself a glob pattern and would
# otherwise be replaced by any two-character file names found in the current directory.
if [[ "$STATUS" == "??" ]]; then
  for CODE in "${NG_CODES[@]}"; do
    if [[ "$CODE" == "$CURL_CODE" ]]; then
      STATUS="NG"
      let NG_LINKS+=1
      break
    fi
  done
fi
821
# If we didn't match a known status code, advise the reader. STATUS is quoted inside [[ ]]
# because the placeholder "??" is itself a glob pattern; if it were expanded to a file name here,
# an unclassified link would slip past this check and be logged with a bogus status.
if [[ "$STATUS" == "??" ]]; then
  valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown return code $CURL_CODE."
  let SKIP_UNK_CODE+=1
  continue
fi
828
# Check problem links against exceptions file before proceeding
if [[ "$STATUS" != "OK" && -n "$EXCEPT_URL" ]]; then
  # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
  EXPECT_CODE="$CURL_RESULT"
  if [[ "$STATUS" == "EI" || "$STATUS" == "IW" ]]; then
    EXPECT_CODE="$STATUS"
  fi

  # Look for link in exceptions file and make sure its listed result code and wiki page also match.
  # -F makes 'grep' treat the URL as a fixed string instead of a regular expression (URLs are full
  # of '.', '[' and similar characters), and '--' keeps it from ever being read as an option.
  # The listed page is whatever follows the last comma on the matching line, and the listed code
  # is whatever precedes the first comma.
  GREP_RESULT=$(grep --max-count=1 -F -- "$URL" "$EXCEPT_FILE")
  EXCEPT_PAGE=${GREP_RESULT##*,}
  if [[ -n "$GREP_RESULT" ]] && [[ "$EXCEPT_PAGE" == "*" || "$EXCEPT_PAGE" == "$LOCAL_PAGE_PATH" ]]; then
    EXCEPT_CODE=${GREP_RESULT%%,*}
    if [[ "$EXCEPT_CODE" == "$EXPECT_CODE" ]]; then
      valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because its expected result, $EXPECT_CODE, is listed in the exceptions file."
      if [[ "$STATUS" == "EI" ]]; then
        let SKIP_EXPECT_EI+=1
      elif [[ "$STATUS" == "IW" ]]; then
        let SKIP_EXPECT_IW+=1
      else
        let SKIP_EXPECT_NG+=1
      fi
      continue
    fi
  fi
fi
857
# If appropriate, record this link to the log, with clickable URLs when possible
if [[ "$STATUS" != "OK" || $RECORD_OK_LINKS -eq 1 ]]; then
  # The strings "IW" and "EI" are narrower than "OK", "RD", or "NG", so it takes an extra tab to
  # get to the desired level of indentation in the RTF log.
  # NOTE(review): both assignments had collapsed to the same single blank, which made the IW/EI
  # branch a no-op; one tab vs. two tabs is assumed here -- confirm against the RTF output.
  RTF_TABS=$'\t'
  if [[ "$STATUS" == "IW" || "$STATUS" == "EI" ]]; then
    RTF_TABS=$'\t\t'
  fi

  # Record link and its wiki page in TXT, RTF, and HTML markup
  valPrint t "$STATUS ($CURL_RESULT) $STR_TYPE $URL"
  valPrint t " linked from $FULL_PAGE_PATH"
  valPrint r "$STATUS ($CURL_RESULT)$RTF_TABS$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
  valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
  valPrint hn "<tr><td style=\"white-space:nowrap\">$STATUS ($CURL_RESULT)</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
  valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"

  # Record redirect URL if one was given by a 3xx response page
  if [[ "$STATUS" == "RD" ]]; then
    valPrint t " Server suggests $NEW_URL"
    valPrint r " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
    valPrint hn "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
  fi

  # Notify reader if we can use an intrawiki link for this URL
  if [[ "$STATUS" == "EI" ]]; then
    # Strip the shortest leading "scheme://host/" so that only the page name remains; without the
    # '*' wildcards the pattern would only match a URL that literally starts with ":///"
    INTRA_PAGE=${URL#*://*/}
    valPrint t " Just use [[$INTRA_PAGE]]"
    valPrint r " Just use [[$INTRA_PAGE]]"
    valPrint hn "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
  fi

  # Notify reader if we can use an interwiki prefix for this URL
  if [[ "$STATUS" == "IW" ]]; then
    # Everything after the last slash is taken as the page name on the other wiki
    INTER_PAGE=${URL##*/}
    valPrint t " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
    valPrint r " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
    valPrint hn "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
  fi

  # Query Internet Archive for latest "OK" snapshot for "NG" page
  if [[ "$STATUS" == "NG" && $SUGGEST_SNAPSHOTS -eq 1 ]]; then
    ARCHIVE_QUERY=$(curl --silent --max-time 10 "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")

    # If a "closest" snapshot was received... (the reply is a longer string that merely contains
    # the "closest" key, hence the '*' on both sides of the pattern)
    if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
      # In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
      ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')

      # ...isolate "url" property in the response that follows the "closest" tag
      SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
      SNAPSHOT_URL=${SNAPSHOT_URL##*\"url\": \"} # everything after '"url": "'
      SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'

      # Inform the user of the snapshot URL
      valPrint t " IA suggests $SNAPSHOT_URL"
      valPrint r " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
      valPrint hn "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
    else # ...otherwise give generic Wayback Machine link for this URL
      valPrint t " Try browsing $ARCHIVE_GENERIC/$URL"
      valPrint r " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
      valPrint hn "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
    fi
  fi
fi
923
# If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
if [[ $IS_FILE -eq 0 && $TAKE_PAGE_SHOT -eq 1 && "$STATUS" == "OK" ]]; then
  # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
  SHOT_NAME=$(echo "$URL" | sed 's/https*\:\/\///' | sed 'y/:\//__/')
  SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"

  # Don't take screenshot if we already encountered this page and screenshotted it
  if [[ ! -f "$SHOT_FILE" ]]; then
    # The URL is quoted so that '?', '*' or '[' in it can never be expanded as a filename pattern
    # before Chrome receives it
    "$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 "$URL" > /dev/null 2>&1

    # The screenshot is looked for under $WORKING_DIR/$CHROME_SCREENSHOT and, if present, moved
    # to its final name; -n keeps 'mv' from overwriting an existing file
    if [[ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]]; then
      mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
    else
      valPrint trh "Screenshot of URL $URL seems to have failed!"
    fi
  else
    valPrint trh "Skipping screenshot of URL $URL because $SHOT_FILE already exists."
  fi
fi
942	done
943	FINISHED_LIST="yes"
944	wrapupAndExit

Note: See TracBrowser for help on using the repository browser.

Download in other formats: