Context Navigation

source: Validate External Links/validate_external_links.sh@ 1119

Last change on this file since 1119 was 1119, checked in by iritscen, 5 years ago
Properly fixed Val's parsing of Archive API responses this time. Added a little space between each link result, making report much easier to read.
File size: 42.0 KB

Line
#!/bin/bash

# Validate External Links by Iritscen
# Provided with a list of external links found in the OniGalore wiki, this script validates them.
# The resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF
# (for reading as a local file with clickable links), and HTML (for uploading as a web page).
# Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
# Recommended rule:
# |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|

# Set separator token to newline. With IFS limited to a newline, unquoted expansions are split on
# line breaks only, not on spaces or tabs; pathname expansion (globbing) still applies to them.
IFS="
"

### GLOBALS ###
# Settings -- these will be changed from their defaults by the arguments passed in to the script
LINKS_URL=""         # use 'curl' to download file with links from this location (can be file://)
EXCEPT_URL=""        # ditto above for file with exceptions to NG results
OUTPUT_DIR=""        # place reports and all other output in a folder inside this existing folder
RECORD_OK_LINKS=0    # record response code to the log even when it's a value in OK_CODES
SUGGEST_SNAPSHOTS=0  # query the Internet Archive for a possible snapshot URL for each NG page
TAKE_PAGE_SHOT=0     # take a screenshot of each OK page
CHROME_PATH=""       # path to a copy of Google Chrome that has the command-line screenshot feature
URL_START=1          # start at this URL in LINKS_FILE (1 by default)
URL_LIMIT=0          # if non-zero, stop at this URL in LINKS_FILE
UPLOAD_INFO=""       # path to a file on your hard drive with the login info needed to upload a report

# Fixed strings -- see the occurrences of these variables to learn their purpose
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36 OPR/67.0.3575.53"
ARCHIVE_API="http://archive.org/wayback/available"
ARCHIVE_GENERIC="https://web.archive.org/web/*"
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
CHROME_SCREENSHOT="screenshot.png"
CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt"
EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt"
MY_WIKI_PAGE="http://wiki.oni2.net/User:Iritscen"
# Directory holding this script; quoted so that the path is never subject to globbing, and joined
# with && so that a failed 'cd' does not silently report the current directory instead
THIS_DIR=$(cd "$(dirname "$0")" && pwd)
WORKING_DIR=$(pwd)
WIKI_PATH="wiki.oni2.net"

# These are parallel arrays of the IDs and names of OniGalore's current namespaces
declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")

# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
declare -a HTTP_FILES=(txt zip wmv jpg png m4a bsl rar oni mp3 mov ONWC vbs TRMA mp4 doc avi log gif pdf dmg exe cpp tga 7z wav east BINA xml dll dae xaf fbx 3ds blend flv csv)
declare -a HTTP_TLDS_AND_PAGES=(com net org uk ru de it htm html php pl asp aspx shtml pgi cgi php3 x jsp phtml cfm css action stm js)

# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
# if you add a new code.
declare -a OK_CODES=(200 401 405 406 501)
declare -a RD_CODES=(301 302 303 307 308)
declare -a NG_CODES=(000 403 404 410 500 503)

# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
# transcluded text, and if the transclusion fails, then the braces show up in the URL
ILLEGAL_CHARS="{ }"

# The shortest URL possible, used for sanity-checking some URLs: http://a.co
MIN_URL_LENGTH=11

# These are parallel arrays giving the prefixes that can be used in place of normal external links to
# some wikis and other sites
declare -a INTERWIKI_PREFIXES=(commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies wikiversity wikivoyage wikt wp)
declare -a INTERWIKI_DOMAINS=(commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org wikiversity.org wikivoyage.org wiktionary.org wikipedia.org)

# Variables for keeping track of main loop progress and findings
LINK_NUM=0
EI_LINKS=0
IW_LINKS=0
OK_LINKS=0
RD_LINKS=0
NG_LINKS=0
SKIP_UNK_NS=0
SKIP_JS_PAGE=0
SKIP_BAD_URL=0
SKIP_NON_ASCII=0
SKIP_UNK_SUFFIX=0
SKIP_UNK_CODE=0
SKIP_EXPECT_NG=0
SKIP_EXPECT_EI=0
SKIP_EXPECT_IW=0
FILE_LINKS=0
PAGE_LINKS=0
SKIPPED_HEADER_ROW=0
FINISHED_LIST="no"   # "no" until the run ends; other values are assigned elsewhere in the script
START_RUN=0
END_RUN=0


### HELP ###
# A pseudo-man page. Here is the 80-character rule for the page text:
# 234567890123456789012345678901234567890123456789012345678901234567890123456789
# Prints the help text to stdout. The heredoc delimiter is quoted so that nothing in the page text
# can ever be expanded by the shell.
function printHelp()
{
  cat << 'EOF'

NAME
       Validate External Links

SYNOPSIS
       validate_external_links.sh --help
       validate_external_links.sh --links URL --output DIR [--exceptions URL]
           [--record-ok-links] [--suggest-snapshots] [--take-screenshots FILE]
           [--start-url NUM] [--end-url NUM] [--upload FILE]

DESCRIPTION
       This script parses a list of external links found in the OniGalore wiki
       (which is dumped by the Oni2.net domain periodically in a particular
       format), validates them using the Unix tool 'curl', and produces a report
       of which links were "OK" (responded positively to an HTTP query), which
       were "RD" (responded with a 3xx redirect code), which could be "IW"
       (interwiki) links, which are "EI" (external internal) links and could be
       intrawiki links, and which were "NG" (no good; a negative response to the
       query). This report can then be automatically uploaded to the location of
       your choice. The script can also suggest Internet Archive snapshots for
       "NG" links, and take screenshots of "OK" links for visual verification by
       the reader that the page in question is the one intended to be displayed.

       You must pass this script the URL at which the list of links is found
       (--links) and the path where the directory of logs should be outputted
       (--output). All other arguments are optional.

OPTIONS
       --help                  Show this page.
       --links URL             (required) URL from which to download the CSV
                               file with external links. Note that this URL can
                               be a local file if you supply a file:// path.
       --output DIR            (required) Unix path to directory in which Val
                               should place its reports.
       --exceptions URL        In order to remove links from the report which
                               Val finds an issue with, but which you regard as
                               OK, list those desired exceptions in this file.
                               See the sample file exceptions.txt for details.
                               Note that this URL can point to a local file if
                               you supply a file:// path.
       --record-ok-links       Log a link in the report even if its response
                               code is "OK".
       --suggest-snapshots     Query the Internet Archive for a possible
                               snapshot URL for each "NG" page.
       --take-screenshots FILE Call the Google Chrome binary at this path to
                               take screenshots of each "OK" page.
       --start-url NUM         Start at this link in the links CSV file.
       --end-url NUM           Stop at this link in the links CSV file.
       --upload FILE           Upload report using the credentials and path
                               given in this local text file. See sftp_login.txt
                               for template.

BUGS
       The script cannot properly parse any line in the external links file
       which contains a comma in the name of the wiki page containing a link.
       Commas in the link itself are not an issue.
EOF
}


### SETUP ###
# If first argument is a help request, or if nothing was passed in at all, print help page and quit
if [ "$#" -eq 0 ] || [ "$1" == "--help" ]; then
  printHelp | less
  exit 0
fi

# Parse arguments as long as there are more arguments to process
while (( "$#" )); do
  # An option that takes a value must be followed by one. Without this check, "shift 2" would fail
  # without shifting anything when the option is the last argument, and this loop would never end.
  case "$1" in
    --links | --exceptions | --output | --take-screenshots | --start-url | --end-url | --upload )
      if [ "$#" -lt 2 ]; then
        echo "Invalid argument $1 detected (it requires a value). Aborting."
        exit 1
      fi;;
  esac

  case "$1" in
    --links )             LINKS_URL="$2";                     shift 2;;
    --exceptions )        EXCEPT_URL="$2";                    shift 2;;
    --output )            OUTPUT_DIR="$2";                    shift 2;;
    --record-ok-links )   RECORD_OK_LINKS=1;                  shift;;
    --suggest-snapshots ) SUGGEST_SNAPSHOTS=1;                shift;;
    --take-screenshots )  TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
    --start-url )         URL_START="$2";                     shift 2;;
    --end-url )           URL_LIMIT="$2";                     shift 2;;
    --upload )            UPLOAD_INFO="$2";                   shift 2;;
    * )                   echo "Invalid argument $1 detected. Aborting."; exit 1;;
  esac
done

# URL_START and URL_LIMIT are used in integer comparisons and arithmetic later, so reject anything
# that is not a whole number now rather than failing on every test that reads them
if ! [[ "$URL_START" =~ ^[0-9]+$ && "$URL_LIMIT" =~ ^[0-9]+$ ]]; then
  echo "Error: The --start-url and --end-url arguments must be whole numbers. Aborting."
  exit 1
fi

# If the required arguments were not supplied, print help page and quit
if [ -z "$LINKS_URL" ] || [ -z "$OUTPUT_DIR" ]; then
  echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
  exit 2
fi

# If user wants screenshots, make sure path to Chrome was passed in and is valid
if [ "$TAKE_PAGE_SHOT" -eq 1 ] && [ ! -f "$CHROME_PATH" ]; then
  echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
  exit 3
fi

# Check that UPLOAD_INFO exists, if this argument was supplied
if [ -n "$UPLOAD_INFO" ] && [ ! -f "$UPLOAD_INFO" ]; then
  echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
  exit 4
fi

# Check that OUTPUT_DIR is a directory
if [ ! -d "$OUTPUT_DIR" ]; then
  echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
  exit 5
fi

# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
# Matches format of LINKS_DATE, which is extracted from the extlinks file's HTTP headers; %H is used
# because HTTP dates carry a zero-padded hour, whereas %k would pad with a space
NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %H:%M:%S GMT")
OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
SHOT_PATH="$OUTPUT_PATH/Screenshots"
LOG_NAME="ValExtLinks report"
LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt"
LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf"
LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm"
mkdir "$OUTPUT_PATH"

# Check that 'mkdir' succeeded before trying to create anything inside the new folder
if [ ! -d "$OUTPUT_PATH" ]; then
  echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_DIR. Aborting."
  exit 6
fi
if [ "$TAKE_PAGE_SHOT" -eq 1 ]; then
  mkdir "$SHOT_PATH"
  if [ ! -d "$SHOT_PATH" ]; then
    echo "Error: I could not create the folder \"Screenshots\" inside the directory $OUTPUT_PATH. Aborting."
    exit 6
  fi
fi

# Get date on the file at LINKS_URL and print to log. Header names are case-insensitive (HTTP/2
# servers send them in lower case) and header lines end in CR LF, so match without regard to case
# and drop the carriage return.
LINKS_DATE=$(curl --silent --head "$LINKS_URL" | grep -i -m 1 "^Last-Modified:" | tr -d '\r')
if [ -z "$LINKS_DATE" ]; then
  echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
  exit 7
fi
# Strip the header name, its colon and the single space that conventionally follows it
LINKS_DATE=${LINKS_DATE#*:}
LINKS_DATE=${LINKS_DATE# }


### UTILITY FUNCTIONS ###
# Writes a plain-text header to TXT log file: the report title, the generation time (NICE_TIME), the
# date of the links data (LINKS_DATE), the author's contact page (MY_WIKI_PAGE) and a blank line.
# Takes no arguments; all output goes through valPrint with the 't' target.
function printTXTheader()
{
  valPrint t "Validate External Links report"
  valPrint t "generated $NICE_TIME"
  valPrint t "from data of $LINKS_DATE"
  valPrint t "script by Iritscen (contact: $MY_WIKI_PAGE)"
  valPrint t ""
}

# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified.
# The whole header is handed to valPrint as one multi-line string. Inside the double quotes only
# \\ (a literal backslash, used as the RTF line break) and \" (a literal quote) are shell escapes;
# every other backslash is passed through untouched as RTF markup.
function printRTFheader()
{
  valPrint r "{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww12600\viewh12100\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0

\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
generated $NICE_TIME\\
from data of $LINKS_DATE\\
script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$MY_WIKI_PAGE\"}}{\fldrslt contact}})
\\
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
\cf0 "
}

# Closes the RTF markup of the RTF log file by writing the brace that balances the one opened at
# the start of the RTF header. Takes no arguments.
function printRTFfooter()
{
  valPrint r "}"
}

# Writes the HTML header to HTML log file: the opening <html>/<head>/<body> markup, the report
# title, the generation time (NICE_TIME), the date of the links data (LINKS_DATE) and a link to the
# author's contact page (MY_WIKI_PAGE). Takes no arguments.
function printHTMheader()
{
  valPrint h "<html>
<head>
<title>Validate External Links report</title>
</head>
<body>
<h2>Validate External Links report</h2>
<h3>generated $NICE_TIME<br />
from data of $LINKS_DATE<br />
script by Iritscen (<a href=\"$MY_WIKI_PAGE\" target=\"_blank\">contact</a>)</h3>"
}

# Closes the HTML markup of the HTML log file by writing the closing </body> and </html> tags.
# Takes no arguments.
function printHTMfooter()
{
  valPrint h "</body>
</html>"
}

# The central logging function.
# Parameter 1 is a string composed of one or more flag characters, in any order:
#   'c' = print to console          't' = append to the TXT log (LOG_TXT)
#   'r' = append to the RTF log (LOG_RTF)   'h' = append to the HTML log (LOG_HTM)
#   'n' = don't print a newline (or log line break) at the end of the line
#   's' = print an extra newline (log: an extra line break/spacer) at the end
#   'w' = don't pass console output through 'fmt' ("fmt" fits the output to an 80-column CLI but can
#         break special formatting and the 'n' option)
# Parameter 2 is the text to print.
# Each flag is looked for anywhere inside parameter 1 (a substring match); comparing for equality
# would only ever recognize a single-letter flag string. The text is written with printf '%s' so
# that backslashes or a leading dash in it are never interpreted.
function valPrint()
{
  local flags="$1"
  local text="$2"

  # Console: 'n' wins over 'w', which wins over 's'; plain output is wrapped to 80 columns
  if [[ "$flags" == *c* ]]; then
    if [[ "$flags" == *n* ]]; then
      printf '%s' "$text"
    elif [[ "$flags" == *w* ]]; then
      printf '%s\n' "$text"
    elif [[ "$flags" == *s* ]]; then
      printf '%s\n\n' "$text"
    else
      printf '%s\n' "$text" | fmt -w 80
    fi
  fi

  # Plain-text log
  if [[ "$flags" == *t* ]]; then
    if [[ "$flags" == *n* ]]; then
      printf '%s' "$text" >> "$LOG_TXT"
    elif [[ "$flags" == *s* ]]; then
      printf '%s\n\n' "$text" >> "$LOG_TXT"
    else
      printf '%s\n' "$text" >> "$LOG_TXT"
    fi
  fi

  # RTF log: a visible line break is the \line control word; with 'n' only a source newline is written
  if [[ "$flags" == *r* ]]; then
    if [[ "$flags" == *n* ]]; then
      printf '%s\n' "$text" >> "$LOG_RTF"
    elif [[ "$flags" == *s* ]]; then
      printf '%s\\line\\line\n' "$text" >> "$LOG_RTF"
    else
      printf '%s\\line\n' "$text" >> "$LOG_RTF"
    fi
  fi

  # HTML log: 's' appends a spacer table row, whose cell holds a non-breaking space so that the row
  # is not collapsed to nothing; otherwise a line ends in <br /> unless 'n' was given
  if [[ "$flags" == *h* ]]; then
    if [[ "$flags" == *s* ]]; then
      printf '%s<tr><td>&nbsp;</td></tr>\n' "$text" >> "$LOG_HTM"
    elif [[ "$flags" == *n* ]]; then
      printf '%s\n' "$text" >> "$LOG_HTM"
    else
      printf '%s<br />\n' "$text" >> "$LOG_HTM"
    fi
  fi
}

# Pluralize the string in parameter 1 if the number in parameter 2 is not 1. A noun ending in "x"
# gets "es" appended; any other noun gets "s". A missing count is treated as 0 (so the plural is
# printed) instead of producing a test error. The result is printed on stdout.
function pluralCheckNoun()
{
  local noun="$1"
  local count="${2:-0}"

  if [ "$count" -eq 1 ]; then
    printf '%s\n' "$noun"
  elif [[ "$noun" =~ x$ ]]; then
    printf '%s\n' "${noun}es"
  else
    printf '%s\n' "${noun}s"
  fi
}

# Output "is" if parameter 1 is 1, otherwise "are". A missing count is treated as 0.
function pluralCheckIs()
{
  if [ "${1:-0}" -eq 1 ]; then
    echo "is"
  else
    echo "are"
  fi
}

# Output "was" if parameter 1 is 1, otherwise "were". A missing count is treated as 0.
function pluralCheckWas()
{
  if [ "${1:-0}" -eq 1 ]; then
    echo "was"
  else
    echo "were"
  fi
}

# Output "a " (with its trailing space) if parameter 1 is 1, otherwise nothing. Callers place the
# result directly in front of a noun, which is why the space is part of the output.
function pluralCheckA()
{
  if [ "${1:-0}" -eq 1 ]; then
    echo "a "
  fi
}

# Output "an " (with its trailing space) if parameter 1 is 1, otherwise nothing. Callers place the
# result directly in front of a noun, which is why the space is part of the output.
function pluralCheckAn()
{
  if [ "${1:-0}" -eq 1 ]; then
    echo "an "
  fi
}

# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
# reports being saved to disk have already been closed.
# Reads the file at UPLOAD_INFO for lines starting with "user:", "pw:", "port:" and "path:", then
# runs the 'expect' script found next to this script (THIS_DIR/EXPECT_SCRIPT_NAME) with the report
# path, those four values and the report's file name as its arguments.
# Only the first line starting with each marker is used, a trailing carriage return (CRLF file) is
# dropped, and every argument is quoted so that an empty or unusual value cannot shift, split or
# glob-expand the argument list.
# NOTE(review): the password is handed to 'expect' on its command line, where other local users may
# be able to see it in the process list; consider passing it through the environment or stdin.
function uploadReport()
{
  valPrint c "Uploading HTML report..."

  SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
  SFTP_USER_NAME_MARKER="user:"
  SFTP_PASSWORD_MARKER="pw:"
  SFTP_PORT_MARKER="port:"
  SFTP_PATH_MARKER="path:"

  SFTP_USER_NAME=$(grep -m 1 -- "^$SFTP_USER_NAME_MARKER" "$UPLOAD_INFO")
  SFTP_USER_NAME=${SFTP_USER_NAME#"$SFTP_USER_NAME_MARKER"}
  SFTP_USER_NAME=${SFTP_USER_NAME%$'\r'}

  SFTP_PASSWORD=$(grep -m 1 -- "^$SFTP_PASSWORD_MARKER" "$UPLOAD_INFO")
  SFTP_PASSWORD=${SFTP_PASSWORD#"$SFTP_PASSWORD_MARKER"}
  SFTP_PASSWORD=${SFTP_PASSWORD%$'\r'}

  SFTP_PORT=$(grep -m 1 -- "^$SFTP_PORT_MARKER" "$UPLOAD_INFO")
  SFTP_PORT=${SFTP_PORT#"$SFTP_PORT_MARKER"}
  SFTP_PORT=${SFTP_PORT%$'\r'}

  SFTP_PATH=$(grep -m 1 -- "^$SFTP_PATH_MARKER" "$UPLOAD_INFO")
  SFTP_PATH=${SFTP_PATH#"$SFTP_PATH_MARKER"}
  SFTP_PATH=${SFTP_PATH%$'\r'}

  expect "$SCRIPT_PATH" "$LOG_HTM" "$SFTP_USER_NAME" "$SFTP_PASSWORD" "$SFTP_PORT" "$SFTP_PATH" "$LOG_NAME.htm"

  valPrint c "Report was uploaded, unless an error message appears above."
}

# Prints session summary when script is done, closes the RTF and HTML logs, uploads the report if
# that was requested and the run was not canceled, and exits with status 0. Also installed below as
# the handler for SIGINT, so it may run at any point after the logs have been opened.
# Reads the progress counters and settings kept in global variables (LINK_NUM, URL_START,
# LINK_COUNT, the SKIP_* and *_LINKS counters, FINISHED_LIST, START_RUN, UPLOAD_INFO).
# Because this can be reached before the main loop has started, it tolerates START_RUN still being
# 0, LINK_COUNT not being set yet, and LINK_NUM being 0.
function wrapupAndExit()
{
  # Get off progress line on console, drop down a line from last link in log, and close HTML table
  valPrint ctr ""
  valPrint h "</table><br />"

  # If we didn't finish processing the last URL, then the iterator is one too high
  if [ "$FINISHED_LIST" != "yes" ]; then
    if (( LINK_NUM > 0 )); then
      LINK_NUM=$((LINK_NUM - 1))
    fi
    if [ "$FINISHED_LIST" == "no" ]; then
      valPrint ctrh "The session was canceled by the user."
    fi
  fi

  # Generate string with elapsed time; if the main loop never started, START_RUN is still 0 and no
  # time has elapsed (subtracting 0 would report the seconds since 1970 instead)
  END_RUN=$(date +%s)
  local ELAPSED_SECS=0
  if (( START_RUN > 0 )); then
    ELAPSED_SECS=$((END_RUN - START_RUN))
  fi
  ELAPSED="$((ELAPSED_SECS / 60)) min. $((ELAPSED_SECS % 60)) sec. elapsed"

  # Output results of session and close the log file's markup
  local TOTAL_LINKS="${LINK_COUNT:-0}"
  LINKS_PROCESSED=$((LINK_NUM - URL_START + 1))
  if (( LINKS_PROCESSED < 0 )); then
    LINKS_PROCESSED=0 # we were stopped before reaching the first link we were asked to process
  fi
  LINKS_SKIPPED=$((SKIP_UNK_NS + SKIP_JS_PAGE + SKIP_BAD_URL + SKIP_NON_ASCII + SKIP_UNK_SUFFIX + SKIP_UNK_CODE))
  LINKS_CHECKED=$((LINKS_PROCESSED - LINKS_SKIPPED))
  valPrint ct "Summary ($ELAPSED):"
  valPrint r "\b1 Summary \b0 ($ELAPSED)"
  valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
  valPrint ctrh "I finished processing $LINKS_PROCESSED of $TOTAL_LINKS $(pluralCheckNoun link "$TOTAL_LINKS")."
  valPrint ctrh "I skipped $LINKS_SKIPPED $(pluralCheckNoun link "$LINKS_SKIPPED"), and found $FILE_LINKS $(pluralCheckNoun file "$FILE_LINKS") and $PAGE_LINKS $(pluralCheckNoun page "$PAGE_LINKS")."
  if (( LINKS_SKIPPED > 0 )); then valPrint ctrh "Skip breakdown: "; fi
  if (( SKIP_UNK_NS > 0 )); then valPrint ctrh "- $SKIP_UNK_NS unknown $(pluralCheckNoun namespace "$SKIP_UNK_NS")"; fi
  if (( SKIP_JS_PAGE > 0 )); then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link "$SKIP_JS_PAGE") on $(pluralCheckA "$SKIP_JS_PAGE")JavaScript $(pluralCheckNoun page "$SKIP_JS_PAGE")"; fi
  if (( SKIP_BAD_URL > 0 )); then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL "$SKIP_BAD_URL")"; fi
  if (( SKIP_NON_ASCII > 0 )); then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL "$SKIP_NON_ASCII")"; fi
  if (( SKIP_UNK_SUFFIX > 0 )); then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix "$SKIP_UNK_SUFFIX")"; fi
  if (( SKIP_UNK_CODE > 0 )); then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code "$SKIP_UNK_CODE")"; fi
  valPrint ctrh "Out of the $LINKS_CHECKED links checked, $EI_LINKS could be $(pluralCheckAn "$EI_LINKS")intrawiki $(pluralCheckNoun link "$EI_LINKS"), $IW_LINKS could be $(pluralCheckAn "$IW_LINKS")interwiki $(pluralCheckNoun link "$IW_LINKS"), $OK_LINKS $(pluralCheckWas "$OK_LINKS") OK, $RD_LINKS $(pluralCheckWas "$RD_LINKS") $(pluralCheckA "$RD_LINKS")redirection $(pluralCheckNoun notice "$RD_LINKS"), and $NG_LINKS $(pluralCheckWas "$NG_LINKS") NG."
  if (( SKIP_EXPECT_NG > 0 )); then
    valPrint ctrh "$SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link "$NG_LINKS") went unlisted due to being found in the exceptions file."
  fi
  if (( SKIP_EXPECT_EI > 0 )); then
    valPrint ctrh "$SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link "$EI_LINKS") went unlisted due to being found in the exceptions file."
  fi
  if (( SKIP_EXPECT_IW > 0 )); then
    # IW links are the interwiki ones (the intrawiki ones are EI, reported just above)
    valPrint ctrh "$SKIP_EXPECT_IW/$IW_LINKS potential interwiki $(pluralCheckNoun link "$IW_LINKS") went unlisted due to being found in the exceptions file."
  fi
  valPrint trh "ValExtLinks says goodbye."
  printRTFfooter
  printHTMfooter

  # Upload report if this was requested (and the session was not canceled)
  if [ -n "$UPLOAD_INFO" ] && [ "$FINISHED_LIST" != "no" ]; then
    uploadReport
  fi

  # Really quit now
  valPrint c "ValExtLinks says goodbye."
  exit 0
}
trap wrapupAndExit INT


### INITIALIZATION ###
# Print opening message to console and log files
valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
printTXTheader
printRTFheader
printHTMheader

# Attempt to download file at LINKS_URL, then check that it succeeded. Both a non-zero exit from
# 'curl' and a missing output file count as failure. The file is saved under the last path
# component of the URL.
valPrint cwtrh "Downloading list of external links from $LINKS_URL."
LINKS_FILE_NAME=${LINKS_URL##*/}
LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
if ! curl --silent -o "$LINKS_FILE" "$LINKS_URL" || [ ! -f "$LINKS_FILE" ]; then
  echo "The download of $LINKS_URL appears to have failed. Aborting."
  wrapupAndExit
fi

# Attempt to download file at EXCEPT_URL, then check that it succeeded
if [ -n "$EXCEPT_URL" ]; then
  valPrint cwtrh "Downloading list of reporting exceptions from $EXCEPT_URL."
  EXCEPT_FILE_NAME=${EXCEPT_URL##*/}
  EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
  if ! curl --silent -o "$EXCEPT_FILE" "$EXCEPT_URL" || [ ! -f "$EXCEPT_FILE" ]; then
    echo "The download of $EXCEPT_URL appears to have failed. Aborting."
    wrapupAndExit
  fi
fi

# Feed LINKS_FILE to 'wc' on stdin because passing it as an argument would print the file name
# after the line count
LINK_COUNT_STRING=$(wc -l < "$LINKS_FILE")

# Number of URLs is number of lines minus one (first line is column header row for the CSV). Some
# versions of 'wc' pad the count with spaces, so strip all whitespace first.
LINK_COUNT=$(echo "${LINK_COUNT_STRING}" | tr -d '[:space:]')
LINK_COUNT=$((LINK_COUNT - 1))

# Calculate number of URLs to consider
if [ "$URL_LIMIT" -ne 0 ] && [ "$URL_LIMIT" -lt "$LINK_COUNT" ]; then
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((URL_LIMIT-URL_START+1)) of them, from link $URL_START to $URL_LIMIT."
elif [ "$URL_START" -ne 1 ]; then
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((LINK_COUNT-URL_START+1)) of them, from link $URL_START to $LINK_COUNT."
else
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering all of them."
fi

# Print settings to console and log. The message is stored one word (or quoted phrase) per array
# element so that the elements at indexes 10, 22, 26 and 40 -- the quoted ones -- can be swapped out
# below according to the settings in effect. Do not add or remove words without updating the indexes.
declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not report links that are listed in the exceptions file.")
if [ "$TAKE_PAGE_SHOT" -eq 0 ]; then SETTINGS_MSG[10]="but will not"; fi
if [ "$RECORD_OK_LINKS" -eq 0 ]; then SETTINGS_MSG[22]="not"; fi
if [ "$SUGGEST_SNAPSHOTS" -eq 0 ]; then SETTINGS_MSG[26]="will not"; fi
if [ -z "$EXCEPT_URL" ] || [ "$RECORD_OK_LINKS" -eq 1 ]; then SETTINGS_MSG[40]=""; fi
# Deliberately [@] rather than "[*]": the latter would join the words with the first character of
# IFS, which this script sets to a newline
SETTINGS_STR=${SETTINGS_MSG[@]}
valPrint ctrh "$SETTINGS_STR"
valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
valPrint trh ""

# Print legend to logs
valPrint t "Legend:"
valPrint r "\b1 Legend \b0"
valPrint hn "<h3>Legend</h3>"
valPrint trh "OK = URL seems to be working."
valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it, because false negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to Iritscen. An NG link should be followed by a link to the Internet Archive's Wayback Machine which may help you repair the link. If the link cannot be repaired, you can disable it on the wiki (which prevents it from showing up in future ValExtLinks reports) by wrapping it in nowiki tags."
valPrint trh "RD = The server responding to this URL is saying that the page moved and you should instead use the supplied new URL. Some RD links represent minor adjustments in the organization of a web site, and some are soft 404s (the file/page has been removed and you are being redirected to something like the main page of the web site). You will have to look at the new URL yourself to determine if it represents an OK link and the link on the wiki should be updated to this one, or if the desired file/page is actually gone and we need to replace the wiki link with an Internet Archive snapshot link -- or disable the URL if it has not been archived."
valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup."
valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup."
valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $HTTP_CODES)."
valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$HTTP_CODES\"}}{\fldrslt here}} for code reference)."
valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$HTTP_CODES\" target=\"_blank\">here</a> for code reference)."
valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $CURL_CODES)."
valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$CURL_CODES\"}}{\fldrslt here}} for code reference)."
valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$CURL_CODES\" target=\"_blank\">here</a> for code reference)."
valPrint trh "IA suggests = Last available snapshot suggested by the Internet Archive."
valPrint trh "Try browsing = The Archive occasionally fails to return a snapshot URL even when one exists, so you will need to check for a snapshot manually using this link to the Wayback Machine before concluding that a site has not been archived."
valPrint trh ""


555	### MAIN LOOP ###
556	START_RUN=$(date +%s)
557	# Process each line of the .csv in LINKS_FILE
558	for LINE in `cat "$LINKS_FILE"`; do
559	let LINK_NUM+=1
560
561	# First line is the column header row for the CSV, so let's verify that the format hasn't changed
562	if [ $SKIPPED_HEADER_ROW -eq 0 ]; then
563	if [ $LINE == "namespace,title,target" ]; then
564	SKIPPED_HEADER_ROW=1
565	LINK_NUM=0 # this line is it's not a link, so reset the link counter
566	valPrint hn "<table>"
567	continue
568	else
569	valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
570	wrapupAndExit
571	fi
572	fi
573
574	# Skip this link if we are not at URL_START yet
575	if [ $LINK_NUM -lt $URL_START ]; then
576	continue
577	fi
578
579	# Stop if we are at the limit declared for testing purposes
580	if [ $URL_LIMIT -gt 0 ] && [ $LINK_NUM -gt $URL_LIMIT ]; then
581	FINISHED_LIST="limit"
582	wrapupAndExit
583	fi
584
585	# Print progress to screen
586	if [ $LINK_NUM -gt 1 ]; then
587	printf "\e[1A\n" # erase previous progress message so that new one appears in its place
588	fi
589	valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
590
591	# The number of the namespace is the element before the first comma on the line
592	NS_ID=${LINE%%,*}
593
594	# Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
595	NS_NAME=""
596	a=0
597	while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
598	if [ $NS_ID == "NULL" ]; then
599	break
600	elif [ $NS_ID -eq ${NS_IDS[$a]} ]; then
601	NS_NAME="${NS_NAMES[$a]}"
602	break
603	fi
604	let a+=1
605	done
606	if [ "$NS_NAME" == "" ]; then
607	if [ $NS_ID == "NULL" ]; then
608	valPrint trs "Skipping URL on line $LINK_NUM because the namespace (and probably the page too) is \"NULL\". Probably the link is no longer in existence on the wiki."
609	else
610	valPrint trs "Skipping URL on line $LINK_NUM found on page $PAGE_NAME because I could not find a name for namespace ID $NS_ID."
611	fi
612	let SKIP_UNK_NS+=1
613	continue
614	fi
615
616	# The name of the page is everything between the namespace ID and the next comma on the line (commas
617	# in page names will break this)
618	PAGE_NAME=${LINE#$NS_ID,}
619	PAGE_NAME=${PAGE_NAME%%,*}
620
621	# We don't want to consider wiki pages ending in .js, as the parser cannot reliably isolate URLS in
622	# JavaScript code, so it will return erroneous links
623	PAGE_NAME_SUFFIX=$(echo $PAGE_NAME \| sed 's/.*\.//')
624	if [ $PAGE_NAME_SUFFIX == "js" ]; then
625	valPrint trs "Skipping URL on line $LINK_NUM because it was found on JavaScript page $PAGE_NAME."
626	let SKIP_JS_PAGE+=1
627	continue
628	fi
629
630	# Build longer wiki page URLs from namespace and page names
631	FULL_PAGE_PATH=http://$WIKI_PATH/$NS_NAME:$PAGE_NAME
632	LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
633	# Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it
634	# explicitly breaks the link
635	if [ $NS_ID -eq 0 ]; then
636	FULL_PAGE_PATH=http://$WIKI_PATH/$PAGE_NAME
637	LOCAL_PAGE_PATH=$PAGE_NAME
638	fi
639
640	# The URL being linked to is everything after the previous two fields (this allows commas to be in
641	# the URLs, but a comma in the previous field, the page name, will break this)
642	URL=${LINE#$NS_ID,$PAGE_NAME,}
643
644	# Scan for illegal characters
645	if [[ $URL == [$ILLEGAL_CHARS] ]]; then
646	valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because it contains characters illegal in a URL."
647	let SKIP_BAD_URL+=1
648	continue
649	fi
650
651	# Now we need to know if the URL is for a file or a web page. First step is to determine if the
652	# URL ends in a suffix
653	HAS_SUFFIX=0
654
655	# If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
656	CLEAN_URL=${URL%%\?*}
657
658	# If the URL ends in something like "#section_15", strip everything from the '#' onward
659	CLEAN_URL=${CLEAN_URL%%\#*}
660
661	# 'sed' cannot handle Unicode in my Bash shell, so skip this URL and make user check it
662	if [[ $CLEAN_URL == [![:ascii:]] ]]; then
663	valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because I cannot handle non-ASCII characters."
664	let SKIP_NON_ASCII+=1
665	continue
666	fi
667
668	# Isolate the characters after the last period and after the last slash
669	POST_DOT=$(echo "$CLEAN_URL" \| sed 's/.*\.//')
670	POST_SLASH=$(echo "$CLEAN_URL" \| sed 's/.*\///')
671
672	# If the last period comes after the last slash, then the URL ends in a suffix
673	POST_DOT_LENGTH=$(echo \| awk -v input=$POST_DOT '{print length(input)}')
674	POST_SLASH_LENGTH=$(echo \| awk -v input=$POST_SLASH '{print length(input)}')
675	if [ $POST_DOT_LENGTH -lt $POST_SLASH_LENGTH ]; then
676	HAS_SUFFIX=1
677	else
678	HAS_SUFFIX=0
679	fi
680
# Decide whether the URL points at a file (IS_FILE=1) or a web page (IS_FILE=0). A URL with no
# suffix is a page. Otherwise the suffix is checked, in order, against: an all-digits special
# case, the known file extensions in HTTP_FILES, and the known page endings and TLDs in
# HTTP_TLDS_AND_PAGES. IS_FILE is left at -1 when none of these recognize the suffix.
IS_FILE=-1
if (( HAS_SUFFIX == 0 )); then
  IS_FILE=0
else
  # Suffix comparisons are done case-insensitively
  shopt -s nocasematch

  # Special case: URLs ending in something like "/productID.304297400" are pages, not files, so
  # an all-numeric suffix means we are looking at the end of a web page URL
  if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
    IS_FILE=0
  fi

  # Still undecided: is the suffix a known file extension?
  if (( IS_FILE == -1 )); then
    for EXTENSION in "${HTTP_FILES[@]}"; do
      # double brackets respect the nocasematch setting
      [[ $EXTENSION == $POST_DOT ]] || continue
      IS_FILE=1
      break
    done
  fi

  # Still undecided: is the suffix a known page ending or TLD? If this fails too, the suffix is
  # one the user needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
  if (( IS_FILE == -1 )); then
    for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
      [[ $PAGE_OR_TLD == $POST_DOT ]] || continue
      IS_FILE=0
      break
    done
  fi

  # Restore case-sensitive matching in Bash
  shopt -u nocasematch
fi
722
# Turn the IS_FILE verdict into a label for the report (STR_TYPE) and bump the matching counter.
# A suffix that was recognized as neither file, page nor TLD is reported to the user and the URL
# is skipped.
STR_TYPE=""
case "$IS_FILE" in
  -1)
    valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown URL ending $POST_DOT. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
    (( SKIP_UNK_SUFFIX += 1 ))
    continue
    ;;
  1)
    STR_TYPE="file"
    (( FILE_LINKS += 1 ))
    ;;
  0)
    STR_TYPE="page"
    (( PAGE_LINKS += 1 ))
    ;;
esac
736
# Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
# issue with sites that require HTTPS. Only the headers are requested (--head) and the body is
# discarded, so CURL_CODE receives just the three-digit code printed by --write-out.
# "$AGENT" must be in double quotes: inside single quotes the shell does not expand it and the
# literal text "$AGENT" would be sent as the user agent. "$URL" is quoted so that characters such
# as '?', '*' or '[' in a link are never treated as a filename pattern.
CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{http_code}\n' "$URL")
# Exit status of the command substitution above, i.e. of 'curl' itself
CURL_ERR=$?
CURL_RESULT=$CURL_CODE

# Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
if [ "$CURL_CODE" == "000" ]; then
  CURL_RESULT="$CURL_RESULT-$CURL_ERR"
fi
747
# Begin to determine our status code for this URL (EI, IW, OK, RD, or NG)
STATUS="??"
NEW_URL=""
INTERWIKI_INDEX=-1

# First make sure that this isn't an "external internal" link to our own wiki that can be replaced
# by "[[page_name]]". If it uses a special access URL beginning with "/w/", let it pass, as it
# probably cannot be replaced by "[[ ]]" markup.
# These are substring tests, so the pattern needs '*' on both sides; a bare "$URL == $WIKI_PATH"
# only matches a link that is exactly the wiki path and makes the "/w/" exclusion meaningless.
# The -n guard keeps an empty WIKI_PATH from matching every URL.
if [[ -n $WIKI_PATH && $URL == *$WIKI_PATH* && $URL != *$WIKI_PATH/w/* ]]; then
  STATUS="EI"
  let EI_LINKS+=1
fi

# If it's not, check if this is a link to a domain that we have an interwiki prefix for. The
# index of the matching domain is kept in INTERWIKI_INDEX.
if [ "$STATUS" == "??" ]; then
  for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
    if [[ -n ${INTERWIKI_DOMAINS[$i]} && $URL == *${INTERWIKI_DOMAINS[$i]}* && $URL != *${INTERWIKI_DOMAINS[$i]}/w/* ]]; then
      STATUS="IW"
      let IW_LINKS+=1
      INTERWIKI_INDEX=$i
      break
    fi
  done
fi
772
# Status still undetermined (no EI/IW match): the link is "OK" if the response code from 'curl'
# appears in the OK_CODES list
if [[ $STATUS == "??" ]]; then
  for CODE in "${OK_CODES[@]}"; do
    [[ $CODE == $CURL_CODE ]] || continue
    STATUS="OK"
    (( OK_LINKS += 1 ))
    break
  done
fi
783
# If we didn't get a match with the "OK" codes, check it against the "RD" codes. A redirect that
# only switches http:// to https:// and/or adds a trailing '/' is counted as "OK"; anything else
# is reported as "RD", with the target kept in NEW_URL.
if [ "$STATUS" == "??" ]; then
  for CODE in "${RD_CODES[@]}"; do
    if [[ $CODE == $CURL_CODE ]]; then
      # Get URL header again in order to retrieve the URL we are being redirected to.
      # "$AGENT" is double-quoted so the variable is expanded instead of the literal text
      # "$AGENT" being sent; "$URL" is quoted to keep the shell from globbing it.
      NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{redirect_url}\n' "$URL")

      # Filter out cases where the redirect URL is just the original URL with https:// instead of
      # http://, or with an added '/' at the end. These corrections happen a lot and are not
      # important to us.
      URL_NO_PROTOCOL=${URL#http://}
      URL_NO_PROTOCOL=${URL_NO_PROTOCOL%/}
      NEW_URL_NO_PROTOCOL=${NEW_URL#https://}
      NEW_URL_NO_PROTOCOL=${NEW_URL_NO_PROTOCOL%/}

      # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
      NEW_URL_LENGTH=${#NEW_URL_NO_PROTOCOL}
      if [ "$NEW_URL_LENGTH" -lt "$MIN_URL_LENGTH" ]; then
        NEW_URL_NO_PROTOCOL="[new URL not retrieved]"
      fi

      # If the URLs match after the above filters were applied, then the link is OK. Both sides
      # are quoted: the placeholder above looks like a bracket glob to the shell, and an empty
      # operand would otherwise make 'test' fail with a syntax error.
      if [ "$URL_NO_PROTOCOL" == "$NEW_URL_NO_PROTOCOL" ]; then
        STATUS="OK"
        let OK_LINKS+=1
      else
        STATUS="RD"
        let RD_LINKS+=1
      fi
      break
    fi
  done
fi
817
# Last lookup: if nothing above settled the status, compare the response code with the "NG"
# codes. A code that is in none of the lists is reported to the reader and the URL is skipped.
if [[ $STATUS == "??" ]]; then
  for CODE in "${NG_CODES[@]}"; do
    [[ $CODE == $CURL_CODE ]] || continue
    STATUS="NG"
    (( NG_LINKS += 1 ))
    break
  done

  # Still no verdict, so this is a return code we do not know about
  if [[ $STATUS == "??" ]]; then
    valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown return code $CURL_CODE."
    (( SKIP_UNK_CODE += 1 ))
    continue
  fi
fi
835
# Check problem links against exceptions file before proceeding. This only happens when an
# exceptions URL was supplied (EXCEPT_URL is non-empty); the lookup itself reads EXCEPT_FILE.
if [ "$STATUS" != "OK" ] && [ -n "$EXCEPT_URL" ]; then
  # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
  EXPECT_CODE="$CURL_RESULT"
  if [ "$STATUS" == "EI" ]; then
    EXPECT_CODE="EI"
  elif [ "$STATUS" == "IW" ]; then
    EXPECT_CODE="IW"
  fi

  # Look for link in exceptions file and make sure its listed result code and wiki page also
  # match. The matching line is split on commas: the text before the first comma is the code and
  # the text after the last comma is the page ("*" stands for any page).
  # -F makes 'grep' treat the URL as a plain string; as a regular expression, the '.', '?' and
  # '[' found in URLs could match the wrong line or be rejected as a bad pattern. "--" protects
  # against a URL that begins with '-'.
  GREP_RESULT=$(grep --max-count=1 -F -- "$URL" "$EXCEPT_FILE")
  EXCEPT_PAGE=${GREP_RESULT##*,}
  if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == "$LOCAL_PAGE_PATH" ]; then
    EXCEPT_CODE=${GREP_RESULT%%,*}
    if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
      valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because its expected result, $EXPECT_CODE, is listed in the exceptions file."
      if [ "$STATUS" == "EI" ]; then
        let SKIP_EXPECT_EI+=1
      elif [ "$STATUS" == "IW" ]; then
        let SKIP_EXPECT_IW+=1
      else
        let SKIP_EXPECT_NG+=1
      fi
      continue
    fi
  fi
fi
864
# If appropriate, record this link to the log, with clickable URLs when possible. Every non-OK
# link is recorded; OK links are recorded only when RECORD_OK_LINKS is 1. Each piece of
# information is written once per report format through valPrint, using plain text, RTF field
# markup and HTML table rows respectively.
if [ "$STATUS" != "OK" ] || [ "$RECORD_OK_LINKS" -eq 1 ]; then
  # Stupid hack since the strings "IW" and "EI" are narrower than "OK", "RD", or "NG" and it takes
  # an extra tab to get to the desired level of indentation in the RTF log: one tab normally, two
  # for "IW" and "EI". (Both branches must not hold the same string or the hack does nothing.)
  RTF_TABS=$'\t'
  if [ "$STATUS" == "IW" ] || [ "$STATUS" == "EI" ]; then
    RTF_TABS=$'\t\t'
  fi

  # Record link and its wiki page in TXT, RTF, and HTML markup
  valPrint t "$STATUS ($CURL_RESULT) $STR_TYPE $URL"
  valPrint t " linked from $FULL_PAGE_PATH"
  valPrint r "$STATUS ($CURL_RESULT)$RTF_TABS$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
  valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
  valPrint hn "<tr><td style=\"white-space:nowrap\">$STATUS ($CURL_RESULT)</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
  valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"

  # Record redirect URL if one was given by a 3xx response page
  if [ "$STATUS" == "RD" ]; then
    valPrint ts " Server suggests $NEW_URL"
    valPrint rs " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
    valPrint hs "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
  fi

  # Notify reader if we can use an intrawiki link for this URL
  if [ "$STATUS" == "EI" ]; then
    # Drop the protocol, the domain and the slash after it, leaving the page name. The wildcards
    # are essential: a literal ":///" prefix never occurs at the start of a URL, so without them
    # the whole URL would end up inside the "[[ ]]" suggestion.
    INTRA_PAGE=${URL#*://*/}
    valPrint ts " Just use [[$INTRA_PAGE]]"
    valPrint rs " Just use [[$INTRA_PAGE]]"
    valPrint hs "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
  fi

  # Notify reader if we can use an interwiki prefix for this URL
  if [ "$STATUS" == "IW" ]; then
    # Everything after the last slash (same result as sed 's/.*\///')
    INTER_PAGE=${URL##*/}
    valPrint ts " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
    valPrint rs " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
    valPrint hs "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
  fi

  # Query Internet Archive for latest "OK" snapshot for "NG" page
  if [ "$STATUS" == "NG" ] && [ "$SUGGEST_SNAPSHOTS" -eq 1 ]; then
    ARCHIVE_QUERY=$(curl --silent --max-time 10 "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")

    # If a "closest" snapshot was received... (substring test, hence the '*' on both sides; an
    # exact comparison with '"closest":' can never match a full API response)
    if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
      # In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
      ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')

      # ...isolate "url" property in the response that follows the "closest" tag
      SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
      SNAPSHOT_URL=${SNAPSHOT_URL#*\"url\": \"} # everything after '"url": "'
      SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'

      # Inform the user of the snapshot URL
      valPrint ts " IA suggests $SNAPSHOT_URL"
      valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
      valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
    else # ...otherwise give generic Wayback Machine link for this URL
      valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
      valPrint rs " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
      valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
    fi
  fi
fi
930
# If this is a page and it seems to be there, take screenshot of it using headless Google Chrome.
# Chrome's output is looked for at $WORKING_DIR/$CHROME_SCREENSHOT and, if found, moved to a
# per-URL file name under SHOT_PATH.
if [ "$IS_FILE" -eq 0 ] && [ "$TAKE_PAGE_SHOT" -eq 1 ] && [ "$STATUS" == "OK" ]; then
  # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
  SHOT_NAME=$(echo "$URL" | sed 's/https*\:\/\///' | sed 'y/:\//__/')
  SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"

  # Don't take screenshot if we already encountered this page and screenshotted it
  if [ ! -f "$SHOT_FILE" ]; then
    # "$URL" is quoted so the shell hands it to Chrome as one unmodified argument
    "$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 "$URL" > /dev/null 2>&1
    if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
      # -n: never overwrite an existing screenshot
      mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
    else
      valPrint trhs "Screenshot of URL $URL seems to have failed!"
    fi
  else
    valPrint trhs "Skipping screenshot of URL $URL because $SHOT_FILE already exists."
  fi
fi
949	done
# The loop over the links has ended. Set FINISHED_LIST to "yes" and hand control to
# wrapupAndExit, which is defined elsewhere in this script. NOTE(review): presumably the flag
# lets wrapupAndExit tell a completed run from an interrupted one -- confirm against that function.
950	FINISHED_LIST="yes"
951	wrapupAndExit

Note: See TracBrowser for help on using the repository browser.

Download in other formats: