Context Navigation

source: Validate External Links/validate_external_links.sh@ 1124

Last change on this file since 1124 was 1124, checked in by iritscen, 5 years ago
Val now removes the annoying ':80' in many Archive links.
File size: 45.4 KB

Line
1	#!/bin/bash
2
3	# Validate External Links by Iritscen
4	# Provided with a list of external links found in the OniGalore wiki, this script validates them.
5	# The resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF
6	# (for reading as a local file with clickable links), and HTML (for uploading as a web page).
7	# Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
8	# Recommended rule:
9	# \|----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----\|
10
11	# Set separator token to newline
12	IFS="
13	"
14
15	### GLOBALS ###
16	# Settings -- these will be changed from their defaults by the arguments passed in to the script
17	LINKS_URL="" # use 'curl' to download file with links from this location (can be file://)
18	EXCEPT_URL="" # ditto above for file with exceptions to NG results
19	OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
20	RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES
21	SHOW_SLASH=0 # record response code to the log when a slash is added to the end of a URL
22	SHOW_HTTPS=0 # record response code to the log when "http" is upgraded to "https"
23	SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
24	TAKE_PAGE_SHOT=0 # take a screenshot of each OK page
25	CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature
26	URL_START=1 # start at this URL in LINKS_FILE (1 by default)
27	URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE
28	UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report
29
30	# Fixed strings -- see the occurrences of these variables to learn their purpose
31	AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36 OPR/67.0.3575.53"
32	ARCHIVE_API="http://archive.org/wayback/available"
33	ARCHIVE_GENERIC="https://web.archive.org/web/*"
34	ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
35	CHROME_SCREENSHOT="screenshot.png"
36	CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt"
37	EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
38	HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt"
39	MY_WIKI_PAGE="https://wiki.oni2.net/User:Iritscen"
# Absolute path of the directory that holds this script. The path pieces are quoted so that glob
# characters in them are never expanded, and 'pwd' only runs if 'cd' succeeded, so a failed 'cd'
# leaves THIS_DIR empty instead of silently reporting the caller's current directory.
THIS_DIR=$(cd -- "$(dirname -- "$0")" && pwd)
41	WORKING_DIR=$(pwd)
42	WIKI_PATH="wiki.oni2.net"
43
44	# These are parallel arrays of the IDs and names of OniGalore's current namespaces
45	declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
46	declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")
47
48	# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
49	# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
50	declare -a HTTP_FILES=(txt zip wmv jpg png m4a bsl rar oni mp3 mov ONWC vbs TRMA mp4 doc avi log gif pdf dmg exe cpp tga 7z wav east BINA xml dll dae xaf fbx 3ds blend flv csv)
51	declare -a HTTP_TLDS_AND_PAGES=(com net org uk ru de it htm html php pl asp aspx shtml pgi cgi php3 x jsp phtml cfm css action stm js)
52
53	# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
54	# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
55	# if you add a new code.
56	declare -a OK_CODES=(200 401 405 406 501)
57	declare -a RD_CODES=(301 302 303 307 308)
58	declare -a NG_CODES=(000 403 404 410 500 503)
59
60	# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
61	# transcluded text, and if the transclusion fails, then the braces show up in the URL
62	ILLEGAL_CHARS="{ }"
63
64	# The shortest URL possible, used for sanity-checking some URLs: http://a.co
65	MIN_URL_LENGTH=11
66
67	# These are parallel arrays giving the prefixes that can be used in place of normal external links to
68	# some wikis and other sites
69	declare -a INTERWIKI_PREFIXES=(commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies wikiversity wikivoyage wikt wp)
70	declare -a INTERWIKI_DOMAINS=(commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org wikiversity.org wikivoyage.org wiktionary.org wikipedia.org)
71
72	# Variables for keeping track of main loop progress and findings
73	LINK_NUM=0
74	EI_LINKS=0
75	IW_LINKS=0
76	OK_LINKS=0
77	RD_LINKS=0
78	NG_LINKS=0
79	SKIP_UNK_NS=0
80	SKIP_JS_PAGE=0
81	SKIP_BAD_URL=0
82	SKIP_NON_ASCII=0
83	SKIP_UNK_SUFFIX=0
84	SKIP_UNK_CODE=0
85	SKIP_EXPECT_NG=0
86	SKIP_EXPECT_EI=0
87	SKIP_EXPECT_IW=0
88	SKIP_HTTPS_UP=0
89	SKIP_SLASH_ADD=0
90	FILE_LINKS=0
91	PAGE_LINKS=0
92	SKIPPED_HEADER_ROW=0
93	FINISHED_LIST="no"
94	START_RUN=0
95	END_RUN=0
96
97
98	### HELP ###
# A pseudo-man page for the script, written to stdout. Every option accepted by the argument
# parser is listed in both SYNOPSIS and OPTIONS. Here is the 80-character rule for the page text:
# 234567890123456789012345678901234567890123456789012345678901234567890123456789
function printHelp()
{
  # The delimiter is quoted so that nothing in the page text can ever be expanded by the shell
  cat << "EOF"

NAME
       Validate External Links

SYNOPSIS
       validate_external_links.sh --help
       validate_external_links.sh --links URL --output DIR [--exceptions URL]
          [--record-ok-links] [--show-added-slashes] [--show-https-upgrade]
          [--suggest-snapshots] [--take-screenshots FILE] [--start-url NUM]
          [--end-url NUM] [--upload FILE]

DESCRIPTION
       This script parses a list of external links found in the OniGalore wiki
       (which is dumped by the Oni2.net domain periodically in a particular
       format), validates them using the Unix tool 'curl', and produces a report
       of which links were "OK" (responded positively to an HTTP query), which
       were "RD" (responded with a 3xx redirect code), which could be "IW"
       (interwiki) links, which are "EI" (external internal) links and could be
       intrawiki links, and which were "NG" (no good; a negative response to the
       query). This report can then be automatically uploaded to the location of
       your choice. The script can also suggest Internet Archive snapshots for
       "NG" links, and take screenshots of "OK" links for visual verification by
       the reader that the page in question is the one intended to be displayed.

       You must pass this script the URL at which the list of links is found
       (--links) and the path where the directory of logs should be outputted
       (--output). All other arguments are optional.

OPTIONS
       --help                  Show this page.
       --links URL             (required) URL from which to download the CSV
                               file with external links. Note that this URL can
                               be a local file if you supply a file:// path.
       --output DIR            (required) Unix path to directory in which Val
                               should place its reports.
       --exceptions URL        In order to remove links from the report which
                               Val finds an issue with, but which you regard as
                               OK, list those desired exceptions in this file.
                               See the sample file exceptions.txt for details.
                               Note that this URL can point to a local file if
                               you supply a file:// path.
       --record-ok-links       Log a link in the report even if its response
                               code is "OK".
       --show-added-slashes    Report on redirects that simply add a '/' to the
                               end of the URL.
       --show-https-upgrade    Report on redirects that simply upgrade a
                               "http://" URL to a "https://" URL.
       --suggest-snapshots     Query the Internet Archive for a possible
                               snapshot URL for each "NG" page.
       --take-screenshots FILE Call the Google Chrome binary at this path to
                               take screenshots of each "OK" page.
       --start-url NUM         Start at this link in the links CSV file.
       --end-url NUM           Stop at this link in the links CSV file.
       --upload FILE           Upload report using the credentials and path
                               given in this local text file. See sftp_login.txt
                               for template.

BUGS
       The script cannot properly parse any line in the external links file
       which contains a comma in the name of the wiki page containing a link.
       Commas in the link itself are not an issue.
EOF
}
166
167
168	### SETUP ###
# Show the help page and quit when the script was started with no arguments at all, or when the
# first argument is a request for help
if [[ $# -eq 0 || "$1" == "--help" ]]; then
  printHelp | less
  exit 0
fi
174
# Abort with exit code 1 unless the option named in parameter 1 is followed by a value. Call this
# with the whole remaining argument list.
function requireArgValue()
{
  if (( $# < 2 )); then
    echo "Argument $1 requires a value. Aborting."
    exit 1
  fi
}

# Abort with exit code 1 unless parameter 2, the value given for the option named in parameter 1,
# is a whole number. The value is used in numeric tests, which reject anything else.
function requireArgNumber()
{
  if [[ ! $2 =~ ^[0-9]+$ ]]; then
    echo "Argument $1 requires a whole number, but I received \"$2\". Aborting."
    exit 1
  fi
}

# Parse the given command-line arguments into the settings globals (LINKS_URL, EXCEPT_URL,
# OUTPUT_DIR, RECORD_OK_LINKS, SHOW_SLASH, SHOW_HTTPS, SUGGEST_SNAPSHOTS, TAKE_PAGE_SHOT,
# CHROME_PATH, URL_START, URL_LIMIT, UPLOAD_INFO). Exits with code 1 on an unknown argument, on an
# option that is missing its value, or on a non-numeric --start-url/--end-url.
function parseArgs()
{
  # Parse arguments as long as there are more arguments to process. Options that take a value are
  # checked first, because 'shift 2' fails without shifting anything when only one argument is
  # left, which would otherwise keep this loop spinning forever.
  while (( $# > 0 )); do
    case "$1" in
      --links )              requireArgValue "$@"; LINKS_URL="$2"; shift 2;;
      --exceptions )         requireArgValue "$@"; EXCEPT_URL="$2"; shift 2;;
      --output )             requireArgValue "$@"; OUTPUT_DIR="$2"; shift 2;;
      --record-ok-links )    RECORD_OK_LINKS=1; shift;;
      --show-added-slashes ) SHOW_SLASH=1; shift;;
      --show-https-upgrade ) SHOW_HTTPS=1; shift;;
      --suggest-snapshots )  SUGGEST_SNAPSHOTS=1; shift;;
      --take-screenshots )   requireArgValue "$@"; TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
      --start-url )          requireArgValue "$@"; requireArgNumber "$1" "$2"; URL_START=$2; shift 2;;
      --end-url )            requireArgValue "$@"; requireArgNumber "$1" "$2"; URL_LIMIT=$2; shift 2;;
      --upload )             requireArgValue "$@"; UPLOAD_INFO=$2; shift 2;;
      * )                    echo "Invalid argument $1 detected. Aborting."; exit 1;;
    esac
  done
}
parseArgs "$@"
192
# If the required arguments were not supplied, say so and quit. Every value is quoted so that each
# test sees exactly one operand, whether the value is empty or contains glob characters.
if [[ -z "$LINKS_URL" || -z "$OUTPUT_DIR" ]]; then
  echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
  exit 2
fi

# If user wants screenshots, make sure path to Chrome was passed in and is valid
if [[ "$TAKE_PAGE_SHOT" -eq 1 && ! -f "$CHROME_PATH" ]]; then
  echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
  exit 3
fi

# Check that UPLOAD_INFO exists, if this argument was supplied
if [[ -n "$UPLOAD_INFO" && ! -f "$UPLOAD_INFO" ]]; then
  echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
  exit 4
fi

# Check that OUTPUT_DIR is a directory
if [[ ! -d "$OUTPUT_DIR" ]]; then
  echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
  exit 5
fi
218
# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
SHOT_PATH="$OUTPUT_PATH/Screenshots"
LOG_NAME="ValExtLinks report"
LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt"
LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf"
LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm"

# Create the session folder and check that 'mkdir' succeeded before putting anything inside it.
# The folder is created inside OUTPUT_DIR, so that is the directory named in the error message.
mkdir "$OUTPUT_PATH"
if [ ! -d "$OUTPUT_PATH" ]; then
  echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_DIR. Aborting."
  exit 6
fi

# The screenshot folder is only needed when screenshots were requested
if [ "$TAKE_PAGE_SHOT" -eq 1 ]; then
  mkdir "$SHOT_PATH"
  if [ ! -d "$SHOT_PATH" ]; then
    echo "Error: I could not create the folder \"Screenshots\" inside the directory $OUTPUT_PATH. Aborting."
    exit 6
  fi
fi
239
# Get date on the file at LINKS_URL, to be printed in the log headers. The header name is matched
# without regard to case because some servers send it in lowercase, only the first match is kept,
# and the carriage return that ends each HTTP header line is removed so it does not end up in the
# logs.
LINKS_DATE=$(curl --silent --head "$LINKS_URL" | grep -i -m 1 "^Last-Modified:" | tr -d '\r')
if [ -z "$LINKS_DATE" ]; then
  echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
  exit 7
fi

# Keep only the value: drop the header name through its colon, then any blanks before the date
LINKS_DATE=${LINKS_DATE#*:}
LINKS_DATE=${LINKS_DATE#"${LINKS_DATE%%[![:space:]]*}"}
247
248
249	### UTILITY FUNCTIONS ###
# Writes the plain-text report header to the TXT log file: title, generation time, date of the
# links data, author contact, and a closing blank line
function printTXTheader()
{
  local headerLine
  for headerLine in \
    "Validate External Links report" \
    "generated $NICE_TIME" \
    "from data of $LINKS_DATE" \
    "script by Iritscen (contact: $MY_WIKI_PAGE)" \
    ""; do
    valPrint t "$headerLine"
  done
}
259
# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified.
# The header is assembled from three pieces: the fixed document preamble, the centered title block
# (the only piece that contains variables), and the fixed switch back to left-justified text.
function printRTFheader()
{
  local preamble='{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww12600\viewh12100\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0
'
  local titleBlock="
\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
generated $NICE_TIME\\
from data of $LINKS_DATE\\
script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$MY_WIKI_PAGE\"}}{\fldrslt contact}})
"
  local leftJustify='\
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
\cf0 '

  valPrint r "${preamble}${titleBlock}${leftJustify}"
}
278
# Closes the RTF markup of the RTF log file by writing the brace that ends the document group
function printRTFfooter()
{
  local closingBrace="}"
  valPrint r "$closingBrace"
}
284
# Writes the HTML header to HTML log file: the opening of the document, the page title, and a
# heading block with the generation time, the date of the links data and the author contact link
function printHTMheader()
{
  local reportTitle="Validate External Links report"
  local pageTop="<html>
<head>
<title>$reportTitle</title>
</head>
<body>
<h2>$reportTitle</h2>
<h3>generated $NICE_TIME<br />
from data of $LINKS_DATE<br />
script by Iritscen (<a href=\"$MY_WIKI_PAGE\" target=\"_blank\">contact</a>)</h3>"

  valPrint h "$pageTop"
}
298
# Closes the HTML markup of the HTML log file by ending the body and the document
function printHTMfooter()
{
  valPrint h $'</body>\n</html>'
}
305
# The central logging function.
# Parameter 1 is a string composed of one or more flag characters, in any order, that say where and
# how to print:
#   c = console, t = the TXT log, r = the RTF log, h = the HTML log
#   n = don't print a newline at the end of the line
#   s = print an extra newline at the end
#   w = don't pass console output through 'fmt' ('fmt' fits the output to an 80-column CLI but can
#       break special formatting and the 'n' option)
# Parameter 2 is the text to print.
# Each flag is looked for anywhere inside parameter 1 (hence the *x* patterns), so that combined
# flags such as "ctrh" or "cn" reach every output and modifier they name. The logs are appended to
# the files named by LOG_TXT, LOG_RTF and LOG_HTM.
function valPrint()
{
  local flags=$1
  local text=$2

  # Console
  if [[ $flags == *c* ]]; then
    if [[ $flags == *n* ]]; then
      printf '%s' "$text"
    elif [[ $flags == *w* ]]; then
      printf '%s\n' "$text"
    elif [[ $flags == *s* ]]; then
      echo -e "$text\n"
    else
      printf '%s\n' "$text" | fmt -w 80
    fi
  fi

  # TXT log
  if [[ $flags == *t* ]]; then
    if [[ $flags == *n* ]]; then
      printf '%s' "$text" >> "$LOG_TXT"
    elif [[ $flags == *s* ]]; then
      echo -e "$text\n" >> "$LOG_TXT"
    else
      printf '%s\n' "$text" >> "$LOG_TXT"
    fi
  fi

  # RTF log, where "\line" is the markup for a line break ('n' omits that markup)
  if [[ $flags == *r* ]]; then
    if [[ $flags == *n* ]]; then
      printf '%s\n' "$text" >> "$LOG_RTF"
    elif [[ $flags == *s* ]]; then
      printf '%s\\line\\line\n' "$text" >> "$LOG_RTF"
    else
      printf '%s\\line\n' "$text" >> "$LOG_RTF"
    fi
  fi

  # HTML log ('s' adds a spacer table row, 'n' omits the line-break tag)
  if [[ $flags == *h* ]]; then
    if [[ $flags == *s* ]]; then
      printf '%s<tr><td> </td></tr>\n' "$text" >> "$LOG_HTM"
    elif [[ $flags == *n* ]]; then
      printf '%s\n' "$text" >> "$LOG_HTM"
    else
      printf '%s<br />\n' "$text" >> "$LOG_HTM"
    fi
  fi
}
351
# Pluralize the string in parameter 1 if the number in parameter 2 is not 1. Nouns ending in 'x'
# take "es" and all others take "s". A missing or empty number is treated as 0, so the noun is
# pluralized rather than the numeric test failing on a missing operand.
function pluralCheckNoun()
{
  local noun=$1
  local count=${2:-0}

  if [ "$count" -eq 1 ]; then
    printf '%s\n' "$noun"
  elif [[ $noun == *x ]]; then
    printf '%s\n' "${noun}es"
  else
    printf '%s\n' "${noun}s"
  fi
}
365
# Output "is" if parameter 1 is 1, otherwise "are"
function pluralCheckIs()
{
  # Any number other than exactly one takes the plural verb
  [ "$1" -ne 1 ] && { echo "are"; return; }
  echo "is"
}
375
# Output "was" if parameter 1 is 1, otherwise "were"
function pluralCheckWas()
{
  # Start from the singular verb and switch to the plural for any other number
  local verb="was"
  if [ "$1" -ne 1 ]; then
    verb="were"
  fi
  echo "$verb"
}
385
# Output "a " if parameter 1 is 1, otherwise nothing
function pluralCheckA()
{
  # Only a single item takes the article; everything else prints nothing
  [ "$1" -eq 1 ] || return 0
  echo "a "
}
393
# Output "an " if parameter 1 is 1, otherwise nothing
function pluralCheckAn()
{
  # Only a single item takes the article; everything else prints nothing
  case $( [ "$1" -eq 1 ] && echo single ) in
    single ) echo "an ";;
  esac
}
401
# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
# reports being saved to disk have already been closed.
# The file named by UPLOAD_INFO is read for one line each starting with "user:", "pw:", "port:" and
# "path:"; whatever follows the marker on that line is the value. The values are then handed, in
# that order, to the 'expect' script found next to this script, between the local path of the HTML
# log and the file name to give the upload.
# Every value is passed as its own quoted argument, so an empty field cannot shift the later ones
# and glob characters in a password are never expanded.
# NOTE(review): the password travels on the command line of 'expect', where it may be visible to
# other local users in the process list -- confirm this is acceptable.
function uploadReport()
{
  valPrint c "Uploading HTML report..."

  SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
  SFTP_USER_NAME_MARKER="user:"
  SFTP_PASSWORD_MARKER="pw:"
  SFTP_PORT_MARKER="port:"
  SFTP_PATH_MARKER="path:"

  # Markers are only recognized at the start of a line, which is also the only place where they
  # can be stripped from, and only the first matching line is used
  SFTP_USER_NAME=$(grep -m 1 -- "^$SFTP_USER_NAME_MARKER" "$UPLOAD_INFO")
  SFTP_USER_NAME=${SFTP_USER_NAME#"$SFTP_USER_NAME_MARKER"}
  SFTP_PASSWORD=$(grep -m 1 -- "^$SFTP_PASSWORD_MARKER" "$UPLOAD_INFO")
  SFTP_PASSWORD=${SFTP_PASSWORD#"$SFTP_PASSWORD_MARKER"}
  SFTP_PORT=$(grep -m 1 -- "^$SFTP_PORT_MARKER" "$UPLOAD_INFO")
  SFTP_PORT=${SFTP_PORT#"$SFTP_PORT_MARKER"}
  SFTP_PATH=$(grep -m 1 -- "^$SFTP_PATH_MARKER" "$UPLOAD_INFO")
  SFTP_PATH=${SFTP_PATH#"$SFTP_PATH_MARKER"}

  expect "$SCRIPT_PATH" "$LOG_HTM" "$SFTP_USER_NAME" "$SFTP_PASSWORD" "$SFTP_PORT" "$SFTP_PATH" "$LOG_NAME.htm"

  valPrint c "Report was uploaded, unless an error message appears above."
}
426
# Prints session summary when script is done, closes the markup of the log files, uploads the
# report if that was requested (and the session was not canceled), and exits with code 0. Also
# installed below as the handler for an interrupt from the user.
# Reads the link counters and FINISHED_LIST ("yes" = whole list done, "limit" = stopped at the
# requested last link, "no" = stopped before the end).
function wrapupAndExit()
{
  # Get off progress line on console, drop down a line from last link in log, and close HTML table
  valPrint ctr ""
  valPrint h "</table><br />"

  # If we didn't finish processing the last URL, then the iterator is one too high
  if [ "$FINISHED_LIST" != "yes" ]; then
    LINK_NUM=$((LINK_NUM - 1))
    if [ "$FINISHED_LIST" == "no" ]; then
      valPrint ctrh "The session was canceled by the user."
    fi
  fi

  # Generate string with elapsed time. START_RUN is still 0 if we are quitting before the main
  # loop began, in which case no time has elapsed (rather than all the time since the epoch).
  END_RUN=$(date +%s)
  local elapsedSecs=0
  if (( START_RUN > 0 )); then
    elapsedSecs=$((END_RUN - START_RUN))
  fi
  printf -v ELAPSED '%d min. %d sec. elapsed' "$((elapsedSecs / 60))" "$((elapsedSecs % 60))"

  # Do some math on results of session. Nothing was processed if we stopped before reaching the
  # first requested link, so the count is never allowed to go negative.
  LINKS_PROCESSED=$((LINK_NUM - URL_START + 1))
  if (( LINKS_PROCESSED < 0 )); then
    LINKS_PROCESSED=0
  fi
  LINK_PROBLEMS=$((EI_LINKS + IW_LINKS + RD_LINKS + NG_LINKS))
  LINK_ERRORS=$((SKIP_UNK_NS + SKIP_JS_PAGE + SKIP_BAD_URL + SKIP_NON_ASCII + SKIP_UNK_SUFFIX + SKIP_UNK_CODE))
  LINKS_EXCEPTED=$((SKIP_EXPECT_NG + SKIP_EXPECT_EI + SKIP_EXPECT_IW))
  TRIVIAL_RDS=$((SKIP_SLASH_ADD + SKIP_HTTPS_UP))
  LINKS_CHECKED=$((LINKS_PROCESSED - LINK_ERRORS))

  # The total is not known yet if we are quitting before the links file was counted
  local totalLinks=${LINK_COUNT:-0}

  # Print summary header
  valPrint ct "Summary ($ELAPSED):"
  valPrint r "\b1 Summary \b0 ($ELAPSED)"
  valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
  valPrint ctrh "I finished processing $LINKS_PROCESSED of $totalLinks $(pluralCheckNoun link "$totalLinks") (there $(pluralCheckWas "$FILE_LINKS") $FILE_LINKS file $(pluralCheckNoun link "$FILE_LINKS") and $PAGE_LINKS page $(pluralCheckNoun link "$PAGE_LINKS"))."

  # Print processed link totals. The indented sub-lines need non-breaking spaces in HTML, where
  # ordinary leading spaces would be collapsed, so they are printed to the HTML log separately.
  if (( LINKS_PROCESSED > 0 )); then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link "$LINKS_PROCESSED"):"; fi
  if (( LINK_ERRORS > 0 )); then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link "$LINK_ERRORS") could not be processed"; fi
  if (( LINK_PROBLEMS > 0 )); then valPrint ctrh "- $LINK_PROBLEMS processed $(pluralCheckNoun link "$LINK_PROBLEMS") had $(pluralCheckAn "$LINK_PROBLEMS")$(pluralCheckNoun issue "$LINK_PROBLEMS")"; fi
  if (( LINKS_EXCEPTED > 0 )); then
    valPrint ctr " (excepted $LINKS_EXCEPTED $(pluralCheckNoun link "$LINKS_EXCEPTED") from report)"
    valPrint h "&nbsp;&nbsp;(excepted $LINKS_EXCEPTED $(pluralCheckNoun link "$LINKS_EXCEPTED") from report)"
  fi
  if (( OK_LINKS > 0 )); then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link "$OK_LINKS") $(pluralCheckWas "$OK_LINKS") OK"; fi
  if (( TRIVIAL_RDS > 0 )); then
    valPrint ctr " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection "$TRIVIAL_RDS") as OK)"
    valPrint h "&nbsp;&nbsp;(counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection "$TRIVIAL_RDS") as OK)"
  fi

  # Print excepted link totals
  if (( LINKS_EXCEPTED > 0 )); then valPrint ctrh "$LINKS_EXCEPTED $(pluralCheckNoun link "$LINKS_EXCEPTED") excepted (see RTF or TXT report for specific links):"; fi
  if (( SKIP_EXPECT_NG > 0 )); then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link "$NG_LINKS")"; fi
  if (( SKIP_EXPECT_EI > 0 )); then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link "$EI_LINKS")"; fi
  if (( SKIP_EXPECT_IW > 0 )); then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link "$IW_LINKS")"; fi

  # Print errored link totals
  if (( LINK_ERRORS > 0 )); then valPrint ctrh "$LINK_ERRORS link $(pluralCheckNoun error "$LINK_ERRORS") (see RTF or TXT report for specific links):"; fi
  if (( SKIP_UNK_NS > 0 )); then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace "$SKIP_UNK_NS")"; fi
  if (( SKIP_JS_PAGE > 0 )); then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link "$SKIP_JS_PAGE") on $(pluralCheckA "$SKIP_JS_PAGE")JavaScript $(pluralCheckNoun page "$SKIP_JS_PAGE")"; fi
  if (( SKIP_BAD_URL > 0 )); then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL "$SKIP_BAD_URL")"; fi
  if (( SKIP_NON_ASCII > 0 )); then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL "$SKIP_NON_ASCII")"; fi
  if (( SKIP_UNK_SUFFIX > 0 )); then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix "$SKIP_UNK_SUFFIX")"; fi
  if (( SKIP_UNK_CODE > 0 )); then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code "$SKIP_UNK_CODE")"; fi

  # Print checked link totals
  if (( LINK_PROBLEMS > 0 )); then valPrint ctrh "$LINK_PROBLEMS link $(pluralCheckNoun issue "$LINK_PROBLEMS"):"; fi
  if (( NG_LINKS > 0 )); then valPrint ctrh "- $NG_LINKS NG $(pluralCheckNoun link "$NG_LINKS")"; fi
  if (( RD_LINKS > 0 )); then valPrint ctrh "- $RD_LINKS $(pluralCheckNoun redirection "$RD_LINKS")"; fi
  if (( EI_LINKS > 0 )); then valPrint ctrh "- $EI_LINKS $(pluralCheckNoun link "$EI_LINKS") that could be intrawiki"; fi
  if (( IW_LINKS > 0 )); then valPrint ctrh "- $IW_LINKS $(pluralCheckNoun link "$IW_LINKS") that could be interwiki"; fi

  # Close the log files' markup
  valPrint trh "ValExtLinks says goodbye."
  printRTFfooter
  printHTMfooter

  # Upload report if this was requested
  if [ -n "$UPLOAD_INFO" ] && [ "$FINISHED_LIST" != "no" ]; then
    uploadReport
  fi

  # Really quit now
  valPrint c "ValExtLinks says goodbye."
  exit 0
}
trap wrapupAndExit INT
505
506
507	### INITIALIZATION ###
# Print opening message to console and log files
valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
printTXTheader
printRTFheader
printHTMheader

# Attempt to download file at LINKS_URL, then check that it succeeded. The file keeps the name it
# has at the end of the URL. Both the exit status of 'curl' and the presence of the file are
# checked, so that a transfer which failed part-way is not mistaken for a complete list.
valPrint t "Config:"
valPrint r "\b1 Config \b0"
valPrint hn "<h3>Config</h3>"
valPrint cwtrh "Downloading list of external links from $LINKS_URL."
LINKS_FILE_NAME=${LINKS_URL##*/}
LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
if ! curl --silent -o "$LINKS_FILE" "$LINKS_URL" || [ ! -f "$LINKS_FILE" ]; then
  echo "The download of $LINKS_URL appears to have failed. Aborting."
  wrapupAndExit
fi

# Attempt to download file at EXCEPT_URL, then check that it succeeded in the same way
if [ -n "$EXCEPT_URL" ]; then
  valPrint cwtrh "Downloading list of reporting exceptions from $EXCEPT_URL."
  EXCEPT_FILE_NAME=${EXCEPT_URL##*/}
  EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
  if ! curl --silent -o "$EXCEPT_FILE" "$EXCEPT_URL" || [ ! -f "$EXCEPT_FILE" ]; then
    echo "The download of $EXCEPT_URL appears to have failed. Aborting."
    wrapupAndExit
  fi
fi
538
# Count the lines in LINKS_FILE. 'grep -c' with an empty pattern matches every line, including a
# last line that does not end in a newline (which the main loop will still process), and prints
# only the number, with no file name and no padding.
LINK_COUNT_STRING=$(grep -c '' < "$LINKS_FILE")

# Number of URLs is number of lines minus one (first line is column header row for the CSV)
LINK_COUNT=$((LINK_COUNT_STRING - 1))

# Calculate number of URLs to consider
if [ "$URL_LIMIT" -ne 0 ] && [ "$URL_LIMIT" -lt "$LINK_COUNT" ]; then
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((URL_LIMIT-URL_START+1)) of them, from link $URL_START to $URL_LIMIT."
elif [ "$URL_START" -ne 1 ]; then
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((LINK_COUNT-URL_START+1)) of them, from link $URL_START to $LINK_COUNT."
else
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering all of them."
fi
554
# Print settings to console and log. The paragraph is one fixed text with four variable phrases,
# each of which starts at the wording for "option turned on" and is swapped when the option is off.
SETTINGS_SHOTS="and will"
SETTINGS_OK="also"
SETTINGS_IA="will"
SETTINGS_EXCEPT="I will not report links that are listed in the exceptions file."
if [ "$TAKE_PAGE_SHOT" -eq 0 ]; then
  SETTINGS_SHOTS="but will not"
fi
if [ "$RECORD_OK_LINKS" -eq 0 ]; then
  SETTINGS_OK="not"
fi
if [ "$SUGGEST_SNAPSHOTS" -eq 0 ]; then
  SETTINGS_IA="will not"
fi
# No exceptions sentence without an exceptions file, nor when OK links are being recorded
if [ -z "$EXCEPT_URL" ] || [ "$RECORD_OK_LINKS" -eq 1 ]; then
  SETTINGS_EXCEPT=""
fi
SETTINGS_STR="I will be checking the response code of each link $SETTINGS_SHOTS take a screenshot of each page. Pages that are OK will $SETTINGS_OK be logged. I $SETTINGS_IA ask the Internet Archive for a suggested snapshot URL for each NG page. $SETTINGS_EXCEPT"
valPrint ctrh "$SETTINGS_STR"
valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
valPrint trh ""
566
567	# Print legend to logs
568	valPrint t "Legend:"
569	valPrint r "\b1 Legend \b0"
570	valPrint hn "<h3>Legend</h3>"
571	valPrint trh "OK = URL seems to be working."
572	valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it, because false negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to Iritscen. An NG link should be followed by a link to the Internet Archive's Wayback Machine which may help you repair the link. If the link cannot be repaired, you can disable it on the wiki (which prevents it from showing up in future ValExtLinks reports) by wrapping it in nowiki tags."
573	valPrint trh "RD = The server responding to this URL is saying that the page moved and you should instead use the supplied new URL. Some RD links represent minor adjustments in the organization of a web site, and some are soft 404s (the file/page has been removed and you are being redirected to something like the main page of the web site). You will have to look at the new URL yourself to determine if it represents an OK link and the link on the wiki should be updated to this one, or if the desired file/page is actually gone and we need to replace the wiki link with an Internet Archive snapshot link -- or disable the URL if it has not been archived."
574	valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup."
575	valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup."
576	valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $HTTP_CODES)."
577	valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$HTTP_CODES\"}}{\fldrslt here}} for code reference)."
578	valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$HTTP_CODES\" target=\"_blank\">here</a> for code reference)."
579	valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $CURL_CODES)."
580	valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$CURL_CODES\"}}{\fldrslt here}} for code reference)."
581	valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$CURL_CODES\" target=\"_blank\">here</a> for code reference)."
582	valPrint trh "IA suggests = Last available snapshot suggested by the Internet Archive."
583	valPrint trh "Try browsing = The Archive occasionally fails to return a snapshot URL even when one exists, so you will need to check for a snapshot manually using this link to the Wayback Machine before concluding that a site has not been archived."
584	valPrint trh ""
585
586
587	### MAIN LOOP ###
588	valPrint t "Links:"
589	valPrint r "\b1 Links \b0"
590	valPrint hn "<h3>Links</h3>"
591	START_RUN=$(date +%s)
592	# Process each line of the .csv in LINKS_FILE
593	for LINE in `cat "$LINKS_FILE"`; do
594	let LINK_NUM+=1
595
596	# First line is the column header row for the CSV, so let's verify that the format hasn't changed
597	if [ $SKIPPED_HEADER_ROW -eq 0 ]; then
598	if [ $LINE == "namespace,title,target" ]; then
599	SKIPPED_HEADER_ROW=1
600	LINK_NUM=0 # this line is it's not a link, so reset the link counter
601	valPrint hn "<table>"
602	continue
603	else
604	valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
605	wrapupAndExit
606	fi
607	fi
608
609	# Skip this link if we are not at URL_START yet
610	if [ $LINK_NUM -lt $URL_START ]; then
611	continue
612	fi
613
614	# Stop if we are at the limit declared for testing purposes
615	if [ $URL_LIMIT -gt 0 ] && [ $LINK_NUM -gt $URL_LIMIT ]; then
616	FINISHED_LIST="limit"
617	wrapupAndExit
618	fi
619
620	# Print progress to screen
621	if [ $LINK_NUM -gt 1 ]; then
622	printf "\e[1A\n" # erase previous progress message so that new one appears in its place
623	fi
624	valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
625
626	# The number of the namespace is the element before the first comma on the line
627	NS_ID=${LINE%%,*}
628
629	# Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
630	NS_NAME=""
631	a=0
632	while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
633	if [ $NS_ID == "NULL" ]; then
634	break
635	elif [ $NS_ID -eq ${NS_IDS[$a]} ]; then
636	NS_NAME="${NS_NAMES[$a]}"
637	break
638	fi
639	let a+=1
640	done
641	if [ "$NS_NAME" == "" ]; then
642	if [ $NS_ID == "NULL" ]; then
643	valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace (and probably the page too) is 'NULL'. Probably the link is no longer in existence on the wiki."
644	else
645	valPrint trs "Skipping line $LINK_NUM ('$LINE') because I could not find a name for namespace ID $NS_ID."
646	fi
647	let SKIP_UNK_NS+=1
648	continue
649	fi
650
651	# The name of the page is everything between the namespace ID and the next comma on the line (commas
652	# in page names will break this)
653	PAGE_NAME=${LINE#$NS_ID,}
654	PAGE_NAME=${PAGE_NAME%%,*}
655
656	# We don't want to consider wiki pages ending in .js, as the parser cannot reliably isolate URLS in
657	# JavaScript code, so it will return erroneous links
658	PAGE_NAME_SUFFIX=$(echo $PAGE_NAME \| sed 's/.*\.//')
659	if [ $PAGE_NAME_SUFFIX == "js" ]; then
660	valPrint trs "Skipping URL '${LINE#$NS_ID,$PAGE_NAME,}' on line $LINK_NUM because it was found on JavaScript page '$PAGE_NAME'."
661	let SKIP_JS_PAGE+=1
662	continue
663	fi
664
665	# Build longer wiki page URLs from namespace and page names
666	FULL_PAGE_PATH=https://$WIKI_PATH/$NS_NAME:$PAGE_NAME
667	LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
668	# Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it
669	# explicitly breaks the link
670	if [ $NS_ID -eq 0 ]; then
671	FULL_PAGE_PATH=https://$WIKI_PATH/$PAGE_NAME
672	LOCAL_PAGE_PATH=$PAGE_NAME
673	fi
674
# The URL being linked to is everything after the previous two fields (this allows commas to be in
# the URLs, but a comma in the previous field, the page name, will break this)
URL=${LINE#"$NS_ID,$PAGE_NAME,"}

# Scan for illegal characters. The asterisks make this a "contains" test: the URL is rejected if any
# one of the characters in ILLEGAL_CHARS appears anywhere in it. An empty ILLEGAL_CHARS would make
# the bracket expression malformed, so the scan is skipped in that case.
if [[ -n $ILLEGAL_CHARS && $URL == *[$ILLEGAL_CHARS]* ]]; then
   valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because it contains characters illegal in a URL."
   (( SKIP_BAD_URL += 1 ))
   continue
fi
685
# Now we need to know if the URL is for a file or a web page. First step is to determine if the
# URL ends in a suffix
HAS_SUFFIX=0

# If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
CLEAN_URL=${URL%%\?*}

# If the URL ends in something like "#section_15", strip everything from the '#' onward
CLEAN_URL=${CLEAN_URL%%\#*}

# URLs with non-ASCII characters are not processed by this script; skip this URL and make the user
# check it. The asterisks make this match a non-ASCII character anywhere in the URL.
if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
   valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I cannot handle non-ASCII characters."
   (( SKIP_NON_ASCII += 1 ))
   continue
fi

# Isolate the characters after the last period and after the last slash (if the character is absent,
# the whole string is kept)
POST_DOT=${CLEAN_URL##*.}
POST_SLASH=${CLEAN_URL##*/}

# If the last period comes after the last slash, then the URL ends in a suffix; in that case the
# text following the period is shorter than the text following the slash
POST_DOT_LENGTH=${#POST_DOT}
POST_SLASH_LENGTH=${#POST_SLASH}
if [ "$POST_DOT_LENGTH" -lt "$POST_SLASH_LENGTH" ]; then
   HAS_SUFFIX=1
else
   HAS_SUFFIX=0
fi
715
# Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
# known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES. IS_FILE is 1 for a file, 0 for a
# page, and stays at -1 if the suffix could not be identified.
IS_FILE=-1
if [ "$HAS_SUFFIX" -eq 0 ]; then
   IS_FILE=0
else
   # Turn off case sensitivity while we compare suffixes
   shopt -s nocasematch

   # Special case: URLs ending in something like "/productID.304297400" are pages, not files, so if
   # the URL's suffix is all numbers, we are looking at the end of a web page URL
   if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
      IS_FILE=0
   fi

   # If we did not identify this URL as a web page above, we need to compare the suffix against known
   # file extensions. The suffix is quoted so it is compared as literal text, not as a glob pattern;
   # double brackets still respect the nocasematch setting.
   if [ "$IS_FILE" -eq -1 ]; then
      for EXTENSION in "${HTTP_FILES[@]}"; do
         if [[ $EXTENSION == "$POST_DOT" ]]; then
            IS_FILE=1
            break
         fi
      done
   fi

   # If we did not identify this URL as a file above, we need to compare the suffix against known
   # pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
   # needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
   if [ "$IS_FILE" -eq -1 ]; then
      for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
         if [[ $PAGE_OR_TLD == "$POST_DOT" ]]; then
            IS_FILE=0
            break
         fi
      done
   fi

   # Turn case sensitivity back on in Bash
   shopt -u nocasematch
fi

# If this suffix escaped identification as either a file, page or TLD, inform the user; otherwise
# note the link type for the report and count it
STR_TYPE=""
if [ "$IS_FILE" -eq -1 ]; then
   valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown URL ending '$POST_DOT'. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
   (( SKIP_UNK_SUFFIX += 1 ))
   continue
elif [ "$IS_FILE" -eq 1 ]; then
   STR_TYPE="file"
   (( FILE_LINKS += 1 ))
elif [ "$IS_FILE" -eq 0 ]; then
   STR_TYPE="page"
   (( PAGE_LINKS += 1 ))
fi
771
# Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
# issue with sites that require HTTPS. AGENT must be in double quotes so that its value, not the
# literal text "$AGENT", is sent as the user agent, and URL is quoted so that characters like '?'
# are never treated as a file name pattern by the shell.
CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{http_code}\n' "$URL")
CURL_ERR=$? # exit status of 'curl' itself
CURL_RESULT=$CURL_CODE

# Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
if [ "$CURL_CODE" == "000" ]; then
   CURL_RESULT="$CURL_RESULT-$CURL_ERR"
fi
782
# Begin to determine our status code for this URL (EI, IW, OK, RD, or NG)
STATUS="??"
NEW_URL=""
INTERWIKI_INDEX=-1

# First make sure that this isn't an "external internal" link to our own wiki that can be replaced
# by "[[page_name]]". If it uses a special access URL beginning with "/w/", let it pass, as it
# probably cannot be replaced by "[[ ]]" markup. The asterisks make these "contains" tests, and the
# quoted parts are matched as literal text. An empty WIKI_PATH is never treated as a match.
if [[ -n $WIKI_PATH && $URL == *"$WIKI_PATH"* && $URL != *"$WIKI_PATH/w/"* ]]; then
   STATUS="EI"
   (( EI_LINKS += 1 ))
fi

# If it's not, check if this is a link to a domain that we have an interwiki prefix for; remember
# which domain matched so the corresponding prefix can be suggested later
if [ "$STATUS" == "??" ]; then
   for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
      if [[ -n ${INTERWIKI_DOMAINS[$i]} && $URL == *"${INTERWIKI_DOMAINS[$i]}"* && $URL != *"${INTERWIKI_DOMAINS[$i]}/w/"* ]]; then
         STATUS="IW"
         (( IW_LINKS += 1 ))
         INTERWIKI_INDEX=$i
         break
      fi
   done
fi
807
# If we didn't match an interwiki domain, see if the status code is in our "OK" codes list. The
# response code is quoted so that it is compared as literal text.
if [ "$STATUS" == "??" ]; then
   for CODE in "${OK_CODES[@]}"; do
      if [[ $CODE == "$CURL_CODE" ]]; then
         STATUS="OK"
         (( OK_LINKS += 1 ))
         break
      fi
   done
fi
818
# If we didn't get a match with the "OK" codes, check it against the "RD" codes
if [ "$STATUS" == "??" ]; then
   for CODE in "${RD_CODES[@]}"; do
      if [[ $CODE == "$CURL_CODE" ]]; then
         # Get URL header again in order to retrieve the URL we are being redirected to. AGENT is in
         # double quotes so that its value, not the literal text "$AGENT", is sent as the user agent.
         NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{redirect_url}\n' "$URL")

         # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
         # those changes out if the user didn't ask for them
         URL_HTTP=${URL/#https:/http:}
         NEW_URL_HTTP=${NEW_URL/#https:/http:}

         # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
         NEW_URL_LENGTH=${#NEW_URL_HTTP}
         if [ "$NEW_URL_LENGTH" -lt "$MIN_URL_LENGTH" ]; then
            NEW_URL_HTTP="[new URL not retrieved]"
         fi

         # Remove slash at end of new URL, if present, so we can filter out the redirects that
         # merely add an ending slash if the user didn't ask for them
         NEW_URL_NO_SLASH=${NEW_URL_HTTP%/}

         # If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
         # wants those to be reported)
         if [ "$SHOW_HTTPS" -eq 0 ] && [ "$URL_HTTP" == "$NEW_URL_HTTP" ]; then
            valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show http->https upgrades, and I was redirected to '$NEW_URL'."
            STATUS="OK"
            (( OK_LINKS += 1 ))
            (( SKIP_HTTPS_UP += 1 ))
         # If the URLs match besides an added ending slash, then the link is OK (unless user wants
         # those to be reported)
         elif [ "$SHOW_SLASH" -eq 0 ] && [ "$URL_HTTP" == "$NEW_URL_NO_SLASH" ]; then
            valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show added trailing slashes, and I was redirected to '$NEW_URL'."
            STATUS="OK"
            (( OK_LINKS += 1 ))
            (( SKIP_SLASH_ADD += 1 ))
         else
            STATUS="RD"
            (( RD_LINKS += 1 ))
         fi
         break
      fi
   done
fi
863
# If we didn't get a match with the "RD" codes, check it against the "NG" codes. The response code
# is quoted so that it is compared as literal text.
if [ "$STATUS" == "??" ]; then
   for CODE in "${NG_CODES[@]}"; do
      if [[ $CODE == "$CURL_CODE" ]]; then
         STATUS="NG"
         (( NG_LINKS += 1 ))
         break
      fi
   done
fi

# If we didn't match a known status code, advise the reader
if [ "$STATUS" == "??" ]; then
   valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown return code $CURL_CODE."
   (( SKIP_UNK_CODE += 1 ))
   continue
fi
881
# Check problem links against exceptions file before proceeding
if [ "$STATUS" != "OK" ] && [ -n "$EXCEPT_URL" ]; then
   # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
   EXPECT_CODE="$CURL_RESULT"
   if [ "$STATUS" == "EI" ] || [ "$STATUS" == "IW" ]; then
      EXPECT_CODE="$STATUS"
   fi

   # Look for link in exceptions file and make sure its listed result code and wiki page also match.
   # The URL is searched for as a fixed string (-F) so that characters such as '.', '*' and '[' in
   # it are not interpreted as a regular expression. The code is the text before the first comma
   # of the matching line and the page is the text after the last comma.
   GREP_RESULT=$(grep --max-count=1 -F -- "$URL" "$EXCEPT_FILE")
   EXCEPT_PAGE=${GREP_RESULT##*,}
   if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == "$LOCAL_PAGE_PATH" ]; then
      EXCEPT_CODE=${GREP_RESULT%%,*}
      if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
         valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because its expected result, $EXPECT_CODE, is listed in the exceptions file."
         if [ "$STATUS" == "EI" ]; then
            (( SKIP_EXPECT_EI += 1 ))
         elif [ "$STATUS" == "IW" ]; then
            (( SKIP_EXPECT_IW += 1 ))
         else
            (( SKIP_EXPECT_NG += 1 ))
         fi
         continue
      fi
   fi
fi
910
# If appropriate, record this link to the log, with clickable URLs when possible
if [ "$STATUS" != "OK" ] || [ "$RECORD_OK_LINKS" -eq 1 ]; then
   # Stupid hack since the strings "IW" and "EI" are narrower than "OK", "RD", or "NG" and it takes
   # an extra tab to get to the desired level of indentation in the RTF log
   # NOTE(review): both assignments below hold the same single space, so the branch currently has no
   # effect; the whitespace may have been altered in this copy of the file -- confirm the intended
   # tab characters against the repository.
   RTF_TABS=" "
   if [ "$STATUS" == "IW" ] || [ "$STATUS" == "EI" ]; then
      RTF_TABS=" "
   fi

   # Record link and its wiki page in TXT, RTF, and HTML markup
   valPrint t "$STATUS ($CURL_RESULT) $STR_TYPE $URL"
   valPrint t " linked from $FULL_PAGE_PATH"
   valPrint r "$STATUS ($CURL_RESULT)$RTF_TABS$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
   valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
   valPrint hn "<tr><td style=\"white-space:nowrap\">$STATUS ($CURL_RESULT)</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
   valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"

   # Place vertical space here since we won't be printing anything more about this link
   if [ "$STATUS" == "OK" ]; then
      valPrint trh ""
   fi

   # Record redirect URL if one was given by a 3xx response page
   if [ "$STATUS" == "RD" ]; then
      valPrint ts " Server suggests $NEW_URL"
      valPrint rs " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
      valPrint hs "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
   fi

   # Notify reader if we can use an intrawiki link for this URL. The page name is what remains of
   # the URL after removing the scheme ("...://") and everything up to the first slash after it.
   if [ "$STATUS" == "EI" ]; then
      INTRA_PAGE=${URL#*://*/}
      valPrint ts " Just use [[$INTRA_PAGE]]"
      valPrint rs " Just use [[$INTRA_PAGE]]"
      valPrint hs "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
   fi

   # Notify reader if we can use an interwiki prefix for this URL; the page name is taken to be
   # everything after the last slash in the URL
   if [ "$STATUS" == "IW" ]; then
      INTER_PAGE=${URL##*/}
      valPrint ts " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
      valPrint rs " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
      valPrint hs "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
   fi

   # Query Internet Archive for latest "OK" snapshot for "NG" page
   if [ "$STATUS" == "NG" ] && [ "$SUGGEST_SNAPSHOTS" -eq 1 ]; then
      ARCHIVE_QUERY=$(curl --silent --max-time 10 "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")

      # If a "closest" snapshot was received (i.e. the response contains the text '"closest":')...
      if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
         # In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
         ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')

         # ...isolate "url" property in the response that follows the "closest" tag
         SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
         SNAPSHOT_URL=${SNAPSHOT_URL#*\"url\": \"} # everything after '"url": "'
         SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'

         # Remove the port 80 part that IA often adds to the URL, as it's superfluous. Only a ":80"
         # that is followed by a slash or the end of the URL is removed, so that a port number such
         # as ":8080" is left intact.
         SNAPSHOT_URL=$(printf '%s\n' "$SNAPSHOT_URL" | sed -E 's#:80(/|$)#\1#')

         # Inform the user of the snapshot URL
         valPrint ts " IA suggests $SNAPSHOT_URL"
         valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
         valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
      else # ...otherwise give generic Wayback Machine link for this URL
         valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
         valPrint rs " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
         valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
      fi
   fi
fi
982
# If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
if [ "$IS_FILE" -eq 0 ] && [ "$TAKE_PAGE_SHOT" -eq 1 ] && [ "$STATUS" == "OK" ]; then
   # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
   SHOT_NAME=$(echo "$URL" | sed -e 's/https*\:\/\///' -e 'y/:\//__/')
   SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"

   # Don't take screenshot if we already encountered this page and screenshotted it
   if [ -f "$SHOT_FILE" ]; then
      valPrint trhs "Skipping screenshot of URL '$URL' because file '$SHOT_FILE' already exists."
   else
      # The URL is quoted so that characters like '?' are never treated as a file name pattern by
      # the shell. The screenshot is then looked for under CHROME_SCREENSHOT in WORKING_DIR and,
      # if found, moved to its final name.
      "$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 "$URL" > /dev/null 2>&1
      if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
         mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
      else
         valPrint trhs "Screenshot of URL $URL seems to have failed!"
      fi
   fi
fi
1001	done
1002	FINISHED_LIST="yes"
1003	wrapupAndExit

Note: See TracBrowser for help on using the repository browser.

Download in other formats: