source: Validate External Links/validate_external_links.sh@1135

Last change on this file since 1135 was 1135, checked in by iritscen, 4 years ago

Added option to not validate archive.org URLs, as those are unlikely to go bad, and we have an increasing number of them. Val now reports trivial redirect settings in Config section.

File size: 48.3 KB
1#!/bin/bash
2
3# Validate External Links by Iritscen
4# Provided with a list of external links in an expected CSV format, this script validates them. The
5# resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF (for
6# reading as a local file with clickable links), and HTML (for uploading as a web page). Call script
7# with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
8# Recommended rule:
9# |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|
10
11# Set separator token to newline
12IFS="
13"
14
15### GLOBALS ###
16# Settings -- these will be changed from their defaults by the arguments passed in to the script
17LINKS_URL="" # use 'curl' to download file with links from this location (can be file://)
18EXCEPT_URL="" # ditto above for file with exceptions to NG results
19OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
20RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES
21SHOW_SLASH=0 # record response code to the log when a slash is added to the end of a URL
22SHOW_HTTPS=0 # record response code to the log when "http" is upgraded to "https"
23SHOW_YT_RD=0 # record response code to the log when a youtu.be URL is expanded to the full URL
24SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
25SKIP_ARCHIVE_LINKS=0 # don't check URLs under the archive.org domain
26TAKE_PAGE_SHOT=0 # take a screenshot of each OK page
27CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature
28URL_START=1 # start at this URL in LINKS_FILE (1 by default)
29URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE
30UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report
31
32# Fixed strings -- see the occurrences of these variables to learn their purpose
33AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36 OPR/67.0.3575.53"
34ARCHIVE_API="http://archive.org/wayback/available"
35ARCHIVE_GENERIC="https://web.archive.org/web/*"
36ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
37CHROME_SCREENSHOT="screenshot.png"
38CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt"
39EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
40HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt"
41MY_WIKI_PAGE="https://wiki.oni2.net/User:Iritscen"
42THIS_DIR=$(cd $(dirname $0); pwd)
43WORKING_DIR=$(pwd)
44WIKI_PATH="wiki.oni2.net"
45
46# These are parallel arrays of the IDs and names of OniGalore's current namespaces
47declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
48declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")
49
50# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
51# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
52declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png py rar tga TRMA txt vbs wav wmv xaf xml zip)
53declare -a HTTP_TLDS_AND_PAGES=(action ars asp aspx cfm cgi com css de htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x)
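# For example, a URL ending in ".pdf" or ".zip" is treated as a file (no screenshot is attempted), while one ending in
# ".php", ".html" or a bare TLD like ".com" is treated as a page.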
54
55# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
56# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
57# if you add a new code.
58declare -a OK_CODES=(200 401 405 406 418 501)
59declare -a RD_CODES=(301 302 303 307 308)
60declare -a NG_CODES=(000 400 403 404 410 500 502 503 530)
61
62# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
63# transcluded text, and if the transclusion fails, then the braces show up in the URL
64ILLEGAL_CHARS="{ }"
65
66# The shortest URL possible, used for sanity-checking some URLs: http://a.co
67MIN_URL_LENGTH=11
68
69# These are parallel arrays giving the prefixes that can be used in place of normal external links to
70# some wikis and other sites
71declare -a INTERWIKI_PREFIXES=(commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies wikiversity wikivoyage wikt wp)
72declare -a INTERWIKI_DOMAINS=(commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org wikiversity.org wikivoyage.org wiktionary.org wikipedia.org)
73
74# Variables for keeping track of main loop progress and findings
75LINK_NUM=0
76EI_LINKS=0
77IW_LINKS=0
78OK_LINKS=0
79RD_LINKS=0
80NG_LINKS=0
81SKIP_UNK_NS=0
82SKIP_JS_PAGE=0
83SKIP_BAD_URL=0
84SKIP_NON_ASCII=0
85SKIP_UNK_SUFFIX=0
86SKIP_UNK_CODE=0
87SKIP_EXPECT_NG=0
88SKIP_EXPECT_EI=0
89SKIP_EXPECT_IW=0
90SKIP_HTTPS_UP=0
91SKIP_SLASH_ADD=0
92SKIP_YOUTU_BE=0
93SKIP_ARCHIVE_ORG=0
94FILE_LINKS=0
95PAGE_LINKS=0
96SKIPPED_HEADER_ROW=0
97FINISHED_LIST="no"
98START_RUN=0
99END_RUN=0
100
101
102### HELP ###
103# A pseudo-man page. Here is the 80-character rule for the page text:
104# 234567890123456789012345678901234567890123456789012345678901234567890123456789
105function printHelp()
106{
107 cat << EOF
108
109NAME
110 Validate External Links
111
112SYNOPSIS
113 validate_external_links.sh --help
114 validate_external_links.sh --links URL --output DIR [--exceptions URL]
115 [--record-ok-links] [--suggest-snapshots] [--take-screenshots FILE]
116 [--start-url NUM] [--end-url NUM] [--upload FILE]
117
118DESCRIPTION
119 This script parses a list of external links found in the OniGalore wiki
120 (which is dumped by the Oni2.net domain periodically in a particular
121 format), validates them using the Unix tool 'curl', and produces a report
122 of which links were "OK" (responded positively to an HTTP query), which
123 were "RD" (responded with a 3xx redirect code), which could be "IW"
124 (interwiki) links, which are "EI" (external internal) links and could be
125 intrawiki links, and which were "NG" (no good; a negative response to the
126 query). This report can then be automatically uploaded to the location of
127 your choice. The script can also suggest Internet Archive snapshots for
128 "NG" links, and take screenshots of "OK" links for visual verification by
129 the reader that the page in question is the one intended to be displayed.
130
131 You must pass this script the URL at which the list of links is found
132 (--links) and the path where the directory of logs should be outputted
133 (--output). All other arguments are optional.
134
135OPTIONS
136 --help Show this page.
137 --links URL (required) URL from which to download the CSV
138 file with external links. Note that this URL can
139 be a local file if you supply a file:// path.
140 --output DIR (required) Unix path to directory in which Val
141 should place its reports.
142 --exceptions URL In order to remove links from the report which
143 Val finds an issue with, but which you regard as
144 OK, list those desired exceptions in this file.
145 See the sample file exceptions.txt for details.
146 Note that this URL can point to a local file if
147 you supply a file:// path.
148 --record-ok-links Log a link in the report even if its response
149 code is "OK".
150 --show-added-slashes Report on redirects that simply add a '/' to the
151 end of the URL.
152 --show-https-upgrades Report on redirects that simply upgrade a
153 "http://" URL to a "https://" URL.
154 --show-yt-redirects Report on redirects that expand a youtu.be URL.
155 --suggest-snapshots Query the Internet Archive for a possible
156 snapshot URL for each "NG" page.
157 --skip-archive-links Don't check links that are already pointing to
158 a page on the Internet Archive.
159 --take-screenshots FILE Call the Google Chrome binary at this path to
160 take screenshots of each "OK" page.
161 --start-url NUM Start at this link in the links CSV file.
162 --end-url NUM Stop at this link in the links CSV file.
163 --upload FILE Upload report using the credentials and path
164 given in this local text file. See sftp_login.txt
165 for template.
166
167BUGS
168 The script cannot properly parse any line in the external links file
169 which contains a comma in the name of the wiki page containing a link.
170 Commas in the link itself are not an issue.
171EOF
172}
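# Example invocation (illustrative paths only):
# ./validate_external_links.sh --links file:///home/me/extlinks.csv --output /home/me/val_reports --suggest-snapshots --skip-archive-links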
173
174
175### SETUP ###
176# If first argument is a help request, or if nothing was passed in at all, print help page and quit
177if [ "$#" -eq 0 ] || [ "$1" == "--help" ]; then
178 printHelp | less
179 exit 0
180fi
181
182# Parse arguments as long as there are more arguments to process
183while (( "$#" )); do
184 case "$1" in
185 --links ) LINKS_URL="$2"; shift 2;;
186 --exceptions ) EXCEPT_URL="$2"; shift 2;;
187 --output ) OUTPUT_DIR="$2"; shift 2;;
188 --record-ok-links ) RECORD_OK_LINKS=1; shift;;
189 --show-added-slashes ) SHOW_SLASH=1; shift;;
190 --show-https-upgrades ) SHOW_HTTPS=1; shift;;
191 --show-yt-redirects ) SHOW_YT_RD=1; shift;;
192 --suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;;
193 --skip-archive-links ) SKIP_ARCHIVE_LINKS=1; shift;;
194 --take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
195 --start-url ) URL_START=$2; shift 2;;
196 --end-url ) URL_LIMIT=$2; shift 2;;
197 --upload ) UPLOAD_INFO=$2; shift 2;;
198 * ) echo "Invalid argument $1 detected. Aborting."; exit 1;;
199 esac
200done
201
202# If the required arguments were not supplied, print help page and quit
203if [ -z $LINKS_URL ] || [ -z $OUTPUT_DIR ]; then
204 echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
205 exit 2
206fi
207
208# If user wants screenshots, make sure path to Chrome was passed in and is valid
209if [ $TAKE_PAGE_SHOT -eq 1 ]; then
210 if [ ! -f "$CHROME_PATH" ]; then
211 echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
212 exit 3
213 fi
214fi
215
216# Check that UPLOAD_INFO exists, if this argument was supplied
217if [ ! -z $UPLOAD_INFO ] && [ ! -f "$UPLOAD_INFO" ]; then
218 echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
219 exit 4
220fi
221
222# Check that OUTPUT_DIR is a directory
223if [ ! -d "$OUTPUT_DIR" ]; then
224 echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
225 exit 5
226fi
227
228# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
229SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
230NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
231OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
232OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
233SHOT_PATH="$OUTPUT_PATH/Screenshots"
234LOG_NAME="ValExtLinks report"
235LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt"
236LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf"
237LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm"
238mkdir "$OUTPUT_PATH"
239if [ $TAKE_PAGE_SHOT -eq 1 ]; then
240 mkdir "$SHOT_PATH"
241fi
242
243# Check that 'mkdir' succeeded
244if [ ! -d "$OUTPUT_PATH" ]; then
245 echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_PATH. Aborting."
246 exit 6
247fi
248
249# Get date on the file at LINKS_URL and print to log
250LINKS_DATE=$(curl --silent --head $LINKS_URL | grep "Last-Modified")
251if [ -z "$LINKS_DATE" ]; then
252 echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
253 exit 7
254fi
255LINKS_DATE=${LINKS_DATE#Last-Modified: }
256
257
258### UTILITY FUNCTIONS ###
259# Writes a plain-text header to TXT log file
260function printTXTheader()
261{
262 valPrint t "Validate External Links report"
263 valPrint t "generated $NICE_TIME"
264 valPrint t "from data of $LINKS_DATE"
265 valPrint t "script by Iritscen (contact: $MY_WIKI_PAGE)"
266 valPrint t ""
267}
268
269# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified
270function printRTFheader()
271{
272 valPrint r "{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
273{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
274{\colortbl;\red255\green255\blue255;}
275{\*\expandedcolortbl;;}
276\margl1440\margr1440\vieww12600\viewh12100\viewkind0
277\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0
278
279\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
280generated $NICE_TIME\\
281from data of $LINKS_DATE\\
282script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$MY_WIKI_PAGE\"}}{\fldrslt contact}})
283\\
284\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
285\cf0 "
286}
287
288# Closes the RTF markup of the RTF log file
289function printRTFfooter()
290{
291 valPrint r "}"
292}
293
294# Writes the HTML header to HTML log file
295function printHTMheader()
296{
297 valPrint h "<html>
298<head>
299<title>Validate External Links report</title>
300</head>
301<body>
302<h2>Validate External Links report</h2>
303<h3>generated $NICE_TIME<br />
304from data of $LINKS_DATE<br />
305script by Iritscen (<a href=\"$MY_WIKI_PAGE\" target=\"_blank\">contact</a>)</h3>"
306}
307
308# Closes the HTML markup of the HTML log file
309function printHTMfooter()
310{
311 valPrint h "</body>
312</html>"
313}
314
315# The central logging function. The first parameter is a string composed of one or more characters that
316# indicate which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
317# 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 's' means "Print an extra newline at the end." 'w' means "Don't pass console output through 'fmt'" ("fmt" fits the output
318# to an 80-column CLI but can break special formatting and the 'n' option).
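# For example, 'valPrint ctrh "Hello"' prints "Hello" to the console (wrapped by 'fmt'), appends "Hello" to the TXT log,
# "Hello\line" to the RTF log, and "Hello<br />" to the HTML log.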
319function valPrint()
320{
321 if [[ "$1" == *c* ]]; then
322 if [[ "$1" == *n* ]]; then
323 echo -n "$2"
324 elif [[ "$1" == *w* ]]; then
325 echo "$2"
326 elif [[ "$1" == *s* ]]; then
327 echo -e "$2\n"
328 else
329 echo "$2" | fmt -w 80
330 fi
331 fi
332 if [[ "$1" == *t* ]]; then
333 if [[ "$1" == *n* ]]; then
334 echo -n "$2" >> "$LOG_TXT"
335 elif [[ "$1" == *s* ]]; then
336 echo -e "$2\n" >> "$LOG_TXT"
337 else
338 echo "$2" >> "$LOG_TXT"
339 fi
340 fi
341 if [[ "$1" == *r* ]]; then
342 if [[ "$1" == *n* ]]; then
343 echo "$2" >> "$LOG_RTF"
344 elif [[ "$1" == *s* ]]; then
345 echo "$2\line\line" >> "$LOG_RTF"
346 else
347 echo "$2\line" >> "$LOG_RTF"
348 fi
349 fi
350 if [[ "$1" == *h* ]]; then
351 if [[ "$1" == *s* ]]; then
352 echo "$2<tr><td>&nbsp;</td></tr>" >> "$LOG_HTM"
353 elif [[ "$1" == *n* ]]; then
354 echo "$2" >> "$LOG_HTM"
355 else
356 echo "$2<br />" >> "$LOG_HTM"
357 fi
358 fi
359}
360
361# Pluralize the string in parameter 1 if the number in parameter 2 is not 1
362function pluralCheckNoun()
363{
364 if [ $2 -ne 1 ]; then
365 if [[ $1 =~ x$ ]]; then
366 echo $1es
367 else
368 echo $1s
369 fi
370 else
371 echo $1
372 fi
373}
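# For example, "$(pluralCheckNoun link 2)" yields "links" and "$(pluralCheckNoun box 2)" yields "boxes", while a count
# of 1 returns the noun unchanged.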
374
375# Output "is" if parameter 1 is 1, otherwise "are"
376function pluralCheckIs()
377{
378 if [ $1 -ne 1 ]; then
379 echo "are"
380 else
381 echo "is"
382 fi
383}
384
385# Output "was" if parameter 1 is 1, otherwise "were"
386function pluralCheckWas()
387{
388 if [ $1 -ne 1 ]; then
389 echo "were"
390 else
391 echo "was"
392 fi
393}
394
395# Output "a " if parameter 1 is 1, otherwise nothing
396function pluralCheckA()
397{
398 if [ $1 -eq 1 ]; then
399 echo "a "
400 fi
401}
402
403# Output "an " if parameter 1 is 1, otherwise nothing
404function pluralCheckAn()
405{
406 if [ $1 -eq 1 ]; then
407 echo "an "
408 fi
409}
410
411# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
412# reports being saved to disk have already been closed.
413function uploadReport()
414{
415 valPrint c "Uploading HTML report..."
416
417 SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
418 SFTP_USER_NAME_MARKER="user:"
419 SFTP_PASSWORD_MARKER="pw:"
420 SFTP_PORT_MARKER="port:"
421 SFTP_PATH_MARKER="path:"
422 SFTP_USER_NAME=$(grep $SFTP_USER_NAME_MARKER $UPLOAD_INFO)
423 SFTP_USER_NAME=${SFTP_USER_NAME#$SFTP_USER_NAME_MARKER}
424 SFTP_PASSWORD=$(grep $SFTP_PASSWORD_MARKER $UPLOAD_INFO)
425 SFTP_PASSWORD=${SFTP_PASSWORD#$SFTP_PASSWORD_MARKER}
426 SFTP_PORT=$(grep $SFTP_PORT_MARKER $UPLOAD_INFO)
427 SFTP_PORT=${SFTP_PORT#$SFTP_PORT_MARKER}
428 SFTP_PATH=$(grep $SFTP_PATH_MARKER $UPLOAD_INFO)
429 SFTP_PATH=${SFTP_PATH#$SFTP_PATH_MARKER}
430
431 expect "$SCRIPT_PATH" "$LOG_HTM" $SFTP_USER_NAME $SFTP_PASSWORD $SFTP_PORT $SFTP_PATH "$LOG_NAME.htm"
432
433 valPrint c "Report was uploaded, unless an error message appears above."
434}
435
436# Prints session summary when script is done
437function wrapupAndExit()
438{
439 # Get off progress line on console, drop down a line from last link in log, and close HTML table
440 valPrint ctr ""
441 valPrint h "</table><br />"
442
443 # If we didn't finish processing the last URL, then the iterator is one too high
444 if [ $FINISHED_LIST != "yes" ]; then
445 let LINK_NUM-=1
446 if [ $FINISHED_LIST == "no" ]; then
447 valPrint ctrh "The session was canceled by the user."
448 fi
449 fi
450
451 # Generate string with elapsed time
452 END_RUN=$(date +%s)
453 ELAPSED=$(echo $(($END_RUN - $START_RUN)) | awk '{printf "%d min. %d sec. elapsed", int($1/60), int($1%60)}')
454
455 # Do some math on results of session
456 LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
457 LINK_PROBLEMS=$((EI_LINKS+IW_LINKS+RD_LINKS+NG_LINKS))
458 LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
459 LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
460 TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
461 LINKS_CHECKED=$((LINKS_PROCESSED-LINK_ERRORS))
462
463 # Print summary header
464 valPrint ct "Summary ($ELAPSED):"
465 valPrint r "\b1 Summary \b0 ($ELAPSED)"
466 valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
467 valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT) (there $(pluralCheckWas $FILE_LINKS) $FILE_LINKS file $(pluralCheckNoun link $FILE_LINKS) and $PAGE_LINKS page $(pluralCheckNoun link $PAGE_LINKS))."
468
469 # Print processed link totals
470 if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
471 if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
472 if [ $SKIP_ARCHIVE_ORG -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVE_ORG Archive.org $(pluralCheckNoun link $SKIP_ARCHIVE_ORG) were not checked"; fi
473 if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS processed $(pluralCheckNoun link $LINK_PROBLEMS) had $(pluralCheckAn $LINK_PROBLEMS)$(pluralCheckNoun issue $LINK_PROBLEMS)"; fi
474 if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; valPrint h "&nbsp;&nbsp;(excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
475 if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi
476 if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h "&nbsp;&nbsp;(counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi
477
478 # Print excepted link totals
479 if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi
480 if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
481 if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
482 if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi
483
484 # Print errored link totals
485 if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"; fi
486 if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
487 if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
488 if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
489 if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
490 if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
491 if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
492
493 # Print checked link totals
494 if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS link $(pluralCheckNoun issue $LINK_PROBLEMS):"; fi
495 if [ $NG_LINKS -gt 0 ]; then valPrint ctrh "- $NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
496 if [ $RD_LINKS -gt 0 ]; then valPrint ctrh "- $RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
497 if [ $EI_LINKS -gt 0 ]; then valPrint ctrh "- $EI_LINKS $(pluralCheckNoun link $EI_LINKS) that could be intrawiki"; fi
498 if [ $IW_LINKS -gt 0 ]; then valPrint ctrh "- $IW_LINKS $(pluralCheckNoun link $IW_LINKS) that could be interwiki"; fi
499
500 # Close the log files' markup
501 valPrint trh "ValExtLinks says goodbye."
502 printRTFfooter
503 printHTMfooter
504
505 # Upload report if this was requested
506 if [ ! -z $UPLOAD_INFO ] && [ $FINISHED_LIST != "no" ]; then
507 uploadReport
508 fi
509
510 # Really quit now
511 valPrint c "ValExtLinks says goodbye."
512 exit 0
513}
514trap wrapupAndExit INT
515
516
517### INITIALIZATION ###
518# Print opening message to console and log files
519valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
520printTXTheader
521printRTFheader
522printHTMheader
523
524# Attempt to download file at LINKS_URL, then check that it succeeded
525valPrint t "Config:"
526valPrint r "\b1 Config \b0"
527valPrint hn "<h3>Config</h3>"
528valPrint cwtrh "Downloading list of external links from $LINKS_URL."
529LINKS_FILE_NAME=$(echo "$LINKS_URL" | sed 's/.*\///')
530LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
531curl --silent -o "$LINKS_FILE" $LINKS_URL
532if [ ! -f "$LINKS_FILE" ]; then
533 echo "The download of $LINKS_URL appears to have failed. Aborting."
534 wrapupAndExit
535fi
536
537# Attempt to download file at EXCEPT_URL, then check that it succeeded
538if [ ! -z $EXCEPT_URL ]; then
539 valPrint cwtrh "Downloading list of reporting exceptions from $EXCEPT_URL."
540 EXCEPT_FILE_NAME=$(echo "$EXCEPT_URL" | sed 's/.*\///')
541 EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
542 curl --silent -o "$EXCEPT_FILE" $EXCEPT_URL
543 if [ ! -f "$EXCEPT_FILE" ]; then
544 echo "The download of $EXCEPT_URL appears to have failed. Aborting."
545 wrapupAndExit
546 fi
547fi
548
549# Pipe 'cat' to 'wc' because passing LINKS_FILE to 'wc' would print the file name after the line count
550LINK_COUNT_STRING=$(cat "$LINKS_FILE" | wc -l)
551
552# Number of URLs is number of lines minus one (first line is column header row for the CSV)
553LINK_COUNT=$(echo "${LINK_COUNT_STRING}" | tr -d '[:space:]')
554let LINK_COUNT-=1
555
556# Calculate number of URLs to consider
557if [ $URL_LIMIT -ne 0 ] && [ $URL_LIMIT -lt $LINK_COUNT ]; then
558 valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((URL_LIMIT-URL_START+1)) of them, from link $URL_START to $URL_LIMIT."
559elif [ $URL_START -ne 1 ]; then
560 valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((LINK_COUNT-URL_START+1)) of them, from link $URL_START to $LINK_COUNT."
561else
562 valPrint ctrh "Found $LINK_COUNT links to process. I will be considering all of them."
563fi
564
565# Print settings to console and log
566declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not report links that are listed in the exceptions file." "I will ignore URLs that simply have ending slashes added onto them." "I will ignore URLs that only upgrade from HTTP to HTTPS." "I will ignore youtu.be links that are merely being expanded." "I will not check the validity of Internet Archive snapshot URLs.")
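# SETTINGS_MSG is a word-by-word array: every unquoted word and every quoted phrase above is one element, so the
# assignments below can flip individual elements by index -- e.g. element 10 is "and will" (becomes "but will not" when
# screenshots are off) and element 26 is "will" (becomes "will not" when snapshot suggestions are off). The array is
# then flattened into SETTINGS_STR for printing.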
567if [ $TAKE_PAGE_SHOT -eq 0 ]; then SETTINGS_MSG[10]="but will not"; fi
568if [ $RECORD_OK_LINKS -eq 0 ]; then SETTINGS_MSG[22]="not"; fi
569if [ $SUGGEST_SNAPSHOTS -eq 0 ]; then SETTINGS_MSG[26]="will not"; fi
570if [ -z $EXCEPT_URL ] || [ $RECORD_OK_LINKS -eq 1 ]; then SETTINGS_MSG[40]=""; fi
571if [ $SHOW_SLASH -eq 1 ]; then SETTINGS_MSG[41]=""; fi
572if [ $SHOW_HTTPS -eq 1 ]; then SETTINGS_MSG[42]=""; fi
573if [ $SHOW_YT_RD -eq 1 ]; then SETTINGS_MSG[43]=""; fi
574if [ $SKIP_ARCHIVE_LINKS -eq 0 ]; then SETTINGS_MSG[44]=""; fi
575SETTINGS_STR=${SETTINGS_MSG[@]}
576valPrint ctrh "$SETTINGS_STR"
577valPrint tr "A summary of my findings will be found at the bottom of the report."
578valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
579valPrint trh ""
580
581# Print legend to logs
582valPrint t "Legend:"
583valPrint r "\b1 Legend \b0"
584valPrint hn "<h3>Legend</h3>"
585valPrint trh "OK = URL seems to be working."
586valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it, because false negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to the script's author (see top of report). An NG link should be followed by a link to the Internet Archive's Wayback Machine which may help you repair the link, unless the Archive does not have any snapshots of the site. If the link cannot be repaired, you can delete it from the wiki page, or, if this would disrupt the surrounding material on the page, disable the link by wrapping the URL in nowiki tags."
587valPrint trh "RD = The server responding to this URL is saying that the page moved and you should instead use the supplied new URL. Some RD links represent minor adjustments in the organization of a web site, and some are soft 404s (the file/page has been removed and you are being redirected to something like the main page of the web site). You will have to look at the new URL yourself to determine if it represents an OK link and the link on the wiki should be updated to this one, or if the desired file/page is actually gone and we need to replace the wiki link with an Internet Archive snapshot link -- or disable the URL if it has not been archived."
588valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup."
589valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup."
590valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $HTTP_CODES)."
591valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$HTTP_CODES\"}}{\fldrslt here}} for code reference)."
592valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$HTTP_CODES\" target=\"_blank\">here</a> for code reference)."
593valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $CURL_CODES)."
594valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$CURL_CODES\"}}{\fldrslt here}} for code reference)."
595valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$CURL_CODES\" target=\"_blank\">here</a> for code reference)."
596valPrint trh "IA suggests = Last available snapshot suggested by the Internet Archive."
597valPrint trh "Try browsing = The Archive occasionally fails to return a snapshot URL even when one exists, so you will need to check for a snapshot manually using this link to the Wayback Machine before concluding that a site has not been archived."
598valPrint trh ""
599
600
601### MAIN LOOP ###
602valPrint t "Links:"
603valPrint r "\b1 Links \b0"
604valPrint hn "<h3>Links</h3>"
605START_RUN=$(date +%s)
606# Process each line of the .csv in LINKS_FILE
607for LINE in `cat "$LINKS_FILE"`; do
608 let LINK_NUM+=1
609
610 # First line is the column header row for the CSV, so let's verify that the format hasn't changed
611 if [ $SKIPPED_HEADER_ROW -eq 0 ]; then
612 if [ $LINE == "namespace,title,target" ]; then
613 SKIPPED_HEADER_ROW=1
614 LINK_NUM=0 # this line is not a link, so reset the link counter
615 valPrint hn "<table>"
616 continue
617 else
618 valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
619 wrapupAndExit
620 fi
621 fi
622
623 # Skip this link if we are not at URL_START yet
624 if [ $LINK_NUM -lt $URL_START ]; then
625 continue
626 fi
627
628 # Stop if we are at the limit declared for testing purposes
629 if [ $URL_LIMIT -gt 0 ] && [ $LINK_NUM -gt $URL_LIMIT ]; then
630 FINISHED_LIST="limit"
631 wrapupAndExit
632 fi
633
634 # Print progress to screen
635 if [ $LINK_NUM -gt 1 ]; then
636 printf "\e[1A\n" # erase previous progress message so that new one appears in its place
637 fi
638 valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
639
640 # The number of the namespace is the element before the first comma on the line
641 NS_ID=${LINE%%,*}
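# For instance, given the (illustrative) line "0,Quotes,http://example.com/page", NS_ID becomes "0".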
642
643 # Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
644 NS_NAME=""
645 a=0
646 while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
647 if [ $NS_ID == "NULL" ]; then
648 break
649 elif [ $NS_ID -eq ${NS_IDS[$a]} ]; then
650 NS_NAME="${NS_NAMES[$a]}"
651 break
652 fi
653 let a+=1
654 done
655 if [ "$NS_NAME" == "" ]; then
656 if [ $NS_ID == "NULL" ]; then
657 valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace (and probably the page too) is 'NULL'. Probably the link is no longer in existence on the wiki."
658 else
659 valPrint trs "Skipping line $LINK_NUM ('$LINE') because I could not find a name for namespace ID $NS_ID."
660 fi
661 let SKIP_UNK_NS+=1
662 continue
663 fi
664
665 # The name of the page is everything between the namespace ID and the next comma on the line (commas
666 # in page names will break this)
667 PAGE_NAME=${LINE#$NS_ID,}
668 PAGE_NAME=${PAGE_NAME%%,*}
669
670# We don't want to consider wiki pages ending in .js, as the MW parser cannot reliably isolate URLs
671 # in JavaScript code, so it returns erroneous links
672 PAGE_NAME_SUFFIX=$(echo $PAGE_NAME | sed 's/.*\.//')
673 if [ $PAGE_NAME_SUFFIX == "js" ]; then
674 valPrint trs "Skipping URL '${LINE#$NS_ID,$PAGE_NAME,}' on line $LINK_NUM because it was found on JavaScript page '$PAGE_NAME'."
675 let SKIP_JS_PAGE+=1
676 continue
677 fi
678
679 # Build longer wiki page URLs from namespace and page names
680 FULL_PAGE_PATH=https://$WIKI_PATH/$NS_NAME:$PAGE_NAME
681 LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
682 # Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it
683 # explicitly breaks the link
684 if [ $NS_ID -eq 0 ]; then
685 FULL_PAGE_PATH=https://$WIKI_PATH/$PAGE_NAME
686 LOCAL_PAGE_PATH=$PAGE_NAME
687 fi
688
689 # The URL being linked to is everything after the previous two fields (this allows commas to be in
690 # the URLs, but a comma in the previous field, the page name, will break this)
691 URL=${LINE#$NS_ID,$PAGE_NAME,}
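# Continuing the illustrative line "0,Quotes,http://example.com/page": PAGE_NAME is "Quotes", so URL becomes
# "http://example.com/page", even if that URL itself contains commas.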
692
693 # Scan for illegal characters
694 if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
695 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because it contains characters illegal in a URL."
696 let SKIP_BAD_URL+=1
697 continue
698 fi
699
700 # If we're skipping Archive.org links, check if this is one
701 if [ $SKIP_ARCHIVE_LINKS -eq 1 ] && [[ $URL == *web.archive.org* ]]; then
702 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have been asked not to check Wayback Machine links."
703 let SKIP_ARCHIVE_ORG+=1
704 continue
705 fi
706
707 # Now we need to know if the URL is for a file or a web page. First step is to determine if the
708 # URL ends in a suffix
709 HAS_SUFFIX=0
710
711 # If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
712 CLEAN_URL=${URL%%\?*}
713
714 # If the URL ends in something like "#section_15", strip everything from the '#' onward
715 CLEAN_URL=${CLEAN_URL%%\#*}
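# For example (illustrative URL), "http://example.com/index.php?title=Foo#History" is reduced to
# "http://example.com/index.php" by these two strips.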
716
717 # 'sed' cannot handle Unicode in my Bash shell, so skip non-ASCII URL and make user check it
718 if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
719 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I cannot handle non-ASCII characters."
720 let SKIP_NON_ASCII+=1
721 continue
722 fi
723
724 # Isolate the characters after the last period and after the last slash
725 POST_DOT=$(echo "$CLEAN_URL" | sed 's/.*\.//')
726 POST_SLASH=$(echo "$CLEAN_URL" | sed 's/.*\///')
727
728 # If the last period comes after the last slash, then the URL ends in a suffix
729 POST_DOT_LENGTH=$(echo | awk -v input=$POST_DOT '{print length(input)}')
730 POST_SLASH_LENGTH=$(echo | awk -v input=$POST_SLASH '{print length(input)}')
731 if [ $POST_DOT_LENGTH -lt $POST_SLASH_LENGTH ]; then
732 HAS_SUFFIX=1
733 else
734 HAS_SUFFIX=0
735 fi
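# For example (illustrative URLs), in "http://example.com/docs/manual.pdf" POST_DOT is "pdf" (3 chars) and POST_SLASH
# is "manual.pdf" (10 chars), so HAS_SUFFIX becomes 1; in "http://example.com/about" POST_DOT is "com/about" (9 chars)
# and POST_SLASH is "about" (5 chars), so HAS_SUFFIX stays 0.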
736
737 # Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
738 # known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES
739 IS_FILE=-1
740 if [ $HAS_SUFFIX -eq 0 ]; then
741 IS_FILE=0
742 else
743 # Turn off case sensitivity while we compare suffixes
744 shopt -s nocasematch
745
746 # Special case: URLs ending in something like "productID.304297400" are pages, not files, so if
747 # the URL's suffix is all numbers, we are looking at the end of a web page URL
748 if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
749 IS_FILE=0
750 fi
751
752 # Special case: URLs ending in a parenthesis, e.g. "ms537113(v=vs.85)", are pages
753 if [[ $POST_DOT =~ ^.*[\(\)]$ ]]; then
754 IS_FILE=0
755 fi
756
757 # Special case: URLs containing a '%', e.g. "10.1007%2FBF00329055", are pages
758 if [[ $POST_DOT == *%* ]]; then
759 IS_FILE=0
760 fi
761
762 # If we did not identify this URL as a web page above, we need to compare the suffix against known
763 # file extensions
764 if [ $IS_FILE -eq -1 ]; then
765 for EXTENSION in "${HTTP_FILES[@]}"; do
766 if [[ $EXTENSION == $POST_DOT ]]; then # double brackets respect the nocasematch setting
767 IS_FILE=1
768 break
769 fi
770 done
771 fi
772
773 # If we did not identify this URL as a file above, we need to compare the suffix against known
774 # pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
775 # needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
776 if [ $IS_FILE -eq -1 ]; then
777 for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
778 if [[ $PAGE_OR_TLD == $POST_DOT ]]; then
779 IS_FILE=0
780 break
781 fi
782 done
783 fi
784
785 # Turn case sensitivity back on in Bash
786 shopt -u nocasematch
787 fi
788
789 # If this suffix escaped identification as either a file, page or TLD, inform the user
790 STR_TYPE=""
791 if [ $IS_FILE -eq -1 ]; then
792 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown URL ending '$POST_DOT'. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
793 let SKIP_UNK_SUFFIX+=1
794 continue
795 elif [ $IS_FILE -eq 1 ]; then
796 STR_TYPE="file"
797 let FILE_LINKS+=1
798 elif [ $IS_FILE -eq 0 ]; then
799 STR_TYPE="page"
800 let PAGE_LINKS+=1
801 fi
802
803 # Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
804 # issue with sites that require HTTPS
805 CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{http_code}\n' $URL)
806 CURL_ERR=$?
807 CURL_RESULT=$CURL_CODE
808
809 # Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
810 if [ $CURL_CODE == "000" ]; then
811 CURL_RESULT="$CURL_RESULT-$CURL_ERR"
812 fi
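# For example, if 'curl' times out without getting an HTTP response, CURL_CODE is "000" and curl's exit code is 28,
# so CURL_RESULT becomes "000-28".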
813
814 # Begin to determine our status code for this URL (EI, IW, OK, RD, or NG)
815 STATUS="??"
816 NEW_URL=""
817 INTERWIKI_INDEX=-1
818
819 # First make sure that this isn't an "external internal" link to our own wiki that can be replaced
820 # by "[[page_name]]". If it uses a special access URL beginning with "/w/", let it pass, as it
821 # probably cannot be replaced by "[[ ]]" markup
822 if [[ $URL == *$WIKI_PATH* ]] && [[ $URL != *$WIKI_PATH/w/* ]]; then
823 STATUS="EI"
824 let EI_LINKS+=1
825 fi
826
827 # If it's not, check if this is a link to a domain that we have an interwiki prefix for
828 if [ $STATUS == "??" ]; then
829 for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
830 if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]] && [[ $URL != *${INTERWIKI_DOMAINS[$i]}/w/* ]]; then
831 STATUS="IW"
832 let IW_LINKS+=1
833 INTERWIKI_INDEX=$i
834 break
835 fi
836 done
837 fi
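# For example, "https://en.wikipedia.org/wiki/Oni" matches INTERWIKI_DOMAINS[13] ("wikipedia.org"), so INTERWIKI_INDEX
# becomes 13 and the report will later suggest the "wp:" prefix from INTERWIKI_PREFIXES[13].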
838
839 # If we didn't match an interwiki domain, see if the status code is in our "OK" codes list
840 if [ $STATUS == "??" ]; then
841 for CODE in "${OK_CODES[@]}"; do
842 if [[ $CODE == $CURL_CODE ]]; then
843 STATUS="OK"
844 let OK_LINKS+=1
845 break
846 fi
847 done
848 fi
849
850 # If we didn't get a match with the "OK" codes, check it against the "RD" codes
851 if [ $STATUS == "??" ]; then
852 for CODE in "${RD_CODES[@]}"; do
853 if [[ $CODE == $CURL_CODE ]]; then
854 # Get URL header again in order to retrieve the URL we are being redirected to
855 NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{redirect_url}\n' $URL)
856
857 # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
858 # those changes out if the user didn't ask for them
859 URL_HTTP=$(echo $URL | sed -E 's/^https:/http:/')
860 NEW_URL_HTTP=$(echo $NEW_URL | sed -E 's/^https:/http:/')
861
862 # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
863 NEW_URL_LENGTH=$(echo | awk -v input=$NEW_URL_HTTP '{print length(input)}')
864 if [ $NEW_URL_LENGTH -lt $MIN_URL_LENGTH ]; then
865 NEW_URL_HTTP="[new URL not retrieved]"
866 fi
867
868 # Remove slash at end of new URL, if present, so we can filter out the redirects that
869 # merely add an ending slash if the user didn't ask for them
870 NEW_URL_NO_SLASH=$(echo $NEW_URL_HTTP | sed -E 's:/$::')
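# For example (illustrative URLs), a redirect from "http://example.com/wiki" to "https://example.com/wiki/" normalizes
# to identical strings once the scheme is forced to http and the trailing slash is dropped, so it will be counted as a
# trivial slash addition below unless --show-added-slashes was given.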
871
872 # Detect if this is a youtu.be link simply being expanded by YouTube to the full
873 # youtube.com address
874 YOUTU_BE=0
875 if [[ $URL_HTTP == http://youtu.be* ]] && [[ $NEW_URL_HTTP == http://www.youtube.com* ]]; then
876 YOUTU_BE=1
877 fi
878
879 # If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
880 # wants those to be reported)
881 if [ $SHOW_HTTPS -eq 0 ] && [ $URL_HTTP == $NEW_URL_HTTP ]; then
882 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show http->https upgrades, and I was redirected to '$NEW_URL'."
883 STATUS="OK"
884 let OK_LINKS+=1
885 let SKIP_HTTPS_UP+=1
886 # If the URLs match besides an added ending slash, then the link is OK (unless user wants
887 # those to be reported)
888 elif [ $SHOW_SLASH -eq 0 ] && [ $URL_HTTP == $NEW_URL_NO_SLASH ]; then
889 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show added trailing slashes, and I was redirected to '$NEW_URL'."
890 STATUS="OK"
891 let OK_LINKS+=1
892 let SKIP_SLASH_ADD+=1
893 elif [ $SHOW_YT_RD -eq 0 ] && [ $YOUTU_BE -eq 1 ]; then
894 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'."
895 STATUS="OK"
896 let OK_LINKS+=1
897 let SKIP_YOUTU_BE+=1
898 else
899 STATUS="RD"
900 let RD_LINKS+=1
901 fi
902 break
903 fi
904 done
905 fi
906
907 # If we didn't get a match with the "RD" codes, check it against the "NG" codes
908 if [ $STATUS == "??" ]; then
909 for CODE in "${NG_CODES[@]}"; do
910 if [[ $CODE == $CURL_CODE ]]; then
911 STATUS="NG"
912 let NG_LINKS+=1
913 break
914 fi
915 done
916 fi
917
918 # If we didn't match a known status code, advise the reader
919 if [ $STATUS == "??" ]; then
920 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown response code $CURL_CODE."
921 let SKIP_UNK_CODE+=1
922 continue
923 fi
924
925 # Check problem links against exceptions file before proceeding
926 if [ $STATUS != "OK" ] && [ ! -z $EXCEPT_URL ]; then
927 # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
928 EXPECT_CODE="$CURL_RESULT"
929 if [ $STATUS == "EI" ]; then
930 EXPECT_CODE="EI"
931 elif [ $STATUS == "IW" ]; then
932 EXPECT_CODE="IW"
933 fi
934
935 # Look for link in exceptions file and make sure its listed result code and wiki page also match
936 GREP_RESULT=$(grep --max-count=1 "$URL" "$EXCEPT_FILE")
937 EXCEPT_PAGE=${GREP_RESULT##*,}
938 if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == $LOCAL_PAGE_PATH ]; then
939 EXCEPT_CODE=${GREP_RESULT%%,*}
940 if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
941 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because its expected result, $EXPECT_CODE, is listed in the exceptions file."
942 if [ $STATUS == "EI" ]; then
943 let SKIP_EXPECT_EI+=1
944 elif [ $STATUS == "IW" ]; then
945 let SKIP_EXPECT_IW+=1
946 else
947 let SKIP_EXPECT_NG+=1
948 fi
949 continue
950 fi
951 fi
952 fi
953
954 # If appropriate, record this link to the log, with clickable URLs when possible
955 if [ $STATUS != "OK" ] || [ $RECORD_OK_LINKS -eq 1 ]; then
956 # Prepare 'curl' result in parentheses to print after status code, unless this is an "IW" or "EI"
957 # link, in which case showing the status code doesn't make sense. Adjust spacing after string to
958 # ensure TXT and RTF reports have aligned columns of results.
959 CURL_STR_H=" ($CURL_RESULT)"
960 CURL_STR_T="$CURL_STR_H"
961 CURL_STR_R="$CURL_STR_H "
962 if [ $STATUS == "IW" ] || [ $STATUS == "EI" ]; then
963 CURL_STR_H=""
964 CURL_STR_T=" "
965 CURL_STR_R=" "
966 fi
967
968 # Record link and its wiki page in TXT, RTF, and HTML markup
969 valPrint t "${STATUS}${CURL_STR_T} $STR_TYPE $URL"
970 valPrint t " linked from $FULL_PAGE_PATH"
971 valPrint r "${STATUS}${CURL_STR_R}${RTF_TABS}$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
972 valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
973 valPrint hn "<tr><td style=\"white-space:nowrap\">${STATUS}${CURL_STR_H}</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
974 valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"
975
976 # Place vertical space here since we won't be printing anything more about this link
977 if [ $STATUS == "OK" ]; then valPrint tr ""; valPrint hs ""; fi
978
979 # Record redirect URL if one was given by a 3xx response page
980 if [ $STATUS == "RD" ]; then
981 valPrint ts " Server suggests $NEW_URL"
982 valPrint rs " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
983 valPrint hs "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
984 fi
985
986 # Notify reader if we can use an intrawiki link for this URL
987 if [ $STATUS == "EI" ]; then
988 INTRA_PAGE=${URL#*://*/}
989 valPrint ts " Just use [[$INTRA_PAGE]]"
990 valPrint rs " Just use [[$INTRA_PAGE]]"
991 valPrint hs "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
992 fi
993
994 # Notify reader if we can use an interwiki prefix for this URL
995 if [ $STATUS == "IW" ]; then
996 INTER_PAGE=$(echo "$URL" | sed 's/.*\///')
997 valPrint ts " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
998 valPrint rs " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
999 valPrint hs "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
1000 fi
1001
1002 # Query Internet Archive for latest "OK" snapshot for "NG" page
1003 if [ $STATUS == "NG" ] && [ $SUGGEST_SNAPSHOTS -eq 1 ]; then
1004 ARCHIVE_QUERY=$(curl --silent --max-time 10 "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")
1005
1006 # If a "closest" snapshot was received...
1007 if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
1008 # In case the URL has a hashbang ("#!") in it (like mega.nz links do), escape the '!' to break it
1009 ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')
1010
1011 # ...isolate "url" property in the response that follows the "closest" tag
1012 SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
1013 SNAPSHOT_URL=${SNAPSHOT_URL#*\"url\": \"} # everything after '"url": "'
1014 SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'
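# These three expansions peel the JSON apart with plain parameter expansion: keep everything after '"closest":', then
# everything after '"url": "', then cut at the next double quote, which should leave just the snapshot URL (assuming
# the Availability API keeps this response layout).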
1015
1016 # Remove the port 80 part that IA often adds to the URL, as it's superfluous
1017 SNAPSHOT_URL=$(echo $SNAPSHOT_URL | sed 's/:80//')
1018
1019 # Inform the user of the snapshot URL
1020 valPrint ts " IA suggests $SNAPSHOT_URL"
1021 valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
1022 valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
1023 else # ...otherwise give generic Wayback Machine link for this URL
1024 valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
1025 valPrint rs " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
1026 valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
1027 fi
1028 fi
1029 fi
1030
1031 # If this is a page and it seems to be there, take a screenshot of it using headless Google Chrome
1032 if [ $IS_FILE -eq 0 ] && [ $TAKE_PAGE_SHOT -eq 1 ] && [ $STATUS == "OK" ]; then
1033 # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
1034 SHOT_NAME=$(echo "$URL" | sed 's/https*\:\/\///' | sed 'y/:\//__/')
1035 SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"
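# For example (illustrative URL), "https://example.com/foo/bar:baz" becomes the file name "example.com_foo_bar_baz.png"
# inside the Screenshots folder.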
1036
1037 # Don't take screenshot if we already encountered this page and screenshotted it
1038 if [ ! -f "$SHOT_FILE" ]; then
1039 "$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 $URL > /dev/null 2>&1
1040 if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
1041 mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
1042 else
1043 valPrint trhs "Screenshot of URL $URL seems to have failed!"
1044 fi
1045 else
1046 valPrint trhs "Skipping screenshot of URL '$URL' because file '$SHOT_FILE' already exists."
1047 fi
1048 fi
1049done
1050FINISHED_LIST="yes"
1051wrapupAndExit