source: Validate External Links/validate_external_links.sh@1138

Last change on this file since 1138 was 1137, checked in by iritscen, 4 years ago

ValExtLinks: Added '.full' as a recognized page suffix.

File size: 49.3 KB
1#!/bin/bash
2
3# Validate External Links by Iritscen
4# Provided with a list of external links in an expected CSV format, this script validates them. The
5# resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF (for
6# reading as a local file with clickable links), and HTML (for uploading as a web page). Call script
7# with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
8# Recommended rule:
9# |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|
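# For illustration, the links file is expected to begin with the header row "namespace,title,target" and then
# contain one CSV line per link, e.g. the hypothetical row "0,Main_Page,http://example.com/article.htm"
# (namespace ID, then the wiki page's title, then the external URL; see the main loop for how these are parsed).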
10
11# Set separator token to newline
12IFS="
13"
14
15### GLOBALS ###
16# Settings -- these will be changed from their defaults by the arguments passed in to the script
17LINKS_URL="" # use 'curl' to download file with links from this location (can be file://)
18EXCEPT_URL="" # 'curl' will access this wiki page with a list of exceptions for NG results
19OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
20RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES
21SHOW_SLASH=0 # record response code to the log when a slash is added to the end of a URL
22SHOW_HTTPS=0 # record response code to the log when "http" is upgraded to "https"
23SHOW_YT_RD=0 # record response code to the log when a youtu.be URL is expanded to the full URL
24SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
25SKIP_ARCHIVE_LINKS=0 # don't check URLs under the archive.org domain
26TAKE_PAGE_SHOT=0 # take a screenshot of each OK page
27CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature
28URL_START=1 # start at this URL in LINKS_FILE (1 by default)
29URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE
30UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report
31
32# Fixed strings -- see the occurrences of these variables to learn their purpose
33AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36 OPR/69.0.3686.77"
34ARCHIVE_API="http://archive.org/wayback/available"
35ARCHIVE_GENERIC="https://web.archive.org/web/*"
36ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
37CHROME_SCREENSHOT="screenshot.png"
38CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt"
39EXCEPT_FILE_NAME="exceptions.txt"
40EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
41HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt"
42MY_WIKI_PAGE="https://wiki.oni2.net/User:Iritscen"
43THIS_DIR=$(cd $(dirname $0); pwd)
44WORKING_DIR=$(pwd)
45WIKI_PATH="wiki.oni2.net"
46
47# These are parallel arrays of the IDs and names of OniGalore's current namespaces
48declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
49declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")
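# For example, a link recorded under namespace ID 102 belongs to the "OBD" namespace per the arrays above.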
50
51# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
52# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
53declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png py rar tga TRMA txt vbs wav wmv xaf xml zip)
54declare -a HTTP_TLDS_AND_PAGES=(action ars asp aspx cfm cgi com css de full htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x)
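# For example, a URL ending in ".pdf" is treated as a file (no screenshot is attempted), while one ending in
# ".html" is treated as a page, per the two arrays above.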
55
56# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
57# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
58# if you add a new code.
59declare -a OK_CODES=(200 401 405 406 418 501)
60declare -a RD_CODES=(301 302 303 307 308)
61declare -a NG_CODES=(000 400 403 404 410 500 502 503 530)
62
63# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
64# transcluded text, and if the transclusion fails, then the braces show up in the URL
65ILLEGAL_CHARS="{ }"
66
67# The shortest URL possible, used for sanity-checking some URLs: http://a.co
68MIN_URL_LENGTH=11
69
70# These are parallel arrays giving the prefixes that can be used in place of normal external links to
71# some wikis and other sites
72declare -a INTERWIKI_PREFIXES=(commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies wikiversity wikivoyage wikt wp)
73declare -a INTERWIKI_DOMAINS=(commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org wikiversity.org wikivoyage.org wiktionary.org wikipedia.org)
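# For example, an external link into wikipedia.org could instead use the "wp:" interwiki prefix, e.g. the
# hypothetical [[wp:Some_Article]]; this is what the "IW" suggestions later in the report propose.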
74
75# Variables for keeping track of main loop progress and findings
76LINK_NUM=0
77EI_LINKS=0
78IW_LINKS=0
79OK_LINKS=0
80RD_LINKS=0
81NG_LINKS=0
82SKIP_UNK_NS=0
83SKIP_JS_PAGE=0
84SKIP_BAD_URL=0
85SKIP_NON_ASCII=0
86SKIP_UNK_SUFFIX=0
87SKIP_UNK_CODE=0
88SKIP_EXPECT_NG=0
89SKIP_EXPECT_EI=0
90SKIP_EXPECT_IW=0
91SKIP_HTTPS_UP=0
92SKIP_SLASH_ADD=0
93SKIP_YOUTU_BE=0
94SKIP_ARCHIVE_ORG=0
95FILE_LINKS=0
96PAGE_LINKS=0
97SKIPPED_HEADER_ROW=0
98FINISHED_LIST="no"
99START_RUN=0
100END_RUN=0
101
102
103### HELP ###
104# A pseudo-man page. Here is the 80-character rule for the page text:
105# 234567890123456789012345678901234567890123456789012345678901234567890123456789
106function printHelp()
107{
108 cat << EOF
109
110NAME
111 Validate External Links
112
113SYNOPSIS
114 validate_external_links.sh --help
115 validate_external_links.sh --links URL --output DIR [--exceptions URL]
116 [--record-ok-links] [--show-added-slashes] [--show-https-upgrades]
117 [--show-yt-redirects] [--suggest-snapshots] [--skip-archive-links]
118 [--take-screenshots FILE] [--start-url NUM] [--end-url NUM]
119 [--upload FILE]
120
121DESCRIPTION
122 This script parses a list of external links found in the OniGalore wiki
123 (which is dumped by the Oni2.net domain periodically in a particular
124 format), validates them using the Unix tool 'curl', and produces a report
125 of which links were "OK" (responded positively to an HTTP query), which
126 were "RD" (responded with a 3xx redirect code), which could be "IW"
127 (interwiki) links, which are "EI" (external internal) links and could be
128 intrawiki links, and which were "NG" (no good; a negative response to the
129 query). This report can then be automatically uploaded to the location of
130 your choice. The script can also suggest Internet Archive snapshots for
131 "NG" links, and take screenshots of "OK" links for visual verification by
132 the reader that the page in question is the one intended to be displayed.
133
134 You must pass this script the URL at which the list of links is found
135 (--links) and the path where the directory of logs should be created
136 (--output). All other arguments are optional.
137
138OPTIONS
139 --help Show this page.
140 --links URL (required) URL from which to download the CSV
141 file with external links. Note that this URL can
142 be a local file if you supply a file:// path.
143 --output DIR (required) Unix path to directory in which Val
144 should place its reports.
145 --exceptions URL In order to remove links from the report which
146 Val finds an issue with but which you regard as
147 OK, list those desired exceptions on a wiki page.
148 See the sample file "exceptions.pdf" for the
149 required format of the page. Note that this URL
150 can point to a local file if you supply a path
151 beginning with "file://".
152 --record-ok-links Log a link in the report even if its response
153 code is "OK".
154 --show-added-slashes Report on redirects that simply add a '/' to the
155 end of the URL.
156 --show-https-upgrades Report on redirects that simply upgrade a
157 "http://" URL to a "https://" URL.
158 --show-yt-redirects Report on redirects that expand a youtu.be URL.
159 --suggest-snapshots Query the Internet Archive for a possible
160 snapshot URL for each "NG" page.
161 --skip-archive-links Don't check links that are already pointing to
162 a page on the Internet Archive.
163 --take-screenshots FILE Call the Google Chrome binary at this path to
164 take screenshots of each "OK" page.
165 --start-url NUM Start at this link in the links CSV file.
166 --end-url NUM Stop at this link in the links CSV file.
167 --upload FILE Upload report using the credentials and path
168 given in this local text file. See sftp_login.txt
169 for template.
170
171BUGS
172 The script cannot properly parse any line in the external links file
173 which contains a comma in the name of the wiki page containing a link.
174 Commas in the link itself are not an issue.
175EOF
176}
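# A typical invocation (all URLs and paths here are hypothetical) might look like:
# ./validate_external_links.sh --links http://example.com/wiki_extlinks.csv --output ~/val_reports --suggest-snapshots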
177
178
179### SETUP ###
180# If first argument is a help request, or if nothing was passed in at all, print help page and quit
181if [ "$#" -eq 0 ] || [ "$1" == "--help" ]; then
182 printHelp | less
183 exit 0
184fi
185
186# Parse arguments as long as there are more arguments to process
187while (( "$#" )); do
188 case "$1" in
189 --links ) LINKS_URL="$2"; shift 2;;
190 --exceptions ) EXCEPT_URL="$2"; shift 2;;
191 --output ) OUTPUT_DIR="$2"; shift 2;;
192 --record-ok-links ) RECORD_OK_LINKS=1; shift;;
193 --show-added-slashes ) SHOW_SLASH=1; shift;;
194 --show-https-upgrades ) SHOW_HTTPS=1; shift;;
195 --show-yt-redirects ) SHOW_YT_RD=1; shift;;
196 --suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;;
197 --skip-archive-links ) SKIP_ARCHIVE_LINKS=1; shift;;
198 --take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
199 --start-url ) URL_START=$2; shift 2;;
200 --end-url ) URL_LIMIT=$2; shift 2;;
201 --upload ) UPLOAD_INFO=$2; shift 2;;
202 * ) echo "Invalid argument $1 detected. Aborting."; exit 1;;
203 esac
204done
205
206# If the required arguments were not supplied, print help page and quit
207if [ -z $LINKS_URL ] || [ -z $OUTPUT_DIR ]; then
208 echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
209 exit 2
210fi
211
212# If user wants screenshots, make sure path to Chrome was passed in and is valid
213if [ $TAKE_PAGE_SHOT -eq 1 ]; then
214 if [ ! -f "$CHROME_PATH" ]; then
215 echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
216 exit 3
217 fi
218fi
219
220# Check that UPLOAD_INFO exists, if this argument was supplied
221if [ ! -z $UPLOAD_INFO ] && [ ! -f "$UPLOAD_INFO" ]; then
222 echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
223 exit 4
224fi
225
226# Check that OUTPUT_DIR is a directory
227if [ ! -d "$OUTPUT_DIR" ]; then
228 echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
229 exit 5
230fi
231
232# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
233SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
234NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
235OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
236OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
237SHOT_PATH="$OUTPUT_PATH/Screenshots"
238LOG_NAME="ValExtLinks report"
239LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt"
240LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf"
241LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm"
242mkdir "$OUTPUT_PATH"
243if [ $TAKE_PAGE_SHOT -eq 1 ]; then
244 mkdir "$SHOT_PATH"
245fi
246
247# Check that 'mkdir' succeeded
248if [ ! -d "$OUTPUT_PATH" ]; then
249 echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_PATH. Aborting."
250 exit 6
251fi
252
253# Get date on the file at LINKS_URL and print to log
254LINKS_DATE=$(curl --silent --head $LINKS_URL | grep "Last-Modified")
255if [ -z "$LINKS_DATE" ]; then
256 echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
257 exit 7
258fi
259LINKS_DATE=${LINKS_DATE#Last-Modified: }
260
261
262### UTILITY FUNCTIONS ###
263# Writes a plain-text header to TXT log file
264function printTXTheader()
265{
266 valPrint t "Validate External Links report"
267 valPrint t "generated $NICE_TIME"
268 valPrint t "from data of $LINKS_DATE"
269 valPrint t "script by Iritscen (contact: $MY_WIKI_PAGE)"
270 valPrint t ""
271}
272
273# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified
274function printRTFheader()
275{
276 valPrint r "{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
277{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
278{\colortbl;\red255\green255\blue255;}
279{\*\expandedcolortbl;;}
280\margl1440\margr1440\vieww12600\viewh12100\viewkind0
281\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0
282
283\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
284generated $NICE_TIME\\
285from data of $LINKS_DATE\\
286script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$MY_WIKI_PAGE\"}}{\fldrslt contact}})
287\\
288\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
289\cf0 "
290}
291
292# Closes the RTF markup of the RTF log file
293function printRTFfooter()
294{
295 valPrint r "}"
296}
297
298# Writes the HTML header to HTML log file
299function printHTMheader()
300{
301 valPrint h "<html>
302<head>
303<title>Validate External Links report</title>
304</head>
305<body>
306<h2>Validate External Links report</h2>
307<h3>generated $NICE_TIME<br />
308from data of $LINKS_DATE<br />
309script by Iritscen (<a href=\"$MY_WIKI_PAGE\" target=\"_blank\">contact</a>)</h3>"
310}
311
312# Closes the HTML markup of the HTML log file
313function printHTMfooter()
314{
315 valPrint h "</body>
316</html>"
317}
318
319# The central logging function. The first parameter is a string composed of one or more characters that indicate
320# which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and 'h' means the HTML log.
321# 'n' means "Don't print a newline at the end of the line." 's' means "Print an extra newline at the end."
322# 'w' means "Don't pass console output through 'fmt'" ("fmt" fits the output to an 80-column CLI but can break special formatting and the 'n' option).
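# For instance, 'valPrint ctrh "Hello"' writes "Hello" to the console (wrapped by 'fmt'), to the TXT log,
# to the RTF log (with a trailing \line), and to the HTML log (with a trailing <br />).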
323function valPrint()
324{
325 if [[ "$1" == *c* ]]; then
326 if [[ "$1" == *n* ]]; then
327 echo -n "$2"
328 elif [[ "$1" == *w* ]]; then
329 echo "$2"
330 elif [[ "$1" == *s* ]]; then
331 echo -e "$2\n"
332 else
333 echo "$2" | fmt -w 80
334 fi
335 fi
336 if [[ "$1" == *t* ]]; then
337 if [[ "$1" == *n* ]]; then
338 echo -n "$2" >> "$LOG_TXT"
339 elif [[ "$1" == *s* ]]; then
340 echo -e "$2\n" >> "$LOG_TXT"
341 else
342 echo "$2" >> "$LOG_TXT"
343 fi
344 fi
345 if [[ "$1" == *r* ]]; then
346 if [[ "$1" == *n* ]]; then
347 echo "$2" >> "$LOG_RTF"
348 elif [[ "$1" == *s* ]]; then
349 echo "$2\line\line" >> "$LOG_RTF"
350 else
351 echo "$2\line" >> "$LOG_RTF"
352 fi
353 fi
354 if [[ "$1" == *h* ]]; then
355 if [[ "$1" == *s* ]]; then
356 echo "$2<tr><td>&nbsp;</td></tr>" >> "$LOG_HTM"
357 elif [[ "$1" == *n* ]]; then
358 echo "$2" >> "$LOG_HTM"
359 else
360 echo "$2<br />" >> "$LOG_HTM"
361 fi
362 fi
363}
364
365# Pluralize the string in parameter 1 if the number in parameter 2 is not 1
366function pluralCheckNoun()
367{
368 if [ $2 -ne 1 ]; then
369 if [[ $1 =~ x$ ]]; then
370 echo $1es
371 else
372 echo $1s
373 fi
374 else
375 echo $1
376 fi
377}
378
379# Output "is" if parameter 1 is 1, otherwise "are"
380function pluralCheckIs()
381{
382 if [ $1 -ne 1 ]; then
383 echo "are"
384 else
385 echo "is"
386 fi
387}
388
389# Output "was" if parameter 1 is 1, otherwise "were"
390function pluralCheckWas()
391{
392 if [ $1 -ne 1 ]; then
393 echo "were"
394 else
395 echo "was"
396 fi
397}
398
399# Output "a " if parameter 1 is 1, otherwise nothing
400function pluralCheckA()
401{
402 if [ $1 -eq 1 ]; then
403 echo "a "
404 fi
405}
406
407# Output "an " if parameter 1 is 1, otherwise nothing
408function pluralCheckAn()
409{
410 if [ $1 -eq 1 ]; then
411 echo "an "
412 fi
413}
414
415# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
416# reports being saved to disk have already been closed.
417function uploadReport()
418{
419 valPrint c "Uploading HTML report..."
420
421 SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
422 SFTP_USER_NAME_MARKER="user:"
423 SFTP_PASSWORD_MARKER="pw:"
424 SFTP_PORT_MARKER="port:"
425 SFTP_PATH_MARKER="path:"
426 SFTP_USER_NAME=$(grep $SFTP_USER_NAME_MARKER $UPLOAD_INFO)
427 SFTP_USER_NAME=${SFTP_USER_NAME#$SFTP_USER_NAME_MARKER}
428 SFTP_PASSWORD=$(grep $SFTP_PASSWORD_MARKER $UPLOAD_INFO)
429 SFTP_PASSWORD=${SFTP_PASSWORD#$SFTP_PASSWORD_MARKER}
430 SFTP_PORT=$(grep $SFTP_PORT_MARKER $UPLOAD_INFO)
431 SFTP_PORT=${SFTP_PORT#$SFTP_PORT_MARKER}
432 SFTP_PATH=$(grep $SFTP_PATH_MARKER $UPLOAD_INFO)
433 SFTP_PATH=${SFTP_PATH#$SFTP_PATH_MARKER}
434
435 expect "$SCRIPT_PATH" "$LOG_HTM" $SFTP_USER_NAME $SFTP_PASSWORD $SFTP_PORT $SFTP_PATH "$LOG_NAME.htm"
436
437 valPrint c "Report was uploaded, unless an error message appears above."
438}
439
440# Prints session summary when script is done
441function wrapupAndExit()
442{
443 # Get off progress line on console, drop down a line from last link in log, and close HTML table
444 valPrint ctr ""
445 valPrint h "</table><br />"
446
447 # If we didn't finish processing the last URL, then the iterator is one too high
448 if [ $FINISHED_LIST != "yes" ]; then
449 let LINK_NUM-=1
450 if [ $FINISHED_LIST == "no" ]; then
451 valPrint ctrh "The session was canceled by the user."
452 fi
453 fi
454
455 # Generate string with elapsed time
456 END_RUN=$(date +%s)
457 ELAPSED=$(echo $(($END_RUN - $START_RUN)) | awk '{printf "%d min. %d sec. elapsed", int($1/60), int($1%60)}')
458
459 # Do some math on results of session
460 LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
461 LINK_PROBLEMS=$((EI_LINKS+IW_LINKS+RD_LINKS+NG_LINKS))
462 LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
463 LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
464 TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
465 LINKS_CHECKED=$((LINKS_PROCESSED-LINK_ERRORS))
466
467 # Print summary header
468 valPrint ct "Summary ($ELAPSED):"
469 valPrint r "\b1 Summary \b0 ($ELAPSED)"
470 valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
471 valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT) (there $(pluralCheckWas $FILE_LINKS) $FILE_LINKS file $(pluralCheckNoun link $FILE_LINKS) and $PAGE_LINKS page $(pluralCheckNoun link $PAGE_LINKS))."
472
473 # Print processed link totals
474 if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
475 if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
476 if [ $SKIP_ARCHIVE_ORG -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVE_ORG Archive.org $(pluralCheckNoun link $SKIP_ARCHIVE_ORG) were not checked"; fi
477 if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS processed $(pluralCheckNoun link $LINK_PROBLEMS) had $(pluralCheckAn $LINK_PROBLEMS)$(pluralCheckNoun issue $LINK_PROBLEMS)"; fi
478 if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; valPrint h "&nbsp;&nbsp;(excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
479 if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi
480 if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h "&nbsp;&nbsp;(counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi
481
482 # Print excepted link totals
483 if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi
484 if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
485 if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
486 if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi
487
488 # Print errored link totals
489 if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"; fi
490 if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
491 if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
492 if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
493 if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
494 if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
495 if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
496
497 # Print checked link totals
498 if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS link $(pluralCheckNoun issue $LINK_PROBLEMS):"; fi
499 if [ $NG_LINKS -gt 0 ]; then valPrint ctrh "- $NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
500 if [ $RD_LINKS -gt 0 ]; then valPrint ctrh "- $RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
501 if [ $EI_LINKS -gt 0 ]; then valPrint ctrh "- $EI_LINKS $(pluralCheckNoun link $EI_LINKS) that could be intrawiki"; fi
502 if [ $IW_LINKS -gt 0 ]; then valPrint ctrh "- $IW_LINKS $(pluralCheckNoun link $IW_LINKS) that could be interwiki"; fi
503
504 # Close the log files' markup
505 valPrint trh "ValExtLinks says goodbye."
506 printRTFfooter
507 printHTMfooter
508
509 # Upload report if this was requested
510 if [ ! -z $UPLOAD_INFO ] && [ $FINISHED_LIST != "no" ]; then
511 uploadReport
512 fi
513
514 # Really quit now
515 valPrint c "ValExtLinks says goodbye."
516 exit 0
517}
518trap wrapupAndExit INT
519
520
521### INITIALIZATION ###
522# Print opening message to console and log files
523valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
524printTXTheader
525printRTFheader
526printHTMheader
527
528# Attempt to download file at LINKS_URL, then check that it succeeded
529valPrint t "Config:"
530valPrint r "\b1 Config \b0"
531valPrint hn "<h3>Config</h3>"
532valPrint cwtrh "Downloading list of external links from $LINKS_URL."
533LINKS_FILE_NAME=$(echo "$LINKS_URL" | sed 's/.*\///')
534LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
535curl --silent -o "$LINKS_FILE" $LINKS_URL
536if [ ! -f "$LINKS_FILE" ]; then
537 echo "The download of $LINKS_URL appears to have failed. Aborting."
538 wrapupAndExit
539fi
540
541# Attempt to download file at EXCEPT_URL, then check that it succeeded
542if [ ! -z $EXCEPT_URL ]; then
543 valPrint cwtrh "Downloading list of reporting exceptions from $EXCEPT_URL."
544 EXCEPT_DATA=$(curl --silent $EXCEPT_URL)
545 if [ -z "$EXCEPT_DATA" ]; then
546 echo "The download of the exceptions data from '$EXCEPT_URL' appears to have failed. Aborting."
547 wrapupAndExit
548 fi
549 EXCEPT_DATA=${EXCEPT_DATA%END LIST*}
550 EXCEPT_DATA=${EXCEPT_DATA#*BEGIN LIST}
551 EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
552
553 # Store on disk for debugging purposes
554 echo "$EXCEPT_DATA" > "$EXCEPT_FILE"
555
556 # Transfer to array for easy searching later
557 declare -a EXCEPT_ARRAY=($(echo "$EXCEPT_DATA"))
558fi
559
560# Pipe 'cat' to 'wc' because passing LINKS_FILE to 'wc' would print the file name after the line count
561LINK_COUNT_STRING=$(cat "$LINKS_FILE" | wc -l)
562
563# Number of URLs is number of lines minus one (first line is column header row for the CSV)
564LINK_COUNT=$(echo "${LINK_COUNT_STRING}" | tr -d '[:space:]')
565let LINK_COUNT-=1
566
567# Calculate number of URLs to consider
568if [ $URL_LIMIT -ne 0 ] && [ $URL_LIMIT -lt $LINK_COUNT ]; then
569 valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((URL_LIMIT-URL_START+1)) of them, from link $URL_START to $URL_LIMIT."
570elif [ $URL_START -ne 1 ]; then
571 valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((LINK_COUNT-URL_START+1)) of them, from link $URL_START to $LINK_COUNT."
572else
573 valPrint ctrh "Found $LINK_COUNT links to process. I will be considering all of them."
574fi
575
576# Print settings to console and log
577declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not report links that are in the exceptions list." "I will ignore URLs that simply have ending slashes added onto them." "I will ignore URLs that only upgrade from HTTP to HTTPS." "I will ignore youtu.be links that are merely being expanded." "I will not check the validity of Internet Archive snapshot URLs.")
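# The indices used below refer to whitespace-delimited elements of the array above, with each quoted phrase
# counting as a single element; e.g. element 10 is "and will" and element 22 is "also".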
578if [ $TAKE_PAGE_SHOT -eq 0 ]; then SETTINGS_MSG[10]="but will not"; fi
579if [ $RECORD_OK_LINKS -eq 0 ]; then SETTINGS_MSG[22]="not"; fi
580if [ $SUGGEST_SNAPSHOTS -eq 0 ]; then SETTINGS_MSG[26]="will not"; fi
581if [ -z $EXCEPT_URL ] || [ $RECORD_OK_LINKS -eq 1 ]; then SETTINGS_MSG[40]=""; fi
582if [ $SHOW_SLASH -eq 1 ]; then SETTINGS_MSG[41]=""; fi
583if [ $SHOW_HTTPS -eq 1 ]; then SETTINGS_MSG[42]=""; fi
584if [ $SHOW_YT_RD -eq 1 ]; then SETTINGS_MSG[43]=""; fi
585if [ $SKIP_ARCHIVE_LINKS -eq 0 ]; then SETTINGS_MSG[44]=""; fi
586SETTINGS_STR=${SETTINGS_MSG[@]}
587valPrint ctrh "$SETTINGS_STR"
588valPrint tr "A summary of my findings will be found at the bottom of the report."
589valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
590valPrint trh ""
591
592# Print legend to logs
593valPrint t "Legend:"
594valPrint r "\b1 Legend \b0"
595valPrint hn "<h3>Legend</h3>"
596valPrint trh "OK = URL seems to be working."
597valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it, because false negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to the script's author (see top of report). An NG link should be followed by a link to the Internet Archive's Wayback Machine which may help you repair the link, unless the Archive does not have any snapshots of the site. If the link cannot be repaired, you can delete it from the wiki page, or, if this would disrupt the surrounding material on the page, disable the link by wrapping the URL in nowiki tags."
598valPrint trh "RD = The server responding to this URL is saying that the page moved and you should instead use the supplied new URL. Some RD links represent minor adjustments in the organization of a web site, and some are soft 404s (the file/page has been removed and you are being redirected to something like the main page of the web site). You will have to look at the new URL yourself to determine if it represents an OK link and the link on the wiki should be updated to this one, or if the desired file/page is actually gone and we need to replace the wiki link with an Internet Archive snapshot link -- or disable the URL if it has not been archived."
599valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup."
600valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup."
601valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $HTTP_CODES)."
602valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$HTTP_CODES\"}}{\fldrslt here}} for code reference)."
603valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$HTTP_CODES\" target=\"_blank\">here</a> for code reference)."
604valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $CURL_CODES)."
605valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$CURL_CODES\"}}{\fldrslt here}} for code reference)."
606valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$CURL_CODES\" target=\"_blank\">here</a> for code reference)."
607valPrint trh "IA suggests = Last available snapshot suggested by the Internet Archive."
608valPrint trh "Try browsing = The Archive occasionally fails to return a snapshot URL even when one exists, so you will need to check for a snapshot manually using this link to the Wayback Machine before concluding that a site has not been archived."
609valPrint trh ""
610
611
612### MAIN LOOP ###
613valPrint t "Links:"
614valPrint r "\b1 Links \b0"
615valPrint hn "<h3>Links</h3>"
616START_RUN=$(date +%s)
617# Process each line of the .csv in LINKS_FILE
618for LINE in `cat "$LINKS_FILE"`; do
619 let LINK_NUM+=1
620
621 # First line is the column header row for the CSV, so let's verify that the format hasn't changed
622 if [ $SKIPPED_HEADER_ROW -eq 0 ]; then
623 if [ $LINE == "namespace,title,target" ]; then
624 SKIPPED_HEADER_ROW=1
625 LINK_NUM=0 # this line is the header, not a link, so reset the link counter
626 valPrint hn "<table>"
627 continue
628 else
629 valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
630 wrapupAndExit
631 fi
632 fi
633
634 # Skip this link if we are not at URL_START yet
635 if [ $LINK_NUM -lt $URL_START ]; then
636 continue
637 fi
638
639 # Stop if we are at the limit declared for testing purposes
640 if [ $URL_LIMIT -gt 0 ] && [ $LINK_NUM -gt $URL_LIMIT ]; then
641 FINISHED_LIST="limit"
642 wrapupAndExit
643 fi
644
645 # Print progress to screen
646 if [ $LINK_NUM -gt 1 ]; then
647 printf "\e[1A\n" # erase previous progress message so that new one appears in its place
648 fi
649 valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
650
651 # The number of the namespace is the element before the first comma on the line
652 NS_ID=${LINE%%,*}
653
654 # Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
655 NS_NAME=""
656 a=0
657 while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
658 if [ $NS_ID == "NULL" ]; then
659 break
660 elif [ $NS_ID -eq ${NS_IDS[$a]} ]; then
661 NS_NAME="${NS_NAMES[$a]}"
662 break
663 fi
664 let a+=1
665 done
666 if [ "$NS_NAME" == "" ]; then
667 if [ $NS_ID == "NULL" ]; then
668 valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace (and probably the page too) is 'NULL'. Probably the link is no longer in existence on the wiki."
669 else
670 valPrint trs "Skipping line $LINK_NUM ('$LINE') because I could not find a name for namespace ID $NS_ID."
671 fi
672 let SKIP_UNK_NS+=1
673 continue
674 fi
675
676 # The name of the page is everything between the namespace ID and the next comma on the line (commas
677 # in page names will break this)
678 PAGE_NAME=${LINE#$NS_ID,}
679 PAGE_NAME=${PAGE_NAME%%,*}
680
681 # We don't want to consider wiki pages ending in .js, as the MW parser cannot reliably isolate URLs
682 # in JavaScript code, so it returns erroneous links
683 PAGE_NAME_SUFFIX=$(echo $PAGE_NAME | sed 's/.*\.//')
684 if [ $PAGE_NAME_SUFFIX == "js" ]; then
685 valPrint trs "Skipping URL '${LINE#$NS_ID,$PAGE_NAME,}' on line $LINK_NUM because it was found on JavaScript page '$PAGE_NAME'."
686 let SKIP_JS_PAGE+=1
687 continue
688 fi
689
690 # Build longer wiki page URLs from namespace and page names
691 FULL_PAGE_PATH=https://$WIKI_PATH/$NS_NAME:$PAGE_NAME
692 LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
693 # Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it
694 # explicitly breaks the link
695 if [ $NS_ID -eq 0 ]; then
696 FULL_PAGE_PATH=https://$WIKI_PATH/$PAGE_NAME
697 LOCAL_PAGE_PATH=$PAGE_NAME
698 fi
699
700 # The URL being linked to is everything after the previous two fields (this allows commas to be in
701 # the URLs, but a comma in the previous field, the page name, will break this)
702 URL=${LINE#$NS_ID,$PAGE_NAME,}
703
704 # Scan for illegal characters
705 if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
706 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because it contains characters illegal in a URL."
707 let SKIP_BAD_URL+=1
708 continue
709 fi
710
711 # If we're skipping Archive.org links, check if this is one
712 if [ $SKIP_ARCHIVE_LINKS -eq 1 ] && [[ $URL == *web.archive.org* ]]; then
713 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have been asked not to check Wayback Machine links."
714 let SKIP_ARCHIVE_ORG+=1
715 continue
716 fi
717
718 # Now we need to know if the URL is for a file or a web page. First step is to determine if the
719 # URL ends in a suffix
720 HAS_SUFFIX=0
721
722 # If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
723 CLEAN_URL=${URL%%\?*}
724
725 # If the URL ends in something like "#section_15", strip everything from the '#' onward
726 CLEAN_URL=${CLEAN_URL%%\#*}
727
728 # 'sed' cannot handle Unicode in my Bash shell, so skip non-ASCII URLs and make the user check them
729 if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
730 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I cannot handle non-ASCII characters."
731 let SKIP_NON_ASCII+=1
732 continue
733 fi
734
735 # Isolate the characters after the last period and after the last slash
736 POST_DOT=$(echo "$CLEAN_URL" | sed 's/.*\.//')
737 POST_SLASH=$(echo "$CLEAN_URL" | sed 's/.*\///')
738
739 # If the last period comes after the last slash, then the URL ends in a suffix
740 POST_DOT_LENGTH=$(echo | awk -v input=$POST_DOT '{print length(input)}')
741 POST_SLASH_LENGTH=$(echo | awk -v input=$POST_SLASH '{print length(input)}')
742 if [ $POST_DOT_LENGTH -lt $POST_SLASH_LENGTH ]; then
743 HAS_SUFFIX=1
744 else
745 HAS_SUFFIX=0
746 fi
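# For illustration (hypothetical URLs): "http://example.com/docs/file.pdf" gives POST_DOT "pdf" (3) and
# POST_SLASH "file.pdf" (8), so HAS_SUFFIX is 1, while "http://example.com/r1.5/downloads" gives POST_DOT
# "5/downloads" (11) and POST_SLASH "downloads" (9), so HAS_SUFFIX stays 0.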
747
748 # Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
749 # known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES
750 IS_FILE=-1
751 if [ $HAS_SUFFIX -eq 0 ]; then
752 IS_FILE=0
753 else
754 # Turn off case sensitivity while we compare suffixes
755 shopt -s nocasematch
756
757 # Special case: URLs ending in something like "productID.304297400" are pages, not files, so if
758 # the URL's suffix is all numbers, we are looking at the end of a web page URL
759 if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
760 IS_FILE=0
761 fi
762
763 # Special case: URLs ending in parentheses, e.g. "ms537113(v=vs.85)", are pages
764 if [[ $POST_DOT =~ ^.*[\(\)]$ ]]; then
765 IS_FILE=0
766 fi
767
768 # Special case: URLs containing a '%', e.g. "10.1007%2FBF00329055", are pages
769 if [[ $POST_DOT == *%* ]]; then
770 IS_FILE=0
771 fi
772
773 # If we did not identify this URL as a web page above, we need to compare the suffix against known
774 # file extensions
775 if [ $IS_FILE -eq -1 ]; then
776 for EXTENSION in "${HTTP_FILES[@]}"; do
777 if [[ $EXTENSION == $POST_DOT ]]; then # double brackets respect the nocasematch setting
778 IS_FILE=1
779 break
780 fi
781 done
782 fi
783
784 # If we did not identify this URL as a file above, we need to compare the suffix against known
785 # pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
786 # needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
787 if [ $IS_FILE -eq -1 ]; then
788 for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
789 if [[ $PAGE_OR_TLD == $POST_DOT ]]; then
790 IS_FILE=0
791 break
792 fi
793 done
794 fi
795
796 # Turn case sensitivity back on in Bash
797 shopt -u nocasematch
798 fi
799
800 # If this suffix escaped identification as either a file, page or TLD, inform the user
801 STR_TYPE=""
802 if [ $IS_FILE -eq -1 ]; then
803 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown URL ending '$POST_DOT'. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
804 let SKIP_UNK_SUFFIX+=1
805 continue
806 elif [ $IS_FILE -eq 1 ]; then
807 STR_TYPE="file"
808 let FILE_LINKS+=1
809 elif [ $IS_FILE -eq 0 ]; then
810 STR_TYPE="page"
811 let PAGE_LINKS+=1
812 fi
813
814 # Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
815 # issue with sites that require HTTPS
816 CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{http_code}\n' $URL)
817 CURL_ERR=$(echo $?)
818 CURL_RESULT=$CURL_CODE
819
820 # Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
821 if [ $CURL_CODE == "000" ]; then
822 CURL_RESULT="$CURL_RESULT-$CURL_ERR"
823 fi
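# For instance, a reachable page typically yields a CURL_RESULT like "200", while an unresolvable host yields
# something like "000-6" ('curl' exit code 6).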
824
825 # Begin to determine our status code for this URL (EI, IW, OK, RD, or NG)
826 STATUS="??"
827 NEW_URL=""
828 INTERWIKI_INDEX=-1
829
830 # First make sure that this isn't an "external internal" link to our own wiki that can be replaced
831 # by "[[page_name]]". If it uses a special access URL beginning with "/w/", let it pass, as it
832 # probably cannot be replaced by "[[ ]]" markup
833 if [[ $URL == *$WIKI_PATH* ]] && [[ $URL != *$WIKI_PATH/w/* ]]; then
834 STATUS="EI"
835 let EI_LINKS+=1
836 fi
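# For illustration (hypothetical page): "https://wiki.oni2.net/Some_Page" would be flagged "EI" and the report
# would suggest replacing it with [[Some_Page]], whereas a special access URL under "/w/" is left alone.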
837
838 # If it's not, check if this is a link to a domain that we have an interwiki prefix for
839 if [ $STATUS == "??" ]; then
840 for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
841 if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]] && [[ $URL != *${INTERWIKI_DOMAINS[$i]}/w/* ]]; then
842 STATUS="IW"
843 let IW_LINKS+=1
844 INTERWIKI_INDEX=$i
845 break
846 fi
847 done
848 fi
849
850 # If we didn't match an interwiki domain, see if the status code is in our "OK" codes list
851 if [ $STATUS == "??" ]; then
852 for CODE in "${OK_CODES[@]}"; do
853 if [[ $CODE == $CURL_CODE ]]; then
854 STATUS="OK"
855 let OK_LINKS+=1
856 break
857 fi
858 done
859 fi
860
861 # If we didn't get a match with the "OK" codes, check it against the "RD" codes
862 if [ $STATUS == "??" ]; then
863 for CODE in "${RD_CODES[@]}"; do
864 if [[ $CODE == $CURL_CODE ]]; then
865 # Get URL header again in order to retrieve the URL we are being redirected to
866 NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{redirect_url}\n' $URL)
867
868 # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
869 # those changes out if the user didn't ask for them
870 URL_HTTP=$(echo $URL | sed -E 's/^https:/http:/')
871 NEW_URL_HTTP=$(echo $NEW_URL | sed -E 's/^https:/http:/')
872
873 # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
874 NEW_URL_LENGTH=$(echo | awk -v input=$NEW_URL_HTTP '{print length(input)}')
875 if [ $NEW_URL_LENGTH -lt $MIN_URL_LENGTH ]; then
876 NEW_URL_HTTP="[new URL not retrieved]"
877 fi
878
879 # Remove slash at end of new URL, if present, so we can filter out the redirects that
880 # merely add an ending slash if the user didn't ask for them
881 NEW_URL_NO_SLASH=$(echo $NEW_URL_HTTP | sed -E 's:/$::')
882
883 # Detect if this is a youtu.be link simply being expanded by YouTube to the full
884 # youtube.com address
885 YOUTU_BE=0
886 if [[ $URL_HTTP == http://youtu.be* ]] && [[ $NEW_URL_HTTP == http://www.youtube.com* ]]; then
887 YOUTU_BE=1
888 fi
889
890 # If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
891 # wants those to be reported)
892 if [ $SHOW_HTTPS -eq 0 ] && [ $URL_HTTP == $NEW_URL_HTTP ]; then
893 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show http->https upgrades, and I was redirected to '$NEW_URL'."
894 STATUS="OK"
895 let OK_LINKS+=1
896 let SKIP_HTTPS_UP+=1
897 # If the URLs match besides an added ending slash, then the link is OK (unless user wants
898 # those to be reported)
899 elif [ $SHOW_SLASH -eq 0 ] && [ $URL_HTTP == $NEW_URL_NO_SLASH ]; then
900 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show added trailing slashes, and I was redirected to '$NEW_URL'."
901 STATUS="OK"
902 let OK_LINKS+=1
903 let SKIP_SLASH_ADD+=1
904 elif [ $SHOW_YT_RD -eq 0 ] && [ $YOUTU_BE -eq 1 ]; then
905 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'."
906 STATUS="OK"
907 let OK_LINKS+=1
908 let SKIP_YOUTU_BE+=1
909 else
910 STATUS="RD"
911 let RD_LINKS+=1
912 fi
913 break
914 fi
915 done
916 fi
917
918 # If we didn't get a match with the "RD" codes, check it against the "NG" codes
919 if [ $STATUS == "??" ]; then
920 for CODE in "${NG_CODES[@]}"; do
921 if [[ $CODE == $CURL_CODE ]]; then
922 STATUS="NG"
923 let NG_LINKS+=1
924 break
925 fi
926 done
927 fi
928
929 # If we didn't match a known status code, advise the reader
930 if [ $STATUS == "??" ]; then
931 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown response code $CURL_CODE."
932 let SKIP_UNK_CODE+=1
933 continue
934 fi
935
936 # Check problem links against exceptions list before proceeding
937 FOUND_EXCEPT=0
938 if [ $STATUS != "OK" ] && [ ! -z $EXCEPT_URL ]; then
939 # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
940 EXPECT_CODE="$CURL_RESULT"
941 if [ $STATUS == "EI" ]; then
942 EXPECT_CODE="EI"
943 elif [ $STATUS == "IW" ]; then
944 EXPECT_CODE="IW"
945 fi
946
947 # Look for link in exceptions list and make sure the listed result code and wiki page also match
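# Each exception line is expected to take the form "code,URL,page name", e.g. the hypothetical line
# "404,http://example.com/old_page.htm,Main_Page"; a page name of "*" matches any page.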
948 for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
949 {
950 EXCEPT_LINE="${EXCEPT_ARRAY[$i]}"
951
952 # Match URL
953 EXCEPT_TARGET="${EXCEPT_LINE#*,}" # the URL named by this exception line
954 EXCEPT_TARGET="${EXCEPT_TARGET%,*}"
955 if [ "$EXCEPT_TARGET" != "$URL" ]; then
956 continue
957 fi
958
959 # Match containing page's name
960 EXCEPT_PAGE="${EXCEPT_LINE##*,}"
961 EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
962 if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == $LOCAL_PAGE_PATH ]; then
963 # Match result code
964 EXCEPT_CODE=${EXCEPT_LINE%%,*}
965 if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
966 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because its expected result, '$EXPECT_CODE', is in the exceptions list."
967 if [ $STATUS == "EI" ]; then
968 let SKIP_EXPECT_EI+=1
969 elif [ $STATUS == "IW" ]; then
970 let SKIP_EXPECT_IW+=1
971 else
972 let SKIP_EXPECT_NG+=1
973 fi
974 FOUND_EXCEPT=1
975 break
976 fi
977 fi
978 } done
979 fi
980 if [ $FOUND_EXCEPT -eq 1 ]; then
981 continue
982 fi
983
984 # If appropriate, record this link to the log, with clickable URLs when possible
985 if [ $STATUS != "OK" ] || [ $RECORD_OK_LINKS -eq 1 ]; then
986 # Prepare 'curl' result in parentheses to print after status code, unless this is an "IW" or "EI"
987 # link, in which case showing the status code doesn't make sense. Adjust spacing after string to
988 # ensure TXT and RTF reports have aligned columns of results.
989 CURL_STR_H=" ($CURL_RESULT)"
990 CURL_STR_T="$CURL_STR_H"
991 CURL_STR_R="$CURL_STR_H "
992 if [ $STATUS == "IW" ] || [ $STATUS == "EI" ]; then
993 CURL_STR_H=""
994 CURL_STR_T=" "
995 CURL_STR_R=" "
996 fi
997
998 # Record link and its wiki page in TXT, RTF, and HTML markup
999 valPrint t "${STATUS}${CURL_STR_T} $STR_TYPE $URL"
1000 valPrint t " linked from $FULL_PAGE_PATH"
1001 valPrint r "${STATUS}${CURL_STR_R}${RTF_TABS}$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
1002 valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
1003 valPrint hn "<tr><td style=\"white-space:nowrap\">${STATUS}${CURL_STR_H}</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
1004 valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"
1005
1006 # Place vertical space here since we won't be printing anything more about this link
1007 if [ $STATUS == "OK" ]; then valPrint tr ""; valPrint hs ""; fi
1008
1009 # Record redirect URL if one was given by a 3xx response page
1010 if [ $STATUS == "RD" ]; then
1011 valPrint ts " Server suggests $NEW_URL"
1012 valPrint rs " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
1013 valPrint hs "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
1014 fi
1015
1016 # Notify reader if we can use an intrawiki link for this URL
1017 if [ $STATUS == "EI" ]; then
1018 INTRA_PAGE=${URL#*://*/}
1019 valPrint ts " Just use [[$INTRA_PAGE]]"
1020 valPrint rs " Just use [[$INTRA_PAGE]]"
1021 valPrint hs "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
1022 fi
1023
1024 # Notify reader if we can use an interwiki prefix for this URL
1025 if [ $STATUS == "IW" ]; then
1026 INTER_PAGE=$(echo "$URL" | sed 's/.*\///')
1027 valPrint ts " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
1028 valPrint rs " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
1029 valPrint hs "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
1030 fi
1031
1032 # Query Internet Archive for latest "OK" snapshot for "NG" page
1033 if [ $STATUS == "NG" ] && [ $SUGGEST_SNAPSHOTS -eq 1 ]; then
1034 ARCHIVE_QUERY=$(curl --silent --max-time 10 "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")
1035
1036 # If a "closest" snapshot was received...
1037 if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
1038 # In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
1039 ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')
1040
1041 # ...isolate "url" property in the response that follows the "closest" tag
1042 SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
1043 SNAPSHOT_URL=${SNAPSHOT_URL#*\"url\": \"} # everything after '"url": "'
1044 SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'
1045
1046 # Remove the port 80 part that IA often adds to the URL, as it's superfluous
1047 SNAPSHOT_URL=$(echo $SNAPSHOT_URL | sed 's/:80//')
1048
1049 # Inform the user of the snapshot URL
1050 valPrint ts " IA suggests $SNAPSHOT_URL"
1051 valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
1052 valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
1053 else # ...otherwise give generic Wayback Machine link for this URL
1054 valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
1055 valPrint rs " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
1056 valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
1057 fi
1058 fi
1059 fi
1060
1061 # If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
1062 if [ $IS_FILE -eq 0 ] && [ $TAKE_PAGE_SHOT -eq 1 ] && [ $STATUS == "OK" ]; then
1063 # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
1064 SHOT_NAME=$(echo "$URL" | sed 's/https*\:\/\///' | sed 'y/:\//__/')
1065 SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"
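# e.g. the hypothetical URL "https://example.com/foo/bar" would be saved as "example.com_foo_bar.png"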
1066
1067 # Don't take screenshot if we already encountered this page and screenshotted it
1068 if [ ! -f "$SHOT_FILE" ]; then
1069 "$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 $URL > /dev/null 2>&1
1070 if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
1071 mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
1072 else
1073 valPrint trhs "Screenshot of URL $URL seems to have failed!"
1074 fi
1075 else
1076 valPrint trhs "Skipping screenshot of URL '$URL' because file '$SHOT_FILE' already exists."
1077 fi
1078 fi
1079done
1080FINISHED_LIST="yes"
1081wrapupAndExit