Context Navigation

source: Validate External Links/validate_external_links.sh@ 1144

Last change on this file since 1144 was 1144, checked in by iritscen, 4 years ago

ValExtLinks: Changed --skip-archive-links argument to --check-archive-links because the default should be to skip them. Val now uploads all three formats of its report, and links to the RTF and TXT versions from the HTML one. Val can also now tell whether each upload succeeded. A report with no link issues will print a placeholder message in that section of the report. Fixed a bug where Val thought a link should be an interwiki link when it was really a link to an archive.org snapshot from said wiki.

File size: 51.3 KB

Line
1	#!/bin/bash
2
3	# Validate External Links by Iritscen
4	#
5	# Validates a list of external links in CSV format. The resulting logs are produced in three formats:
6	# - TXT (for easy diffing with an earlier log)
7	# - RTF (for reading as a local file with clickable links)
8	# - HTML (for reading as a web page)
9	# Call script with "--help" argument for documentation. Also see Read Me.rtf for critical notes.
10	#
11	# Recommended rule:
12	# \|----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----\|
13	#
14	# Table of contents (sections of script in order of appearance, not execution):
15	# • Globals
16	# • Help Output
17	# • Setup
18	# • Utility Functions
19	# • Summary Output
20	# • Initialization
21	# • Data Sourcing
22	# • Config Output
23	# • Legend Output
24	# • Main Loop
25
# Set separator token to newline, so that unquoted expansions and 'for' loops in this script split
# on line boundaries only (never on spaces or tabs)
IFS="
"

### GLOBALS ###
# Settings -- these will be changed from their defaults by the arguments passed in to the script
LINKS_URL=""           # use 'curl' to download file with links from this location (can be file://)
EXCEPT_URL=""          # 'curl' will access this wiki page with a list of exceptions for NG results
OUTPUT_DIR=""          # place reports and all other output in a folder inside this existing folder
RECORD_OK_LINKS=0      # record response code to the log even when it's a value in OK_CODES
SHOW_SLASH=0           # record issue when a slash is added to the end of a URL
SHOW_HTTPS=0           # record issue when "http" is upgraded to "https"
SHOW_YT_RD=0           # record redirection for a youtu.be URL expanding to the full URL
SUGGEST_SNAPSHOTS=0    # query the Internet Archive for a possible snapshot URL for each NG page
CHECK_ARCHIVE_LINKS=0  # check URLs under the archive.org domain
TAKE_PAGE_SHOT=0       # take a screenshot of each OK page
TIMEOUT=10             # time to wait for a response when querying a site
CHROME_PATH=""         # path to a copy of Google Chrome that has the command-line screenshot feature
URL_START=1            # start at this URL in LINKS_FILE
URL_LIMIT=0            # if non-zero, stop at this URL in LINKS_FILE
UPLOAD_INFO=""         # path to a file on your hard drive with the login info needed to upload a report

# Fixed strings -- see the occurrences of these variables to learn their purpose
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 OPR/70.0.3728.154"
ARCHIVE_API="http://archive.org/wayback/available"
ARCHIVE_GENERIC="https://web.archive.org/web/*"
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
CHROME_SCREENSHOT="screenshot.png"
EXCEPT_FILE_NAME="exceptions.txt"
EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
WIKI_CURL="https://wiki.oni2.net/Validate_External_Links/Curl_codes"
WIKI_HTTP="https://wiki.oni2.net/Validate_External_Links/HTTP_codes"
WIKI_MAIN="https://wiki.oni2.net/Validate_External_Links"
WIKI_ME="http://iritscen.oni2.net"
# Directory holding this script. '&&' leaves THIS_DIR empty instead of silently reporting the current
# directory if the 'cd' fails; the quotes protect paths containing glob characters.
THIS_DIR=$(cd "$(dirname "$0")" && pwd)
WORKING_DIR=$(pwd)
WIKI_PATH="wiki.oni2.net"

# These are parallel arrays of the IDs and names of OniGalore's current namespaces
declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")

# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png py rar tga TRMA txt vbs wav wmv xaf xml zip)
declare -a HTTP_TLDS_AND_PAGES=(action ars asp aspx cfm cgi com css de full htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x)

# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
# if you add a new code.
declare -a OK_CODES=(200 401 405 406 418 501)
declare -a RD_CODES=(301 302 303 307 308)
declare -a NG_CODES=(000 400 403 404 410 500 502 503 530)

# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
# transcluded text, and if the transclusion fails, then the braces show up in the URL
ILLEGAL_CHARS="{ }"

# The shortest URL possible, used for sanity-checking some URLs: http://a.co
MIN_URL_LENGTH=11

# These are parallel arrays giving the prefixes that can be used in place of normal external links to
# some wikis and other sites
declare -a INTERWIKI_PREFIXES=(commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies wikiversity wikivoyage wikt wp)
declare -a INTERWIKI_DOMAINS=(commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org wikiversity.org wikivoyage.org wiktionary.org wikipedia.org)

# Variables for keeping track of main loop progress and findings
LINK_NUM=0
EI_LINKS=0
IW_LINKS=0
OK_LINKS=0
RD_LINKS=0
NG_LINKS=0
SKIP_UNK_NS=0
SKIP_JS_PAGE=0
SKIP_BAD_URL=0
SKIP_NON_ASCII=0
SKIP_UNK_SUFFIX=0
SKIP_UNK_CODE=0
SKIP_EXPECT_NG=0
SKIP_EXPECT_RD=0
SKIP_EXPECT_EI=0
SKIP_EXPECT_IW=0
SKIP_HTTPS_UP=0
SKIP_SLASH_ADD=0
SKIP_YOUTU_BE=0
SKIP_ARCHIVE_ORG=0
FILE_LINKS=0
PAGE_LINKS=0
SKIPPED_HEADER_ROW=0
FINISHED_LIST="no"   # "no" until the run ends; "limit" and "yes" are the other values tested for
START_RUN=0
END_RUN=0
119
120
### HELP OUTPUT ###
# A pseudo-man page, printed to stdout. Section bodies are indented seven spaces and option
# descriptions start in a fixed column so that no line exceeds 80 characters. The here-document
# delimiter is quoted because the text contains nothing that should be expanded.
function printHelp()
{
   cat << 'EOF'

NAME
       Validate External Links

SYNOPSIS
       validate_external_links.sh --help
       validate_external_links.sh --links URL --output DIR [--exceptions URL]
          [--record-ok-links] [--show-added-slashes] [--show-https-upgrades]
          [--show-yt-redirects] [--suggest-snapshots] [--check-archive-links]
          [--take-screenshots FILE] [--timeout NUM] [--start-url NUM]
          [--end-url NUM] [--upload FILE]

DESCRIPTION
       This script parses a list of external links found in the OniGalore wiki
       (which is dumped by the Oni2.net domain periodically in a particular
       format), validates them using the Unix tool 'curl', and produces a report
       of which links were "OK" (responded positively to an HTTP query), which
       were "RD" (responded with a 3xx redirect code), which could be "IW"
       (interwiki) links, which are "EI" (external internal) links and could be
       intrawiki links, and which were "NG" (no good; a negative response to the
       query). This report can then be automatically uploaded to the location of
       your choice. The script can also suggest Internet Archive snapshots for
       "NG" links, and take screenshots of "OK" links for visual verification by
       the reader that the page in question is the one intended to be displayed.

       You must pass this script the URL at which the list of links is found
       (--links) and the path where the directory of logs should be outputted
       (--output). All other arguments are optional.

OPTIONS
       --help                  Show this page.
       --links URL             (required) URL from which to download the CSV
                               file with external links. Note that this URL can
                               be a local file if you supply a file:// path.
       --output DIR            (required) Unix path to directory in which Val
                               should place its reports.
       --exceptions URL        In order to remove links from the report which
                               Val finds an issue with but which you regard as
                               OK, list those desired exceptions on a wiki page.
                               See the sample file "exceptions.pdf" for the
                               required format of the page. Note that this URL
                               can point to a local file if you supply a path
                               beginning with "file://".
       --record-ok-links       Log a link in the report even if its response
                               code is "OK".
       --show-added-slashes    Report on redirects that simply add a '/' to the
                               end of the URL.
       --show-https-upgrades   Report on redirects that simply upgrade a
                               "http://" URL to a "https://" URL.
       --show-yt-redirects     Report on redirects that expand a youtu.be URL.
       --suggest-snapshots     Query the Internet Archive for a possible
                               snapshot URL for each "NG" page.
       --check-archive-links   Check links that are already pointing to a page
                               on the Internet Archive. In theory these links
                               should be totally stable and not need validation.
       --take-screenshots FILE Call the Google Chrome binary at this path to
                               take screenshots of each "OK" page.
       --timeout NUM           Wait this many seconds for a site to respond. The
                               default is 10. Important note: Val will attempt
                               to reach each URL three times, so the time taken
                               to ping an unresponsive site will be three times
                               this setting.
       --start-url NUM         Start at this link in the links CSV file.
       --end-url NUM           Stop at this link in the links CSV file.
       --upload FILE           Upload report using the credentials and path
                               given in this local text file. See sftp_login.txt
                               for template.

BUGS
       The script cannot properly parse any line in the external links file
       which contains a comma in the name of the wiki page containing a link.
       Commas in the link itself are not an issue.
EOF
}
201
202
### SETUP ###
# If first argument is a help request, or if nothing was passed in at all, print help page and quit
if [ "$#" -eq 0 ] || [ "$1" == "--help" ]; then
   printHelp | less
   exit 0
fi

# Parse arguments as long as there are more arguments to process
while (( "$#" )); do
   # An argument that takes a value must be followed by one. Without this check, "shift 2" fails
   # (and shifts nothing) when the argument is the last one, and the loop would never terminate.
   case "$1" in
      --links | --exceptions | --output | --take-screenshots | --timeout | --start-url | --end-url | --upload )
         if [ "$#" -lt 2 ]; then
            echo "Error: The argument $1 requires a value. Aborting."
            exit 1
         fi;;
   esac

   case "$1" in
      --links )               LINKS_URL="$2";                      shift 2;;
      --exceptions )          EXCEPT_URL="$2";                     shift 2;;
      --output )              OUTPUT_DIR="$2";                     shift 2;;
      --record-ok-links )     RECORD_OK_LINKS=1;                   shift;;
      --show-added-slashes )  SHOW_SLASH=1;                        shift;;
      --show-https-upgrades ) SHOW_HTTPS=1;                        shift;;
      --show-yt-redirects )   SHOW_YT_RD=1;                        shift;;
      --suggest-snapshots )   SUGGEST_SNAPSHOTS=1;                 shift;;
      --check-archive-links ) CHECK_ARCHIVE_LINKS=1;               shift;;
      --take-screenshots )    TAKE_PAGE_SHOT=1; CHROME_PATH="$2";  shift 2;;
      --timeout )             TIMEOUT="$2";                        shift 2;;
      --start-url )           URL_START="$2";                      shift 2;;
      --end-url )             URL_LIMIT="$2";                      shift 2;;
      --upload )              UPLOAD_INFO="$2";                    shift 2;;
      * )                     echo "Invalid argument $1 detected. Aborting."; exit 1;;
   esac
done

# If the required arguments were not supplied, print an error and quit
if [ -z "$LINKS_URL" ] || [ -z "$OUTPUT_DIR" ]; then
   echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
   exit 2
fi

# If user wants screenshots, make sure path to Chrome was passed in and is valid
if [ "$TAKE_PAGE_SHOT" -eq 1 ]; then
   if [ ! -f "$CHROME_PATH" ]; then
      echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
      exit 3
   fi
fi

# Check that UPLOAD_INFO exists, if this argument was supplied
if [ -n "$UPLOAD_INFO" ] && [ ! -f "$UPLOAD_INFO" ]; then
   echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
   exit 4
fi

# Check that OUTPUT_DIR is a directory
if [ ! -d "$OUTPUT_DIR" ]; then
   echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
   exit 5
fi

# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
SHOT_PATH="$OUTPUT_PATH/Screenshots"
LOG_NAME="ValExtLinks report"
LOG_NAME_TXT="$LOG_NAME.txt"
LOG_NAME_RTF="$LOG_NAME.rtf"
LOG_NAME_HTM="$LOG_NAME.htm"
LOG_PATH="$OUTPUT_PATH/$LOG_NAME"
LOG_PATH_TXT="$LOG_PATH.txt"
LOG_PATH_RTF="$LOG_PATH.rtf"
LOG_PATH_HTM="$LOG_PATH.htm"
mkdir "$OUTPUT_PATH"

# Check that 'mkdir' succeeded before trying to create anything inside the new folder
if [ ! -d "$OUTPUT_PATH" ]; then
   echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_DIR. Aborting."
   exit 6
fi
if [ "$TAKE_PAGE_SHOT" -eq 1 ]; then
   mkdir "$SHOT_PATH"
fi

# Get date on the file at LINKS_URL and print to log. The header name is matched without regard to
# case because servers answering over HTTP/2 send it in lowercase, and the carriage return that ends
# each header line is removed so it does not end up inside the reports.
LINKS_DATE=$(curl --silent --head "$LINKS_URL" | grep -i "Last-Modified" | tr -d '\r')
if [ -z "$LINKS_DATE" ]; then
   echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
   exit 7
fi
LINKS_DATE=${LINKS_DATE#*: } # drop the header name, whatever its capitalization
289
290
291	### UTILITY FUNCTIONS ###
# Writes a plain-text header to the TXT log file: report title, generation time (NICE_TIME), date of
# the links data (LINKS_DATE) and author contact (WIKI_ME), followed by a blank line
function printTXTheader()
{
   valPrint t "Validate External Links report"
   valPrint t "generated $NICE_TIME"
   valPrint t "from data of $LINKS_DATE"
   valPrint t "script by Iritscen (contact: $WIKI_ME)"
   valPrint t ""
}
301
# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified.
# The whole header is one multi-line string, so its lines must stay unindented. Inside the double
# quotes, "\\" yields a single backslash (an RTF line break) and \" yields a literal quote mark.
function printRTFheader()
{
   valPrint r "{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww12600\viewh12100\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0

\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
generated $NICE_TIME\\
from data of $LINKS_DATE\\
script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$WIKI_ME\"}}{\fldrslt contact}})
\\
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
\cf0 "
}
320
# Closes the RTF markup of the RTF log file by writing the brace that ends the document group
function printRTFfooter()
{
   valPrint r "}"
}
326
# Writes the HTML header to HTML log file: opens the document and body, then prints the report
# title, generation time (NICE_TIME), date of the links data (LINKS_DATE) and a contact link.
# The markup is a single multi-line string, so its lines must stay unindented.
function printHTMheader()
{
   valPrint h "<html>
<head>
<title>Validate External Links report</title>
</head>
<body>
<h2>Validate External Links report</h2>
<h3>generated $NICE_TIME<br />
from data of $LINKS_DATE<br />
script by Iritscen (<a href=\"$WIKI_ME\" target=\"_blank\">contact</a>)</h3>"
}
340
# Closes the HTML markup of the HTML log file (ends the body and the document)
function printHTMfooter()
{
   valPrint h "</body>
</html>"
}
347
# The central logging function.
# Parameter 1 is a string composed of one or more flag characters, in any order:
#   'c' = print to the console         't' = append to the TXT log
#   'r' = append to the RTF log        'h' = append to the HTML log
#   'n' = don't print a newline at the end of the line
#   's' = print an extra newline at the end
#   'w' = don't pass console output through 'fmt' ("fmt" fits the output to an 80-column CLI but
#         can break special formatting and the 'n' option)
# Parameter 2 is the text to print.
# Each flag is looked for anywhere in parameter 1 (hence the *x* patterns), so a call such as
# "valPrint ctrh ..." writes to all four outputs.
function valPrint()
{
   # Console
   if [[ "$1" == *c* ]]; then
      if [[ "$1" == *n* ]]; then
         echo -n "$2"
      elif [[ "$1" == *w* ]]; then
         echo "$2"
      elif [[ "$1" == *s* ]]; then
         echo -e "$2\n"
      else
         echo "$2" | fmt -w 80
      fi
   fi

   # TXT log
   if [[ "$1" == *t* ]]; then
      if [[ "$1" == *n* ]]; then
         echo -n "$2" >> "$LOG_PATH_TXT"
      elif [[ "$1" == *s* ]]; then
         echo -e "$2\n" >> "$LOG_PATH_TXT"
      else
         echo "$2" >> "$LOG_PATH_TXT"
      fi
   fi

   # RTF log: the visible line break is the "\line" control word, so 'n' simply leaves it off
   if [[ "$1" == *r* ]]; then
      if [[ "$1" == *n* ]]; then
         echo "$2" >> "$LOG_PATH_RTF"
      elif [[ "$1" == *s* ]]; then
         echo "$2\line\line" >> "$LOG_PATH_RTF"
      else
         echo "$2\line" >> "$LOG_PATH_RTF"
      fi
   fi

   # HTML log: the visible line break is "<br />", so 'n' simply leaves it off; 's' appends an
   # empty table row instead
   if [[ "$1" == *h* ]]; then
      if [[ "$1" == *s* ]]; then
         echo "$2<tr><td> </td></tr>" >> "$LOG_PATH_HTM"
      elif [[ "$1" == *n* ]]; then
         echo "$2" >> "$LOG_PATH_HTM"
      else
         echo "$2<br />" >> "$LOG_PATH_HTM"
      fi
   fi
}
394
# Pluralize the string in parameter 1 if the number in parameter 2 is not 1. Words ending in 'x'
# get "es" (suffix -> suffixes); every other word gets "s". The result is written to stdout.
function pluralCheckNoun()
{
   if [ "$2" -ne 1 ]; then
      if [[ "$1" =~ x$ ]]; then
         echo "${1}es"
      else
         echo "${1}s"
      fi
   else
      echo "$1"
   fi
}
408
# Output "is" if parameter 1 is 1, otherwise "are"
function pluralCheckIs()
{
   if [ "$1" -eq 1 ]; then
      echo "is"
   else
      echo "are"
   fi
}
418
# Output "was" if parameter 1 is 1, otherwise "were"
function pluralCheckWas()
{
   if [ "$1" -eq 1 ]; then
      echo "was"
   else
      echo "were"
   fi
}
428
# Output "a " (with its trailing space) if parameter 1 is 1, otherwise nothing, so the result can
# be placed directly in front of a noun
function pluralCheckA()
{
   if [ "$1" -eq 1 ]; then
      echo "a "
   fi
}
436
# Output "an " (with its trailing space) if parameter 1 is 1, otherwise nothing, so the result can
# be placed directly in front of a noun
function pluralCheckAn()
{
   if [ "$1" -eq 1 ]; then
      echo "an "
   fi
}
444
# Upload the reports using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
# reports being saved to disk have already been closed.
# The file at UPLOAD_INFO is searched for lines containing the markers "user:", "pw:", "port:" and
# "path:"; the text after each marker is handed, together with the local and remote file names, to
# the 'expect' script found next to this script. One upload is attempted per report format, and
# the outcome of each attempt is printed to the console.
function uploadReport()
{
   valPrint c "Uploading reports..."

   SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
   SFTP_USER_NAME_MARKER="user:"
   SFTP_PASSWORD_MARKER="pw:"
   SFTP_PORT_MARKER="port:"
   SFTP_PATH_MARKER="path:"
   SFTP_USER_NAME=$(grep -- "$SFTP_USER_NAME_MARKER" "$UPLOAD_INFO")
   SFTP_USER_NAME=${SFTP_USER_NAME#$SFTP_USER_NAME_MARKER}
   SFTP_PASSWORD=$(grep -- "$SFTP_PASSWORD_MARKER" "$UPLOAD_INFO")
   SFTP_PASSWORD=${SFTP_PASSWORD#$SFTP_PASSWORD_MARKER}
   SFTP_PORT=$(grep -- "$SFTP_PORT_MARKER" "$UPLOAD_INFO")
   SFTP_PORT=${SFTP_PORT#$SFTP_PORT_MARKER}
   SFTP_PATH=$(grep -- "$SFTP_PATH_MARKER" "$UPLOAD_INFO")
   SFTP_PATH=${SFTP_PATH#$SFTP_PATH_MARKER}

   local UPLOAD_STATUS
   for SUFFIX in htm rtf txt; do
      # The arguments are quoted so that an empty field stays in its position instead of
      # vanishing and shifting the arguments that follow it
      expect "$SCRIPT_PATH" "$LOG_PATH.$SUFFIX" "$SFTP_USER_NAME" "$SFTP_PASSWORD" "$SFTP_PORT" "$SFTP_PATH" "$LOG_NAME.$SUFFIX"

      # Save the exit status right away: "$?" is overwritten by every later command, including
      # the test below, so reading it again inside the message would always give 0
      UPLOAD_STATUS=$?
      if [ "$UPLOAD_STATUS" -ne 0 ]; then
         valPrint c "Error $UPLOAD_STATUS occurred when attempting to upload $LOG_NAME.$SUFFIX!"
      else
         valPrint c "Report in $(echo "$SUFFIX" | tr '[:lower:]' '[:upper:]') format was uploaded."
      fi
   done
}
475
# Prints session summary when script is done, closes the report markup, uploads the reports if that
# was requested and the run was not canceled, and exits with status 0. This is also the handler
# for the INT signal (see the 'trap' after the function), and it is called directly when startup
# or the main loop has to stop early, so it must cope with a run that never reached the main loop.
function wrapupAndExit()
{
   # Get off progress line on console, drop down a line from last link in log, and close HTML table
   valPrint ctr ""
   valPrint h "</table><br />"

   # If we didn't finish processing the last URL, then the iterator is one too high
   if [ "$FINISHED_LIST" != "yes" ]; then
      LINK_NUM=$((LINK_NUM-1))
      if [ "$FINISHED_LIST" == "no" ]; then
         valPrint ctrh "The session was canceled by the user."
      fi
   fi

   # LINK_COUNT is only set once the links file has been counted; default it so that the numeric
   # tests in the pluralizing functions still receive a number if we stop before that point
   : "${LINK_COUNT:=0}"

   # Generate string with elapsed time. START_RUN is still 0 if the main loop was never reached,
   # in which case no time is reported instead of the time elapsed since the Unix epoch.
   END_RUN=$(date +%s)
   if [ "$START_RUN" -eq 0 ]; then
      START_RUN=$END_RUN
   fi
   local ELAPSED_SECS=$((END_RUN - START_RUN))
   ELAPSED="$((ELAPSED_SECS / 60)) min. $((ELAPSED_SECS % 60)) sec. elapsed"

   # Do some math on results of session
   LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
   if [ "$LINKS_PROCESSED" -lt 0 ]; then
      LINKS_PROCESSED=0 # we stopped before reaching URL_START
   fi
   TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
   LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
   LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_RD+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
   LINK_PROBLEMS_TOTAL=$((NG_LINKS+RD_LINKS+EI_LINKS+IW_LINKS))
   LINK_PROBLEMS_NG=$((NG_LINKS-SKIP_EXPECT_NG))
   LINK_PROBLEMS_RD=$((RD_LINKS-SKIP_EXPECT_RD))
   LINK_PROBLEMS_EI=$((EI_LINKS-SKIP_EXPECT_EI))
   LINK_PROBLEMS_IW=$((IW_LINKS-SKIP_EXPECT_IW))
   LINK_PROBLEMS_NET=$((LINK_PROBLEMS_NG+LINK_PROBLEMS_RD+LINK_PROBLEMS_EI+LINK_PROBLEMS_IW))

   # Print something in the Links section if no link issues were printed
   if [ "$LINK_PROBLEMS_NET" -eq 0 ]; then
      valPrint h "<i>No link problems to report! See the <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for a list of links with issues that were not reported.</i>"
   fi
   if [ "$LINK_PROBLEMS_TOTAL" -eq 0 ]; then
      valPrint t "No link problems to report!"
      valPrint r "\i1 No link problems to report! \i0"
   fi

   ## SUMMARY OUTPUT ##
   valPrint ct "Summary ($ELAPSED):"
   valPrint r "\b1 Summary \b0 ($ELAPSED)"
   valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
   valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link "$LINK_COUNT") (there $(pluralCheckWas "$FILE_LINKS") $FILE_LINKS file $(pluralCheckNoun link "$FILE_LINKS") and $PAGE_LINKS page $(pluralCheckNoun link "$PAGE_LINKS"))."

   # Print processed link totals
   if [ "$LINKS_PROCESSED" -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link "$LINKS_PROCESSED"):"; fi
   if [ "$LINK_ERRORS" -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link "$LINK_ERRORS") could not be processed"; fi
   if [ "$SKIP_ARCHIVE_ORG" -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVE_ORG Archive.org $(pluralCheckNoun link "$SKIP_ARCHIVE_ORG") were not checked"; fi
   if [ "$LINK_PROBLEMS_TOTAL" -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_TOTAL processed $(pluralCheckNoun link "$LINK_PROBLEMS_TOTAL") had $(pluralCheckAn "$LINK_PROBLEMS_TOTAL")$(pluralCheckNoun issue "$LINK_PROBLEMS_TOTAL")"; fi
   if [ "$LINKS_EXCEPTED" -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED link $(pluralCheckNoun issue "$LINKS_EXCEPTED") from report)"; valPrint h "  (excepted $LINKS_EXCEPTED $(pluralCheckNoun link "$LINKS_EXCEPTED") from report)"; fi
   if [ "$OK_LINKS" -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link "$OK_LINKS") $(pluralCheckWas "$OK_LINKS") OK"; fi
   if [ "$TRIVIAL_RDS" -gt 0 ]; then valPrint ctr " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection "$TRIVIAL_RDS") as OK)"; valPrint h "  (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection "$TRIVIAL_RDS") as OK)"; fi

   # Print errored link totals
   if [ "$LINK_ERRORS" -gt 0 ]; then
      valPrint c "$LINK_ERRORS link $(pluralCheckNoun error "$LINK_ERRORS") (see RTF or TXT report for specific links):"
      valPrint h "$LINK_ERRORS link $(pluralCheckNoun error "$LINK_ERRORS") (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
      valPrint rt "$LINK_ERRORS link $(pluralCheckNoun error "$LINK_ERRORS"):"
   fi
   if [ "$SKIP_UNK_NS" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace "$SKIP_UNK_NS")"; fi
   if [ "$SKIP_JS_PAGE" -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link "$SKIP_JS_PAGE") on $(pluralCheckA "$SKIP_JS_PAGE")JavaScript $(pluralCheckNoun page "$SKIP_JS_PAGE")"; fi
   if [ "$SKIP_BAD_URL" -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL "$SKIP_BAD_URL")"; fi
   if [ "$SKIP_NON_ASCII" -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL "$SKIP_NON_ASCII")"; fi
   if [ "$SKIP_UNK_SUFFIX" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix "$SKIP_UNK_SUFFIX")"; fi
   if [ "$SKIP_UNK_CODE" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code "$SKIP_UNK_CODE")"; fi

   # Print excepted link totals
   if [ "$LINKS_EXCEPTED" -gt 0 ]; then
      valPrint c "$LINKS_EXCEPTED link $(pluralCheckNoun problem "$LINKS_EXCEPTED") excepted (see RTF or TXT report for specific links):"
      valPrint h "$LINKS_EXCEPTED link $(pluralCheckNoun problem "$LINKS_EXCEPTED") excepted (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
      valPrint rt "$LINKS_EXCEPTED link $(pluralCheckNoun problem "$LINKS_EXCEPTED") excepted:"
   fi
   if [ "$SKIP_EXPECT_NG" -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link "$NG_LINKS")"; fi
   if [ "$SKIP_EXPECT_RD" -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_RD/$RD_LINKS $(pluralCheckNoun redirection "$RD_LINKS")"; fi
   if [ "$SKIP_EXPECT_EI" -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link "$EI_LINKS")"; fi
   if [ "$SKIP_EXPECT_IW" -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link "$IW_LINKS")"; fi

   # Print checked link totals
   if [ "$LINK_PROBLEMS_NET" -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS_NET link $(pluralCheckNoun issue "$LINK_PROBLEMS_NET"):"; fi
   if [ "$LINK_PROBLEMS_NG" -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_NG NG $(pluralCheckNoun link "$LINK_PROBLEMS_NG")"; fi
   if [ "$LINK_PROBLEMS_RD" -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_RD $(pluralCheckNoun redirection "$LINK_PROBLEMS_RD")"; fi
   if [ "$LINK_PROBLEMS_EI" -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_EI $(pluralCheckNoun link "$LINK_PROBLEMS_EI") that could be intrawiki"; fi
   if [ "$LINK_PROBLEMS_IW" -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_IW $(pluralCheckNoun link "$LINK_PROBLEMS_IW") that could be interwiki"; fi

   # Close the log files' markup
   valPrint trh "ValExtLinks says goodbye."
   printRTFfooter
   printHTMfooter

   # Upload report if this was requested (and the session was not canceled)
   if [ -n "$UPLOAD_INFO" ] && [ "$FINISHED_LIST" != "no" ]; then
      uploadReport
   fi

   # Really quit now
   valPrint c "ValExtLinks says goodbye."
   exit 0
}
trap wrapupAndExit INT
577
578
579	### INITIALIZATION ###
580	# Print opening message to console and log files
581	valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
582	printTXTheader
583	printRTFheader
584	printHTMheader
585
586	## DATA SOURCING ##
587	valPrint t "Startup:"
588	valPrint r "\b1 Startup \b0"
589	valPrint hn "<h3>Startup</h3>"
590
591	# Attempt to download file at LINKS_URL, then check that it succeeded
592	valPrint cwtrhn "Downloading list of external links from $LINKS_URL..."
593	LINKS_FILE_NAME=$(echo "$LINKS_URL" \| sed 's/.*\///')
594	LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
595	curl --silent -o "$LINKS_FILE" $LINKS_URL
596	if [ ! -f "$LINKS_FILE" ]; then
597	echo -e "\nThe download of $LINKS_URL appears to have failed. Aborting."
598	wrapupAndExit
599	else
600	valPrint ctrh " success."
601	fi
602
603	# Attempt to download file at EXCEPT_URL, then check that it succeeded
604	if [ ! -z $EXCEPT_URL ]; then
605	valPrint cwtrhn "Downloading list of reporting exceptions from $EXCEPT_URL..."
606	EXCEPT_DATA=$(curl --silent $EXCEPT_URL)
607	if [ -z "$EXCEPT_DATA" ]; then
608	echo -e "\nThe download of the exceptions data from '$EXCEPT_URL' appears to have failed. Aborting."
609	wrapupAndExit
610	else
611	valPrint ctrh " success."
612	fi
613	EXCEPT_DATA=${EXCEPT_DATA%END LIST*}
614	EXCEPT_DATA=${EXCEPT_DATA#*BEGIN LIST}
615	EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
616
617	# Store on disk for debugging purposes
618	echo "$EXCEPT_DATA" > "$EXCEPT_FILE"
619
620	# Transfer to array for easy searching later
621	declare -a EXCEPT_ARRAY=($(echo "$EXCEPT_DATA"))
622	fi
623
624	# Pipe 'cat' to 'wc' because passing LINKS_FILE to 'wc' would print the file name after the line count
625	LINK_COUNT_STRING=$(cat "$LINKS_FILE" \| wc -l)
626
627	# Number of URLs is number of lines minus one (first line is column header row for the CSV)
628	LINK_COUNT=$(echo "${LINK_COUNT_STRING}" \| tr -d '[:space:]')
629	let LINK_COUNT-=1
630	valPrint ctrh "Found $LINK_COUNT links to process."
631	valPrint trh ""
632
633	## CONFIG OUTPUT ##
634	valPrint t "Config:"
635	valPrint r "\b1 Config \b0"
636	valPrint hn "<h3>Config</h3>"
637
638	valPrint ctrhn "Links to consider: "
639	if [ $URL_LIMIT -ne 0 ] && [ $URL_LIMIT -lt $LINK_COUNT ]; then
640	valPrint ctrh "$((URL_LIMIT-URL_START+1)), from link $URL_START to $URL_LIMIT"
641	elif [ $URL_START -ne 1 ]; then
642	valPrint ctrh "$((LINK_COUNT-URL_START+1)), from link $URL_START to $LINK_COUNT"
643	else
644	valPrint ctrh "$LINK_COUNT"
645	fi
646
647	valPrint ctrh "Site query timeout: $TIMEOUT seconds"
648
649	valPrint ctrhn "Show OK links: "
650	if [ $RECORD_OK_LINKS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
651
652	valPrint ctrhn "Take screenshots: "
653	if [ $TAKE_PAGE_SHOT -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
654
655	valPrint ctrhn "Suggest archive.org snapshots: "
656	if [ $SUGGEST_SNAPSHOTS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
657
658	valPrint ctrhn "Ignore slash-adding redirects: "
659	if [ $SHOW_SLASH -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
660
661	valPrint ctrhn "Ignore HTTPS-upgrading redirects: "
662	if [ $SHOW_HTTPS -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
663
664	valPrint ctrhn "Ignore youtu.be redirects: "
665	if [ $SHOW_YT_RD -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
666
667	valPrint ctrhn "Check archive.org links: "
668	if [ $CHECK_ARCHIVE_LINKS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
669
670	valPrint tr "A summary of my findings will be found at the bottom of the report."
671	valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
672	valPrint trh ""
673
## LEGEND OUTPUT ##
# Print the key to the codes used in the link list. Judging by the markup passed with each flag,
# 't' lines are plain text, 'r' lines are RTF and 'h' lines are HTML; 'trh' sends the same text
# to all three.
valPrint t 'Legend:'
valPrint r '\b1 Legend \b0'
valPrint hn '<h3>Legend</h3>'

# Point the reader to $WIKI_MAIN for advice on repairing links, using each format's own link markup
valPrint t "(For guidance in fixing these links, see ${WIKI_MAIN}.)"
valPrint r '(For guidance in fixing these links, see {\field{\*\fldinst{HYPERLINK "'"${WIKI_MAIN}"'"}}{\fldrslt here}}.)'
valPrint h '(For guidance in fixing these links, see <a href="'"${WIKI_MAIN}"'" target="_blank">here</a>.)'

# The two-letter status codes read the same in every format
for LEGEND_ENTRY in \
  'OK = URL seems to be working' \
  'NG = URL no longer seems to work' \
  'RD = URL is redirecting to this new URL' \
  'EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup' \
  'IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup'
do
  valPrint trh "$LEGEND_ENTRY"
done

# The parenthesized number after a status is the HTTP response code reported by 'curl'...
LEGEND_HTTP="(xxx) = Unix tool 'curl' obtained this HTTP response status code"
valPrint t "$LEGEND_HTTP (see here for code reference: $WIKI_HTTP)"
valPrint r "$LEGEND_HTTP (see {\field{\*\fldinst{HYPERLINK \"$WIKI_HTTP\"}}{\fldrslt here}} for code reference)"
valPrint h "$LEGEND_HTTP (see <a href=\"$WIKI_HTTP\" target=\"_blank\">here</a> for code reference)"

# ...or "000" followed by the exit code of 'curl' when no HTTP response was received
LEGEND_CURL="(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code"
valPrint t "$LEGEND_CURL (see here for code reference: $WIKI_CURL)"
valPrint r "$LEGEND_CURL (see {\field{\*\fldinst{HYPERLINK \"$WIKI_CURL\"}}{\fldrslt here}} for code reference)"
valPrint h "$LEGEND_CURL (see <a href=\"$WIKI_CURL\" target=\"_blank\">here</a> for code reference)"

# Labels used on the Internet Archive follow-up lines, then a blank line to close the section
valPrint trh 'IA suggests = Last available snapshot returned by the Internet Archive'
valPrint trh 'Try browsing = The Archive failed to return a snapshot URL, so check for a snapshot manually using this link'
valPrint trh ''
695
696
### MAIN LOOP ###
# Open the "Links" section of the report in each format's own heading markup
valPrint t 'Links:'
valPrint r '\b1 Links \b0'
valPrint hn '<h3>Links</h3>'

# Note the time, in seconds since the epoch, at which link processing begins
START_RUN="$(date '+%s')"
702	# Process each line of the .csv in LINKS_FILE
703	for LINE in `cat "$LINKS_FILE"`; do
# Count this line; the counter is reset below if the line turns out to be the header row
LINK_NUM=$((LINK_NUM + 1))

# First line is the column header row for the CSV, so let's verify that the format hasn't changed.
# $LINE is quoted inside [[ ]] so that it is compared as one literal string; unquoted inside [ ]
# it was subject to pathname expansion before the comparison.
if [ "$SKIPPED_HEADER_ROW" -eq 0 ]; then
  if [[ "$LINE" == "namespace,title,target" ]]; then
    SKIPPED_HEADER_ROW=1
    LINK_NUM=0 # this line is not a link, so reset the link counter
    valPrint hn "<table>"
    continue
  else
    valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
    wrapupAndExit
  fi
fi
718
# Links numbered below URL_START are passed over without being evaluated
if (( LINK_NUM < URL_START )); then
  continue
fi

# A positive URL_LIMIT caps the run for testing purposes; once we are past it, record why we
# stopped and wrap up
if (( URL_LIMIT > 0 && LINK_NUM > URL_LIMIT )); then
  FINISHED_LIST="limit"
  wrapupAndExit
fi
729
# Print progress to screen. From the second link onward, first emit the cursor-up escape sequence
# so that the new progress message appears in place of the previous one
(( LINK_NUM > 1 )) && printf '\e[1A\n'
valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
735
# The number of the namespace is the element before the first comma on the line
NS_ID=${LINE%%,*}

# Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES. The numeric
# comparison is only attempted when NS_ID really is an integer, so that "NULL", an empty field or
# any other junk falls straight through to the "skip" messages below instead of making 'test'
# print an error for every element of the array.
NS_NAME=""
a=0
if [[ "$NS_ID" =~ ^-?[0-9]+$ ]]; then
  while [ -n "${NS_IDS[$a]}" ]; do # an empty element means we have run off the end of the array
    if [ "$NS_ID" -eq "${NS_IDS[$a]}" ]; then
      NS_NAME="${NS_NAMES[$a]}"
      break
    fi
    a=$((a + 1))
  done
fi

# Without a namespace name we cannot build the page's path, so report the line and move on
if [ -z "$NS_NAME" ]; then
  if [ "$NS_ID" == "NULL" ]; then
    valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace (and probably the page too) is 'NULL'. Probably the link is no longer in existence on the wiki."
  else
    valPrint trs "Skipping line $LINK_NUM ('$LINE') because I could not find a name for namespace ID $NS_ID."
  fi
  SKIP_UNK_NS=$((SKIP_UNK_NS + 1))
  continue
fi
760
# The name of the page is everything between the namespace ID and the next comma on the line (commas
# in page names will break this). The ID is quoted so that it is stripped as literal text rather
# than being interpreted as a pattern.
PAGE_NAME=${LINE#"$NS_ID",}
PAGE_NAME=${PAGE_NAME%%,*}

# We don't want to consider wiki pages ending in .js, as the MW parser cannot reliably isolate URLs
# in JavaScript code, so it returns erroneous links. The suffix is whatever follows the last period
# in the page name (or the whole name if it contains no period).
PAGE_NAME_SUFFIX=${PAGE_NAME##*.}
if [[ "$PAGE_NAME_SUFFIX" == "js" ]]; then
  # The URL is the remainder of the line after the namespace and page name fields
  JS_PAGE_URL=${LINE#"$NS_ID","$PAGE_NAME",}
  valPrint trs "Skipping URL '$JS_PAGE_URL' on line $LINK_NUM because it was found on JavaScript page '$PAGE_NAME'."
  SKIP_JS_PAGE=$((SKIP_JS_PAGE + 1))
  continue
fi
774
# Build the page's path within the wiki from the namespace and page names. Namespace "Main:" (ID 0)
# cannot be a part of the path; it's an implicit namespace, and naming it explicitly breaks the link
if [ "$NS_ID" -eq 0 ]; then
  LOCAL_PAGE_PATH=$PAGE_NAME
else
  LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
fi

# The longer wiki page URL is that same path appended to the wiki's address
FULL_PAGE_PATH=https://$WIKI_PATH/$LOCAL_PAGE_PATH
784
# The URL being linked to is everything after the previous two fields (this allows commas to be in
# the URLs, but a comma in the previous field, the page name, will break this). The two fields are
# quoted so that they are stripped as literal text.
URL=${LINE#"$NS_ID","$PAGE_NAME",}

# Scan for illegal characters. The surrounding asterisks make this a "contains" test; without them
# the pattern could only ever match a URL that is exactly one character long.
if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
  valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because it contains characters illegal in a URL."
  SKIP_BAD_URL=$((SKIP_BAD_URL + 1))
  continue
fi

# If we're skipping Archive.org links, see if this is one (i.e. the domain appears anywhere in it)
if [ "$CHECK_ARCHIVE_LINKS" -eq 0 ] && [[ $URL == *web.archive.org* ]]; then
  valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to check Wayback Machine links."
  SKIP_ARCHIVE_ORG=$((SKIP_ARCHIVE_ORG + 1))
  continue
fi
802
# Now we need to know if the URL is for a file or a web page. First step is to determine if the
# URL ends in a suffix
HAS_SUFFIX=0

# If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
CLEAN_URL=${URL%%\?*}

# If the URL ends in something like "#section_15", strip everything from the '#' onward
CLEAN_URL=${CLEAN_URL%%\#*}

# URLs containing any non-ASCII character are skipped so that the user checks them by hand (the
# stated reason in the original code was that 'sed' could not handle Unicode). The asterisks make
# this a "contains" test rather than a match against a single-character URL.
if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
  valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I cannot handle non-ASCII characters."
  SKIP_NON_ASCII=$((SKIP_NON_ASCII + 1))
  continue
fi

# Isolate the characters after the last period and after the last slash; if the URL has no period
# (or no slash), the whole cleaned URL is kept
POST_DOT=${CLEAN_URL##*.}
POST_SLASH=${CLEAN_URL##*/}

# If the last period comes after the last slash, then the text following the period is shorter than
# the text following the slash, and the URL ends in a suffix
POST_DOT_LENGTH=${#POST_DOT}
POST_SLASH_LENGTH=${#POST_SLASH}
if [ "$POST_DOT_LENGTH" -lt "$POST_SLASH_LENGTH" ]; then
  HAS_SUFFIX=1
else
  HAS_SUFFIX=0
fi
832
# Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
# known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES. IS_FILE is 1 for a file, 0 for
# a page and -1 while the answer is still unknown.
IS_FILE=-1
if [ "$HAS_SUFFIX" -eq 0 ]; then
  IS_FILE=0
else
  # Turn off case sensitivity while we compare suffixes
  shopt -s nocasematch

  if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
    # Special case: URLs ending in something like "productID.304297400" are pages, not files, so if
    # the URL's suffix is all numbers, we are looking at the end of a web page URL
    IS_FILE=0
  elif [[ $POST_DOT == *[\(\)] ]]; then
    # Special case: URLs ending in a parens, e.g. "ms537113(v=vs.85)", are pages
    IS_FILE=0
  elif [[ $POST_DOT == *%* ]]; then
    # Special case: URLs containing a '%', e.g. "10.1007%2FBF00329055", are pages
    IS_FILE=0
  fi

  # If we did not identify this URL as a web page above, we need to compare the suffix against known
  # file extensions. The suffix is quoted so it is compared as text, not used as a pattern; double
  # brackets still respect the nocasematch setting.
  if [ "$IS_FILE" -eq -1 ]; then
    for EXTENSION in "${HTTP_FILES[@]}"; do
      if [[ "$EXTENSION" == "$POST_DOT" ]]; then
        IS_FILE=1
        break
      fi
    done
  fi

  # If we did not identify this URL as a file above, we need to compare the suffix against known
  # pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
  # needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
  if [ "$IS_FILE" -eq -1 ]; then
    for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
      if [[ "$PAGE_OR_TLD" == "$POST_DOT" ]]; then
        IS_FILE=0
        break
      fi
    done
  fi

  # Turn case sensitivity back on in Bash
  shopt -u nocasematch
fi

# Record the link's type, or, if this suffix escaped identification as either a file, page or TLD,
# inform the user and skip the link
STR_TYPE=""
case "$IS_FILE" in
  -1)
    valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown URL ending '$POST_DOT'. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
    SKIP_UNK_SUFFIX=$((SKIP_UNK_SUFFIX + 1))
    continue
    ;;
  1)
    STR_TYPE="file"
    FILE_LINKS=$((FILE_LINKS + 1))
    ;;
  0)
    STR_TYPE="page"
    PAGE_LINKS=$((PAGE_LINKS + 1))
    ;;
esac
898
# Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
# issue with sites that require HTTPS. $AGENT is double-quoted so that its value is sent as the
# user agent (single quotes sent the literal text "$AGENT"), and $URL is quoted so that it reaches
# 'curl' as one argument with no pathname expansion applied to characters such as '?' or '*'.
# NOTE(review): AGENT is presumably set among the script's globals -- confirm.
CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time "$TIMEOUT" --retry 2 --write-out '%{http_code}\n' "$URL")
CURL_ERR=$? # exit status of the 'curl' call above
CURL_RESULT=$CURL_CODE

# Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
if [ "$CURL_CODE" == "000" ]; then
  CURL_RESULT="$CURL_RESULT-$CURL_ERR"
fi
909
# Begin to determine our status code for this URL (EI, IW, OK, RD, or NG). "??" means "not yet
# determined"; $STATUS must always be quoted when tested, because an unquoted "??" is a glob
# pattern that the shell would try to expand to two-character file names.
STATUS="??"
NEW_URL=""
INTERWIKI_INDEX=-1

# First make sure that this isn't an "external internal" link to our own wiki that can be replaced
# by "[[page_name]]". If it uses a special access URL containing "/w/", let it pass, as it
# probably cannot be replaced by "[[ ]]" markup. The asterisks make these "contains" tests.
if [[ $URL == *"$WIKI_PATH"* ]] && [[ $URL != *"$WIKI_PATH/w/"* ]]; then
  STATUS="EI"
  EI_LINKS=$((EI_LINKS + 1))
fi

# If it's not, check if this is a link to a domain that we have an interwiki prefix for (also make
# sure that it's not an archive.org link to a page from an interwiki domain)
if [[ "$STATUS" == "??" ]] && [[ $URL != *web.archive.org* ]]; then
  for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
    if [[ $URL == *"${INTERWIKI_DOMAINS[$i]}"* ]] && [[ $URL != *"${INTERWIKI_DOMAINS[$i]}/w/"* ]]; then
      STATUS="IW"
      IW_LINKS=$((IW_LINKS + 1))
      INTERWIKI_INDEX=$i # remembered so the matching prefix can be looked up later
      break
    fi
  done
fi
935
# If we didn't match an interwiki domain, see if the status code is in our "OK" codes list.
# $STATUS is quoted because its placeholder value "??" is a glob pattern: left unquoted inside
# [ ], it could be replaced by the names of two-character files in the current directory.
if [[ "$STATUS" == "??" ]]; then
  for CODE in "${OK_CODES[@]}"; do
    if [[ "$CODE" == "$CURL_CODE" ]]; then
      STATUS="OK"
      OK_LINKS=$((OK_LINKS + 1))
      break
    fi
  done
fi
946
# If we didn't get a match with the "OK" codes, check it against the "RD" codes. $STATUS is quoted
# because its placeholder value "??" would otherwise be treated as a glob pattern.
if [[ "$STATUS" == "??" ]]; then
  for CODE in "${RD_CODES[@]}"; do
    if [[ "$CODE" == "$CURL_CODE" ]]; then
      # Get URL header again in order to retrieve the URL we are being redirected to. $AGENT is
      # double-quoted so that its value, not the literal text "$AGENT", is sent as the user agent.
      NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time "$TIMEOUT" --write-out '%{redirect_url}\n' "$URL")

      # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
      # those changes out if the user didn't ask for them
      URL_HTTP=${URL/#https:/http:}
      NEW_URL_HTTP=${NEW_URL/#https:/http:}

      # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config, in
      # which case the result is too short to be a URL
      NEW_URL_LENGTH=${#NEW_URL_HTTP}
      if [ "$NEW_URL_LENGTH" -lt "$MIN_URL_LENGTH" ]; then
        NEW_URL_HTTP="[new URL not retrieved]"
      fi

      # Remove slash at end of new URL, if present, so we can filter out the redirects that
      # merely add an ending slash if the user didn't ask for them
      NEW_URL_NO_SLASH=${NEW_URL_HTTP%/}

      # Detect if this is a youtu.be link simply being expanded by YouTube to the full
      # youtube.com address
      YOUTU_BE=0
      if [[ $URL_HTTP == http://youtu.be* ]] && [[ $NEW_URL_HTTP == http://www.youtube.com* ]]; then
        YOUTU_BE=1
      fi

      if [ "$SHOW_HTTPS" -eq 0 ] && [ "$URL_HTTP" == "$NEW_URL_HTTP" ]; then
        # If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
        # wants those to be reported)
        valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show http->https upgrades, and I was redirected to '$NEW_URL'."
        STATUS="OK"
        OK_LINKS=$((OK_LINKS + 1))
        SKIP_HTTPS_UP=$((SKIP_HTTPS_UP + 1))
      elif [ "$SHOW_SLASH" -eq 0 ] && [ "$URL_HTTP" == "$NEW_URL_NO_SLASH" ]; then
        # If the URLs match besides an added ending slash, then the link is OK (unless user wants
        # those to be reported)
        valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show added trailing slashes, and I was redirected to '$NEW_URL'."
        STATUS="OK"
        OK_LINKS=$((OK_LINKS + 1))
        SKIP_SLASH_ADD=$((SKIP_SLASH_ADD + 1))
      elif [ "$SHOW_YT_RD" -eq 0 ] && [ "$YOUTU_BE" -eq 1 ]; then
        # A youtu.be short link expanding to youtube.com is likewise OK unless the user asked to
        # see those redirects
        valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'."
        STATUS="OK"
        OK_LINKS=$((OK_LINKS + 1))
        SKIP_YOUTU_BE=$((SKIP_YOUTU_BE + 1))
      else
        STATUS="RD"
        RD_LINKS=$((RD_LINKS + 1))
      fi
      break
    fi
  done
fi
1003
# If we didn't get a match with the "RD" codes, check it against the "NG" codes. $STATUS is quoted
# because its placeholder value "??" would otherwise be treated as a glob pattern.
if [[ "$STATUS" == "??" ]]; then
  for CODE in "${NG_CODES[@]}"; do
    if [[ "$CODE" == "$CURL_CODE" ]]; then
      STATUS="NG"
      NG_LINKS=$((NG_LINKS + 1))
      break
    fi
  done
fi

# If we didn't match a known status code, advise the reader
if [[ "$STATUS" == "??" ]]; then
  valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown response code $CURL_CODE."
  SKIP_UNK_CODE=$((SKIP_UNK_CODE + 1))
  continue
fi
1021
# Check problem links against exceptions list before proceeding. The list is only consulted when
# the user supplied an exceptions page (EXCEPT_URL is non-empty).
FOUND_EXCEPT=0
if [[ "$STATUS" != "OK" && -n "$EXCEPT_URL" ]]; then
  # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
  EXPECT_CODE="$CURL_RESULT"
  if [[ "$STATUS" == "EI" || "$STATUS" == "IW" ]]; then
    EXPECT_CODE="$STATUS"
  fi

  # Look for link in exceptions list and make sure the listed result code and wiki page also match.
  # Each entry is read as "code,URL,page": the code precedes the first comma, the page follows the
  # last comma, and the URL is everything in between.
  for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
    # Undo any HTML-encoding from the wiki page; for now we just worry about the ampersand, as most
    # other HTML-encoded characters are not found in URLs
    EXCEPT_LINE=$(printf '%s\n' "${EXCEPT_ARRAY[$i]}" | sed 's/&amp;/\&/g')

    # Match URL. The entry's URL is kept in its own variable, EXCEPT_LINK, so that the EXCEPT_URL
    # setting (the address of the exceptions page itself) is not overwritten.
    EXCEPT_LINK="${EXCEPT_LINE#*,}"
    EXCEPT_LINK="${EXCEPT_LINK%,*}"
    if [ "$EXCEPT_LINK" != "$URL" ]; then
      continue
    fi

    # Match containing page's name; "*" in the entry stands for any page
    EXCEPT_PAGE="${EXCEPT_LINE##*,}"
    EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
    if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == "$LOCAL_PAGE_PATH" ]; then
      # Match result code
      EXCEPT_CODE=${EXCEPT_LINE%%,*}
      if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
        valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because its expected result, '$EXPECT_CODE', is in the exceptions list."
        case "$STATUS" in
          EI) SKIP_EXPECT_EI=$((SKIP_EXPECT_EI + 1)) ;;
          IW) SKIP_EXPECT_IW=$((SKIP_EXPECT_IW + 1)) ;;
          RD) SKIP_EXPECT_RD=$((SKIP_EXPECT_RD + 1)) ;;
          *) SKIP_EXPECT_NG=$((SKIP_EXPECT_NG + 1)) ;;
        esac
        FOUND_EXCEPT=1
        break
      fi
    fi
  done
fi

# An excepted link gets no further treatment
if [ "$FOUND_EXCEPT" -eq 1 ]; then
  continue
fi
1075
# If appropriate, record this link to the log, with clickable URLs when possible
if [[ "$STATUS" != "OK" ]] || [ "$RECORD_OK_LINKS" -eq 1 ]; then
  # Prepare 'curl' result in parentheses to print after status code, unless this is an "IW" or "EI"
  # link, in which case showing the status code doesn't make sense. Adjust spacing after string to
  # ensure TXT and RTF reports have aligned columns of results.
  CURL_STR_H=" ($CURL_RESULT)"
  CURL_STR_T="$CURL_STR_H"
  CURL_STR_R="$CURL_STR_H "
  if [[ "$STATUS" == "IW" || "$STATUS" == "EI" ]]; then
    CURL_STR_H=""
    CURL_STR_T=" "
    CURL_STR_R=" "
  fi

  # Record link and its wiki page in TXT, RTF, and HTML markup
  valPrint t "${STATUS}${CURL_STR_T} $STR_TYPE $URL"
  valPrint t " linked from $FULL_PAGE_PATH"
  valPrint r "${STATUS}${CURL_STR_R}${RTF_TABS}$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
  valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
  valPrint hn "<tr><td style=\"white-space:nowrap\">${STATUS}${CURL_STR_H}</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
  valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"

  # Add the follow-up line that suits this link's status
  case "$STATUS" in
    OK)
      # Place vertical space here since we won't be printing anything more about this link
      valPrint tr ""
      valPrint hs ""
      ;;
    RD)
      # Record redirect URL if one was given by a 3xx response page
      valPrint ts " Server suggests $NEW_URL"
      valPrint rs " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
      valPrint hs "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
      ;;
    EI)
      # Notify reader if we can use an intrawiki link for this URL; the page is what remains of the
      # URL after the scheme ("xxx://") and everything up to the first slash that follows it
      INTRA_PAGE=${URL#*://*/}
      valPrint ts " Just use [[$INTRA_PAGE]]"
      valPrint rs " Just use [[$INTRA_PAGE]]"
      valPrint hs "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
      ;;
    IW)
      # Notify reader if we can use an interwiki prefix for this URL; the page is whatever follows
      # the last slash of the URL
      INTER_PAGE=${URL##*/}
      valPrint ts " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
      valPrint rs " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
      valPrint hs "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
      ;;
    NG)
      # Query Internet Archive for latest "OK" snapshot for "NG" page
      if [ "$SUGGEST_SNAPSHOTS" -eq 1 ]; then
        ARCHIVE_QUERY=$(curl --silent --max-time "$TIMEOUT" "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")

        # If a "closest" snapshot was received (the tag appears anywhere in the response)...
        if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
          # In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
          ARCHIVE_QUERY=$(printf '%s\n' "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')

          # ...isolate "url" property in the response that follows the "closest" tag
          SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
          SNAPSHOT_URL=${SNAPSHOT_URL#*\"url\": \"} # everything after '"url": "'
          SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'

          # Remove the port 80 part that IA often adds to the URL, as it's superfluous
          SNAPSHOT_URL=${SNAPSHOT_URL/:80/}

          # Inform the user of the snapshot URL
          valPrint ts " IA suggests $SNAPSHOT_URL"
          valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
          valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
        else # ...otherwise give generic Wayback Machine link for this URL
          valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
          valPrint rs " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
          valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
        fi
      fi
      ;;
  esac
fi
1152
# If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
if [ "$IS_FILE" -eq 0 ] && [ "$TAKE_PAGE_SHOT" -eq 1 ] && [[ "$STATUS" == "OK" ]]; then
  # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
  SHOT_NAME=$(printf '%s\n' "$URL" | sed -e 's/https*:\/\///' -e 'y/:\//__/')
  SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"

  # Don't take screenshot if we already encountered this page and screenshotted it
  if [ ! -f "$SHOT_FILE" ]; then
    # $URL is quoted so that Chrome receives it as one argument with no pathname expansion applied
    "$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 "$URL" > /dev/null 2>&1

    # If the image appeared at $WORKING_DIR/$CHROME_SCREENSHOT, file it under the sanitized name;
    # 'mv -n' will not overwrite an existing file
    if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
      mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
    else
      valPrint trhs "Screenshot of URL $URL seems to have failed!"
    fi
  else
    valPrint trhs "Skipping screenshot of URL '$URL' because file '$SHOT_FILE' already exists."
  fi
fi
1171	done
# Every line of the links file has been processed; record that the list was completed and hand
# over to the wrap-up routine
FINISHED_LIST='yes'
wrapupAndExit

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: Validate External Links/validate_external_links.sh@ 1144

Download in other formats: