source: Validate External Links/validate_external_links.sh@1164

Last change on this file since 1164 was 1160, checked in by iritscen, 3 years ago

ValExtLinks: Added some entries to the lists of known file and page suffixes.

File size: 54.7 KB
1#!/bin/bash
2
3# Validate External Links by Iritscen
4#
5# Validates a list of external links in CSV format. The resulting logs are produced in three formats:
6# - TXT (for easy diffing with an earlier log)
7# - RTF (for reading as a local file with clickable links)
8# - HTML (for reading as a web page)
9# Call script with "--help" argument for documentation. Also see Read Me.rtf for critical notes.
10#
11# Recommended rule:
12# |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|
13#
14# Table of contents (sections of script in order of appearance, not execution):
15# • Globals
16# • Help Output
17# • Setup
18# • Utility Functions
19# • Summary Output
20# • Initialization
21# • Data Sourcing
22# • Config Output
23# • Legend Output
24# • Main Loop
25
26# Set separator token to newline
27IFS="
28"
29
30### GLOBALS ###
31# Settings -- these will be changed from their defaults by the arguments passed in to the script
32LINKS_URL="" # use 'curl' to download file with links from this location (can be file://)
33EXCEPT_URL="" # 'curl' will access this wiki page with a list of exceptions for NG results
34OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
35RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES
36SHOW_SLASH=0 # record issue when a slash is added to the end of a URL
37SHOW_HTTPS=0 # record issue when "http" is upgraded to "https"
38SHOW_YT_RD=0 # record redirection for a youtu.be URL expanding to the full URL
39SUGGEST_SNAPSHOTS_NG=0 # query the Internet Archive for a possible snapshot URL for each NG page
40SUGGEST_SNAPSHOTS_OK=0 # query the Internet Archive for an existing snapshot of each OK page
41CHECK_ARCHIVE_LINKS=0 # check URLs on archive.org and archive.is
42TAKE_PAGE_SHOT=0 # take a screenshot of each OK page
43TIMEOUT=10 # time to wait for a response when querying a site
44CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature
45URL_START=1 # start at this URL in LINKS_FILE
46URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE
47UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report
48
49# Fixed strings -- see the occurrences of these variables to learn their purpose
50AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/88.0.4324.146 Safari/537.36"
51ARCHIVE_API="http://archive.org/wayback/available"
52ARCHIVE_GENERIC="https://web.archive.org/web/*"
53ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
54CHROME_SCREENSHOT="screenshot.png"
55EXCEPT_FILE_NAME="exceptions.txt"
56EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
57WIKI_CURL="https://wiki.oni2.net/Validate_External_Links/Curl_codes"
58WIKI_HTTP="https://wiki.oni2.net/Validate_External_Links/HTTP_codes"
59WIKI_MAIN="https://wiki.oni2.net/Validate_External_Links"
60WIKI_ME="http://iritscen.oni2.net"
61THIS_DIR=$(cd $(dirname $0); pwd)
62WORKING_DIR=$(pwd)
63WIKI_PATH="wiki.oni2.net"
64
65# These are parallel arrays of the IDs and names of OniGalore's current namespaces
66declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
67declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")
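# (Illustrative note: the two arrays are matched by index -- e.g. NS_IDS[8] is 6 and NS_NAMES[8] is
# "File", so a CSV row whose namespace ID is 6 is treated as a "File:" page by the lookup in the main loop.)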
68
69# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
70# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
71declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png psd py rar tga TRMA txt vbs wav wmv xaf xcf xml zip)
72declare -a HTTP_TLDS_AND_PAGES=(abstract action ars asp aspx cfm cgi com css de do full htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x)
73
74# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
75# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
76# if you add a new code.
77declare -a OK_CODES=(200 401 405 406 418 501)
78declare -a RD_CODES=(301 302 303 307 308)
79declare -a NG_CODES=(000 400 403 404 410 429 500 502 503 504 530)
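# (Illustrative note: a link that returns 404 matches NG_CODES and is reported as NG, a 301 matches
# RD_CODES and is examined as a redirect, and a 200 matches OK_CODES and is only written to the
# report if --record-ok-links was given.)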
80
81# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
82# transcluded text, and if the transclusion fails, then the braces show up in the URL
83ILLEGAL_CHARS="{ }"
84
85# The shortest URL possible, used for sanity-checking some URLs: http://a.co
86MIN_URL_LENGTH=11
87
88# These are parallel arrays giving the prefixes that can be used in place of normal external links to
89# some wikis and other sites; based on https://wiki.oni2.net/Special:Interwiki
90declare -a INTERWIKI_PREFIXES=(commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies wikiversity wikivoyage wikt wp)
91declare -a INTERWIKI_DOMAINS=(commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org wikiversity.org wikivoyage.org wiktionary.org wikipedia.org)
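# (Illustrative example: "wikipedia.org" pairs with the "wp" prefix, so a link such as
# https://en.wikipedia.org/wiki/Oni would be flagged IW and the report would suggest [[wp:Oni]],
# the page name being everything after the URL's last slash.)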
92
93# Variables for keeping track of main loop progress and findings
94LINK_NUM=0
95EI_LINKS=0
96IW_LINKS=0
97OK_LINKS=0
98RD_LINKS=0
99NG_LINKS=0
100SKIP_UNK_NS=0
101SKIP_JS_PAGE=0
102SKIP_BAD_URL=0
103SKIP_NON_ASCII=0
104SKIP_UNK_SUFFIX=0
105SKIP_UNK_CODE=0
106SKIP_EXPECT_NG=0
107SKIP_EXPECT_RD=0
108SKIP_EXPECT_EI=0
109SKIP_EXPECT_IW=0
110SKIP_HTTPS_UP=0
111SKIP_SLASH_ADD=0
112SKIP_YOUTU_BE=0
113SKIP_ARCHIVES=0
114FILE_LINKS=0
115PAGE_LINKS=0
116SKIPPED_HEADER_ROW=0
117FINISHED_LIST="no"
118START_RUN=0
119END_RUN=0
120
121
122### HELP OUTPUT ###
123# A pseudo-man page. Here is the 80-character rule for the page text:
124# 234567890123456789012345678901234567890123456789012345678901234567890123456789
125function printHelp()
126{
127 cat << EOF
128
129NAME
130 Validate External Links
131
132SYNOPSIS
133 validate_external_links.sh --help
134 validate_external_links.sh --links URL --output DIR [--exceptions URL]
135 [--record-ok-links] [--show-added-slashes] [--show-https-upgrades]
136 [--show-yt-redirects] [--suggest-snapshots-ng] [--suggest-snapshots-ok]
137 [--check-archive-links] [--take-screenshots FILE] [--timeout NUM]
138 [--start-url NUM] [--end-url NUM] [--upload FILE]
139
140DESCRIPTION
141 This script parses a list of external links found in the OniGalore wiki
142 (which is dumped by the Oni2.net server periodically in a particular
143 format), validates them using the Unix tool 'curl', and produces a report
144 of which links were "OK" (responded positively to an HTTP query), which
145 were "RD" (responded with a 3xx redirect code), which could be "IW"
146 (interwiki) links, which were "EI" (external internal) links and could be
147 intrawiki links, and which were "NG" (no good; a negative response to the
148 query). This report can then be automatically uploaded to the location of
149 your choice. The script can also suggest Internet Archive snapshots for
150 "NG" links, and take screenshots of "OK" links for visual verification by
151 the reader that the page in question is the one intended to be displayed.
152
153 You must pass this script the URL at which the list of links is found
154 (--links) and the path where the directory of logs should be created
155 (--output). All other arguments are optional.
156
157OPTIONS
158 --help Show this page.
159 --links URL (required) URL from which to download the CSV
160 file with external links. Note that this URL can
161 be a local file if you supply a file:// path.
162 --output DIR (required) Unix path to directory in which Val
163 should place its reports.
164 --exceptions URL To omit from the report links that Val flags
165 but which you regard as OK, list those
166 exceptions on a wiki page.
167 See the sample file "exceptions.pdf" for the
168 required format of the page. Note that this URL
169 can point to a local file if you supply a path
170 beginning with "file://".
171 --record-ok-links Log a link in the report even if its response
172 code is "OK".
173 --show-added-slashes Report on redirects that simply add a '/' to the
174 end of the URL.
175 --show-https-upgrades Report on redirects that simply upgrade an
176 "http://" URL to an "https://" URL.
177 --show-yt-redirects Report on redirects that expand a youtu.be URL.
178 --suggest-snapshots-ng Query the Internet Archive for a possible
179 snapshot URL for each "NG" page.
180 --suggest-snapshots-ok Query the Internet Archive for a snapshot of each
181 "OK" page just to make sure it's available. Note
182 that this will add a tremendous amount of time to
183 the script execution because there is a rate
184 limit on the Archive API. Note that this option
185 does nothing unless you also use the
186 --record-ok-links argument.
187 --check-archive-links Check links that are already pointing to a page
188 on the Internet Archive or archive.is (AKA
189 archive.today). In theory these links should be
190 totally stable and not need validation.
191 --take-screenshots FILE Call the Google Chrome binary at this path to
192 take screenshots of each "OK" page.
193 --timeout NUM Wait this many seconds for a site to respond. The
194 default is 10. Important note: Val will attempt
195 to reach each URL three times, so the time taken
196 to ping an unresponsive site will be three times
197 this setting.
198 --start-url NUM Start at this link in the links CSV file.
199 --end-url NUM Stop at this link in the links CSV file.
200 --upload FILE Upload report using the credentials and path
201 given in this local text file. See sftp_login.txt
202 for template.
203
204BUGS
205 The script cannot properly parse any line in the external links file
206 which contains a comma in the name of the wiki page containing a link.
207 Commas in the link itself are not an issue.
208EOF
209}
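# Example invocation (illustrative only -- the URL and paths below are hypothetical):
# ./validate_external_links.sh --links http://example.com/dumps/extlinks.csv \
#   --output ~/val_reports --exceptions file:///Users/me/exceptions.txt \
#   --timeout 15 --suggest-snapshots-ng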
210
211
212### SETUP ###
213# If first argument is a help request, or if nothing was passed in at all, print help page and quit
214if [ "$#" -eq 0 ] || [ "$1" == "--help" ]; then
215 printHelp | less
216 exit 0
217fi
218
219# Parse arguments as long as there are more arguments to process
220while (( "$#" )); do
221 case "$1" in
222 --links ) LINKS_URL="$2"; shift 2;;
223 --exceptions ) EXCEPT_URL="$2"; shift 2;;
224 --output ) OUTPUT_DIR="$2"; shift 2;;
225 --record-ok-links ) RECORD_OK_LINKS=1; shift;;
226 --show-added-slashes ) SHOW_SLASH=1; shift;;
227 --show-https-upgrades ) SHOW_HTTPS=1; shift;;
228 --show-yt-redirects ) SHOW_YT_RD=1; shift;;
229 --suggest-snapshots-ng ) SUGGEST_SNAPSHOTS_NG=1; shift;;
230 --suggest-snapshots-ok ) SUGGEST_SNAPSHOTS_OK=1; shift;;
231 --check-archive-links ) CHECK_ARCHIVE_LINKS=1; shift;;
232 --take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
233 --timeout ) TIMEOUT=$2; shift 2;;
234 --start-url ) URL_START=$2; shift 2;;
235 --end-url ) URL_LIMIT=$2; shift 2;;
236 --upload ) UPLOAD_INFO=$2; shift 2;;
237 * ) echo "Invalid argument '$1' detected. Aborting."; exit 1;;
238 esac
239done
240
241# If the required arguments were not supplied, print help page and quit
242if [ -z $LINKS_URL ] || [ -z $OUTPUT_DIR ]; then
243 echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
244 exit 2
245fi
246
247# If user wants screenshots, make sure path to Chrome was passed in and is valid
248if [ $TAKE_PAGE_SHOT -eq 1 ]; then
249 if [ ! -f "$CHROME_PATH" ]; then
250 echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
251 exit 3
252 fi
253fi
254
255# Check that UPLOAD_INFO exists, if this argument was supplied
256if [ ! -z $UPLOAD_INFO ] && [ ! -f "$UPLOAD_INFO" ]; then
257 echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
258 exit 4
259fi
260
261# Check that OUTPUT_DIR is a directory
262if [ ! -d "$OUTPUT_DIR" ]; then
263 echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
264 exit 5
265fi
266
267# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
268SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
269NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
270OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
271OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
272SHOT_PATH="$OUTPUT_PATH/Screenshots"
273LOG_NAME="ValExtLinks report"
274LOG_NAME_TXT="$LOG_NAME.txt"
275LOG_NAME_RTF="$LOG_NAME.rtf"
276LOG_NAME_HTM="$LOG_NAME.htm"
277LOG_PATH="$OUTPUT_PATH/$LOG_NAME"
278LOG_PATH_TXT="$LOG_PATH.txt"
279LOG_PATH_RTF="$LOG_PATH.rtf"
280LOG_PATH_HTM="$LOG_PATH.htm"
281mkdir "$OUTPUT_PATH"
282if [ $TAKE_PAGE_SHOT -eq 1 ]; then
283 mkdir "$SHOT_PATH"
284fi
285
286# Check that 'mkdir' succeeded
287if [ ! -d "$OUTPUT_PATH" ]; then
288 echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_PATH. Aborting."
289 exit 6
290fi
291
292# Get date on the file at LINKS_URL and print to log
293LINKS_DATE=$(curl --silent --head $LINKS_URL | grep "Last-Modified")
294if [ -z "$LINKS_DATE" ]; then
295 echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
296 exit 7
297fi
298LINKS_DATE=${LINKS_DATE#Last-Modified: }
299
300
301### UTILITY FUNCTIONS ###
302# Writes a plain-text header to TXT log file
303function printTXTheader()
304{
305 valPrint t "Validate External Links report"
306 valPrint t "generated $NICE_TIME"
307 valPrint t "from data of $LINKS_DATE"
308 valPrint t "script by Iritscen (contact: $WIKI_ME)"
309 valPrint t ""
310}
311
312# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified
313function printRTFheader()
314{
315 valPrint r "{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
316{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
317{\colortbl;\red255\green255\blue255;}
318{\*\expandedcolortbl;;}
319\margl1440\margr1440\vieww12600\viewh12100\viewkind0
320\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0
321
322\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
323generated $NICE_TIME\\
324from data of $LINKS_DATE\\
325script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$WIKI_ME\"}}{\fldrslt contact}})
326\\
327\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
328\cf0 "
329}
330
331# Closes the RTF markup of the RTF log file
332function printRTFfooter()
333{
334 valPrint r "}"
335}
336
337# Writes the HTML header to HTML log file
338function printHTMheader()
339{
340 valPrint h "<html>
341<head>
342<title>Validate External Links report</title>
343</head>
344<body>
345<h2>Validate External Links report</h2>
346<h3>generated $NICE_TIME<br />
347from data of $LINKS_DATE<br />
348script by Iritscen (<a href=\"$WIKI_ME\" target=\"_blank\">contact</a>)</h3>"
349}
350
351# Closes the HTML markup of the HTML log file
352function printHTMfooter()
353{
354 valPrint h "</body>
355</html>"
356}
357
358# The central logging function. The first parameter is a string composed of one or more characters that
359# indicate which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
360# 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 's' means "Print an
361# extra newline at the end." 'w' means "Don't pass console output through 'fmt'" ("fmt" fits the output
362# to an 80-column CLI but can break special formatting and the 'n' option).
363function valPrint()
364{
365 if [[ "$1" == *c* ]]; then
366 if [[ "$1" == *n* ]]; then
367 echo -n "$2"
368 elif [[ "$1" == *w* ]]; then
369 echo "$2"
370 elif [[ "$1" == *s* ]]; then
371 echo -e "$2\n"
372 else
373 echo "$2" | fmt -w 80
374 fi
375 fi
376 if [[ "$1" == *t* ]]; then
377 if [[ "$1" == *n* ]]; then
378 echo -n "$2" >> "$LOG_PATH_TXT"
379 elif [[ "$1" == *s* ]]; then
380 echo -e "$2\n" >> "$LOG_PATH_TXT"
381 else
382 echo "$2" >> "$LOG_PATH_TXT"
383 fi
384 fi
385 if [[ "$1" == *r* ]]; then
386 if [[ "$1" == *n* ]]; then
387 echo "$2" >> "$LOG_PATH_RTF"
388 elif [[ "$1" == *s* ]]; then
389 echo "$2\line\line" >> "$LOG_PATH_RTF"
390 else
391 echo "$2\line" >> "$LOG_PATH_RTF"
392 fi
393 fi
394 if [[ "$1" == *h* ]]; then
395 if [[ "$1" == *s* ]]; then
396 echo "$2<tr><td>&nbsp;</td></tr>" >> "$LOG_PATH_HTM"
397 elif [[ "$1" == *n* ]]; then
398 echo "$2" >> "$LOG_PATH_HTM"
399 else
400 echo "$2<br />" >> "$LOG_PATH_HTM"
401 fi
402 fi
403}
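# (Usage note summarizing the flags above: e.g. 'valPrint ctrh "text"' writes "text" to the console
# and all three logs, while 'valPrint hn "<table>"' writes raw HTML to the HTML log with no
# trailing "<br />".)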
404
405# Pluralize the string in parameter 1 if the number in parameter 2 is not 1
406function pluralCheckNoun()
407{
408 if [ $2 -ne 1 ]; then
409 if [[ $1 =~ x$ ]]; then
410 echo $1es
411 else
412 echo $1s
413 fi
414 else
415 echo $1
416 fi
417}
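# (e.g. "$(pluralCheckNoun link 2)" yields "links", "$(pluralCheckNoun link 1)" yields "link", and
# nouns ending in 'x' get "es" instead of "s".)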
418
419# Output "is" if parameter 1 is 1, otherwise "are"
420function pluralCheckIs()
421{
422 if [ $1 -ne 1 ]; then
423 echo "are"
424 else
425 echo "is"
426 fi
427}
428
429# Output "was" if parameter 1 is 1, otherwise "were"
430function pluralCheckWas()
431{
432 if [ $1 -ne 1 ]; then
433 echo "were"
434 else
435 echo "was"
436 fi
437}
438
439# Output "a " if parameter 1 is 1, otherwise nothing
440function pluralCheckA()
441{
442 if [ $1 -eq 1 ]; then
443 echo "a "
444 fi
445}
446
447# Output "an " if parameter 1 is 1, otherwise nothing
448function pluralCheckAn()
449{
450 if [ $1 -eq 1 ]; then
451 echo "an "
452 fi
453}
454
455# Upload the reports using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
456# reports being saved to disk have already been closed.
457function uploadReport()
458{
459 valPrint c "Uploading reports..."
460
461 SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
462 SFTP_USER_NAME_MARKER="user:"
463 SFTP_PASSWORD_MARKER="pw:"
464 SFTP_PORT_MARKER="port:"
465 SFTP_PATH_MARKER="path:"
466 SFTP_USER_NAME=$(grep $SFTP_USER_NAME_MARKER $UPLOAD_INFO)
467 SFTP_USER_NAME=${SFTP_USER_NAME#$SFTP_USER_NAME_MARKER}
468 SFTP_PASSWORD=$(grep $SFTP_PASSWORD_MARKER $UPLOAD_INFO)
469 SFTP_PASSWORD=${SFTP_PASSWORD#$SFTP_PASSWORD_MARKER}
470 SFTP_PORT=$(grep $SFTP_PORT_MARKER $UPLOAD_INFO)
471 SFTP_PORT=${SFTP_PORT#$SFTP_PORT_MARKER}
472 SFTP_PATH=$(grep $SFTP_PATH_MARKER $UPLOAD_INFO)
473 SFTP_PATH=${SFTP_PATH#$SFTP_PATH_MARKER}
474
475 for SUFFIX in htm rtf txt; do
476 expect "$SCRIPT_PATH" "$LOG_PATH.$SUFFIX" $SFTP_USER_NAME $SFTP_PASSWORD $SFTP_PORT $SFTP_PATH "$LOG_NAME.$SUFFIX"
477 EXPECT_RESULT=$? # capture the 'expect' exit status before the next test overwrites $?
478 if [ $EXPECT_RESULT -ne 0 ]; then
479 valPrint c "Error $EXPECT_RESULT occurred when attempting to upload $LOG_NAME.$SUFFIX!"
480 else
481 valPrint c "Report in `echo $SUFFIX | tr [:lower:] [:upper:]` format was uploaded."
482 fi
483 done
484}
485
486# Prints session summary when script is done
487function wrapupAndExit()
488{
489 # Get off progress line on console, drop down a line from last link in log, and close HTML table
490 valPrint ctr ""
491 valPrint h "</table><br />"
492
493 # If we didn't finish processing the last URL, then the iterator is one too high
494 if [ $FINISHED_LIST != "yes" ]; then
495 let LINK_NUM-=1
496 if [ $FINISHED_LIST == "no" ]; then
497 valPrint ctrh "The session was canceled by the user."
498 fi
499 fi
500
501 # Generate string with elapsed time
502 END_RUN=$(date +%s)
503 ELAPSED=$(echo $(($END_RUN - $START_RUN)) | awk '{printf "%d min. %d sec. elapsed", int($1/60), int($1%60)}')
504
505 # Do some math on results of session
506 LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
507 TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
508 LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
509 LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_RD+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
510 LINK_PROBLEMS_TOTAL=$((NG_LINKS+RD_LINKS+EI_LINKS+IW_LINKS))
511 LINK_PROBLEMS_NG=$((NG_LINKS-SKIP_EXPECT_NG))
512 LINK_PROBLEMS_RD=$((RD_LINKS-SKIP_EXPECT_RD))
513 LINK_PROBLEMS_EI=$((EI_LINKS-SKIP_EXPECT_EI))
514 LINK_PROBLEMS_IW=$((IW_LINKS-SKIP_EXPECT_IW))
515 LINK_PROBLEMS_NET=$((LINK_PROBLEMS_NG+LINK_PROBLEMS_RD+LINK_PROBLEMS_EI+LINK_PROBLEMS_IW))
516
517 # Print something in the Links section if no link issues were printed
518 if [ $LINK_PROBLEMS_NET -eq 0 ]; then
519 valPrint h "<i>No link problems to report! See the <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for a list of links with issues that were not reported.</i>"
520 fi
521 if [ $LINK_PROBLEMS_TOTAL -eq 0 ]; then
522 valPrint t "No link problems to report!"
523 valPrint r "\i1 No link problems to report! \i0"
524 fi
525
526 ## SUMMARY OUTPUT ##
527 valPrint ct "Summary ($ELAPSED):"
528 valPrint r "\b1 Summary \b0 ($ELAPSED)"
529 valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
530 valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT) (there $(pluralCheckWas $FILE_LINKS) $FILE_LINKS file $(pluralCheckNoun link $FILE_LINKS) and $PAGE_LINKS page $(pluralCheckNoun link $PAGE_LINKS))."
531
532 # Print processed link totals
533 if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
534 if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
535 if [ $SKIP_ARCHIVES -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVES archive.org/archive.is $(pluralCheckNoun link $SKIP_ARCHIVES) were not checked"; fi
536 if [ $LINK_PROBLEMS_TOTAL -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_TOTAL processed $(pluralCheckNoun link $LINK_PROBLEMS_TOTAL) had $(pluralCheckAn $LINK_PROBLEMS_TOTAL)$(pluralCheckNoun issue $LINK_PROBLEMS_TOTAL)"; fi
537 if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED link $(pluralCheckNoun issue $LINKS_EXCEPTED) from report)"; valPrint h "&nbsp;&nbsp;(excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
538 if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi
539 if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h "&nbsp;&nbsp;(counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi
540
541 # Print errored link totals
542 if [ $LINK_ERRORS -gt 0 ]; then
543 valPrint c "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"
544 valPrint h "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
545 valPrint rt "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS):"
546 fi
547 if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
548 if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
549 if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
550 if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
551 if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
552 if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
553
554 # Print excepted link totals
555 if [ $LINKS_EXCEPTED -gt 0 ]; then
556 valPrint c "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"
557 valPrint h "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
558 valPrint rt "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted:"
559 fi
560 if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
561 if [ $SKIP_EXPECT_RD -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_RD/$RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
562 if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
563 if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi
564
565 # Print checked link totals
566 if [ $LINK_PROBLEMS_NET -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS_NET link $(pluralCheckNoun issue $LINK_PROBLEMS_NET):"; fi
567 if [ $LINK_PROBLEMS_NG -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_NG NG $(pluralCheckNoun link $LINK_PROBLEMS_NG)"; fi
568 if [ $LINK_PROBLEMS_RD -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_RD $(pluralCheckNoun redirection $LINK_PROBLEMS_RD)"; fi
569 if [ $LINK_PROBLEMS_EI -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_EI $(pluralCheckNoun link $LINK_PROBLEMS_EI) that could be intrawiki"; fi
570 if [ $LINK_PROBLEMS_IW -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_IW $(pluralCheckNoun link $LINK_PROBLEMS_IW) that could be interwiki"; fi
571
572 # Close the log files' markup
573 valPrint trh "ValExtLinks says goodbye."
574 printRTFfooter
575 printHTMfooter
576
577 # Upload report if this was requested
578 if [ ! -z $UPLOAD_INFO ] && [ $FINISHED_LIST != "no" ]; then
579 uploadReport
580 fi
581
582 # Really quit now
583 valPrint c "ValExtLinks says goodbye."
584 exit 0
585}
586trap wrapupAndExit INT
587
588
589### INITIALIZATION ###
590# Print opening message to console and log files
591valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
592printTXTheader
593printRTFheader
594printHTMheader
595
596## DATA SOURCING ##
597valPrint t "Startup:"
598valPrint r "\b1 Startup \b0"
599valPrint hn "<h3>Startup</h3>"
600
601# Attempt to download file at LINKS_URL, then check that it succeeded
602valPrint cwtrhn "Downloading list of external links from $LINKS_URL..."
603LINKS_FILE_NAME=$(echo "$LINKS_URL" | sed 's/.*\///')
604LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
605curl --silent -o "$LINKS_FILE" $LINKS_URL
606if [ ! -f "$LINKS_FILE" ]; then
607 echo -e "\nThe download of $LINKS_URL appears to have failed. Aborting."
608 wrapupAndExit
609else
610 valPrint ctrh " success."
611fi
612
613# Attempt to download file at EXCEPT_URL, then check that it succeeded
614if [ ! -z $EXCEPT_URL ]; then
615 valPrint cwtrhn "Downloading list of reporting exceptions from $EXCEPT_URL..."
616 EXCEPT_DATA=$(curl --silent $EXCEPT_URL)
617 if [ -z "$EXCEPT_DATA" ]; then
618 echo -e "\nThe download of the exceptions data from '$EXCEPT_URL' appears to have failed. Aborting."
619 wrapupAndExit
620 else
621 valPrint ctrh " success."
622 fi
623 EXCEPT_DATA=${EXCEPT_DATA%END LIST*}
624 EXCEPT_DATA=${EXCEPT_DATA#*BEGIN LIST}
625 EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
626
627 # Store on disk for debugging purposes
628 echo "$EXCEPT_DATA" > "$EXCEPT_FILE"
629
630 # Transfer to array for easy searching later
631 declare -a EXCEPT_ARRAY=($(echo "$EXCEPT_DATA"))
632fi
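# (Illustrative note: judging by how EXCEPT_ARRAY is searched in the main loop, each line between
# "BEGIN LIST" and "END LIST" takes the form "code,URL,page", e.g. the hypothetical entry
# "404,http://example.com/old_page,Main_Page"; see the exceptions sample file mentioned in the help
# text for the authoritative format.)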
633
634# Pipe 'cat' to 'wc' because passing LINKS_FILE to 'wc' would print the file name after the line count
635LINK_COUNT_STRING=$(cat "$LINKS_FILE" | wc -l)
636
637# Number of URLs is number of lines minus one (first line is column header row for the CSV)
638LINK_COUNT=$(echo "${LINK_COUNT_STRING}" | tr -d '[:space:]')
639let LINK_COUNT-=1
640valPrint ctrh "Found $LINK_COUNT links to process."
641valPrint trh ""
642
643## CONFIG OUTPUT ##
644valPrint t "Config:"
645valPrint r "\b1 Config \b0"
646valPrint hn "<h3>Config</h3>"
647
648valPrint ctrhn "Links to consider: "
649if [ $URL_LIMIT -ne 0 ] && [ $URL_LIMIT -lt $LINK_COUNT ]; then
650 valPrint ctrh "$((URL_LIMIT-URL_START+1)), from link $URL_START to $URL_LIMIT"
651elif [ $URL_START -ne 1 ]; then
652 valPrint ctrh "$((LINK_COUNT-URL_START+1)), from link $URL_START to $LINK_COUNT"
653else
654 valPrint ctrh "$LINK_COUNT"
655fi
656
657valPrint ctrh "Site query timeout: $TIMEOUT seconds"
658
659valPrint ctrhn "Show OK links: "
660if [ $RECORD_OK_LINKS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
661
662valPrint ctrhn "Take screenshots: "
663if [ $TAKE_PAGE_SHOT -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
664
665valPrint ctrhn "Suggest archive.org snapshots for NG pages: "
666if [ $SUGGEST_SNAPSHOTS_NG -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
667
668valPrint ctrhn "Suggest archive.org snapshots for OK pages: "
669if [ $SUGGEST_SNAPSHOTS_OK -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
670
671valPrint ctrhn "Ignore slash-adding redirects: "
672if [ $SHOW_SLASH -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
673
674valPrint ctrhn "Ignore HTTPS-upgrading redirects: "
675if [ $SHOW_HTTPS -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
676
677valPrint ctrhn "Ignore youtu.be redirects: "
678if [ $SHOW_YT_RD -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
679
680valPrint ctrhn "Check archive.org and archive.is links: "
681if [ $CHECK_ARCHIVE_LINKS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
682
683valPrint tr "A summary of my findings will be found at the bottom of the report."
684valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
685valPrint trh ""
686
687## LEGEND OUTPUT ##
688valPrint t "Legend:"
689valPrint r "\b1 Legend \b0"
690valPrint hn "<h3>Legend</h3>"
691valPrint t "(For guidance in fixing these links, see $WIKI_MAIN.)"
692valPrint r "(For guidance in fixing these links, see {\field{\*\fldinst{HYPERLINK \"$WIKI_MAIN\"}}{\fldrslt here}}.)"
693valPrint h "(For guidance in fixing these links, see <a href=\"$WIKI_MAIN\" target=\"_blank\">here</a>.)"
694valPrint trh "OK = URL seems to be working"
695valPrint trh "NG = URL no longer seems to work"
696valPrint trh "RD = URL is redirecting to this new URL"
697valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup"
698valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup"
699valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $WIKI_HTTP)"
700valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$WIKI_HTTP\"}}{\fldrslt here}} for code reference)"
701valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$WIKI_HTTP\" target=\"_blank\">here</a> for code reference)"
702valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $WIKI_CURL)"
703valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$WIKI_CURL\"}}{\fldrslt here}} for code reference)"
704valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$WIKI_CURL\" target=\"_blank\">here</a> for code reference)"
705valPrint trh "IA suggests = Last available snapshot returned by the Internet Archive"
706valPrint trh "Try browsing = The Archive failed to return a snapshot URL, so check for a snapshot manually using this link"
707valPrint trh ""
708
709
710### MAIN LOOP ###
711valPrint t "Links:"
712valPrint r "\b1 Links \b0"
713valPrint hn "<h3>Links</h3>"
714START_RUN=$(date +%s)
715# Process each line of the .csv in LINKS_FILE
716for LINE in `cat "$LINKS_FILE"`; do
717 START_LINK=$(date +%s)
718 let LINK_NUM+=1
719
720 # First line is the column header row for the CSV, so let's verify that the format hasn't changed
721 if [ $SKIPPED_HEADER_ROW -eq 0 ]; then
722 if [ $LINE == "namespace,title,target" ]; then
723 SKIPPED_HEADER_ROW=1
724 LINK_NUM=0 # this line is not a link, so reset the link counter
725 valPrint hn "<table>"
726 continue
727 else
728 valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
729 wrapupAndExit
730 fi
731 fi
732
733 # Skip this link if we are not at URL_START yet
734 if [ $LINK_NUM -lt $URL_START ]; then
735 continue
736 fi
737
738 # Stop if we are at the limit declared for testing purposes
739 if [ $URL_LIMIT -gt 0 ] && [ $LINK_NUM -gt $URL_LIMIT ]; then
740 FINISHED_LIST="limit"
741 wrapupAndExit
742 fi
743
744 # Print progress to screen
745 if [ $LINK_NUM -gt 1 ]; then
746 printf "\e[1A\n" # erase previous progress message so that new one appears in its place
747 fi
748 valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
749
750 # The number of the namespace is the element before the first comma on the line
751 NS_ID=${LINE%%,*}
752
753 # Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
754 NS_NAME=""
755 a=0
756 while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
757 if [ $NS_ID == "NULL" ]; then
758 break
759 elif [ $NS_ID -eq ${NS_IDS[$a]} ]; then
760 NS_NAME="${NS_NAMES[$a]}"
761 break
762 fi
763 let a+=1
764 done
765 if [ "$NS_NAME" == "" ]; then
766 if [ $NS_ID == "NULL" ]; then
767 valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace (and probably the page too) is 'NULL'. Probably the link is no longer in existence on the wiki."
768 else
769 valPrint trs "Skipping line $LINK_NUM ('$LINE') because I could not find a name for namespace ID $NS_ID."
770 fi
771 let SKIP_UNK_NS+=1
772 let PAGE_LINKS+=1
773 continue
774 fi
775
776 # The name of the page is everything between the namespace ID and the next comma on the line (commas
777 # in page names will break this)
778 PAGE_NAME=${LINE#$NS_ID,}
779 PAGE_NAME=${PAGE_NAME%%,*}
780
781 # Build longer wiki page URLs from namespace and page names
782 FULL_PAGE_PATH=https://$WIKI_PATH/$NS_NAME:$PAGE_NAME
783 LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
784 # Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it
785 # explicitly breaks the link
786 if [ $NS_ID -eq 0 ]; then
787 FULL_PAGE_PATH=https://$WIKI_PATH/$PAGE_NAME
788 LOCAL_PAGE_PATH=$PAGE_NAME
789 fi
790
791 # We don't want to consider wiki pages ending in .js, as the MW parser cannot reliably isolate URLs
792 # in JavaScript code, so it returns erroneous links
793 PAGE_NAME_SUFFIX=$(echo $PAGE_NAME | sed 's/.*\.//')
794 if [ $PAGE_NAME_SUFFIX == "js" ]; then
795 valPrint trs "Skipping URL '${LINE#$NS_ID,$PAGE_NAME,}' on line $LINK_NUM because it was found on JavaScript page '$LOCAL_PAGE_PATH'."
796 let SKIP_JS_PAGE+=1
797 let PAGE_LINKS+=1
798 continue
799 fi
800
801 # The URL being linked to is everything after the previous two fields (this allows commas to be in
802 # the URLs, but a comma in the previous field, the page name, will break this)
803 URL=${LINE#$NS_ID,$PAGE_NAME,}
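# (Illustrative example: a hypothetical CSV line "0,Some_Page,http://example.com/info" yields
# NS_ID "0" (the Main namespace), PAGE_NAME "Some_Page" and URL "http://example.com/info".)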
804
805 # Scan for illegal characters
806 if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
807 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because it contains characters illegal in a URL."
808 let SKIP_BAD_URL+=1
809 let PAGE_LINKS+=1
810 continue
811 fi
812
813 # If we're skipping archive links, see if this is one
814 if [ $CHECK_ARCHIVE_LINKS -eq 0 ] && [[ ( $URL == *web.archive.org* || $URL == *archive.is* ) ]]; then
815 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to check archive links."
816 let SKIP_ARCHIVES+=1
817 let PAGE_LINKS+=1
818 continue
819 fi
820
821 # Now we need to know if the URL is for a file or a web page. First step is to determine if the
822 # URL ends in a suffix
823 HAS_SUFFIX=0
824
825 # If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
826 CLEAN_URL=${URL%%\?*}
827
828 # If the URL ends in something like "#section_15", strip everything from the '#' onward
829 CLEAN_URL=${CLEAN_URL%%\#*}
830
831 # 'sed' cannot handle Unicode in my Bash shell, so skip non-ASCII URLs and make the user check them
832 if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
833 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I cannot handle non-ASCII characters."
834 let SKIP_NON_ASCII+=1
835 let PAGE_LINKS+=1
836 continue
837 fi
838
839 # Isolate the characters after the last period and after the last slash
840 POST_DOT=$(echo "$CLEAN_URL" | sed 's/.*\.//')
841 POST_SLASH=$(echo "$CLEAN_URL" | sed 's/.*\///')
842
843 # If the last period comes after the last slash, then the URL ends in a suffix
844 POST_DOT_LENGTH=$(echo | awk -v input=$POST_DOT '{print length(input)}')
845 POST_SLASH_LENGTH=$(echo | awk -v input=$POST_SLASH '{print length(input)}')
846 if [ $POST_DOT_LENGTH -lt $POST_SLASH_LENGTH ]; then
847 HAS_SUFFIX=1
848 else
849 HAS_SUFFIX=0
850 fi
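# (Worked example with a hypothetical URL: for "http://example.com/docs/manual.pdf", POST_DOT is
# "pdf" and POST_SLASH is "manual.pdf", so the last dot falls after the last slash and HAS_SUFFIX
# becomes 1; for "http://example.com/docs/" the last dot precedes the last slash, so it stays 0.)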
851
852 # Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
853 # known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES
854 IS_FILE=-1
855 if [ $HAS_SUFFIX -eq 0 ]; then
856 IS_FILE=0
857 else
858 # Turn off case sensitivity while we compare suffixes
859 shopt -s nocasematch
860
861 # Special case: URLs ending in something like "productID.304297400" are pages, not files, so if
862 # the URL's suffix is all numbers, we are looking at the end of a web page URL
863 if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
864 IS_FILE=0
865 fi
866
867 # Special case: URLs ending in parentheses, e.g. "ms537113(v=vs.85)", are pages
868 if [[ $POST_DOT =~ ^.*[\(\)]$ ]]; then
869 IS_FILE=0
870 fi
871
872 # Special case: URLs containing a '%', e.g. "10.1007%2FBF00329055", are pages
873 if [[ $POST_DOT == *%* ]]; then
874 IS_FILE=0
875 fi
876
877 # If we did not identify this URL as a web page above, we need to compare the suffix against known
878 # file extensions
879 if [ $IS_FILE -eq -1 ]; then
880 for EXTENSION in "${HTTP_FILES[@]}"; do
881 if [[ $EXTENSION == $POST_DOT ]]; then # double brackets respect the nocasematch setting
882 IS_FILE=1
883 break
884 fi
885 done
886 fi
887
888 # If we did not identify this URL as a file above, we need to compare the suffix against known
889 # pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
890 # needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
891 if [ $IS_FILE -eq -1 ]; then
892 for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
893 if [[ $PAGE_OR_TLD == $POST_DOT ]]; then
894 IS_FILE=0
895 break
896 fi
897 done
898 fi
899
900 # Turn case sensitivity back on in Bash
901 shopt -u nocasematch
902 fi
903
904 # If this suffix escaped identification as either a file, page or TLD, inform the user
905 STR_TYPE=""
906 if [ $IS_FILE -eq -1 ]; then
907 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I encountered the unknown URL suffix '$POST_DOT'. Please add this suffix to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
908 let SKIP_UNK_SUFFIX+=1
909 continue
910 elif [ $IS_FILE -eq 1 ]; then
911 STR_TYPE="file"
912 let FILE_LINKS+=1
913 else
914 STR_TYPE="page"
915 let PAGE_LINKS+=1
916 fi
917
918 # Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
919 # issue with sites that require HTTPS
920 CURL_CODE=$(curl -o /dev/null --silent --insecure --compressed --head --user-agent "$AGENT" --max-time $TIMEOUT --retry 2 --write-out '%{http_code}\n' $URL)
921 CURL_ERR=$(echo $?)
922 CURL_RESULT=$CURL_CODE
923
924 # Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
925 if [ $CURL_CODE == "000" ]; then
926 CURL_RESULT="$CURL_RESULT-$CURL_ERR"
927 fi
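# (Illustrative example: a site that never answers within TIMEOUT yields CURL_CODE "000" and, since
# curl exit code 28 means the operation timed out, CURL_RESULT becomes "000-28".)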
928
929 # Begin to determine our status code for this URL (EI, IW, OK, RD, or NG)
930 STATUS="??"
931 NEW_URL=""
932 INTERWIKI_INDEX=-1
933
934 # First make sure that this isn't an "external internal" link to our own wiki that can be replaced
935 # by "[[page_name]]". If it uses a special access URL beginning with "/w/", let it pass, as it
936 # probably cannot be replaced by "[[ ]]" markup
937 if [[ $URL == *$WIKI_PATH* ]] && [[ $URL != *$WIKI_PATH/w/* ]]; then
938 STATUS="EI"
939 let EI_LINKS+=1
940 fi
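# (Illustrative example: "https://wiki.oni2.net/OBD:BINA" contains WIKI_PATH but no "/w/", so it
# would be flagged EI and the report would suggest replacing it with [[OBD:BINA]].)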
941
942 # If it's not, check if this is a link to a domain that we have an interwiki prefix for (also make
943 # sure that it's not an archive.org link to a page from an interwiki domain)
944 if [ $STATUS == "??" ] && [[ $URL != *web.archive.org* ]]; then
945 for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
946 if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]] && [[ $URL != *${INTERWIKI_DOMAINS[$i]}/w/* ]]; then
947 STATUS="IW"
948 let IW_LINKS+=1
949 INTERWIKI_INDEX=$i
950 break
951 fi
952 done
953 fi
954
955 # If we didn't match an interwiki domain, see if the status code is in our "OK" codes list
956 if [ $STATUS == "??" ]; then
957 for CODE in "${OK_CODES[@]}"; do
958 if [[ $CODE == $CURL_CODE ]]; then
959 STATUS="OK"
960 let OK_LINKS+=1
961
962 # If this is a YouTube link, we have to look at the actual page source to know if the video
963 # is good or not; override the link's info if it's actually NG
964 if [[ $URL == *www.youtube.com* ]]; then
965 PAGE_TEXT=$(curl --silent --insecure --user-agent "$AGENT" --max-time $TIMEOUT $URL | grep "\"simpleText\":\"Video unavailable\"")
966 if [ ! -z "$PAGE_TEXT" ]; then
967 STATUS="NG"
968 CURL_RESULT=404
969 let OK_LINKS-=1
970 let NG_LINKS+=1
971 fi
972 fi
973 break
974 fi
975 done
976 fi
977
978 # If we didn't get a match with the "OK" codes, check it against the "RD" codes
979 if [ $STATUS == "??" ]; then
980 for CODE in "${RD_CODES[@]}"; do
981 if [[ $CODE == $CURL_CODE ]]; then
982 # Get URL header again in order to retrieve the URL we are being redirected to
983 NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time $TIMEOUT --write-out '%{redirect_url}\n' $URL)
984
985 # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
986 # those changes out if the user didn't ask for them
987 URL_HTTP=$(echo $URL | sed -E 's/^https:/http:/')
988 NEW_URL_HTTP=$(echo $NEW_URL | sed -E 's/^https:/http:/')
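# (Illustrative example: if "http://example.com/page" redirects to "https://example.com/page",
# both normalized strings become "http://example.com/page", so by default the upgrade is counted
# as OK rather than reported as RD.)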
989
990 # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
991 NEW_URL_LENGTH=$(echo | awk -v input=$NEW_URL_HTTP '{print length(input)}')
992 if [ $NEW_URL_LENGTH -lt $MIN_URL_LENGTH ]; then
993 NEW_URL_HTTP="[new URL not retrieved]"
994 fi
995
996 # Remove slash at end of new URL, if present, so we can filter out the redirects that
997 # merely add an ending slash if the user didn't ask for them
998 NEW_URL_NO_SLASH=$(echo $NEW_URL_HTTP | sed -E 's:/$::')
999
1000 # Detect if this is a youtu.be link simply being expanded by YouTube to the full
1001 # youtube.com address
1002 YOUTU_BE=0
1003 if [[ $URL_HTTP == http://youtu.be* ]] && [[ $NEW_URL_HTTP == http://www.youtube.com* ]]; then
1004 YOUTU_BE=1
1005 fi
1006
1007 # If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
1008 # wants those to be reported)
1009 if [ $SHOW_HTTPS -eq 0 ] && [ $URL_HTTP == $NEW_URL_HTTP ]; then
1010 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show http->https upgrades, and I was redirected to '$NEW_URL'."
1011 STATUS="OK"
1012 let OK_LINKS+=1
1013 let SKIP_HTTPS_UP+=1
1014 # If the URLs match besides an added ending slash, then the link is OK (unless user wants
1015 # those to be reported)
1016 elif [ $SHOW_SLASH -eq 0 ] && [ $URL_HTTP == $NEW_URL_NO_SLASH ]; then
1017 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show added trailing slashes, and I was redirected to '$NEW_URL'."
1018 STATUS="OK"
1019 let OK_LINKS+=1
1020 let SKIP_SLASH_ADD+=1
1021 elif [ $YOUTU_BE -eq 1 ]; then
1022 # We have to look at the actual page source to know if a YouTube video is good or not
1023 PAGE_TEXT=$(curl --silent --insecure --user-agent "$AGENT" --max-time $TIMEOUT $NEW_URL | grep "\"simpleText\":\"Video unavailable\"")
1024 if [ ! -z "$PAGE_TEXT" ]; then
1025 STATUS="NG"
1026 let NG_LINKS+=1
1027 else
1028 if [ $SHOW_YT_RD -eq 0 ]; then
1029 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'."
1030 STATUS="OK"
1031 let OK_LINKS+=1
1032 let SKIP_YOUTU_BE+=1
1033 else
1034 STATUS="RD"
1035 let RD_LINKS+=1
1036 fi
1037 fi
1038 else
1039 STATUS="RD"
1040 let RD_LINKS+=1
1041 fi
1042 break
1043 fi
1044 done
1045 fi
1046
1047 # If we didn't get a match with the "RD" codes, check it against the "NG" codes
1048 if [ $STATUS == "??" ]; then
1049 for CODE in "${NG_CODES[@]}"; do
1050 if [[ $CODE == $CURL_CODE ]]; then
1051 STATUS="NG"
1052 let NG_LINKS+=1
1053 break
1054 fi
1055 done
1056 fi
1057
1058 # If we didn't match a known status code, advise the reader
1059 if [ $STATUS == "??" ]; then
1060 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I encountered the unknown response code $CURL_CODE."
1061 let SKIP_UNK_CODE+=1
1062 continue
1063 fi
1064
1065 # Check problem links against exceptions list before proceeding
1066 FOUND_EXCEPT=0
1067 if [ $STATUS != "OK" ] && [ ! -z $EXCEPT_URL ]; then
1068 # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
1069 EXPECT_CODE="$CURL_RESULT"
1070 if [ $STATUS == "EI" ]; then
1071 EXPECT_CODE="EI"
1072 elif [ $STATUS == "IW" ]; then
1073 EXPECT_CODE="IW"
1074 fi
1075
1076 # Look for link in exceptions list and make sure the listed result code and wiki page also match
1077 for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
1078 {
1079 EXCEPT_LINE="${EXCEPT_ARRAY[$i]}"
1080
1081 # Undo any HTML-encoding from the wiki page; for now we just worry about the ampersand, as most
1082 # other HTML-encoded characters are not found in URLs
1083 EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/\&amp;/\&/g')
1084
1085 # Match URL
1086 EXCEPT_URL="${EXCEPT_LINE#*,}"
1087 EXCEPT_URL="${EXCEPT_URL%,*}"
1088 if [ "$EXCEPT_URL" != "$URL" ]; then
1089 continue
1090 fi
1091
1092 # Match containing page's name
1093 EXCEPT_PAGE="${EXCEPT_LINE##*,}"
1094 EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
1095 if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == $LOCAL_PAGE_PATH ]; then
1096 # Match result code
1097 EXCEPT_CODE=${EXCEPT_LINE%%,*}
1098 if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
1099 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because its expected result, '$EXPECT_CODE', is in the exceptions list."
1100 if [ $STATUS == "EI" ]; then
1101 let SKIP_EXPECT_EI+=1
1102 elif [ $STATUS == "IW" ]; then
1103 let SKIP_EXPECT_IW+=1
1104 elif [ $STATUS == "RD" ]; then
1105 let SKIP_EXPECT_RD+=1
1106 else
1107 let SKIP_EXPECT_NG+=1
1108 fi
1109 FOUND_EXCEPT=1
1110 break
1111 fi
1112 fi
1113 } done
1114 fi
1115 if [ $FOUND_EXCEPT -eq 1 ]; then
1116 continue
1117 fi
1118
1119 # If appropriate, record this link to the log, with clickable URLs when possible
1120 if [ $STATUS != "OK" ] || [ $RECORD_OK_LINKS -eq 1 ]; then
1121 # Prepare 'curl' result in parentheses to print after status code, unless this is an "IW" or "EI"
1122 # link, in which case showing the status code doesn't make sense. Adjust spacing after string to
1123 # ensure TXT and RTF reports have aligned columns of results.
1124 CURL_STR_H=" ($CURL_RESULT)"
1125 CURL_STR_T="$CURL_STR_H"
1126 CURL_STR_R="$CURL_STR_H "
1127 if [ $STATUS == "IW" ] || [ $STATUS == "EI" ]; then
1128 CURL_STR_H=""
1129 CURL_STR_T=" "
1130 CURL_STR_R=" "
1131 fi
1132
1133 # Record link and its wiki page in TXT, RTF, and HTML markup
1134 valPrint t "${STATUS}${CURL_STR_T} $STR_TYPE $URL"
1135 valPrint t " linked from $FULL_PAGE_PATH"
1136 valPrint r "${STATUS}${CURL_STR_R}${RTF_TABS}$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
1137 valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
1138 valPrint hn "<tr><td style=\"white-space:nowrap\">${STATUS}${CURL_STR_H}</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
1139 valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"
1140
1141 # Place vertical space here since we won't be printing anything more about this link
1142 if [ $STATUS == "OK" ] && [ $SUGGEST_SNAPSHOTS_OK -eq 0 ]; then valPrint tr ""; valPrint hs ""; fi
1143
1144 # Record redirect URL if one was given by a 3xx response page
1145 if [ $STATUS == "RD" ]; then
1146 valPrint ts " Server suggests $NEW_URL"
1147 valPrint rs " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
1148 valPrint hs "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
1149 fi
1150
1151 # Notify reader if we can use an intrawiki link for this URL
1152 if [ $STATUS == "EI" ]; then
1153 INTRA_PAGE=${URL#*://*/}
1154 valPrint ts " Just use [[$INTRA_PAGE]]"
1155 valPrint rs " Just use [[$INTRA_PAGE]]"
1156 valPrint hs "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
1157 fi
1158
1159 # Notify reader if we can use an interwiki prefix for this URL
1160 if [ $STATUS == "IW" ]; then
1161 INTER_PAGE=$(echo "$URL" | sed 's/.*\///')
1162 valPrint ts " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
1163 valPrint rs " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
1164 valPrint hs "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
1165 fi
1166
1167 # Query the Internet Archive for the latest available snapshot of an "NG" page (or an "OK" page, if requested)
1168 if [[ ( $STATUS == "NG" && $SUGGEST_SNAPSHOTS_NG -eq 1 ) || ( $STATUS == "OK" && $SUGGEST_SNAPSHOTS_OK -eq 1 ) ]]; then
1169
1170 # We need to watch out for the rate limit or we'll get locked out; look at how much time has
1171 # elapsed and then wait the remainder between that and how long of a wait we think is needed
1172 # to avoid the dreaded "Too Many Requests" response. 5 seconds is just a guess.
1173 CUR_TIME=$(date +%s)
1174 WAIT_REMAINDER=$((5 - $CUR_TIME + $START_LINK))
1175 if [ $WAIT_REMAINDER -gt 0 ]; then
1176 valPrint t "Waiting $WAIT_REMAINDER second(s) to conform to Archive.org rate limit."
1177 sleep $WAIT_REMAINDER
1178 fi
1179
1180 # Issue query to the API
1181 ARCHIVE_QUERY=$(curl --silent --max-time $TIMEOUT "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")
1182
1183 # Notify user if we hit the rate limit and just keep going
1184 if [[ "$ARCHIVE_QUERY" == "*Too Many Requests*" ]]; then
1185 valPrint t " IA has rate-limited us!"
1186 valPrint r " IA has rate-limited us!"
1187 valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td>(hit the API rate limit!)</td></tr>"
1188 # If a "closest" snapshot was received, inform user
1189 elif [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
1190 # In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
1191 ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')
1192
1193 # ...isolate "url" property in the response that follows the "closest" tag
1194 SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
1195 SNAPSHOT_URL=${SNAPSHOT_URL#*\"url\": \"} # everything after '"url": "'
1196 SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'
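# (Illustrative fragment of the response this parsing assumes, with a hypothetical snapshot:
# ..."closest": {"available": true, "url": "http://web.archive.org/web/2015/http://example.com", ...
# SNAPSHOT_URL would become that "url" value.)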
1197
1198 # Remove the port 80 part that IA often adds to the URL, as it's superfluous
1199 SNAPSHOT_URL=$(echo $SNAPSHOT_URL | sed 's/:80//')
1200
1201 # Inform the user of the snapshot URL
1202 valPrint ts " IA suggests $SNAPSHOT_URL"
1203 valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
1204 valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
1205 else # Otherwise give a generic Wayback Machine link for this URL, which might work
1206 valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
1207 valPrint rs " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
1208 valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
1209 fi
1210 fi
1211 fi
1212
1213 # If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
1214 if [ $IS_FILE -eq 0 ] && [ $TAKE_PAGE_SHOT -eq 1 ] && [ $STATUS == "OK" ]; then
1215 # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
1216 SHOT_NAME=$(echo "$URL" | sed 's/https*\:\/\///' | sed 'y/:\//__/')
1217 SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"
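# (Illustrative example: a hypothetical URL "https://example.com/faq:page/1" becomes the shot name
# "example.com_faq_page_1", saved as that .png inside the Screenshots folder.)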
1218
1219 # Don't take screenshot if we already encountered this page and screenshotted it
1220 if [ ! -f "$SHOT_FILE" ]; then
1221 "$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 $URL > /dev/null 2>&1
1222 if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
1223 mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
1224 else
1225 valPrint trhs "Screenshot of URL $URL seems to have failed!"
1226 fi
1227 else
1228 valPrint trhs "Skipping screenshot of URL '$URL' because file '$SHOT_FILE' already exists."
1229 fi
1230 fi
1231done
1232FINISHED_LIST="yes"
1233wrapupAndExit