source: Validate External Links/validate_external_links.sh@ 1068

Last change on this file since 1068 was 1067, checked in by iritscen, 7 years ago

Val now understands HTTP redirect responses and will report the URL we're redirected to. Also now tallies IW links.

#!/bin/bash

# Validate External Links by Iritscen
# Provided with a list of external links found in the OniGalore wiki, this script validates them.
# The resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF
# (for reading as a local file with clickable links), and HTML (for uploading as a web page).
# Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
# Recommended rule:
# ------------------------------------------------------------------------------------------------------

# Set separator token to newline
IFS="
"
### GLOBALS ###
# Settings -- these will be changed from their defaults by the arguments passed in to the script
LINKS_URL="" # use 'curl' to download file with links from this location (can be file://)
EXCEPT_URL="" # ditto above for file with exceptions to NG results
OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
RECORD_OK_LINKS=0 # record response code to the log whether it's a value in OK_CODES or NG_CODES
SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
TAKE_PAGE_SHOT=0 # take a screenshot of each OK page
URL_START=1 # start at this URL in LINKS_FILE (1 by default)
URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE
UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report
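# Note: uploadReport() near the end of this script greps the upload-info file for the markers "user:",
# "pw:", "port:" and "path:", so the file is presumably laid out like this (placeholder values only):
#   user:my_login
#   pw:my_password
#   port:22
#   path:public_html/val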

# Fixed strings -- see the occurrences of these variables to learn their purpose
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:54.0) Gecko/20100101 Firefox/54.0"
ARCHIVE_API="http://archive.org/wayback/available"
ARCHIVE_GENERIC="https://web.archive.org/web/*"
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
CHROME="/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary"
CHROME_SCREENSHOT="screenshot.png"
CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt"
EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt"
MY_WIKI_PAGE="http://wiki.oni2.net/User:Iritscen"
THIS_DIR=$(cd $(dirname $0); pwd)
WORKING_DIR=$(pwd)
WIKI_PATH="wiki.oni2.net"

# These are parallel arrays of the IDs and names of OniGalore's current namespaces
declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")

# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
# This determines whether the script tries to take a screenshot of the page or just gets its HTTP code.
declare -a HTTP_FILES=(txt zip wmv jpg png m4a bsl rar oni mp3 mov ONWC vbs TRMA mp4 doc avi log gif pdf dmg exe cpp tga 7z wav east BINA xml dll dae xaf fbx 3ds blend flv csv)
declare -a HTTP_TLDS_AND_PAGES=(com net org uk ru de it htm html php pl asp aspx shtml pgi cgi php3 x jsp phtml cfm css action stm js)
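# As a rough illustration of how these two arrays are used further down (the URLs are made up):
#   http://example.com/downloads/patch.zip  - suffix "zip" is in HTTP_FILES, so this is a file
#   http://example.com/forum/index.php?t=1  - suffix "php" is in HTTP_TLDS_AND_PAGES, so this is a page
#   http://example.com/wiki/Some_Article    - no suffix after the last slash, so this is a page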

# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
# if you add a new code.
declare -a OK_CODES=(200 401 405 406 501)
declare -a RD_CODES=(301 302 303 307 308)
declare -a NG_CODES=(000 403 404 410 500 503)

# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
# transcluded text, and if the transclusion fails, then the braces show up in the URL
ILLEGAL_CHARS="{ }"

# These are parallel arrays giving the prefixes that can be used in place of normal external links to
# some wikis and other sites
declare -a INTERWIKI_PREFIXES=(metawikipedia wikipedia wikiquote wiktionary)
declare -a INTERWIKI_DOMAINS=(meta.wikipedia.org wikipedia.org wikiquote.org wiktionary.org)
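# For example (a hypothetical link, just to show how the main loop uses these arrays): an OK link to
# https://en.wikipedia.org/wiki/Oni_(video_game) matches the domain "wikipedia.org", so the report
# would suggest the interwiki markup [[wikipedia:Oni_(video_game)]], built from the matching prefix
# and the part of the URL after its last slash.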

# Variables for keeping track of main loop progress and findings
LINK_NUM=0
OK_LINKS=0
RD_LINKS=0
IW_LINKS=0
NG_LINKS=0
SKIP_UNK_NS=0
SKIP_JS_PAGE=0
SKIP_BAD_URL=0
SKIP_NON_ASCII=0
SKIP_UNK_SUFFIX=0
SKIP_UNK_CODE=0
SKIP_EXCEPT=0
FILE_LINKS=0
PAGE_LINKS=0
SKIPPED_HEADER_ROW=0
FINISHED_LIST="no"


### HELP ###
# A pseudo-man page. Here is the 80-character rule for the page text:
# 234567890123456789012345678901234567890123456789012345678901234567890123456789
function printHelp()
{
    cat << EOF

NAME
    Validate External Links

SYNOPSIS
    validate_external_links.sh --help
    validate_external_links.sh --links URL --output DIR [--exceptions URL]
        [--record-ok-links] [--suggest-snapshots] [--take-screenshots]
        [--start-url NUM] [--end-url NUM] [--upload FILE]

DESCRIPTION
    This script parses a list of external links found in the OniGalore wiki
    (which is dumped by the Oni2.net domain periodically in a particular
    format), validates them using the Unix tool 'curl', and produces a report
    of which links were OK (responded to an HTTP query) and which were NG (no
    good). This report can then be automatically uploaded to the location of
    your choice. The script can also suggest Internet Archive snapshots for
    NG links, and take screenshots of OK links for visual verification by the
    reader that the page in question is the one intended to be displayed.

    You must pass this script the URL at which the list of links is found
    (--links) and the path where logs should be outputted (--output). All
    other arguments are optional.

OPTIONS
    --help              Show this page
    --links URL         URL from which to download file with external links
                        (note that this can be a local file if you use the
                        file:// protocol) (required)
    --output DIR        Place the folder which will contain the reports and
                        optional screenshots at this path (required)
    --exceptions URL    Don't log an NG link if it is listed in the file
                        downloaded from this URL as long as the response code
                        is the same as the one associated with the link
    --record-ok-links   Log a link in the report whether its response code is
                        in the OK_CODES or the NG_CODES array
    --suggest-snapshots Query the Internet Archive for a possible snapshot
                        URL for each NG page
    --take-screenshots  Save screenshots of each OK page (requires Google
                        Chrome to be found at the path in CHROME)
    --start-url NUM     Start at this link in the links file
    --end-url NUM       Stop at this link in the links file
    --upload FILE       Upload report using info in this local file

BUGS
    The script cannot properly parse any line in the external links file
    which contains a comma in the name of the wiki page containing a link.
    Commas in the link itself are not an issue.
EOF
}
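# Example invocation (the URL and paths are illustrative placeholders):
#   ./validate_external_links.sh --links http://example.com/val/extlinks.csv --output ~/ValReports \
#     --exceptions http://example.com/val/exceptions.txt --suggest-snapshots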


### SETUP ###
# If first argument is a help request, or if nothing was passed in at all, print help page and quit
if [ "$#" -eq 0 ] || [ "$1" == "--help" ]; then
    printHelp | less
    exit 0
fi

# Parse arguments as long as there are more arguments to process
while (( "$#" )); do
    case "$1" in
        --links )             LINKS_URL="$2";      shift 2;;
        --exceptions )        EXCEPT_URL="$2";     shift 2;;
        --output )            OUTPUT_DIR="$2";     shift 2;;
        --record-ok-links )   RECORD_OK_LINKS=1;   shift;;
        --suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;;
        --take-screenshots )  TAKE_PAGE_SHOT=1;    shift;;
        --start-url )         URL_START=$2;        shift 2;;
        --end-url )           URL_LIMIT=$2;        shift 2;;
        --upload )            UPLOAD_INFO=$2;      shift 2;;
        * )                   echo "Invalid argument $1 detected. Aborting."; exit 1;;
    esac
done


# If the required arguments were not supplied, print help page and quit
if [ -z $LINKS_URL ] || [ -z $OUTPUT_DIR ]; then
    printHelp
    echo "Error: I did not receive one or both required arguments."
    exit 2
fi

# Check that UPLOAD_INFO exists, if this argument was supplied
if [ ! -z $UPLOAD_INFO ] && [ ! -f "$UPLOAD_INFO" ]; then
    echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
    exit 3
fi

# Check that OUTPUT_DIR is a directory
if [ ! -d "$OUTPUT_DIR" ]; then
    echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
    exit 4
fi

# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
SHOT_PATH="$OUTPUT_PATH/Screenshots"
LOG_NAME="ValExtLinks report"
LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt"
LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf"
LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm"
mkdir "$OUTPUT_PATH"
if [ $TAKE_PAGE_SHOT -eq 1 ]; then
    mkdir "$SHOT_PATH"
fi

# Check that 'mkdir' succeeded
if [ ! -d "$OUTPUT_PATH" ]; then
    echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_DIR. Aborting."
    exit 5
fi

# Get date on the file at LINKS_URL and print to log
LINKS_DATE=$(curl --silent --head $LINKS_URL | grep "Last-Modified")
if [ -z "$LINKS_DATE" ]; then
    echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
    exit 6
fi
LINKS_DATE=${LINKS_DATE#Last-Modified: }
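# The parameter expansion above just strips the header name, so a header line such as
# "Last-Modified: Thu, 12 Jul 2018 13:18:08 GMT" (an invented example) is reduced to
# "Thu, 12 Jul 2018 13:18:08 GMT", the same format produced for NICE_TIME above.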


### UTILITY FUNCTIONS ###
# Writes a plain-text header to TXT log file
function printTXTheader()
{
    valPrint t "Validate External Links report"
    valPrint t "generated $NICE_TIME"
    valPrint t "from data of $LINKS_DATE"
    valPrint t "script by Iritscen (contact: $MY_WIKI_PAGE)"
    valPrint t ""
}

# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified
function printRTFheader()
{
    valPrint r "{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww12600\viewh12100\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0

\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
generated $NICE_TIME\\
from data of $LINKS_DATE\\
script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$MY_WIKI_PAGE\"}}{\fldrslt contact}})
\\
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
\cf0 "
}

# Closes the RTF markup of the RTF log file
function printRTFfooter()
{
    valPrint r "}"
}

# Writes the HTML header to HTML log file
function printHTMheader()
{
    valPrint h "<html>
<head>
<title>Validate External Links report</title>
</head>
<body>
<h2>Validate External Links report</h2>
<h3>generated $NICE_TIME<br />
from data of $LINKS_DATE<br />
script by Iritscen (<a href=\"$MY_WIKI_PAGE\" target=\"_blank\">contact</a>)</h3>"
}

# Closes the HTML markup of the HTML log file
function printHTMfooter()
{
    valPrint h "</body>
</html>"
}

# The central logging function. The first parameter is a string composed of one or more characters that
# indicates which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
# 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 'w' means "Don't
# pass console output through 'fmt'" ("fmt" fits the output to an 80-column CLI but can break special
# formatting and the 'n' option).
function valPrint()
{
    if [[ "$1" == *c* ]]; then
        if [[ "$1" == *n* ]]; then
            echo -n "$2"
        elif [[ "$1" == *w* ]]; then
            echo "$2"
        else
            echo "$2" | fmt -w 80
        fi
    fi
    if [[ "$1" == *t* ]]; then
        if [[ "$1" == *n* ]]; then
            echo -n "$2" >> "$LOG_TXT"
        else
            echo "$2" >> "$LOG_TXT"
        fi
    fi
    if [[ "$1" == *r* ]]; then
        if [[ "$1" == *n* ]]; then
            echo "$2" >> "$LOG_RTF"
        else
            echo "$2\\" >> "$LOG_RTF"
        fi
    fi
    if [[ "$1" == *h* ]]; then
        if [[ "$1" == *n* ]]; then
            echo "$2" >> "$LOG_HTM"
        else
            echo "$2<br />" >> "$LOG_HTM"
        fi
    fi
}
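# A couple of illustrative calls (these exact messages do not appear elsewhere in the script):
#   valPrint ctrh "Some message"   # write to the console and to all three logs
#   valPrint cn "Working..."       # write to the console only, without a trailing newline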

# Pluralize the string in parameter 1 if the number in parameter 2 is not 1
function pluralCheckNoun()
{
    if [ $2 -ne 1 ]; then
        if [[ $1 =~ x$ ]]; then
            echo $1es
        else
            echo $1s
        fi
    else
        echo $1
    fi
}

# Output "is" if parameter 1 is 1, otherwise "are"
function pluralCheckIs()
{
    if [ $1 -ne 1 ]; then
        echo "are"
    else
        echo "is"
    fi
}

# Output "was" if parameter 1 is 1, otherwise "were"
function pluralCheckWas()
{
    if [ $1 -ne 1 ]; then
        echo "were"
    else
        echo "was"
    fi
}

# Output "a " if parameter 1 is 1, otherwise nothing
function pluralCheckA()
{
    if [ $1 -eq 1 ]; then
        echo "a "
    fi
}

# Output "an " if parameter 1 is 1, otherwise nothing
function pluralCheckAn()
{
    if [ $1 -eq 1 ]; then
        echo "an "
    fi
}

# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
# reports being saved to disk have already been closed.
function uploadReport()
{
    valPrint c "Uploading HTML report..."

    SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
    SFTP_USER_NAME_MARKER="user:"
    SFTP_PASSWORD_MARKER="pw:"
    SFTP_PORT_MARKER="port:"
    SFTP_PATH_MARKER="path:"
    SFTP_USER_NAME=$(grep $SFTP_USER_NAME_MARKER $UPLOAD_INFO)
    SFTP_USER_NAME=${SFTP_USER_NAME#$SFTP_USER_NAME_MARKER}
    SFTP_PASSWORD=$(grep $SFTP_PASSWORD_MARKER $UPLOAD_INFO)
    SFTP_PASSWORD=${SFTP_PASSWORD#$SFTP_PASSWORD_MARKER}
    SFTP_PORT=$(grep $SFTP_PORT_MARKER $UPLOAD_INFO)
    SFTP_PORT=${SFTP_PORT#$SFTP_PORT_MARKER}
    SFTP_PATH=$(grep $SFTP_PATH_MARKER $UPLOAD_INFO)
    SFTP_PATH=${SFTP_PATH#$SFTP_PATH_MARKER}

    expect "$SCRIPT_PATH" "$LOG_HTM" $SFTP_USER_NAME $SFTP_PASSWORD $SFTP_PORT $SFTP_PATH "$LOG_NAME.htm"

    valPrint c "Report was uploaded, unless an error message appears above."
}

# Prints session summary when script is done
function wrapupAndExit()
{
    # Get off progress line on console, drop down a line from last link in log, and close HTML table
    valPrint ctr ""
    valPrint h "</table><br />"

    # If we didn't finish processing the last URL, then the iterator is one too high
    if [ $FINISHED_LIST != "yes" ]; then
        let LINK_NUM-=1
        if [ $FINISHED_LIST == "no" ]; then
            valPrint ctrh "The session was canceled by the user."
        fi
    fi

    # Output results of session and close the log file's markup
    LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
    LINKS_SKIPPED=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
    LINKS_CHECKED=$((LINKS_PROCESSED-LINKS_SKIPPED))
    valPrint ct "Summary:"
    valPrint r "\b1 Summary \b0"
    valPrint hn "<h3><span id=\"summary\">Summary</span></h3>"
    valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT)."
    valPrint ctrh "I skipped $LINKS_SKIPPED $(pluralCheckNoun link $LINKS_SKIPPED), and found $FILE_LINKS $(pluralCheckNoun file $FILE_LINKS) and $PAGE_LINKS $(pluralCheckNoun page $PAGE_LINKS)."
    if [ $LINKS_SKIPPED -gt 0 ]; then valPrint ctrh "Skip breakdown: "; fi
    if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
    if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE links on JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
    if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
    if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
    if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
    if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
    valPrint ctrh "Out of the $LINKS_CHECKED links checked, $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG."
    if [ $IW_LINKS -gt 0 ]; then
        valPrint ctrh "$IW_LINKS/$OK_LINKS OK $(pluralCheckNoun link $OK_LINKS) $(pluralCheckIs $IW_LINKS) $(pluralCheckAn $IW_LINKS)external $(pluralCheckNoun link $IW_LINKS) that could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS)."
    fi
    if [ $SKIP_EXCEPT -gt 0 ]; then
        valPrint ctrh "$SKIP_EXCEPT/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS) went unlisted due to being found in the exceptions file."
    fi
    printRTFfooter
    printHTMfooter

    # Upload report if this was requested
    if [ ! -z $UPLOAD_INFO ] && [ $FINISHED_LIST != "no" ]; then
        uploadReport
    fi

    # Really quit now
    valPrint c "ValExtLinks says goodbye."
    exit 0
}
trap wrapupAndExit INT


### INITIALIZATION ###
# Print opening message to console and log files
valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
printTXTheader
printRTFheader
printHTMheader

# Attempt to download file at LINKS_URL, then check that it succeeded
valPrint ctrh "Downloading list of external links from $LINKS_URL."
LINKS_FILE_NAME=$(echo "$LINKS_URL" | sed 's/.*\///')
LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
curl --silent -o "$LINKS_FILE" $LINKS_URL
if [ ! -f "$LINKS_FILE" ]; then
    echo "The download of $LINKS_URL appears to have failed. Aborting."
    wrapupAndExit
fi

# Attempt to download file at EXCEPT_URL, then check that it succeeded
if [ ! -z $EXCEPT_URL ]; then
    valPrint ctrh "Downloading list of NG exceptions from $EXCEPT_URL."
    EXCEPT_FILE_NAME=$(echo "$EXCEPT_URL" | sed 's/.*\///')
    EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
    curl --silent -o "$EXCEPT_FILE" $EXCEPT_URL
    if [ ! -f "$EXCEPT_FILE" ]; then
        echo "The download of $EXCEPT_URL appears to have failed. Aborting."
        wrapupAndExit
    fi
fi
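# Judging from how EXCEPT_FILE is used in the main loop below (the URL is grepped for, and the field
# before the first comma on the matching line is compared to the link's status code), each line of the
# exceptions file is expected to pair a response code with a URL, e.g. (made-up entries):
#   404,http://example.com/some_removed_page.html
#   000-28,http://example.com/some_unresponsive_page.html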

# Pipe 'cat' to 'wc' because passing LINKS_FILE to 'wc' would print the file name after the line count
LINK_COUNT_STRING=$(cat "$LINKS_FILE" | wc -l)

# Number of URLs is number of lines minus one (first line is column header row for the CSV)
LINK_COUNT=$(echo "${LINK_COUNT_STRING}" | tr -d '[:space:]')
let LINK_COUNT-=1

# Calculate number of URLs to consider
if [ $URL_LIMIT -ne 0 ] && [ $URL_LIMIT -lt $LINK_COUNT ]; then
    valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((URL_LIMIT-URL_START+1)) of them, from link $URL_START to $URL_LIMIT."
elif [ $URL_START -ne 1 ]; then
    valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((LINK_COUNT-URL_START+1)) of them, from link $URL_START to $LINK_COUNT."
else
    valPrint ctrh "Found $LINK_COUNT links to process. I will be considering all of them."
fi

# Print settings to console and log. Note that each unquoted word in SETTINGS_MSG below is its own
# array element, so the indices used afterward (10, 22, 26, 40) pick out the elements that get
# swapped or blanked depending on which options are in effect.
declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not print NG links that are listed in the exceptions file.")
if [ $TAKE_PAGE_SHOT -eq 0 ]; then SETTINGS_MSG[10]="but will not"; fi
if [ $RECORD_OK_LINKS -eq 0 ]; then SETTINGS_MSG[22]="not"; fi
if [ $SUGGEST_SNAPSHOTS -eq 0 ]; then SETTINGS_MSG[26]="will not"; fi
if [ -z $EXCEPT_URL ] || [ $RECORD_OK_LINKS -eq 1 ]; then SETTINGS_MSG[40]=""; fi
SETTINGS_STR=${SETTINGS_MSG[@]}
valPrint ctrh "$SETTINGS_STR"
valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
valPrint trh ""

# Print legend to logs
valPrint t "Legend:"
valPrint r "\b1 Legend \b0"
valPrint hn "<h3>Legend</h3>"
valPrint trh "OK = URL seems to be working."
valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it, because false negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to Iritscen. An NG link should be followed by a link to the Internet Archive's Wayback Machine which may help you repair the link. If the link cannot be repaired, you can disable it on the wiki (which prevents it from showing up in future ValExtLinks reports) by wrapping it in nowiki tags."
valPrint trh "RD = The server responding to this URL is saying that the page moved and you should instead use the supplied new URL. Some RD links represent minor adjustments in the organization of a web site, and some are soft 404s (the file/page has been removed and you are being redirected to something like the main page of the web site). You will have to look at the new URL yourself to determine if it represents an OK link and the link on the wiki should be updated to this one, or if the desired file/page is actually gone and we need to replace the wiki link with an Internet Archive snapshot link -- or disable the URL if it has not been archived."
valPrint trh "IW = URL is working but should be converted to an interwiki link using the suggested markup."
valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $HTTP_CODES)."
valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$HTTP_CODES\"}}{\fldrslt here}} for code reference)."
valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$HTTP_CODES\" target=\"_blank\">here</a> for code reference)."
valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $CURL_CODES)."
valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$CURL_CODES\"}}{\fldrslt here}} for code reference)."
valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$CURL_CODES\" target=\"_blank\">here</a> for code reference)."
valPrint trh "IA suggests = Last available snapshot suggested by the Internet Archive."
valPrint trh "Try browsing = The Archive occasionally fails to return a snapshot URL even when one exists, so you will need to check for a snapshot manually using the Wayback Machine before concluding that a site has not been archived."
valPrint trh ""


### MAIN LOOP ###
# Process each line of the .csv in LINKS_FILE
for LINE in `cat "$LINKS_FILE"`; do
    let LINK_NUM+=1

    # First line is the column header row for the CSV, so let's verify that the format hasn't changed
    if [ $SKIPPED_HEADER_ROW -eq 0 ]; then
        if [ $LINE == "namespace,title,target" ]; then
            SKIPPED_HEADER_ROW=1
            LINK_NUM=0 # this line is not a link, so reset the link counter
            valPrint hn "<table>"
            continue
        else
            valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
            wrapupAndExit
        fi
    fi

    # Skip this link if we are not at URL_START yet
    if [ $LINK_NUM -lt $URL_START ]; then
        continue
    fi

    # Stop if we are at the limit declared for testing purposes
    if [ $URL_LIMIT -gt 0 ] && [ $LINK_NUM -gt $URL_LIMIT ]; then
        FINISHED_LIST="limit"
        wrapupAndExit
    fi

    # Print progress to screen
    if [ $LINK_NUM -gt 1 ]; then
        printf "\e[1A\n" # erase previous progress message so that new one appears in its place
    fi
    valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."

    # The number of the namespace is the element before the first comma on the line
    NS_ID=${LINE%%,*}

    # Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
    NS_NAME=""
    a=0
    while [ "x${NS_IDS[$a]}" != "x" ] # once this evaluates to "x", the array is done
    do
        if [ $NS_ID -eq ${NS_IDS[$a]} ]; then
            NS_NAME="${NS_NAMES[$a]}"
            break
        fi
        let a+=1
    done
    if [ -z "$NS_NAME" ]; then
        valPrint tr "Skipping URL in row $LINK_NUM of the links file because I could not find a name for namespace ID $NS_ID."
        let SKIP_UNK_NS+=1
        continue
    fi

    # The name of the page is everything between the namespace ID and the next comma on the line (commas
    # in page names will break this)
    PAGE_NAME=${LINE#$NS_ID,}
    PAGE_NAME=${PAGE_NAME%%,*}

    # We don't want to consider wiki pages ending in .js, as the parser cannot reliably isolate URLs in
    # JavaScript code, so it will return erroneous links
    PAGE_NAME_SUFFIX=$(echo $PAGE_NAME | sed 's/.*\.//')
    if [ $PAGE_NAME_SUFFIX == "js" ]; then
        valPrint tr "Skipping URL found on JavaScript page $PAGE_NAME."
        let SKIP_JS_PAGE+=1
        continue
    fi

    # The URL being linked to is everything after the previous two fields (this allows commas to be in
    # the URLs, but a comma in the previous field, the page name, will break this)
    URL=${LINE#$NS_ID,$PAGE_NAME,}

    # Scan for illegal characters
    if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
        valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because it contains characters illegal in a URL."
        let SKIP_BAD_URL+=1
        continue
    fi

    # Now we need to know if the URL is for a file or a web page. First step is to determine if the
    # URL ends in a suffix
    HAS_SUFFIX=0

    # If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
    SAN_URL=${URL%%\?*}

    # If the URL ends in something like "#section_15", strip everything from the '#' onward
    SAN_URL=${SAN_URL%%\#*}

    # 'sed' cannot handle Unicode in my Bash shell, so skip this URL and make user check it
    if [[ $SAN_URL == *[![:ascii:]]* ]]; then
        valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I cannot handle non-ASCII characters."
        let SKIP_NON_ASCII+=1
        continue
    fi

    # Isolate the characters after the last period and after the last slash
    POST_DOT=$(echo "$SAN_URL" | sed 's/.*\.//')
    POST_SLASH=$(echo "$SAN_URL" | sed 's/.*\///')

    # If the last period comes after the last slash, then the URL ends in a suffix
    POST_DOT_LENGTH=$(echo | awk -v input=$POST_DOT '{print length(input)}')
    POST_SLASH_LENGTH=$(echo | awk -v input=$POST_SLASH '{print length(input)}')
    if [ $POST_DOT_LENGTH -lt $POST_SLASH_LENGTH ]; then
        HAS_SUFFIX=1
    else
        HAS_SUFFIX=0
    fi
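    # To illustrate the check above with made-up URLs: for "http://example.com/files/manual.pdf",
    # POST_DOT is "pdf" and POST_SLASH is "manual.pdf", so the URL has a suffix; for
    # "http://example.com/wiki/Main_Page", POST_DOT is "com/wiki/Main_Page" and POST_SLASH is
    # "Main_Page", so it does not.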

    # Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
    # known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES
    IS_FILE=-1
    if [ $HAS_SUFFIX -eq 0 ]; then
        IS_FILE=0
    else
        # Turn off case sensitivity while we compare suffixes
        shopt -s nocasematch

        # Special case: URLs ending in something like "/productID.304297400" are pages, not files, so if
        # the URL's suffix is all numbers, we are looking at the end of a web page URL
        if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
            IS_FILE=0
        fi

        # If we did not identify this URL as a web page above, we need to compare the suffix against known
        # file extensions
        if [ $IS_FILE -eq -1 ]; then
            for EXTENSION in "${HTTP_FILES[@]}"; do
                if [[ $EXTENSION == $POST_DOT ]]; then # double brackets respect the nocasematch setting
                    IS_FILE=1
                    break
                fi
            done
        fi

        # If we did not identify this URL as a file above, we need to compare the suffix against known
        # pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
        # needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
        if [ $IS_FILE -eq -1 ]; then
            for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
                if [[ $PAGE_OR_TLD == $POST_DOT ]]; then
                    IS_FILE=0
                    break
                fi
            done
        fi

        # Turn case sensitivity back on in Bash
        shopt -u nocasematch
    fi

    # If this suffix escaped identification as either a file, page or TLD, inform the user
    STR_TYPE=""
    if [ $IS_FILE -eq -1 ]; then
        valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown URL ending $POST_DOT. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
        let SKIP_UNK_SUFFIX+=1
        continue
    elif [ $IS_FILE -eq 1 ]; then
        STR_TYPE="file"
        let FILE_LINKS+=1
    elif [ $IS_FILE -eq 0 ]; then
        STR_TYPE="page"
        let PAGE_LINKS+=1
    fi

    # Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
    # issue with sites that require HTTPS
    CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{http_code}\n' $URL)
    CURL_ERR=$(echo $?)
    CURL_RESULT=$CURL_CODE

    # Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
    if [ $CURL_CODE == "000" ]; then
        CURL_RESULT="$CURL_RESULT-$CURL_ERR"
    fi
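    # For instance (a hypothetical case): if a server never answers, curl prints "000" for
    # %{http_code} and exits with a non-zero status such as 28 ("operation timed out" in curl's exit
    # code list), so the value logged for the link becomes "000-28".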

    # Determine if this code is in our "OK" list
    STATUS="??"
    NEW_URL=""
    INTERWIKI_INDEX=-1
    for CODE in "${OK_CODES[@]}"; do
        if [[ $CODE == $CURL_CODE ]]; then
            let OK_LINKS+=1

            # Determine if this is a link to a domain that we have an interwiki prefix for
            for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
                if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]]; then
                    STATUS="IW"
                    let IW_LINKS+=1
                    INTERWIKI_INDEX=$i
                    break
                fi
            done

            # If this link is OK and no interwiki advisory is needed, just mark as "OK"
            if [ $INTERWIKI_INDEX == -1 ]; then
                STATUS="OK"
            fi
            break
        fi
    done

    # If we didn't get a match with the "OK" codes, check it against the "RD" codes
    if [ $STATUS == "??" ]; then
        for CODE in "${RD_CODES[@]}"; do
            if [[ $CODE == $CURL_CODE ]]; then
                STATUS="RD"
                let RD_LINKS+=1

                # Get URL header again in order to retrieve the URL we are being redirected to
                NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{redirect_url}\n' $URL)

                break
            fi
        done
    fi

    # If we didn't get a match with the "RD" codes, check it against the "NG" codes
    if [ $STATUS == "??" ]; then
        for CODE in "${NG_CODES[@]}"; do
            if [[ $CODE == $CURL_CODE ]]; then
                STATUS="NG"
                let NG_LINKS+=1
                break
            fi
        done
    fi

    # If we didn't match a known status code, advise the reader
    if [ $STATUS == "??" ]; then
        valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown return code $CURL_CODE."
        let SKIP_UNK_CODE+=1
        continue
    fi

    # If link is "NG" and there is an exceptions file, compare URL against the list before logging it
    if [ $STATUS == "NG" ] && [ ! -z $EXCEPT_URL ]; then
        GREP_RESULT=$(grep --max-count=1 "$URL" "$EXCEPT_FILE")
        EXCEPT_CODE=${GREP_RESULT%%,*}
        if [ "$EXCEPT_CODE" == $CURL_RESULT ]; then
            valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because its status code, $CURL_RESULT, is listed in the exceptions file."
            let SKIP_EXCEPT+=1
            continue
        fi
    fi

    # If appropriate, record this link to the log, with clickable URLs when possible
    if [ $STATUS != "OK" ] || [ $RECORD_OK_LINKS -eq 1 ]; then
        FULL_PAGE_PATH=http://$WIKI_PATH/$NS_NAME:$PAGE_NAME
        LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
        # Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it explicitly breaks the link
        if [ $NS_ID -eq 0 ]; then
            FULL_PAGE_PATH=http://$WIKI_PATH/$PAGE_NAME
            LOCAL_PAGE_PATH=$PAGE_NAME
        fi

        # Stupid hack since the text "IW" is narrower than "OK", "RD", or "NG" and it takes an extra tab
        # to get to the desired level of indentation in the RTF log
        RTF_TABS=" "
        if [ $STATUS == "IW" ]; then
            RTF_TABS=" "
        fi

        # Record link and its wiki page in TXT, RTF, and HTML markup
        valPrint t "$STATUS ($CURL_RESULT) $STR_TYPE $URL"
        valPrint t " linked from $FULL_PAGE_PATH"
        valPrint r "$STATUS ($CURL_RESULT)$RTF_TABS$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
        valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
        valPrint hn "<tr><td style=\"white-space:nowrap\">$STATUS ($CURL_RESULT)</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
        valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"

        # Record redirect URL if one was given by a 3xx response page
        if [ $STATUS == "RD" ]; then
            valPrint t " Server suggests $NEW_URL"
            valPrint r " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
            valPrint hn "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
        fi

        # Notify reader if we can use an interwiki prefix for this URL
        if [ $STATUS == "IW" ]; then
            valPrint t " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_SLASH]]"
            valPrint r " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_SLASH]]"
            valPrint hn "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_SLASH]]</td></tr>"
        fi

        # Query Internet Archive for latest "OK" snapshot for "NG" page
        if [ $STATUS == "NG" ] && [ $SUGGEST_SNAPSHOTS -eq 1 ]; then
            ARCHIVE_QUERY=$(curl --silent --max-time 10 "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")

            # Isolate "url" property in response and log it if a "closest" snapshot was received...
            if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
                SNAPSHOT_URL=${ARCHIVE_QUERY##*\"url\": \"}
                SNAPSHOT_URL=${SNAPSHOT_URL%\", \"timestamp*}
                valPrint t " IA suggests $SNAPSHOT_URL"
                valPrint r " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
                valPrint hn "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
            else # ...otherwise give generic Wayback Machine link for this URL
                valPrint t " Try browsing $ARCHIVE_GENERIC/$URL"
                valPrint r " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
                valPrint hn "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
            fi
        fi
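        # For reference, the two parameter expansions above assume the Wayback Machine availability API
        # responds with JSON along these lines (an abbreviated, made-up example):
        #   {"archived_snapshots": {"closest": {"available": true, "url":
        #   "http://web.archive.org/web/20180101000000/http://example.com/", "timestamp": "20180101000000"}}}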
    fi

    # If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
    if [ $IS_FILE -eq 0 ] && [ $TAKE_PAGE_SHOT -eq 1 ] && [ $STATUS == "OK" ]; then
        # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
        SHOT_NAME=$(echo "$URL" | sed 's/https*\:\/\///' | sed 'y/:\//__/')
        SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"

        # Don't take screenshot if we already encountered this page and screenshotted it
        if [ ! -f "$SHOT_FILE" ]; then
            "$CHROME" --headless --disable-gpu --screenshot --window-size=1500,900 $URL > /dev/null 2>&1
            if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
                mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
            else
                valPrint trh "Screenshot of URL $URL seems to have failed!"
            fi
        else
            valPrint trh "Skipping screenshot of URL $URL because $SHOT_FILE already exists."
        fi
    fi
done
FINISHED_LIST="yes"
wrapupAndExit