source: Validate External Links/validate_external_links.sh@ 1069

Last change on this file since 1069 was 1069, checked in by iritscen, 7 years ago

ValExtLinks: IW links now reported as separate category from OK links. RD links that are just redirecting from http:// to https:// are now regarded as OK.

File size: 36.5 KB
#!/bin/bash

# Validate External Links by Iritscen
# Provided with a list of external links found in the OniGalore wiki, this script validates them.
# The resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF
# (for reading as a local file with clickable links), and HTML (for uploading as a web page).
# Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
# Recommended rule:
# ------------------------------------------------------------------------------------------------------

# Set separator token to newline
IFS="
"

### GLOBALS ###
# Settings -- these will be changed from their defaults by the arguments passed in to the script
LINKS_URL="" # use 'curl' to download file with links from this location (can be file://)
EXCEPT_URL="" # ditto above for file with exceptions to NG results
OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
RECORD_OK_LINKS=0 # record a link in the log even if its response code is in OK_CODES (by default only non-OK results are logged)
SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
TAKE_PAGE_SHOT=0 # take a screenshot of each OK page
URL_START=1 # start at this URL in LINKS_FILE (1 by default)
URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE
UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report

# Fixed strings -- see the occurrences of these variables to learn their purpose
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:54.0) Gecko/20100101 Firefox/54.0"
ARCHIVE_API="http://archive.org/wayback/available"
ARCHIVE_GENERIC="https://web.archive.org/web/*"
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
CHROME="/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary"
CHROME_SCREENSHOT="screenshot.png"
CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt"
EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt"
MY_WIKI_PAGE="http://wiki.oni2.net/User:Iritscen"
THIS_DIR=$(cd $(dirname $0); pwd)
WORKING_DIR=$(pwd)
WIKI_PATH="wiki.oni2.net"

# These are parallel arrays of the IDs and names of OniGalore's current namespaces
declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")
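# For example (reading the parallel arrays above), namespace ID 0 maps to "Main",
# 1 to "Talk", 102 to "OBD" and 111 to "XML_talk"; the lookup loop in the main loop
# below walks NS_IDS until it finds the ID and takes the NS_NAMES entry at the same index.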

# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
# This determines whether the script tries to take a screenshot of the page or just gets its HTTP code.
declare -a HTTP_FILES=(txt zip wmv jpg png m4a bsl rar oni mp3 mov ONWC vbs TRMA mp4 doc avi log gif pdf dmg exe cpp tga 7z wav east BINA xml dll dae xaf fbx 3ds blend flv csv)
declare -a HTTP_TLDS_AND_PAGES=(com net org uk ru de it htm html php pl asp aspx shtml pgi cgi php3 x jsp phtml cfm css action stm js)

# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
# if you add a new code.
declare -a OK_CODES=(200 401 405 406 501)
declare -a RD_CODES=(301 302 303 307 308)
declare -a NG_CODES=(000 403 404 410 500 503)

# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
# transcluded text, and if the transclusion fails, then the braces show up in the URL
ILLEGAL_CHARS="{ }"

# These are parallel arrays giving the prefixes that can be used in place of normal external links to
# some wikis and other sites
declare -a INTERWIKI_PREFIXES=(metawikipedia wikipedia wikiquote wiktionary)
declare -a INTERWIKI_DOMAINS=(meta.wikipedia.org wikipedia.org wikiquote.org wiktionary.org)
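# For example, a (hypothetical) link to https://en.wikipedia.org/wiki/Oni would match the
# "wikipedia.org" entry above, and the report would suggest the interwiki markup
# [[wikipedia:Oni]] -- the prefix at the same index plus the part of the URL after the
# last slash (see the "IW" handling in the main loop).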

# Variables for keeping track of main loop progress and findings
LINK_NUM=0
OK_LINKS=0
RD_LINKS=0
IW_LINKS=0
NG_LINKS=0
SKIP_UNK_NS=0
SKIP_JS_PAGE=0
SKIP_BAD_URL=0
SKIP_NON_ASCII=0
SKIP_UNK_SUFFIX=0
SKIP_UNK_CODE=0
SKIP_EXCEPT=0
FILE_LINKS=0
PAGE_LINKS=0
SKIPPED_HEADER_ROW=0
FINISHED_LIST="no"


### HELP ###
# A pseudo-man page. Here is the 80-character rule for the page text:
# 234567890123456789012345678901234567890123456789012345678901234567890123456789
function printHelp()
{
  cat << EOF

NAME
   Validate External Links

SYNOPSIS
   validate_external_links.sh --help
   validate_external_links.sh --links URL --output PATH [--exceptions FILE]
      [--record-ok-links] [--suggest-snapshots] [--take-screenshots]
      [--start-url NUM] [--end-url NUM] [--upload PATH]

DESCRIPTION
   This script parses a list of external links found in the OniGalore wiki
   (which is dumped by the Oni2.net domain periodically in a particular
   format), validates them using the Unix tool 'curl', and produces a report
   of which links were OK (responded positively to an HTTP query), which
   were RD (responded with a 3xx redirect code), which could be IW (inter-
   wiki) links, and which were NG (no good; a negative response to the
   query). This report can then be automatically uploaded to the location of
   your choice. The script can also suggest Internet Archive snapshots for
   NG links, and take screenshots of OK links for visual verification by the
   reader that the page in question is the one intended to be displayed.

   You must pass this script the URL at which the list of links is found
   (--links) and the path where logs should be outputted (--output). All
   other arguments are optional.

OPTIONS
   --help                Show this page
   --links URL           URL from which to download file with external links
                         (note that this can be a local file if you use the
                         file:// protocol) (required)
   --output DIR          Place the folder which will contain the reports and
                         optional screenshots at this path (required)
   --exceptions URL      In order to remove links from the list which show as
                         NG but which you regard as OK, prepare a plain-text
                         file where each line contains a response code being
                         returned and the URL returning it, separated by a
                         comma, e.g. "403,http://www.example.com" (note that
                         this can be a local file if you use the
                         file:// protocol)
   --record-ok-links     Log a link in the report even if its response code is
                         OK
   --suggest-snapshots   Query the Internet Archive for a possible snapshot
                         URL for each NG page
   --take-screenshots    Save screenshots of each OK page (requires Google
                         Chrome to be found at the path in CHROME)
   --start-url NUM       Start at this link in the links file
   --end-url NUM         Stop at this link in the links file
   --upload FILE         Upload report using info in this local file

BUGS
   The script cannot properly parse any line in the external links file
   which contains a comma in the name of the wiki page containing a link.
   Commas in the link itself are not an issue.
EOF
}
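
# Example invocation (the URL and output path below are illustrative, not real):
#   ./validate_external_links.sh --links http://example.com/wiki_extlinks.csv \
#     --output ~/ValExtLinks-reports --suggest-snapshots --record-ok-links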


### SETUP ###
# If first argument is a help request, or if nothing was passed in at all, print help page and quit
if [ "$#" -eq 0 ] || [ "$1" == "--help" ]; then
  printHelp | less
  exit 0
fi

# Parse arguments as long as there are more arguments to process
while (( "$#" )); do
  case "$1" in
    --links ) LINKS_URL="$2"; shift 2;;
    --exceptions ) EXCEPT_URL="$2"; shift 2;;
    --output ) OUTPUT_DIR="$2"; shift 2;;
    --record-ok-links ) RECORD_OK_LINKS=1; shift;;
    --suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;;
    --take-screenshots ) TAKE_PAGE_SHOT=1; shift;;
    --start-url ) URL_START=$2; shift 2;;
    --end-url ) URL_LIMIT=$2; shift 2;;
    --upload ) UPLOAD_INFO=$2; shift 2;;
    * ) echo "Invalid argument $1 detected. Aborting."; exit 1;;
  esac
done

# If the required arguments were not supplied, print help page and quit
if [ -z $LINKS_URL ] || [ -z $OUTPUT_DIR ]; then
  printHelp
  echo "Error: I did not receive one or both required arguments."
  exit 2
fi

# Check that UPLOAD_INFO exists, if this argument was supplied
if [ ! -z $UPLOAD_INFO ] && [ ! -f "$UPLOAD_INFO" ]; then
  echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
  exit 3
fi

# Check that OUTPUT_DIR is a directory
if [ ! -d "$OUTPUT_DIR" ]; then
  echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
  exit 4
fi

# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
SHOT_PATH="$OUTPUT_PATH/Screenshots"
LOG_NAME="ValExtLinks report"
LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt"
LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf"
LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm"
mkdir "$OUTPUT_PATH"
if [ $TAKE_PAGE_SHOT -eq 1 ]; then
  mkdir "$SHOT_PATH"
fi

# Check that 'mkdir' succeeded
if [ ! -d "$OUTPUT_PATH" ]; then
209 echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_PATH. Aborting."
  exit 5
fi

# Get date on the file at LINKS_URL and print to log
LINKS_DATE=$(curl --silent --head $LINKS_URL | grep "Last-Modified")
if [ -z "$LINKS_DATE" ]; then
  echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
  exit 6
fi
LINKS_DATE=${LINKS_DATE#Last-Modified: }


### UTILITY FUNCTIONS ###
# Writes a plain-text header to TXT log file
function printTXTheader()
{
  valPrint t "Validate External Links report"
  valPrint t "generated $NICE_TIME"
  valPrint t "from data of $LINKS_DATE"
  valPrint t "script by Iritscen (contact: $MY_WIKI_PAGE)"
  valPrint t ""
}

# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified
function printRTFheader()
{
  valPrint r "{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww12600\viewh12100\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0

\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
generated $NICE_TIME\\
from data of $LINKS_DATE\\
script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$MY_WIKI_PAGE\"}}{\fldrslt contact}})
\\
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
\cf0 "
}

# Closes the RTF markup of the RTF log file
function printRTFfooter()
{
  valPrint r "}"
}

# Writes the HTML header to HTML log file
function printHTMheader()
{
  valPrint h "<html>
<head>
<title>Validate External Links report</title>
</head>
<body>
<h2>Validate External Links report</h2>
<h3>generated $NICE_TIME<br />
from data of $LINKS_DATE<br />
script by Iritscen (<a href=\"$MY_WIKI_PAGE\" target=\"_blank\">contact</a>)</h3>"
}

# Closes the HTML markup of the HTML log file
function printHTMfooter()
{
  valPrint h "</body>
</html>"
}

# The central logging function. The first parameter is a string composed of one or more characters that
# indicates which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
# 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 'w' means "Don't
# pass console output through 'fmt'" ("fmt" fits the output to an 80-column CLI but can break special
# formatting and the 'n' option).
function valPrint()
{
  if [[ "$1" == *c* ]]; then
    if [[ "$1" == *n* ]]; then
      echo -n "$2"
    elif [[ "$1" == *w* ]]; then
      echo "$2"
    else
      echo "$2" | fmt -w 80
    fi
  fi
  if [[ "$1" == *t* ]]; then
    if [[ "$1" == *n* ]]; then
      echo -n "$2" >> "$LOG_TXT"
    else
      echo "$2" >> "$LOG_TXT"
    fi
  fi
  if [[ "$1" == *r* ]]; then
    if [[ "$1" == *n* ]]; then
      echo "$2" >> "$LOG_RTF"
    else
      echo "$2\\" >> "$LOG_RTF"
    fi
  fi
  if [[ "$1" == *h* ]]; then
    if [[ "$1" == *n* ]]; then
      echo "$2" >> "$LOG_HTM"
    else
      echo "$2<br />" >> "$LOG_HTM"
    fi
  fi
}
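# A typical call looks like 'valPrint ctrh "Some message"', which writes the message to
# the console and to all three logs; 'valPrint hn "<table>"' writes only to the HTML log
# and suppresses the trailing "<br />" that the HTML output normally gets.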

# Pluralize the string in parameter 1 if the number in parameter 2 is not 1
function pluralCheckNoun()
{
  if [ $2 -ne 1 ]; then
    if [[ $1 =~ x$ ]]; then
      echo $1es
    else
      echo $1s
    fi
  else
    echo $1
  fi
}

# Output "is" if parameter 1 is 1, otherwise "are"
function pluralCheckIs()
{
  if [ $1 -ne 1 ]; then
    echo "are"
  else
    echo "is"
  fi
}

# Output "was" if parameter 1 is 1, otherwise "were"
function pluralCheckWas()
{
  if [ $1 -ne 1 ]; then
    echo "were"
  else
    echo "was"
  fi
}

# Output "a " if parameter 1 is 1, otherwise nothing
function pluralCheckA()
{
  if [ $1 -eq 1 ]; then
    echo "a "
  fi
}

# Output "an " if parameter 1 is 1, otherwise nothing
function pluralCheckAn()
{
  if [ $1 -eq 1 ]; then
    echo "an "
  fi
}

# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
# reports being saved to disk have already been closed.
function uploadReport()
{
  valPrint c "Uploading HTML report..."

  SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
  SFTP_USER_NAME_MARKER="user:"
  SFTP_PASSWORD_MARKER="pw:"
  SFTP_PORT_MARKER="port:"
  SFTP_PATH_MARKER="path:"
  SFTP_USER_NAME=$(grep $SFTP_USER_NAME_MARKER $UPLOAD_INFO)
  SFTP_USER_NAME=${SFTP_USER_NAME#$SFTP_USER_NAME_MARKER}
  SFTP_PASSWORD=$(grep $SFTP_PASSWORD_MARKER $UPLOAD_INFO)
  SFTP_PASSWORD=${SFTP_PASSWORD#$SFTP_PASSWORD_MARKER}
  SFTP_PORT=$(grep $SFTP_PORT_MARKER $UPLOAD_INFO)
  SFTP_PORT=${SFTP_PORT#$SFTP_PORT_MARKER}
  SFTP_PATH=$(grep $SFTP_PATH_MARKER $UPLOAD_INFO)
  SFTP_PATH=${SFTP_PATH#$SFTP_PATH_MARKER}
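
  # Based on the markers above, the --upload file is expected to hold one value per
  # line, e.g. (every value below is a placeholder, not a real account):
  #   user:myusername
  #   pw:mypassword
  #   port:22
  #   path:public_html/val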

  expect "$SCRIPT_PATH" "$LOG_HTM" $SFTP_USER_NAME $SFTP_PASSWORD $SFTP_PORT $SFTP_PATH "$LOG_NAME.htm"

  valPrint c "Report was uploaded, unless an error message appears above."
}

# Prints session summary when script is done
function wrapupAndExit()
{
  # Get off progress line on console, drop down a line from last link in log, and close HTML table
  valPrint ctr ""
  valPrint h "</table><br />"

  # If we didn't finish processing the last URL, then the iterator is one too high
  if [ $FINISHED_LIST != "yes" ]; then
    let LINK_NUM-=1
    if [ $FINISHED_LIST == "no" ]; then
      valPrint ctrh "The session was canceled by the user."
    fi
  fi

  # Output results of session and close the log file's markup
  LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
  LINKS_SKIPPED=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
  LINKS_CHECKED=$((LINKS_PROCESSED-LINKS_SKIPPED))
  valPrint ct "Summary:"
  valPrint r "\b1 Summary \b0"
  valPrint hn "<h3><span id=\"summary\">Summary</span></h3>"
  valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT)."
  valPrint ctrh "I skipped $LINKS_SKIPPED $(pluralCheckNoun link $LINKS_SKIPPED), and found $FILE_LINKS $(pluralCheckNoun file $FILE_LINKS) and $PAGE_LINKS $(pluralCheckNoun page $PAGE_LINKS)."
  if [ $LINKS_SKIPPED -gt 0 ]; then valPrint ctrh "Skip breakdown: "; fi
  if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
  if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE links on JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
  if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
  if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
  if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
  if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
  valPrint ctrh "Out of the $LINKS_CHECKED links checked, $IW_LINKS could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS), $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG."
  if [ $SKIP_EXCEPT -gt 0 ]; then
    valPrint ctrh "$SKIP_EXCEPT/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS) went unlisted due to being found in the exceptions file."
  fi
  printRTFfooter
  printHTMfooter

  # Upload report if this was requested
  if [ ! -z $UPLOAD_INFO ] && [ $FINISHED_LIST != "no" ]; then
    uploadReport
  fi

  # Really quit now
  valPrint c "ValExtLinks says goodbye."
  exit 0
}
trap wrapupAndExit INT


### INITIALIZATION ###
# Print opening message to console and log files
valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
printTXTheader
printRTFheader
printHTMheader

# Attempt to download file at LINKS_URL, then check that it succeeded
valPrint cwtrh "Downloading list of external links from $LINKS_URL."
LINKS_FILE_NAME=$(echo "$LINKS_URL" | sed 's/.*\///')
LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
curl --silent -o "$LINKS_FILE" $LINKS_URL
if [ ! -f "$LINKS_FILE" ]; then
  echo "The download of $LINKS_URL appears to have failed. Aborting."
  wrapupAndExit
fi

# Attempt to download file at EXCEPT_URL, then check that it succeeded
if [ ! -z $EXCEPT_URL ]; then
  valPrint cwtrh "Downloading list of NG exceptions from $EXCEPT_URL."
  EXCEPT_FILE_NAME=$(echo "$EXCEPT_URL" | sed 's/.*\///')
  EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
  curl --silent -o "$EXCEPT_FILE" $EXCEPT_URL
  if [ ! -f "$EXCEPT_FILE" ]; then
    echo "The download of $EXCEPT_URL appears to have failed. Aborting."
    wrapupAndExit
  fi
fi

# Pipe 'cat' to 'wc' because passing LINKS_FILE to 'wc' would print the file name after the line count
LINK_COUNT_STRING=$(cat "$LINKS_FILE" | wc -l)

# Number of URLs is number of lines minus one (first line is column header row for the CSV)
LINK_COUNT=$(echo "${LINK_COUNT_STRING}" | tr -d '[:space:]')
let LINK_COUNT-=1

# Calculate number of URLs to consider
if [ $URL_LIMIT -ne 0 ] && [ $URL_LIMIT -lt $LINK_COUNT ]; then
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((URL_LIMIT-URL_START+1)) of them, from link $URL_START to $URL_LIMIT."
elif [ $URL_START -ne 1 ]; then
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((LINK_COUNT-URL_START+1)) of them, from link $URL_START to $LINK_COUNT."
else
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering all of them."
fi

# Print settings to console and log
declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not print NG links that are listed in the exceptions file.")
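# Note: every whitespace-separated word in SETTINGS_MSG is its own array element, so the
# quoted phrases "and will" (index 10), "also" (22), "will" (26) and the final sentence
# (40) are the only elements the lines below need to swap out.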
if [ $TAKE_PAGE_SHOT -eq 0 ]; then SETTINGS_MSG[10]="but will not"; fi
if [ $RECORD_OK_LINKS -eq 0 ]; then SETTINGS_MSG[22]="not"; fi
if [ $SUGGEST_SNAPSHOTS -eq 0 ]; then SETTINGS_MSG[26]="will not"; fi
if [ -z $EXCEPT_URL ] || [ $RECORD_OK_LINKS -eq 1 ]; then SETTINGS_MSG[40]=""; fi
SETTINGS_STR=${SETTINGS_MSG[@]}
valPrint ctrh "$SETTINGS_STR"
valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
valPrint trh ""

# Print legend to logs
valPrint t "Legend:"
valPrint r "\b1 Legend \b0"
valPrint hn "<h3>Legend</h3>"
valPrint trh "OK = URL seems to be working."
valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it, because false negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to Iritscen. An NG link should be followed by a link to the Internet Archive's Wayback Machine which may help you repair the link. If the link cannot be repaired, you can disable it on the wiki (which prevents it from showing up in future ValExtLinks reports) by wrapping it in nowiki tags."
valPrint trh "RD = The server responding to this URL is saying that the page moved and you should instead use the supplied new URL. Some RD links represent minor adjustments in the organization of a web site, and some are soft 404s (the file/page has been removed and you are being redirected to something like the main page of the web site). You will have to look at the new URL yourself to determine if it represents an OK link and the link on the wiki should be updated to this one, or if the desired file/page is actually gone and we need to replace the wiki link with an Internet Archive snapshot link -- or disable the URL if it has not been archived."
507valPrint trh "IW = URL is working but should be converted to interwiki link using the suggested markup."
508valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $HTTP_CODES)."
509valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$HTTP_CODES\"}}{\fldrslt here}} for code reference)."
510valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$HTTP_CODES\" target=\"_blank\">here</a> for code reference)."
511valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $CURL_CODES)."
512valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$CURL_CODES\"}}{\fldrslt here}} for code reference)."
513valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$CURL_CODES\" target=\"_blank\">here</a> for code reference)."
514valPrint trh "IA suggests = Last available snapshot suggested by the Internet Archive."
515valPrint trh "Try browsing = The Archive occasionally fails to return a snapshot URL even when one exists, so you will need to check for a snapshot manually using the Wayback Machine before concluding that a site has not been archived."
516valPrint trh ""
517
518
519### MAIN LOOP ###
520# Process each line of the .csv in LINKS_FILE
521for LINE in `cat "$LINKS_FILE"`; do
522 let LINK_NUM+=1
523
524 # First line is the column header row for the CSV, so let's verify that the format hasn't changed
525 if [ $SKIPPED_HEADER_ROW -eq 0 ]; then
526 if [ $LINE == "namespace,title,target" ]; then
527 SKIPPED_HEADER_ROW=1
      LINK_NUM=0 # this line is not a link, so reset the link counter
529 valPrint hn "<table>"
530 continue
531 else
532 valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
533 wrapupAndExit
534 fi
535 fi
536
537 # Skip this link if we are not at URL_START yet
538 if [ $LINK_NUM -lt $URL_START ]; then
539 continue
540 fi
541
542 # Stop if we are at the limit declared for testing purposes
543 if [ $URL_LIMIT -gt 0 ] && [ $LINK_NUM -gt $URL_LIMIT ]; then
544 FINISHED_LIST="limit"
545 wrapupAndExit
546 fi
547
548 # Print progress to screen
549 if [ $LINK_NUM -gt 1 ]; then
550 printf "\e[1A\n" # erase previous progress message so that new one appears in its place
551 fi
552 valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
553
554 # The number of the namespace is the element before the first comma on the line
555 NS_ID=${LINE%%,*}
556
557 # Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
558 NS_NAME=""
559 a=0
560 while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
561 if [ $NS_ID -eq ${NS_IDS[$a]} ]; then
562 NS_NAME="${NS_NAMES[$a]}"
563 break
564 fi
565 let a+=1
566 done
567 if [ -z "$NS_NAME" ]; then
568 valPrint tr "Skipping URL found on page $PAGE_NAME because I could not find a name for namespace ID $NS_ID."
569 let SKIP_UNK_NS+=1
570 continue
571 fi
572
573 # The name of the page is everything between the namespace ID and the next comma on the line (commas
574 # in page names will break this)
575 PAGE_NAME=${LINE#$NS_ID,}
576 PAGE_NAME=${PAGE_NAME%%,*}
577
  # We don't want to consider wiki pages ending in .js, as the parser cannot reliably isolate URLs in
  # JavaScript code, so it will return erroneous links
  PAGE_NAME_SUFFIX=$(echo $PAGE_NAME | sed 's/.*\.//')
  if [ $PAGE_NAME_SUFFIX == "js" ]; then
    valPrint tr "Skipping URL found on JavaScript page $PAGE_NAME."
    let SKIP_JS_PAGE+=1
    continue
  fi

  # The URL being linked to is everything after the previous two fields (this allows commas to be in
  # the URLs, but a comma in the previous field, the page name, will break this)
  URL=${LINE#$NS_ID,$PAGE_NAME,}
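  # For example, a (hypothetical) CSV line "0,Some_Page,http://example.com/a,b" yields
  # NS_ID "0", PAGE_NAME "Some_Page" and URL "http://example.com/a,b" -- the trailing
  # comma survives because only the first two fields are stripped off the front.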

  # Scan for illegal characters
  if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
    valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because it contains characters illegal in a URL."
    let SKIP_BAD_URL+=1
    continue
  fi

  # Now we need to know if the URL is for a file or a web page. First step is to determine if the
  # URL ends in a suffix
  HAS_SUFFIX=0

  # If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
  SAN_URL=${URL%%\?*}

  # If the URL ends in something like "#section_15", strip everything from the '#' onward
  SAN_URL=${SAN_URL%%\#*}

  # 'sed' cannot handle Unicode in my Bash shell, so skip this URL and make user check it
  if [[ $SAN_URL == *[![:ascii:]]* ]]; then
    valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I cannot handle non-ASCII characters."
    let SKIP_NON_ASCII+=1
    continue
  fi

  # Isolate the characters after the last period and after the last slash
  POST_DOT=$(echo "$SAN_URL" | sed 's/.*\.//')
  POST_SLASH=$(echo "$SAN_URL" | sed 's/.*\///')

  # If the last period comes after the last slash, then the URL ends in a suffix
  POST_DOT_LENGTH=$(echo | awk -v input=$POST_DOT '{print length(input)}')
  POST_SLASH_LENGTH=$(echo | awk -v input=$POST_SLASH '{print length(input)}')
  if [ $POST_DOT_LENGTH -lt $POST_SLASH_LENGTH ]; then
    HAS_SUFFIX=1
  else
    HAS_SUFFIX=0
  fi
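  # Worked example (hypothetical URLs): for "http://example.com/files/readme.txt",
  # POST_DOT is "txt" (3 characters) and POST_SLASH is "readme.txt" (10), so HAS_SUFFIX
  # becomes 1; for "http://example.com/about.us/team", POST_DOT is "us/team" (7) and
  # POST_SLASH is "team" (4), so HAS_SUFFIX stays 0.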

  # Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
  # known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES
  IS_FILE=-1
  if [ $HAS_SUFFIX -eq 0 ]; then
    IS_FILE=0
  else
    # Turn off case sensitivity while we compare suffixes
    shopt -s nocasematch

    # Special case: URLs ending in something like "/productID.304297400" are pages, not files, so if
    # the URL's suffix is all numbers, we are looking at the end of a web page URL
    if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
      IS_FILE=0
    fi

    # If we did not identify this URL as a web page above, we need to compare the suffix against known
    # file extensions
    if [ $IS_FILE -eq -1 ]; then
      for EXTENSION in "${HTTP_FILES[@]}"; do
        if [[ $EXTENSION == $POST_DOT ]]; then # double brackets respect the nocasematch setting
          IS_FILE=1
          break
        fi
      done
    fi

    # If we did not identify this URL as a file above, we need to compare the suffix against known
    # pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
    # needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
    if [ $IS_FILE -eq -1 ]; then
      for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
        if [[ $PAGE_OR_TLD == $POST_DOT ]]; then
          IS_FILE=0
          break
        fi
      done
    fi

    # Turn case sensitivity back on in Bash
    shopt -u nocasematch
  fi

  # If this suffix escaped identification as either a file, page or TLD, inform the user
  STR_TYPE=""
  if [ $IS_FILE -eq -1 ]; then
    valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown URL ending $POST_DOT. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
    let SKIP_UNK_SUFFIX+=1
    continue
  elif [ $IS_FILE -eq 1 ]; then
    STR_TYPE="file"
    let FILE_LINKS+=1
  elif [ $IS_FILE -eq 0 ]; then
    STR_TYPE="page"
    let PAGE_LINKS+=1
  fi

  # Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
  # issue with sites that require HTTPS
  CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{http_code}\n' $URL)
  CURL_ERR=$(echo $?)
  CURL_RESULT=$CURL_CODE

  # Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
  if [ $CURL_CODE == "000" ]; then
    CURL_RESULT="$CURL_RESULT-$CURL_ERR"
  fi
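  # For example, a request that times out typically yields CURL_CODE "000" and a 'curl'
  # exit code of 28, so CURL_RESULT becomes "000-28" (see CURL_CODES for the meaning of
  # each exit code).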

  # Determine our status code for this URL (IW, OK, RD, or NG)
  STATUS="??"
  NEW_URL=""
  INTERWIKI_INDEX=-1
  # First check if this is a link to a domain that we have an interwiki prefix for
  for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
    if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]]; then
      STATUS="IW"
      let IW_LINKS+=1
      INTERWIKI_INDEX=$i
      break
    fi
  done

  # If we didn't match an interwiki domain, see if the status code is in our "OK" codes list
  if [ $STATUS == "??" ]; then
    for CODE in "${OK_CODES[@]}"; do
      if [[ $CODE == $CURL_CODE ]]; then
        STATUS="OK"
        let OK_LINKS+=1
        break
      fi
    done
  fi

  # If we didn't get a match with the "OK" codes, check it against the "RD" codes
  if [ $STATUS == "??" ]; then
    for CODE in "${RD_CODES[@]}"; do
      if [[ $CODE == $CURL_CODE ]]; then
        # Get URL header again in order to retrieve the URL we are being redirected to
        NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{redirect_url}\n' $URL)

        # Check if the redirect URL is just the original URL with https:// instead of http://
        # (this happens a lot and is not an important correction to us); if so, just make it "OK"
        URL_NO_PROTOCOL=${URL#*://}
        NEW_URL_NO_PROTOCOL=${NEW_URL#*://}
        if [ $URL_NO_PROTOCOL == $NEW_URL_NO_PROTOCOL ]; then
          STATUS="OK"
          let OK_LINKS+=1
        else
          STATUS="RD"
          let RD_LINKS+=1
        fi
        break
      fi
    done
  fi

  # If we didn't get a match with the "RD" codes, check it against the "NG" codes
  if [ $STATUS == "??" ]; then
    for CODE in "${NG_CODES[@]}"; do
      if [[ $CODE == $CURL_CODE ]]; then
        STATUS="NG"
        let NG_LINKS+=1
        break
      fi
    done
  fi

  # If we didn't match a known status code, advise the reader
  if [ $STATUS == "??" ]; then
    valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown return code $CURL_CODE."
    let SKIP_UNK_CODE+=1
    continue
  fi

  # If link is "NG" and there is an exceptions file, compare URL against the list before logging it
  if [ $STATUS == "NG" ] && [ ! -z $EXCEPT_URL ]; then
    GREP_RESULT=$(grep --max-count=1 "$URL" "$EXCEPT_FILE")
    EXCEPT_CODE=${GREP_RESULT%%,*}
    if [ "$EXCEPT_CODE" == $CURL_RESULT ]; then
      valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because its status code, $CURL_RESULT, is listed in the exceptions file."
      let SKIP_EXCEPT+=1
      continue
    fi
  fi

  # If appropriate, record this link to the log, with clickable URLs when possible
  if [ $STATUS != "OK" ] || [ $RECORD_OK_LINKS -eq 1 ]; then
    FULL_PAGE_PATH=http://$WIKI_PATH/$NS_NAME:$PAGE_NAME
    LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
    # Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it explicitly breaks the link
    if [ $NS_ID -eq 0 ]; then
      FULL_PAGE_PATH=http://$WIKI_PATH/$PAGE_NAME
      LOCAL_PAGE_PATH=$PAGE_NAME
    fi

    # Stupid hack since the text "IW" is narrower than "OK", "RD", or "NG" and it takes an extra tab
    # to get to the desired level of indentation in the RTF log
    RTF_TABS=" "
    if [ $STATUS == "IW" ]; then
      RTF_TABS=" "
    fi

    # Record link and its wiki page in TXT, RTF, and HTML markup
    valPrint t "$STATUS ($CURL_RESULT) $STR_TYPE $URL"
    valPrint t " linked from $FULL_PAGE_PATH"
    valPrint r "$STATUS ($CURL_RESULT)$RTF_TABS$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
    valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
    valPrint hn "<tr><td style=\"white-space:nowrap\">$STATUS ($CURL_RESULT)</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
    valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"

    # Record redirect URL if one was given by a 3xx response page
    if [ $STATUS == "RD" ]; then
      valPrint t " Server suggests $NEW_URL"
      valPrint r " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
      valPrint hn "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
    fi

    # Notify reader if we can use an interwiki prefix for this URL
    if [ $STATUS == "IW" ]; then
      valPrint t " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_SLASH]]"
      valPrint r " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_SLASH]]"
      valPrint hn "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_SLASH]]</td></tr>"
    fi

    # Query Internet Archive for latest "OK" snapshot for "NG" page
    if [ $STATUS == "NG" ] && [ $SUGGEST_SNAPSHOTS -eq 1 ]; then
      ARCHIVE_QUERY=$(curl --silent --max-time 10 "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")

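      # A successful reply from the availability API is JSON whose "closest" snapshot
      # carries a "url" and a "timestamp" field (shape assumed from the parsing below);
      # the parameter expansions that follow simply cut out whatever sits between
      # '"url": "' and '", "timestamp'.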
      # Isolate "url" property in response and log it if a "closest" snapshot was received...
      if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
        SNAPSHOT_URL=${ARCHIVE_QUERY##*\"url\": \"}
        SNAPSHOT_URL=${SNAPSHOT_URL%\", \"timestamp*}
        valPrint t " IA suggests $SNAPSHOT_URL"
        valPrint r " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
        valPrint hn "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
      else # ...otherwise give generic Wayback Machine link for this URL
        valPrint t " Try browsing $ARCHIVE_GENERIC/$URL"
        valPrint r " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
        valPrint hn "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
      fi
    fi
  fi

  # If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
  if [ $IS_FILE -eq 0 ] && [ $TAKE_PAGE_SHOT -eq 1 ] && [ $STATUS == "OK" ]; then
    # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
    SHOT_NAME=$(echo "$URL" | sed 's/https*\:\/\///' | sed 'y/:\//__/')
    SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"
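    # For example, the (hypothetical) URL "http://example.com/docs/page" becomes
    # "example.com_docs_page.png": the protocol is stripped and the remaining ':' and '/'
    # characters are transliterated to underscores by the 'sed y///' command above.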

    # Don't take screenshot if we already encountered this page and screenshotted it
    if [ ! -f "$SHOT_FILE" ]; then
      "$CHROME" --headless --disable-gpu --screenshot --window-size=1500,900 $URL > /dev/null 2>&1
      if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
        mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
      else
        valPrint trh "Screenshot of URL $URL seems to have failed!"
      fi
    else
      valPrint trh "Skipping screenshot of URL $URL because $SHOT_FILE already exists."
    fi
  fi
done
FINISHED_LIST="yes"
wrapupAndExit