Context Navigation

source: Validate External Links/validate_external_links.sh@ 1066

Last change on this file since 1066 was 1066, checked in by iritscen, 7 years ago
Updating Val to new location of files on oni2.net and slight changes to Archive API.
File size: 33.2 KB

Line
#!/bin/bash

# Validate External Links by Iritscen
# Provided with a list of external links found in the OniGalore wiki, this script validates them.
# The resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF
# (for reading as a local file with clickable links), and HTML (for uploading as a web page).
# Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
# Recommended rule:
# ------------------------------------------------------------------------------------------------------

# Set separator token to newline, so that unquoted expansions are only ever split into lines
IFS="
"

### GLOBALS ###
# Settings -- these will be changed from their defaults by the arguments passed in to the script
LINKS_URL=""        # use 'curl' to download file with links from this location (can be file://)
EXCEPT_URL=""       # ditto above for file with exceptions to NG results
OUTPUT_DIR=""       # place reports and all other output in a folder inside this existing folder
RECORD_OK_LINKS=0   # record response code to the log whether it's a value in OK_CODES or NG_CODES
SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
TAKE_PAGE_SHOT=0    # take a screenshot of each OK page
URL_START=1         # start at this URL in LINKS_FILE (1 by default)
URL_LIMIT=0         # if non-zero, stop at this URL in LINKS_FILE
UPLOAD_INFO=""      # path to a file on your hard drive with the login info needed to upload a report

# Fixed strings -- see the occurrences of these variables to learn their purpose
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:54.0) Gecko/20100101 Firefox/54.0"
ARCHIVE_API="http://archive.org/wayback/available"
ARCHIVE_GENERIC="https://web.archive.org/web/*"
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
CHROME="/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary"
CHROME_SCREENSHOT="screenshot.png"
CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt"
EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt"
MY_WIKI_PAGE="http://wiki.oni2.net/User:Iritscen"
# Directory holding this script; quoted so odd characters in the path survive, and 'pwd' only runs
# if the 'cd' actually succeeded
THIS_DIR=$(cd "$(dirname "$0")" && pwd)
WORKING_DIR=$(pwd)
WIKI_PATH="wiki.oni2.net"

# These are parallel arrays of the IDs and names of OniGalore's current namespaces
declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")

# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
# This determines whether the script tries to take a screenshot of the page or just gets its HTTP code.
declare -a HTTP_FILES=(txt zip wmv jpg png m4a bsl rar oni mp3 mov ONWC vbs TRMA mp4 doc avi log gif pdf dmg exe cpp tga 7z wav east BINA xml dll dae xaf fbx 3ds blend flv csv)
declare -a HTTP_TLDS_AND_PAGES=(com net org uk ru de it htm html php pl asp aspx shtml pgi cgi php3 x jsp phtml cfm css action stm js)

# These arrays tell us which HTTP response codes are OK (good) and which are NG (no good). Pages that
# return NG codes will not be screenshotted. Remember to update http_codes.txt if you add a new code.
declare -a OK_CODES=(200 301 307 401 405 406 501)
declare -a NG_CODES=(000 302 403 404 410 500 503)

# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
# transcluded text, and if the transclusion fails, then the braces show up in the URL
ILLEGAL_CHARS="{ }"

# These are parallel arrays giving the prefixes that can be used in place of normal external links to
# some wikis and other sites
declare -a INTERWIKI_PREFIXES=(metawikipedia wikipedia wikiquote wiktionary)
declare -a INTERWIKI_DOMAINS=(meta.wikipedia.org wikipedia.org wikiquote.org wiktionary.org)

# Variables for keeping track of main loop progress and findings
LINK_NUM=0
OK_LINKS=0
NG_LINKS=0
SKIP_UNK_NS=0
SKIP_JS_PAGE=0
SKIP_BAD_URL=0
SKIP_NON_ASCII=0
SKIP_UNK_SUFFIX=0
SKIP_UNK_CODE=0
SKIP_EXCEPT=0
FILE_LINKS=0
PAGE_LINKS=0
SKIPPED_HEADER_ROW=0
FINISHED_LIST="no" # "no" = interrupted, "limit" = stopped at URL_LIMIT, "yes" = whole list processed


82	### HELP ###
83	# A pseudo-man page. Here is the 80-character rule for the page text:
84	# 234567890123456789012345678901234567890123456789012345678901234567890123456789
85	function printHelp()
86	{
87	cat << EOF
88
89	NAME
90	Validate External Links
91
92	SYNOPSIS
93	validate_external_links.sh --help
94	validate_external_links.sh --links URL --output PATH [--exceptions FILE]
95	[--record-ok-links] [--suggest-snapshots] [--take-screenshots]
96	[--start-url NUM] [--end-url NUM] [--upload PATH]
97
98	DESCRIPTION
99	This script parses a list of external links found in the OniGalore wiki
100	(which is dumped by the Oni2.net domain periodically in a particular
101	format), validates them using the Unix tool 'curl', and produces a report
102	of which links were OK (responded to an HTTP query) and which were NG (no
103	good). This report can then be automatically uploaded to the location of
104	your choice. The script can also suggest Internet Archive snapshots for
105	NG links, and take screenshots of OK links for visual verification by the
106	reader that the page in question is the one intended to be displayed.
107
108	You must pass this script the URL at which the list of links is found
109	(--links) and the path where logs should be outputted (--output). All
110	other arguments are optional.
111
112	OPTIONS
113	--help Show this page
114	--links URL URL from which to download file with external links
115	(note that this can be a local file if you use the
116	file:// protocol) (required)
117	--output DIR Place the folder which will contain the reports and
118	optional screenshots at this path (required)
119	--exceptions DIR Don't log an NG link if it is listed in the file
120	provided at this path as long as the response code is
121	the same as the one associated with the link
122	--record-ok-links Log a link in the report whether its response code is
123	in the OK_CODES or the NG_CODES array
124	--suggest-snapshots Query the Internet Archive for a possible snapshot
125	URL for each NG page
126	--take-screenshots Save screenshots of each OK page (requires Google
127	Chrome to be found at the path in CHROME)
128	--start-url NUM Start at this link in the links file
129	--end-url NUM Stop at this link in the links file
130	--upload FILE Upload report using info in this local file
131
132	BUGS
133	The script cannot properly parse any line in the external links file
134	which contains a comma in the name of the wiki page containing a link.
135	Commas in the link itself are not an issue.
136	EOF
137	}
138
139
### SETUP ###
# If first argument is a help request, or if nothing was passed in at all, print help page and quit
if [ "$#" -eq 0 ] || [ "$1" == "--help" ]; then
  printHelp | less
  exit 0
fi

# Parse arguments as long as there are more arguments to process
while (( "$#" )); do
  # An option that takes a value must be followed by one. Without this check, 'shift 2' fails
  # without consuming anything when the option is the last argument, and the loop never ends.
  case "$1" in
    --links | --exceptions | --output | --start-url | --end-url | --upload )
      if [ "$#" -lt 2 ]; then
        echo "Argument $1 requires a value. Aborting."
        exit 1
      fi;;
  esac

  case "$1" in
    --links )             LINKS_URL="$2";      shift 2;;
    --exceptions )        EXCEPT_URL="$2";     shift 2;;
    --output )            OUTPUT_DIR="$2";     shift 2;;
    --record-ok-links )   RECORD_OK_LINKS=1;   shift;;
    --suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;;
    --take-screenshots )  TAKE_PAGE_SHOT=1;    shift;;
    --start-url )         URL_START="$2";      shift 2;;
    --end-url )           URL_LIMIT="$2";      shift 2;;
    --upload )            UPLOAD_INFO="$2";    shift 2;;
    * )                   echo "Invalid argument $1 detected. Aborting."; exit 1;;
  esac
done

# If the required arguments were not supplied, print help page and quit
if [ -z "$LINKS_URL" ] || [ -z "$OUTPUT_DIR" ]; then
  printHelp
  echo "Error: I did not receive one or both required arguments."
  exit 2
fi

# Check that UPLOAD_INFO exists, if this argument was supplied
if [ -n "$UPLOAD_INFO" ] && [ ! -f "$UPLOAD_INFO" ]; then
  echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
  exit 3
fi

# Check that OUTPUT_DIR is a directory
if [ ! -d "$OUTPUT_DIR" ]; then
  echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
  exit 4
fi

# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
SHOT_PATH="$OUTPUT_PATH/Screenshots"
LOG_NAME="ValExtLinks report"
LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt"
LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf"
LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm"
mkdir "$OUTPUT_PATH"

# Check that 'mkdir' succeeded before creating anything inside the new folder
if [ ! -d "$OUTPUT_PATH" ]; then
  echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_DIR. Aborting."
  exit 5
fi
if [ "$TAKE_PAGE_SHOT" -eq 1 ]; then
  mkdir "$SHOT_PATH"
fi

# Get date on the file at LINKS_URL and print to log. The header name is matched without regard to
# case because HTTP/2 servers send it in lowercase.
LINKS_DATE=$(curl --silent --head "$LINKS_URL" | grep -i -m 1 "^Last-Modified:")
if [ -z "$LINKS_DATE" ]; then
  echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
  exit 6
fi
LINKS_DATE=${LINKS_DATE#*: }     # drop the header name, keeping only the date
LINKS_DATE=${LINKS_DATE%$'\r'}   # HTTP header lines end in CR+LF; drop the stray CR


### UTILITY FUNCTIONS ###
# Writes a plain-text header to TXT log file: the report title, the generation time (NICE_TIME),
# the date of the links data (LINKS_DATE), a contact line (MY_WIKI_PAGE) and a blank line.
function printTXTheader()
{
  valPrint t "Validate External Links report"
  valPrint t "generated $NICE_TIME"
  valPrint t "from data of $LINKS_DATE"
  valPrint t "script by Iritscen (contact: $MY_WIKI_PAGE)"
  valPrint t ""
}

# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified.
# Reads NICE_TIME, LINKS_DATE and MY_WIKI_PAGE. Inside the double-quoted string only \\ and \" are
# shell escapes (yielding \ and "); every other backslash sequence is passed through as RTF markup.
function printRTFheader()
{
  local rtf_header="{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww12600\viewh12100\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0

\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
generated $NICE_TIME\\
from data of $LINKS_DATE\\
script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$MY_WIKI_PAGE\"}}{\fldrslt contact}})
\\
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
\cf0 "

  valPrint r "$rtf_header"
}

# Closes the RTF markup of the RTF log file by writing the brace that balances the one opened by
# printRTFheader
function printRTFfooter()
{
  valPrint r "}"
}

# Writes the HTML header to HTML log file: the document head, a title heading, and a sub-heading
# built from NICE_TIME, LINKS_DATE and a contact link to MY_WIKI_PAGE
function printHTMheader()
{
  local htm_header="<html>
<head>
<title>Validate External Links report</title>
</head>
<body>
<h2>Validate External Links report</h2>
<h3>generated $NICE_TIME<br />
from data of $LINKS_DATE<br />
script by Iritscen (<a href=\"$MY_WIKI_PAGE\" target=\"_blank\">contact</a>)</h3>"

  valPrint h "$htm_header"
}

# Closes the HTML markup of the HTML log file by ending the <body> and <html> elements opened by
# printHTMheader
function printHTMfooter()
{
  valPrint h "</body>
</html>"
}

# The central logging function. The first parameter is a string composed of one or more characters that
# indicates which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
# 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 'w' means "Don't
# pass console output through 'fmt'" ("fmt" fits the output to an 80-column CLI but can break special
# formatting and the 'n' option).
# The second parameter is the text to print. Each flag is detected with a substring match (*x*), so
# the flags can be combined in any order, e.g. "ctrh" or "hn".
# For the RTF and HTML logs a newline is always written to the file; 'n' only suppresses the markup
# line break ("\" in RTF, "<br />" in HTML) that would otherwise be appended.
# Writes to the files named by LOG_TXT, LOG_RTF and LOG_HTM.
function valPrint()
{
  local targets="$1"
  local text="$2"

  if [[ "$targets" == *c* ]]; then
    if [[ "$targets" == *n* ]]; then
      printf '%s' "$text"
    elif [[ "$targets" == *w* ]]; then
      printf '%s\n' "$text"
    else
      printf '%s\n' "$text" | fmt -w 80
    fi
  fi
  if [[ "$targets" == *t* ]]; then
    if [[ "$targets" == *n* ]]; then
      printf '%s' "$text" >> "$LOG_TXT"
    else
      printf '%s\n' "$text" >> "$LOG_TXT"
    fi
  fi
  if [[ "$targets" == *r* ]]; then
    if [[ "$targets" == *n* ]]; then
      printf '%s\n' "$text" >> "$LOG_RTF"
    else
      printf '%s\n' "$text\\" >> "$LOG_RTF"
    fi
  fi
  if [[ "$targets" == *h* ]]; then
    if [[ "$targets" == *n* ]]; then
      printf '%s\n' "$text" >> "$LOG_HTM"
    else
      printf '%s\n' "$text<br />" >> "$LOG_HTM"
    fi
  fi
}

# Pluralize the string in parameter 1 if the number in parameter 2 is not 1. Nouns ending in 'x' get
# "es" (suffix -> suffixes); all others get "s". A missing count is treated as 0 (plural) rather
# than raising a test error. The result is printed to stdout.
function pluralCheckNoun()
{
  local noun="$1"
  local count="${2:-0}"

  if [ "$count" -ne 1 ]; then
    if [[ $noun =~ x$ ]]; then
      printf '%s\n' "${noun}es"
    else
      printf '%s\n' "${noun}s"
    fi
  else
    printf '%s\n' "$noun"
  fi
}

# Output "was" if parameter 1 is 1, otherwise "were". A missing count is treated as 0 ("were")
# rather than raising a test error.
function pluralCheckWas()
{
  local count="${1:-0}"

  if [ "$count" -ne 1 ]; then
    printf '%s\n' "were"
  else
    printf '%s\n' "was"
  fi
}

# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
# reports being saved to disk have already been closed.
# The file at UPLOAD_INFO is expected to hold one "marker:value" line for each of user:, pw:, port:
# and path:. Each value is taken from the first line that *begins* with its marker, so a marker that
# merely appears inside another line's value (e.g. "path:" inside a password) is not picked up.
# The values are handed to the 'expect' script found next to this script, as arguments in this order:
# local HTML file, user name, password, port, remote path, remote file name.
function uploadReport()
{
  valPrint c "Uploading HTML report..."

  SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
  SFTP_USER_NAME_MARKER="user:"
  SFTP_PASSWORD_MARKER="pw:"
  SFTP_PORT_MARKER="port:"
  SFTP_PATH_MARKER="path:"
  SFTP_USER_NAME=$(grep -m 1 -- "^$SFTP_USER_NAME_MARKER" "$UPLOAD_INFO")
  SFTP_USER_NAME=${SFTP_USER_NAME#"$SFTP_USER_NAME_MARKER"}
  SFTP_PASSWORD=$(grep -m 1 -- "^$SFTP_PASSWORD_MARKER" "$UPLOAD_INFO")
  SFTP_PASSWORD=${SFTP_PASSWORD#"$SFTP_PASSWORD_MARKER"}
  SFTP_PORT=$(grep -m 1 -- "^$SFTP_PORT_MARKER" "$UPLOAD_INFO")
  SFTP_PORT=${SFTP_PORT#"$SFTP_PORT_MARKER"}
  SFTP_PATH=$(grep -m 1 -- "^$SFTP_PATH_MARKER" "$UPLOAD_INFO")
  SFTP_PATH=${SFTP_PATH#"$SFTP_PATH_MARKER"}

  # Every argument is quoted so that an empty or unusual value cannot shift the argument positions.
  # NOTE(review): the password is passed on the command line and is therefore visible in the process
  # list while 'expect' runs; changing that would require changing the expect script as well.
  expect "$SCRIPT_PATH" "$LOG_HTM" "$SFTP_USER_NAME" "$SFTP_PASSWORD" "$SFTP_PORT" "$SFTP_PATH" "$LOG_NAME.htm"

  valPrint c "Report was uploaded, unless an error message appears above."
}

# Prints session summary when script is done, closes the RTF and HTML markup, uploads the report if
# that was requested and the session was not interrupted, then exits with status 0. Also installed
# below as the handler for SIGINT, so that Ctrl-C still produces a closed, summarized report.
# Reads the counters maintained by the main loop plus FINISHED_LIST, which is "yes" when the whole
# list was processed, "limit" when URL_LIMIT stopped the run, and "no" otherwise.
function wrapupAndExit()
{
  # Get off progress line on console, drop down a line from last link in log, and close HTML table
  valPrint ctr ""
  valPrint h "</table><br />"

  # If we didn't finish processing the last URL, then the iterator is one too high
  if [ "$FINISHED_LIST" != "yes" ]; then
    LINK_NUM=$((LINK_NUM - 1))
    if [ "$FINISHED_LIST" == "no" ]; then
      valPrint ctrh "The session was canceled by the user."
    fi
  fi

  # LINK_COUNT is still unset if we are called before the links file was counted
  LINK_COUNT=${LINK_COUNT:-0}

  # Output results of session and close the log file's markup
  LINKS_PROCESSED=$((LINK_NUM - URL_START + 1))
  if [ "$LINKS_PROCESSED" -lt 0 ]; then
    LINKS_PROCESSED=0 # we were stopped before reaching the first link to be considered
  fi
  LINKS_SKIPPED=$((SKIP_UNK_NS + SKIP_JS_PAGE + SKIP_BAD_URL + SKIP_NON_ASCII + SKIP_UNK_SUFFIX + SKIP_UNK_CODE))
  LINKS_CHECKED=$((LINKS_PROCESSED - LINKS_SKIPPED))
  valPrint ct "Summary:"
  valPrint r "\b1 Summary \b0"
  valPrint hn "<h3><span id=\"summary\">Summary</span></h3>"
  valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link "$LINK_COUNT")."
  valPrint ctrh "I skipped $LINKS_SKIPPED $(pluralCheckNoun link "$LINKS_SKIPPED"), and found $FILE_LINKS $(pluralCheckNoun file "$FILE_LINKS") and $PAGE_LINKS $(pluralCheckNoun page "$PAGE_LINKS")."
  if [ "$LINKS_SKIPPED" -gt 0 ]; then valPrint ctrh "Skip breakdown: "; fi
  if [ "$SKIP_UNK_NS" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS unknown $(pluralCheckNoun namespace "$SKIP_UNK_NS")"; fi
  if [ "$SKIP_JS_PAGE" -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE links on JavaScript $(pluralCheckNoun page "$SKIP_JS_PAGE")"; fi
  if [ "$SKIP_BAD_URL" -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL "$SKIP_BAD_URL")"; fi
  if [ "$SKIP_NON_ASCII" -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL "$SKIP_NON_ASCII")"; fi
  if [ "$SKIP_UNK_SUFFIX" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix "$SKIP_UNK_SUFFIX")"; fi
  if [ "$SKIP_UNK_CODE" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code "$SKIP_UNK_CODE")"; fi
  valPrint ctrh "Out of the $LINKS_CHECKED links checked, $OK_LINKS $(pluralCheckWas "$OK_LINKS") OK and $NG_LINKS $(pluralCheckWas "$NG_LINKS") NG."
  if [ "$SKIP_EXCEPT" -gt 0 ]; then
    valPrint ctrh "$SKIP_EXCEPT/$NG_LINKS NG $(pluralCheckNoun link "$NG_LINKS") went unlisted due to being found in the exceptions file."
  fi
  printRTFfooter
  printHTMfooter

  # Upload report if this was requested
  if [ -n "$UPLOAD_INFO" ] && [ "$FINISHED_LIST" != "no" ]; then
    uploadReport
  fi

  # Really quit now
  valPrint c "ValExtLinks says goodbye."
  exit 0
}
trap wrapupAndExit INT


### INITIALIZATION ###
# Print opening message to console and log files
valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
printTXTheader
printRTFheader
printHTMheader

# Attempt to download file at LINKS_URL, then check that it succeeded. The local file is named after
# the part of the URL that follows its last slash.
valPrint ctrh "Downloading list of external links from $LINKS_URL."
LINKS_FILE_NAME=${LINKS_URL##*/}
LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
curl --silent -o "$LINKS_FILE" "$LINKS_URL"
if [ ! -f "$LINKS_FILE" ]; then
  echo "The download of $LINKS_URL appears to have failed. Aborting."
  wrapupAndExit
fi

# Attempt to download file at EXCEPT_URL, then check that it succeeded
if [ -n "$EXCEPT_URL" ]; then
  valPrint ctrh "Downloading list of NG exceptions from $EXCEPT_URL."
  EXCEPT_FILE_NAME=${EXCEPT_URL##*/}
  EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
  curl --silent -o "$EXCEPT_FILE" "$EXCEPT_URL"
  if [ ! -f "$EXCEPT_FILE" ]; then
    echo "The download of $EXCEPT_URL appears to have failed. Aborting."
    wrapupAndExit
  fi
fi

# Feed the file to 'wc' on stdin because passing LINKS_FILE as an argument would print the file name
# after the line count; some versions of 'wc' also pad the number with spaces, so strip those
LINK_COUNT_STRING=$(wc -l < "$LINKS_FILE")

# Number of URLs is number of lines minus one (first line is column header row for the CSV)
LINK_COUNT=$(printf '%s' "$LINK_COUNT_STRING" | tr -d '[:space:]')
LINK_COUNT=$((LINK_COUNT - 1))

# Calculate number of URLs to consider
if [ "$URL_LIMIT" -ne 0 ] && [ "$URL_LIMIT" -lt "$LINK_COUNT" ]; then
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((URL_LIMIT-URL_START+1)) of them, from link $URL_START to $URL_LIMIT."
elif [ "$URL_START" -ne 1 ]; then
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((LINK_COUNT-URL_START+1)) of them, from link $URL_START to $LINK_COUNT."
else
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering all of them."
fi

# Print settings to console and log. Each phrase below is the variable part of one sentence and is
# chosen according to the corresponding setting.
SETTINGS_SHOT="and will"
SETTINGS_OK="also"
SETTINGS_SNAP="will"
SETTINGS_EXCEPT="I will not print NG links that are listed in the exceptions file."
if [ "$TAKE_PAGE_SHOT" -eq 0 ]; then SETTINGS_SHOT="but will not"; fi
if [ "$RECORD_OK_LINKS" -eq 0 ]; then SETTINGS_OK="not"; fi
if [ "$SUGGEST_SNAPSHOTS" -eq 0 ]; then SETTINGS_SNAP="will not"; fi
if [ -z "$EXCEPT_URL" ] || [ "$RECORD_OK_LINKS" -eq 1 ]; then SETTINGS_EXCEPT=""; fi
SETTINGS_STR="I will be checking the response code of each link $SETTINGS_SHOT take a screenshot of each page. Pages that are OK will $SETTINGS_OK be logged. I $SETTINGS_SNAP ask the Internet Archive for a suggested snapshot URL for each NG page. $SETTINGS_EXCEPT"
valPrint ctrh "$SETTINGS_STR"
valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
valPrint trh ""

# Print legend to logs
valPrint t "Legend:"
valPrint r "\b1 Legend \b0"
valPrint hn "<h3>Legend</h3>"
valPrint trh "OK = URL seems to be working."
valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it. False negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to Iritscen."
valPrint trh "IW = URL is working but should be converted to interwiki link using the suggested markup."
valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $HTTP_CODES)."
valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$HTTP_CODES\"}}{\fldrslt here}} for code reference)."
valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$HTTP_CODES\" target=\"_blank\">here</a> for code reference)."
valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $CURL_CODES)."
valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$CURL_CODES\"}}{\fldrslt here}} for code reference)."
valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$CURL_CODES\" target=\"_blank\">here</a> for code reference)."
valPrint trh "IA suggests = Last available snapshot suggested by the Internet Archive."
valPrint trh "Try browsing = The Archive occasionally fails to return a snapshot URL even when one exists, so you will need to check for a snapshot manually using the Wayback Machine before concluding that a site has not been archived."
valPrint trh ""


### MAIN LOOP ###
# Process each line of the .csv in LINKS_FILE. The file is read line by line on file descriptor 3,
# so that no line is subject to filename expansion and no command run inside the loop can consume
# the list through its standard input.
while IFS= read -r -u 3 LINE || [ -n "$LINE" ]; do
  # Blank lines carry no link and are not counted
  if [ -z "$LINE" ]; then
    continue
  fi
  LINK_NUM=$((LINK_NUM + 1))

  # First line is the column header row for the CSV, so let's verify that the format hasn't changed
  if [ "$SKIPPED_HEADER_ROW" -eq 0 ]; then
    if [ "$LINE" == "namespace,title,target" ]; then
      SKIPPED_HEADER_ROW=1
      LINK_NUM=0 # this line is not a link, so reset the link counter
      valPrint hn "<table>"
      continue
    else
      valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
      wrapupAndExit
    fi
  fi

  # Skip this link if we are not at URL_START yet
  if [ "$LINK_NUM" -lt "$URL_START" ]; then
    continue
  fi

  # Stop if we are at the limit declared for testing purposes
  if [ "$URL_LIMIT" -gt 0 ] && [ "$LINK_NUM" -gt "$URL_LIMIT" ]; then
    FINISHED_LIST="limit"
    wrapupAndExit
  fi

  # Print progress to screen
  if [ "$LINK_NUM" -gt 1 ]; then
    printf "\e[1A\n" # erase previous progress message so that new one appears in its place
  fi
  valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."

  # The number of the namespace is the element before the first comma on the line
  NS_ID=${LINE%%,*}

  # The name of the page is everything between the namespace ID and the next comma on the line (commas
  # in page names will break this). It is extracted before the namespace lookup so that the "unknown
  # namespace" message below names the page on this line. The quotes inside the expansion make the
  # shell strip the text literally instead of treating it as a glob pattern.
  PAGE_NAME=${LINE#"$NS_ID",}
  PAGE_NAME=${PAGE_NAME%%,*}

  # Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
  NS_NAME=""
  for ((a = 0; a < ${#NS_IDS[@]}; ++a)); do
    # A non-numeric NS_ID makes the comparison fail, which correctly leaves NS_NAME empty
    if [ "$NS_ID" -eq "${NS_IDS[$a]}" ] 2> /dev/null; then
      NS_NAME="${NS_NAMES[$a]}"
      break
    fi
  done
  if [ -z "$NS_NAME" ]; then
    valPrint tr "Skipping URL found on page $PAGE_NAME because I could not find a name for namespace ID $NS_ID."
    SKIP_UNK_NS=$((SKIP_UNK_NS + 1))
    continue
  fi

  # We don't want to consider wiki pages ending in .js, as the parser cannot reliably isolate URLS in
  # JavaScript code, so it will return erroneous links
  PAGE_NAME_SUFFIX=${PAGE_NAME##*.}
  if [ "$PAGE_NAME_SUFFIX" == "js" ]; then
    valPrint tr "Skipping URL found on JavaScript page $PAGE_NAME."
    SKIP_JS_PAGE=$((SKIP_JS_PAGE + 1))
    continue
  fi

  # The URL being linked to is everything after the previous two fields (this allows commas to be in
  # the URLs, but a comma in the previous field, the page name, will break this)
  URL=${LINE#"$NS_ID,$PAGE_NAME,"}

  # Scan for illegal characters anywhere in the URL
  if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
    valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because it contains characters illegal in a URL."
    SKIP_BAD_URL=$((SKIP_BAD_URL + 1))
    continue
  fi

  # Now we need to know if the URL is for a file or a web page. First step is to determine if the
  # URL ends in a suffix
  HAS_SUFFIX=0

  # If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
  SAN_URL=${URL%%\?*}

  # If the URL ends in something like "#section_15", strip everything from the '#' onward
  SAN_URL=${SAN_URL%%\#*}

  # Non-ASCII URLs are not handled by this script, so skip this URL and make user check it
  if [[ $SAN_URL == *[![:ascii:]]* ]]; then
    valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I cannot handle non-ASCII characters."
    SKIP_NON_ASCII=$((SKIP_NON_ASCII + 1))
    continue
  fi

  # Isolate the characters after the last period and after the last slash
  POST_DOT=${SAN_URL##*.}
  POST_SLASH=${SAN_URL##*/}

  # If the last period comes after the last slash, then the URL ends in a suffix
  if [ "${#POST_DOT}" -lt "${#POST_SLASH}" ]; then
    HAS_SUFFIX=1
  else
    HAS_SUFFIX=0
  fi

  # Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
  # known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES
  IS_FILE=-1
  if [ "$HAS_SUFFIX" -eq 0 ]; then
    IS_FILE=0
  else
    # Turn off case sensitivity while we compare suffixes
    shopt -s nocasematch

    # Special case: URLs ending in something like "/productID.304297400" are pages, not files, so if
    # the URL's suffix is all numbers, we are looking at the end of a web page URL
    if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
      IS_FILE=0
    fi

    # If we did not identify this URL as a web page above, we need to compare the suffix against known
    # file extensions
    if [ "$IS_FILE" -eq -1 ]; then
      for EXTENSION in "${HTTP_FILES[@]}"; do
        # double brackets respect the nocasematch setting; the right side is quoted so that the
        # suffix is compared literally rather than as a pattern
        if [[ $EXTENSION == "$POST_DOT" ]]; then
          IS_FILE=1
          break
        fi
      done
    fi

    # If we did not identify this URL as a file above, we need to compare the suffix against known
    # pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
    # needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
    if [ "$IS_FILE" -eq -1 ]; then
      for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
        if [[ $PAGE_OR_TLD == "$POST_DOT" ]]; then
          IS_FILE=0
          break
        fi
      done
    fi

    # Turn case sensitivity back on in Bash
    shopt -u nocasematch
  fi

  # If this suffix escaped identification as either a file, page or TLD, inform the user
  STR_TYPE=""
  if [ "$IS_FILE" -eq -1 ]; then
    valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown URL ending $POST_DOT. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
    SKIP_UNK_SUFFIX=$((SKIP_UNK_SUFFIX + 1))
    continue
  elif [ "$IS_FILE" -eq 1 ]; then
    STR_TYPE="file"
    FILE_LINKS=$((FILE_LINKS + 1))
  elif [ "$IS_FILE" -eq 0 ]; then
    STR_TYPE="page"
    PAGE_LINKS=$((PAGE_LINKS + 1))
  fi

  # Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
  # issue with sites that require HTTPS. AGENT is passed in double quotes so that the browser string
  # itself is sent as the user agent.
  CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{http_code}\n' "$URL")
  CURL_ERR=$?
  CURL_RESULT=$CURL_CODE

  # Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
  if [ "$CURL_CODE" == "000" ]; then
    CURL_RESULT="$CURL_RESULT-$CURL_ERR"
  fi

  # Determine if this code is in our "OK" list
  STATUS="??"
  INTERWIKI_INDEX=-1
  for CODE in "${OK_CODES[@]}"; do
    if [[ $CODE == "$CURL_CODE" ]]; then
      OK_LINKS=$((OK_LINKS + 1))

      # Determine if this is a link to a domain that we have an interwiki prefix for; the first
      # domain found anywhere in the URL wins, so more specific domains must be listed first
      for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
        if [[ $URL == *"${INTERWIKI_DOMAINS[$i]}"* ]]; then
          STATUS="IW"
          INTERWIKI_INDEX=$i
          break
        fi
      done

      # If this link is OK and no interwiki advisory is needed, just mark as "OK"
      if [ "$INTERWIKI_INDEX" == -1 ]; then
        STATUS="OK"
      fi
      break
    fi
  done

  # If we didn't get a match with the "OK" codes, check it against the "NG" codes
  if [ "$STATUS" == "??" ]; then
    for CODE in "${NG_CODES[@]}"; do
      if [[ $CODE == "$CURL_CODE" ]]; then
        STATUS="NG"
        NG_LINKS=$((NG_LINKS + 1))
        break
      fi
    done
  fi

  # If we didn't match a known status code, advise the reader
  if [ "$STATUS" == "??" ]; then
    valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown return code $CURL_CODE."
    SKIP_UNK_CODE=$((SKIP_UNK_CODE + 1))
    continue
  fi

  # If link is "NG" and there is an exceptions file, compare URL against the list before logging it.
  # The URL is searched for as a fixed string (-F) because URLs are full of regex metacharacters.
  if [ "$STATUS" == "NG" ] && [ -n "$EXCEPT_URL" ]; then
    GREP_RESULT=$(grep --max-count=1 -F -- "$URL" "$EXCEPT_FILE")
    EXCEPT_CODE=${GREP_RESULT%%,*}
    if [ "$EXCEPT_CODE" == "$CURL_RESULT" ]; then
      valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because its status code, $CURL_RESULT, is listed in the exceptions file."
      SKIP_EXCEPT=$((SKIP_EXCEPT + 1))
      continue
    fi
  fi

  # If appropriate, record this link to the log, with clickable URLs when possible
  if [ "$STATUS" != "OK" ] || [ "$RECORD_OK_LINKS" -eq 1 ]; then
    FULL_PAGE_PATH="http://$WIKI_PATH/$NS_NAME:$PAGE_NAME"
    LOCAL_PAGE_PATH="$NS_NAME:$PAGE_NAME"
    # Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it explicitly breaks the link
    if [ "$NS_ID" -eq 0 ]; then
      FULL_PAGE_PATH="http://$WIKI_PATH/$PAGE_NAME"
      LOCAL_PAGE_PATH="$PAGE_NAME"
    fi

    # Stupid hack since the text "IW" is narrower than "OK" or "NG" and it takes an extra tab to get
    # to the desired level of indentation in the RTF log
    # NOTE(review): the tab characters were restored from the comment above (one tab, plus one extra
    # for "IW"); confirm the exact counts against the rendered RTF report.
    RTF_TABS=$'\t'
    if [ "$STATUS" == "IW" ]; then
      RTF_TABS=$'\t\t'
    fi

    # Record link and its wiki page in TXT, RTF, and HTML markup
    valPrint t "$STATUS ($CURL_RESULT) $STR_TYPE $URL"
    valPrint t " linked from $FULL_PAGE_PATH"
    valPrint r "$STATUS ($CURL_RESULT)$RTF_TABS$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
    valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
    valPrint hn "<tr><td style=\"white-space:nowrap\">$STATUS ($CURL_RESULT)</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
    valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"

    # Notify reader if we can use an interwiki prefix for this URL
    if [ "$STATUS" == "IW" ]; then
      valPrint t " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_SLASH]]"
      valPrint r " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_SLASH]]"
      valPrint hn "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_SLASH]]</td></tr>"
    fi

    # Query Internet Archive for latest "OK" snapshot for "NG" page
    if [ "$STATUS" == "NG" ] && [ "$SUGGEST_SNAPSHOTS" -eq 1 ]; then
      ARCHIVE_QUERY=$(curl --silent --max-time 10 "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")

      # Isolate "url" property in response and log it if a "closest" snapshot was received...
      if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
        SNAPSHOT_URL=${ARCHIVE_QUERY##*\"url\": \"}
        SNAPSHOT_URL=${SNAPSHOT_URL%\", \"timestamp*}
        valPrint t " IA suggests $SNAPSHOT_URL"
        valPrint r " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
        valPrint hn "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
      else # ...otherwise give generic Wayback Machine link for this URL
        valPrint t " Try browsing $ARCHIVE_GENERIC/$URL"
        valPrint r " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
        valPrint hn "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
      fi
    fi
  fi

  # If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
  if [ "$IS_FILE" -eq 0 ] && [ "$TAKE_PAGE_SHOT" -eq 1 ] && [ "$STATUS" == "OK" ]; then
    # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
    SHOT_NAME=$(printf '%s\n' "$URL" | sed 's/https*\:\/\///' | sed 'y/:\//__/')
    SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"

    # Don't take screenshot if we already encountered this page and screenshotted it
    if [ ! -f "$SHOT_FILE" ]; then
      # Chrome writes the image to CHROME_SCREENSHOT in the current directory, so move it from there
      "$CHROME" --headless --disable-gpu --screenshot --window-size=1500,900 "$URL" > /dev/null 2>&1
      if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
        mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
      else
        valPrint trh "Screenshot of URL $URL seems to have failed!"
      fi
    else
      valPrint trh "Skipping screenshot of URL $URL because $SHOT_FILE already exists."
    fi
  fi
done 3< "$LINKS_FILE"
FINISHED_LIST="yes"
wrapupAndExit

Note: See TracBrowser for help on using the repository browser.

Download in other formats: