Context Navigation

source: Validate External Links/validate_external_links.sh@ 1115

Last change on this file since 1115 was 1075, checked in by iritscen, 7 years ago
ValExtLinks: Fixed bug in interwiki link suggestions. Corrected documentation error.
File size: 40.6 KB

Line
#!/bin/bash

# Validate External Links by Iritscen
# Provided with a list of external links found in the OniGalore wiki, this script validates them.
# The resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF
# (for reading as a local file with clickable links), and HTML (for uploading as a web page).
# Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
# Recommended rule:
# ------------------------------------------------------------------------------------------------------

# Set separator token to newline. From here on, unquoted expansions are split on newlines only, which
# is what lets the main loop walk the links file one line at a time.
IFS="
"

### GLOBALS ###
# Settings -- these will be changed from their defaults by the arguments passed in to the script
LINKS_URL="" # use 'curl' to download file with links from this location (can be file://)
EXCEPT_URL="" # ditto above for file with exceptions to NG results
OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES
SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
TAKE_PAGE_SHOT=0 # take a screenshot of each OK page
CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature
URL_START=1 # start at this URL in LINKS_FILE (1 by default)
URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE
UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report

# Fixed strings -- see the occurrences of these variables to learn their purpose
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0"
ARCHIVE_API="http://archive.org/wayback/available"
ARCHIVE_GENERIC="https://web.archive.org/web/*"
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
CHROME_SCREENSHOT="screenshot.png"
CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt"
EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt"
MY_WIKI_PAGE="http://wiki.oni2.net/User:Iritscen"
# Quote both levels so that a script path containing glob characters or newlines still resolves
THIS_DIR=$(cd "$(dirname "$0")" && pwd)
WORKING_DIR=$(pwd)
WIKI_PATH="wiki.oni2.net"

# These are parallel arrays of the IDs and names of OniGalore's current namespaces
declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")

# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
declare -a HTTP_FILES=(txt zip wmv jpg png m4a bsl rar oni mp3 mov ONWC vbs TRMA mp4 doc avi log gif pdf dmg exe cpp tga 7z wav east BINA xml dll dae xaf fbx 3ds blend flv csv)
declare -a HTTP_TLDS_AND_PAGES=(com net org uk ru de it htm html php pl asp aspx shtml pgi cgi php3 x jsp phtml cfm css action stm js)

# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
# if you add a new code.
declare -a OK_CODES=(200 401 405 406 501)
declare -a RD_CODES=(301 302 303 307 308)
declare -a NG_CODES=(000 403 404 410 500 503)

# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
# transcluded text, and if the transclusion fails, then the braces show up in the URL
ILLEGAL_CHARS="{ }"

# The shortest URL possible, used for sanity-checking some URLs: http://a.co
MIN_URL_LENGTH=11

# These are parallel arrays giving the prefixes that can be used in place of normal external links to
# some wikis and other sites
declare -a INTERWIKI_PREFIXES=(commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies wikiversity wikivoyage wikt wp)
declare -a INTERWIKI_DOMAINS=(commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org wikiversity.org wikivoyage.org wiktionary.org wikipedia.org)

# Variables for keeping track of main loop progress and findings
LINK_NUM=0
EI_LINKS=0
IW_LINKS=0
OK_LINKS=0
RD_LINKS=0
NG_LINKS=0
SKIP_UNK_NS=0
SKIP_JS_PAGE=0
SKIP_BAD_URL=0
SKIP_NON_ASCII=0
SKIP_UNK_SUFFIX=0
SKIP_UNK_CODE=0
SKIP_EXPECT_NG=0
SKIP_EXPECT_EI=0
SKIP_EXPECT_IW=0
FILE_LINKS=0
PAGE_LINKS=0
SKIPPED_HEADER_ROW=0
FINISHED_LIST="no" # "no" until the main loop ends; wrapupAndExit treats "no" as a user cancellation

### HELP ###
# A pseudo-man page. Here is the 80-character rule for the page text:
# 234567890123456789012345678901234567890123456789012345678901234567890123456789
# Writes the help text to stdout; takes no arguments. The heredoc delimiter is quoted so that the page
# text is emitted literally even if a '$' or backtick is added to it later.
function printHelp()
{
  cat << 'EOF'

NAME
Validate External Links

SYNOPSIS
validate_external_links.sh --help
validate_external_links.sh --links URL --output DIR [--exceptions URL]
[--record-ok-links] [--suggest-snapshots] [--take-screenshots FILE]
[--start-url NUM] [--end-url NUM] [--upload FILE]

DESCRIPTION
This script parses a list of external links found in the OniGalore wiki
(which is dumped by the Oni2.net domain periodically in a particular
format), validates them using the Unix tool 'curl', and produces a report
of which links were "OK" (responded positively to an HTTP query), which
were "RD" (responded with a 3xx redirect code), which could be "IW"
(interwiki) links, which are "EI" (external internal) links and could be
intrawiki links, and which were "NG" (no good; a negative response to the
query). This report can then be automatically uploaded to the location of
your choice. The script can also suggest Internet Archive snapshots for
"NG" links, and take screenshots of "OK" links for visual verification by
the reader that the page in question is the one intended to be displayed.

You must pass this script the URL at which the list of links is found
(--links) and the path where the directory of logs should be outputted
(--output). All other arguments are optional.

OPTIONS
--help Show this page.
--links URL (required) URL from which to download the CSV
file with external links. Note that this URL can
be a local file if you supply a file:// path.
--output DIR (required) Unix path to directory in which Val
should place its reports.
--exceptions URL In order to remove links from the report which
Val finds an issue with, but which you regard as
OK, list those desired exceptions in this file.
See the sample file exceptions.txt for details.
Note that this URL can point to a local file if
you supply a file:// path.
--record-ok-links Log a link in the report even if its response
code is "OK".
--suggest-snapshots Query the Internet Archive for a possible
snapshot URL for each "NG" page.
--take-screenshots FILE Call the Google Chrome binary at this path to
take screenshots of each "OK" page.
--start-url NUM Start at this link in the links CSV file.
--end-url NUM Stop at this link in the links CSV file.
--upload FILE Upload report using the credentials and path
given in this local text file. See sftp_login.txt
for template.

BUGS
The script cannot properly parse any line in the external links file
which contains a comma in the name of the wiki page containing a link.
Commas in the link itself are not an issue.
EOF
}

### SETUP ###
# If first argument is a help request, or if nothing was passed in at all, print help page and quit
if [[ "$#" -eq 0 || "$1" == "--help" ]]; then
  printHelp | less
  exit 0
fi

# Parse arguments as long as there are more arguments to process
while (( "$#" )); do
  # Options that take a value must be followed by one. Without this check, 'shift 2' on a lone
  # trailing option fails without shifting anything and the loop would never terminate.
  case "$1" in
    --links | --exceptions | --output | --take-screenshots | --start-url | --end-url | --upload )
      if [[ "$#" -lt 2 ]]; then
        echo "Argument $1 requires a value but none was supplied. Aborting."
        exit 1
      fi
      ;;
  esac

  case "$1" in
    --links )             LINKS_URL="$2"; shift 2;;
    --exceptions )        EXCEPT_URL="$2"; shift 2;;
    --output )            OUTPUT_DIR="$2"; shift 2;;
    --record-ok-links )   RECORD_OK_LINKS=1; shift;;
    --suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;;
    --take-screenshots )  TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
    --start-url )         URL_START="$2"; shift 2;;
    --end-url )           URL_LIMIT="$2"; shift 2;;
    --upload )            UPLOAD_INFO="$2"; shift 2;;
    * )                   echo "Invalid argument $1 detected. Aborting."; exit 1;;
  esac
done

# If the required arguments were not supplied, tell the user where to find the documentation and quit
if [[ -z "$LINKS_URL" || -z "$OUTPUT_DIR" ]]; then
  echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
  exit 2
fi

# If user wants screenshots, make sure path to Chrome was passed in and is valid
if [[ "$TAKE_PAGE_SHOT" -eq 1 ]]; then
  if [[ ! -f "$CHROME_PATH" ]]; then
    echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
    exit 3
  fi
fi

# Check that UPLOAD_INFO exists, if this argument was supplied
if [[ -n "$UPLOAD_INFO" && ! -f "$UPLOAD_INFO" ]]; then
  echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
  exit 4
fi

# Check that OUTPUT_DIR is a directory
if [[ ! -d "$OUTPUT_DIR" ]]; then
  echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
  exit 5
fi

# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
SHOT_PATH="$OUTPUT_PATH/Screenshots"
LOG_NAME="ValExtLinks report"
LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt"
LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf"
LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm"
mkdir "$OUTPUT_PATH"

# Check that 'mkdir' succeeded before trying to create anything inside the new folder
if [[ ! -d "$OUTPUT_PATH" ]]; then
  echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_DIR. Aborting."
  exit 6
fi
if [[ "$TAKE_PAGE_SHOT" -eq 1 ]]; then
  mkdir "$SHOT_PATH"
fi

# Get date on the file at LINKS_URL and print to log. Header names are case-insensitive, so match
# "last-modified" in any capitalization and take only the first occurrence.
LINKS_DATE=$(curl --silent --head "$LINKS_URL" | grep -i -m 1 "^Last-Modified:")
if [[ -z "$LINKS_DATE" ]]; then
  echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
  exit 7
fi
LINKS_DATE=${LINKS_DATE#*:}     # drop the header name, whatever its capitalization
LINKS_DATE=${LINKS_DATE# }      # drop the space that follows the colon
LINKS_DATE=${LINKS_DATE%$'\r'}  # HTTP header lines end in CRLF; drop the carriage return

### UTILITY FUNCTIONS ###
# Writes a plain-text header to TXT log file via valPrint. Takes no arguments.
# Reads globals NICE_TIME, LINKS_DATE and MY_WIKI_PAGE.
function printTXTheader()
{
  valPrint t "Validate External Links report"
  valPrint t "generated $NICE_TIME"
  valPrint t "from data of $LINKS_DATE"
  valPrint t "script by Iritscen (contact: $MY_WIKI_PAGE)"
  valPrint t ""
}

# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified.
# Takes no arguments. Reads globals NICE_TIME, LINKS_DATE and MY_WIKI_PAGE. The whole header is passed
# to valPrint as one multi-line string, so the continuation lines below must stay at column 0.
function printRTFheader()
{
  valPrint r "{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww12600\viewh12100\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0

\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
generated $NICE_TIME\\
from data of $LINKS_DATE\\
script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$MY_WIKI_PAGE\"}}{\fldrslt contact}})
\\
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
\cf0 "
}

# Closes the RTF markup of the RTF log file by writing the final closing brace. Takes no arguments.
function printRTFfooter()
{
  valPrint r "}"
}

# Writes the HTML header to HTML log file: document head, report title, and the generation details.
# Takes no arguments. Reads globals NICE_TIME, LINKS_DATE and MY_WIKI_PAGE. The markup is passed to
# valPrint as one multi-line string, so the continuation lines below must stay at column 0.
function printHTMheader()
{
  valPrint h "<html>
<head>
<title>Validate External Links report</title>
</head>
<body>
<h2>Validate External Links report</h2>
<h3>generated $NICE_TIME<br />
from data of $LINKS_DATE<br />
script by Iritscen (<a href=\"$MY_WIKI_PAGE\" target=\"_blank\">contact</a>)</h3>"
}

# Closes the HTML markup of the HTML log file by writing the closing body and html tags. Takes no
# arguments.
function printHTMfooter()
{
  valPrint h "</body>
</html>"
}

# The central logging function. The first parameter is a string composed of one or more characters that
# indicate which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
# 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 'w' means "Don't
# pass console output through 'fmt'" ("fmt" fits the output to an 80-column CLI but can break special
# formatting and the 'n' option). The second parameter is the text to print.
# Reads globals LOG_TXT, LOG_RTF and LOG_HTM (the log files appended to).
# Each selector letter is looked for anywhere in the first parameter, so combined selectors such as
# "ctrh" or "cw" reach every output they name.
function valPrint()
{
  local targets=$1
  local text=$2

  # Console
  if [[ "$targets" == *c* ]]; then
    if [[ "$targets" == *n* ]]; then
      printf '%s' "$text"
    elif [[ "$targets" == *w* ]]; then
      printf '%s\n' "$text"
    else
      printf '%s\n' "$text" | fmt -w 80
    fi
  fi

  # Plain-text log
  if [[ "$targets" == *t* ]]; then
    if [[ "$targets" == *n* ]]; then
      printf '%s' "$text" >> "$LOG_TXT"
    else
      printf '%s\n' "$text" >> "$LOG_TXT"
    fi
  fi

  # RTF log: a trailing backslash is RTF's line break, so 'n' here means "omit the backslash"; the
  # newline in the file itself is always written
  if [[ "$targets" == *r* ]]; then
    if [[ "$targets" == *n* ]]; then
      printf '%s\n' "$text" >> "$LOG_RTF"
    else
      printf '%s\\\n' "$text" >> "$LOG_RTF"
    fi
  fi

  # HTML log: 'n' here means "omit the <br />"; the newline in the file itself is always written
  if [[ "$targets" == *h* ]]; then
    if [[ "$targets" == *n* ]]; then
      printf '%s\n' "$text" >> "$LOG_HTM"
    else
      printf '%s<br />\n' "$text" >> "$LOG_HTM"
    fi
  fi
}

# Pluralize the string in parameter 1 if the number in parameter 2 is not 1. Nouns ending in "x" get
# "es" appended (suffix -> suffixes); every other noun gets "s". The result is written to stdout.
function pluralCheckNoun()
{
  if [[ "$2" -ne 1 ]]; then
    if [[ "$1" =~ x$ ]]; then
      printf '%s\n' "${1}es"
    else
      printf '%s\n' "${1}s"
    fi
  else
    printf '%s\n' "$1"
  fi
}

# Output "is" if parameter 1 is 1, otherwise "are"
function pluralCheckIs()
{
  if [[ "$1" -ne 1 ]]; then
    echo "are"
  else
    echo "is"
  fi
}

# Output "was" if parameter 1 is 1, otherwise "were"
function pluralCheckWas()
{
  if [[ "$1" -ne 1 ]]; then
    echo "were"
  else
    echo "was"
  fi
}

# Output "a " (with its trailing space, so callers can put it directly before a noun) if parameter 1
# is 1, otherwise nothing
function pluralCheckA()
{
  if [[ "$1" -eq 1 ]]; then
    echo "a "
  fi
}

# Output "an " (with its trailing space, so callers can put it directly before a noun) if parameter 1
# is 1, otherwise nothing
function pluralCheckAn()
{
  if [[ "$1" -eq 1 ]]; then
    echo "an "
  fi
}

# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
# reports being saved to disk have already been closed.
# Reads globals THIS_DIR, EXPECT_SCRIPT_NAME, UPLOAD_INFO, LOG_HTM and LOG_NAME. Each credential is
# taken from the first line of UPLOAD_INFO containing its marker ("user:", "pw:", "port:", "path:"),
# with the marker stripped from the front of that line.
# Every value is passed to 'expect' quoted, so an empty or unusual value (spaces, glob characters)
# cannot shift or multiply the positional arguments that the expect script receives.
# NOTE(review): the password is handed to 'expect' as a command-line argument, as in the original;
# arguments are generally visible to other local users through process listings.
function uploadReport()
{
  valPrint c "Uploading HTML report..."

  SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
  SFTP_USER_NAME_MARKER="user:"
  SFTP_PASSWORD_MARKER="pw:"
  SFTP_PORT_MARKER="port:"
  SFTP_PATH_MARKER="path:"
  SFTP_USER_NAME=$(grep -m 1 -- "$SFTP_USER_NAME_MARKER" "$UPLOAD_INFO")
  SFTP_USER_NAME=${SFTP_USER_NAME#"$SFTP_USER_NAME_MARKER"}
  SFTP_PASSWORD=$(grep -m 1 -- "$SFTP_PASSWORD_MARKER" "$UPLOAD_INFO")
  SFTP_PASSWORD=${SFTP_PASSWORD#"$SFTP_PASSWORD_MARKER"}
  SFTP_PORT=$(grep -m 1 -- "$SFTP_PORT_MARKER" "$UPLOAD_INFO")
  SFTP_PORT=${SFTP_PORT#"$SFTP_PORT_MARKER"}
  SFTP_PATH=$(grep -m 1 -- "$SFTP_PATH_MARKER" "$UPLOAD_INFO")
  SFTP_PATH=${SFTP_PATH#"$SFTP_PATH_MARKER"}

  expect "$SCRIPT_PATH" "$LOG_HTM" "$SFTP_USER_NAME" "$SFTP_PASSWORD" "$SFTP_PORT" "$SFTP_PATH" "$LOG_NAME.htm"

  valPrint c "Report was uploaded, unless an error message appears above."
}

# Prints session summary when script is done: writes the totals to the console and all three logs,
# closes the RTF and HTML markup, uploads the report if --upload was given and the session was not
# canceled, then exits with status 0. Also installed below as the handler for Ctrl-C (SIGINT).
# Reads the progress counters (LINK_NUM, URL_START, LINK_COUNT, the *_LINKS and SKIP_* globals),
# FINISHED_LIST and UPLOAD_INFO; sets LINKS_PROCESSED, LINKS_SKIPPED and LINKS_CHECKED.
function wrapupAndExit()
{
  # Restore the default Ctrl-C behavior so a second interrupt during the wrap-up (e.g. while
  # uploading) ends the script instead of starting the wrap-up over
  trap - INT

  # Get off progress line on console, drop down a line from last link in log, and close HTML table
  valPrint ctr ""
  valPrint h "</table><br />"

  # If we didn't finish processing the last URL, then the iterator is one too high
  if [[ "$FINISHED_LIST" != "yes" ]]; then
    LINK_NUM=$((LINK_NUM - 1))
    if [[ "$FINISHED_LIST" == "no" ]]; then
      valPrint ctrh "The session was canceled by the user."
    fi
  fi

  # LINK_COUNT has not been set yet if we were called before the links file was counted (for
  # instance after a failed download), so fall back to zero rather than passing an empty number on
  LINK_COUNT=${LINK_COUNT:-0}

  # Output results of session and close the log file's markup
  LINKS_PROCESSED=$((LINK_NUM - URL_START + 1))
  if [[ "$LINKS_PROCESSED" -lt 0 ]]; then
    LINKS_PROCESSED=0 # we stopped before reaching URL_START
  fi
  LINKS_SKIPPED=$((SKIP_UNK_NS + SKIP_JS_PAGE + SKIP_BAD_URL + SKIP_NON_ASCII + SKIP_UNK_SUFFIX + SKIP_UNK_CODE))
  LINKS_CHECKED=$((LINKS_PROCESSED - LINKS_SKIPPED))
  valPrint ct "Summary:"
  valPrint r "\b1 Summary \b0"
  valPrint hn "<h3><span id=\"summary\">Summary</span></h3>"
  valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link "$LINK_COUNT")."
  valPrint ctrh "I skipped $LINKS_SKIPPED $(pluralCheckNoun link "$LINKS_SKIPPED"), and found $FILE_LINKS $(pluralCheckNoun file "$FILE_LINKS") and $PAGE_LINKS $(pluralCheckNoun page "$PAGE_LINKS")."
  if [[ "$LINKS_SKIPPED" -gt 0 ]]; then valPrint ctrh "Skip breakdown: "; fi
  if [[ "$SKIP_UNK_NS" -gt 0 ]]; then valPrint ctrh "- $SKIP_UNK_NS unknown $(pluralCheckNoun namespace "$SKIP_UNK_NS")"; fi
  if [[ "$SKIP_JS_PAGE" -gt 0 ]]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link "$SKIP_JS_PAGE") on $(pluralCheckA "$SKIP_JS_PAGE")JavaScript $(pluralCheckNoun page "$SKIP_JS_PAGE")"; fi
  if [[ "$SKIP_BAD_URL" -gt 0 ]]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL "$SKIP_BAD_URL")"; fi
  if [[ "$SKIP_NON_ASCII" -gt 0 ]]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL "$SKIP_NON_ASCII")"; fi
  if [[ "$SKIP_UNK_SUFFIX" -gt 0 ]]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix "$SKIP_UNK_SUFFIX")"; fi
  if [[ "$SKIP_UNK_CODE" -gt 0 ]]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code "$SKIP_UNK_CODE")"; fi
  valPrint ctrh "Out of the $LINKS_CHECKED links checked, $EI_LINKS could be $(pluralCheckAn "$EI_LINKS")intrawiki $(pluralCheckNoun link "$EI_LINKS"), $IW_LINKS could be $(pluralCheckAn "$IW_LINKS")interwiki $(pluralCheckNoun link "$IW_LINKS"), $OK_LINKS $(pluralCheckWas "$OK_LINKS") OK, $RD_LINKS $(pluralCheckWas "$RD_LINKS") $(pluralCheckA "$RD_LINKS")redirection $(pluralCheckNoun notice "$RD_LINKS"), and $NG_LINKS $(pluralCheckWas "$NG_LINKS") NG."
  if [[ "$SKIP_EXPECT_NG" -gt 0 ]]; then
    valPrint ctrh "$SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link "$NG_LINKS") went unlisted due to being found in the exceptions file."
  fi
  if [[ "$SKIP_EXPECT_EI" -gt 0 ]]; then
    valPrint ctrh "$SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link "$EI_LINKS") went unlisted due to being found in the exceptions file."
  fi
  if [[ "$SKIP_EXPECT_IW" -gt 0 ]]; then
    # IW links are the interwiki ones (intrawiki links are counted under EI above)
    valPrint ctrh "$SKIP_EXPECT_IW/$IW_LINKS potential interwiki $(pluralCheckNoun link "$IW_LINKS") went unlisted due to being found in the exceptions file."
  fi
  valPrint trh "ValExtLinks says goodbye."
  printRTFfooter
  printHTMfooter

  # Upload report if this was requested
  if [[ -n "$UPLOAD_INFO" && "$FINISHED_LIST" != "no" ]]; then
    uploadReport
  fi

  # Really quit now
  valPrint c "ValExtLinks says goodbye."
  exit 0
}
trap wrapupAndExit INT

### INITIALIZATION ###
# Print opening message to console and log files
valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
printTXTheader
printRTFheader
printHTMheader

# Attempt to download file at LINKS_URL, then check that it succeeded. The local file is named after
# everything following the last slash in the URL. With --fail, 'curl' writes no file when the server
# answers with an HTTP error, so an error page cannot be mistaken for the list of links.
valPrint cwtrh "Downloading list of external links from $LINKS_URL."
LINKS_FILE_NAME=${LINKS_URL##*/}
LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
curl --silent --fail -o "$LINKS_FILE" "$LINKS_URL"
if [[ ! -f "$LINKS_FILE" ]]; then
  echo "The download of $LINKS_URL appears to have failed. Aborting."
  wrapupAndExit
fi

# Attempt to download file at EXCEPT_URL, then check that it succeeded
if [[ -n "$EXCEPT_URL" ]]; then
  valPrint cwtrh "Downloading list of reporting exceptions from $EXCEPT_URL."
  EXCEPT_FILE_NAME=${EXCEPT_URL##*/}
  EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
  curl --silent --fail -o "$EXCEPT_FILE" "$EXCEPT_URL"
  if [[ ! -f "$EXCEPT_FILE" ]]; then
    echo "The download of $EXCEPT_URL appears to have failed. Aborting."
    wrapupAndExit
  fi
fi

# Feed LINKS_FILE to 'wc' on stdin because passing it as an argument would print the file name after
# the line count
LINK_COUNT_STRING=$(wc -l < "$LINKS_FILE")

# Number of URLs is number of lines minus one (first line is column header row for the CSV). Any
# whitespace padding around the count is stripped before doing the arithmetic.
LINK_COUNT=${LINK_COUNT_STRING//[[:space:]]/}
LINK_COUNT=$((LINK_COUNT - 1))

# Calculate number of URLs to consider
if [[ "$URL_LIMIT" -ne 0 && "$URL_LIMIT" -lt "$LINK_COUNT" ]]; then
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((URL_LIMIT-URL_START+1)) of them, from link $URL_START to $URL_LIMIT."
elif [[ "$URL_START" -ne 1 ]]; then
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((LINK_COUNT-URL_START+1)) of them, from link $URL_START to $LINK_COUNT."
else
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering all of them."
fi

# Print settings to console and log. The message is stored one word per array element, except for the
# quoted elements (indices 10, 22, 26 and 40), which are the parts swapped out below according to the
# settings in effect.
declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not report links that are listed in the exceptions file.")
if [[ "$TAKE_PAGE_SHOT" -eq 0 ]]; then SETTINGS_MSG[10]="but will not"; fi
if [[ "$RECORD_OK_LINKS" -eq 0 ]]; then SETTINGS_MSG[22]="not"; fi
if [[ "$SUGGEST_SNAPSHOTS" -eq 0 ]]; then SETTINGS_MSG[26]="will not"; fi
if [[ -z "$EXCEPT_URL" || "$RECORD_OK_LINKS" -eq 1 ]]; then SETTINGS_MSG[40]=""; fi
# Assigning [@] to a scalar joins the elements with single spaces even though IFS is a newline here
SETTINGS_STR=${SETTINGS_MSG[@]}
valPrint ctrh "$SETTINGS_STR"
valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
valPrint trh ""

# Print legend to logs. Entries that contain a link are written three times, once per log format,
# because each format has its own hyperlink markup.
valPrint t "Legend:"
valPrint r "\b1 Legend \b0"
valPrint hn "<h3>Legend</h3>"
valPrint trh "OK = URL seems to be working."
valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it, because false negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to Iritscen. An NG link should be followed by a link to the Internet Archive's Wayback Machine which may help you repair the link. If the link cannot be repaired, you can disable it on the wiki (which prevents it from showing up in future ValExtLinks reports) by wrapping it in nowiki tags."
valPrint trh "RD = The server responding to this URL is saying that the page moved and you should instead use the supplied new URL. Some RD links represent minor adjustments in the organization of a web site, and some are soft 404s (the file/page has been removed and you are being redirected to something like the main page of the web site). You will have to look at the new URL yourself to determine if it represents an OK link and the link on the wiki should be updated to this one, or if the desired file/page is actually gone and we need to replace the wiki link with an Internet Archive snapshot link -- or disable the URL if it has not been archived."
valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup."
valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup."
valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $HTTP_CODES)."
valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$HTTP_CODES\"}}{\fldrslt here}} for code reference)."
valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$HTTP_CODES\" target=\"_blank\">here</a> for code reference)."
valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $CURL_CODES)."
valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$CURL_CODES\"}}{\fldrslt here}} for code reference)."
valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$CURL_CODES\" target=\"_blank\">here</a> for code reference)."
valPrint trh "IA suggests = Last available snapshot suggested by the Internet Archive."
valPrint trh "Try browsing = The Archive occasionally fails to return a snapshot URL even when one exists, so you will need to check for a snapshot manually using this link to the Wayback Machine before concluding that a site has not been archived."
valPrint trh ""

542	### MAIN LOOP ###
543	# Process each line of the .csv in LINKS_FILE
544	for LINE in `cat "$LINKS_FILE"`; do
545	let LINK_NUM+=1
546
547	# First line is the column header row for the CSV, so let's verify that the format hasn't changed
548	if [ $SKIPPED_HEADER_ROW -eq 0 ]; then
549	if [ $LINE == "namespace,title,target" ]; then
550	SKIPPED_HEADER_ROW=1
551	LINK_NUM=0 # this line is it's not a link, so reset the link counter
552	valPrint hn "<table>"
553	continue
554	else
555	valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
556	wrapupAndExit
557	fi
558	fi
559
560	# Skip this link if we are not at URL_START yet
561	if [ $LINK_NUM -lt $URL_START ]; then
562	continue
563	fi
564
565	# Stop if we are at the limit declared for testing purposes
566	if [ $URL_LIMIT -gt 0 ] && [ $LINK_NUM -gt $URL_LIMIT ]; then
567	FINISHED_LIST="limit"
568	wrapupAndExit
569	fi
570
571	# Print progress to screen
572	if [ $LINK_NUM -gt 1 ]; then
573	printf "\e[1A\n" # erase previous progress message so that new one appears in its place
574	fi
575	valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
576
577	# The number of the namespace is the element before the first comma on the line
578	NS_ID=${LINE%%,*}
579
580	# Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
581	NS_NAME=""
582	a=0
583	while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
584	if [ $NS_ID -eq ${NS_IDS[$a]} ]; then
585	NS_NAME="${NS_NAMES[$a]}"
586	break
587	fi
588	let a+=1
589	done
590	if [ -z "$NS_NAME" ]; then
591	valPrint tr "Skipping URL found on page $PAGE_NAME because I could not find a name for namespace ID $NS_ID."
592	let SKIP_UNK_NS+=1
593	continue
594	fi
595
596	# The name of the page is everything between the namespace ID and the next comma on the line (commas
597	# in page names will break this)
598	PAGE_NAME=${LINE#$NS_ID,}
599	PAGE_NAME=${PAGE_NAME%%,*}
600
601	# We don't want to consider wiki pages ending in .js, as the parser cannot reliably isolate URLS in
602	# JavaScript code, so it will return erroneous links
603	PAGE_NAME_SUFFIX=$(echo $PAGE_NAME \| sed 's/.*\.//')
604	if [ $PAGE_NAME_SUFFIX == "js" ]; then
605	valPrint tr "Skipping URL found on JavaScript page $PAGE_NAME."
606	let SKIP_JS_PAGE+=1
607	continue
608	fi
609
610	# Build longer wiki page URLs from namespace and page names
611	FULL_PAGE_PATH=http://$WIKI_PATH/$NS_NAME:$PAGE_NAME
612	LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
613	# Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it
614	# explicitly breaks the link
615	if [ $NS_ID -eq 0 ]; then
616	FULL_PAGE_PATH=http://$WIKI_PATH/$PAGE_NAME
617	LOCAL_PAGE_PATH=$PAGE_NAME
618	fi
619
620	# The URL being linked to is everything after the previous two fields (this allows commas to be in
621	# the URLs, but a comma in the previous field, the page name, will break this)
622	URL=${LINE#$NS_ID,$PAGE_NAME,}
623
624	# Scan for illegal characters
625	if [[ $URL == [$ILLEGAL_CHARS] ]]; then
626	valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because it contains characters illegal in a URL."
627	let SKIP_BAD_URL+=1
628	continue
629	fi
630
631	# Now we need to know if the URL is for a file or a web page. First step is to determine if the
632	# URL ends in a suffix
633	HAS_SUFFIX=0
634
635	# If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
636	CLEAN_URL=${URL%%\?*}
637
638	# If the URL ends in something like "#section_15", strip everything from the '#' onward
639	CLEAN_URL=${CLEAN_URL%%\#*}
640
641	# 'sed' cannot handle Unicode in my Bash shell, so skip this URL and make user check it
642	if [[ $CLEAN_URL == [![:ascii:]] ]]; then
643	valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I cannot handle non-ASCII characters."
644	let SKIP_NON_ASCII+=1
645	continue
646	fi
647
648	# Isolate the characters after the last period and after the last slash
649	POST_DOT=$(echo "$CLEAN_URL" \| sed 's/.*\.//')
650	POST_SLASH=$(echo "$CLEAN_URL" \| sed 's/.*\///')
651
652	# If the last period comes after the last slash, then the URL ends in a suffix
653	POST_DOT_LENGTH=$(echo \| awk -v input=$POST_DOT '{print length(input)}')
654	POST_SLASH_LENGTH=$(echo \| awk -v input=$POST_SLASH '{print length(input)}')
655	if [ $POST_DOT_LENGTH -lt $POST_SLASH_LENGTH ]; then
656	HAS_SUFFIX=1
657	else
658	HAS_SUFFIX=0
659	fi
660
661	# Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
662	# known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES
663	IS_FILE=-1
664	if [ $HAS_SUFFIX -eq 0 ]; then
665	IS_FILE=0
666	else
667	# Turn off case sensitivity while we compare suffixes
668	shopt -s nocasematch
669
670	# Special case: URLs ending in something like "/productID.304297400" are pages, not files, so if
671	# the URL's suffix is all numbers, we are looking at the end of a web page URL
672	if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
673	IS_FILE=0
674	fi
675
676	# If we did not identify this URL as a web page above, we need to compare the suffix against known
677	# file extensions
678	if [ $IS_FILE -eq -1 ]; then
679	for EXTENSION in "${HTTP_FILES[@]}"; do
680	if [[ $EXTENSION == $POST_DOT ]]; then # double brackets respect the nocasematch setting
681	IS_FILE=1
682	break
683	fi
684	done
685	fi
686
687	# If we did not identify this URL as a file above, we need to compare the suffix against known
688	# pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
689	# needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
690	if [ $IS_FILE -eq -1 ]; then
691	for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
692	if [[ $PAGE_OR_TLD == $POST_DOT ]]; then
693	IS_FILE=0
694	break
695	fi
696	done
697	fi
698
699	# Turn case sensitivity back on in Bash
700	shopt -u nocasematch
701	fi
702
# Translate IS_FILE into the label used in the logs and bump the matching counter. A suffix that
# was recognized as neither a file, a page nor a TLD is reported to the user and the URL is skipped
STR_TYPE=""
case $IS_FILE in
  -1)
    valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown URL ending $POST_DOT. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
    SKIP_UNK_SUFFIX=$((SKIP_UNK_SUFFIX + 1))
    continue
    ;;
  1)
    STR_TYPE="file"
    FILE_LINKS=$((FILE_LINKS + 1))
    ;;
  0)
    STR_TYPE="page"
    PAGE_LINKS=$((PAGE_LINKS + 1))
    ;;
esac
716
# Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
# issue with sites that require HTTPS. Only the headers are requested (--head) and the body is
# discarded, so the substitution captures just the HTTP status written by --write-out.
# $AGENT must be in double quotes: wrapped in single quotes it was never expanded and the literal
# text "$AGENT" was sent as the user agent. $URL is quoted so that '?', '*' or '[' in it cannot
# be expanded as a file name pattern
CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{http_code}\n' "$URL")

# An assignment from a command substitution leaves the exit status of 'curl' in $?
CURL_ERR=$?
CURL_RESULT=$CURL_CODE

# Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
if [ "$CURL_CODE" == "000" ]; then
  CURL_RESULT="$CURL_RESULT-$CURL_ERR"
fi
727
# Begin to determine our status code for this URL (EI, IW, OK, RD, or NG)
STATUS="??"
NEW_URL=""
INTERWIKI_INDEX=-1

# First make sure that this isn't an "external internal" link to our own wiki that can be replaced
# by "[[page_name]]". If it uses a special access URL beginning with "/w/", let it pass, as it
# probably cannot be replaced by "[[ ]]" markup.
# These are containment tests, so the wildcards are required: without them the first test only
# succeeds when the URL is exactly $WIKI_PATH, which no link to a wiki page can be. The variable
# part is quoted so that it is matched literally
if [[ $URL == *"$WIKI_PATH"* ]] && [[ $URL != *"$WIKI_PATH/w/"* ]]; then
  STATUS="EI"
  EI_LINKS=$((EI_LINKS + 1))
fi
740
# If it's not, check if this is a link to a domain that we have an interwiki prefix for. The index
# of the matching domain is kept in INTERWIKI_INDEX so that the prefix at the same position in
# INTERWIKI_PREFIXES can be suggested later.
# $STATUS is quoted because its "??" value is itself a glob pattern: unquoted, it would be replaced
# by any two-character file names in the working directory. The domain tests are containment
# tests, so they need their wildcards; the domain itself is quoted to be matched literally
if [ "$STATUS" == "??" ]; then
  for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
    if [[ $URL == *"${INTERWIKI_DOMAINS[$i]}"* ]] && [[ $URL != *"${INTERWIKI_DOMAINS[$i]}/w/"* ]]; then
      STATUS="IW"
      IW_LINKS=$((IW_LINKS + 1))
      INTERWIKI_INDEX=$i
      break
    fi
  done
fi
752
# If we didn't match an interwiki domain, see if the status code is in our "OK" codes list.
# $STATUS is quoted because its "??" value is a glob pattern: unquoted, it would expand to any
# two-character file names in the working directory, break the test, and skip this check
if [ "$STATUS" == "??" ]; then
  for CODE in "${OK_CODES[@]}"; do
    if [[ $CODE == "$CURL_CODE" ]]; then
      STATUS="OK"
      OK_LINKS=$((OK_LINKS + 1))
      break
    fi
  done
fi
763
# If we didn't get a match with the "OK" codes, check it against the "RD" codes. A redirect that
# only upgrades http:// to https:// and/or appends a trailing '/' is counted as "OK"; any other
# redirect is "RD". $STATUS is quoted because its "??" value would otherwise be glob-expanded
if [ "$STATUS" == "??" ]; then
  for CODE in "${RD_CODES[@]}"; do
    if [[ $CODE == "$CURL_CODE" ]]; then
      # Get URL header again in order to retrieve the URL we are being redirected to. $AGENT has
      # to be in double quotes or the literal text "$AGENT" is sent as the user agent
      NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{redirect_url}\n' "$URL")

      # Filter out cases where the redirect URL is just the original URL with https:// instead of
      # http://, or with an added '/' at the end. These corrections happen a lot and are not
      # important to us.
      URL_NO_PROTOCOL=${URL#http://}
      URL_NO_PROTOCOL=${URL_NO_PROTOCOL%/}
      NEW_URL_NO_PROTOCOL=${NEW_URL#https://}
      NEW_URL_NO_PROTOCOL=${NEW_URL_NO_PROTOCOL%/}

      # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
      NEW_URL_LENGTH=${#NEW_URL_NO_PROTOCOL}
      if [ "$NEW_URL_LENGTH" -lt "$MIN_URL_LENGTH" ]; then
        NEW_URL_NO_PROTOCOL="[new URL not retrieved]"
      fi

      # If the URLs match after the above filters were applied, then the link is OK. Both sides
      # are quoted: the placeholder above is a bracket expression if left open to globbing
      if [ "$URL_NO_PROTOCOL" == "$NEW_URL_NO_PROTOCOL" ]; then
        STATUS="OK"
        OK_LINKS=$((OK_LINKS + 1))
      else
        STATUS="RD"
        RD_LINKS=$((RD_LINKS + 1))
      fi
      break
    fi
  done
fi
797
# If we didn't get a match with the "RD" codes, check it against the "NG" codes. $STATUS is quoted
# because its "??" value is a glob pattern and would otherwise be expanded against the working
# directory before the comparison is made
if [ "$STATUS" == "??" ]; then
  for CODE in "${NG_CODES[@]}"; do
    if [[ $CODE == "$CURL_CODE" ]]; then
      STATUS="NG"
      NG_LINKS=$((NG_LINKS + 1))
      break
    fi
  done
fi
808
# If we didn't match a known status code, advise the reader and move on to the next URL. $STATUS
# is quoted because its "??" value is a glob pattern; if it were expanded to file names, this test
# would fail and a URL with an unrecognized code would be processed further instead of skipped
if [ "$STATUS" == "??" ]; then
  valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown return code $CURL_CODE."
  SKIP_UNK_CODE=$((SKIP_UNK_CODE + 1))
  continue
fi
815
# Check problem links against exceptions file before proceeding. This only happens when the user
# supplied an exceptions URL
if [ "$STATUS" != "OK" ] && [ -n "$EXCEPT_URL" ]; then
  # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
  EXPECT_CODE="$CURL_RESULT"
  if [ "$STATUS" == "EI" ] || [ "$STATUS" == "IW" ]; then
    EXPECT_CODE="$STATUS"
  fi

  # Look for link in exceptions file and make sure its listed result code and wiki page also match.
  # -F makes 'grep' treat the URL as a fixed string; as a regular expression, its '.', '?' and '['
  # characters would match unintended lines or be rejected as an invalid pattern. The text after
  # the last comma of the matching line is taken as the page and the text before the first comma
  # as the result code
  GREP_RESULT=$(grep -F --max-count=1 -- "$URL" "$EXCEPT_FILE")
  EXCEPT_PAGE=${GREP_RESULT##*,}

  # A page of "*" in the exceptions file applies to this URL on any wiki page
  if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == "$LOCAL_PAGE_PATH" ]; then
    EXCEPT_CODE=${GREP_RESULT%%,*}
    if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
      valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because its expected result, $EXPECT_CODE, is listed in the exceptions file."
      if [ "$STATUS" == "EI" ]; then
        SKIP_EXPECT_EI=$((SKIP_EXPECT_EI + 1))
      elif [ "$STATUS" == "IW" ]; then
        SKIP_EXPECT_IW=$((SKIP_EXPECT_IW + 1))
      else
        SKIP_EXPECT_NG=$((SKIP_EXPECT_NG + 1))
      fi
      continue
    fi
  fi
fi
844
# If appropriate, record this link to the log, with clickable URLs when possible. Every entry is
# written three times through valPrint: to the TXT log ("t"), the RTF log ("r") and the HTML log
# ("hn")
if [ "$STATUS" != "OK" ] || [ "$RECORD_OK_LINKS" -eq 1 ]; then
  # Stupid hack since the strings "IW" and "EI" are narrower than "OK", "RD", or "NG" and it takes
  # an extra tab to get to the desired level of indentation in the RTF log.
  # NOTE(review): both values below are identical as they stand, which makes this test a no-op;
  # the comment above implies one tab versus two tabs -- confirm against the repository copy
  RTF_TABS=" "
  if [ "$STATUS" == "IW" ] || [ "$STATUS" == "EI" ]; then
    RTF_TABS=" "
  fi

  # Record link and its wiki page in TXT, RTF, and HTML markup
  valPrint t "$STATUS ($CURL_RESULT) $STR_TYPE $URL"
  valPrint t " linked from $FULL_PAGE_PATH"
  valPrint r "$STATUS ($CURL_RESULT)$RTF_TABS$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
  valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
  valPrint hn "<tr><td style=\"white-space:nowrap\">$STATUS ($CURL_RESULT)</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
  valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"

  # Record redirect URL if one was given by a 3xx response page
  if [ "$STATUS" == "RD" ]; then
    valPrint t " Server suggests $NEW_URL"
    valPrint r " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
    valPrint hn "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
  fi

  # Notify reader if we can use an intrawiki link for this URL. The page name is what is left after
  # removing the shortest leading "protocol://host/" from the URL; the wildcards are essential, as
  # without them the expansion only strips a literal ":///" prefix, which no URL starts with
  if [ "$STATUS" == "EI" ]; then
    INTRA_PAGE=${URL#*://*/}
    valPrint t " Just use [[$INTRA_PAGE]]"
    valPrint r " Just use [[$INTRA_PAGE]]"
    valPrint hn "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
  fi

  # Notify reader if we can use an interwiki prefix for this URL. The page name is everything after
  # the last '/' in the URL
  if [ "$STATUS" == "IW" ]; then
    INTER_PAGE=${URL##*/}
    valPrint t " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
    valPrint r " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
    valPrint hn "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
  fi

  # Query Internet Archive for latest "OK" snapshot for "NG" page
  if [ "$STATUS" == "NG" ] && [ "$SUGGEST_SNAPSHOTS" -eq 1 ]; then
    ARCHIVE_QUERY=$(curl --silent --max-time 10 "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")

    # Isolate "url" property in response and log it if a "closest" snapshot was received. This is
    # a containment test on the whole response, so the pattern needs a wildcard on each side
    if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
      SNAPSHOT_URL=${ARCHIVE_QUERY##*\"url\": \"}
      SNAPSHOT_URL=${SNAPSHOT_URL%\", \"timestamp*}
      valPrint t " IA suggests $SNAPSHOT_URL"
      valPrint r " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
      valPrint hn "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
    else # ...otherwise give generic Wayback Machine link for this URL
      valPrint t " Try browsing $ARCHIVE_GENERIC/$URL"
      valPrint r " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
      valPrint hn "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
    fi
  fi
fi
903
# If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
if [ "$IS_FILE" -eq 0 ] && [ "$TAKE_PAGE_SHOT" -eq 1 ] && [ "$STATUS" == "OK" ]; then
  # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
  SHOT_NAME=$(echo "$URL" | sed -e 's/https*\:\/\///' -e 'y/:\//__/')
  SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"

  # Don't take screenshot if we already encountered this page and screenshotted it
  if [ ! -f "$SHOT_FILE" ]; then
    # $URL is quoted so that a query string's '?' or any '*' or '[' in it cannot be expanded as a
    # file name pattern before Chrome receives it. No output path is given to --screenshot; the
    # result is looked for at $WORKING_DIR/$CHROME_SCREENSHOT and then moved into place without
    # overwriting an existing file (-n)
    "$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 "$URL" > /dev/null 2>&1
    if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
      mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
    else
      valPrint trh "Screenshot of URL $URL seems to have failed!"
    fi
  else
    valPrint trh "Skipping screenshot of URL $URL because $SHOT_FILE already exists."
  fi
fi
922	done
# The loop over the links has ended: flag the list as finished, then hand over to wrapupAndExit
FINISHED_LIST=yes
wrapupAndExit

Note: See TracBrowser for help on using the repository browser.

Download in other formats: