source: Validate External Links/validate_external_links.sh@ 1118

Last change on this file since 1118 was 1118, checked in by iritscen, 5 years ago

Fixed ValExtLinks' reading of Archive API replies. Fix for reading links that happen to have a shebang in them. Now knows how to handle NULL namespace links. Now prints elapsed time.

File size: 41.6 KB
Line 
#!/bin/bash

# Validate External Links by Iritscen
# Provided with a list of external links found in the OniGalore wiki, this script validates them.
# The resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF
# (for reading as a local file with clickable links), and HTML (for uploading as a web page).
# Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
# Recommended rule:
# |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|

# Set separator token to newline only, so that unquoted expansions in this script are split on line
# breaks and never on spaces or tabs
IFS=$'\n'

### GLOBALS ###
# Settings -- these will be changed from their defaults by the arguments passed in to the script
LINKS_URL=""         # use 'curl' to download file with links from this location (can be file://)
EXCEPT_URL=""        # ditto above for file with exceptions to NG results
OUTPUT_DIR=""        # place reports and all other output in a folder inside this existing folder
RECORD_OK_LINKS=0    # record response code to the log even when it's a value in OK_CODES
SUGGEST_SNAPSHOTS=0  # query the Internet Archive for a possible snapshot URL for each NG page
TAKE_PAGE_SHOT=0     # take a screenshot of each OK page
CHROME_PATH=""       # path to a copy of Google Chrome that has the command-line screenshot feature
URL_START=1          # start at this URL in LINKS_FILE (1 by default)
URL_LIMIT=0          # if non-zero, stop at this URL in LINKS_FILE
UPLOAD_INFO=""       # path to a file on your hard drive with the login info needed to upload a report

# Fixed strings -- see the occurrences of these variables to learn their purpose
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36 OPR/67.0.3575.53"
ARCHIVE_API="http://archive.org/wayback/available"
ARCHIVE_GENERIC="https://web.archive.org/web/*"
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
CHROME_SCREENSHOT="screenshot.png"
CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt"
EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt"
MY_WIKI_PAGE="http://wiki.oni2.net/User:Iritscen"
# Directory holding this script; the nested quotes keep a path with glob characters intact
THIS_DIR=$(cd "$(dirname "$0")" && pwd)
WORKING_DIR=$(pwd)
WIKI_PATH="wiki.oni2.net"

# These are parallel arrays of the IDs and names of OniGalore's current namespaces
declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")

# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
declare -a HTTP_FILES=(txt zip wmv jpg png m4a bsl rar oni mp3 mov ONWC vbs TRMA mp4 doc avi log gif pdf dmg exe cpp tga 7z wav east BINA xml dll dae xaf fbx 3ds blend flv csv)
declare -a HTTP_TLDS_AND_PAGES=(com net org uk ru de it htm html php pl asp aspx shtml pgi cgi php3 x jsp phtml cfm css action stm js)

# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
# if you add a new code.
declare -a OK_CODES=(200 401 405 406 501)
declare -a RD_CODES=(301 302 303 307 308)
declare -a NG_CODES=(000 403 404 410 500 503)

# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
# transcluded text, and if the transclusion fails, then the braces show up in the URL
ILLEGAL_CHARS="{ }"

# The shortest URL possible, used for sanity-checking some URLs: http://a.co
MIN_URL_LENGTH=11

# These are parallel arrays giving the prefixes that can be used in place of normal external links to
# some wikis and other sites
declare -a INTERWIKI_PREFIXES=(commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies wikiversity wikivoyage wikt wp)
declare -a INTERWIKI_DOMAINS=(commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org wikiversity.org wikivoyage.org wiktionary.org wikipedia.org)

# Variables for keeping track of main loop progress and findings
LINK_NUM=0
EI_LINKS=0
IW_LINKS=0
OK_LINKS=0
RD_LINKS=0
NG_LINKS=0
SKIP_UNK_NS=0
SKIP_JS_PAGE=0
SKIP_BAD_URL=0
SKIP_NON_ASCII=0
SKIP_UNK_SUFFIX=0
SKIP_UNK_CODE=0
SKIP_EXPECT_NG=0
SKIP_EXPECT_EI=0
SKIP_EXPECT_IW=0
FILE_LINKS=0
PAGE_LINKS=0
SKIPPED_HEADER_ROW=0
FINISHED_LIST="no"
START_RUN=0
END_RUN=0


### HELP ###
# A pseudo-man page. Here is the 80-character rule for the page text:
# 234567890123456789012345678901234567890123456789012345678901234567890123456789
# Writes the help text to stdout. The here-document delimiter is quoted so that the page is printed
# literally and nothing in it can ever be expanded by the shell.
function printHelp()
{
  cat << 'EOF'

NAME
 Validate External Links

SYNOPSIS
 validate_external_links.sh --help
 validate_external_links.sh --links URL --output DIR [--exceptions URL]
 [--record-ok-links] [--suggest-snapshots] [--take-screenshots FILE]
 [--start-url NUM] [--end-url NUM] [--upload FILE]

DESCRIPTION
 This script parses a list of external links found in the OniGalore wiki
 (which is dumped by the Oni2.net domain periodically in a particular
 format), validates them using the Unix tool 'curl', and produces a report
 of which links were "OK" (responded positively to an HTTP query), which
 were "RD" (responded with a 3xx redirect code), which could be "IW"
 (interwiki) links, which are "EI" (external internal) links and could be
 intrawiki links, and which were "NG" (no good; a negative response to the
 query). This report can then be automatically uploaded to the location of
 your choice. The script can also suggest Internet Archive snapshots for
 "NG" links, and take screenshots of "OK" links for visual verification by
 the reader that the page in question is the one intended to be displayed.

 You must pass this script the URL at which the list of links is found
 (--links) and the path where the directory of logs should be outputted
 (--output). All other arguments are optional.

OPTIONS
 --help Show this page.
 --links URL (required) URL from which to download the CSV
 file with external links. Note that this URL can
 be a local file if you supply a file:// path.
 --output DIR (required) Unix path to directory in which Val
 should place its reports.
 --exceptions URL In order to remove links from the report which
 Val finds an issue with, but which you regard as
 OK, list those desired exceptions in this file.
 See the sample file exceptions.txt for details.
 Note that this URL can point to a local file if
 you supply a file:// path.
 --record-ok-links Log a link in the report even if its response
 code is "OK".
 --suggest-snapshots Query the Internet Archive for a possible
 snapshot URL for each "NG" page.
 --take-screenshots FILE Call the Google Chrome binary at this path to
 take screenshots of each "OK" page.
 --start-url NUM Start at this link in the links CSV file.
 --end-url NUM Stop at this link in the links CSV file.
 --upload FILE Upload report using the credentials and path
 given in this local text file. See sftp_login.txt
 for template.

BUGS
 The script cannot properly parse any line in the external links file
 which contains a comma in the name of the wiki page containing a link.
 Commas in the link itself are not an issue.
EOF
}


### SETUP ###
# If first argument is a help request, or if nothing was passed in at all, page the help text through
# 'less' and quit successfully
if [[ $# -eq 0 || "$1" == "--help" ]]; then
  printHelp | less
  exit 0
fi

# Parse arguments as long as there are more arguments to process. Each recognized option stores its
# value in the matching settings global. An option that needs a value but is the last word on the
# command line aborts the script with exit code 1; without that check 'shift 2' would fail without
# shifting anything and the loop would never terminate. Unknown options also abort with exit code 1.
function parseArgs()
{
  while (( $# )); do
    # Options in this list consume the following word as their value
    case "$1" in
      --links | --exceptions | --output | --take-screenshots | --start-url | --end-url | --upload )
        if (( $# < 2 )); then
          echo "Argument $1 requires a value. Aborting."
          exit 1
        fi;;
    esac

    case "$1" in
      --links )             LINKS_URL="$2"; shift 2;;
      --exceptions )        EXCEPT_URL="$2"; shift 2;;
      --output )            OUTPUT_DIR="$2"; shift 2;;
      --record-ok-links )   RECORD_OK_LINKS=1; shift;;
      --suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;;
      --take-screenshots )  TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
      --start-url )         URL_START="$2"; shift 2;;
      --end-url )           URL_LIMIT="$2"; shift 2;;
      --upload )            UPLOAD_INFO="$2"; shift 2;;
      * )                   echo "Invalid argument $1 detected. Aborting."; exit 1;;
    esac
  done
}
parseArgs "$@"

# If the required arguments were not supplied, print an error and quit
if [[ -z "$LINKS_URL" || -z "$OUTPUT_DIR" ]]; then
  echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
  exit 2
fi

# If user wants screenshots, make sure path to Chrome was passed in and is valid
if [[ "$TAKE_PAGE_SHOT" -eq 1 && ! -f "$CHROME_PATH" ]]; then
  echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
  exit 3
fi

# Check that UPLOAD_INFO exists, if this argument was supplied
if [[ -n "$UPLOAD_INFO" && ! -f "$UPLOAD_INFO" ]]; then
  echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
  exit 4
fi

# Check that OUTPUT_DIR is a directory
if [[ ! -d "$OUTPUT_DIR" ]]; then
  echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
  exit 5
fi

# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
SHOT_PATH="$OUTPUT_PATH/Screenshots"
LOG_NAME="ValExtLinks report"
LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt"
LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf"
LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm"
mkdir "$OUTPUT_PATH"

# Check that 'mkdir' succeeded before trying to create anything inside the new folder
if [[ ! -d "$OUTPUT_PATH" ]]; then
  echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_DIR. Aborting."
  exit 6
fi
if [[ "$TAKE_PAGE_SHOT" -eq 1 ]]; then
  mkdir "$SHOT_PATH"
fi

# Get date on the file at LINKS_URL. Header names are matched case-insensitively because HTTP/2
# servers send them in lowercase, and the carriage return that ends each header line is removed so
# that it does not end up in the reports.
LINKS_DATE=$(curl --silent --head "$LINKS_URL" | tr -d '\r' | grep -i -m 1 "^Last-Modified:")
if [[ -z "$LINKS_DATE" ]]; then
  echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
  exit 7
fi
LINKS_DATE=${LINKS_DATE#*: } # drop the header name, keeping only the date


### UTILITY FUNCTIONS ###
# Writes a plain-text header to TXT log file: report title, generation time (NICE_TIME), date of the
# links data (LINKS_DATE), author contact (MY_WIKI_PAGE) and a blank line
function printTXTheader()
{
  local header_line
  for header_line in \
    "Validate External Links report" \
    "generated $NICE_TIME" \
    "from data of $LINKS_DATE" \
    "script by Iritscen (contact: $MY_WIKI_PAGE)" \
    ""; do
    valPrint t "$header_line"
  done
}

# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified.
# Inside the double-quoted string only \\ and \" are escapes (yielding \ and "); every other
# backslash sequence is an RTF control word and is passed through unchanged.
function printRTFheader()
{
  valPrint r "{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww12600\viewh12100\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0

\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
generated $NICE_TIME\\
from data of $LINKS_DATE\\
script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$MY_WIKI_PAGE\"}}{\fldrslt contact}})
\\
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
\cf0 "
}

# Closes the RTF markup of the RTF log file by writing the brace that matches the one opened in the
# RTF header
function printRTFfooter()
{
  local closing_brace="}"
  valPrint r "$closing_brace"
}

# Writes the HTML header to HTML log file: opens the document and body, then prints the report title,
# generation time (NICE_TIME), date of the links data (LINKS_DATE) and a contact link (MY_WIKI_PAGE)
function printHTMheader()
{
  valPrint h "<html>
<head>
<title>Validate External Links report</title>
</head>
<body>
<h2>Validate External Links report</h2>
<h3>generated $NICE_TIME<br />
from data of $LINKS_DATE<br />
script by Iritscen (<a href=\"$MY_WIKI_PAGE\" target=\"_blank\">contact</a>)</h3>"
}

# Closes the HTML markup of the HTML log file by ending the body and document elements opened in the
# HTML header
function printHTMfooter()
{
  valPrint h "</body>
</html>"
}

# The central logging function. The first parameter is a string composed of one or more characters that
# indicate which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
# 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 'w' means "Don't
# pass console output through 'fmt'" ("fmt" fits the output to an 80-column CLI but can break special
# formatting and the 'n' option). The second parameter is the text to print.
# For the RTF and HTML logs, 'n' only suppresses the markup line break ("\" and "<br />"); the line
# written to the file still ends in a newline character.
# 'printf' is used instead of 'echo' so that text such as "-n" or "-e" is printed verbatim rather than
# being taken as an option.
function valPrint()
{
  local targets=$1
  local text=$2

  # Console
  if [[ $targets == *c* ]]; then
    if [[ $targets == *n* ]]; then
      printf '%s' "$text"
    elif [[ $targets == *w* ]]; then
      printf '%s\n' "$text"
    else
      printf '%s\n' "$text" | fmt -w 80
    fi
  fi

  # Plain-text log
  if [[ $targets == *t* ]]; then
    if [[ $targets == *n* ]]; then
      printf '%s' "$text" >> "$LOG_TXT"
    else
      printf '%s\n' "$text" >> "$LOG_TXT"
    fi
  fi

  # RTF log: a trailing backslash is RTF's line break
  if [[ $targets == *r* ]]; then
    if [[ $targets == *n* ]]; then
      printf '%s\n' "$text" >> "$LOG_RTF"
    else
      printf '%s\\\n' "$text" >> "$LOG_RTF"
    fi
  fi

  # HTML log
  if [[ $targets == *h* ]]; then
    if [[ $targets == *n* ]]; then
      printf '%s\n' "$text" >> "$LOG_HTM"
    else
      printf '%s<br />\n' "$text" >> "$LOG_HTM"
    fi
  fi
}

# Pluralize the string in parameter 1 if the number in parameter 2 is not 1. Nouns ending in "x" get
# "es" appended, all others get "s". The noun is printed exactly as passed in (quoted), so embedded
# whitespace or glob characters are never re-split or expanded.
function pluralCheckNoun()
{
  local noun=$1
  local count=$2

  if [ "$count" -ne 1 ]; then
    if [[ $noun =~ x$ ]]; then
      printf '%s\n' "${noun}es"
    else
      printf '%s\n' "${noun}s"
    fi
  else
    printf '%s\n' "$noun"
  fi
}

# Output "is" if parameter 1 is 1, otherwise "are"
function pluralCheckIs()
{
  local verb="is"
  if [ "$1" -ne 1 ]; then
    verb="are"
  fi
  echo "$verb"
}

# Output "was" if parameter 1 is 1, otherwise "were"
function pluralCheckWas()
{
  local verb="was"
  if [ "$1" -ne 1 ]; then
    verb="were"
  fi
  echo "$verb"
}

# Output "a " (with the trailing space) if parameter 1 is 1, otherwise nothing
function pluralCheckA()
{
  if [ "$1" -eq 1 ]; then
    echo "a "
  fi
}

# Output "an " (with the trailing space) if parameter 1 is 1, otherwise nothing
function pluralCheckAn()
{
  if [ "$1" -eq 1 ]; then
    echo "an "
  fi
}

# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
# reports being saved to disk have already been closed.
# The file at UPLOAD_INFO is expected to hold one "user:", "pw:", "port:" and "path:" line each; the
# first line that starts with a marker wins and the marker itself is stripped off. The values are then
# handed, individually quoted, to the 'expect' script stored next to this script.
# NOTE(review): the password is passed to 'expect' as a command-line argument, which makes it visible
# in the process list while the upload runs. The expect script is not shown here, so this was left as-is.
function uploadReport()
{
  valPrint c "Uploading HTML report..."

  SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
  SFTP_USER_NAME_MARKER="user:"
  SFTP_PASSWORD_MARKER="pw:"
  SFTP_PORT_MARKER="port:"
  SFTP_PATH_MARKER="path:"

  # Markers are anchored to the start of the line so that e.g. "pw:" inside another value cannot match
  SFTP_USER_NAME=$(grep -m 1 -- "^$SFTP_USER_NAME_MARKER" "$UPLOAD_INFO")
  SFTP_USER_NAME=${SFTP_USER_NAME#"$SFTP_USER_NAME_MARKER"}
  SFTP_PASSWORD=$(grep -m 1 -- "^$SFTP_PASSWORD_MARKER" "$UPLOAD_INFO")
  SFTP_PASSWORD=${SFTP_PASSWORD#"$SFTP_PASSWORD_MARKER"}
  SFTP_PORT=$(grep -m 1 -- "^$SFTP_PORT_MARKER" "$UPLOAD_INFO")
  SFTP_PORT=${SFTP_PORT#"$SFTP_PORT_MARKER"}
  SFTP_PATH=$(grep -m 1 -- "^$SFTP_PATH_MARKER" "$UPLOAD_INFO")
  SFTP_PATH=${SFTP_PATH#"$SFTP_PATH_MARKER"}

  expect "$SCRIPT_PATH" "$LOG_HTM" "$SFTP_USER_NAME" "$SFTP_PASSWORD" "$SFTP_PORT" "$SFTP_PATH" "$LOG_NAME.htm"

  valPrint c "Report was uploaded, unless an error message appears above."
}

# Prints session summary when script is done, closes the RTF and HTML markup, uploads the report if
# that was requested and the run was not canceled, and then exits with status 0. Also installed below
# as the handler for Ctrl-C (SIGINT).
# Because this function is also called on early aborts (before the main loop has started), it must
# cope with START_RUN still being 0 and LINK_COUNT not being set yet.
function wrapupAndExit()
{
  # Get off progress line on console, drop down a line from last link in log, and close HTML table
  valPrint ctr ""
  valPrint h "</table><br />"

  # If we didn't finish processing the last URL, then the iterator is one too high
  if [ "$FINISHED_LIST" != "yes" ]; then
    LINK_NUM=$((LINK_NUM-1))
    if [ "$FINISHED_LIST" == "no" ]; then
      valPrint ctrh "The session was canceled by the user."
    fi
  fi

  # Generate string with elapsed time; if the main loop never started, report zero rather than the
  # number of seconds since the Unix epoch
  END_RUN=$(date +%s)
  local elapsed_secs=0
  if [ "$START_RUN" -gt 0 ]; then
    elapsed_secs=$((END_RUN - START_RUN))
  fi
  ELAPSED="$((elapsed_secs / 60)) min. $((elapsed_secs % 60)) sec. elapsed"

  # Output results of session and close the log file's markup
  LINK_COUNT=${LINK_COUNT:-0}
  LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
  if [ "$LINKS_PROCESSED" -lt 0 ]; then
    LINKS_PROCESSED=0 # we stopped before reaching URL_START
  fi
  LINKS_SKIPPED=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
  LINKS_CHECKED=$((LINKS_PROCESSED-LINKS_SKIPPED))
  valPrint ct "Summary ($ELAPSED):"
  valPrint r "\b1 Summary \b0 ($ELAPSED)"
  valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
  valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link "$LINK_COUNT")."
  valPrint ctrh "I skipped $LINKS_SKIPPED $(pluralCheckNoun link "$LINKS_SKIPPED"), and found $FILE_LINKS $(pluralCheckNoun file "$FILE_LINKS") and $PAGE_LINKS $(pluralCheckNoun page "$PAGE_LINKS")."
  if [ "$LINKS_SKIPPED" -gt 0 ]; then valPrint ctrh "Skip breakdown: "; fi
  if [ "$SKIP_UNK_NS" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS unknown $(pluralCheckNoun namespace "$SKIP_UNK_NS")"; fi
  if [ "$SKIP_JS_PAGE" -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link "$SKIP_JS_PAGE") on $(pluralCheckA "$SKIP_JS_PAGE")JavaScript $(pluralCheckNoun page "$SKIP_JS_PAGE")"; fi
  if [ "$SKIP_BAD_URL" -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL "$SKIP_BAD_URL")"; fi
  if [ "$SKIP_NON_ASCII" -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL "$SKIP_NON_ASCII")"; fi
  if [ "$SKIP_UNK_SUFFIX" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix "$SKIP_UNK_SUFFIX")"; fi
  if [ "$SKIP_UNK_CODE" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code "$SKIP_UNK_CODE")"; fi
  valPrint ctrh "Out of the $LINKS_CHECKED links checked, $EI_LINKS could be $(pluralCheckAn "$EI_LINKS")intrawiki $(pluralCheckNoun link "$EI_LINKS"), $IW_LINKS could be $(pluralCheckAn "$IW_LINKS")interwiki $(pluralCheckNoun link "$IW_LINKS"), $OK_LINKS $(pluralCheckWas "$OK_LINKS") OK, $RD_LINKS $(pluralCheckWas "$RD_LINKS") $(pluralCheckA "$RD_LINKS")redirection $(pluralCheckNoun notice "$RD_LINKS"), and $NG_LINKS $(pluralCheckWas "$NG_LINKS") NG."
  if [ "$SKIP_EXPECT_NG" -gt 0 ]; then
    valPrint ctrh "$SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link "$NG_LINKS") went unlisted due to being found in the exceptions file."
  fi
  if [ "$SKIP_EXPECT_EI" -gt 0 ]; then
    valPrint ctrh "$SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link "$EI_LINKS") went unlisted due to being found in the exceptions file."
  fi
  if [ "$SKIP_EXPECT_IW" -gt 0 ]; then
    # IW links are the interwiki ones (the intrawiki ones are counted as EI above)
    valPrint ctrh "$SKIP_EXPECT_IW/$IW_LINKS potential interwiki $(pluralCheckNoun link "$IW_LINKS") went unlisted due to being found in the exceptions file."
  fi
  valPrint trh "ValExtLinks says goodbye."
  printRTFfooter
  printHTMfooter

  # Upload report if this was requested
  if [ -n "$UPLOAD_INFO" ] && [ "$FINISHED_LIST" != "no" ]; then
    uploadReport
  fi

  # Really quit now
  valPrint c "ValExtLinks says goodbye."
  exit 0
}
trap wrapupAndExit INT


### INITIALIZATION ###
# Print opening message to console and log files
valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
printTXTheader
printRTFheader
printHTMheader

# Attempt to download file at LINKS_URL, then check that it succeeded. '--fail' makes curl return an
# error instead of saving the server's error page when the server answers with an HTTP error status,
# so a failed download can no longer leave a file behind that passes the existence check.
valPrint cwtrh "Downloading list of external links from $LINKS_URL."
LINKS_FILE_NAME=${LINKS_URL##*/} # everything after the last slash
LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
if ! curl --silent --fail -o "$LINKS_FILE" "$LINKS_URL" || [ ! -f "$LINKS_FILE" ]; then
  echo "The download of $LINKS_URL appears to have failed. Aborting."
  wrapupAndExit
fi

# Attempt to download file at EXCEPT_URL, then check that it succeeded
if [ -n "$EXCEPT_URL" ]; then
  valPrint cwtrh "Downloading list of reporting exceptions from $EXCEPT_URL."
  EXCEPT_FILE_NAME=${EXCEPT_URL##*/}
  EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
  if ! curl --silent --fail -o "$EXCEPT_FILE" "$EXCEPT_URL" || [ ! -f "$EXCEPT_FILE" ]; then
    echo "The download of $EXCEPT_URL appears to have failed. Aborting."
    wrapupAndExit
  fi
fi

# Number of URLs is number of lines minus one (first line is column header row for the CSV). Feeding
# the file to 'wc' on stdin keeps the file name out of its output, and arithmetic expansion ignores
# the padding that some versions of 'wc' put around the number.
LINK_COUNT=$(( $(wc -l < "$LINKS_FILE") - 1 ))

# Calculate number of URLs to consider
if [ "$URL_LIMIT" -ne 0 ] && [ "$URL_LIMIT" -lt "$LINK_COUNT" ]; then
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((URL_LIMIT-URL_START+1)) of them, from link $URL_START to $URL_LIMIT."
elif [ "$URL_START" -ne 1 ]; then
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((LINK_COUNT-URL_START+1)) of them, from link $URL_START to $LINK_COUNT."
else
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering all of them."
fi

# Print settings to console and log. The sentence is assembled from named phrases that depend on the
# chosen options, rather than by overwriting words of an array by position.
SHOT_PHRASE="and will"
OK_PHRASE="also"
SNAPSHOT_PHRASE="will"
EXCEPT_NOTE=" I will not report links that are listed in the exceptions file."
if [ "$TAKE_PAGE_SHOT" -eq 0 ]; then SHOT_PHRASE="but will not"; fi
if [ "$RECORD_OK_LINKS" -eq 0 ]; then OK_PHRASE="not"; fi
if [ "$SUGGEST_SNAPSHOTS" -eq 0 ]; then SNAPSHOT_PHRASE="will not"; fi
if [ -z "$EXCEPT_URL" ] || [ "$RECORD_OK_LINKS" -eq 1 ]; then EXCEPT_NOTE=""; fi
SETTINGS_STR="I will be checking the response code of each link $SHOT_PHRASE take a screenshot of each page. Pages that are OK will $OK_PHRASE be logged. I $SNAPSHOT_PHRASE ask the Internet Archive for a suggested snapshot URL for each NG page.$EXCEPT_NOTE"
valPrint ctrh "$SETTINGS_STR"
valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
valPrint trh ""

# Print legend to logs. Entries that contain a link are printed once per log format so that each
# format gets its own link markup; all other entries go to the TXT, RTF and HTML logs in one call.
valPrint t "Legend:"
valPrint r "\b1 Legend \b0"
valPrint hn "<h3>Legend</h3>"
valPrint trh "OK = URL seems to be working."
valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it, because false negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to Iritscen. An NG link should be followed by a link to the Internet Archive's Wayback Machine which may help you repair the link. If the link cannot be repaired, you can disable it on the wiki (which prevents it from showing up in future ValExtLinks reports) by wrapping it in nowiki tags."
valPrint trh "RD = The server responding to this URL is saying that the page moved and you should instead use the supplied new URL. Some RD links represent minor adjustments in the organization of a web site, and some are soft 404s (the file/page has been removed and you are being redirected to something like the main page of the web site). You will have to look at the new URL yourself to determine if it represents an OK link and the link on the wiki should be updated to this one, or if the desired file/page is actually gone and we need to replace the wiki link with an Internet Archive snapshot link -- or disable the URL if it has not been archived."
valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup."
valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup."
valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $HTTP_CODES)."
valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$HTTP_CODES\"}}{\fldrslt here}} for code reference)."
valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$HTTP_CODES\" target=\"_blank\">here</a> for code reference)."
valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $CURL_CODES)."
valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$CURL_CODES\"}}{\fldrslt here}} for code reference)."
valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$CURL_CODES\" target=\"_blank\">here</a> for code reference)."
valPrint trh "IA suggests = Last available snapshot suggested by the Internet Archive."
valPrint trh "Try browsing = The Archive occasionally fails to return a snapshot URL even when one exists, so you will need to check for a snapshot manually using this link to the Wayback Machine before concluding that a site has not been archived."
valPrint trh ""


548### MAIN LOOP ###
549START_RUN=$(date +%s)
550# Process each line of the .csv in LINKS_FILE
551for LINE in `cat "$LINKS_FILE"`; do
552 let LINK_NUM+=1
553
554 # First line is the column header row for the CSV, so let's verify that the format hasn't changed
555 if [ $SKIPPED_HEADER_ROW -eq 0 ]; then
556 if [ $LINE == "namespace,title,target" ]; then
557 SKIPPED_HEADER_ROW=1
558 LINK_NUM=0 # this line is it's not a link, so reset the link counter
559 valPrint hn "<table>"
560 continue
561 else
562 valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
563 wrapupAndExit
564 fi
565 fi
566
567 # Skip this link if we are not at URL_START yet
568 if [ $LINK_NUM -lt $URL_START ]; then
569 continue
570 fi
571
572 # Stop if we are at the limit declared for testing purposes
573 if [ $URL_LIMIT -gt 0 ] && [ $LINK_NUM -gt $URL_LIMIT ]; then
574 FINISHED_LIST="limit"
575 wrapupAndExit
576 fi
577
578 # Print progress to screen
579 if [ $LINK_NUM -gt 1 ]; then
580 printf "\e[1A\n" # erase previous progress message so that new one appears in its place
581 fi
582 valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
583
584 # The number of the namespace is the element before the first comma on the line
585 NS_ID=${LINE%%,*}
586
587 # Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
588 NS_NAME=""
589 a=0
590 while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
591 if [ $NS_ID == "NULL" ]; then
592 break
593 elif [ $NS_ID -eq ${NS_IDS[$a]} ]; then
594 NS_NAME="${NS_NAMES[$a]}"
595 break
596 fi
597 let a+=1
598 done
599 if [ "$NS_NAME" == "" ]; then
600 if [ $NS_ID == "NULL" ]; then
601 valPrint tr "Skipping URL on line $LINK_NUM because the namespace (and probably the page too) is \"NULL\". Probably the link is no longer in existence on the wiki."
602 else
603 valPrint tr "Skipping URL on line $LINK_NUM found on page $PAGE_NAME because I could not find a name for namespace ID $NS_ID."
604 fi
605 let SKIP_UNK_NS+=1
606 continue
607 fi
608
609 # The name of the page is everything between the namespace ID and the next comma on the line (commas
610 # in page names will break this)
611 PAGE_NAME=${LINE#$NS_ID,}
612 PAGE_NAME=${PAGE_NAME%%,*}
613
614 # We don't want to consider wiki pages ending in .js, as the parser cannot reliably isolate URLS in
615 # JavaScript code, so it will return erroneous links
616 PAGE_NAME_SUFFIX=$(echo $PAGE_NAME | sed 's/.*\.//')
617 if [ $PAGE_NAME_SUFFIX == "js" ]; then
618 valPrint tr "Skipping URL on line $LINK_NUM because it was found on JavaScript page $PAGE_NAME."
619 let SKIP_JS_PAGE+=1
620 continue
621 fi
622
623 # Build longer wiki page URLs from namespace and page names
624 FULL_PAGE_PATH=http://$WIKI_PATH/$NS_NAME:$PAGE_NAME
625 LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
626 # Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it
627 # explicitly breaks the link
628 if [ $NS_ID -eq 0 ]; then
629 FULL_PAGE_PATH=http://$WIKI_PATH/$PAGE_NAME
630 LOCAL_PAGE_PATH=$PAGE_NAME
631 fi
632
633 # The URL being linked to is everything after the previous two fields (this allows commas to be in
634 # the URLs, but a comma in the previous field, the page name, will break this)
635 URL=${LINE#$NS_ID,$PAGE_NAME,}
636
637 # Scan for illegal characters
638 if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
639 valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because it contains characters illegal in a URL."
640 let SKIP_BAD_URL+=1
641 continue
642 fi
643
644 # Now we need to know if the URL is for a file or a web page. First step is to determine if the
645 # URL ends in a suffix
646 HAS_SUFFIX=0
647
648 # If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
649 CLEAN_URL=${URL%%\?*}
650
651 # If the URL ends in something like "#section_15", strip everything from the '#' onward
652 CLEAN_URL=${CLEAN_URL%%\#*}
653
654 # 'sed' cannot handle Unicode in my Bash shell, so skip this URL and make user check it
655 if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
656 valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I cannot handle non-ASCII characters."
657 let SKIP_NON_ASCII+=1
658 continue
659 fi
660
661 # Isolate the characters after the last period and after the last slash
662 POST_DOT=$(echo "$CLEAN_URL" | sed 's/.*\.//')
663 POST_SLASH=$(echo "$CLEAN_URL" | sed 's/.*\///')
664
665 # If the last period comes after the last slash, then the URL ends in a suffix
666 POST_DOT_LENGTH=$(echo | awk -v input=$POST_DOT '{print length(input)}')
667 POST_SLASH_LENGTH=$(echo | awk -v input=$POST_SLASH '{print length(input)}')
668 if [ $POST_DOT_LENGTH -lt $POST_SLASH_LENGTH ]; then
669 HAS_SUFFIX=1
670 else
671 HAS_SUFFIX=0
672 fi
673
674 # Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
675 # known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES
676 IS_FILE=-1
677 if [ $HAS_SUFFIX -eq 0 ]; then
678 IS_FILE=0
679 else
680 # Turn off case sensitivity while we compare suffixes
681 shopt -s nocasematch
682
683 # Special case: URLs ending in something like "/productID.304297400" are pages, not files, so if
684 # the URL's suffix is all numbers, we are looking at the end of a web page URL
685 if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
686 IS_FILE=0
687 fi
688
689 # If we did not identify this URL as a web page above, we need to compare the suffix against known
690 # file extensions
691 if [ $IS_FILE -eq -1 ]; then
692 for EXTENSION in "${HTTP_FILES[@]}"; do
693 if [[ $EXTENSION == $POST_DOT ]]; then # double brackets respect the nocasematch setting
694 IS_FILE=1
695 break
696 fi
697 done
698 fi
699
700 # If we did not identify this URL as a file above, we need to compare the suffix against known
701 # pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
702 # needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
703 if [ $IS_FILE -eq -1 ]; then
704 for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
705 if [[ $PAGE_OR_TLD == $POST_DOT ]]; then
706 IS_FILE=0
707 break
708 fi
709 done
710 fi
711
712 # Turn case sensitivity back on in Bash
713 shopt -u nocasematch
714 fi
715
# Act on the verdict held in IS_FILE: an ending that was recognized as neither a file, a page nor
# a TLD is reported and the link is skipped; otherwise the link is labeled and counted by type
STR_TYPE=""
case $IS_FILE in
   -1)
      valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown URL ending $POST_DOT. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
      (( SKIP_UNK_SUFFIX += 1 ))
      continue
      ;;
   1)
      STR_TYPE="file"
      (( FILE_LINKS += 1 ))
      ;;
   0)
      STR_TYPE="page"
      (( PAGE_LINKS += 1 ))
      ;;
esac
729
# Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
# issue with sites that require HTTPS. $AGENT must be in double quotes so that its value is sent
# as the user agent (single quotes would send the literal text "$AGENT"), and $URL is quoted so
# that a '?', '*' or '[' in the link cannot be glob-expanded by the shell
CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{http_code}\n' "$URL")
CURL_ERR=$? # exit status of the 'curl' call inside the command substitution above
CURL_RESULT=$CURL_CODE

# Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
if [[ $CURL_CODE == "000" ]]; then
   CURL_RESULT="$CURL_RESULT-$CURL_ERR"
fi
740
# Begin to determine our status code for this URL (EI, IW, OK, RD, or NG). "??" means that no
# status has been assigned yet
STATUS="??"
NEW_URL=""
INTERWIKI_INDEX=-1

# First make sure that this isn't an "external internal" link to our own wiki that can be replaced
# by "[[page_name]]". If it uses a special access URL beginning with "/w/", let it pass, as it
# probably cannot be replaced by "[[ ]]" markup
if [[ $URL == *"$WIKI_PATH"* && $URL != *"$WIKI_PATH"/w/* ]]; then
   STATUS="EI"
   let EI_LINKS+=1
fi

# If it's not, check if this is a link to a domain that we have an interwiki prefix for. The "??"
# test is done inside [[ ]] because there the value of $STATUS is not glob-expanded; in a single
# bracket test an unquoted "??" is replaced by any two-character file names in the current folder
if [[ $STATUS == "??" ]]; then
   for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
      if [[ $URL == *"${INTERWIKI_DOMAINS[$i]}"* && $URL != *"${INTERWIKI_DOMAINS[$i]}"/w/* ]]; then
         STATUS="IW"
         let IW_LINKS+=1
         INTERWIKI_INDEX=$i
         break
      fi
   done
fi
765
# If we didn't match an interwiki domain, see if the status code is in our "OK" codes list. The
# "??" test uses [[ ]] so that the unquoted value cannot be glob-expanded into file names
if [[ $STATUS == "??" ]]; then
   for CODE in "${OK_CODES[@]}"; do
      if [[ $CODE == "$CURL_CODE" ]]; then
         STATUS="OK"
         let OK_LINKS+=1
         break
      fi
   done
fi
776
# If we didn't get a match with the "OK" codes, check it against the "RD" codes. The "??" test
# uses [[ ]] so that the unquoted value cannot be glob-expanded into file names
if [[ $STATUS == "??" ]]; then
   for CODE in "${RD_CODES[@]}"; do
      if [[ $CODE == "$CURL_CODE" ]]; then
         # Get URL header again in order to retrieve the URL we are being redirected to. $AGENT is
         # in double quotes so that its value, not the literal text "$AGENT", is sent
         NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{redirect_url}\n' "$URL")

         # Filter out cases where the redirect URL is just the original URL with https:// instead of
         # http://, or with an added '/' at the end. These corrections happen a lot and are not
         # important to us. Either protocol is stripped from both URLs so that an added '/' is
         # recognized no matter which protocol the link and its target use
         URL_NO_PROTOCOL=${URL#http://}
         URL_NO_PROTOCOL=${URL_NO_PROTOCOL#https://}
         URL_NO_PROTOCOL=${URL_NO_PROTOCOL%/}
         NEW_URL_NO_PROTOCOL=${NEW_URL#http://}
         NEW_URL_NO_PROTOCOL=${NEW_URL_NO_PROTOCOL#https://}
         NEW_URL_NO_PROTOCOL=${NEW_URL_NO_PROTOCOL%/}

         # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
         NEW_URL_LENGTH=${#NEW_URL_NO_PROTOCOL}
         if (( NEW_URL_LENGTH < MIN_URL_LENGTH )); then
            NEW_URL_NO_PROTOCOL="[new URL not retrieved]"
         fi

         # If the URLs match after the above filters were applied, then the link is OK. The right
         # side is quoted so that it is compared as text and not used as a pattern
         if [[ $URL_NO_PROTOCOL == "$NEW_URL_NO_PROTOCOL" ]]; then
            STATUS="OK"
            let OK_LINKS+=1
         else
            STATUS="RD"
            let RD_LINKS+=1
         fi
         break
      fi
   done
fi
810
# If we didn't get a match with the "RD" codes, check it against the "NG" codes. The "??" test
# uses [[ ]] so that the unquoted value cannot be glob-expanded into file names
if [[ $STATUS == "??" ]]; then
   for CODE in "${NG_CODES[@]}"; do
      if [[ $CODE == "$CURL_CODE" ]]; then
         STATUS="NG"
         let NG_LINKS+=1
         break
      fi
   done
fi
821
# If we didn't match a known status code, advise the reader and move on to the next link. The "??"
# test uses [[ ]] so that the unquoted value cannot be glob-expanded into file names
if [[ $STATUS == "??" ]]; then
   valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown return code $CURL_CODE."
   let SKIP_UNK_CODE+=1
   continue
fi
828
# Check problem links against exceptions file before proceeding. This is only done when an
# exceptions URL was supplied (EXCEPT_URL is non-empty)
if [[ $STATUS != "OK" && -n $EXCEPT_URL ]]; then
   # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
   EXPECT_CODE="$CURL_RESULT"
   if [[ $STATUS == "EI" || $STATUS == "IW" ]]; then
      EXPECT_CODE="$STATUS"
   fi

   # Look for link in exceptions file and make sure its listed result code and wiki page also
   # match. The link is searched for as a fixed string (-F) because URLs are full of characters
   # that are special in a regular expression ('.', '[', '*'); "--" protects a leading '-'. The
   # code is the text before the first comma of the matched line and the page is the text after
   # the last comma; a page of "*" applies to every wiki page
   GREP_RESULT=$(grep --max-count=1 -F -- "$URL" "$EXCEPT_FILE")
   EXCEPT_PAGE=${GREP_RESULT##*,}
   if [[ $EXCEPT_PAGE == "*" || $EXCEPT_PAGE == "$LOCAL_PAGE_PATH" ]]; then
      EXCEPT_CODE=${GREP_RESULT%%,*}
      if [[ $EXCEPT_CODE == "$EXPECT_CODE" ]]; then
         valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because its expected result, $EXPECT_CODE, is listed in the exceptions file."
         if [[ $STATUS == "EI" ]]; then
            let SKIP_EXPECT_EI+=1
         elif [[ $STATUS == "IW" ]]; then
            let SKIP_EXPECT_IW+=1
         else
            let SKIP_EXPECT_NG+=1
         fi
         continue
      fi
   fi
fi
857
# Write this link's verdict to the logs. Nothing is written for an "OK" link unless
# RECORD_OK_LINKS is 1. Each entry consists of the status, the 'curl' result, the link type and
# the URL, followed by the wiki page that the link was found on. Depending on the status, one
# more line follows: the redirect target for "RD", the "[[ ]]" markup to use instead for "EI" and
# "IW", and for "NG" (only if SUGGEST_SNAPSHOTS is 1) either the snapshot URL returned by the
# Internet Archive query or a generic Wayback Machine link.
# NOTE(review): the first argument to valPrint ("t", "r", "hn") presumably selects the TXT, RTF
# and HTML logs respectively -- confirm against valPrint's definition.
# NOTE(review): the two RTF_TABS assignments below look identical in this listing; presumably they
# hold different numbers of tab characters that were collapsed -- confirm against the raw file.
858 # If appropriate, record this link to the log, with clickable URLs when possible
859 if [ $STATUS != "OK" ] || [ $RECORD_OK_LINKS -eq 1 ]; then
860 # Stupid hack since the strings "IW" and "EI" are narrower than "OK", "RD", or "NG" and it takes
861 # an extra tab to get to the desired level of indentation in the RTF log
862 RTF_TABS=" "
863 if [ $STATUS == "IW" ] || [ $STATUS == "EI" ]; then
864 RTF_TABS=" "
865 fi
866
867 # Record link and its wiki page in TXT, RTF, and HTML markup
868 valPrint t "$STATUS ($CURL_RESULT) $STR_TYPE $URL"
869 valPrint t " linked from $FULL_PAGE_PATH"
870 valPrint r "$STATUS ($CURL_RESULT)$RTF_TABS$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
871 valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
872 valPrint hn "<tr><td style=\"white-space:nowrap\">$STATUS ($CURL_RESULT)</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
873 valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"
874
875 # Record redirect URL if one was given by a 3xx response page
876 if [ $STATUS == "RD" ]; then
877 valPrint t " Server suggests $NEW_URL"
878 valPrint r " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
879 valPrint hn "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
880 fi
881
882 # Notify reader if we can use an intrawiki link for this URL
883 if [ $STATUS == "EI" ]; then
884 INTRA_PAGE=${URL#*://*/}
885 valPrint t " Just use [[$INTRA_PAGE]]"
886 valPrint r " Just use [[$INTRA_PAGE]]"
887 valPrint hn "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
888 fi
889
890 # Notify reader if we can use an interwiki prefix for this URL
891 if [ $STATUS == "IW" ]; then
892 INTER_PAGE=$(echo "$URL" | sed 's/.*\///')
893 valPrint t " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
894 valPrint r " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
895 valPrint hn "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
896 fi
897
898 # Query Internet Archive for latest "OK" snapshot for "NG" page
899 if [ $STATUS == "NG" ] && [ $SUGGEST_SNAPSHOTS -eq 1 ]; then
900 ARCHIVE_QUERY=$(curl --silent --max-time 10 "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")
901
902 # If a "closest" snapshot was received...
903 if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
904 # In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
905 ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')
906
907 # ...isolate "url" property in the response that follows the "closest" tag
908 SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
909 SNAPSHOT_URL=${SNAPSHOT_URL##*\"url\": \"} # everything after '"url": "'
910 SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'
911
912 # Inform the user of the snapshot URL
913 valPrint t " IA suggests $SNAPSHOT_URL"
914 valPrint r " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
915 valPrint hn "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
916 else # ...otherwise give generic Wayback Machine link for this URL
917 valPrint t " Try browsing $ARCHIVE_GENERIC/$URL"
918 valPrint r " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
919 valPrint hn "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
920 fi
921 fi
922 fi
923
# If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
if (( IS_FILE == 0 && TAKE_PAGE_SHOT == 1 )) && [[ $STATUS == "OK" ]]; then
   # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
   SHOT_NAME=$(printf '%s\n' "$URL" | sed -e 's/https*\:\/\///' -e 'y/:\//__/')
   SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"

   # Don't take screenshot if we already encountered this page and screenshotted it
   if [[ ! -f $SHOT_FILE ]]; then
      # The URL is quoted so that a '?', '*' or '[' in it reaches Chrome as typed instead of being
      # glob-expanded by the shell. The screenshot is then looked for under the name
      # CHROME_SCREENSHOT in WORKING_DIR and moved to its final name; 'mv -n' never overwrites
      "$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 "$URL" > /dev/null 2>&1
      if [[ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]]; then
         mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
      else
         valPrint trh "Screenshot of URL $URL seems to have failed!"
      fi
   else
      valPrint trh "Skipping screenshot of URL $URL because $SHOT_FILE already exists."
   fi
fi
942done
# Every link has been handled once the loop above ends: flag that and hand over to wrapupAndExit.
# NOTE(review): FINISHED_LIST presumably lets wrapupAndExit tell a completed run from an
# interrupted one -- confirm against that function's definition.
943FINISHED_LIST="yes"
944wrapupAndExit
Note: See TracBrowser for help on using the repository browser.