source: Validate External Links/validate_external_links.sh@ 1120

Last change on this file since 1120 was 1120, checked in by iritscen, 5 years ago

Val's reports now print section headers for the init/config stage and for the link results themselves.

File size: 42.1 KB
Line 
1#!/bin/bash
2
3# Validate External Links by Iritscen
4# Provided with a list of external links found in the OniGalore wiki, this script validates them.
5# The resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF
6# (for reading as a local file with clickable links), and HTML (for uploading as a web page).
7# Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
8# Recommended rule:
9# |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|
10
# Use a newline as the only field separator (word splitting then ignores spaces and tabs)
IFS=$'\n'
14
### GLOBALS ###
# Settings -- these will be changed from their defaults by the arguments passed in to the script
LINKS_URL=""        # use 'curl' to download file with links from this location (can be file://)
EXCEPT_URL=""       # ditto above for file with exceptions to NG results
OUTPUT_DIR=""       # place reports and all other output in a folder inside this existing folder
RECORD_OK_LINKS=0   # record response code to the log even when it's a value in OK_CODES
SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
TAKE_PAGE_SHOT=0    # take a screenshot of each OK page
CHROME_PATH=""      # path to a copy of Google Chrome that has the command-line screenshot feature
URL_START=1         # start at this URL in LINKS_FILE (1 by default)
URL_LIMIT=0         # if non-zero, stop at this URL in LINKS_FILE
UPLOAD_INFO=""      # path to a file on your hard drive with the login info needed to upload a report

# Fixed strings -- see the occurrences of these variables to learn their purpose
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36 OPR/67.0.3575.53"
ARCHIVE_API="http://archive.org/wayback/available"
ARCHIVE_GENERIC="https://web.archive.org/web/*"
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
CHROME_SCREENSHOT="screenshot.png"
CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt"
EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt"
MY_WIKI_PAGE="http://wiki.oni2.net/User:Iritscen"
# Directory containing this script. The expansions are quoted so that glob characters in the path are
# not expanded, and '&&' keeps 'pwd' from reporting the caller's directory if the 'cd' fails.
THIS_DIR=$(cd "$(dirname "$0")" && pwd)
WORKING_DIR=$(pwd)
WIKI_PATH="wiki.oni2.net"
41
# These are parallel arrays of the IDs and names of OniGalore's current namespaces
declare -a NS_IDS=(
  -2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  100 101 102 103 104 105 108 109 110 111
)
declare -a NS_NAMES=(
  "Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File"
  "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk"
  "Category" "Category_talk"
  "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk"
)

# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
declare -a HTTP_FILES=(
  txt zip wmv jpg png m4a bsl rar oni mp3 mov ONWC vbs TRMA mp4 doc avi log gif pdf dmg exe cpp
  tga 7z wav east BINA xml dll dae xaf fbx 3ds blend flv csv
)
declare -a HTTP_TLDS_AND_PAGES=(
  com net org uk ru de it htm html php pl asp aspx shtml pgi cgi php3 x jsp phtml cfm css action
  stm js
)

# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and
# which are NG (no good). Pages that return OK codes will be screenshotted. Remember to update
# http_codes.txt if you add a new code.
declare -a OK_CODES=(200 401 405 406 501)
declare -a RD_CODES=(301 302 303 307 308)
declare -a NG_CODES=(000 403 404 410 500 503)

# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
# transcluded text, and if the transclusion fails, then the braces show up in the URL
ILLEGAL_CHARS="{ }"

# The shortest URL possible, used for sanity-checking some URLs: http://a.co
MIN_URL_LENGTH=11

# These are parallel arrays giving the prefixes that can be used in place of normal external links to
# some wikis and other sites
declare -a INTERWIKI_PREFIXES=(
  commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource
  wikispecies wikiversity wikivoyage wikt wp
)
declare -a INTERWIKI_DOMAINS=(
  commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org
  wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org
  wikiversity.org wikivoyage.org wiktionary.org wikipedia.org
)
69
# Variables for keeping track of main loop progress and findings
# Number of the link currently being handled
LINK_NUM=0
# Tallies of links by the status assigned to them
EI_LINKS=0 IW_LINKS=0 OK_LINKS=0 RD_LINKS=0 NG_LINKS=0
# Tallies of links skipped, by reason
SKIP_UNK_NS=0 SKIP_JS_PAGE=0 SKIP_BAD_URL=0 SKIP_NON_ASCII=0 SKIP_UNK_SUFFIX=0 SKIP_UNK_CODE=0
# Tallies of links left out of the report because of the exceptions file
SKIP_EXPECT_NG=0 SKIP_EXPECT_EI=0 SKIP_EXPECT_IW=0
# Tallies of links by the kind of target they point at
FILE_LINKS=0 PAGE_LINKS=0
# Main loop state: whether the CSV header row was consumed, and how the run ended ("no" until then)
SKIPPED_HEADER_ROW=0
FINISHED_LIST="no"
# Timestamps (seconds since the epoch) taken at the start and end of the main loop
START_RUN=0 END_RUN=0
92
93
### HELP ###
# Prints the script's pseudo-man page to stdout. The page text is emitted verbatim (the here-document
# delimiter is quoted, so nothing inside it is expanded). Here is the 80-character rule for the text:
# 234567890123456789012345678901234567890123456789012345678901234567890123456789
printHelp() {
  cat <<'EOF'

NAME
 Validate External Links

SYNOPSIS
 validate_external_links.sh --help
 validate_external_links.sh --links URL --output DIR [--exceptions URL]
 [--record-ok-links] [--suggest-snapshots] [--take-screenshots FILE]
 [--start-url NUM] [--end-url NUM] [--upload FILE]

DESCRIPTION
 This script parses a list of external links found in the OniGalore wiki
 (which is dumped by the Oni2.net domain periodically in a particular
 format), validates them using the Unix tool 'curl', and produces a report
 of which links were "OK" (responded positively to an HTTP query), which
 were "RD" (responded with a 3xx redirect code), which could be "IW"
 (interwiki) links, which are "EI" (external internal) links and could be
 intrawiki links, and which were "NG" (no good; a negative response to the
 query). This report can then be automatically uploaded to the location of
 your choice. The script can also suggest Internet Archive snapshots for
 "NG" links, and take screenshots of "OK" links for visual verification by
 the reader that the page in question is the one intended to be displayed.

 You must pass this script the URL at which the list of links is found
 (--links) and the path where the directory of logs should be outputted
 (--output). All other arguments are optional.

OPTIONS
 --help Show this page.
 --links URL (required) URL from which to download the CSV
 file with external links. Note that this URL can
 be a local file if you supply a file:// path.
 --output DIR (required) Unix path to directory in which Val
 should place its reports.
 --exceptions URL In order to remove links from the report which
 Val finds an issue with, but which you regard as
 OK, list those desired exceptions in this file.
 See the sample file exceptions.txt for details.
 Note that this URL can point to a local file if
 you supply a file:// path.
 --record-ok-links Log a link in the report even if its response
 code is "OK".
 --suggest-snapshots Query the Internet Archive for a possible
 snapshot URL for each "NG" page.
 --take-screenshots FILE Call the Google Chrome binary at this path to
 take screenshots of each "OK" page.
 --start-url NUM Start at this link in the links CSV file.
 --end-url NUM Stop at this link in the links CSV file.
 --upload FILE Upload report using the credentials and path
 given in this local text file. See sftp_login.txt
 for template.

BUGS
 The script cannot properly parse any line in the external links file
 which contains a comma in the name of the wiki page containing a link.
 Commas in the link itself are not an issue.
EOF
}
158
159
### SETUP ###
# If first argument is a help request, or if nothing was passed in at all, print help page and quit
if [[ $# -eq 0 || "$1" == "--help" ]]; then
  printHelp | less
  exit 0
fi

# Parse arguments as long as there are more arguments to process
while (( $# )); do
  # An option that takes a value must not be the last word on the command line: 'shift 2' fails
  # without shifting anything when only one argument is left, which would make this loop spin forever
  case "$1" in
    --links | --exceptions | --output | --take-screenshots | --start-url | --end-url | --upload )
      if (( $# < 2 )); then
        echo "Error: The argument $1 requires a value. Aborting."
        exit 1
      fi;;
  esac

  case "$1" in
    --links )             LINKS_URL="$2"; shift 2;;
    --exceptions )        EXCEPT_URL="$2"; shift 2;;
    --output )            OUTPUT_DIR="$2"; shift 2;;
    --record-ok-links )   RECORD_OK_LINKS=1; shift;;
    --suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;;
    --take-screenshots )  TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
    --start-url )         URL_START="$2"; shift 2;;
    --end-url )           URL_LIMIT="$2"; shift 2;;
    --upload )            UPLOAD_INFO="$2"; shift 2;;
    * )                   echo "Invalid argument $1 detected. Aborting."; exit 1;;
  esac
done

# URL_START and URL_LIMIT are compared numerically later in the script, so insist on whole numbers
if ! [[ "$URL_START" =~ ^[0-9]+$ && "$URL_LIMIT" =~ ^[0-9]+$ ]]; then
  echo "Error: The arguments --start-url and --end-url require whole numbers. Aborting."
  exit 1
fi
182
# If the required arguments were not supplied, print an error and quit (exit code 2)
if [[ -z "$LINKS_URL" || -z "$OUTPUT_DIR" ]]; then
  echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
  exit 2
fi

# If user wants screenshots, make sure path to Chrome was passed in and is valid (exit code 3)
if [[ "$TAKE_PAGE_SHOT" -eq 1 && ! -f "$CHROME_PATH" ]]; then
  echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
  exit 3
fi

# Check that UPLOAD_INFO exists, if this argument was supplied (exit code 4)
if [[ -n "$UPLOAD_INFO" && ! -f "$UPLOAD_INFO" ]]; then
  echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
  exit 4
fi

# Check that OUTPUT_DIR is a directory (exit code 5)
if [[ ! -d "$OUTPUT_DIR" ]]; then
  echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
  exit 5
fi
208
# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
# NICE_TIME is meant to match the format of LINKS_DATE, which is extracted from the extlinks file's
# Last-Modified header; HTTP dates use a zero-padded two-digit hour, hence %H rather than %k
NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %H:%M:%S GMT")
OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
SHOT_PATH="$OUTPUT_PATH/Screenshots"
LOG_NAME="ValExtLinks report"
LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt"
LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf"
LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm"

# Create the session folder and check that 'mkdir' succeeded before putting anything inside it
mkdir "$OUTPUT_PATH"
if [[ ! -d "$OUTPUT_PATH" ]]; then
  echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_DIR. Aborting."
  exit 6
fi

# The screenshot folder is only needed when screenshots were requested
if [[ "$TAKE_PAGE_SHOT" -eq 1 ]]; then
  mkdir "$SHOT_PATH"
  if [[ ! -d "$SHOT_PATH" ]]; then
    echo "Error: I could not create the folder \"Screenshots\" inside the directory $OUTPUT_PATH. Aborting."
    exit 6
  fi
fi
229
# Get date on the file at LINKS_URL; it is printed in the report headers. Header names are matched
# case-insensitively because servers may send them in lowercase, only the first match is kept, and
# the carriage return that ends each header line is removed.
LINKS_DATE=$(curl --silent --head "$LINKS_URL" | grep -i -m 1 "^Last-Modified:" | tr -d '\r')
if [[ -z "$LINKS_DATE" ]]; then
  echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
  exit 7
fi
# Strip the header name (up to the first colon) and the space that follows it
LINKS_DATE=${LINKS_DATE#*:}
LINKS_DATE=${LINKS_DATE# }
237
238
### UTILITY FUNCTIONS ###
# Writes a plain-text header to TXT log file: title, generation time (NICE_TIME), date of the links
# data (LINKS_DATE), author contact (MY_WIKI_PAGE), and a trailing blank line
printTXTheader() {
  local header_line
  for header_line in \
    "Validate External Links report" \
    "generated $NICE_TIME" \
    "from data of $LINKS_DATE" \
    "script by Iritscen (contact: $MY_WIKI_PAGE)" \
    ""; do
    valPrint t "$header_line"
  done
}
249
# Writes the RTF header to RTF log file, then a centered title block (report name, NICE_TIME,
# LINKS_DATE, and a "contact" hyperlink to MY_WIKI_PAGE), then sets text to left-justified
printRTFheader() {
  local rtf_prologue="{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww12600\viewh12100\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0

\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
generated $NICE_TIME\\
from data of $LINKS_DATE\\
script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$MY_WIKI_PAGE\"}}{\fldrslt contact}})
\\
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
\cf0 "

  valPrint r "$rtf_prologue"
}
268
# Closes the RTF markup of the RTF log file by writing the final closing brace
printRTFfooter() {
  local closing_brace='}'
  valPrint r "$closing_brace"
}
274
# Writes the HTML header to HTML log file: the opening markup, the page title, and a heading block
# with NICE_TIME, LINKS_DATE, and a "contact" link to MY_WIKI_PAGE
printHTMheader() {
  local html_prologue="<html>
<head>
<title>Validate External Links report</title>
</head>
<body>
<h2>Validate External Links report</h2>
<h3>generated $NICE_TIME<br />
from data of $LINKS_DATE<br />
script by Iritscen (<a href=\"$MY_WIKI_PAGE\" target=\"_blank\">contact</a>)</h3>"

  valPrint h "$html_prologue"
}
288
# Closes the HTML markup of the HTML log file (ends the body and the document)
printHTMfooter() {
  valPrint h $'</body>\n</html>'
}
295
# The central logging function.
# Parameter 1 is a string composed of one or more flag characters:
#   'c' = print to the console          't' = append to the TXT log (LOG_TXT)
#   'r' = append to the RTF log (LOG_RTF)   'h' = append to the HTML log (LOG_HTM)
#   'n' = don't end the line: no newline on console/TXT, no "\line" in RTF, no "<br />" in HTML
#         (the RTF and HTML files still receive a physical newline)
#   's' = print an extra blank line afterward (an empty table row in HTML)
#   'w' = don't pass console output through 'fmt' ("fmt" fits the output to an 80-column CLI but can
#         break special formatting)
# When flags conflict, 'n' wins over 'w' and 's' on the console, TXT and RTF outputs, 'w' wins over
# 's' on the console, and 's' wins over 'n' in HTML.
# Parameter 2 is the message. It is always written literally: backslash sequences in it (e.g. in a
# URL or in RTF markup) are never interpreted, and a message that looks like an option ("-n") is
# printed as-is.
valPrint() {
  local flags=$1
  local msg=$2

  if [[ $flags == *c* ]]; then
    if [[ $flags == *n* ]]; then
      printf '%s' "$msg"
    elif [[ $flags == *w* ]]; then
      printf '%s\n' "$msg"
    elif [[ $flags == *s* ]]; then
      printf '%s\n\n' "$msg"
    else
      printf '%s\n' "$msg" | fmt -w 80
    fi
  fi

  if [[ $flags == *t* ]]; then
    if [[ $flags == *n* ]]; then
      printf '%s' "$msg" >> "$LOG_TXT"
    elif [[ $flags == *s* ]]; then
      printf '%s\n\n' "$msg" >> "$LOG_TXT"
    else
      printf '%s\n' "$msg" >> "$LOG_TXT"
    fi
  fi

  if [[ $flags == *r* ]]; then
    if [[ $flags == *n* ]]; then
      printf '%s\n' "$msg" >> "$LOG_RTF"
    elif [[ $flags == *s* ]]; then
      printf '%s\\line\\line\n' "$msg" >> "$LOG_RTF"
    else
      printf '%s\\line\n' "$msg" >> "$LOG_RTF"
    fi
  fi

  if [[ $flags == *h* ]]; then
    if [[ $flags == *s* ]]; then
      printf '%s<tr><td>&nbsp;</td></tr>\n' "$msg" >> "$LOG_HTM"
    elif [[ $flags == *n* ]]; then
      printf '%s\n' "$msg" >> "$LOG_HTM"
    else
      printf '%s<br />\n' "$msg" >> "$LOG_HTM"
    fi
  fi
}
341
# Pluralize the string in parameter 1 if the number in parameter 2 is not 1.
# Nouns ending in "s", "x", "ch" or "sh" get "es" appended; all others get "s". A missing or
# non-numeric count is treated as "not 1", so the plural form is printed without any error message.
pluralCheckNoun() {
  local noun=$1
  local count=${2:-0}

  if [ "$count" -eq 1 ] 2>/dev/null; then
    printf '%s\n' "$noun"
  elif [[ $noun =~ (s|x|ch|sh)$ ]]; then
    printf '%s\n' "${noun}es"
  else
    printf '%s\n' "${noun}s"
  fi
}
355
# Output "is" if parameter 1 is 1, otherwise "are"
pluralCheckIs() {
  local verb="is"
  [ $1 -ne 1 ] && verb="are"
  echo "$verb"
}
365
# Output "was" if parameter 1 is 1, otherwise "were"
pluralCheckWas() {
  local verb="was"
  [ $1 -ne 1 ] && verb="were"
  echo "$verb"
}
375
# Output "a " (with trailing space) if parameter 1 is 1, otherwise nothing
pluralCheckA() {
  [ $1 -eq 1 ] || return 0
  echo "a "
}
383
# Output "an " (with trailing space) if parameter 1 is 1, otherwise nothing
pluralCheckAn() {
  [ $1 -eq 1 ] || return 0
  echo "an "
}
391
# Helper for uploadReport: prints the value found on the first line of the file in parameter 2 that
# starts with the marker in parameter 1, with the marker and any trailing carriage return removed.
# Returns non-zero if no line starts with the marker.
_uploadInfoField() {
  local line
  line=$(grep -m 1 -e "^$1" -- "$2") || return 1
  line=${line%$'\r'}
  line=${line#"$1"}
  printf '%s' "$line"
}

# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
# reports being saved to disk have already been closed.
# Reads the "user:", "pw:", "port:" and "path:" lines of the file at UPLOAD_INFO and hands them to
# the 'expect' script EXPECT_SCRIPT_NAME (looked up in THIS_DIR) together with LOG_HTM. Markers are
# only recognized at the start of a line, so one field's value cannot be mistaken for another field.
# Returns 1 without uploading if any of the four fields is missing.
# NOTE(review): the password is passed to 'expect' as a command-line argument, where it may be
# visible to other users in the process list.
uploadReport() {
  local script_path="$THIS_DIR/$EXPECT_SCRIPT_NAME"
  local sftp_user sftp_password sftp_port sftp_path

  valPrint c "Uploading HTML report..."

  if ! sftp_user=$(_uploadInfoField "user:" "$UPLOAD_INFO") ||
    ! sftp_password=$(_uploadInfoField "pw:" "$UPLOAD_INFO") ||
    ! sftp_port=$(_uploadInfoField "port:" "$UPLOAD_INFO") ||
    ! sftp_path=$(_uploadInfoField "path:" "$UPLOAD_INFO"); then
    valPrint c "Error: The file $UPLOAD_INFO is missing one or more of the user:, pw:, port: and path: lines. The report was not uploaded."
    return 1
  fi

  # Every value is quoted so that an empty or unusual field cannot shift the argument positions
  expect "$script_path" "$LOG_HTM" "$sftp_user" "$sftp_password" "$sftp_port" "$sftp_path" "$LOG_NAME.htm"

  valPrint c "Report was uploaded, unless an error message appears above."
}
416
# Prints session summary when script is done, closes the RTF and HTML logs, uploads the report if
# that was requested and the session was not canceled, then exits with status 0. Also installed
# below as the handler for SIGINT, so that Ctrl-C still produces a complete report.
# Reads the progress counters, FINISHED_LIST, URL_START, LINK_COUNT, START_RUN and UPLOAD_INFO;
# writes LINK_NUM and END_RUN.
wrapupAndExit() {
  local elapsed_secs=0
  local elapsed links_processed links_skipped links_checked

  # Get off progress line on console, drop down a line from last link in log, and close HTML table
  valPrint ctr ""
  valPrint h "</table><br />"

  # If we didn't finish processing the last URL, then the iterator is one too high
  if [[ "$FINISHED_LIST" != "yes" ]]; then
    LINK_NUM=$((LINK_NUM - 1))
    if [[ "$FINISHED_LIST" == "no" ]]; then
      valPrint ctrh "The session was canceled by the user."
    fi
  fi

  # Generate string with elapsed time. START_RUN is still 0 if we were called before the main loop
  # began, in which case nothing has elapsed.
  END_RUN=$(date +%s)
  if (( START_RUN > 0 )); then
    elapsed_secs=$((END_RUN - START_RUN))
  fi
  printf -v elapsed '%d min. %d sec. elapsed' "$((elapsed_secs / 60))" "$((elapsed_secs % 60))"

  # Tally the session. The processed count cannot meaningfully be negative, which the raw arithmetic
  # would produce if we stopped before reaching URL_START.
  links_processed=$((LINK_NUM - URL_START + 1))
  if (( links_processed < 0 )); then
    links_processed=0
  fi
  links_skipped=$((SKIP_UNK_NS + SKIP_JS_PAGE + SKIP_BAD_URL + SKIP_NON_ASCII + SKIP_UNK_SUFFIX + SKIP_UNK_CODE))
  links_checked=$((links_processed - links_skipped))

  # Output results of session and close the log file's markup
  valPrint ct "Summary ($elapsed):"
  valPrint r "\b1 Summary \b0 ($elapsed)"
  valPrint hn "<h3><span id=\"summary\">Summary ($elapsed)</span></h3>"
  valPrint ctrh "I finished processing $links_processed of $LINK_COUNT $(pluralCheckNoun link "$LINK_COUNT")."
  valPrint ctrh "I skipped $links_skipped $(pluralCheckNoun link "$links_skipped"), and found $FILE_LINKS $(pluralCheckNoun file "$FILE_LINKS") and $PAGE_LINKS $(pluralCheckNoun page "$PAGE_LINKS")."
  if (( links_skipped > 0 )); then valPrint ctrh "Skip breakdown: "; fi
  if (( SKIP_UNK_NS > 0 )); then valPrint ctrh "- $SKIP_UNK_NS unknown $(pluralCheckNoun namespace "$SKIP_UNK_NS")"; fi
  if (( SKIP_JS_PAGE > 0 )); then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link "$SKIP_JS_PAGE") on $(pluralCheckA "$SKIP_JS_PAGE")JavaScript $(pluralCheckNoun page "$SKIP_JS_PAGE")"; fi
  if (( SKIP_BAD_URL > 0 )); then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL "$SKIP_BAD_URL")"; fi
  if (( SKIP_NON_ASCII > 0 )); then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL "$SKIP_NON_ASCII")"; fi
  if (( SKIP_UNK_SUFFIX > 0 )); then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix "$SKIP_UNK_SUFFIX")"; fi
  if (( SKIP_UNK_CODE > 0 )); then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code "$SKIP_UNK_CODE")"; fi
  valPrint ctrh "Out of the $links_checked $(pluralCheckNoun link "$links_checked") checked, $EI_LINKS could be $(pluralCheckAn "$EI_LINKS")intrawiki $(pluralCheckNoun link "$EI_LINKS"), $IW_LINKS could be $(pluralCheckAn "$IW_LINKS")interwiki $(pluralCheckNoun link "$IW_LINKS"), $OK_LINKS $(pluralCheckWas "$OK_LINKS") OK, $RD_LINKS $(pluralCheckWas "$RD_LINKS") $(pluralCheckA "$RD_LINKS")redirection $(pluralCheckNoun notice "$RD_LINKS"), and $NG_LINKS $(pluralCheckWas "$NG_LINKS") NG."
  if (( SKIP_EXPECT_NG > 0 )); then
    valPrint ctrh "$SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link "$NG_LINKS") went unlisted due to being found in the exceptions file."
  fi
  if (( SKIP_EXPECT_EI > 0 )); then
    valPrint ctrh "$SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link "$EI_LINKS") went unlisted due to being found in the exceptions file."
  fi
  if (( SKIP_EXPECT_IW > 0 )); then
    # IW links are the potential *interwiki* links (EI links are the intrawiki ones)
    valPrint ctrh "$SKIP_EXPECT_IW/$IW_LINKS potential interwiki $(pluralCheckNoun link "$IW_LINKS") went unlisted due to being found in the exceptions file."
  fi
  valPrint trh "ValExtLinks says goodbye."
  printRTFfooter
  printHTMfooter

  # Upload report if this was requested, but not when the session was canceled
  if [[ -n "$UPLOAD_INFO" && "$FINISHED_LIST" != "no" ]]; then
    uploadReport
  fi

  # Really quit now
  valPrint c "ValExtLinks says goodbye."
  exit 0
}
trap wrapupAndExit INT
476
477
### INITIALIZATION ###
# Print opening message to console and log files
valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
printTXTheader
printRTFheader
printHTMheader

# Attempt to download file at LINKS_URL, then check that it succeeded. 'curl' is run with --fail so
# that an HTTP error page is not saved in place of the links file, and its exit status is checked
# in addition to the presence of the downloaded file.
valPrint t "Config:"
valPrint r "\b1 Config \b0"
valPrint hn "<h3>Config</h3>"
valPrint cwtrh "Downloading list of external links from $LINKS_URL."
LINKS_FILE_NAME=${LINKS_URL##*/} # everything after the last slash
LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
if ! curl --silent --fail -o "$LINKS_FILE" "$LINKS_URL" || [[ ! -f "$LINKS_FILE" ]]; then
  echo "The download of $LINKS_URL appears to have failed. Aborting."
  wrapupAndExit
fi

# Attempt to download file at EXCEPT_URL, then check that it succeeded
if [[ -n "$EXCEPT_URL" ]]; then
  valPrint cwtrh "Downloading list of reporting exceptions from $EXCEPT_URL."
  EXCEPT_FILE_NAME=${EXCEPT_URL##*/} # everything after the last slash
  EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
  if ! curl --silent --fail -o "$EXCEPT_FILE" "$EXCEPT_URL" || [[ ! -f "$EXCEPT_FILE" ]]; then
    echo "The download of $EXCEPT_URL appears to have failed. Aborting."
    wrapupAndExit
  fi
fi
509
# Count the non-empty lines in LINKS_FILE. These are the lines that the main loop iterates over:
# it word-splits the file's contents on newlines, which drops blank lines and still includes a final
# line that lacks a trailing newline.
LINK_COUNT_STRING=$(grep -c . "$LINKS_FILE")

# Number of URLs is number of lines minus one (first line is column header row for the CSV)
LINK_COUNT=$(( ${LINK_COUNT_STRING:-0} - 1 ))
if (( LINK_COUNT < 0 )); then
  LINK_COUNT=0 # empty or unreadable file
fi

# Calculate number of URLs to consider
if (( URL_LIMIT != 0 && URL_LIMIT < LINK_COUNT )); then
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((URL_LIMIT-URL_START+1)) of them, from link $URL_START to $URL_LIMIT."
elif (( URL_START != 1 )); then
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((LINK_COUNT-URL_START+1)) of them, from link $URL_START to $LINK_COUNT."
else
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering all of them."
fi
525
# Print settings to console and log. The message is assembled word by word in SETTINGS_MSG; each
# phrase that depends on a setting is appended in the form that matches that setting.
declare -a SETTINGS_MSG=(I will be checking the response code of each link)
if [ $TAKE_PAGE_SHOT -eq 0 ]; then SETTINGS_MSG+=("but will not"); else SETTINGS_MSG+=("and will"); fi
SETTINGS_MSG+=(take a screenshot of each page. Pages that are OK will)
if [ $RECORD_OK_LINKS -eq 0 ]; then SETTINGS_MSG+=("not"); else SETTINGS_MSG+=("also"); fi
SETTINGS_MSG+=(be logged. I)
if [ $SUGGEST_SNAPSHOTS -eq 0 ]; then SETTINGS_MSG+=("will not"); else SETTINGS_MSG+=("will"); fi
SETTINGS_MSG+=(ask the Internet Archive for a suggested snapshot URL for each NG page.)
# The final sentence is left empty when no exceptions file was given or when OK links are recorded
if [ -z $EXCEPT_URL ] || [ $RECORD_OK_LINKS -eq 1 ]; then
  SETTINGS_MSG+=("")
else
  SETTINGS_MSG+=("I will not report links that are listed in the exceptions file.")
fi
SETTINGS_STR=${SETTINGS_MSG[@]}
valPrint ctrh "$SETTINGS_STR"
valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
valPrint trh ""
537
# Print legend to logs: a section heading, then one entry per status code or report term. Entries
# that contain a link are written once per log format (plain URL, RTF field, HTML anchor).
valPrint t "Legend:"
valPrint r "\b1 Legend \b0"
valPrint hn "<h3>Legend</h3>"
for LEGEND_ENTRY in \
  "OK = URL seems to be working." \
  "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it, because false negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to Iritscen. An NG link should be followed by a link to the Internet Archive's Wayback Machine which may help you repair the link. If the link cannot be repaired, you can disable it on the wiki (which prevents it from showing up in future ValExtLinks reports) by wrapping it in nowiki tags." \
  "RD = The server responding to this URL is saying that the page moved and you should instead use the supplied new URL. Some RD links represent minor adjustments in the organization of a web site, and some are soft 404s (the file/page has been removed and you are being redirected to something like the main page of the web site). You will have to look at the new URL yourself to determine if it represents an OK link and the link on the wiki should be updated to this one, or if the desired file/page is actually gone and we need to replace the wiki link with an Internet Archive snapshot link -- or disable the URL if it has not been archived." \
  "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup." \
  "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup."; do
  valPrint trh "$LEGEND_ENTRY"
done

# Shared beginnings of the two entries that link to a code reference
LEGEND_HTTP="(xxx) = Unix tool 'curl' obtained this HTTP response status code"
LEGEND_CURL="(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code"
valPrint t "$LEGEND_HTTP (see here for code reference: $HTTP_CODES)."
valPrint r "$LEGEND_HTTP (see {\field{\*\fldinst{HYPERLINK \"$HTTP_CODES\"}}{\fldrslt here}} for code reference)."
valPrint h "$LEGEND_HTTP (see <a href=\"$HTTP_CODES\" target=\"_blank\">here</a> for code reference)."
valPrint t "$LEGEND_CURL (see here for code reference: $CURL_CODES)."
valPrint r "$LEGEND_CURL (see {\field{\*\fldinst{HYPERLINK \"$CURL_CODES\"}}{\fldrslt here}} for code reference)."
valPrint h "$LEGEND_CURL (see <a href=\"$CURL_CODES\" target=\"_blank\">here</a> for code reference)."

for LEGEND_ENTRY in \
  "IA suggests = Last available snapshot suggested by the Internet Archive." \
  "Try browsing = The Archive occasionally fails to return a snapshot URL even when one exists, so you will need to check for a snapshot manually using this link to the Wayback Machine before concluding that a site has not been archived." \
  ""; do
  valPrint trh "$LEGEND_ENTRY"
done
unset LEGEND_ENTRY LEGEND_HTTP LEGEND_CURL
556
557
558### MAIN LOOP ###
559valPrint t "Links:"
560valPrint r "\b1 Links \b0"
561valPrint hn "<h3>Links</h3>"
562START_RUN=$(date +%s)
563# Process each line of the .csv in LINKS_FILE
564for LINE in `cat "$LINKS_FILE"`; do
565 let LINK_NUM+=1
566
567 # First line is the column header row for the CSV, so let's verify that the format hasn't changed
568 if [ $SKIPPED_HEADER_ROW -eq 0 ]; then
569 if [ $LINE == "namespace,title,target" ]; then
570 SKIPPED_HEADER_ROW=1
571 LINK_NUM=0 # this line is it's not a link, so reset the link counter
572 valPrint hn "<table>"
573 continue
574 else
575 valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
576 wrapupAndExit
577 fi
578 fi
579
580 # Skip this link if we are not at URL_START yet
581 if [ $LINK_NUM -lt $URL_START ]; then
582 continue
583 fi
584
585 # Stop if we are at the limit declared for testing purposes
586 if [ $URL_LIMIT -gt 0 ] && [ $LINK_NUM -gt $URL_LIMIT ]; then
587 FINISHED_LIST="limit"
588 wrapupAndExit
589 fi
590
591 # Print progress to screen
592 if [ $LINK_NUM -gt 1 ]; then
593 printf "\e[1A\n" # erase previous progress message so that new one appears in its place
594 fi
595 valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
596
597 # The number of the namespace is the element before the first comma on the line
598 NS_ID=${LINE%%,*}
599
600 # Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
601 NS_NAME=""
602 a=0
603 while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
604 if [ $NS_ID == "NULL" ]; then
605 break
606 elif [ $NS_ID -eq ${NS_IDS[$a]} ]; then
607 NS_NAME="${NS_NAMES[$a]}"
608 break
609 fi
610 let a+=1
611 done
612 if [ "$NS_NAME" == "" ]; then
613 if [ $NS_ID == "NULL" ]; then
614 valPrint trs "Skipping URL on line $LINK_NUM because the namespace (and probably the page too) is \"NULL\". Probably the link is no longer in existence on the wiki."
615 else
616 valPrint trs "Skipping URL on line $LINK_NUM found on page $PAGE_NAME because I could not find a name for namespace ID $NS_ID."
617 fi
618 let SKIP_UNK_NS+=1
619 continue
620 fi
621
622 # The name of the page is everything between the namespace ID and the next comma on the line (commas
623 # in page names will break this)
624 PAGE_NAME=${LINE#$NS_ID,}
625 PAGE_NAME=${PAGE_NAME%%,*}
626
627 # We don't want to consider wiki pages ending in .js, as the parser cannot reliably isolate URLS in
628 # JavaScript code, so it will return erroneous links
629 PAGE_NAME_SUFFIX=$(echo $PAGE_NAME | sed 's/.*\.//')
630 if [ $PAGE_NAME_SUFFIX == "js" ]; then
631 valPrint trs "Skipping URL on line $LINK_NUM because it was found on JavaScript page $PAGE_NAME."
632 let SKIP_JS_PAGE+=1
633 continue
634 fi
635
636 # Build longer wiki page URLs from namespace and page names
637 FULL_PAGE_PATH=http://$WIKI_PATH/$NS_NAME:$PAGE_NAME
638 LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
639 # Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it
640 # explicitly breaks the link
641 if [ $NS_ID -eq 0 ]; then
642 FULL_PAGE_PATH=http://$WIKI_PATH/$PAGE_NAME
643 LOCAL_PAGE_PATH=$PAGE_NAME
644 fi
645
646 # The URL being linked to is everything after the previous two fields (this allows commas to be in
647 # the URLs, but a comma in the previous field, the page name, will break this)
648 URL=${LINE#$NS_ID,$PAGE_NAME,}
649
650 # Scan for illegal characters
651 if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
652 valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because it contains characters illegal in a URL."
653 let SKIP_BAD_URL+=1
654 continue
655 fi
656
657 # Now we need to know if the URL is for a file or a web page. First step is to determine if the
658 # URL ends in a suffix
659 HAS_SUFFIX=0
660
661 # If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
662 CLEAN_URL=${URL%%\?*}
663
664 # If the URL ends in something like "#section_15", strip everything from the '#' onward
665 CLEAN_URL=${CLEAN_URL%%\#*}
666
667 # 'sed' cannot handle Unicode in my Bash shell, so skip this URL and make user check it
668 if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
669 valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because I cannot handle non-ASCII characters."
670 let SKIP_NON_ASCII+=1
671 continue
672 fi
673
674 # Isolate the characters after the last period and after the last slash
675 POST_DOT=$(echo "$CLEAN_URL" | sed 's/.*\.//')
676 POST_SLASH=$(echo "$CLEAN_URL" | sed 's/.*\///')
677
678 # If the last period comes after the last slash, then the URL ends in a suffix
679 POST_DOT_LENGTH=$(echo | awk -v input=$POST_DOT '{print length(input)}')
680 POST_SLASH_LENGTH=$(echo | awk -v input=$POST_SLASH '{print length(input)}')
681 if [ $POST_DOT_LENGTH -lt $POST_SLASH_LENGTH ]; then
682 HAS_SUFFIX=1
683 else
684 HAS_SUFFIX=0
685 fi
686
# Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
# known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES. IS_FILE ends up as 1 (file),
# 0 (web page) or -1 (suffix not recognized in either array).
IS_FILE=-1
if (( HAS_SUFFIX == 0 )); then
  IS_FILE=0
else
  # Turn off case sensitivity while we compare suffixes
  shopt -s nocasematch

  # Special case: URLs ending in something like "/productID.304297400" are pages, not files, so if
  # the URL's suffix is all numbers, we are looking at the end of a web page URL
  if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
    IS_FILE=0
  fi

  # If we did not identify this URL as a web page above, we need to compare the suffix against
  # known file extensions. The right-hand side is quoted so that the suffix is compared as literal
  # text; unquoted, a suffix such as "*" or "p?f" would act as a glob and match a known extension.
  # Double brackets still respect the nocasematch setting when the operand is quoted.
  if (( IS_FILE == -1 )); then
    for EXTENSION in "${HTTP_FILES[@]}"; do
      if [[ $EXTENSION == "$POST_DOT" ]]; then
        IS_FILE=1
        break
      fi
    done
  fi

  # If we did not identify this URL as a file above, we need to compare the suffix against known
  # pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
  # needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
  if (( IS_FILE == -1 )); then
    for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
      if [[ $PAGE_OR_TLD == "$POST_DOT" ]]; then
        IS_FILE=0
        break
      fi
    done
  fi

  # Turn case sensitivity back on in Bash
  shopt -u nocasematch
fi
728
# Translate the classification into a label for the report and bump the matching counter. A suffix
# that escaped identification as a file, page or TLD is reported to the user and the URL skipped.
STR_TYPE=""
case $IS_FILE in
  -1)
    valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown URL ending $POST_DOT. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
    (( SKIP_UNK_SUFFIX += 1 ))
    continue
    ;;
  1)
    STR_TYPE="file"
    (( FILE_LINKS += 1 ))
    ;;
  0)
    STR_TYPE="page"
    (( PAGE_LINKS += 1 ))
    ;;
esac
742
# Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
# issue with sites that require HTTPS. $AGENT has to be inside double quotes: wrapped in single
# quotes it is never expanded and the literal text "$AGENT" is sent as the user-agent. $URL is
# quoted so that characters such as '?', '*' and '[' in it can never be treated as a glob.
CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{http_code}\n' "$URL")
CURL_ERR=$? # exit status of the 'curl' that ran inside the command substitution above
CURL_RESULT=$CURL_CODE

# Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
if [[ $CURL_CODE == "000" ]]; then
  CURL_RESULT="$CURL_RESULT-$CURL_ERR"
fi
753
# Begin to determine our status code for this URL (EI, IW, OK, RD, or NG)
STATUS="??"
NEW_URL=""
INTERWIKI_INDEX=-1

# First make sure that this isn't an "external internal" link to our own wiki that can be replaced
# by "[[page_name]]". If it uses a special access URL beginning with "/w/", let it pass, as it
# probably cannot be replaced by "[[ ]]" markup. The wiki path is quoted inside the patterns so
# that it is matched as literal text.
if [[ $URL == *"$WIKI_PATH"* && $URL != *"$WIKI_PATH"/w/* ]]; then
  STATUS="EI"
  (( EI_LINKS += 1 ))
fi

# If it's not, check if this is a link to a domain that we have an interwiki prefix for.
# STATUS is tested inside [[ ]] because the placeholder "??" is itself a glob: in a single-bracket
# test an unquoted $STATUS expands to any two-character filenames in the current directory.
if [[ $STATUS == "??" ]]; then
  for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
    if [[ $URL == *"${INTERWIKI_DOMAINS[$i]}"* && $URL != *"${INTERWIKI_DOMAINS[$i]}"/w/* ]]; then
      STATUS="IW"
      (( IW_LINKS += 1 ))
      INTERWIKI_INDEX=$i
      break
    fi
  done
fi
778
# If we didn't match an interwiki domain, see if the status code is in our "OK" codes list.
# The placeholder "??" is a valid glob, so STATUS is tested inside [[ ]], where no filename
# expansion takes place; in a single-bracket test an unquoted $STATUS would be replaced by any
# two-character filenames found in the current directory.
if [[ $STATUS == "??" ]]; then
  for CODE in "${OK_CODES[@]}"; do
    if [[ $CODE == "$CURL_CODE" ]]; then
      STATUS="OK"
      (( OK_LINKS += 1 ))
      break
    fi
  done
fi
789
# If we didn't get a match with the "OK" codes, check it against the "RD" codes. STATUS is tested
# inside [[ ]] because the placeholder "??" is a glob that a single-bracket test would expand.
if [[ $STATUS == "??" ]]; then
  for CODE in "${RD_CODES[@]}"; do
    if [[ $CODE == "$CURL_CODE" ]]; then
      # Get URL header again in order to retrieve the URL we are being redirected to. $AGENT must
      # be in double quotes, otherwise the literal text "$AGENT" is sent as the user-agent.
      NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{redirect_url}\n' "$URL")

      # Filter out cases where the redirect URL is just the original URL with https:// instead of
      # http://, or with an added '/' at the end. These corrections happen a lot and are not
      # important to us. The scheme is stripped from BOTH URLs whichever one it is, so that an
      # added '/' is also recognized when the original link already used https:// (or when the
      # redirect stays on http://). As a consequence a mere https:// -> http:// change is treated
      # as trivial too.
      URL_NO_PROTOCOL=${URL#http://}
      URL_NO_PROTOCOL=${URL_NO_PROTOCOL#https://}
      URL_NO_PROTOCOL=${URL_NO_PROTOCOL%/}
      NEW_URL_NO_PROTOCOL=${NEW_URL#http://}
      NEW_URL_NO_PROTOCOL=${NEW_URL_NO_PROTOCOL#https://}
      NEW_URL_NO_PROTOCOL=${NEW_URL_NO_PROTOCOL%/}

      # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
      NEW_URL_LENGTH=${#NEW_URL_NO_PROTOCOL}
      if (( NEW_URL_LENGTH < MIN_URL_LENGTH )); then
        NEW_URL_NO_PROTOCOL="[new URL not retrieved]"
      fi

      # If the URLs match after the above filters were applied, then the link is OK. Both sides
      # are compared as literal strings; the placeholder above starts with '[' and URLs may hold
      # '?' or '*', all of which would be glob syntax in an unquoted single-bracket test.
      if [[ $URL_NO_PROTOCOL == "$NEW_URL_NO_PROTOCOL" ]]; then
        STATUS="OK"
        (( OK_LINKS += 1 ))
      else
        STATUS="RD"
        (( RD_LINKS += 1 ))
      fi
      break
    fi
  done
fi
823
# If we didn't get a match with the "RD" codes, check it against the "NG" codes. STATUS is tested
# inside [[ ]] because the placeholder "??" is a glob: in a single-bracket test an unquoted $STATUS
# would expand to any two-character filenames in the current directory.
if [[ $STATUS == "??" ]]; then
  for CODE in "${NG_CODES[@]}"; do
    if [[ $CODE == "$CURL_CODE" ]]; then
      STATUS="NG"
      (( NG_LINKS += 1 ))
      break
    fi
  done
fi

# If we didn't match a known status code, advise the reader and move on to the next URL
if [[ $STATUS == "??" ]]; then
  valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown return code $CURL_CODE."
  (( SKIP_UNK_CODE += 1 ))
  continue
fi
841
# Check problem links against exceptions file before proceeding
if [[ $STATUS != "OK" && -n $EXCEPT_URL ]]; then
  # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
  EXPECT_CODE="$CURL_RESULT"
  if [[ $STATUS == "EI" ]]; then
    EXPECT_CODE="EI"
  elif [[ $STATUS == "IW" ]]; then
    EXPECT_CODE="IW"
  fi

  # Look for link in exceptions file and make sure its listed result code and wiki page also match.
  # -F makes 'grep' search for the URL as plain text; as a regular expression, characters such as
  # '.', '[' and '*' in the URL could match a different line or fail to match the right one. The
  # "--" keeps a URL that happens to start with '-' from being read as an option.
  GREP_RESULT=$(grep -F --max-count=1 -- "$URL" "$EXCEPT_FILE")

  # The wiki page is whatever follows the last comma of the matching line, and the result code is
  # whatever precedes its first comma
  EXCEPT_PAGE=${GREP_RESULT##*,}
  if [[ $EXCEPT_PAGE == "*" || $EXCEPT_PAGE == "$LOCAL_PAGE_PATH" ]]; then
    EXCEPT_CODE=${GREP_RESULT%%,*}
    if [[ $EXCEPT_CODE == "$EXPECT_CODE" ]]; then
      valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because its expected result, $EXPECT_CODE, is listed in the exceptions file."
      if [[ $STATUS == "EI" ]]; then
        (( SKIP_EXPECT_EI += 1 ))
      elif [[ $STATUS == "IW" ]]; then
        (( SKIP_EXPECT_IW += 1 ))
      else
        (( SKIP_EXPECT_NG += 1 ))
      fi
      continue
    fi
  fi
fi
870
# If appropriate, record this link to the log, with clickable URLs when possible. Each message is
# passed to valPrint, whose first argument selects the report format(s) it is written to.
if [[ $STATUS != "OK" || $RECORD_OK_LINKS -eq 1 ]]; then
  # Stupid hack since the strings "IW" and "EI" are narrower than "OK", "RD", or "NG" and it takes
  # an extra tab to get to the desired level of indentation in the RTF log. The tabs are spelled
  # out as $'\t' so that they stay visible and survive whitespace-mangling editors.
  # NOTE(review): the counts (one tab, two for IW/EI) are inferred from the comment above --
  # confirm against the RTF output.
  RTF_TABS=$'\t'
  if [[ $STATUS == "IW" || $STATUS == "EI" ]]; then
    RTF_TABS=$'\t\t'
  fi

  # Record link and its wiki page in TXT, RTF, and HTML markup
  valPrint t "$STATUS ($CURL_RESULT) $STR_TYPE $URL"
  valPrint t " linked from $FULL_PAGE_PATH"
  valPrint r "$STATUS ($CURL_RESULT)$RTF_TABS$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
  valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
  valPrint hn "<tr><td style=\"white-space:nowrap\">$STATUS ($CURL_RESULT)</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
  valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"

  # Record redirect URL if one was given by a 3xx response page
  if [[ $STATUS == "RD" ]]; then
    valPrint ts " Server suggests $NEW_URL"
    valPrint rs " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
    valPrint hs "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
  fi

  # Notify reader if we can use an intrawiki link for this URL; the page name is what follows the
  # first slash after the host
  if [[ $STATUS == "EI" ]]; then
    INTRA_PAGE=${URL#*://*/}
    valPrint ts " Just use [[$INTRA_PAGE]]"
    valPrint rs " Just use [[$INTRA_PAGE]]"
    valPrint hs "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
  fi

  # Notify reader if we can use an interwiki prefix for this URL; the page name is what follows
  # the last slash
  if [[ $STATUS == "IW" ]]; then
    INTER_PAGE=${URL##*/}
    valPrint ts " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
    valPrint rs " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
    valPrint hs "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
  fi

  # Query Internet Archive for latest "OK" snapshot for "NG" page
  if [[ $STATUS == "NG" && $SUGGEST_SNAPSHOTS -eq 1 ]]; then
    # The link is handed to 'curl' through --data-urlencode so that a '&', '#' or '?' inside it
    # stays part of the "url" parameter and is not read by the API as further parameters or as
    # a fragment. --get turns the data into the query string of a GET request, and
    # ARCHIVE_OK_CODES, already a query fragment, is appended unchanged after an '&'.
    ARCHIVE_QUERY=$(curl --silent --max-time 10 --get --data-urlencode "url=$URL" --data "$ARCHIVE_OK_CODES" "$ARCHIVE_API")

    # If a "closest" snapshot was received...
    if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
      # In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
      ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')

      # ...isolate "url" property in the response that follows the "closest" tag
      SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
      SNAPSHOT_URL=${SNAPSHOT_URL#*\"url\": \"} # everything after '"url": "'
      SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'

      # Inform the user of the snapshot URL
      valPrint ts " IA suggests $SNAPSHOT_URL"
      valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
      valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
    else # ...otherwise give generic Wayback Machine link for this URL
      valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
      valPrint rs " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
      valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
    fi
  fi
fi
936
# If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
if [[ $IS_FILE -eq 0 && $TAKE_PAGE_SHOT -eq 1 && $STATUS == "OK" ]]; then
  # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
  SHOT_NAME=$(printf '%s\n' "$URL" | sed -e 's/https*\:\/\///' -e 'y/:\//__/')
  SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"

  # Don't take screenshot if we already encountered this page and screenshotted it
  if [[ ! -f $SHOT_FILE ]]; then
    # $URL is quoted so that '?', '*' or '[' in it reach Chrome verbatim and are never treated
    # as a glob. Chrome's own output is discarded; success is judged by whether the file named
    # by CHROME_SCREENSHOT shows up in WORKING_DIR.
    "$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 "$URL" > /dev/null 2>&1
    if [[ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]]; then
      # -n: never overwrite an existing screenshot
      mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
    else
      valPrint trhs "Screenshot of URL $URL seems to have failed!"
    fi
  else
    valPrint trhs "Skipping screenshot of URL $URL because $SHOT_FILE already exists."
  fi
fi
955done
# The loop over the links has ended: set FINISHED_LIST to "yes" and hand control to wrapupAndExit,
# which is defined elsewhere in this script (presumably it reads FINISHED_LIST, prints the summary
# and ends the run -- confirm against its definition)
956FINISHED_LIST="yes"
957wrapupAndExit
Note: See TracBrowser for help on using the repository browser.