source: Validate External Links/validate_external_links.sh@ 1073

Last change on this file since 1073 was 1070, checked in by iritscen, 7 years ago

ValExtLinks improvements:

  • Now advises reader of external internal links.
  • The exceptions file now allows finer-grained exemption of a URL by matching to the specific page that contains it instead of exempting all occurrences of that URL (but the '*' wildcard will match all containing pages). Currently you can only list a URL once, however.
  • The exceptions file now allows external internal and potential intrawiki links to be exempted from the report.
  • The path to Google Chrome (for taking screenshots) is now external to the script, supplied as an argument after "--take-screenshots".
  • All of OniGalore's interwiki shortcuts are now recognized.
  • Protection against failed retrieval of redirect URL.
  • Better recognition of unimportant redirects (http->https, added ending slash).
File size: 40.6 KB
Line 
1#!/bin/bash
2
3# Validate External Links by Iritscen
4# Provided with a list of external links found in the OniGalore wiki, this script validates them.
5# The resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF
6# (for reading as a local file with clickable links), and HTML (for uploading as a web page).
7# Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
8# Recommended rule:
9# ------------------------------------------------------------------------------------------------------
10
# Set separator token to newline, so that unquoted expansions are only ever split on line breaks
IFS=$'\n'

### GLOBALS ###
# Settings -- these will be changed from their defaults by the arguments passed in to the script
LINKS_URL=""        # use 'curl' to download file with links from this location (can be file://)
EXCEPT_URL=""       # ditto above for file with exceptions to NG results
OUTPUT_DIR=""       # place reports and all other output in a folder inside this existing folder
RECORD_OK_LINKS=0   # record response code to the log even when it's a value in OK_CODES
SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
TAKE_PAGE_SHOT=0    # take a screenshot of each OK page
CHROME_PATH=""      # path to a copy of Google Chrome that has the command-line screenshot feature
URL_START=1         # start at this URL in LINKS_FILE (1 by default)
URL_LIMIT=0         # if non-zero, stop at this URL in LINKS_FILE
UPLOAD_INFO=""      # path to a file on your hard drive with the login info needed to upload a report

# Fixed strings -- see the occurrences of these variables to learn their purpose
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0"
ARCHIVE_API="http://archive.org/wayback/available"
ARCHIVE_GENERIC="https://web.archive.org/web/*"
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
CHROME_SCREENSHOT="screenshot.png"
CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt"
EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt"
MY_WIKI_PAGE="http://wiki.oni2.net/User:Iritscen"
# Directory holding this script. The path is quoted so that glob characters in it are never expanded,
# and '&&' leaves THIS_DIR empty if 'cd' fails instead of silently recording the caller's directory.
THIS_DIR=$(cd "$(dirname "$0")" && pwd)
WORKING_DIR=$(pwd)
WIKI_PATH="wiki.oni2.net"
41
# Namespace lookup tables for OniGalore: the entry at a given index of NS_IDS is the numeric ID of the
# namespace whose name sits at the same index of NS_NAMES
NS_IDS=(
  -2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  100 101 102 103 104 105
  108 109 110 111
)
NS_NAMES=(
  "Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File"
  "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk"
  "Category" "Category_talk"
  "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk"
  "Oni2" "Oni2_talk" "XML" "XML_talk"
)

# URL endings that denote downloadable files, and URL endings that denote web pages (or are simply
# top-level domains). The distinction decides whether the script tries to screenshot the URL or only
# fetches its HTTP code.
HTTP_FILES=(
  txt zip wmv jpg png m4a bsl rar oni mp3 mov ONWC vbs TRMA mp4 doc avi log gif pdf dmg exe cpp tga
  7z wav east BINA xml dll dae xaf fbx 3ds blend flv csv
)
HTTP_TLDS_AND_PAGES=(
  com net org uk ru de it htm html php pl asp aspx shtml pgi cgi php3 x jsp phtml cfm css action stm
  js
)

# HTTP response codes sorted into OK (good), RD (redirection) and NG (no good). Pages answering with
# an OK code get screenshotted. Remember to update http_codes.txt when adding a code here.
OK_CODES=(200 401 405 406 501)
RD_CODES=(301 302 303 307 308)
NG_CODES=(000 403 404 410 500 503)

# Characters that may not appear in a URL. Curly braces are sometimes used on the wiki to build a
# link from transcluded text, and they end up in the URL when the transclusion fails.
ILLEGAL_CHARS="{ }"

# Length of the shortest URL possible (http://a.co), used for sanity-checking some URLs
MIN_URL_LENGTH=11

# Interwiki lookup tables: the prefix at a given index of INTERWIKI_PREFIXES can be used on the wiki
# in place of a normal external link to the domain at the same index of INTERWIKI_DOMAINS
INTERWIKI_PREFIXES=(
  commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies
  wikiversity wikivoyage wikt wp
)
INTERWIKI_DOMAINS=(
  commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org
  wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org
  wikiversity.org wikivoyage.org wiktionary.org wikipedia.org
)

# Main loop progress and tallies of findings
LINK_NUM=0
EI_LINKS=0 IW_LINKS=0 OK_LINKS=0 RD_LINKS=0 NG_LINKS=0
SKIP_UNK_NS=0 SKIP_JS_PAGE=0 SKIP_BAD_URL=0 SKIP_NON_ASCII=0 SKIP_UNK_SUFFIX=0 SKIP_UNK_CODE=0
SKIP_EXPECT_NG=0 SKIP_EXPECT_EI=0 SKIP_EXPECT_IW=0
FILE_LINKS=0 PAGE_LINKS=0
SKIPPED_HEADER_ROW=0
FINISHED_LIST="no"
90
91
### HELP ###
# Prints a pseudo-man page to stdout. The here-document delimiter is quoted so that the page is
# emitted verbatim, with no parameter or command expansion. Body text is indented by seven columns
# and option descriptions start at column 31, which keeps every line within the 80-character rule:
# 234567890123456789012345678901234567890123456789012345678901234567890123456789
function printHelp()
{
  cat << 'EOF'

NAME
       Validate External Links

SYNOPSIS
       validate_external_links.sh --help
       validate_external_links.sh --links URL --output DIR [--exceptions URL]
          [--record-ok-links] [--suggest-snapshots] [--take-screenshots DIR]
          [--start-url NUM] [--end-url NUM] [--upload FILE]

DESCRIPTION
       This script parses a list of external links found in the OniGalore wiki
       (which is dumped by the Oni2.net domain periodically in a particular
       format), validates them using the Unix tool 'curl', and produces a report
       of which links were "OK" (responded positively to an HTTP query), which
       were "RD" (responded with a 3xx redirect code), which could be "IW"
       (interwiki) links, which are "EI" (external internal) links and could be
       intrawiki links, and which were "NG" (no good; a negative response to the
       query). This report can then be automatically uploaded to the location of
       your choice. The script can also suggest Internet Archive snapshots for
       "NG" links, and take screenshots of "OK" links for visual verification by
       the reader that the page in question is the one intended to be displayed.

       You must pass this script the URL at which the list of links is found
       (--links) and the path where the directory of logs should be outputted
       (--output). All other arguments are optional.

OPTIONS
       --help                 Show this page.
       --links URL            (required) URL from which to download the CSV file
                              with external links. Note that this URL can be a
                              local file if you supply a file:// path.
       --output DIR           (required) Place the folder which will contain the
                              reports and optional screenshots at this (Unix-
                              format) path.
       --exceptions URL       In order to remove links from the report which Val
                              finds an issue with, but which you regard as OK,
                              list those desired exceptions in this file. See
                              the sample file exceptions.txt for details. Note
                              that this text file can be a local file if you
                              supply a file:// path.
       --record-ok-links      Log a link in the report even if its response code
                              is "OK".
       --suggest-snapshots    Query the Internet Archive for a possible snapshot
                              URL for each "NG" page.
       --take-screenshots DIR Use the copy of Google Chrome at this path to take
                              screenshots of each "OK" page.
       --start-url NUM        Start at this link in the link dump CSV file.
       --end-url NUM          Stop at this link in the link dump CSV file.
       --upload FILE          Upload report using the credentials in this local
                              text file. See sftp_login.txt for example.

BUGS
       The script cannot properly parse any line in the external links file
       which contains a comma in the name of the wiki page containing a link.
       Commas in the link itself are not an issue.
EOF
}
156
157
### SETUP ###
# If first argument is a help request, or if nothing was passed in at all, print help page and quit
if [ "$#" -eq 0 ] || [ "$1" == "--help" ]; then
  printHelp | less
  exit 0
fi

# Parse arguments as long as there are more arguments to process
while (( $# )); do
  # Options that take a value must be followed by one. Without this check, 'shift 2' on a trailing
  # option would fail without shifting anything and this loop would never terminate.
  case "$1" in
    --links | --exceptions | --output | --take-screenshots | --start-url | --end-url | --upload )
      if [ "$#" -lt 2 ]; then
        echo "Argument $1 requires a value. Aborting."
        exit 1
      fi;;
  esac

  case "$1" in
    --links )             LINKS_URL="$2";                      shift 2;;
    --exceptions )        EXCEPT_URL="$2";                     shift 2;;
    --output )            OUTPUT_DIR="$2";                     shift 2;;
    --record-ok-links )   RECORD_OK_LINKS=1;                   shift;;
    --suggest-snapshots ) SUGGEST_SNAPSHOTS=1;                 shift;;
    --take-screenshots )  TAKE_PAGE_SHOT=1; CHROME_PATH="$2";  shift 2;;
    --start-url )         URL_START="$2";                      shift 2;;
    --end-url )           URL_LIMIT="$2";                      shift 2;;
    --upload )            UPLOAD_INFO="$2";                    shift 2;;
    * )                   echo "Invalid argument $1 detected. Aborting."; exit 1;;
  esac
done

# If the required arguments were not supplied, tell the user where to find the documentation and quit
if [ -z "$LINKS_URL" ] || [ -z "$OUTPUT_DIR" ]; then
  echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
  exit 2
fi
186
# If user wants screenshots, make sure path to Chrome was passed in and is valid
if [[ "$TAKE_PAGE_SHOT" -eq 1 && ! -f "$CHROME_PATH" ]]; then
  echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
  exit 3
fi

# Check that UPLOAD_INFO exists, if this argument was supplied. The variable is quoted so that the
# test does not depend on how '[' treats a missing operand, and so that glob characters in the path
# are not expanded.
if [[ -n "$UPLOAD_INFO" && ! -f "$UPLOAD_INFO" ]]; then
  echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
  exit 4
fi

# Check that OUTPUT_DIR is a directory
if [[ ! -d "$OUTPUT_DIR" ]]; then
  echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
  exit 5
fi
206
# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
# Matches the format of LINKS_DATE, which is taken from an HTTP "Last-Modified" header; HTTP dates
# use a zero-padded hour, hence %H rather than the space-padded %k
NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %H:%M:%S GMT")
OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
SHOT_PATH="$OUTPUT_PATH/Screenshots"
LOG_NAME="ValExtLinks report"
LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt"
LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf"
LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm"
mkdir "$OUTPUT_PATH"

# Check that 'mkdir' succeeded before trying to create anything inside the new folder
if [ ! -d "$OUTPUT_PATH" ]; then
  echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_DIR. Aborting."
  exit 6
fi
if [ "$TAKE_PAGE_SHOT" -eq 1 ]; then
  mkdir "$SHOT_PATH"
fi

# Get date on the file at LINKS_URL. Header names are case-insensitive, so match without regard to
# case, keep only the first match, and drop the carriage return that ends each header line.
LINKS_DATE=$(curl --silent --head "$LINKS_URL" | grep -i -m 1 "^Last-Modified:" | tr -d '\r')
if [ -z "$LINKS_DATE" ]; then
  echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
  exit 7
fi
# Strip the header name, then whatever whitespace separates it from the date itself
LINKS_DATE=${LINKS_DATE#*:}
LINKS_DATE=${LINKS_DATE#"${LINKS_DATE%%[![:space:]]*}"}
235
236
237### UTILITY FUNCTIONS ###
# Sends the report title, the generation time, the date of the link data, the author credit and a
# closing blank line to the TXT log, one valPrint call per line
printTXTheader() {
  local header_line
  for header_line in \
    "Validate External Links report" \
    "generated $NICE_TIME" \
    "from data of $LINKS_DATE" \
    "script by Iritscen (contact: $MY_WIKI_PAGE)" \
    ""; do
    valPrint t "$header_line"
  done
}
247
# Sends the opening of the RTF document to the RTF log: the font and color tables, a centered
# paragraph holding the report title, timestamps and a "contact" hyperlink to MY_WIKI_PAGE, and
# finally a second paragraph definition without the centering (\qc) keyword for the text that follows
printRTFheader() {
  local rtf_preamble="{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww12600\viewh12100\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0

\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
generated $NICE_TIME\\
from data of $LINKS_DATE\\
script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$MY_WIKI_PAGE\"}}{\fldrslt contact}})
\\
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
\cf0 "
  valPrint r "$rtf_preamble"
}
266
# Ends the RTF log by writing the brace that closes the group opened by the RTF header
printRTFfooter() {
  valPrint r "}"
}
272
# Sends the opening of the HTML document to the HTML log: the <head> with the page title, then the
# report title, timestamps and a "contact" link to MY_WIKI_PAGE as headings at the top of the <body>
printHTMheader() {
  local html_preamble="<html>
<head>
<title>Validate External Links report</title>
</head>
<body>
<h2>Validate External Links report</h2>
<h3>generated $NICE_TIME<br />
from data of $LINKS_DATE<br />
script by Iritscen (<a href=\"$MY_WIKI_PAGE\" target=\"_blank\">contact</a>)</h3>"
  valPrint h "$html_preamble"
}
286
# Ends the HTML log by writing the closing </body> and </html> tags
printHTMfooter() {
  local html_closing="</body>
</html>"
  valPrint h "$html_closing"
}
293
# The central logging function. The first parameter is a string composed of one or more characters that
# indicate which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
# 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 'w' means "Don't
# pass console output through 'fmt'" ("fmt" fits the output to an 80-column CLI but can break special
# formatting and the 'n' option). The second parameter is the message to print.
# For the RTF and HTML logs, 'n' suppresses the markup line break ("\" and "<br />" respectively)
# rather than the newline character, so markup such as "<table>" can be written without a break.
# Messages are written with 'printf' instead of 'echo' so that a message which happens to look like
# an 'echo' option (such as "-n") is still printed literally.
function valPrint()
{
  local targets=$1
  local msg=$2
  local no_break=0
  if [[ $targets == *n* ]]; then
    no_break=1
  fi

  if [[ $targets == *c* ]]; then
    if (( no_break )); then
      printf '%s' "$msg"
    elif [[ $targets == *w* ]]; then
      printf '%s\n' "$msg"
    else
      printf '%s\n' "$msg" | fmt -w 80
    fi
  fi
  if [[ $targets == *t* ]]; then
    if (( no_break )); then
      printf '%s' "$msg" >> "$LOG_TXT"
    else
      printf '%s\n' "$msg" >> "$LOG_TXT"
    fi
  fi
  if [[ $targets == *r* ]]; then
    if (( no_break )); then
      printf '%s\n' "$msg" >> "$LOG_RTF"
    else
      # A trailing backslash is a line break in RTF
      printf '%s\\\n' "$msg" >> "$LOG_RTF"
    fi
  fi
  if [[ $targets == *h* ]]; then
    if (( no_break )); then
      printf '%s\n' "$msg" >> "$LOG_HTM"
    else
      printf '%s<br />\n' "$msg" >> "$LOG_HTM"
    fi
  fi
}
332
# Pluralize the string in parameter 1 if the number in parameter 2 is not 1. Nouns ending in "x" get
# "es" appended and all others get "s". A missing or empty count is treated as 0 (so the plural is
# printed) instead of producing a '[' syntax error, and the noun is quoted so that it is never
# subjected to pathname expansion.
function pluralCheckNoun()
{
  local noun=$1
  local count=${2:-0}

  if [[ "$count" -eq 1 ]]; then
    printf '%s\n' "$noun"
  elif [[ "$noun" == *x ]]; then
    printf '%s\n' "${noun}es"
  else
    printf '%s\n' "${noun}s"
  fi
}
346
# Output "is" if parameter 1 is 1, otherwise "are". A missing or empty parameter is treated as 0
# instead of producing a '[' syntax error.
function pluralCheckIs()
{
  if [[ "${1:-0}" -eq 1 ]]; then
    echo "is"
  else
    echo "are"
  fi
}
356
# Output "was" if parameter 1 is 1, otherwise "were". A missing or empty parameter is treated as 0
# instead of producing a '[' syntax error.
function pluralCheckWas()
{
  if [[ "${1:-0}" -eq 1 ]]; then
    echo "was"
  else
    echo "were"
  fi
}
366
# Output "a " if parameter 1 is 1, otherwise nothing. A missing or empty parameter is treated as 0
# instead of producing a '[' syntax error.
function pluralCheckA()
{
  if [[ "${1:-0}" -eq 1 ]]; then
    echo "a "
  fi
}
374
# Output "an " if parameter 1 is 1, otherwise nothing. A missing or empty parameter is treated as 0
# instead of producing a '[' syntax error.
function pluralCheckAn()
{
  if [[ "${1:-0}" -eq 1 ]]; then
    echo "an "
  fi
}
382
# Prints the text that follows the marker in parameter 1 (e.g. "user:") on the first line of the
# UPLOAD_INFO file that begins with that marker. Prints an empty line if no line begins with it.
function getUploadField()
{
  local marker=$1
  local line
  line=$(grep -m 1 -- "^$marker" "$UPLOAD_INFO")
  printf '%s\n' "${line#"$marker"}"
}

# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
# reports being saved to disk have already been closed.
# The login details are read from the lines of UPLOAD_INFO that begin with "user:", "pw:", "port:"
# and "path:", and are handed to the 'expect' script as separate, quoted arguments so that an empty
# value keeps its position and a value containing glob characters is passed through unchanged.
function uploadReport()
{
  valPrint c "Uploading HTML report..."

  local SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
  local SFTP_USER_NAME SFTP_PASSWORD SFTP_PORT SFTP_PATH
  SFTP_USER_NAME=$(getUploadField "user:")
  SFTP_PASSWORD=$(getUploadField "pw:")
  SFTP_PORT=$(getUploadField "port:")
  SFTP_PATH=$(getUploadField "path:")

  # NOTE(review): the password travels as a command-line argument and is therefore visible in the
  # process list while 'expect' runs; changing that would require changing the expect script too
  expect "$SCRIPT_PATH" "$LOG_HTM" "$SFTP_USER_NAME" "$SFTP_PASSWORD" "$SFTP_PORT" "$SFTP_PATH" "$LOG_NAME.htm"

  valPrint c "Report was uploaded, unless an error message appears above."
}
407
# Prints session summary when script is done: closes the HTML table, writes the tallies kept by the
# main loop to the console and all three logs, closes the RTF and HTML markup, uploads the HTML report
# if --upload was given and the session was not canceled, and exits with status 0. Reads FINISHED_LIST
# to tell a finished list ("yes") from a user cancellation ("no") or any other early stop. It is also
# installed as the handler for SIGINT (Ctrl-C) by the 'trap' line that follows it.
function wrapupAndExit()
{
  # Get off progress line on console, drop down a line from last link in log, and close HTML table
  valPrint ctr ""
  valPrint h "</table><br />"

  # If we didn't finish processing the last URL, then the iterator is one too high
  if [ "$FINISHED_LIST" != "yes" ]; then
    LINK_NUM=$((LINK_NUM - 1))
    if [ "$FINISHED_LIST" == "no" ]; then
      valPrint ctrh "The session was canceled by the user."
    fi
  fi

  # Output results of session and close the log file's markup
  LINKS_PROCESSED=$((LINK_NUM - URL_START + 1))
  LINKS_SKIPPED=$((SKIP_UNK_NS + SKIP_JS_PAGE + SKIP_BAD_URL + SKIP_NON_ASCII + SKIP_UNK_SUFFIX + SKIP_UNK_CODE))
  LINKS_CHECKED=$((LINKS_PROCESSED - LINKS_SKIPPED))
  valPrint ct "Summary:"
  valPrint r "\b1 Summary \b0"
  valPrint hn "<h3><span id=\"summary\">Summary</span></h3>"
  valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link "$LINK_COUNT")."
  valPrint ctrh "I skipped $LINKS_SKIPPED $(pluralCheckNoun link "$LINKS_SKIPPED"), and found $FILE_LINKS $(pluralCheckNoun file "$FILE_LINKS") and $PAGE_LINKS $(pluralCheckNoun page "$PAGE_LINKS")."
  if [ "$LINKS_SKIPPED" -gt 0 ]; then valPrint ctrh "Skip breakdown: "; fi
  if [ "$SKIP_UNK_NS" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS unknown $(pluralCheckNoun namespace "$SKIP_UNK_NS")"; fi
  if [ "$SKIP_JS_PAGE" -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link "$SKIP_JS_PAGE") on $(pluralCheckA "$SKIP_JS_PAGE")JavaScript $(pluralCheckNoun page "$SKIP_JS_PAGE")"; fi
  if [ "$SKIP_BAD_URL" -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL "$SKIP_BAD_URL")"; fi
  if [ "$SKIP_NON_ASCII" -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL "$SKIP_NON_ASCII")"; fi
  if [ "$SKIP_UNK_SUFFIX" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix "$SKIP_UNK_SUFFIX")"; fi
  if [ "$SKIP_UNK_CODE" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code "$SKIP_UNK_CODE")"; fi
  valPrint ctrh "Out of the $LINKS_CHECKED links checked, $EI_LINKS could be $(pluralCheckAn "$EI_LINKS")intrawiki $(pluralCheckNoun link "$EI_LINKS"), $IW_LINKS could be $(pluralCheckAn "$IW_LINKS")interwiki $(pluralCheckNoun link "$IW_LINKS"), $OK_LINKS $(pluralCheckWas "$OK_LINKS") OK, $RD_LINKS $(pluralCheckWas "$RD_LINKS") $(pluralCheckA "$RD_LINKS")redirection $(pluralCheckNoun notice "$RD_LINKS"), and $NG_LINKS $(pluralCheckWas "$NG_LINKS") NG."
  if [ "$SKIP_EXPECT_NG" -gt 0 ]; then
    valPrint ctrh "$SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link "$NG_LINKS") went unlisted due to being found in the exceptions file."
  fi
  if [ "$SKIP_EXPECT_EI" -gt 0 ]; then
    valPrint ctrh "$SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link "$EI_LINKS") went unlisted due to being found in the exceptions file."
  fi
  if [ "$SKIP_EXPECT_IW" -gt 0 ]; then
    # IW links are the potential interwiki links (the potential intrawiki ones are the EI links above)
    valPrint ctrh "$SKIP_EXPECT_IW/$IW_LINKS potential interwiki $(pluralCheckNoun link "$IW_LINKS") went unlisted due to being found in the exceptions file."
  fi
  valPrint trh "ValExtLinks says goodbye."
  printRTFfooter
  printHTMfooter

  # Upload report if this was requested, unless the user canceled the session
  if [ -n "$UPLOAD_INFO" ] && [ "$FINISHED_LIST" != "no" ]; then
    uploadReport
  fi

  # Really quit now
  valPrint c "ValExtLinks says goodbye."
  exit 0
}
trap wrapupAndExit INT
463
464
### INITIALIZATION ###
# Print opening message to console and log files
valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
printTXTheader
printRTFheader
printHTMheader

# Attempt to download file at LINKS_URL, then check that it succeeded. The download counts as failed
# if 'curl' reports an error or if no file was written. The URL is quoted so that characters such as
# '?' and '*' in it are never treated as a glob pattern.
valPrint cwtrh "Downloading list of external links from $LINKS_URL."
LINKS_FILE_NAME=${LINKS_URL##*/} # everything after the last slash
LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
if ! curl --silent -o "$LINKS_FILE" "$LINKS_URL" || [ ! -f "$LINKS_FILE" ]; then
  echo "The download of $LINKS_URL appears to have failed. Aborting."
  wrapupAndExit
fi

# Attempt to download file at EXCEPT_URL, then check that it succeeded in the same way
if [ -n "$EXCEPT_URL" ]; then
  valPrint cwtrh "Downloading list of reporting exceptions from $EXCEPT_URL."
  EXCEPT_FILE_NAME=${EXCEPT_URL##*/}
  EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
  if ! curl --silent -o "$EXCEPT_FILE" "$EXCEPT_URL" || [ ! -f "$EXCEPT_FILE" ]; then
    echo "The download of $EXCEPT_URL appears to have failed. Aborting."
    wrapupAndExit
  fi
fi
493
# Feed LINKS_FILE to 'wc' on stdin, because passing it as an argument would print the file name after
# the line count
LINK_COUNT_STRING=$(wc -l < "$LINKS_FILE")

# Number of URLs is number of lines minus one (first line is column header row for the CSV). The
# arithmetic expansion ignores any padding that 'wc' put around the number.
LINK_COUNT=$((LINK_COUNT_STRING - 1))

# Calculate number of URLs to consider
if [ "$URL_LIMIT" -ne 0 ] && [ "$URL_LIMIT" -lt "$LINK_COUNT" ]; then
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((URL_LIMIT-URL_START+1)) of them, from link $URL_START to $URL_LIMIT."
elif [ "$URL_START" -ne 1 ]; then
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((LINK_COUNT-URL_START+1)) of them, from link $URL_START to $LINK_COUNT."
else
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering all of them."
fi

# Print settings to console and log. The message is kept as an array of words so that the quoted
# elements (indices 10, 22, 26 and 40) can be swapped out to reflect the settings in effect.
declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not report links that are listed in the exceptions file.")
if [ "$TAKE_PAGE_SHOT" -eq 0 ]; then SETTINGS_MSG[10]="but will not"; fi
if [ "$RECORD_OK_LINKS" -eq 0 ]; then SETTINGS_MSG[22]="not"; fi
if [ "$SUGGEST_SNAPSHOTS" -eq 0 ]; then SETTINGS_MSG[26]="will not"; fi
if [ -z "$EXCEPT_URL" ] || [ "$RECORD_OK_LINKS" -eq 1 ]; then SETTINGS_MSG[40]=""; fi
# Join the words with single spaces. This is done in a subshell with its own IFS so that the result
# does not depend on the script-wide newline IFS or on how a given Bash version joins an unquoted
# array in an assignment.
SETTINGS_STR=$(IFS=' '; printf '%s' "${SETTINGS_MSG[*]}")
valPrint ctrh "$SETTINGS_STR"
valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
valPrint trh ""
521
# Write the legend to the three logs. The heading and the two entries that carry a hyperlink need
# markup specific to each log format, so those are written once per log; every other entry is the
# same text in all three logs and is written by the loops below.
valPrint t "Legend:"
valPrint r "\b1 Legend \b0"
valPrint hn "<h3>Legend</h3>"
for legend_entry in \
  "OK = URL seems to be working." \
  "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it, because false negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to Iritscen. An NG link should be followed by a link to the Internet Archive's Wayback Machine which may help you repair the link. If the link cannot be repaired, you can disable it on the wiki (which prevents it from showing up in future ValExtLinks reports) by wrapping it in nowiki tags." \
  "RD = The server responding to this URL is saying that the page moved and you should instead use the supplied new URL. Some RD links represent minor adjustments in the organization of a web site, and some are soft 404s (the file/page has been removed and you are being redirected to something like the main page of the web site). You will have to look at the new URL yourself to determine if it represents an OK link and the link on the wiki should be updated to this one, or if the desired file/page is actually gone and we need to replace the wiki link with an Internet Archive snapshot link -- or disable the URL if it has not been archived." \
  "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup." \
  "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup."; do
  valPrint trh "$legend_entry"
done

# Entries that link to the code reference pages
valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $HTTP_CODES)."
valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$HTTP_CODES\"}}{\fldrslt here}} for code reference)."
valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$HTTP_CODES\" target=\"_blank\">here</a> for code reference)."
valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $CURL_CODES)."
valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$CURL_CODES\"}}{\fldrslt here}} for code reference)."
valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$CURL_CODES\" target=\"_blank\">here</a> for code reference)."

# Remaining entries, followed by a blank line that separates the legend from the link results
for legend_entry in \
  "IA suggests = Last available snapshot suggested by the Internet Archive." \
  "Try browsing = The Archive occasionally fails to return a snapshot URL even when one exists, so you will need to check for a snapshot manually using this link to the Wayback Machine before concluding that a site has not been archived." \
  ""; do
  valPrint trh "$legend_entry"
done
540
541
542### MAIN LOOP ###
543# Process each line of the .csv in LINKS_FILE
544for LINE in `cat "$LINKS_FILE"`; do
545 let LINK_NUM+=1
546
547 # First line is the column header row for the CSV, so let's verify that the format hasn't changed
548 if [ $SKIPPED_HEADER_ROW -eq 0 ]; then
549 if [ $LINE == "namespace,title,target" ]; then
550 SKIPPED_HEADER_ROW=1
551 LINK_NUM=0 # this line is it's not a link, so reset the link counter
552 valPrint hn "<table>"
553 continue
554 else
555 valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
556 wrapupAndExit
557 fi
558 fi
559
560 # Skip this link if we are not at URL_START yet
561 if [ $LINK_NUM -lt $URL_START ]; then
562 continue
563 fi
564
565 # Stop if we are at the limit declared for testing purposes
566 if [ $URL_LIMIT -gt 0 ] && [ $LINK_NUM -gt $URL_LIMIT ]; then
567 FINISHED_LIST="limit"
568 wrapupAndExit
569 fi
570
571 # Print progress to screen
572 if [ $LINK_NUM -gt 1 ]; then
573 printf "\e[1A\n" # erase previous progress message so that new one appears in its place
574 fi
575 valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
576
577 # The number of the namespace is the element before the first comma on the line
578 NS_ID=${LINE%%,*}
579
580 # Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
581 NS_NAME=""
582 a=0
583 while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
584 if [ $NS_ID -eq ${NS_IDS[$a]} ]; then
585 NS_NAME="${NS_NAMES[$a]}"
586 break
587 fi
588 let a+=1
589 done
590 if [ -z "$NS_NAME" ]; then
591 valPrint tr "Skipping URL found on page $PAGE_NAME because I could not find a name for namespace ID $NS_ID."
592 let SKIP_UNK_NS+=1
593 continue
594 fi
595
596 # The name of the page is everything between the namespace ID and the next comma on the line (commas
597 # in page names will break this)
598 PAGE_NAME=${LINE#$NS_ID,}
599 PAGE_NAME=${PAGE_NAME%%,*}
600
601 # We don't want to consider wiki pages ending in .js, as the parser cannot reliably isolate URLS in
602 # JavaScript code, so it will return erroneous links
603 PAGE_NAME_SUFFIX=$(echo $PAGE_NAME | sed 's/.*\.//')
604 if [ $PAGE_NAME_SUFFIX == "js" ]; then
605 valPrint tr "Skipping URL found on JavaScript page $PAGE_NAME."
606 let SKIP_JS_PAGE+=1
607 continue
608 fi
609
610 # Build longer wiki page URLs from namespace and page names
611 FULL_PAGE_PATH=http://$WIKI_PATH/$NS_NAME:$PAGE_NAME
612 LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
613 # Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it
614 # explicitly breaks the link
615 if [ $NS_ID -eq 0 ]; then
616 FULL_PAGE_PATH=http://$WIKI_PATH/$PAGE_NAME
617 LOCAL_PAGE_PATH=$PAGE_NAME
618 fi
619
620 # The URL being linked to is everything after the previous two fields (this allows commas to be in
621 # the URLs, but a comma in the previous field, the page name, will break this)
622 URL=${LINE#$NS_ID,$PAGE_NAME,}
623
624 # Scan for illegal characters
625 if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
626 valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because it contains characters illegal in a URL."
627 let SKIP_BAD_URL+=1
628 continue
629 fi
630
631 # Now we need to know if the URL is for a file or a web page. First step is to determine if the
632 # URL ends in a suffix
633 HAS_SUFFIX=0
634
635 # If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
636 CLEAN_URL=${URL%%\?*}
637
638 # If the URL ends in something like "#section_15", strip everything from the '#' onward
639 CLEAN_URL=${CLEAN_URL%%\#*}
640
641 # 'sed' cannot handle Unicode in my Bash shell, so skip this URL and make user check it
642 if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
643 valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I cannot handle non-ASCII characters."
644 let SKIP_NON_ASCII+=1
645 continue
646 fi
647
648 # Isolate the characters after the last period and after the last slash
649 POST_DOT=$(echo "$CLEAN_URL" | sed 's/.*\.//')
650 POST_SLASH=$(echo "$CLEAN_URL" | sed 's/.*\///')
651
652 # If the last period comes after the last slash, then the URL ends in a suffix
653 POST_DOT_LENGTH=$(echo | awk -v input=$POST_DOT '{print length(input)}')
654 POST_SLASH_LENGTH=$(echo | awk -v input=$POST_SLASH '{print length(input)}')
655 if [ $POST_DOT_LENGTH -lt $POST_SLASH_LENGTH ]; then
656 HAS_SUFFIX=1
657 else
658 HAS_SUFFIX=0
659 fi
660
# Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
# known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES. IS_FILE ends up as 1 (file),
# 0 (page) or -1 (suffix not recognized).
IS_FILE=-1
if [ "$HAS_SUFFIX" -eq 0 ]; then
  IS_FILE=0
else
  # Turn off case sensitivity while we compare suffixes
  shopt -s nocasematch

  # Special case: URLs ending in something like "/productID.304297400" are pages, not files, so if
  # the URL's suffix is all numbers, we are looking at the end of a web page URL
  if [[ "$POST_DOT" =~ ^-?[0-9]+$ ]]; then
    IS_FILE=0
  fi

  # If we did not identify this URL as a web page above, we need to compare the suffix against known
  # file extensions. The right-hand side is quoted so that the suffix is compared as a literal string;
  # unquoted, a suffix containing '*', '?' or '[' would act as a glob pattern and could match any
  # extension. Double brackets still respect the nocasematch setting.
  if [ "$IS_FILE" -eq -1 ]; then
    for EXTENSION in "${HTTP_FILES[@]}"; do
      if [[ "$EXTENSION" == "$POST_DOT" ]]; then
        IS_FILE=1
        break
      fi
    done
  fi

  # If we did not identify this URL as a file above, we need to compare the suffix against known
  # pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
  # needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
  if [ "$IS_FILE" -eq -1 ]; then
    for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
      if [[ "$PAGE_OR_TLD" == "$POST_DOT" ]]; then
        IS_FILE=0
        break
      fi
    done
  fi

  # Turn case sensitivity back on in Bash
  shopt -u nocasematch
fi
702
# Dispatch on the classification made above: an unrecognized suffix (-1) is reported to the user
# and the link is skipped; otherwise the link is labelled and tallied as a file (1) or a page (0)
STR_TYPE=""
case "$IS_FILE" in
  -1)
    valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown URL ending $POST_DOT. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
    SKIP_UNK_SUFFIX=$((SKIP_UNK_SUFFIX + 1))
    continue
    ;;
  1)
    STR_TYPE="file"
    FILE_LINKS=$((FILE_LINKS + 1))
    ;;
  0)
    STR_TYPE="page"
    PAGE_LINKS=$((PAGE_LINKS + 1))
    ;;
esac
716
# Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
# issue with sites that require HTTPS. Only the headers are requested (--head) and the body is
# discarded, so the command's output is just the three-digit HTTP code from --write-out.
# AGENT must be in double quotes so that its value is sent as the user agent (single quotes would
# send the literal text "$AGENT"), and URL is quoted so that characters such as '?' or '*' in it
# are never treated as a filename pattern.
CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{http_code}\n' "$URL")
CURL_ERR=$?
CURL_RESULT=$CURL_CODE

# Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
if [ "$CURL_CODE" == "000" ]; then
  CURL_RESULT="$CURL_RESULT-$CURL_ERR"
fi
727
# Begin to determine our status code for this URL (EI, IW, OK, RD, or NG). "??" means "not yet
# decided". Because "??" is also a valid filename pattern, STATUS is only ever tested inside
# [[ ]] (or quoted); an unquoted $STATUS inside [ ] would be expanded to any two-character
# filenames present in the working directory.
STATUS="??"
NEW_URL=""
INTERWIKI_INDEX=-1

# First make sure that this isn't an "external internal" link to our own wiki that can be replaced
# by "[[page_name]]". If it uses a special access URL beginning with "/w/", let it pass, as it
# probably cannot be replaced by "[[ ]]" markup. The variable parts of the patterns are quoted so
# that they are matched literally.
if [[ $URL == *"$WIKI_PATH"* && $URL != *"$WIKI_PATH"/w/* ]]; then
  STATUS="EI"
  let EI_LINKS+=1
fi

# If it's not, check if this is a link to a domain that we have an interwiki prefix for; the index
# of the matching domain is remembered in INTERWIKI_INDEX
if [[ $STATUS == "??" ]]; then
  for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
    if [[ $URL == *"${INTERWIKI_DOMAINS[$i]}"* && $URL != *"${INTERWIKI_DOMAINS[$i]}"/w/* ]]; then
      STATUS="IW"
      let IW_LINKS+=1
      INTERWIKI_INDEX=$i
      break
    fi
  done
fi
752
# If we didn't match an interwiki domain, see if the status code is in our "OK" codes list.
# STATUS is tested inside [[ ]] because its placeholder value "??" would be glob-expanded against
# the working directory if left unquoted inside [ ].
if [[ $STATUS == "??" ]]; then
  for CODE in "${OK_CODES[@]}"; do
    if [[ "$CODE" == "$CURL_CODE" ]]; then
      STATUS="OK"
      let OK_LINKS+=1
      break
    fi
  done
fi
763
# If we didn't get a match with the "OK" codes, check it against the "RD" codes. STATUS is tested
# inside [[ ]] because its placeholder value "??" would be glob-expanded if left unquoted in [ ].
if [[ $STATUS == "??" ]]; then
  for CODE in "${RD_CODES[@]}"; do
    if [[ "$CODE" == "$CURL_CODE" ]]; then
      # Get URL header again in order to retrieve the URL we are being redirected to. AGENT is
      # double-quoted so its value (not the literal text "$AGENT") is sent, and URL is quoted so
      # that it is never treated as a filename pattern.
      NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{redirect_url}\n' "$URL")

      # Filter out cases where the redirect URL is just the original URL with https:// instead of
      # http://, or with an added '/' at the end. These corrections happen a lot and are not
      # important to us. Note that only a leading "http://" is removed from the original URL and
      # only a leading "https://" from the redirect URL.
      URL_NO_PROTOCOL=${URL#http://}
      URL_NO_PROTOCOL=${URL_NO_PROTOCOL%/}
      NEW_URL_NO_PROTOCOL=${NEW_URL#https://}
      NEW_URL_NO_PROTOCOL=${NEW_URL_NO_PROTOCOL%/}

      # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
      NEW_URL_LENGTH=${#NEW_URL_NO_PROTOCOL}
      if [ "$NEW_URL_LENGTH" -lt "$MIN_URL_LENGTH" ]; then
        NEW_URL_NO_PROTOCOL="[new URL not retrieved]"
      fi

      # If the URLs match after the above filters were applied, then the link is OK. Both sides
      # are quoted: the placeholder text above looks like a bracket glob and URLs may contain
      # pattern characters, so neither may be left open to filename expansion.
      if [ "$URL_NO_PROTOCOL" == "$NEW_URL_NO_PROTOCOL" ]; then
        STATUS="OK"
        let OK_LINKS+=1
      else
        STATUS="RD"
        let RD_LINKS+=1
      fi
      break
    fi
  done
fi
797
# If we didn't get a match with the "RD" codes, check it against the "NG" codes. STATUS is tested
# inside [[ ]] because its placeholder value "??" would be glob-expanded if left unquoted in [ ].
if [[ $STATUS == "??" ]]; then
  for CODE in "${NG_CODES[@]}"; do
    if [[ "$CODE" == "$CURL_CODE" ]]; then
      STATUS="NG"
      let NG_LINKS+=1
      break
    fi
  done
fi

# If we didn't match a known status code, advise the reader and move on to the next link
if [[ $STATUS == "??" ]]; then
  valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown return code $CURL_CODE."
  let SKIP_UNK_CODE+=1
  continue
fi
815
# Check problem links against exceptions file before proceeding. This is only done when an
# exceptions URL was supplied (EXCEPT_URL is non-empty).
if [[ $STATUS != "OK" && -n "$EXCEPT_URL" ]]; then
  # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
  EXPECT_CODE="$CURL_RESULT"
  if [[ $STATUS == "EI" || $STATUS == "IW" ]]; then
    EXPECT_CODE="$STATUS"
  fi

  # Look for link in exceptions file and make sure its listed result code and wiki page also match.
  # -F makes 'grep' search for the URL as a fixed string: as a regular expression, the '?' and '.'
  # found in many URLs would change meaning, so that e.g. a URL with a query string could never
  # match its own entry. "--" protects against a URL being mistaken for an option.
  GREP_RESULT=$(grep -F --max-count=1 -- "$URL" "$EXCEPT_FILE")

  # The matched line is read as comma-separated, with the result code before the first comma and
  # the containing page (or the '*' wildcard) after the last comma
  EXCEPT_PAGE=${GREP_RESULT##*,}
  if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == "$LOCAL_PAGE_PATH" ]; then
    EXCEPT_CODE=${GREP_RESULT%%,*}
    if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
      valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because its expected result, $EXPECT_CODE, is listed in the exceptions file."
      if [[ $STATUS == "EI" ]]; then
        let SKIP_EXPECT_EI+=1
      elif [[ $STATUS == "IW" ]]; then
        let SKIP_EXPECT_IW+=1
      else
        let SKIP_EXPECT_NG+=1
      fi
      continue
    fi
  fi
fi
844
# If appropriate, record this link to the log, with clickable URLs when possible. Links that are
# not "OK" are always recorded; "OK" links only when RECORD_OK_LINKS is 1.
if [[ $STATUS != "OK" ]] || [ "$RECORD_OK_LINKS" -eq 1 ]; then
  # Stupid hack since the strings "IW" and "EI" are narrower than "OK", "RD", or "NG" and it takes
  # an extra tab to get to the desired level of indentation in the RTF log. The values are written
  # as explicit tab escapes so that the two cases really differ.
  # NOTE(review): one tab vs. two tabs is inferred from the "extra tab" wording -- confirm against
  # the rendered RTF log.
  RTF_TABS=$'\t'
  if [[ $STATUS == "IW" || $STATUS == "EI" ]]; then
    RTF_TABS=$'\t\t'
  fi

  # Record link and its wiki page in TXT, RTF, and HTML markup
  valPrint t "$STATUS ($CURL_RESULT) $STR_TYPE $URL"
  valPrint t " linked from $FULL_PAGE_PATH"
  valPrint r "$STATUS ($CURL_RESULT)$RTF_TABS$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
  valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
  valPrint hn "<tr><td style=\"white-space:nowrap\">$STATUS ($CURL_RESULT)</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
  valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"

  # Record redirect URL if one was given by a 3xx response page
  if [[ $STATUS == "RD" ]]; then
    valPrint t " Server suggests $NEW_URL"
    valPrint r " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
    valPrint hn "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
  fi

  # Get everything after domain name in URL for use in EI and IW listings (this strips the
  # shortest leading "<protocol>://<domain>/"; a URL without a path is left unchanged)
  POST_DOMAIN=${URL#*://*/}

  # Notify reader if we can use an intrawiki link for this URL
  if [[ $STATUS == "EI" ]]; then
    valPrint t " Just use [[$POST_DOMAIN]]"
    valPrint r " Just use [[$POST_DOMAIN]]"
    valPrint hn "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$POST_DOMAIN]]</td></tr>"
  fi

  # Notify reader if we can use an interwiki prefix for this URL
  if [[ $STATUS == "IW" ]]; then
    valPrint t " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_DOMAIN]]"
    valPrint r " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_DOMAIN]]"
    valPrint hn "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_DOMAIN]]</td></tr>"
  fi

  # Query Internet Archive for latest "OK" snapshot for "NG" page
  if [[ $STATUS == "NG" ]] && [ "$SUGGEST_SNAPSHOTS" -eq 1 ]; then
    ARCHIVE_QUERY=$(curl --silent --max-time 10 "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")

    # Isolate "url" property in response and log it if a "closest" snapshot was received...
    if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
      SNAPSHOT_URL=${ARCHIVE_QUERY##*\"url\": \"}
      SNAPSHOT_URL=${SNAPSHOT_URL%\", \"timestamp*}
      valPrint t " IA suggests $SNAPSHOT_URL"
      valPrint r " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
      valPrint hn "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
    else # ...otherwise give generic Wayback Machine link for this URL
      valPrint t " Try browsing $ARCHIVE_GENERIC/$URL"
      valPrint r " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
      valPrint hn "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
    fi
  fi
fi
904
# If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
if [ "$IS_FILE" -eq 0 ] && [ "$TAKE_PAGE_SHOT" -eq 1 ] && [[ $STATUS == "OK" ]]; then
  # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
  SHOT_NAME=$(echo "$URL" | sed 's/https*\:\/\///' | sed 'y/:\//__/')
  SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"

  # Don't take screenshot if we already encountered this page and screenshotted it
  if [ ! -f "$SHOT_FILE" ]; then
    # URL is quoted so that '?', '*' or '[' in it cannot be expanded as a filename pattern before
    # Chrome receives it. Chrome's own output is discarded; success is judged by whether the
    # file $WORKING_DIR/$CHROME_SCREENSHOT exists afterwards.
    "$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 "$URL" > /dev/null 2>&1
    if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
      # -n: never overwrite an existing screenshot file
      mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
    else
      valPrint trh "Screenshot of URL $URL seems to have failed!"
    fi
  else
    valPrint trh "Skipping screenshot of URL $URL because $SHOT_FILE already exists."
  fi
fi
923done
# Every link has been processed: flag the list as finished, then hand over to the wrap-up routine
# (NOTE(review): wrapupAndExit presumably reads FINISHED_LIST -- confirm at its definition)
FINISHED_LIST='yes'
wrapupAndExit
Note: See TracBrowser for help on using the repository browser.