source: Validate External Links/validate_external_links.sh@ 1065

Last change on this file since 1065 was 1064, checked in by iritscen, 7 years ago

Committing my wiki link validation script, as it is reasonably mature now.

File size: 33.2 KB
[1064]1#!/bin/bash
2
3# Validate External Links by Iritscen
4# Provided with a list of external links found in the OniGalore wiki, this script validates them.
5# The resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF
6# (for reading as a local file with clickable links), and HTML (for uploading as a web page).
7# Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
8# Recommended rule:
9# ------------------------------------------------------------------------------------------------------
10
11# Set separator token to newline
12IFS="
13"
14
15### GLOBALS ###
16# Settings -- these will be changed from their defaults by the arguments passed in to the script
17LINKS_URL="" # use 'curl' to download file with links from this location (can be file://)
18EXCEPT_URL="" # ditto above for file with exceptions to NG results
19OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
20RECORD_OK_LINKS=0 # record response code to the log whether it's a value in OK_CODES or NG_CODES
21SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
22TAKE_PAGE_SHOT=0 # take a screenshot of each OK page
23URL_START=1 # start at this URL in LINKS_FILE (1 by default)
24URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE
25UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report
26
27# Fixed strings -- see the occurrences of these variables to learn their purpose
28AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3146.0 Safari/537.36"
29ARCHIVE_API="http://archive.org/wayback/available"
30ARCHIVE_GENERIC="https://web.archive.org/web/*"
31ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
32CHROME="/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary"
33CHROME_SCREENSHOT="screenshot.png"
34CURL_CODES="http://iritscen.oni2.net/wiki/curl_codes.txt"
35EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
36HTTP_CODES="http://iritscen.oni2.net/wiki/http_codes.txt"
37MY_WIKI_PAGE="http://wiki.oni2.net/User:Iritscen"
38THIS_DIR=$(cd $(dirname $0); pwd)
39WORKING_DIR=$(pwd)
40WIKI_PATH="wiki.oni2.net"
41
42# These are parallel arrays of the IDs and names of OniGalore's current namespaces
43declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
44declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")
45
46# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
47# This determines whether the script tries to take a screenshot of the page or just gets its HTTP code.
48declare -a HTTP_FILES=(txt zip wmv jpg png m4a bsl rar oni mp3 mov ONWC vbs TRMA mp4 doc avi log gif pdf dmg exe cpp tga 7z wav east BINA xml dll dae xaf fbx 3ds blend flv csv)
49declare -a HTTP_TLDS_AND_PAGES=(com net org uk ru de it htm html php pl asp aspx shtml pgi cgi php3 x jsp phtml cfm css action stm js)
50
51# These arrays tell us which HTTP response codes are OK (good) and which are NG (no good). Pages that
52# return NG codes will not be screenshotted. Remember to update http_codes.txt if you add a new code.
53declare -a OK_CODES=(200 301 302 307 401 405 406 501)
54declare -a NG_CODES=(000 403 404 410 500 503)
55
56# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
57# transcluded text, and if the transclusion fails, then the braces show up in the URL
58ILLEGAL_CHARS="{ }"
59
60# These are parallel arrays giving the prefixes that can be used in place of normal external links to
61# some wikis and other sites
62declare -a INTERWIKI_PREFIXES=(metawikipedia wikipedia wikiquote wiktionary)
63declare -a INTERWIKI_DOMAINS=(meta.wikipedia.org wikipedia.org wikiquote.org wiktionary.org)
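# Example, for illustration (assumed URL, not from the original script): a link to
# https://en.wikipedia.org/wiki/Oni matches the "wikipedia.org" entry above, so the
# report will suggest replacing it with the interwiki markup [[wikipedia:Oni]].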
64
65# Variables for keeping track of main loop progress and findings
66LINK_NUM=0
67OK_LINKS=0
68NG_LINKS=0
69SKIP_UNK_NS=0
70SKIP_JS_PAGE=0
71SKIP_BAD_URL=0
72SKIP_NON_ASCII=0
73SKIP_UNK_SUFFIX=0
74SKIP_UNK_CODE=0
75SKIP_EXCEPT=0
76FILE_LINKS=0
77PAGE_LINKS=0
78SKIPPED_HEADER_ROW=0
79FINISHED_LIST="no"
80
81
82### HELP ###
83# A pseudo-man page. Here is the 80-character rule for the page text:
84# 234567890123456789012345678901234567890123456789012345678901234567890123456789
85function printHelp()
86{
87 cat << EOF
88
89NAME
90 Validate External Links
91
92SYNOPSIS
93 validate_external_links.sh --help
94 validate_external_links.sh --links URL --output PATH [--exceptions FILE]
95 [--record-ok-links] [--suggest-snapshots] [--take-screenshots]
96 [--start-url NUM] [--end-url NUM] [--upload PATH]
97
98DESCRIPTION
99 This script parses a list of external links found in the OniGalore wiki
100 (which is dumped periodically on the Oni2.net domain in a particular
101 format), validates them using the Unix tool 'curl', and produces a report
102 of which links were OK (responded to an HTTP query) and which were NG (no
103 good). This report can then be automatically uploaded to the location of
104 your choice. The script can also suggest Internet Archive snapshots for
105 NG links, and take screenshots of OK links for visual verification by the
106 reader that the page in question is the one intended to be displayed.
107
108 You must pass this script the URL at which the list of links is found
109 (--links) and the path where logs should be outputted (--output). All
110 other arguments are optional.
111
112OPTIONS
113 --help Show this page
114 --links URL URL from which to download file with external links
115 (note that this can be a local file if you use the
116 file:// protocol) (required)
117 --output DIR Place the folder which will contain the reports and
118 optional screenshots at this path (required)
119 --exceptions FILE Don't log an NG link if it is listed in the file
120 provided at this path as long as the response code is
121 the same as the one associated with the link
122 --record-ok-links Log a link in the report whether its response code is
123 in the OK_CODES or the NG_CODES array
124 --suggest-snapshots Query the Internet Archive for a possible snapshot
125 URL for each NG page
126 --take-screenshots Save screenshots of each OK page (requires Google
127 Chrome to be found at the path in CHROME)
128 --start-url NUM Start at this link in the links file
129 --end-url NUM Stop at this link in the links file
130 --upload FILE Upload report using info in this local file
131
132BUGS
133 The script cannot properly parse any line in the external links file
134 which contains a comma in the name of the wiki page containing a link.
135 Commas in the link itself are not an issue.
136EOF
137}
138
139
140### SETUP ###
141# If first argument is a help request, or if nothing was passed in at all, print help page and quit
142if [ "$#" -eq 0 ] || [ "$1" == "--help" ]; then
143 printHelp | less
144 exit 0
145fi
146
147# Parse arguments as long as there are more arguments to process
148while (( "$#" )); do
149 case "$1" in
150 --links ) LINKS_URL="$2"; shift 2;;
151 --exceptions ) EXCEPT_URL="$2"; shift 2;;
152 --output ) OUTPUT_DIR="$2"; shift 2;;
153 --record-ok-links ) RECORD_OK_LINKS=1; shift;;
154 --suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;;
155 --take-screenshots ) TAKE_PAGE_SHOT=1; shift;;
156 --start-url ) URL_START=$2; shift 2;;
157 --end-url ) URL_LIMIT=$2; shift 2;;
158 --upload ) UPLOAD_INFO=$2; shift 2;;
159 * ) echo "Invalid argument $1 detected. Aborting."; exit 1;;
160 esac
161done
162
163# If the required arguments were not supplied, print help page and quit
164if [ -z $LINKS_URL ] || [ -z $OUTPUT_DIR ]; then
165 printHelp
166 echo "Error: I did not receive one or both required arguments."
167 exit 2
168fi
169
170# Check that UPLOAD_INFO exists, if this argument was supplied
171if [ ! -z $UPLOAD_INFO ] && [ ! -f "$UPLOAD_INFO" ]; then
172 echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
173 exit 3
174fi
175
176# Check that OUTPUT_DIR is a directory
177if [ ! -d "$OUTPUT_DIR" ]; then
178 echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
179 exit 4
180fi
181
182# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
183SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
184NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
185OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
186OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
187SHOT_PATH="$OUTPUT_PATH/Screenshots"
188LOG_NAME="ValExtLinks report"
189LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt"
190LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf"
191LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm"
192mkdir "$OUTPUT_PATH"
193if [ $TAKE_PAGE_SHOT -eq 1 ]; then
194 mkdir "$SHOT_PATH"
195fi
196
197# Check that 'mkdir' succeeded
198if [ ! -d "$OUTPUT_PATH" ]; then
199 echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_PATH. Aborting."
200 exit 5
201fi
202
203# Get date on the file at LINKS_URL and print to log
204LINKS_DATE=$(curl --silent --head $LINKS_URL | grep "Last-Modified")
205if [ -z "$LINKS_DATE" ]; then
206 echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
207 exit 6
208fi
209LINKS_DATE=${LINKS_DATE#Last-Modified: }
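# Illustrative example (not from the original script): the header line typically looks
# like "Last-Modified: Sat, 01 Jul 2017 12:34:56 GMT", so after stripping the prefix,
# LINKS_DATE holds just the date string, in the same format as NICE_TIME above.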
210
211
212### UTILITY FUNCTIONS ###
213# Writes a plain-text header to TXT log file
214function printTXTheader()
215{
216 valPrint t "Validate External Links report"
217 valPrint t "generated $NICE_TIME"
218 valPrint t "from data of $LINKS_DATE"
219 valPrint t "script by Iritscen (contact: $MY_WIKI_PAGE)"
220 valPrint t ""
221}
222
223# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified
224function printRTFheader()
225{
226 valPrint r "{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
227{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
228{\colortbl;\red255\green255\blue255;}
229{\*\expandedcolortbl;;}
230\margl1440\margr1440\vieww12600\viewh12100\viewkind0
231\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0
232
233\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
234generated $NICE_TIME\\
235from data of $LINKS_DATE\\
236script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$MY_WIKI_PAGE\"}}{\fldrslt contact}})
237\\
238\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
239\cf0 "
240}
241
242# Closes the RTF markup of the RTF log file
243function printRTFfooter()
244{
245 valPrint r "}"
246}
247
248# Writes the HTML header to HTML log file
249function printHTMheader()
250{
251 valPrint h "<html>
252<head>
253<title>Validate External Links report</title>
254</head>
255<body>
256<h2>Validate External Links report</h2>
257<h3>generated $NICE_TIME<br />
258from data of $LINKS_DATE<br />
259script by Iritscen (<a href=\"$MY_WIKI_PAGE\" target=\"_blank\">contact</a>)</h3>"
260}
261
262# Closes the HTML markup of the HTML log file
263function printHTMfooter()
264{
265 valPrint h "</body>
266</html>"
267}
268
269# The central logging function. The first parameter is a string composed of one or more characters that
270# indicates which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
271# 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 'w' means "Don't
272# pass console output through 'fmt'" ("fmt" fits the output to an 80-column CLI but can break special
273# formatting and the 'n' option).
274function valPrint()
275{
276 if [[ "$1" == *c* ]]; then
277 if [[ "$1" == *n* ]]; then
278 echo -n "$2"
279 elif [[ "$1" == *w* ]]; then
280 echo "$2"
281 else
282 echo "$2" | fmt -w 80
283 fi
284 fi
285 if [[ "$1" == *t* ]]; then
286 if [[ "$1" == *n* ]]; then
287 echo -n "$2" >> "$LOG_TXT"
288 else
289 echo "$2" >> "$LOG_TXT"
290 fi
291 fi
292 if [[ "$1" == *r* ]]; then
293 if [[ "$1" == *n* ]]; then
294 echo "$2" >> "$LOG_RTF"
295 else
296 echo "$2\\" >> "$LOG_RTF"
297 fi
298 fi
299 if [[ "$1" == *h* ]]; then
300 if [[ "$1" == *n* ]]; then
301 echo "$2" >> "$LOG_HTM"
302 else
303 echo "$2<br />" >> "$LOG_HTM"
304 fi
305 fi
306}
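# Illustrative usage of the flag string, taken from calls made later in this script
# (the trailing comments on each line are the editor's, not part of the original code):
#   valPrint ctrh "Downloading list of external links from $LINKS_URL." # console + all three logs
#   valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."               # console only, no trailing newline
#   valPrint hn "<table>"                                               # HTML log only, no <br /> appended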
307
308# Pluralize the string in parameter 1 if the number in parameter 2 is not 1
309function pluralCheckNoun()
310{
311 if [ $2 -ne 1 ]; then
312 if [[ $1 =~ x$ ]]; then
313 echo $1es
314 else
315 echo $1s
316 fi
317 else
318 echo $1
319 fi
320}
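# Illustrative examples (not part of the original script):
#   pluralCheckNoun link 3      -> "links"
#   pluralCheckNoun namespace 1 -> "namespace"
#   pluralCheckNoun suffix 2    -> "suffixes" (nouns ending in "x" get "es")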
321
322# Output "was" if parameter 1 is 1, otherwise "were"
323function pluralCheckWas()
324{
325 if [ $1 -ne 1 ]; then
326 echo "were"
327 else
328 echo "was"
329 fi
330}
331
332# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
333# reports being saved to disk have already been closed.
334function uploadReport()
335{
336 valPrint c "Uploading HTML report..."
337
338 SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
339 SFTP_USER_NAME_MARKER="user:"
340 SFTP_PASSWORD_MARKER="pw:"
341 SFTP_PORT_MARKER="port:"
342 SFTP_PATH_MARKER="path:"
343 SFTP_USER_NAME=$(grep $SFTP_USER_NAME_MARKER $UPLOAD_INFO)
344 SFTP_USER_NAME=${SFTP_USER_NAME#$SFTP_USER_NAME_MARKER}
345 SFTP_PASSWORD=$(grep $SFTP_PASSWORD_MARKER $UPLOAD_INFO)
346 SFTP_PASSWORD=${SFTP_PASSWORD#$SFTP_PASSWORD_MARKER}
347 SFTP_PORT=$(grep $SFTP_PORT_MARKER $UPLOAD_INFO)
348 SFTP_PORT=${SFTP_PORT#$SFTP_PORT_MARKER}
349 SFTP_PATH=$(grep $SFTP_PATH_MARKER $UPLOAD_INFO)
350 SFTP_PATH=${SFTP_PATH#$SFTP_PATH_MARKER}
351
352 expect "$SCRIPT_PATH" "$LOG_HTM" $SFTP_USER_NAME $SFTP_PASSWORD $SFTP_PORT $SFTP_PATH "$LOG_NAME.htm"
353
354 valPrint c "Report was uploaded, unless an error message appears above."
355}
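# Sketch of the expected layout of the --upload file, inferred from the marker strings
# grepped above (these values are placeholders, not real credentials):
#   user:myname
#   pw:mypassword
#   port:22
#   path:/path/on/server/to/report/folder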
356
357# Prints session summary when script is done
358function wrapupAndExit()
359{
360 # Get off progress line on console, drop down a line from last link in log, and close HTML table
361 valPrint ctr ""
362 valPrint h "</table><br />"
363
364 # If we didn't finish processing the last URL, then the iterator is one too high
365 if [ $FINISHED_LIST != "yes" ]; then
366 let LINK_NUM-=1
367 if [ $FINISHED_LIST == "no" ]; then
368 valPrint ctrh "The session was canceled by the user."
369 fi
370 fi
371
372 # Output results of session and close the log file's markup
373 LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
374 LINKS_SKIPPED=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
375 LINKS_CHECKED=$((LINKS_PROCESSED-LINKS_SKIPPED))
376 valPrint ct "Summary:"
377 valPrint r "\b1 Summary \b0"
378 valPrint hn "<h3><span id=\"summary\">Summary</span></h3>"
379 valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT)."
380 valPrint ctrh "I skipped $LINKS_SKIPPED $(pluralCheckNoun link $LINKS_SKIPPED), and found $FILE_LINKS $(pluralCheckNoun file $FILE_LINKS) and $PAGE_LINKS $(pluralCheckNoun page $PAGE_LINKS)."
381 if [ $LINKS_SKIPPED -gt 0 ]; then valPrint ctrh "Skip breakdown: "; fi
382 if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
383 if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE links on JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
384 if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
385 if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
386 if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
387 if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
388 valPrint ctrh "Out of the $LINKS_CHECKED links checked, $OK_LINKS $(pluralCheckWas $OK_LINKS) OK and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG."
389 if [ $SKIP_EXCEPT -gt 0 ]; then
390 valPrint ctrh "$SKIP_EXCEPT/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS) went unlisted due to being found in the exceptions file."
391 fi
392 printRTFfooter
393 printHTMfooter
394
395 # Upload report if this was requested
396 if [ ! -z $UPLOAD_INFO ] && [ $FINISHED_LIST != "no" ]; then
397 uploadReport
398 fi
399
400 # Really quit now
401 valPrint c "ValExtLinks says goodbye."
402 exit 0
403}
404trap wrapupAndExit INT
405
406
407### INITIALIZATION ###
408# Print opening message to console and log files
409valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
410printTXTheader
411printRTFheader
412printHTMheader
413
414# Attempt to download file at LINKS_URL, then check that it succeeded
415valPrint ctrh "Downloading list of external links from $LINKS_URL."
416LINKS_FILE_NAME=$(echo "$LINKS_URL" | sed 's/.*\///')
417LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
418curl --silent -o "$LINKS_FILE" $LINKS_URL
419if [ ! -f "$LINKS_FILE" ]; then
420 echo "The download of $LINKS_URL appears to have failed. Aborting."
421 wrapupAndExit
422fi
423
424# Attempt to download file at EXCEPT_URL, then check that it succeeded
425if [ ! -z $EXCEPT_URL ]; then
426 valPrint ctrh "Downloading list of NG exceptions from $EXCEPT_URL."
427 EXCEPT_FILE_NAME=$(echo "$EXCEPT_URL" | sed 's/.*\///')
428 EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
429 curl --silent -o "$EXCEPT_FILE" $EXCEPT_URL
430 if [ ! -f "$EXCEPT_FILE" ]; then
431 echo "The download of $EXCEPT_URL appears to have failed. Aborting."
432 wrapupAndExit
433 fi
434fi
435
436# Pipe 'cat' to 'wc' because passing LINKS_FILE to 'wc' would print the file name after the line count
437LINK_COUNT_STRING=$(cat "$LINKS_FILE" | wc -l)
438
439# Number of URLs is number of lines minus one (first line is column header row for the CSV)
440LINK_COUNT=$(echo "${LINK_COUNT_STRING}" | tr -d '[:space:]')
441let LINK_COUNT-=1
442
443# Calculate number of URLs to consider
444if [ $URL_LIMIT -ne 0 ] && [ $URL_LIMIT -lt $LINK_COUNT ]; then
445 valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((URL_LIMIT-URL_START+1)) of them, from link $URL_START to $URL_LIMIT."
446elif [ $URL_START -ne 1 ]; then
447 valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((LINK_COUNT-URL_START+1)) of them, from link $URL_START to $LINK_COUNT."
448else
449 valPrint ctrh "Found $LINK_COUNT links to process. I will be considering all of them."
450fi
451
452# Print settings to console and log
453declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not print NG links that are listed in the exceptions file.")
454if [ $TAKE_PAGE_SHOT -eq 0 ]; then SETTINGS_MSG[10]="but will not"; fi
455if [ $RECORD_OK_LINKS -eq 0 ]; then SETTINGS_MSG[22]="not"; fi
456if [ $SUGGEST_SNAPSHOTS -eq 0 ]; then SETTINGS_MSG[26]="will not"; fi
457if [ -z $EXCEPT_URL ] || [ $RECORD_OK_LINKS -eq 1 ]; then SETTINGS_MSG[40]=""; fi
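# Note (editor's explanation, not in the original script): SETTINGS_MSG is declared with
# one word per array element, so the numeric indices above address individual words in
# the sentence; e.g. element 10 is the quoted phrase "and will", which the line above
# replaces with "but will not" when screenshots are disabled.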
458SETTINGS_STR=${SETTINGS_MSG[@]}
459valPrint ctrh "$SETTINGS_STR"
460valPrint tr "A summary of my findings will be found at the bottom of the report."
461valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
462valPrint trh ""
463
464# Print legend to logs
465valPrint t "Legend:"
466valPrint r "\b1 Legend \b0"
467valPrint hn "<h3>Legend</h3>"
468valPrint trh "OK = URL seems to be working."
469valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it. False negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to Iritscen."
470valPrint trh "IW = URL is working but should be converted to interwiki link using the suggested markup."
471valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $HTTP_CODES)."
472valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$HTTP_CODES\"}}{\fldrslt here}} for code reference)."
473valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$HTTP_CODES\" target=\"_blank\">here</a> for code reference)."
474valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $CURL_CODES)."
475valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$CURL_CODES\"}}{\fldrslt here}} for code reference)."
476valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$CURL_CODES\" target=\"_blank\">here</a> for code reference)."
477valPrint trh "IA suggests = Last available snapshot suggested by the Internet Archive."
478valPrint trh "Try browsing = The Archive occasionally fails to return a snapshot URL even when one exists, so you will need to check for a snapshot manually using the Wayback Machine before concluding that a site has not been archived."
479valPrint trh ""
480
481
482### MAIN LOOP ###
483# Process each line of the .csv in LINKS_FILE
484for LINE in `cat "$LINKS_FILE"`; do
485 let LINK_NUM+=1
486
487 # First line is the column header row for the CSV, so let's verify that the format hasn't changed
488 if [ $SKIPPED_HEADER_ROW -eq 0 ]; then
489 if [ $LINE == "namespace,title,target" ]; then
490 SKIPPED_HEADER_ROW=1
491 LINK_NUM=0 # this line is not a link, so reset the link counter
492 valPrint hn "<table>"
493 continue
494 else
495 valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
496 wrapupAndExit
497 fi
498 fi
499
500 # Skip this link if we are not at URL_START yet
501 if [ $LINK_NUM -lt $URL_START ]; then
502 continue
503 fi
504
505 # Stop if we are at the limit declared for testing purposes
506 if [ $URL_LIMIT -gt 0 ] && [ $LINK_NUM -gt $URL_LIMIT ]; then
507 FINISHED_LIST="limit"
508 wrapupAndExit
509 fi
510
511 # Print progress to screen
512 if [ $LINK_NUM -gt 1 ]; then
513 printf "\e[1A\n" # erase previous progress message so that new one appears in its place
514 fi
515 valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
516
517 # The number of the namespace is the element before the first comma on the line
518 NS_ID=${LINE%%,*}
519
520 # Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
521 NS_NAME=""
522 a=0
523 while [ "x${NS_IDS[$a]}" != "x" ] # once this evaluates to "x", the array is done
524 do
525 if [ $NS_ID -eq ${NS_IDS[$a]} ]; then
526 NS_NAME="${NS_NAMES[$a]}"
527 break
528 fi
529 let a+=1
530 done
531 if [ -z "$NS_NAME" ]; then
532 valPrint tr "Skipping URL found on page $PAGE_NAME because I could not find a name for namespace ID $NS_ID."
533 let SKIP_UNK_NS+=1
534 continue
535 fi
536
537 # The name of the page is everything between the namespace ID and the next comma on the line (commas
538 # in page names will break this)
539 PAGE_NAME=${LINE#$NS_ID,}
540 PAGE_NAME=${PAGE_NAME%%,*}
541
542 # We don't want to consider wiki pages ending in .js, as the parser cannot reliably isolate URLs in
543 # JavaScript code, so it will return erroneous links
544 PAGE_NAME_SUFFIX=$(echo $PAGE_NAME | sed 's/.*\.//')
545 if [ $PAGE_NAME_SUFFIX == "js" ]; then
546 valPrint tr "Skipping URL found on JavaScript page $PAGE_NAME."
547 let SKIP_JS_PAGE+=1
548 continue
549 fi
550
551 # The URL being linked to is everything after the previous two fields (this allows commas to be in
552 # the URLs, but a comma in the previous field, the page name, will break this)
553 URL=${LINE#$NS_ID,$PAGE_NAME,}
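# Illustrative line from the links file, based on the "namespace,title,target" header
# checked above (the values are made up):
#   0,Some_article,http://www.example.com/page.html
# which yields NS_ID=0 (the "Main" namespace), PAGE_NAME="Some_article", and
# URL="http://www.example.com/page.html".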
554
555 # Scan for illegal characters
556 if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
557 valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because it contains characters illegal in a URL."
558 let SKIP_BAD_URL+=1
559 continue
560 fi
561
562 # Now we need to know if the URL is for a file or a web page. First step is to determine if the
563 # URL ends in a suffix
564 HAS_SUFFIX=0
565
566 # If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
567 SAN_URL=${URL%%\?*}
568
569 # If the URL ends in something like "#section_15", strip everything from the '#' onward
570 SAN_URL=${SAN_URL%%\#*}
571
572 # 'sed' cannot handle Unicode in my Bash shell, so skip this URL and make the user check it
573 if [[ $SAN_URL == *[![:ascii:]]* ]]; then
574 valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I cannot handle non-ASCII characters."
575 let SKIP_NON_ASCII+=1
576 continue
577 fi
578
579 # Isolate the characters after the last period and after the last slash
580 POST_DOT=$(echo "$SAN_URL" | sed 's/.*\.//')
581 POST_SLASH=$(echo "$SAN_URL" | sed 's/.*\///')
582
583 # If the last period comes after the last slash, then the URL ends in a suffix
584 POST_DOT_LENGTH=$(echo | awk -v input=$POST_DOT '{print length(input)}')
585 POST_SLASH_LENGTH=$(echo | awk -v input=$POST_SLASH '{print length(input)}')
586 if [ $POST_DOT_LENGTH -lt $POST_SLASH_LENGTH ]; then
587 HAS_SUFFIX=1
588 else
589 HAS_SUFFIX=0
590 fi
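# Worked example (editor's illustration): for "http://example.com/files/setup.exe",
# POST_DOT="exe" (3 chars) is shorter than POST_SLASH="setup.exe" (9 chars), so
# HAS_SUFFIX=1; for "http://example.com/wiki/Main_Page", POST_DOT="com/wiki/Main_Page"
# is longer than POST_SLASH="Main_Page", so HAS_SUFFIX=0.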
591
592 # Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
593 # known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES
594 IS_FILE=-1
595 if [ $HAS_SUFFIX -eq 0 ]; then
596 IS_FILE=0
597 else
598 # Turn off case sensitivity while we compare suffixes
599 shopt -s nocasematch
600
601 # Special case: URLs ending in something like "/productID.304297400" are pages, not files, so if
602 # the URL's suffix is all numbers, we are looking at the end of a web page URL
603 if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
604 IS_FILE=0
605 fi
606
607 # If we did not identify this URL as a web page above, we need to compare the suffix against known
608 # file extensions
609 if [ $IS_FILE -eq -1 ]; then
610 for EXTENSION in "${HTTP_FILES[@]}"; do
611 if [[ $EXTENSION == $POST_DOT ]]; then # double brackets respect the nocasematch setting
612 IS_FILE=1
613 break
614 fi
615 done
616 fi
617
618 # If we did not identify this URL as a file above, we need to compare the suffix against known
619 # pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
620 # needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
621 if [ $IS_FILE -eq -1 ]; then
622 for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
623 if [[ $PAGE_OR_TLD == $POST_DOT ]]; then
624 IS_FILE=0
625 break
626 fi
627 done
628 fi
629
630 # Turn case sensitivity back on in Bash
631 shopt -u nocasematch
632 fi
633
634 # If this suffix escaped identification as either a file, page or TLD, inform the user
635 STR_TYPE=""
636 if [ $IS_FILE -eq -1 ]; then
637 valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown URL ending $POST_DOT. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
638 let SKIP_UNK_SUFFIX+=1
639 continue
640 elif [ $IS_FILE -eq 1 ]; then
641 STR_TYPE="file"
642 let FILE_LINKS+=1
643 elif [ $IS_FILE -eq 0 ]; then
644 STR_TYPE="page"
645 let PAGE_LINKS+=1
646 fi
647
648 # Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
649 # issue with sites that require HTTPS
650 CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{http_code}\n' $URL)
651 CURL_ERR=$?
652 CURL_RESULT=$CURL_CODE
653
654 # Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
655 if [ $CURL_CODE == "000" ]; then
656 CURL_RESULT="$CURL_RESULT-$CURL_ERR"
657 fi
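# Example (editor's illustration): if a request times out, 'curl' prints "000" and exits
# with code 28, so CURL_RESULT becomes "000-28"; a page that responds normally yields
# something like CURL_RESULT="200".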
658
659 # Determine if this code is in our "OK" list
660 STATUS="??"
661 INTERWIKI_INDEX=-1
662 for CODE in "${OK_CODES[@]}"; do
663 if [[ $CODE == $CURL_CODE ]]; then
664 let OK_LINKS+=1
665
666 # Determine if this is a link to a domain that we have an interwiki prefix for
667 for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
668 if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]]; then
669 STATUS="IW"
670 INTERWIKI_INDEX=$i
671 break
672 fi
673 done
674
675 # If this link is OK and no interwiki advisory is needed, just mark as "OK"
676 if [ $INTERWIKI_INDEX == -1 ]; then
677 STATUS="OK"
678 fi
679 break
680 fi
681 done
682
683 # If we didn't get a match with the "OK" codes, check it against the "NG" codes
684 if [ $STATUS == "??" ]; then
685 for CODE in "${NG_CODES[@]}"; do
686 if [[ $CODE == $CURL_CODE ]]; then
687 STATUS="NG"
688 let NG_LINKS+=1
689 break
690 fi
691 done
692 fi
693
694 # If we didn't match a known status code, advise the reader
695 if [ $STATUS == "??" ]; then
696 valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown return code $CURL_CODE."
697 let SKIP_UNK_CODE+=1
698 continue
699 fi
700
701 # If link is "NG" and there is an exceptions file, compare URL against the list before logging it
702 if [ $STATUS == "NG" ] && [ ! -z $EXCEPT_URL ]; then
703 GREP_RESULT=$(grep --max-count=1 "$URL" "$EXCEPT_FILE")
704 EXCEPT_CODE=${GREP_RESULT%%,*}
705 if [ "$EXCEPT_CODE" == $CURL_RESULT ]; then
706 valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because its status code, $CURL_RESULT, is listed in the exceptions file."
707 let SKIP_EXCEPT+=1
708 continue
709 fi
710 fi
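# Note on the exceptions file format, inferred from the parsing above (illustrative
# line, not real data): each entry is assumed to begin with the expected status code,
# then a comma, then the URL, e.g.
#   404,http://www.example.com/long-gone-page.html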
711
712 # If appropriate, record this link to the log, with clickable URLs when possible
713 if [ $STATUS != "OK" ] || [ $RECORD_OK_LINKS -eq 1 ]; then
714 FULL_PAGE_PATH=http://$WIKI_PATH/$NS_NAME:$PAGE_NAME
715 LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
716 # Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it explicitly breaks the link
717 if [ $NS_ID -eq 0 ]; then
718 FULL_PAGE_PATH=http://$WIKI_PATH/$PAGE_NAME
719 LOCAL_PAGE_PATH=$PAGE_NAME
720 fi
721
722 # Stupid hack since the text "IW" is narrower than "OK" or "NG" and it takes an extra tab to get
723 # to the desired level of indentation in the RTF log
724 RTF_TABS=" "
725 if [ $STATUS == "IW" ]; then
726 RTF_TABS=" "
727 fi
728
729 # Record link and its wiki page in TXT, RTF, and HTML markup
730 valPrint t "$STATUS ($CURL_RESULT) $STR_TYPE $URL"
731 valPrint t " linked from $FULL_PAGE_PATH"
732 valPrint r "$STATUS ($CURL_RESULT)$RTF_TABS$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
733 valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
734 valPrint hn "<tr><td style=\"white-space:nowrap\">$STATUS ($CURL_RESULT)</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
735 valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"
736
737 # Notify reader if we can use an interwiki prefix for this URL
738 if [ $STATUS == "IW" ]; then
739 valPrint t " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_SLASH]]"
740 valPrint r " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_SLASH]]"
741 valPrint hn "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_SLASH]]</td></tr>"
742 fi
743
744 # Query Internet Archive for latest "OK" snapshot for "NG" page
745 if [ $STATUS == "NG" ] && [ $SUGGEST_SNAPSHOTS -eq 1 ]; then
746 ARCHIVE_QUERY=$(curl --silent --max-time 10 "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")
747
748 # Isolate "url" property in response and log it if received...
749 if [[ "$ARCHIVE_QUERY" == *\"url\":* ]]; then
750 SNAPSHOT_URL=${ARCHIVE_QUERY#*\"url\":\"}
751 SNAPSHOT_URL=${SNAPSHOT_URL%\",\"timestamp*}
752 valPrint t " IA suggests $SNAPSHOT_URL"
753 valPrint r " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
754 valPrint hn "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
755 else # ...otherwise give generic Wayback Machine link for this URL
756 valPrint t " Try browsing $ARCHIVE_GENERIC/$URL"
757 valPrint r " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
758 valPrint hn "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
759 fi
760 fi
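# Abridged, illustrative response from the Wayback "available" API in the form the
# parsing above expects (a "url" field followed by "timestamp"; not real data):
#   {"archived_snapshots": {"closest": {"url": "http://web.archive.org/web/20170701000000/http://www.example.com/", "timestamp": "20170701000000", "status": "200"}}}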
761 fi
762
763 # If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
764 if [ $IS_FILE -eq 0 ] && [ $TAKE_PAGE_SHOT -eq 1 ] && [ $STATUS == "OK" ]; then
765 # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
766 SHOT_NAME=$(echo "$URL" | sed 's/https*\:\/\///' | sed 'y/:\//__/')
767 SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"
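# Example of the sanitization above (editor's illustration):
#   "https://www.example.com/foo/bar" -> SHOT_NAME="www.example.com_foo_bar"
#   and SHOT_FILE=".../Screenshots/www.example.com_foo_bar.png"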
768
769 # Don't take screenshot if we already encountered this page and screenshotted it
770 if [ ! -f "$SHOT_FILE" ]; then
771 "$CHROME" --headless --disable-gpu --screenshot --window-size=1500,900 $URL > /dev/null 2>&1
772 if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
773 mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
774 else
775 valPrint trh "Screenshot of URL $URL seems to have failed!"
776 fi
777 else
778 valPrint trh "Skipping screenshot of URL $URL because $SHOT_FILE already exists."
779 fi
780 fi
781done
782FINISHED_LIST="yes"
783wrapupAndExit