source: Validate External Links/validate_external_links.sh@ 1068

Last change on this file since 1068 was 1067, checked in by iritscen, 7 years ago

Val now understands HTTP redirect responses and will report the URL we're redirected to. Also now tallies IW links.

File size: 35.9 KB
#!/bin/bash

# Validate External Links by Iritscen
# Provided with a list of external links found in the OniGalore wiki, this script validates them.
# The resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF
# (for reading as a local file with clickable links), and HTML (for uploading as a web page).
# Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
# Recommended rule:
# ------------------------------------------------------------------------------------------------------

# Set separator token to newline
IFS="
"

### GLOBALS ###
# Settings -- these will be changed from their defaults by the arguments passed in to the script
LINKS_URL="" # use 'curl' to download file with links from this location (can be file://)
EXCEPT_URL="" # ditto above for file with exceptions to NG results
OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
RECORD_OK_LINKS=0 # record response code to the log whether it's a value in OK_CODES or NG_CODES
SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
TAKE_PAGE_SHOT=0 # take a screenshot of each OK page
URL_START=1 # start at this URL in LINKS_FILE (1 by default)
URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE
UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report

# Fixed strings -- see the occurrences of these variables to learn their purpose
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:54.0) Gecko/20100101 Firefox/54.0"
ARCHIVE_API="http://archive.org/wayback/available"
ARCHIVE_GENERIC="https://web.archive.org/web/*"
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
CHROME="/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary"
CHROME_SCREENSHOT="screenshot.png"
CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt"
EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt"
MY_WIKI_PAGE="http://wiki.oni2.net/User:Iritscen"
THIS_DIR=$(cd $(dirname $0); pwd)
WORKING_DIR=$(pwd)
WIKI_PATH="wiki.oni2.net"

# These are parallel arrays of the IDs and names of OniGalore's current namespaces
declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")

# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
# This determines whether the script tries to take a screenshot of the page or just gets its HTTP code.
declare -a HTTP_FILES=(txt zip wmv jpg png m4a bsl rar oni mp3 mov ONWC vbs TRMA mp4 doc avi log gif pdf dmg exe cpp tga 7z wav east BINA xml dll dae xaf fbx 3ds blend flv csv)
declare -a HTTP_TLDS_AND_PAGES=(com net org uk ru de it htm html php pl asp aspx shtml pgi cgi php3 x jsp phtml cfm css action stm js)
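# As an illustration (these sample URLs are hypothetical, not from the wiki): "http://example.com/patch.zip"
# ends in "zip", which is in HTTP_FILES, so it would be treated as a file and only have its HTTP code
# checked; "http://example.com/index.php" ends in "php", which is in HTTP_TLDS_AND_PAGES, so it would be
# treated as a page and also be eligible for a screenshot.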

# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
# if you add a new code.
declare -a OK_CODES=(200 401 405 406 501)
declare -a RD_CODES=(301 302 303 307 308)
declare -a NG_CODES=(000 403 404 410 500 503)

# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
# transcluded text, and if the transclusion fails, then the braces show up in the URL
ILLEGAL_CHARS="{ }"

# These are parallel arrays giving the prefixes that can be used in place of normal external links to
# some wikis and other sites
declare -a INTERWIKI_PREFIXES=(metawikipedia wikipedia wikiquote wiktionary)
declare -a INTERWIKI_DOMAINS=(meta.wikipedia.org wikipedia.org wikiquote.org wiktionary.org)
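# For instance (a hypothetical link, not from the wiki), a URL such as "https://en.wikipedia.org/wiki/Oni"
# contains "wikipedia.org", so the script would flag it as "IW" and suggest the interwiki markup
# [[wikipedia:Oni]], built from the matching prefix and the text after the URL's last slash.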

# Variables for keeping track of main loop progress and findings
LINK_NUM=0
OK_LINKS=0
RD_LINKS=0
IW_LINKS=0
NG_LINKS=0
SKIP_UNK_NS=0
SKIP_JS_PAGE=0
SKIP_BAD_URL=0
SKIP_NON_ASCII=0
SKIP_UNK_SUFFIX=0
SKIP_UNK_CODE=0
SKIP_EXCEPT=0
FILE_LINKS=0
PAGE_LINKS=0
SKIPPED_HEADER_ROW=0
FINISHED_LIST="no"


### HELP ###
# A pseudo-man page. Here is the 80-character rule for the page text:
# 234567890123456789012345678901234567890123456789012345678901234567890123456789
function printHelp()
{
 cat << EOF

NAME
 Validate External Links

SYNOPSIS
 validate_external_links.sh --help
 validate_external_links.sh --links URL --output PATH [--exceptions URL]
 [--record-ok-links] [--suggest-snapshots] [--take-screenshots]
 [--start-url NUM] [--end-url NUM] [--upload PATH]

DESCRIPTION
 This script parses a list of external links found in the OniGalore wiki
 (which is dumped by the Oni2.net domain periodically in a particular
 format), validates them using the Unix tool 'curl', and produces a report
 of which links were OK (responded to an HTTP query) and which were NG (no
 good). This report can then be automatically uploaded to the location of
 your choice. The script can also suggest Internet Archive snapshots for
 NG links, and take screenshots of OK links for visual verification by the
 reader that the page in question is the one intended to be displayed.

 You must pass this script the URL at which the list of links is found
 (--links) and the path where logs should be written (--output). All
 other arguments are optional.

OPTIONS
 --help Show this page
 --links URL URL from which to download file with external links
 (note that this can be a local file if you use the
 file:// protocol) (required)
 --output DIR Place the folder which will contain the reports and
 optional screenshots at this path (required)
 --exceptions URL Don't log an NG link if it is listed in the file
 found at this URL, as long as the response code is
 the same as the one associated with the link
 --record-ok-links Log a link in the report whether its response code is
 in the OK_CODES or the NG_CODES array
 --suggest-snapshots Query the Internet Archive for a possible snapshot
 URL for each NG page
 --take-screenshots Save screenshots of each OK page (requires Google
 Chrome to be found at the path in CHROME)
 --start-url NUM Start at this link in the links file
 --end-url NUM Stop at this link in the links file
 --upload FILE Upload report using info in this local file

BUGS
 The script cannot properly parse any line in the external links file
 which contains a comma in the name of the wiki page containing a link.
 Commas in the link itself are not an issue.
EOF
}
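
# Example invocation (the links URL and output path below are illustrative, not real locations):
#   ./validate_external_links.sh --links http://example.com/wiki_extlinks.csv --output ~/Desktop --suggest-snapshots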


### SETUP ###
# If first argument is a help request, or if nothing was passed in at all, print help page and quit
if [ "$#" -eq 0 ] || [ "$1" == "--help" ]; then
 printHelp | less
 exit 0
fi

# Parse arguments as long as there are more arguments to process
while (( "$#" )); do
 case "$1" in
 --links ) LINKS_URL="$2"; shift 2;;
 --exceptions ) EXCEPT_URL="$2"; shift 2;;
 --output ) OUTPUT_DIR="$2"; shift 2;;
 --record-ok-links ) RECORD_OK_LINKS=1; shift;;
 --suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;;
 --take-screenshots ) TAKE_PAGE_SHOT=1; shift;;
 --start-url ) URL_START=$2; shift 2;;
 --end-url ) URL_LIMIT=$2; shift 2;;
 --upload ) UPLOAD_INFO=$2; shift 2;;
 * ) echo "Invalid argument $1 detected. Aborting."; exit 1;;
 esac
done

# If the required arguments were not supplied, print help page and quit
if [ -z $LINKS_URL ] || [ -z $OUTPUT_DIR ]; then
 printHelp
 echo "Error: I did not receive one or both required arguments."
 exit 2
fi

# Check that UPLOAD_INFO exists, if this argument was supplied
if [ ! -z $UPLOAD_INFO ] && [ ! -f "$UPLOAD_INFO" ]; then
 echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
 exit 3
fi

# Check that OUTPUT_DIR is a directory
if [ ! -d "$OUTPUT_DIR" ]; then
 echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
 exit 4
fi

# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
SHOT_PATH="$OUTPUT_PATH/Screenshots"
LOG_NAME="ValExtLinks report"
LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt"
LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf"
LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm"
mkdir "$OUTPUT_PATH"
if [ $TAKE_PAGE_SHOT -eq 1 ]; then
 mkdir "$SHOT_PATH"
fi

# Check that 'mkdir' succeeded
if [ ! -d "$OUTPUT_PATH" ]; then
203 echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_PATH. Aborting."
 exit 5
fi

# Get date on the file at LINKS_URL and print to log
LINKS_DATE=$(curl --silent --head $LINKS_URL | grep "Last-Modified")
if [ -z "$LINKS_DATE" ]; then
 echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
 exit 6
fi
LINKS_DATE=${LINKS_DATE#Last-Modified: }
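# e.g. if curl returned the (illustrative) header "Last-Modified: Sat, 23 Sep 2017 02:00:00 GMT",
# LINKS_DATE is now "Sat, 23 Sep 2017 02:00:00 GMT", which matches the format of NICE_TIME above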


### UTILITY FUNCTIONS ###
# Writes a plain-text header to TXT log file
function printTXTheader()
{
 valPrint t "Validate External Links report"
 valPrint t "generated $NICE_TIME"
 valPrint t "from data of $LINKS_DATE"
 valPrint t "script by Iritscen (contact: $MY_WIKI_PAGE)"
 valPrint t ""
}

# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified
function printRTFheader()
{
 valPrint r "{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww12600\viewh12100\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0

\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
generated $NICE_TIME\\
from data of $LINKS_DATE\\
script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$MY_WIKI_PAGE\"}}{\fldrslt contact}})
\\
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
\cf0 "
}

# Closes the RTF markup of the RTF log file
function printRTFfooter()
{
 valPrint r "}"
}

# Writes the HTML header to HTML log file
function printHTMheader()
{
 valPrint h "<html>
<head>
<title>Validate External Links report</title>
</head>
<body>
<h2>Validate External Links report</h2>
<h3>generated $NICE_TIME<br />
from data of $LINKS_DATE<br />
script by Iritscen (<a href=\"$MY_WIKI_PAGE\" target=\"_blank\">contact</a>)</h3>"
}

# Closes the HTML markup of the HTML log file
function printHTMfooter()
{
 valPrint h "</body>
</html>"
}

# The central logging function. The first parameter is a string composed of one or more characters that
# indicates which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
# 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 'w' means "Don't
# pass console output through 'fmt'" ("fmt" fits the output to an 80-column CLI but can break special
# formatting and the 'n' option).
function valPrint()
{
 if [[ "$1" == *c* ]]; then
 if [[ "$1" == *n* ]]; then
 echo -n "$2"
 elif [[ "$1" == *w* ]]; then
 echo "$2"
 else
 echo "$2" | fmt -w 80
 fi
 fi
 if [[ "$1" == *t* ]]; then
 if [[ "$1" == *n* ]]; then
 echo -n "$2" >> "$LOG_TXT"
 else
 echo "$2" >> "$LOG_TXT"
 fi
 fi
 if [[ "$1" == *r* ]]; then
 if [[ "$1" == *n* ]]; then
 echo "$2" >> "$LOG_RTF"
 else
 echo "$2\\" >> "$LOG_RTF"
 fi
 fi
 if [[ "$1" == *h* ]]; then
 if [[ "$1" == *n* ]]; then
 echo "$2" >> "$LOG_HTM"
 else
 echo "$2<br />" >> "$LOG_HTM"
 fi
 fi
}
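# For instance, "valPrint ctrh" writes the same line to the console and all three logs, while
# "valPrint hn" writes only to the HTML log and suppresses the "<br />" that would otherwise be
# appended (see the calls throughout the rest of the script).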

# Pluralize the string in parameter 1 if the number in parameter 2 is not 1
function pluralCheckNoun()
{
 if [ $2 -ne 1 ]; then
 if [[ $1 =~ x$ ]]; then
 echo $1es
 else
 echo $1s
 fi
 else
 echo $1
 fi
}

# Output "is" if parameter 1 is 1, otherwise "are"
function pluralCheckIs()
{
 if [ $1 -ne 1 ]; then
 echo "are"
 else
 echo "is"
 fi
}

# Output "was" if parameter 1 is 1, otherwise "were"
function pluralCheckWas()
{
 if [ $1 -ne 1 ]; then
 echo "were"
 else
 echo "was"
 fi
}

# Output "a " if parameter 1 is 1, otherwise nothing
function pluralCheckA()
{
 if [ $1 -eq 1 ]; then
 echo "a "
 fi
}

# Output "an " if parameter 1 is 1, otherwise nothing
function pluralCheckAn()
{
 if [ $1 -eq 1 ]; then
 echo "an "
 fi
}

# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
# reports being saved to disk have already been closed.
function uploadReport()
{
 valPrint c "Uploading HTML report..."

 SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
 SFTP_USER_NAME_MARKER="user:"
 SFTP_PASSWORD_MARKER="pw:"
 SFTP_PORT_MARKER="port:"
 SFTP_PATH_MARKER="path:"
 SFTP_USER_NAME=$(grep $SFTP_USER_NAME_MARKER $UPLOAD_INFO)
 SFTP_USER_NAME=${SFTP_USER_NAME#$SFTP_USER_NAME_MARKER}
 SFTP_PASSWORD=$(grep $SFTP_PASSWORD_MARKER $UPLOAD_INFO)
 SFTP_PASSWORD=${SFTP_PASSWORD#$SFTP_PASSWORD_MARKER}
 SFTP_PORT=$(grep $SFTP_PORT_MARKER $UPLOAD_INFO)
 SFTP_PORT=${SFTP_PORT#$SFTP_PORT_MARKER}
 SFTP_PATH=$(grep $SFTP_PATH_MARKER $UPLOAD_INFO)
 SFTP_PATH=${SFTP_PATH#$SFTP_PATH_MARKER}

 expect "$SCRIPT_PATH" "$LOG_HTM" $SFTP_USER_NAME $SFTP_PASSWORD $SFTP_PORT $SFTP_PATH "$LOG_NAME.htm"

 valPrint c "Report was uploaded, unless an error message appears above."
}

# Prints session summary when script is done
function wrapupAndExit()
{
 # Get off progress line on console, drop down a line from last link in log, and close HTML table
 valPrint ctr ""
 valPrint h "</table><br />"

 # If we didn't finish processing the last URL, then the iterator is one too high
 if [ $FINISHED_LIST != "yes" ]; then
 let LINK_NUM-=1
 if [ $FINISHED_LIST == "no" ]; then
 valPrint ctrh "The session was canceled by the user."
 fi
 fi

 # Output results of session and close the log file's markup
 LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
 LINKS_SKIPPED=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
 LINKS_CHECKED=$((LINKS_PROCESSED-LINKS_SKIPPED))
 valPrint ct "Summary:"
 valPrint r "\b1 Summary \b0"
 valPrint hn "<h3><span id=\"summary\">Summary</span></h3>"
 valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT)."
 valPrint ctrh "I skipped $LINKS_SKIPPED $(pluralCheckNoun link $LINKS_SKIPPED), and found $FILE_LINKS $(pluralCheckNoun file $FILE_LINKS) and $PAGE_LINKS $(pluralCheckNoun page $PAGE_LINKS)."
 if [ $LINKS_SKIPPED -gt 0 ]; then valPrint ctrh "Skip breakdown: "; fi
 if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
 if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE links on JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
 if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
 if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
 if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
 if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
 valPrint ctrh "Out of the $LINKS_CHECKED links checked, $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG."
 if [ $IW_LINKS -gt 0 ]; then
 valPrint ctrh "$IW_LINKS/$OK_LINKS OK $(pluralCheckNoun link $OK_LINKS) $(pluralCheckIs $IW_LINKS) $(pluralCheckAn $IW_LINKS)external $(pluralCheckNoun link $IW_LINKS) that could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS)."
 fi
 if [ $SKIP_EXCEPT -gt 0 ]; then
 valPrint ctrh "$SKIP_EXCEPT/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS) went unlisted due to being found in the exceptions file."
 fi
 printRTFfooter
 printHTMfooter

 # Upload report if this was requested
 if [ ! -z $UPLOAD_INFO ] && [ $FINISHED_LIST != "no" ]; then
 uploadReport
 fi

 # Really quit now
 valPrint c "ValExtLinks says goodbye."
 exit 0
}
trap wrapupAndExit INT


### INITIALIZATION ###
# Print opening message to console and log files
valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
printTXTheader
printRTFheader
printHTMheader

# Attempt to download file at LINKS_URL, then check that it succeeded
valPrint ctrh "Downloading list of external links from $LINKS_URL."
LINKS_FILE_NAME=$(echo "$LINKS_URL" | sed 's/.*\///')
LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
curl --silent -o "$LINKS_FILE" $LINKS_URL
if [ ! -f "$LINKS_FILE" ]; then
 echo "The download of $LINKS_URL appears to have failed. Aborting."
 wrapupAndExit
fi

# Attempt to download file at EXCEPT_URL, then check that it succeeded
if [ ! -z $EXCEPT_URL ]; then
 valPrint ctrh "Downloading list of NG exceptions from $EXCEPT_URL."
 EXCEPT_FILE_NAME=$(echo "$EXCEPT_URL" | sed 's/.*\///')
 EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
 curl --silent -o "$EXCEPT_FILE" $EXCEPT_URL
 if [ ! -f "$EXCEPT_FILE" ]; then
 echo "The download of $EXCEPT_URL appears to have failed. Aborting."
 wrapupAndExit
 fi
fi

# Pipe 'cat' to 'wc' because passing LINKS_FILE to 'wc' would print the file name after the line count
LINK_COUNT_STRING=$(cat "$LINKS_FILE" | wc -l)

# Number of URLs is number of lines minus one (first line is column header row for the CSV)
LINK_COUNT=$(echo "${LINK_COUNT_STRING}" | tr -d '[:space:]')
let LINK_COUNT-=1

# Calculate number of URLs to consider
if [ $URL_LIMIT -ne 0 ] && [ $URL_LIMIT -lt $LINK_COUNT ]; then
 valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((URL_LIMIT-URL_START+1)) of them, from link $URL_START to $URL_LIMIT."
elif [ $URL_START -ne 1 ]; then
 valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((LINK_COUNT-URL_START+1)) of them, from link $URL_START to $LINK_COUNT."
else
 valPrint ctrh "Found $LINK_COUNT links to process. I will be considering all of them."
fi

# Print settings to console and log
declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not print NG links that are listed in the exceptions file.")
if [ $TAKE_PAGE_SHOT -eq 0 ]; then SETTINGS_MSG[10]="but will not"; fi
if [ $RECORD_OK_LINKS -eq 0 ]; then SETTINGS_MSG[22]="not"; fi
if [ $SUGGEST_SNAPSHOTS -eq 0 ]; then SETTINGS_MSG[26]="will not"; fi
if [ -z $EXCEPT_URL ] || [ $RECORD_OK_LINKS -eq 1 ]; then SETTINGS_MSG[40]=""; fi
SETTINGS_STR=${SETTINGS_MSG[@]}
valPrint ctrh "$SETTINGS_STR"
valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
valPrint trh ""

# Print legend to logs
valPrint t "Legend:"
valPrint r "\b1 Legend \b0"
valPrint hn "<h3>Legend</h3>"
valPrint trh "OK = URL seems to be working."
valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it, because false negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to Iritscen. An NG link should be followed by a link to the Internet Archive's Wayback Machine which may help you repair the link. If the link cannot be repaired, you can disable it on the wiki (which prevents it from showing up in future ValExtLinks reports) by wrapping it in nowiki tags."
valPrint trh "RD = The server responding to this URL is saying that the page moved and you should instead use the supplied new URL. Some RD links represent minor adjustments in the organization of a web site, and some are soft 404s (the file/page has been removed and you are being redirected to something like the main page of the web site). You will have to look at the new URL yourself to determine if it represents an OK link and the link on the wiki should be updated to this one, or if the desired file/page is actually gone and we need to replace the wiki link with an Internet Archive snapshot link -- or disable the URL if it has not been archived."
valPrint trh "IW = URL is working but should be converted to interwiki link using the suggested markup."
valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $HTTP_CODES)."
valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$HTTP_CODES\"}}{\fldrslt here}} for code reference)."
valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$HTTP_CODES\" target=\"_blank\">here</a> for code reference)."
valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $CURL_CODES)."
valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$CURL_CODES\"}}{\fldrslt here}} for code reference)."
valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$CURL_CODES\" target=\"_blank\">here</a> for code reference)."
valPrint trh "IA suggests = Last available snapshot suggested by the Internet Archive."
valPrint trh "Try browsing = The Archive occasionally fails to return a snapshot URL even when one exists, so you will need to check for a snapshot manually using the Wayback Machine before concluding that a site has not been archived."
valPrint trh ""


### MAIN LOOP ###
# Process each line of the .csv in LINKS_FILE
for LINE in `cat "$LINKS_FILE"`; do
 let LINK_NUM+=1

 # First line is the column header row for the CSV, so let's verify that the format hasn't changed
 if [ $SKIPPED_HEADER_ROW -eq 0 ]; then
 if [ $LINE == "namespace,title,target" ]; then
 SKIPPED_HEADER_ROW=1
 LINK_NUM=0 # this line is not a link, so reset the link counter
 valPrint hn "<table>"
 continue
 else
 valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
 wrapupAndExit
 fi
 fi

 # Skip this link if we are not at URL_START yet
 if [ $LINK_NUM -lt $URL_START ]; then
 continue
 fi

 # Stop if we are at the limit declared for testing purposes
 if [ $URL_LIMIT -gt 0 ] && [ $LINK_NUM -gt $URL_LIMIT ]; then
 FINISHED_LIST="limit"
 wrapupAndExit
 fi

 # Print progress to screen
 if [ $LINK_NUM -gt 1 ]; then
 printf "\e[1A\n" # erase previous progress message so that new one appears in its place
 fi
 valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."

 # The number of the namespace is the element before the first comma on the line
 NS_ID=${LINE%%,*}

 # Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
 NS_NAME=""
 a=0
 while [ "x${NS_IDS[$a]}" != "x" ] # once this evaluates to "x", the array is done
 do
 if [ $NS_ID -eq ${NS_IDS[$a]} ]; then
 NS_NAME="${NS_NAMES[$a]}"
 break
 fi
 let a+=1
 done
 if [ -z "$NS_NAME" ]; then
566 valPrint tr "Skipping URL found on page $PAGE_NAME because I could not find a name for namespace ID $NS_ID."
 let SKIP_UNK_NS+=1
 continue
 fi

 # The name of the page is everything between the namespace ID and the next comma on the line (commas
 # in page names will break this)
 PAGE_NAME=${LINE#$NS_ID,}
 PAGE_NAME=${PAGE_NAME%%,*}

 # We don't want to consider wiki pages ending in .js, as the parser cannot reliably isolate URLs in
 # JavaScript code, so it will return erroneous links
 PAGE_NAME_SUFFIX=$(echo $PAGE_NAME | sed 's/.*\.//')
 if [ $PAGE_NAME_SUFFIX == "js" ]; then
 valPrint tr "Skipping URL found on JavaScript page $PAGE_NAME."
 let SKIP_JS_PAGE+=1
 continue
 fi

 # The URL being linked to is everything after the previous two fields (this allows commas to be in
 # the URLs, but a comma in the previous field, the page name, will break this)
 URL=${LINE#$NS_ID,$PAGE_NAME,}
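 # As a worked (hypothetical) example, the line "0,Some page,http://example.com/file.zip" yields
 # NS_ID "0", PAGE_NAME "Some page" and URL "http://example.com/file.zip"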

 # Scan for illegal characters
 if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
 valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because it contains characters illegal in a URL."
 let SKIP_BAD_URL+=1
 continue
 fi

 # Now we need to know if the URL is for a file or a web page. First step is to determine if the
 # URL ends in a suffix
 HAS_SUFFIX=0

 # If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
 SAN_URL=${URL%%\?*}

 # If the URL ends in something like "#section_15", strip everything from the '#' onward
 SAN_URL=${SAN_URL%%\#*}

 # 'sed' cannot handle Unicode in my Bash shell, so skip this URL and make the user check it
 if [[ $SAN_URL == *[![:ascii:]]* ]]; then
 valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I cannot handle non-ASCII characters."
 let SKIP_NON_ASCII+=1
 continue
 fi

 # Isolate the characters after the last period and after the last slash
 POST_DOT=$(echo "$SAN_URL" | sed 's/.*\.//')
 POST_SLASH=$(echo "$SAN_URL" | sed 's/.*\///')

 # If the last period comes after the last slash, then the URL ends in a suffix
 POST_DOT_LENGTH=$(echo | awk -v input=$POST_DOT '{print length(input)}')
 POST_SLASH_LENGTH=$(echo | awk -v input=$POST_SLASH '{print length(input)}')
 if [ $POST_DOT_LENGTH -lt $POST_SLASH_LENGTH ]; then
 HAS_SUFFIX=1
 else
 HAS_SUFFIX=0
 fi
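 # e.g. for the hypothetical URL "http://example.com/files/patch.zip", POST_DOT is "zip" and
 # POST_SLASH is "patch.zip", so HAS_SUFFIX becomes 1; for "http://example.com/wiki/Main_Page" the
 # last period precedes the last slash, so HAS_SUFFIX stays 0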

 # Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
 # known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES
 IS_FILE=-1
 if [ $HAS_SUFFIX -eq 0 ]; then
 IS_FILE=0
 else
 # Turn off case sensitivity while we compare suffixes
 shopt -s nocasematch

 # Special case: URLs ending in something like "/productID.304297400" are pages, not files, so if
 # the URL's suffix is all numbers, we are looking at the end of a web page URL
 if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
 IS_FILE=0
 fi

 # If we did not identify this URL as a web page above, we need to compare the suffix against known
 # file extensions
 if [ $IS_FILE -eq -1 ]; then
 for EXTENSION in "${HTTP_FILES[@]}"; do
 if [[ $EXTENSION == $POST_DOT ]]; then # double brackets respect the nocasematch setting
 IS_FILE=1
 break
 fi
 done
 fi

 # If we did not identify this URL as a file above, we need to compare the suffix against known
 # pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
 # needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
 if [ $IS_FILE -eq -1 ]; then
 for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
 if [[ $PAGE_OR_TLD == $POST_DOT ]]; then
 IS_FILE=0
 break
 fi
 done
 fi

 # Turn case sensitivity back on in Bash
 shopt -u nocasematch
 fi

 # If this suffix escaped identification as either a file, page or TLD, inform the user
 STR_TYPE=""
 if [ $IS_FILE -eq -1 ]; then
 valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown URL ending $POST_DOT. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
 let SKIP_UNK_SUFFIX+=1
 continue
 elif [ $IS_FILE -eq 1 ]; then
 STR_TYPE="file"
 let FILE_LINKS+=1
 elif [ $IS_FILE -eq 0 ]; then
 STR_TYPE="page"
 let PAGE_LINKS+=1
 fi

 # Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
 # issue with sites that require HTTPS
 CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{http_code}\n' $URL)
 CURL_ERR=$(echo $?)
 CURL_RESULT=$CURL_CODE

 # Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
 if [ $CURL_CODE == "000" ]; then
 CURL_RESULT="$CURL_RESULT-$CURL_ERR"
 fi
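 # e.g. if a host cannot be resolved (an illustrative case), 'curl' prints "000" and exits with
 # code 6, so CURL_RESULT becomes "000-6"; the meanings of these exit codes are listed at $CURL_CODES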

 # Determine if this code is in our "OK" list
 STATUS="??"
 NEW_URL=""
 INTERWIKI_INDEX=-1
 for CODE in "${OK_CODES[@]}"; do
 if [[ $CODE == $CURL_CODE ]]; then
 let OK_LINKS+=1

 # Determine if this is a link to a domain that we have an interwiki prefix for
 for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
 if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]]; then
 STATUS="IW"
 let IW_LINKS+=1
 INTERWIKI_INDEX=$i
 break
 fi
 done

 # If this link is OK and no interwiki advisory is needed, just mark as "OK"
 if [ $INTERWIKI_INDEX == -1 ]; then
 STATUS="OK"
 fi
 break
 fi
 done

 # If we didn't get a match with the "OK" codes, check it against the "RD" codes
 if [ $STATUS == "??" ]; then
 for CODE in "${RD_CODES[@]}"; do
 if [[ $CODE == $CURL_CODE ]]; then
 STATUS="RD"
 let RD_LINKS+=1

 # Get URL header again in order to retrieve the URL we are being redirected to
 NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{redirect_url}\n' $URL)

 break
 fi
 done
 fi

 # If we didn't get a match with the "RD" codes, check it against the "NG" codes
 if [ $STATUS == "??" ]; then
 for CODE in "${NG_CODES[@]}"; do
 if [[ $CODE == $CURL_CODE ]]; then
 STATUS="NG"
 let NG_LINKS+=1
 break
 fi
 done
 fi

 # If we didn't match a known status code, advise the reader
 if [ $STATUS == "??" ]; then
 valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown return code $CURL_CODE."
 let SKIP_UNK_CODE+=1
 continue
 fi

 # If link is "NG" and there is an exceptions file, compare URL against the list before logging it
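 # (Judging from the parsing below, each line of the exceptions file is expected to begin with the
 # response code, followed by a comma and the URL, e.g. the hypothetical entry "404,http://example.com/gone.html".)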
753 if [ $STATUS == "NG" ] && [ ! -z $EXCEPT_URL ]; then
754 GREP_RESULT=$(grep --max-count=1 "$URL" "$EXCEPT_FILE")
755 EXCEPT_CODE=${GREP_RESULT%%,*}
756 if [ "$EXCEPT_CODE" == $CURL_RESULT ]; then
757 valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because its status code, $CURL_RESULT, is listed in the exceptions file."
758 let SKIP_EXCEPT+=1
759 continue
760 fi
761 fi
762
763 # If appropriate, record this link to the log, with clickable URLs when possible
764 if [ $STATUS != "OK" ] || [ $RECORD_OK_LINKS -eq 1 ]; then
765 FULL_PAGE_PATH=http://$WIKI_PATH/$NS_NAME:$PAGE_NAME
766 LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
767 # Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it explicitly breaks the link
768 if [ $NS_ID -eq 0 ]; then
769 FULL_PAGE_PATH=http://$WIKI_PATH/$PAGE_NAME
770 LOCAL_PAGE_PATH=$PAGE_NAME
771 fi
772
[1067]773 # Stupid hack since the text "IW" is narrower than "OK", "RD", or "NG" and it takes an extra tab
774 # to get to the desired level of indentation in the RTF log
[1064]775 RTF_TABS=" "
776 if [ $STATUS == "IW" ]; then
777 RTF_TABS=" "
778 fi
779
780 # Record link and its wiki page in TXT, RTF, and HTML markup
781 valPrint t "$STATUS ($CURL_RESULT) $STR_TYPE $URL"
782 valPrint t " linked from $FULL_PAGE_PATH"
783 valPrint r "$STATUS ($CURL_RESULT)$RTF_TABS$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
784 valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
785 valPrint hn "<tr><td style=\"white-space:nowrap\">$STATUS ($CURL_RESULT)</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
786 valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"
787
[1067]788 # Record redirect URL if one was given by a 3xx response page
789 if [ $STATUS == "RD" ]; then
790 valPrint t " Server suggests $NEW_URL"
791 valPrint r " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
792 valPrint hn "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
793 fi
794
[1064]795 # Notify reader if we can use an interwiki prefix for this URL
796 if [ $STATUS == "IW" ]; then
797 valPrint t " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_SLASH]]"
798 valPrint r " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_SLASH]]"
799 valPrint hn "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_SLASH]]</td></tr>"
800 fi
801
802 # Query Internet Archive for latest "OK" snapshot for "NG" page
803 if [ $STATUS == "NG" ] && [ $SUGGEST_SNAPSHOTS -eq 1 ]; then
804 ARCHIVE_QUERY=$(curl --silent --max-time 10 "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")
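 # A successful query is expected to return JSON containing a "closest" snapshot object, along the
 # lines of (illustrative): {"archived_snapshots": {"closest": {"available": true, "url": "https://web.archive.org/web/...", "timestamp": "...", "status": "200"}}}
 # The parsing below takes the text after the last '"url": "' up to the following '", "timestamp'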

 # Isolate "url" property in response and log it if a "closest" snapshot was received...
 if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
 SNAPSHOT_URL=${ARCHIVE_QUERY##*\"url\": \"}
 SNAPSHOT_URL=${SNAPSHOT_URL%\", \"timestamp*}
 valPrint t " IA suggests $SNAPSHOT_URL"
 valPrint r " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
 valPrint hn "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
 else # ...otherwise give generic Wayback Machine link for this URL
 valPrint t " Try browsing $ARCHIVE_GENERIC/$URL"
 valPrint r " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
 valPrint hn "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
 fi
 fi
 fi

 # If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
 if [ $IS_FILE -eq 0 ] && [ $TAKE_PAGE_SHOT -eq 1 ] && [ $STATUS == "OK" ]; then
 # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
 SHOT_NAME=$(echo "$URL" | sed 's/https*\:\/\///' | sed 'y/:\//__/')
 SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"

 # Don't take screenshot if we already encountered this page and screenshotted it
 if [ ! -f "$SHOT_FILE" ]; then
 "$CHROME" --headless --disable-gpu --screenshot --window-size=1500,900 $URL > /dev/null 2>&1
 if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
 mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
 else
 valPrint trh "Screenshot of URL $URL seems to have failed!"
 fi
 else
 valPrint trh "Skipping screenshot of URL $URL because $SHOT_FILE already exists."
 fi
 fi
done
FINISHED_LIST="yes"
wrapupAndExit