source: Validate External Links/validate_external_links.sh@ 1069

Last change on this file since 1069 was 1069, checked in by iritscen, 7 years ago

ValExtLinks: IW links now reported as separate category from OK links. RD links that are just redirecting from http:// to https:// are now regarded as OK.

File size: 36.5 KB
#!/bin/bash

# Validate External Links by Iritscen
# Provided with a list of external links found in the OniGalore wiki, this script validates them.
# The resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF
# (for reading as a local file with clickable links), and HTML (for uploading as a web page).
# Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
# Recommended rule:
# ------------------------------------------------------------------------------------------------------

# Set separator token to newline
IFS="
"

### GLOBALS ###
# Settings -- these will be changed from their defaults by the arguments passed in to the script
LINKS_URL="" # use 'curl' to download file with links from this location (can be file://)
EXCEPT_URL="" # ditto above for file with exceptions to NG results
OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
RECORD_OK_LINKS=0 # record a link in the log even if its response code is in OK_CODES (by default only non-OK results are logged)
SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
TAKE_PAGE_SHOT=0 # take a screenshot of each OK page
URL_START=1 # start at this URL in LINKS_FILE (1 by default)
URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE
UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report

# Fixed strings -- see the occurrences of these variables to learn their purpose
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:54.0) Gecko/20100101 Firefox/54.0"
ARCHIVE_API="http://archive.org/wayback/available"
ARCHIVE_GENERIC="https://web.archive.org/web/*"
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
CHROME="/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary"
CHROME_SCREENSHOT="screenshot.png"
CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt"
EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt"
MY_WIKI_PAGE="http://wiki.oni2.net/User:Iritscen"
THIS_DIR=$(cd $(dirname $0); pwd)
WORKING_DIR=$(pwd)
WIKI_PATH="wiki.oni2.net"

# These are parallel arrays of the IDs and names of OniGalore's current namespaces
declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")
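# For example (reading the parallel arrays above), namespace ID 0 maps to "Main",
# 1 to "Talk", 102 to "OBD" and 111 to "XML_talk"; the lookup loop in the main loop
# below walks NS_IDS until it finds the ID and takes the NS_NAMES entry at the same index.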

# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
# This determines whether the script tries to take a screenshot of the page or just gets its HTTP code.
declare -a HTTP_FILES=(txt zip wmv jpg png m4a bsl rar oni mp3 mov ONWC vbs TRMA mp4 doc avi log gif pdf dmg exe cpp tga 7z wav east BINA xml dll dae xaf fbx 3ds blend flv csv)
declare -a HTTP_TLDS_AND_PAGES=(com net org uk ru de it htm html php pl asp aspx shtml pgi cgi php3 x jsp phtml cfm css action stm js)

# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
# if you add a new code.
declare -a OK_CODES=(200 401 405 406 501)
declare -a RD_CODES=(301 302 303 307 308)
declare -a NG_CODES=(000 403 404 410 500 503)

# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
# transcluded text, and if the transclusion fails, then the braces show up in the URL
ILLEGAL_CHARS="{ }"

# These are parallel arrays giving the prefixes that can be used in place of normal external links to
# some wikis and other sites
declare -a INTERWIKI_PREFIXES=(metawikipedia wikipedia wikiquote wiktionary)
declare -a INTERWIKI_DOMAINS=(meta.wikipedia.org wikipedia.org wikiquote.org wiktionary.org)
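# For example, a (hypothetical) link to https://en.wikipedia.org/wiki/Oni would match the
# "wikipedia.org" entry above, and the report would suggest the interwiki markup
# [[wikipedia:Oni]] -- the prefix at the same index plus the part of the URL after the
# last slash (see the "IW" handling in the main loop).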

# Variables for keeping track of main loop progress and findings
LINK_NUM=0
OK_LINKS=0
RD_LINKS=0
IW_LINKS=0
NG_LINKS=0
SKIP_UNK_NS=0
SKIP_JS_PAGE=0
SKIP_BAD_URL=0
SKIP_NON_ASCII=0
SKIP_UNK_SUFFIX=0
SKIP_UNK_CODE=0
SKIP_EXCEPT=0
FILE_LINKS=0
PAGE_LINKS=0
SKIPPED_HEADER_ROW=0
FINISHED_LIST="no"


### HELP ###
# A pseudo-man page. Here is the 80-character rule for the page text:
# 234567890123456789012345678901234567890123456789012345678901234567890123456789
function printHelp()
{
  cat << EOF

NAME
   Validate External Links

SYNOPSIS
   validate_external_links.sh --help
   validate_external_links.sh --links URL --output PATH [--exceptions FILE]
      [--record-ok-links] [--suggest-snapshots] [--take-screenshots]
      [--start-url NUM] [--end-url NUM] [--upload PATH]

DESCRIPTION
   This script parses a list of external links found in the OniGalore wiki
   (which is dumped by the Oni2.net domain periodically in a particular
   format), validates them using the Unix tool 'curl', and produces a report
   of which links were OK (responded positively to an HTTP query), which
   were RD (responded with a 3xx redirect code), which could be IW (inter-
   wiki) links, and which were NG (no good; a negative response to the
   query). This report can then be automatically uploaded to the location of
   your choice. The script can also suggest Internet Archive snapshots for
   NG links, and take screenshots of OK links for visual verification by the
   reader that the page in question is the one intended to be displayed.

   You must pass this script the URL at which the list of links is found
   (--links) and the path where logs should be outputted (--output). All
   other arguments are optional.

OPTIONS
   --help                Show this page
   --links URL           URL from which to download file with external links
                         (note that this can be a local file if you use the
                         file:// protocol) (required)
   --output DIR          Place the folder which will contain the reports and
                         optional screenshots at this path (required)
   --exceptions URL      In order to remove links from the list which show as
                         NG but which you regard as OK, prepare a plain-text
                         file where each line contains a response code being
                         returned and the URL returning it, separated by a
                         comma, e.g. "403,http://www.example.com" (note that
                         this can be a local file if you use the
                         file:// protocol)
   --record-ok-links     Log a link in the report even if its response code is
                         OK
   --suggest-snapshots   Query the Internet Archive for a possible snapshot
                         URL for each NG page
   --take-screenshots    Save screenshots of each OK page (requires Google
                         Chrome to be found at the path in CHROME)
   --start-url NUM       Start at this link in the links file
   --end-url NUM         Stop at this link in the links file
   --upload FILE         Upload report using info in this local file

BUGS
   The script cannot properly parse any line in the external links file
   which contains a comma in the name of the wiki page containing a link.
   Commas in the link itself are not an issue.
EOF
}
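
# Example invocation (the URL and output path below are illustrative, not real):
#   ./validate_external_links.sh --links http://example.com/wiki_extlinks.csv \
#     --output ~/ValExtLinks-reports --suggest-snapshots --record-ok-links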


### SETUP ###
# If first argument is a help request, or if nothing was passed in at all, print help page and quit
if [ "$#" -eq 0 ] || [ "$1" == "--help" ]; then
  printHelp | less
  exit 0
fi

# Parse arguments as long as there are more arguments to process
while (( "$#" )); do
  case "$1" in
    --links ) LINKS_URL="$2"; shift 2;;
    --exceptions ) EXCEPT_URL="$2"; shift 2;;
    --output ) OUTPUT_DIR="$2"; shift 2;;
    --record-ok-links ) RECORD_OK_LINKS=1; shift;;
    --suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;;
    --take-screenshots ) TAKE_PAGE_SHOT=1; shift;;
    --start-url ) URL_START=$2; shift 2;;
    --end-url ) URL_LIMIT=$2; shift 2;;
    --upload ) UPLOAD_INFO=$2; shift 2;;
    * ) echo "Invalid argument $1 detected. Aborting."; exit 1;;
  esac
done

# If the required arguments were not supplied, print help page and quit
if [ -z $LINKS_URL ] || [ -z $OUTPUT_DIR ]; then
  printHelp
  echo "Error: I did not receive one or both required arguments."
  exit 2
fi

# Check that UPLOAD_INFO exists, if this argument was supplied
if [ ! -z $UPLOAD_INFO ] && [ ! -f "$UPLOAD_INFO" ]; then
  echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
  exit 3
fi

# Check that OUTPUT_DIR is a directory
if [ ! -d "$OUTPUT_DIR" ]; then
  echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
  exit 4
fi

# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
SHOT_PATH="$OUTPUT_PATH/Screenshots"
LOG_NAME="ValExtLinks report"
LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt"
LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf"
LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm"
mkdir "$OUTPUT_PATH"
if [ $TAKE_PAGE_SHOT -eq 1 ]; then
  mkdir "$SHOT_PATH"
fi

# Check that 'mkdir' succeeded
if [ ! -d "$OUTPUT_PATH" ]; then
209 echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_PATH. Aborting."
  exit 5
fi

# Get date on the file at LINKS_URL and print to log
LINKS_DATE=$(curl --silent --head $LINKS_URL | grep "Last-Modified")
if [ -z "$LINKS_DATE" ]; then
  echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
  exit 6
fi
LINKS_DATE=${LINKS_DATE#Last-Modified: }


### UTILITY FUNCTIONS ###
# Writes a plain-text header to TXT log file
function printTXTheader()
{
  valPrint t "Validate External Links report"
  valPrint t "generated $NICE_TIME"
  valPrint t "from data of $LINKS_DATE"
  valPrint t "script by Iritscen (contact: $MY_WIKI_PAGE)"
  valPrint t ""
}

# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified
function printRTFheader()
{
  valPrint r "{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww12600\viewh12100\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0

\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
generated $NICE_TIME\\
from data of $LINKS_DATE\\
script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$MY_WIKI_PAGE\"}}{\fldrslt contact}})
\\
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
\cf0 "
}

# Closes the RTF markup of the RTF log file
function printRTFfooter()
{
  valPrint r "}"
}

# Writes the HTML header to HTML log file
function printHTMheader()
{
  valPrint h "<html>
<head>
<title>Validate External Links report</title>
</head>
<body>
<h2>Validate External Links report</h2>
<h3>generated $NICE_TIME<br />
from data of $LINKS_DATE<br />
script by Iritscen (<a href=\"$MY_WIKI_PAGE\" target=\"_blank\">contact</a>)</h3>"
}

# Closes the HTML markup of the HTML log file
function printHTMfooter()
{
  valPrint h "</body>
</html>"
}

# The central logging function. The first parameter is a string composed of one or more characters that
# indicates which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
# 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 'w' means "Don't
# pass console output through 'fmt'" ("fmt" fits the output to an 80-column CLI but can break special
# formatting and the 'n' option).
function valPrint()
{
  if [[ "$1" == *c* ]]; then
    if [[ "$1" == *n* ]]; then
      echo -n "$2"
    elif [[ "$1" == *w* ]]; then
      echo "$2"
    else
      echo "$2" | fmt -w 80
    fi
  fi
  if [[ "$1" == *t* ]]; then
    if [[ "$1" == *n* ]]; then
      echo -n "$2" >> "$LOG_TXT"
    else
      echo "$2" >> "$LOG_TXT"
    fi
  fi
  if [[ "$1" == *r* ]]; then
    if [[ "$1" == *n* ]]; then
      echo "$2" >> "$LOG_RTF"
    else
      echo "$2\\" >> "$LOG_RTF"
    fi
  fi
  if [[ "$1" == *h* ]]; then
    if [[ "$1" == *n* ]]; then
      echo "$2" >> "$LOG_HTM"
    else
      echo "$2<br />" >> "$LOG_HTM"
    fi
  fi
}
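# A typical call looks like 'valPrint ctrh "Some message"', which writes the message to
# the console and to all three logs; 'valPrint hn "<table>"' writes only to the HTML log
# and suppresses the trailing "<br />" that the HTML output normally gets.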

# Pluralize the string in parameter 1 if the number in parameter 2 is not 1
function pluralCheckNoun()
{
  if [ $2 -ne 1 ]; then
    if [[ $1 =~ x$ ]]; then
      echo $1es
    else
      echo $1s
    fi
  else
    echo $1
  fi
}

# Output "is" if parameter 1 is 1, otherwise "are"
function pluralCheckIs()
{
  if [ $1 -ne 1 ]; then
    echo "are"
  else
    echo "is"
  fi
}

# Output "was" if parameter 1 is 1, otherwise "were"
function pluralCheckWas()
{
  if [ $1 -ne 1 ]; then
    echo "were"
  else
    echo "was"
  fi
}

# Output "a " if parameter 1 is 1, otherwise nothing
function pluralCheckA()
{
  if [ $1 -eq 1 ]; then
    echo "a "
  fi
}

# Output "an " if parameter 1 is 1, otherwise nothing
function pluralCheckAn()
{
  if [ $1 -eq 1 ]; then
    echo "an "
  fi
}

# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
# reports being saved to disk have already been closed.
function uploadReport()
{
  valPrint c "Uploading HTML report..."

  SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
  SFTP_USER_NAME_MARKER="user:"
  SFTP_PASSWORD_MARKER="pw:"
  SFTP_PORT_MARKER="port:"
  SFTP_PATH_MARKER="path:"
  SFTP_USER_NAME=$(grep $SFTP_USER_NAME_MARKER $UPLOAD_INFO)
  SFTP_USER_NAME=${SFTP_USER_NAME#$SFTP_USER_NAME_MARKER}
  SFTP_PASSWORD=$(grep $SFTP_PASSWORD_MARKER $UPLOAD_INFO)
  SFTP_PASSWORD=${SFTP_PASSWORD#$SFTP_PASSWORD_MARKER}
  SFTP_PORT=$(grep $SFTP_PORT_MARKER $UPLOAD_INFO)
  SFTP_PORT=${SFTP_PORT#$SFTP_PORT_MARKER}
  SFTP_PATH=$(grep $SFTP_PATH_MARKER $UPLOAD_INFO)
  SFTP_PATH=${SFTP_PATH#$SFTP_PATH_MARKER}
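
  # Based on the markers above, the --upload file is expected to hold one value per
  # line, e.g. (every value below is a placeholder, not a real account):
  #   user:myusername
  #   pw:mypassword
  #   port:22
  #   path:public_html/val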

  expect "$SCRIPT_PATH" "$LOG_HTM" $SFTP_USER_NAME $SFTP_PASSWORD $SFTP_PORT $SFTP_PATH "$LOG_NAME.htm"

  valPrint c "Report was uploaded, unless an error message appears above."
}

# Prints session summary when script is done
function wrapupAndExit()
{
  # Get off progress line on console, drop down a line from last link in log, and close HTML table
  valPrint ctr ""
  valPrint h "</table><br />"

  # If we didn't finish processing the last URL, then the iterator is one too high
  if [ $FINISHED_LIST != "yes" ]; then
    let LINK_NUM-=1
    if [ $FINISHED_LIST == "no" ]; then
      valPrint ctrh "The session was canceled by the user."
    fi
  fi

  # Output results of session and close the log file's markup
  LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
  LINKS_SKIPPED=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
  LINKS_CHECKED=$((LINKS_PROCESSED-LINKS_SKIPPED))
  valPrint ct "Summary:"
  valPrint r "\b1 Summary \b0"
  valPrint hn "<h3><span id=\"summary\">Summary</span></h3>"
  valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT)."
  valPrint ctrh "I skipped $LINKS_SKIPPED $(pluralCheckNoun link $LINKS_SKIPPED), and found $FILE_LINKS $(pluralCheckNoun file $FILE_LINKS) and $PAGE_LINKS $(pluralCheckNoun page $PAGE_LINKS)."
  if [ $LINKS_SKIPPED -gt 0 ]; then valPrint ctrh "Skip breakdown: "; fi
  if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
  if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE links on JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
  if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
  if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
  if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
  if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
  valPrint ctrh "Out of the $LINKS_CHECKED links checked, $IW_LINKS could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS), $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG."
  if [ $SKIP_EXCEPT -gt 0 ]; then
    valPrint ctrh "$SKIP_EXCEPT/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS) went unlisted due to being found in the exceptions file."
  fi
  printRTFfooter
  printHTMfooter

  # Upload report if this was requested
  if [ ! -z $UPLOAD_INFO ] && [ $FINISHED_LIST != "no" ]; then
    uploadReport
  fi

  # Really quit now
  valPrint c "ValExtLinks says goodbye."
  exit 0
}
trap wrapupAndExit INT


### INITIALIZATION ###
# Print opening message to console and log files
valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
printTXTheader
printRTFheader
printHTMheader

# Attempt to download file at LINKS_URL, then check that it succeeded
valPrint cwtrh "Downloading list of external links from $LINKS_URL."
LINKS_FILE_NAME=$(echo "$LINKS_URL" | sed 's/.*\///')
LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
curl --silent -o "$LINKS_FILE" $LINKS_URL
if [ ! -f "$LINKS_FILE" ]; then
  echo "The download of $LINKS_URL appears to have failed. Aborting."
  wrapupAndExit
fi

# Attempt to download file at EXCEPT_URL, then check that it succeeded
if [ ! -z $EXCEPT_URL ]; then
  valPrint cwtrh "Downloading list of NG exceptions from $EXCEPT_URL."
  EXCEPT_FILE_NAME=$(echo "$EXCEPT_URL" | sed 's/.*\///')
  EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
  curl --silent -o "$EXCEPT_FILE" $EXCEPT_URL
  if [ ! -f "$EXCEPT_FILE" ]; then
    echo "The download of $EXCEPT_URL appears to have failed. Aborting."
    wrapupAndExit
  fi
fi

# Pipe 'cat' to 'wc' because passing LINKS_FILE to 'wc' would print the file name after the line count
LINK_COUNT_STRING=$(cat "$LINKS_FILE" | wc -l)

# Number of URLs is number of lines minus one (first line is column header row for the CSV)
LINK_COUNT=$(echo "${LINK_COUNT_STRING}" | tr -d '[:space:]')
let LINK_COUNT-=1

# Calculate number of URLs to consider
if [ $URL_LIMIT -ne 0 ] && [ $URL_LIMIT -lt $LINK_COUNT ]; then
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((URL_LIMIT-URL_START+1)) of them, from link $URL_START to $URL_LIMIT."
elif [ $URL_START -ne 1 ]; then
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((LINK_COUNT-URL_START+1)) of them, from link $URL_START to $LINK_COUNT."
else
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering all of them."
fi

# Print settings to console and log
declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not print NG links that are listed in the exceptions file.")
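# Note: every whitespace-separated word in SETTINGS_MSG is its own array element, so the
# quoted phrases "and will" (index 10), "also" (22), "will" (26) and the final sentence
# (40) are the only elements the lines below need to swap out.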
if [ $TAKE_PAGE_SHOT -eq 0 ]; then SETTINGS_MSG[10]="but will not"; fi
if [ $RECORD_OK_LINKS -eq 0 ]; then SETTINGS_MSG[22]="not"; fi
if [ $SUGGEST_SNAPSHOTS -eq 0 ]; then SETTINGS_MSG[26]="will not"; fi
if [ -z $EXCEPT_URL ] || [ $RECORD_OK_LINKS -eq 1 ]; then SETTINGS_MSG[40]=""; fi
SETTINGS_STR=${SETTINGS_MSG[@]}
valPrint ctrh "$SETTINGS_STR"
valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
valPrint trh ""

# Print legend to logs
valPrint t "Legend:"
valPrint r "\b1 Legend \b0"
valPrint hn "<h3>Legend</h3>"
valPrint trh "OK = URL seems to be working."
valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it, because false negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to Iritscen. An NG link should be followed by a link to the Internet Archive's Wayback Machine which may help you repair the link. If the link cannot be repaired, you can disable it on the wiki (which prevents it from showing up in future ValExtLinks reports) by wrapping it in nowiki tags."
valPrint trh "RD = The server responding to this URL is saying that the page moved and you should instead use the supplied new URL. Some RD links represent minor adjustments in the organization of a web site, and some are soft 404s (the file/page has been removed and you are being redirected to something like the main page of the web site). You will have to look at the new URL yourself to determine if it represents an OK link and the link on the wiki should be updated to this one, or if the desired file/page is actually gone and we need to replace the wiki link with an Internet Archive snapshot link -- or disable the URL if it has not been archived."
507valPrint trh "IW = URL is working but should be converted to interwiki link using the suggested markup."
508valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $HTTP_CODES)."
509valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$HTTP_CODES\"}}{\fldrslt here}} for code reference)."
510valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$HTTP_CODES\" target=\"_blank\">here</a> for code reference)."
511valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $CURL_CODES)."
512valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$CURL_CODES\"}}{\fldrslt here}} for code reference)."
513valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$CURL_CODES\" target=\"_blank\">here</a> for code reference)."
514valPrint trh "IA suggests = Last available snapshot suggested by the Internet Archive."
515valPrint trh "Try browsing = The Archive occasionally fails to return a snapshot URL even when one exists, so you will need to check for a snapshot manually using the Wayback Machine before concluding that a site has not been archived."
516valPrint trh ""
517
518
519### MAIN LOOP ###
520# Process each line of the .csv in LINKS_FILE
521for LINE in `cat "$LINKS_FILE"`; do
522 let LINK_NUM+=1
523
524 # First line is the column header row for the CSV, so let's verify that the format hasn't changed
525 if [ $SKIPPED_HEADER_ROW -eq 0 ]; then
526 if [ $LINE == "namespace,title,target" ]; then
527 SKIPPED_HEADER_ROW=1
      LINK_NUM=0 # this line is not a link, so reset the link counter
529 valPrint hn "<table>"
530 continue
531 else
532 valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
533 wrapupAndExit
534 fi
535 fi
536
537 # Skip this link if we are not at URL_START yet
538 if [ $LINK_NUM -lt $URL_START ]; then
539 continue
540 fi
541
542 # Stop if we are at the limit declared for testing purposes
543 if [ $URL_LIMIT -gt 0 ] && [ $LINK_NUM -gt $URL_LIMIT ]; then
544 FINISHED_LIST="limit"
545 wrapupAndExit
546 fi
547
548 # Print progress to screen
549 if [ $LINK_NUM -gt 1 ]; then
550 printf "\e[1A\n" # erase previous progress message so that new one appears in its place
551 fi
552 valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
553
554 # The number of the namespace is the element before the first comma on the line
555 NS_ID=${LINE%%,*}
556
557 # Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
558 NS_NAME=""
559 a=0
560 while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
561 if [ $NS_ID -eq ${NS_IDS[$a]} ]; then
562 NS_NAME="${NS_NAMES[$a]}"
563 break
564 fi
565 let a+=1
566 done
567 if [ -z "$NS_NAME" ]; then
568 valPrint tr "Skipping URL found on page $PAGE_NAME because I could not find a name for namespace ID $NS_ID."
569 let SKIP_UNK_NS+=1
570 continue
571 fi
572
573 # The name of the page is everything between the namespace ID and the next comma on the line (commas
574 # in page names will break this)
575 PAGE_NAME=${LINE#$NS_ID,}
576 PAGE_NAME=${PAGE_NAME%%,*}
577
  # We don't want to consider wiki pages ending in .js, as the parser cannot reliably isolate URLs in
  # JavaScript code, so it will return erroneous links
  PAGE_NAME_SUFFIX=$(echo $PAGE_NAME | sed 's/.*\.//')
  if [ $PAGE_NAME_SUFFIX == "js" ]; then
    valPrint tr "Skipping URL found on JavaScript page $PAGE_NAME."
    let SKIP_JS_PAGE+=1
    continue
  fi

  # The URL being linked to is everything after the previous two fields (this allows commas to be in
  # the URLs, but a comma in the previous field, the page name, will break this)
  URL=${LINE#$NS_ID,$PAGE_NAME,}
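  # For example, a (hypothetical) CSV line "0,Some_Page,http://example.com/a,b" yields
  # NS_ID "0", PAGE_NAME "Some_Page" and URL "http://example.com/a,b" -- the trailing
  # comma survives because only the first two fields are stripped off the front.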

  # Scan for illegal characters
  if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
    valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because it contains characters illegal in a URL."
    let SKIP_BAD_URL+=1
    continue
  fi

  # Now we need to know if the URL is for a file or a web page. First step is to determine if the
  # URL ends in a suffix
  HAS_SUFFIX=0

  # If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
  SAN_URL=${URL%%\?*}

  # If the URL ends in something like "#section_15", strip everything from the '#' onward
  SAN_URL=${SAN_URL%%\#*}

  # 'sed' cannot handle Unicode in my Bash shell, so skip this URL and make user check it
  if [[ $SAN_URL == *[![:ascii:]]* ]]; then
    valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I cannot handle non-ASCII characters."
    let SKIP_NON_ASCII+=1
    continue
  fi

  # Isolate the characters after the last period and after the last slash
  POST_DOT=$(echo "$SAN_URL" | sed 's/.*\.//')
  POST_SLASH=$(echo "$SAN_URL" | sed 's/.*\///')

  # If the last period comes after the last slash, then the URL ends in a suffix
  POST_DOT_LENGTH=$(echo | awk -v input=$POST_DOT '{print length(input)}')
  POST_SLASH_LENGTH=$(echo | awk -v input=$POST_SLASH '{print length(input)}')
  if [ $POST_DOT_LENGTH -lt $POST_SLASH_LENGTH ]; then
    HAS_SUFFIX=1
  else
    HAS_SUFFIX=0
  fi
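  # Worked example (hypothetical URLs): for "http://example.com/files/readme.txt",
  # POST_DOT is "txt" (3 characters) and POST_SLASH is "readme.txt" (10), so HAS_SUFFIX
  # becomes 1; for "http://example.com/about.us/team", POST_DOT is "us/team" (7) and
  # POST_SLASH is "team" (4), so HAS_SUFFIX stays 0.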

  # Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
  # known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES
  IS_FILE=-1
  if [ $HAS_SUFFIX -eq 0 ]; then
    IS_FILE=0
  else
    # Turn off case sensitivity while we compare suffixes
    shopt -s nocasematch

    # Special case: URLs ending in something like "/productID.304297400" are pages, not files, so if
    # the URL's suffix is all numbers, we are looking at the end of a web page URL
    if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
      IS_FILE=0
    fi

    # If we did not identify this URL as a web page above, we need to compare the suffix against known
    # file extensions
    if [ $IS_FILE -eq -1 ]; then
      for EXTENSION in "${HTTP_FILES[@]}"; do
        if [[ $EXTENSION == $POST_DOT ]]; then # double brackets respect the nocasematch setting
          IS_FILE=1
          break
        fi
      done
    fi

    # If we did not identify this URL as a file above, we need to compare the suffix against known
    # pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
    # needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
    if [ $IS_FILE -eq -1 ]; then
      for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
        if [[ $PAGE_OR_TLD == $POST_DOT ]]; then
          IS_FILE=0
          break
        fi
      done
    fi

    # Turn case sensitivity back on in Bash
    shopt -u nocasematch
  fi

  # If this suffix escaped identification as either a file, page or TLD, inform the user
  STR_TYPE=""
  if [ $IS_FILE -eq -1 ]; then
    valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown URL ending $POST_DOT. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
    let SKIP_UNK_SUFFIX+=1
    continue
  elif [ $IS_FILE -eq 1 ]; then
    STR_TYPE="file"
    let FILE_LINKS+=1
  elif [ $IS_FILE -eq 0 ]; then
    STR_TYPE="page"
    let PAGE_LINKS+=1
  fi

  # Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
  # issue with sites that require HTTPS
  CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{http_code}\n' $URL)
  CURL_ERR=$(echo $?)
  CURL_RESULT=$CURL_CODE

  # Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
  if [ $CURL_CODE == "000" ]; then
    CURL_RESULT="$CURL_RESULT-$CURL_ERR"
  fi
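  # For example, a request that times out typically yields CURL_CODE "000" and a 'curl'
  # exit code of 28, so CURL_RESULT becomes "000-28" (see CURL_CODES for the meaning of
  # each exit code).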

  # Determine our status code for this URL (IW, OK, RD, or NG)
  STATUS="??"
  NEW_URL=""
  INTERWIKI_INDEX=-1
  # First check if this is a link to a domain that we have an interwiki prefix for
  for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
    if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]]; then
      STATUS="IW"
      let IW_LINKS+=1
      INTERWIKI_INDEX=$i
      break
    fi
  done

  # If we didn't match an interwiki domain, see if the status code is in our "OK" codes list
  if [ $STATUS == "??" ]; then
    for CODE in "${OK_CODES[@]}"; do
      if [[ $CODE == $CURL_CODE ]]; then
        STATUS="OK"
        let OK_LINKS+=1
        break
      fi
    done
  fi

  # If we didn't get a match with the "OK" codes, check it against the "RD" codes
  if [ $STATUS == "??" ]; then
    for CODE in "${RD_CODES[@]}"; do
      if [[ $CODE == $CURL_CODE ]]; then
        # Get URL header again in order to retrieve the URL we are being redirected to
        NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{redirect_url}\n' $URL)

        # Check if the redirect URL is just the original URL with https:// instead of http://
        # (this happens a lot and is not an important correction to us); if so, just make it "OK"
        URL_NO_PROTOCOL=${URL#*://}
        NEW_URL_NO_PROTOCOL=${NEW_URL#*://}
        if [ $URL_NO_PROTOCOL == $NEW_URL_NO_PROTOCOL ]; then
          STATUS="OK"
          let OK_LINKS+=1
        else
          STATUS="RD"
          let RD_LINKS+=1
        fi
        break
      fi
    done
  fi

  # If we didn't get a match with the "RD" codes, check it against the "NG" codes
  if [ $STATUS == "??" ]; then
    for CODE in "${NG_CODES[@]}"; do
      if [[ $CODE == $CURL_CODE ]]; then
        STATUS="NG"
        let NG_LINKS+=1
        break
      fi
    done
  fi

  # If we didn't match a known status code, advise the reader
  if [ $STATUS == "??" ]; then
    valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown return code $CURL_CODE."
    let SKIP_UNK_CODE+=1
    continue
  fi

  # If link is "NG" and there is an exceptions file, compare URL against the list before logging it
  if [ $STATUS == "NG" ] && [ ! -z $EXCEPT_URL ]; then
    GREP_RESULT=$(grep --max-count=1 "$URL" "$EXCEPT_FILE")
    EXCEPT_CODE=${GREP_RESULT%%,*}
    if [ "$EXCEPT_CODE" == $CURL_RESULT ]; then
      valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because its status code, $CURL_RESULT, is listed in the exceptions file."
      let SKIP_EXCEPT+=1
      continue
    fi
  fi

  # If appropriate, record this link to the log, with clickable URLs when possible
  if [ $STATUS != "OK" ] || [ $RECORD_OK_LINKS -eq 1 ]; then
    FULL_PAGE_PATH=http://$WIKI_PATH/$NS_NAME:$PAGE_NAME
    LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
    # Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it explicitly breaks the link
    if [ $NS_ID -eq 0 ]; then
      FULL_PAGE_PATH=http://$WIKI_PATH/$PAGE_NAME
      LOCAL_PAGE_PATH=$PAGE_NAME
    fi

    # Stupid hack since the text "IW" is narrower than "OK", "RD", or "NG" and it takes an extra tab
    # to get to the desired level of indentation in the RTF log
    RTF_TABS=" "
    if [ $STATUS == "IW" ]; then
      RTF_TABS=" "
    fi

    # Record link and its wiki page in TXT, RTF, and HTML markup
    valPrint t "$STATUS ($CURL_RESULT) $STR_TYPE $URL"
    valPrint t " linked from $FULL_PAGE_PATH"
    valPrint r "$STATUS ($CURL_RESULT)$RTF_TABS$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
    valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
    valPrint hn "<tr><td style=\"white-space:nowrap\">$STATUS ($CURL_RESULT)</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
    valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"

    # Record redirect URL if one was given by a 3xx response page
    if [ $STATUS == "RD" ]; then
      valPrint t " Server suggests $NEW_URL"
      valPrint r " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
      valPrint hn "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
    fi

    # Notify reader if we can use an interwiki prefix for this URL
    if [ $STATUS == "IW" ]; then
      valPrint t " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_SLASH]]"
      valPrint r " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_SLASH]]"
      valPrint hn "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_SLASH]]</td></tr>"
    fi

    # Query Internet Archive for latest "OK" snapshot for "NG" page
    if [ $STATUS == "NG" ] && [ $SUGGEST_SNAPSHOTS -eq 1 ]; then
      ARCHIVE_QUERY=$(curl --silent --max-time 10 "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")

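      # A successful reply from the availability API is JSON whose "closest" snapshot
      # carries a "url" and a "timestamp" field (shape assumed from the parsing below);
      # the parameter expansions that follow simply cut out whatever sits between
      # '"url": "' and '", "timestamp'.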
      # Isolate "url" property in response and log it if a "closest" snapshot was received...
      if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
        SNAPSHOT_URL=${ARCHIVE_QUERY##*\"url\": \"}
        SNAPSHOT_URL=${SNAPSHOT_URL%\", \"timestamp*}
        valPrint t " IA suggests $SNAPSHOT_URL"
        valPrint r " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
        valPrint hn "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
      else # ...otherwise give generic Wayback Machine link for this URL
        valPrint t " Try browsing $ARCHIVE_GENERIC/$URL"
        valPrint r " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
        valPrint hn "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
      fi
    fi
  fi

  # If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
  if [ $IS_FILE -eq 0 ] && [ $TAKE_PAGE_SHOT -eq 1 ] && [ $STATUS == "OK" ]; then
    # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
    SHOT_NAME=$(echo "$URL" | sed 's/https*\:\/\///' | sed 'y/:\//__/')
    SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"
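    # For example, the (hypothetical) URL "http://example.com/docs/page" becomes
    # "example.com_docs_page.png": the protocol is stripped and the remaining ':' and '/'
    # characters are transliterated to underscores by the 'sed y///' command above.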

    # Don't take screenshot if we already encountered this page and screenshotted it
    if [ ! -f "$SHOT_FILE" ]; then
      "$CHROME" --headless --disable-gpu --screenshot --window-size=1500,900 $URL > /dev/null 2>&1
      if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
        mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
      else
        valPrint trh "Screenshot of URL $URL seems to have failed!"
      fi
    else
      valPrint trh "Skipping screenshot of URL $URL because $SHOT_FILE already exists."
    fi
  fi
done
FINISHED_LIST="yes"
wrapupAndExit