source: Validate External Links/validate_external_links.sh@ 1122

Last change on this file since 1122 was 1122, checked in by iritscen, 5 years ago

Val now links to wiki pages using HTTPS instead of HTTP. Fixed code that exempts minor forms of redirects from being listed. New arguments --show-added-slashes and --show-https-upgrade allow one to turn off these exemptions. Reworked summary section extensively to be more readable.

File size: 44.8 KB
RevLine 
#!/bin/bash

# Validate External Links by Iritscen
# Provided with a list of external links found in the OniGalore wiki, this script validates them.
# The resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF
# (for reading as a local file with clickable links), and HTML (for uploading as a web page).
# Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
# Recommended rule:
# |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|

# Set separator token to newline. NOTE: this applies to the whole script, so unquoted expansions
# are split on newlines only, never on spaces or tabs.
IFS=$'\n'

### GLOBALS ###
# Settings -- these will be changed from their defaults by the arguments passed in to the script
LINKS_URL="" # use 'curl' to download file with links from this location (can be file://)
EXCEPT_URL="" # ditto above for file with exceptions to NG results
OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES
SHOW_SLASH=0 # record response code to the log when a slash is added to the end of a URL
SHOW_HTTPS=0 # record response code to the log when "http" is upgraded to "https"
SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
TAKE_PAGE_SHOT=0 # take a screenshot of each OK page
CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature
URL_START=1 # start at this URL in LINKS_FILE (1 by default)
URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE
UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report

# Fixed strings -- see the occurrences of these variables to learn their purpose
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36 OPR/67.0.3575.53"
ARCHIVE_API="http://archive.org/wayback/available"
ARCHIVE_GENERIC="https://web.archive.org/web/*"
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
CHROME_SCREENSHOT="screenshot.png"
CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt"
EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt"
MY_WIKI_PAGE="https://wiki.oni2.net/User:Iritscen"
# Directory holding this script; quoted so that glob characters in the path are taken literally,
# and 'pwd' only runs if the 'cd' succeeded
THIS_DIR=$(cd "$(dirname "$0")" && pwd)
WORKING_DIR=$(pwd)
WIKI_PATH="wiki.oni2.net"

# These are parallel arrays of the IDs and names of OniGalore's current namespaces
declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")

# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
declare -a HTTP_FILES=(txt zip wmv jpg png m4a bsl rar oni mp3 mov ONWC vbs TRMA mp4 doc avi log gif pdf dmg exe cpp tga 7z wav east BINA xml dll dae xaf fbx 3ds blend flv csv)
declare -a HTTP_TLDS_AND_PAGES=(com net org uk ru de it htm html php pl asp aspx shtml pgi cgi php3 x jsp phtml cfm css action stm js)

# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
# if you add a new code.
declare -a OK_CODES=(200 401 405 406 501)
declare -a RD_CODES=(301 302 303 307 308)
declare -a NG_CODES=(000 403 404 410 500 503)

# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
# transcluded text, and if the transclusion fails, then the braces show up in the URL
ILLEGAL_CHARS="{ }"

# The shortest URL possible, used for sanity-checking some URLs: http://a.co
MIN_URL_LENGTH=11

# These are parallel arrays giving the prefixes that can be used in place of normal external links to
# some wikis and other sites
declare -a INTERWIKI_PREFIXES=(commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies wikiversity wikivoyage wikt wp)
declare -a INTERWIKI_DOMAINS=(commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org wikiversity.org wikivoyage.org wiktionary.org wikipedia.org)

# Variables for keeping track of main loop progress and findings
LINK_NUM=0
EI_LINKS=0
IW_LINKS=0
OK_LINKS=0
RD_LINKS=0
NG_LINKS=0
SKIP_UNK_NS=0
SKIP_JS_PAGE=0
SKIP_BAD_URL=0
SKIP_NON_ASCII=0
SKIP_UNK_SUFFIX=0
SKIP_UNK_CODE=0
SKIP_EXPECT_NG=0
SKIP_EXPECT_EI=0
SKIP_EXPECT_IW=0
SKIP_HTTPS_UP=0
SKIP_SLASH_ADD=0
FILE_LINKS=0
PAGE_LINKS=0
SKIPPED_HEADER_ROW=0
FINISHED_LIST="no"
START_RUN=0
END_RUN=0


### HELP ###
# A pseudo-man page. Here is the 80-character rule for the page text:
# 234567890123456789012345678901234567890123456789012345678901234567890123456789
# Prints the help text to stdout. The heredoc delimiter is quoted so that nothing in the page text
# is ever subject to shell expansion.
function printHelp()
{
  cat << 'EOF'

NAME
       Validate External Links

SYNOPSIS
       validate_external_links.sh --help
       validate_external_links.sh --links URL --output DIR [--exceptions URL]
          [--record-ok-links] [--show-added-slashes] [--show-https-upgrade]
          [--suggest-snapshots] [--take-screenshots FILE] [--start-url NUM]
          [--end-url NUM] [--upload FILE]

DESCRIPTION
       This script parses a list of external links found in the OniGalore wiki
       (which is dumped by the Oni2.net domain periodically in a particular
       format), validates them using the Unix tool 'curl', and produces a report
       of which links were "OK" (responded positively to an HTTP query), which
       were "RD" (responded with a 3xx redirect code), which could be "IW"
       (interwiki) links, which are "EI" (external internal) links and could be
       intrawiki links, and which were "NG" (no good; a negative response to the
       query). This report can then be automatically uploaded to the location of
       your choice. The script can also suggest Internet Archive snapshots for
       "NG" links, and take screenshots of "OK" links for visual verification by
       the reader that the page in question is the one intended to be displayed.

       You must pass this script the URL at which the list of links is found
       (--links) and the path where the directory of logs should be outputted
       (--output). All other arguments are optional.

OPTIONS
       --help                  Show this page.
       --links URL             (required) URL from which to download the CSV
                               file with external links. Note that this URL can
                               be a local file if you supply a file:// path.
       --output DIR            (required) Unix path to directory in which Val
                               should place its reports.
       --exceptions URL        In order to remove links from the report which
                               Val finds an issue with, but which you regard as
                               OK, list those desired exceptions in this file.
                               See the sample file exceptions.txt for details.
                               Note that this URL can point to a local file if
                               you supply a file:// path.
       --record-ok-links       Log a link in the report even if its response
                               code is "OK".
       --show-added-slashes    Report on redirects that simply add a '/' to the
                               end of the URL.
       --show-https-upgrade    Report on redirects that simply upgrade a
                               "http://" URL to a "https://" URL.
       --suggest-snapshots     Query the Internet Archive for a possible
                               snapshot URL for each "NG" page.
       --take-screenshots FILE Call the Google Chrome binary at this path to
                               take screenshots of each "OK" page.
       --start-url NUM         Start at this link in the links CSV file.
       --end-url NUM           Stop at this link in the links CSV file.
       --upload FILE           Upload report using the credentials and path
                               given in this local text file. See sftp_login.txt
                               for template.

BUGS
       The script cannot properly parse any line in the external links file
       which contains a comma in the name of the wiki page containing a link.
       Commas in the link itself are not an issue.
EOF
}


### SETUP ###
# If first argument is a help request, or if nothing was passed in at all, print help page and quit
if [ "$#" -eq 0 ] || [ "$1" == "--help" ]; then
  printHelp | less
  exit 0
fi

# Parse arguments as long as there are more arguments to process
while (( "$#" )); do
  # An option that takes a value must actually be followed by one. Without this check, "shift 2"
  # fails when the option is the last argument, nothing is consumed, and this loop never ends.
  case "$1" in
    --links | --exceptions | --output | --take-screenshots | --start-url | --end-url | --upload )
      if [ "$#" -lt 2 ]; then
        echo "Error: Argument $1 requires a value. Aborting." >&2
        exit 1
      fi;;
  esac

  case "$1" in
    --links )              LINKS_URL="$2";                      shift 2;;
    --exceptions )         EXCEPT_URL="$2";                     shift 2;;
    --output )             OUTPUT_DIR="$2";                     shift 2;;
    --record-ok-links )    RECORD_OK_LINKS=1;                   shift;;
    --show-added-slashes ) SHOW_SLASH=1;                        shift;;
    --show-https-upgrade ) SHOW_HTTPS=1;                        shift;;
    --suggest-snapshots )  SUGGEST_SNAPSHOTS=1;                 shift;;
    --take-screenshots )   TAKE_PAGE_SHOT=1; CHROME_PATH="$2";  shift 2;;
    --start-url )          URL_START="$2";                      shift 2;;
    --end-url )            URL_LIMIT="$2";                      shift 2;;
    --upload )             UPLOAD_INFO="$2";                    shift 2;;
    * )                    echo "Invalid argument $1 detected. Aborting." >&2; exit 1;;
  esac
done

# If the required arguments were not supplied, print error and quit
if [ -z "$LINKS_URL" ] || [ -z "$OUTPUT_DIR" ]; then
  echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation." >&2
  exit 2
fi

# --start-url and --end-url are used in integer comparisons later on, so reject anything else now
if [[ ! "$URL_START" =~ ^[0-9]+$ ]] || [[ ! "$URL_LIMIT" =~ ^[0-9]+$ ]]; then
  echo "Error: The --start-url and --end-url arguments must be whole numbers. Aborting." >&2
  exit 1
fi

# If user wants screenshots, make sure path to Chrome was passed in and is valid
if [ "$TAKE_PAGE_SHOT" -eq 1 ]; then
  if [ ! -f "$CHROME_PATH" ]; then
    echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots." >&2
    exit 3
  fi
fi

# Check that UPLOAD_INFO exists, if this argument was supplied
if [ -n "$UPLOAD_INFO" ] && [ ! -f "$UPLOAD_INFO" ]; then
  echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting." >&2
  exit 4
fi

# Check that OUTPUT_DIR is a directory
if [ ! -d "$OUTPUT_DIR" ]; then
  echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting." >&2
  exit 5
fi

# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
SHOT_PATH="$OUTPUT_PATH/Screenshots"
LOG_NAME="ValExtLinks report"
LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt"
LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf"
LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm"
mkdir "$OUTPUT_PATH"

# Check that 'mkdir' succeeded before creating anything inside the new folder
if [ ! -d "$OUTPUT_PATH" ]; then
  echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_DIR. Aborting." >&2
  exit 6
fi
if [ "$TAKE_PAGE_SHOT" -eq 1 ]; then
  mkdir "$SHOT_PATH"
fi

# Get date on the file at LINKS_URL and print to log. Header names are matched case-insensitively
# because HTTP/2 servers send them in lowercase, and the carriage return that ends each HTTP header
# line is removed so it does not end up in the reports.
LINKS_DATE=$(curl --silent --head "$LINKS_URL" | grep -i "^Last-Modified:" | tr -d '\r')
if [ -z "$LINKS_DATE" ]; then
  echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting." >&2
  exit 7
fi
LINKS_DATE=${LINKS_DATE#*: } # drop the header name, however it was capitalized


### UTILITY FUNCTIONS ###
# Writes a plain-text header to TXT log file: the report title, the time the report was generated
# (NICE_TIME), the date of the links data (LINKS_DATE), a contact line (MY_WIKI_PAGE), and then a
# blank line
function printTXTheader()
{
  valPrint t "Validate External Links report"
  valPrint t "generated $NICE_TIME"
  valPrint t "from data of $LINKS_DATE"
  valPrint t "script by Iritscen (contact: $MY_WIKI_PAGE)"
  valPrint t ""
}

# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified.
# The whole header is one multi-line string, so its lines must stay at column 0. Inside the double
# quotes, "\\" produces a single backslash and \" a literal quote; every other backslash is
# passed through to the RTF unchanged.
function printRTFheader()
{
  valPrint r "{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww12600\viewh12100\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0

\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
generated $NICE_TIME\\
from data of $LINKS_DATE\\
script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$MY_WIKI_PAGE\"}}{\fldrslt contact}})
\\
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
\cf0 "
}

# Closes the RTF markup of the RTF log file by writing the brace that balances the one opened at
# the very start of the RTF header
function printRTFfooter()
{
  valPrint r "}"
}

# Writes the HTML header to HTML log file: the opening markup, the page title, and a heading that
# repeats the information of the TXT header (generation time, date of the links data, contact
# link). The header is one multi-line string, so its lines must stay at column 0.
function printHTMheader()
{
  valPrint h "<html>
<head>
<title>Validate External Links report</title>
</head>
<body>
<h2>Validate External Links report</h2>
<h3>generated $NICE_TIME<br />
from data of $LINKS_DATE<br />
script by Iritscen (<a href=\"$MY_WIKI_PAGE\" target=\"_blank\">contact</a>)</h3>"
}

# Closes the HTML markup of the HTML log file by writing the closing body and html tags
function printHTMfooter()
{
  valPrint h "</body>
</html>"
}

# The central logging function. The first parameter is a string composed of one or more characters that
# indicate which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
# 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 's' means "Print
# an extra newline at the end." 'w' means "Don't pass console output through 'fmt'" ("fmt" fits the
# output to an 80-column CLI but can break special formatting and the 'n' option).
# The second parameter is the message. It is always written with printf '%s' so that it comes out
# verbatim: backslash sequences in it (think of URLs) are not interpreted, and a message that
# looks like an option to 'echo' is still printed.
# In the RTF and HTML logs, 'n' only suppresses the format's line-break markup (\line or <br />);
# the line in the file itself is still terminated.
function valPrint()
{
  local targets="$1"
  local msg="$2"

  # Console
  if [[ "$targets" == *c* ]]; then
    if [[ "$targets" == *n* ]]; then
      printf '%s' "$msg"
    elif [[ "$targets" == *w* ]]; then
      printf '%s\n' "$msg"
    elif [[ "$targets" == *s* ]]; then
      printf '%s\n\n' "$msg"
    else
      printf '%s\n' "$msg" | fmt -w 80
    fi
  fi

  # TXT log
  if [[ "$targets" == *t* ]]; then
    if [[ "$targets" == *n* ]]; then
      printf '%s' "$msg" >> "$LOG_TXT"
    elif [[ "$targets" == *s* ]]; then
      printf '%s\n\n' "$msg" >> "$LOG_TXT"
    else
      printf '%s\n' "$msg" >> "$LOG_TXT"
    fi
  fi

  # RTF log
  if [[ "$targets" == *r* ]]; then
    if [[ "$targets" == *n* ]]; then
      printf '%s\n' "$msg" >> "$LOG_RTF"
    elif [[ "$targets" == *s* ]]; then
      printf '%s\\line\\line\n' "$msg" >> "$LOG_RTF"
    else
      printf '%s\\line\n' "$msg" >> "$LOG_RTF"
    fi
  fi

  # HTML log (here 's' takes precedence over 'n'; it emits an empty table row as the spacer)
  if [[ "$targets" == *h* ]]; then
    if [[ "$targets" == *s* ]]; then
      printf '%s<tr><td>&nbsp;</td></tr>\n' "$msg" >> "$LOG_HTM"
    elif [[ "$targets" == *n* ]]; then
      printf '%s\n' "$msg" >> "$LOG_HTM"
    else
      printf '%s<br />\n' "$msg" >> "$LOG_HTM"
    fi
  fi
}

# Pluralize the string in parameter 1 if the number in parameter 2 is not 1. Nouns ending in 'x'
# get "es" appended, all others get "s". Pass the singular form of the noun. A missing count is
# treated as 0, which yields the plural.
function pluralCheckNoun()
{
  local noun="$1"
  local count="${2:-0}"

  if [ "$count" -eq 1 ]; then
    printf '%s\n' "$noun"
  elif [[ "$noun" =~ x$ ]]; then
    printf '%ses\n' "$noun"
  else
    printf '%ss\n' "$noun"
  fi
}

# Output "is" if parameter 1 is 1, otherwise "are". A missing parameter is treated as 0.
function pluralCheckIs()
{
  if [ "${1:-0}" -eq 1 ]; then
    echo "is"
  else
    echo "are"
  fi
}

# Output "was" if parameter 1 is 1, otherwise "were". A missing parameter is treated as 0.
function pluralCheckWas()
{
  if [ "${1:-0}" -eq 1 ]; then
    echo "was"
  else
    echo "were"
  fi
}

# Output "a " (with the trailing space) if parameter 1 is 1, otherwise nothing. A missing
# parameter is treated as 0.
function pluralCheckA()
{
  if [ "${1:-0}" -eq 1 ]; then
    echo "a "
  fi
}

# Output "an " (with the trailing space) if parameter 1 is 1, otherwise nothing. A missing
# parameter is treated as 0.
function pluralCheckAn()
{
  if [ "${1:-0}" -eq 1 ]; then
    echo "an "
  fi
}

# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
# reports being saved to disk have already been closed.
# The file at UPLOAD_INFO is expected to hold one "marker:value" pair per line, with the markers
# "user:", "pw:", "port:" and "path:" at the start of their lines. Only the first line carrying
# each marker is used. The values are handed to the 'expect' script named by EXPECT_SCRIPT_NAME,
# which is looked up next to this script.
function uploadReport()
{
  valPrint c "Uploading HTML report..."

  local script_path="$THIS_DIR/$EXPECT_SCRIPT_NAME"
  local marker line
  local -a fields=()

  # Collect the four values in the order in which the 'expect' script receives them
  for marker in "user:" "pw:" "port:" "path:"; do
    line=$(grep -m 1 -- "^$marker" "$UPLOAD_INFO")
    fields+=("${line#"$marker"}")
  done

  # NOTE(review): the password is passed as a command-line argument, so it is visible to other
  # users in the process list while 'expect' runs. The interface of the 'expect' script is not
  # visible from here, so this is flagged rather than changed.
  # Every value is quoted so that spaces or glob characters in it reach 'expect' intact, and an
  # empty value still occupies its position in the argument list.
  expect "$script_path" "$LOG_HTM" "${fields[0]}" "${fields[1]}" "${fields[2]}" "${fields[3]}" "$LOG_NAME.htm"

  valPrint c "Report was uploaded, unless an error message appears above."
}

# Prints session summary when script is done, closes the markup of the log files, uploads the HTML
# report if that was requested and the session was not cut short, and exits with status 0. This
# function is also installed as the handler for the INT signal (see the 'trap' below it), so it
# runs when the user interrupts the script.
# Reads the counters kept by the main loop (LINK_NUM, the *_LINKS and SKIP_* variables), URL_START,
# LINK_COUNT, START_RUN, FINISHED_LIST and UPLOAD_INFO.
function wrapupAndExit()
{
  # Get off progress line on console, drop down a line from last link in log, and close HTML table
  valPrint ctr ""
  valPrint h "</table><br />"

  # If we didn't finish processing the last URL, then the iterator is one too high
  if [ "$FINISHED_LIST" != "yes" ]; then
    LINK_NUM=$((LINK_NUM-1))
    if [ "$FINISHED_LIST" == "no" ]; then
      valPrint ctrh "The session was canceled by the user."
    fi
  fi

  # Generate string with elapsed time. START_RUN is still 0 when we get here before the main loop
  # has started (e.g. after a failed download); nothing has been timed in that case, so report zero
  # instead of the number of seconds since 1970.
  END_RUN=$(date +%s)
  local elapsed_secs=0
  if (( START_RUN > 0 )); then
    elapsed_secs=$((END_RUN-START_RUN))
  fi
  ELAPSED="$((elapsed_secs/60)) min. $((elapsed_secs%60)) sec. elapsed"

  # Do some math on results of session
  local link_count="${LINK_COUNT:-0}" # LINK_COUNT is not set yet if the links file never arrived
  LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
  if (( LINKS_PROCESSED < 0 )); then
    LINKS_PROCESSED=0 # we stopped before reaching the first link we were asked to process
  fi
  LINK_PROBLEMS=$((EI_LINKS+IW_LINKS+RD_LINKS+NG_LINKS))
  LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
  LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
  TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP))
  LINKS_CHECKED=$((LINKS_PROCESSED-LINK_ERRORS))

  # Print summary header
  valPrint ct "Summary ($ELAPSED):"
  valPrint r "\b1 Summary \b0 ($ELAPSED)"
  valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
  valPrint ctrh "I finished processing $LINKS_PROCESSED of $link_count $(pluralCheckNoun link "$link_count") (there were $FILE_LINKS file $(pluralCheckNoun link "$FILE_LINKS") and $PAGE_LINKS page $(pluralCheckNoun link "$PAGE_LINKS"))."

  # Print processed link totals
  if (( LINKS_PROCESSED > 0 )); then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link "$LINKS_PROCESSED"):"; fi
  if (( LINK_ERRORS > 0 )); then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link "$LINK_ERRORS") could not be processed"; fi
  if (( LINK_PROBLEMS > 0 )); then valPrint ctrh "- $LINK_PROBLEMS processed $(pluralCheckNoun link "$LINK_PROBLEMS") had issues"; fi
  if (( LINKS_EXCEPTED > 0 )); then valPrint ctrh "  (excepted $LINKS_EXCEPTED $(pluralCheckNoun link "$LINKS_EXCEPTED") from report)"; fi
  if (( OK_LINKS > 0 )); then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link "$OK_LINKS") were OK"; fi
  if (( TRIVIAL_RDS > 0 )); then valPrint ctrh "  (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection "$TRIVIAL_RDS") as OK)"; fi

  # Print excepted link totals
  if (( LINKS_EXCEPTED > 0 )); then valPrint ctrh "$LINKS_EXCEPTED $(pluralCheckNoun link "$LINKS_EXCEPTED") excepted (see RTF or TXT report for specific links):"; fi
  if (( SKIP_EXPECT_NG > 0 )); then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link "$NG_LINKS")"; fi
  if (( SKIP_EXPECT_EI > 0 )); then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link "$EI_LINKS")"; fi
  # IW links are the interwiki ones (the intrawiki ones are EI, on the line above)
  if (( SKIP_EXPECT_IW > 0 )); then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential interwiki $(pluralCheckNoun link "$IW_LINKS")"; fi

  # Print errored link totals
  if (( LINK_ERRORS > 0 )); then valPrint ctrh "$LINK_ERRORS link $(pluralCheckNoun error "$LINK_ERRORS") (see RTF or TXT report for specific links):"; fi
  if (( SKIP_UNK_NS > 0 )); then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace "$SKIP_UNK_NS")"; fi
  if (( SKIP_JS_PAGE > 0 )); then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link "$SKIP_JS_PAGE") on $(pluralCheckA "$SKIP_JS_PAGE")JavaScript $(pluralCheckNoun page "$SKIP_JS_PAGE")"; fi
  if (( SKIP_BAD_URL > 0 )); then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL "$SKIP_BAD_URL")"; fi
  if (( SKIP_NON_ASCII > 0 )); then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL "$SKIP_NON_ASCII")"; fi
  if (( SKIP_UNK_SUFFIX > 0 )); then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix "$SKIP_UNK_SUFFIX")"; fi
  if (( SKIP_UNK_CODE > 0 )); then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code "$SKIP_UNK_CODE")"; fi

  # Print checked link totals. The noun is given in the singular and is pluralized according to
  # the same number that is printed in front of it.
  if (( LINK_PROBLEMS > 0 )); then valPrint ctrh "$LINK_PROBLEMS link $(pluralCheckNoun issue "$LINK_PROBLEMS"):"; fi
  if (( NG_LINKS > 0 )); then valPrint ctrh "- $NG_LINKS NG $(pluralCheckNoun link "$NG_LINKS")"; fi
  if (( RD_LINKS > 0 )); then valPrint ctrh "- $RD_LINKS $(pluralCheckNoun redirection "$RD_LINKS")"; fi
  if (( EI_LINKS > 0 )); then valPrint ctrh "- $EI_LINKS $(pluralCheckNoun link "$EI_LINKS") that could be intrawiki"; fi
  if (( IW_LINKS > 0 )); then valPrint ctrh "- $IW_LINKS $(pluralCheckNoun link "$IW_LINKS") that could be interwiki"; fi

  # Close the log files' markup
  valPrint trh "ValExtLinks says goodbye."
  printRTFfooter
  printHTMfooter

  # Upload report if this was requested
  if [ -n "$UPLOAD_INFO" ] && [ "$FINISHED_LIST" != "no" ]; then
    uploadReport
  fi

  # Really quit now
  valPrint c "ValExtLinks says goodbye."
  exit 0
}
trap wrapupAndExit INT


### INITIALIZATION ###
# Print opening message to console and log files
valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
printTXTheader
printRTFheader
printHTMheader

# Attempt to download file at LINKS_URL, then check that it succeeded. With "--fail", an HTTP error
# makes 'curl' return a failure status instead of saving the server's error page as the links file.
valPrint t "Config:"
valPrint r "\b1 Config \b0"
valPrint hn "<h3>Config</h3>"
valPrint cwtrh "Downloading list of external links from $LINKS_URL."
LINKS_FILE_NAME=${LINKS_URL##*/} # everything after the last slash
LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
if ! curl --silent --fail -o "$LINKS_FILE" "$LINKS_URL" || [ ! -f "$LINKS_FILE" ]; then
  echo "The download of $LINKS_URL appears to have failed. Aborting."
  wrapupAndExit
fi

# Attempt to download file at EXCEPT_URL, then check that it succeeded
if [ -n "$EXCEPT_URL" ]; then
  valPrint cwtrh "Downloading list of reporting exceptions from $EXCEPT_URL."
  EXCEPT_FILE_NAME=${EXCEPT_URL##*/}
  EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
  if ! curl --silent --fail -o "$EXCEPT_FILE" "$EXCEPT_URL" || [ ! -f "$EXCEPT_FILE" ]; then
    echo "The download of $EXCEPT_URL appears to have failed. Aborting."
    wrapupAndExit
  fi
fi

# Feed LINKS_FILE to 'wc' through a redirect because passing it as an argument would print the file
# name after the line count
LINK_COUNT_STRING=$(wc -l < "$LINKS_FILE")

# Number of URLs is number of lines minus one (first line is column header row for the CSV). The
# arithmetic expansion also takes care of the padding that some versions of 'wc' put around the
# number.
LINK_COUNT=$((LINK_COUNT_STRING-1))

# Calculate number of URLs to consider
if [ "$URL_LIMIT" -ne 0 ] && [ "$URL_LIMIT" -lt "$LINK_COUNT" ]; then
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((URL_LIMIT-URL_START+1)) of them, from link $URL_START to $URL_LIMIT."
elif [ "$URL_START" -ne 1 ]; then
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((LINK_COUNT-URL_START+1)) of them, from link $URL_START to $LINK_COUNT."
else
  valPrint ctrh "Found $LINK_COUNT links to process. I will be considering all of them."
fi

# Print settings to console and log. The message is an array with one element per word; the quoted
# elements (indices 10, 22, 26 and 40) are the ones swapped out below according to the settings.
declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not report links that are listed in the exceptions file.")
if [ "$TAKE_PAGE_SHOT" -eq 0 ]; then SETTINGS_MSG[10]="but will not"; fi
if [ "$RECORD_OK_LINKS" -eq 0 ]; then SETTINGS_MSG[22]="not"; fi
if [ "$SUGGEST_SNAPSHOTS" -eq 0 ]; then SETTINGS_MSG[26]="will not"; fi
if [ -z "$EXCEPT_URL" ] || [ "$RECORD_OK_LINKS" -eq 1 ]; then SETTINGS_MSG[40]=""; fi
# Deliberately ${...[@]} and not "${...[*]}": the latter would join the words with the first
# character of IFS, which this script sets to a newline
SETTINGS_STR=${SETTINGS_MSG[@]}
valPrint ctrh "$SETTINGS_STR"
valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
valPrint trh ""

# Print legend to logs
valPrint t "Legend:"
valPrint r "\b1 Legend \b0"
valPrint hn "<h3>Legend</h3>"
valPrint trh "OK = URL seems to be working."
valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it, because false negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to Iritscen. An NG link should be followed by a link to the Internet Archive's Wayback Machine which may help you repair the link. If the link cannot be repaired, you can disable it on the wiki (which prevents it from showing up in future ValExtLinks reports) by wrapping it in nowiki tags."
valPrint trh "RD = The server responding to this URL is saying that the page moved and you should instead use the supplied new URL. Some RD links represent minor adjustments in the organization of a web site, and some are soft 404s (the file/page has been removed and you are being redirected to something like the main page of the web site). You will have to look at the new URL yourself to determine if it represents an OK link and the link on the wiki should be updated to this one, or if the desired file/page is actually gone and we need to replace the wiki link with an Internet Archive snapshot link -- or disable the URL if it has not been archived."
valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup."
valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup."
valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $HTTP_CODES)."
valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$HTTP_CODES\"}}{\fldrslt here}} for code reference)."
valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$HTTP_CODES\" target=\"_blank\">here</a> for code reference)."
valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $CURL_CODES)."
valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$CURL_CODES\"}}{\fldrslt here}} for code reference)."
valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$CURL_CODES\" target=\"_blank\">here</a> for code reference)."
valPrint trh "IA suggests = Last available snapshot suggested by the Internet Archive."
valPrint trh "Try browsing = The Archive occasionally fails to return a snapshot URL even when one exists, so you will need to check for a snapshot manually using this link to the Wayback Machine before concluding that a site has not been archived."
valPrint trh ""


### MAIN LOOP ###
valPrint t "Links:"
valPrint r "\b1 Links \b0"
valPrint hn "<h3>Links</h3>"
START_RUN=$(date +%s) # the summary reports the time elapsed since this moment
[1064]592# Process each line of the .csv in LINKS_FILE
593for LINE in `cat "$LINKS_FILE"`; do
594 let LINK_NUM+=1
595
596 # First line is the column header row for the CSV, so let's verify that the format hasn't changed
597 if [ $SKIPPED_HEADER_ROW -eq 0 ]; then
598 if [ $LINE == "namespace,title,target" ]; then
599 SKIPPED_HEADER_ROW=1
600 LINK_NUM=0 # this line is it's not a link, so reset the link counter
601 valPrint hn "<table>"
602 continue
603 else
604 valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
605 wrapupAndExit
606 fi
607 fi
608
609 # Skip this link if we are not at URL_START yet
610 if [ $LINK_NUM -lt $URL_START ]; then
611 continue
612 fi
613
614 # Stop if we are at the limit declared for testing purposes
615 if [ $URL_LIMIT -gt 0 ] && [ $LINK_NUM -gt $URL_LIMIT ]; then
616 FINISHED_LIST="limit"
617 wrapupAndExit
618 fi
619
620 # Print progress to screen
621 if [ $LINK_NUM -gt 1 ]; then
622 printf "\e[1A\n" # erase previous progress message so that new one appears in its place
623 fi
624 valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
625
626 # The number of the namespace is the element before the first comma on the line
627 NS_ID=${LINE%%,*}
628
629 # Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
630 NS_NAME=""
631 a=0
[1069]632 while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
[1118]633 if [ $NS_ID == "NULL" ]; then
634 break
635 elif [ $NS_ID -eq ${NS_IDS[$a]} ]; then
[1064]636 NS_NAME="${NS_NAMES[$a]}"
637 break
638 fi
639 let a+=1
640 done
[1118]641 if [ "$NS_NAME" == "" ]; then
642 if [ $NS_ID == "NULL" ]; then
[1119]643 valPrint trs "Skipping URL on line $LINK_NUM because the namespace (and probably the page too) is \"NULL\". Probably the link is no longer in existence on the wiki."
[1118]644 else
[1119]645 valPrint trs "Skipping URL on line $LINK_NUM found on page $PAGE_NAME because I could not find a name for namespace ID $NS_ID."
[1118]646 fi
[1064]647 let SKIP_UNK_NS+=1
648 continue
649 fi
650
651 # The name of the page is everything between the namespace ID and the next comma on the line (commas
652 # in page names will break this)
653 PAGE_NAME=${LINE#$NS_ID,}
654 PAGE_NAME=${PAGE_NAME%%,*}
655
656 # We don't want to consider wiki pages ending in .js, as the parser cannot reliably isolate URLS in
657 # JavaScript code, so it will return erroneous links
658 PAGE_NAME_SUFFIX=$(echo $PAGE_NAME | sed 's/.*\.//')
659 if [ $PAGE_NAME_SUFFIX == "js" ]; then
[1119]660 valPrint trs "Skipping URL on line $LINK_NUM because it was found on JavaScript page $PAGE_NAME."
[1064]661 let SKIP_JS_PAGE+=1
662 continue
663 fi
664
# Work out how to address the wiki page that holds this link. Namespace 0 ("Main:") is implicit and
# must be left out of the path, since naming it explicitly breaks the link; every other namespace
# is written as a "Name:" prefix in front of the page name. The full URL is the same local path
# placed under the wiki's HTTPS address.
if [ "$NS_ID" -eq 0 ]; then
  LOCAL_PAGE_PATH=$PAGE_NAME
else
  LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
fi
FULL_PAGE_PATH=https://$WIKI_PATH/$LOCAL_PAGE_PATH
674
# The URL being linked to is everything after the previous two fields (this allows commas to be in
# the URLs, but a comma in the previous field, the page name, will break this). The prefix is quoted
# so that it is removed as literal text; unquoted, characters such as '*', '?' or '[' in a page name
# would be interpreted as a pattern.
URL=${LINE#"$NS_ID,$PAGE_NAME,"}

# Scan for illegal characters (ILLEGAL_CHARS is deliberately unquoted: it supplies the members of
# the bracket expression)
if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
  valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because it contains characters illegal in a URL."
  let SKIP_BAD_URL+=1
  continue
fi
685
# Now we need to know if the URL is for a file or a web page. First step is to determine if the
# URL ends in a suffix. Sets CLEAN_URL, POST_DOT, POST_SLASH, their lengths, and HAS_SUFFIX (0/1).

# If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
CLEAN_URL=${URL%%\?*}

# If the URL ends in something like "#section_15", strip everything from the '#' onward
CLEAN_URL=${CLEAN_URL%%\#*}

# URLs containing non-ASCII characters are not processed; skip them and make the user check them
# (the reason originally given was that 'sed' could not handle Unicode in the author's Bash shell)
if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
  valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because I cannot handle non-ASCII characters."
  let SKIP_NON_ASCII+=1
  continue
fi

# Isolate the characters after the last period and after the last slash. Parameter expansion is
# used instead of 'sed'/'awk' so that no subprocess is spawned and so that the value is never
# re-interpreted (awk's -v option processes backslash escapes in the value it is given).
POST_DOT=${CLEAN_URL##*.}
POST_SLASH=${CLEAN_URL##*/}
POST_DOT_LENGTH=${#POST_DOT}
POST_SLASH_LENGTH=${#POST_SLASH}

# If the last period comes after the last slash, then the URL ends in a suffix
if [ "$POST_DOT_LENGTH" -lt "$POST_SLASH_LENGTH" ]; then
  HAS_SUFFIX=1
else
  HAS_SUFFIX=0
fi
715
# Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
# known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES. IS_FILE ends up as 1 (file),
# 0 (page) or -1 (suffix not recognized).
IS_FILE=-1
if [ "$HAS_SUFFIX" -eq 0 ]; then
  IS_FILE=0
else
  # Turn off case sensitivity while we compare suffixes
  shopt -s nocasematch

  # Special case: URLs ending in something like "/productID.304297400" are pages, not files, so if
  # the URL's suffix is all numbers, we are looking at the end of a web page URL
  if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
    IS_FILE=0
  fi

  # If we did not identify this URL as a web page above, we need to compare the suffix against known
  # file extensions. The suffix is quoted so that it is compared as literal text rather than as a
  # pattern; double brackets still respect the nocasematch setting.
  if [ $IS_FILE -eq -1 ]; then
    for EXTENSION in "${HTTP_FILES[@]}"; do
      if [[ $EXTENSION == "$POST_DOT" ]]; then
        IS_FILE=1
        break
      fi
    done
  fi

  # If we did not identify this URL as a file above, we need to compare the suffix against known
  # pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
  # needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
  if [ $IS_FILE -eq -1 ]; then
    for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
      if [[ $PAGE_OR_TLD == "$POST_DOT" ]]; then
        IS_FILE=0
        break
      fi
    done
  fi

  # Turn case sensitivity back on in Bash
  shopt -u nocasematch
fi

# Record the link type, or, if this suffix escaped identification as either a file, page or TLD,
# inform the user and move on to the next link
STR_TYPE=""
case $IS_FILE in
  -1)
    valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown URL ending $POST_DOT. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
    let SKIP_UNK_SUFFIX+=1
    continue
    ;;
  1)
    STR_TYPE="file"
    let FILE_LINKS+=1
    ;;
  0)
    STR_TYPE="page"
    let PAGE_LINKS+=1
    ;;
esac
771
# Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
# issue with sites that require HTTPS. Only the headers are requested (--head) and the request is
# abandoned after 10 seconds.
# The user agent is passed in double quotes so that the value of AGENT is sent; inside single quotes
# the literal text "$AGENT" was sent instead. NOTE(review): assumes AGENT is set among this script's
# globals -- confirm.
# The URL is quoted so that characters such as '?' and '*' in it are never treated as a file glob.
CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{http_code}\n' "$URL")
CURL_ERR=$? # exit status of the 'curl' run in the command substitution above
CURL_RESULT=$CURL_CODE

# Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
if [ "$CURL_CODE" == "000" ]; then
  CURL_RESULT="$CURL_RESULT-$CURL_ERR"
fi
782
# Begin to determine our status code for this URL (EI, IW, OK, RD, or NG). "??" means "not decided
# yet"; NEW_URL and INTERWIKI_INDEX are reset here so that nothing carries over from the last link.
STATUS="??"
NEW_URL=""
INTERWIKI_INDEX=-1

# First make sure that this isn't an "external internal" link to our own wiki that can be replaced
# by "[[page_name]]". If it uses a special access URL beginning with "/w/", let it pass, as it
# probably cannot be replaced by "[[ ]]" markup
if [[ $URL == *$WIKI_PATH* ]] && [[ $URL != *$WIKI_PATH/w/* ]]; then
  STATUS="EI"
  let EI_LINKS+=1
fi

# If it's not, check if this is a link to a domain that we have an interwiki prefix for.
# The status is tested with [[ ]] because an unquoted "??" inside [ ] is a file glob: it would
# expand to the names of any two-character files in the current directory and break the test.
if [[ $STATUS == "??" ]]; then
  for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
    if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]] && [[ $URL != *${INTERWIKI_DOMAINS[$i]}/w/* ]]; then
      STATUS="IW"
      let IW_LINKS+=1
      INTERWIKI_INDEX=$i # remembered so the matching interwiki prefix can be looked up later
      break
    fi
  done
fi
807
# If we didn't match an interwiki domain, see if the status code is in our "OK" codes list.
# The status is tested with [[ ]] because an unquoted "??" inside [ ] is a file glob: it would
# expand to the names of any two-character files in the current directory and break the test.
if [[ $STATUS == "??" ]]; then
  for CODE in "${OK_CODES[@]}"; do
    if [[ $CODE == "$CURL_CODE" ]]; then
      STATUS="OK"
      let OK_LINKS+=1
      break
    fi
  done
fi
818
# If we didn't get a match with the "OK" codes, check it against the "RD" codes. A redirect that
# only upgrades http to https, or only adds a trailing slash, is counted as "OK" unless the user
# asked to see those (SHOW_HTTPS / SHOW_SLASH); any other redirect is marked "RD".
# The status is tested with [[ ]] because an unquoted "??" inside [ ] is a file glob: it would
# expand to the names of any two-character files in the current directory and break the test.
if [[ $STATUS == "??" ]]; then
  for CODE in "${RD_CODES[@]}"; do
    if [[ $CODE == "$CURL_CODE" ]]; then
      # Get URL header again in order to retrieve the URL we are being redirected to. The user
      # agent is double-quoted so the value of AGENT is sent rather than the literal text "$AGENT",
      # and the URL is quoted so it is never treated as a file glob.
      NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{redirect_url}\n' "$URL")

      # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
      # those changes out if the user didn't ask for them
      URL_HTTP=${URL/#https:/http:}
      NEW_URL_HTTP=${NEW_URL/#https:/http:}

      # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config. The
      # placeholder is stored in NEW_URL as well, so that the report shows it instead of a blank
      # suggestion.
      NEW_URL_LENGTH=${#NEW_URL_HTTP}
      if [ "$NEW_URL_LENGTH" -lt "$MIN_URL_LENGTH" ]; then
        NEW_URL_HTTP="[new URL not retrieved]"
        NEW_URL=$NEW_URL_HTTP
      fi

      # Remove slash at end of new URL, if present, so we can filter out the redirects that
      # merely add an ending slash if the user didn't ask for them
      NEW_URL_NO_SLASH=${NEW_URL_HTTP%/}

      # If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
      # wants those to be reported)
      if [ "$SHOW_HTTPS" -eq 0 ] && [ "$URL_HTTP" == "$NEW_URL_HTTP" ]; then
        valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because we have not been asked to show http->https upgrades, and we were redirected to $NEW_URL."
        STATUS="OK"
        let OK_LINKS+=1
        let SKIP_HTTPS_UP+=1
      # If the URLs match besides an added ending slash, then the link is OK (unless user wants
      # those to be reported)
      elif [ "$SHOW_SLASH" -eq 0 ] && [ "$URL_HTTP" == "$NEW_URL_NO_SLASH" ]; then
        valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because we have not been asked to show added trailing slashes, and we were redirected to $NEW_URL."
        STATUS="OK"
        let OK_LINKS+=1
        let SKIP_SLASH_ADD+=1
      else
        STATUS="RD"
        let RD_LINKS+=1
      fi
      break
    fi
  done
fi
863
# If we didn't get a match with the "RD" codes, check it against the "NG" codes.
# The status is tested with [[ ]] because an unquoted "??" inside [ ] is a file glob: it would
# expand to the names of any two-character files in the current directory and break the test.
if [[ $STATUS == "??" ]]; then
  for CODE in "${NG_CODES[@]}"; do
    if [[ $CODE == "$CURL_CODE" ]]; then
      STATUS="NG"
      let NG_LINKS+=1
      break
    fi
  done
fi

# If we didn't match a known status code, advise the reader
if [[ $STATUS == "??" ]]; then
  valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because I encountered the unknown return code $CURL_CODE."
  let SKIP_UNK_CODE+=1
  continue
fi
881
# Check problem links against exceptions file before proceeding. Each line of that file is read as
# "code,...,page": the expected result code is the text before the first comma and the wiki page
# (or "*" for any page) is the text after the last comma.
if [ "$STATUS" != "OK" ] && [ -n "$EXCEPT_URL" ]; then
  # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
  EXPECT_CODE="$CURL_RESULT"
  if [ "$STATUS" == "EI" ] || [ "$STATUS" == "IW" ]; then
    EXPECT_CODE="$STATUS"
  fi

  # Look for link in exceptions file and make sure its listed result code and wiki page also match.
  # The URL is searched for as a fixed string (-F), since characters such as '.', '*' and '[' in a
  # URL would otherwise be read as regular-expression syntax. Every line containing the URL is
  # examined, not just the first, so an exception still applies when the same URL is also listed
  # for another page or appears inside a longer URL earlier in the file.
  EXCEPT_MATCHED=0
  while IFS= read -r GREP_RESULT; do
    EXCEPT_PAGE=${GREP_RESULT##*,}
    EXCEPT_CODE=${GREP_RESULT%%,*}
    if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == "$LOCAL_PAGE_PATH" ]; then
      if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
        EXCEPT_MATCHED=1
        break
      fi
    fi
  done < <(grep -F -- "$URL" "$EXCEPT_FILE")

  # An expected result is not reported; just count it under the matching category and move on
  if [ $EXCEPT_MATCHED -eq 1 ]; then
    valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because its expected result, $EXPECT_CODE, is listed in the exceptions file."
    case "$STATUS" in
      EI) let SKIP_EXPECT_EI+=1 ;;
      IW) let SKIP_EXPECT_IW+=1 ;;
      *)  let SKIP_EXPECT_NG+=1 ;;
    esac
    continue
  fi
fi
910
# Write this link to the report. Every entry is emitted once per log format through valPrint: the
# calls whose first argument starts with "t" carry plain text, those starting with "r" carry RTF
# field markup, and those starting with "h" carry HTML table rows. The meaning of the extra "s" and
# "n" letters in that argument is defined by valPrint, which is not shown here.
# NOTE(review): URLs and page names are inserted into the RTF and HTML markup as-is; characters
# such as '"', '&', '<', '{' or '\' in them are not escaped here -- confirm whether valPrint does.
911 # If appropriate, record this link to the log, with clickable URLs when possible
912 if [ $STATUS != "OK" ] || [ $RECORD_OK_LINKS -eq 1 ]; then
[1070]913 # Stupid hack since the strings "IW" and "EI" are narrower than "OK", "RD", or "NG" and it takes
914 # an extra tab to get to the desired level of indentation in the RTF log
# NOTE(review): the two RTF_TABS values below look identical in this view, although the comment
# above says one of them needs an extra tab; the whitespace inside these literals appears to have
# been collapsed. Confirm the real values against the repository before editing these lines.
[1064]915 RTF_TABS=" "
[1070]916 if [ $STATUS == "IW" ] || [ $STATUS == "EI" ]; then
[1064]917 RTF_TABS=" "
918 fi
919
920 # Record link and its wiki page in TXT, RTF, and HTML markup
921 valPrint t "$STATUS ($CURL_RESULT) $STR_TYPE $URL"
922 valPrint t " linked from $FULL_PAGE_PATH"
923 valPrint r "$STATUS ($CURL_RESULT)$RTF_TABS$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
924 valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
925 valPrint hn "<tr><td style=\"white-space:nowrap\">$STATUS ($CURL_RESULT)</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
926 valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"
927
[1067]928 # Record redirect URL if one was given by a 3xx response page
929 if [ $STATUS == "RD" ]; then
[1119]930 valPrint ts " Server suggests $NEW_URL"
931 valPrint rs " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
932 valPrint hs "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
[1067]933 fi
934
[1070]935 # Notify reader if we can use an intrawiki link for this URL
936 if [ $STATUS == "EI" ]; then
# Strip the shortest leading "scheme://host/" from the URL, leaving what follows the first slash
# after the host
[1075]937 INTRA_PAGE=${URL#*://*/}
[1119]938 valPrint ts " Just use [[$INTRA_PAGE]]"
939 valPrint rs " Just use [[$INTRA_PAGE]]"
940 valPrint hs "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
[1070]941 fi
942
[1064]943 # Notify reader if we can use an interwiki prefix for this URL
944 if [ $STATUS == "IW" ]; then
# The page name offered is whatever follows the last slash in the URL; the prefix is the entry of
# INTERWIKI_PREFIXES at the index recorded in INTERWIKI_INDEX
[1075]945 INTER_PAGE=$(echo "$URL" | sed 's/.*\///')
[1119]946 valPrint ts " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
947 valPrint rs " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
948 valPrint hs "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
[1064]949 fi
950
951 # Query Internet Archive for latest "OK" snapshot for "NG" page
952 if [ $STATUS == "NG" ] && [ $SUGGEST_SNAPSHOTS -eq 1 ]; then
# The link's URL is placed in the query string unencoded, and the request is given up after 10
# seconds. NOTE(review): a '&' or '#' inside $URL would end the "url" parameter early -- confirm
# whether that matters for the links being checked.
953 ARCHIVE_QUERY=$(curl --silent --max-time 10 "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")
954
[1118]955 # If a "closest" snapshot was received...
[1066]956 if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
[1118]957 # In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
# (this 'sed' expression has no "g" flag, so only the first "#!" on each line is changed)
958 ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')
959
960 # ...isolate "url" property in the response that follows the "closest" tag
961 SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
[1119]962 SNAPSHOT_URL=${SNAPSHOT_URL#*\"url\": \"} # everything after '"url": "'
[1118]963 SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'
964
965 # Inform the user of the snapshot URL
[1119]966 valPrint ts " IA suggests $SNAPSHOT_URL"
967 valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
968 valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
[1064]969 else # ...otherwise give generic Wayback Machine link for this URL
[1119]970 valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
971 valPrint rs " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
972 valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
[1064]973 fi
974 fi
975 fi
976
# If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
if [ "$IS_FILE" -eq 0 ] && [ "$TAKE_PAGE_SHOT" -eq 1 ] && [ "$STATUS" == "OK" ]; then
  # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
  SHOT_NAME=$(printf '%s\n' "$URL" | sed -e 's/https*\:\/\///' -e 'y/:\//__/')
  SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"

  # Don't take screenshot if we already encountered this page and screenshotted it
  if [ ! -f "$SHOT_FILE" ]; then
    # The URL is quoted so that characters such as '?' and '*' in it reach Chrome untouched instead
    # of being treated as a file glob. Chrome's own output is discarded; success is judged by
    # whether the file named by CHROME_SCREENSHOT shows up in WORKING_DIR.
    "$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 "$URL" > /dev/null 2>&1
    if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
      # -n: never overwrite a screenshot that is already in place
      mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
    else
      valPrint trhs "Screenshot of URL $URL seems to have failed!"
    fi
  else
    valPrint trhs "Skipping screenshot of URL $URL because $SHOT_FILE already exists."
  fi
fi
995done
# Every link has been looked at: record that the list was finished, then hand over to
# wrapupAndExit (defined elsewhere in this script; presumably it reads FINISHED_LIST -- confirm)
FINISHED_LIST='yes'
wrapupAndExit
Note: See TracBrowser for help on using the repository browser.