source: Validate External Links/validate_external_links.sh@1138

Last change on this file since 1138 was 1137, checked in by iritscen, 4 years ago

ValExtLinks: Added '.full' as a recognized page suffix.

File size: 49.3 KB
1#!/bin/bash
2
3# Validate External Links by Iritscen
4# Provided with a list of external links in an expected CSV format, this script validates them. The
5# resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF (for
6# reading as a local file with clickable links), and HTML (for uploading as a web page). Call script
7# with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
8# Recommended rule:
9# |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|
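# For illustration, the links file is expected to begin with the header row "namespace,title,target" and then
# contain one CSV line per link, e.g. the hypothetical row "0,Main_Page,http://example.com/article.htm"
# (namespace ID, then the wiki page's title, then the external URL; see the main loop for how these are parsed).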
10
11# Set separator token to newline
12IFS="
13"
14
15### GLOBALS ###
16# Settings -- these will be changed from their defaults by the arguments passed in to the script
17LINKS_URL="" # use 'curl' to download file with links from this location (can be file://)
18EXCEPT_URL="" # 'curl' will access this wiki page with a list of exceptions for NG results
19OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
20RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES
21SHOW_SLASH=0 # record response code to the log when a slash is added to the end of a URL
22SHOW_HTTPS=0 # record response code to the log when "http" is upgraded to "https"
23SHOW_YT_RD=0 # record response code to the log when a youtu.be URL is expanded to the full URL
24SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
25SKIP_ARCHIVE_LINKS=0 # don't check URLs under the archive.org domain
26TAKE_PAGE_SHOT=0 # take a screenshot of each OK page
27CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature
28URL_START=1 # start at this URL in LINKS_FILE (1 by default)
29URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE
30UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report
31
32# Fixed strings -- see the occurrences of these variables to learn their purpose
33AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36 OPR/69.0.3686.77"
34ARCHIVE_API="http://archive.org/wayback/available"
35ARCHIVE_GENERIC="https://web.archive.org/web/*"
36ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
37CHROME_SCREENSHOT="screenshot.png"
38CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt"
39EXCEPT_FILE_NAME="exceptions.txt"
40EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
41HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt"
42MY_WIKI_PAGE="https://wiki.oni2.net/User:Iritscen"
43THIS_DIR=$(cd $(dirname $0); pwd)
44WORKING_DIR=$(pwd)
45WIKI_PATH="wiki.oni2.net"
46
47# These are parallel arrays of the IDs and names of OniGalore's current namespaces
48declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
49declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")
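# For example, a link recorded under namespace ID 102 belongs to the "OBD" namespace per the arrays above.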
50
51# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
52# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
53declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png py rar tga TRMA txt vbs wav wmv xaf xml zip)
54declare -a HTTP_TLDS_AND_PAGES=(action ars asp aspx cfm cgi com css de full htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x)
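# For example, a URL ending in ".pdf" is treated as a file (no screenshot is attempted), while one ending in
# ".html" is treated as a page, per the two arrays above.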
55
56# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
57# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
58# if you add a new code.
59declare -a OK_CODES=(200 401 405 406 418 501)
60declare -a RD_CODES=(301 302 303 307 308)
61declare -a NG_CODES=(000 400 403 404 410 500 502 503 530)
62
63# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
64# transcluded text, and if the transclusion fails, then the braces show up in the URL
65ILLEGAL_CHARS="{ }"
66
67# The shortest URL possible, used for sanity-checking some URLs: http://a.co
68MIN_URL_LENGTH=11
69
70# These are parallel arrays giving the prefixes that can be used in place of normal external links to
71# some wikis and other sites
72declare -a INTERWIKI_PREFIXES=(commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies wikiversity wikivoyage wikt wp)
73declare -a INTERWIKI_DOMAINS=(commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org wikiversity.org wikivoyage.org wiktionary.org wikipedia.org)
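# For example, an external link into wikipedia.org could instead use the "wp:" interwiki prefix, e.g. the
# hypothetical [[wp:Some_Article]]; this is what the "IW" suggestions later in the report propose.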
74
75# Variables for keeping track of main loop progress and findings
76LINK_NUM=0
77EI_LINKS=0
78IW_LINKS=0
79OK_LINKS=0
80RD_LINKS=0
81NG_LINKS=0
82SKIP_UNK_NS=0
83SKIP_JS_PAGE=0
84SKIP_BAD_URL=0
85SKIP_NON_ASCII=0
86SKIP_UNK_SUFFIX=0
87SKIP_UNK_CODE=0
88SKIP_EXPECT_NG=0
89SKIP_EXPECT_EI=0
90SKIP_EXPECT_IW=0
91SKIP_HTTPS_UP=0
92SKIP_SLASH_ADD=0
93SKIP_YOUTU_BE=0
94SKIP_ARCHIVE_ORG=0
95FILE_LINKS=0
96PAGE_LINKS=0
97SKIPPED_HEADER_ROW=0
98FINISHED_LIST="no"
99START_RUN=0
100END_RUN=0
101
102
103### HELP ###
104# A pseudo-man page. Here is the 80-character rule for the page text:
105# 234567890123456789012345678901234567890123456789012345678901234567890123456789
106function printHelp()
107{
108 cat << EOF
109
110NAME
111 Validate External Links
112
113SYNOPSIS
114 validate_external_links.sh --help
115 validate_external_links.sh --links URL --output DIR [--exceptions URL]
116 [--record-ok-links] [--show-added-slashes] [--show-https-upgrades]
117 [--show-yt-redirects] [--suggest-snapshots] [--skip-archive-links]
118 [--take-screenshots FILE] [--start-url NUM] [--end-url NUM]
119 [--upload FILE]
120
121DESCRIPTION
122 This script parses a list of external links found in the OniGalore wiki
123 (which is dumped by the Oni2.net domain periodically in a particular
124 format), validates them using the Unix tool 'curl', and produces a report
125 of which links were "OK" (responded positively to an HTTP query), which
126 were "RD" (responded with a 3xx redirect code), which could be "IW"
127 (interwiki) links, which are "EI" (external internal) links and could be
128 intrawiki links, and which were "NG" (no good; a negative response to the
129 query). This report can then be automatically uploaded to the location of
130 your choice. The script can also suggest Internet Archive snapshots for
131 "NG" links, and take screenshots of "OK" links for visual verification by
132 the reader that the page in question is the one intended to be displayed.
133
134 You must pass this script the URL at which the list of links is found
135 (--links) and the path where the directory of logs should be created
136 (--output). All other arguments are optional.
137
138OPTIONS
139 --help Show this page.
140 --links URL (required) URL from which to download the CSV
141 file with external links. Note that this URL can
142 be a local file if you supply a file:// path.
143 --output DIR (required) Unix path to directory in which Val
144 should place its reports.
145 --exceptions URL In order to remove links from the report which
146 Val finds an issue with but which you regard as
147 OK, list those desired exceptions on a wiki page.
148 See the sample file "exceptions.pdf" for the
149 required format of the page. Note that this URL
150 can point to a local file if you supply a path
151 beginning with "file://".
152 --record-ok-links Log a link in the report even if its response
153 code is "OK".
154 --show-added-slashes Report on redirects that simply add a '/' to the
155 end of the URL.
156 --show-https-upgrades Report on redirects that simply upgrade a
157 "http://" URL to a "https://" URL.
158 --show-yt-redirects Report on redirects that expand a youtu.be URL.
159 --suggest-snapshots Query the Internet Archive for a possible
160 snapshot URL for each "NG" page.
161 --skip-archive-links Don't check links that are already pointing to
162 a page on the Internet Archive.
163 --take-screenshots FILE Call the Google Chrome binary at this path to
164 take screenshots of each "OK" page.
165 --start-url NUM Start at this link in the links CSV file.
166 --end-url NUM Stop at this link in the links CSV file.
167 --upload FILE Upload report using the credentials and path
168 given in this local text file. See sftp_login.txt
169 for template.
170
171BUGS
172 The script cannot properly parse any line in the external links file
173 which contains a comma in the name of the wiki page containing a link.
174 Commas in the link itself are not an issue.
175EOF
176}
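# A typical invocation (all URLs and paths here are hypothetical) might look like:
# ./validate_external_links.sh --links http://example.com/wiki_extlinks.csv --output ~/val_reports --suggest-snapshots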
177
178
179### SETUP ###
180# If first argument is a help request, or if nothing was passed in at all, print help page and quit
181if [ "$#" -eq 0 ] || [ "$1" == "--help" ]; then
182 printHelp | less
183 exit 0
184fi
185
186# Parse arguments as long as there are more arguments to process
187while (( "$#" )); do
188 case "$1" in
189 --links ) LINKS_URL="$2"; shift 2;;
190 --exceptions ) EXCEPT_URL="$2"; shift 2;;
191 --output ) OUTPUT_DIR="$2"; shift 2;;
192 --record-ok-links ) RECORD_OK_LINKS=1; shift;;
193 --show-added-slashes ) SHOW_SLASH=1; shift;;
194 --show-https-upgrades ) SHOW_HTTPS=1; shift;;
195 --show-yt-redirects ) SHOW_YT_RD=1; shift;;
196 --suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;;
197 --skip-archive-links ) SKIP_ARCHIVE_LINKS=1; shift;;
198 --take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
199 --start-url ) URL_START=$2; shift 2;;
200 --end-url ) URL_LIMIT=$2; shift 2;;
201 --upload ) UPLOAD_INFO=$2; shift 2;;
202 * ) echo "Invalid argument $1 detected. Aborting."; exit 1;;
203 esac
204done
205
206# If the required arguments were not supplied, print help page and quit
207if [ -z $LINKS_URL ] || [ -z $OUTPUT_DIR ]; then
208 echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
209 exit 2
210fi
211
212# If user wants screenshots, make sure path to Chrome was passed in and is valid
213if [ $TAKE_PAGE_SHOT -eq 1 ]; then
214 if [ ! -f "$CHROME_PATH" ]; then
215 echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
216 exit 3
217 fi
218fi
219
220# Check that UPLOAD_INFO exists, if this argument was supplied
221if [ ! -z $UPLOAD_INFO ] && [ ! -f "$UPLOAD_INFO" ]; then
222 echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
223 exit 4
224fi
225
226# Check that OUTPUT_DIR is a directory
227if [ ! -d "$OUTPUT_DIR" ]; then
228 echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
229 exit 5
230fi
231
232# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
233SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
234NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
235OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
236OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
237SHOT_PATH="$OUTPUT_PATH/Screenshots"
238LOG_NAME="ValExtLinks report"
239LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt"
240LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf"
241LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm"
242mkdir "$OUTPUT_PATH"
243if [ $TAKE_PAGE_SHOT -eq 1 ]; then
244 mkdir "$SHOT_PATH"
245fi
246
247# Check that 'mkdir' succeeded
248if [ ! -d "$OUTPUT_PATH" ]; then
249 echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_PATH. Aborting."
250 exit 6
251fi
252
253# Get date on the file at LINKS_URL and print to log
254LINKS_DATE=$(curl --silent --head $LINKS_URL | grep "Last-Modified")
255if [ -z "$LINKS_DATE" ]; then
256 echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
257 exit 7
258fi
259LINKS_DATE=${LINKS_DATE#Last-Modified: }
260
261
262### UTILITY FUNCTIONS ###
263# Writes a plain-text header to TXT log file
264function printTXTheader()
265{
266 valPrint t "Validate External Links report"
267 valPrint t "generated $NICE_TIME"
268 valPrint t "from data of $LINKS_DATE"
269 valPrint t "script by Iritscen (contact: $MY_WIKI_PAGE)"
270 valPrint t ""
271}
272
273# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified
274function printRTFheader()
275{
276 valPrint r "{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
277{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
278{\colortbl;\red255\green255\blue255;}
279{\*\expandedcolortbl;;}
280\margl1440\margr1440\vieww12600\viewh12100\viewkind0
281\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0
282
283\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
284generated $NICE_TIME\\
285from data of $LINKS_DATE\\
286script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$MY_WIKI_PAGE\"}}{\fldrslt contact}})
287\\
288\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
289\cf0 "
290}
291
292# Closes the RTF markup of the RTF log file
293function printRTFfooter()
294{
295 valPrint r "}"
296}
297
298# Writes the HTML header to HTML log file
299function printHTMheader()
300{
301 valPrint h "<html>
302<head>
303<title>Validate External Links report</title>
304</head>
305<body>
306<h2>Validate External Links report</h2>
307<h3>generated $NICE_TIME<br />
308from data of $LINKS_DATE<br />
309script by Iritscen (<a href=\"$MY_WIKI_PAGE\" target=\"_blank\">contact</a>)</h3>"
310}
311
312# Closes the HTML markup of the HTML log file
313function printHTMfooter()
314{
315 valPrint h "</body>
316</html>"
317}
318
319# The central logging function. The first parameter is a string composed of one or more characters that indicate
320# which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and 'h' means the HTML log.
321# 'n' means "Don't print a newline at the end of the line." 's' means "Print an extra newline at the end."
322# 'w' means "Don't pass console output through 'fmt'" ("fmt" fits the output to an 80-column CLI but can break special formatting and the 'n' option).
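# For instance, 'valPrint ctrh "Hello"' writes "Hello" to the console (wrapped by 'fmt'), to the TXT log,
# to the RTF log (with a trailing \line), and to the HTML log (with a trailing <br />).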
323function valPrint()
324{
325 if [[ "$1" == *c* ]]; then
326 if [[ "$1" == *n* ]]; then
327 echo -n "$2"
328 elif [[ "$1" == *w* ]]; then
329 echo "$2"
330 elif [[ "$1" == *s* ]]; then
331 echo -e "$2\n"
332 else
333 echo "$2" | fmt -w 80
334 fi
335 fi
336 if [[ "$1" == *t* ]]; then
337 if [[ "$1" == *n* ]]; then
338 echo -n "$2" >> "$LOG_TXT"
339 elif [[ "$1" == *s* ]]; then
340 echo -e "$2\n" >> "$LOG_TXT"
341 else
342 echo "$2" >> "$LOG_TXT"
343 fi
344 fi
345 if [[ "$1" == *r* ]]; then
346 if [[ "$1" == *n* ]]; then
347 echo "$2" >> "$LOG_RTF"
348 elif [[ "$1" == *s* ]]; then
349 echo "$2\line\line" >> "$LOG_RTF"
350 else
351 echo "$2\line" >> "$LOG_RTF"
352 fi
353 fi
354 if [[ "$1" == *h* ]]; then
355 if [[ "$1" == *s* ]]; then
356 echo "$2<tr><td>&nbsp;</td></tr>" >> "$LOG_HTM"
357 elif [[ "$1" == *n* ]]; then
358 echo "$2" >> "$LOG_HTM"
359 else
360 echo "$2<br />" >> "$LOG_HTM"
361 fi
362 fi
363}
364
365# Pluralize the string in parameter 1 if the number in parameter 2 is not 1
366function pluralCheckNoun()
367{
368 if [ $2 -ne 1 ]; then
369 if [[ $1 =~ x$ ]]; then
370 echo $1es
371 else
372 echo $1s
373 fi
374 else
375 echo $1
376 fi
377}
378
379# Output "is" if parameter 1 is 1, otherwise "are"
380function pluralCheckIs()
381{
382 if [ $1 -ne 1 ]; then
383 echo "are"
384 else
385 echo "is"
386 fi
387}
388
389# Output "was" if parameter 1 is 1, otherwise "were"
390function pluralCheckWas()
391{
392 if [ $1 -ne 1 ]; then
393 echo "were"
394 else
395 echo "was"
396 fi
397}
398
399# Output "a " if parameter 1 is 1, otherwise nothing
400function pluralCheckA()
401{
402 if [ $1 -eq 1 ]; then
403 echo "a "
404 fi
405}
406
407# Output "an " if parameter 1 is 1, otherwise nothing
408function pluralCheckAn()
409{
410 if [ $1 -eq 1 ]; then
411 echo "an "
412 fi
413}
414
415# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
416# reports being saved to disk have already been closed.
417function uploadReport()
418{
419 valPrint c "Uploading HTML report..."
420
421 SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
422 SFTP_USER_NAME_MARKER="user:"
423 SFTP_PASSWORD_MARKER="pw:"
424 SFTP_PORT_MARKER="port:"
425 SFTP_PATH_MARKER="path:"
426 SFTP_USER_NAME=$(grep $SFTP_USER_NAME_MARKER $UPLOAD_INFO)
427 SFTP_USER_NAME=${SFTP_USER_NAME#$SFTP_USER_NAME_MARKER}
428 SFTP_PASSWORD=$(grep $SFTP_PASSWORD_MARKER $UPLOAD_INFO)
429 SFTP_PASSWORD=${SFTP_PASSWORD#$SFTP_PASSWORD_MARKER}
430 SFTP_PORT=$(grep $SFTP_PORT_MARKER $UPLOAD_INFO)
431 SFTP_PORT=${SFTP_PORT#$SFTP_PORT_MARKER}
432 SFTP_PATH=$(grep $SFTP_PATH_MARKER $UPLOAD_INFO)
433 SFTP_PATH=${SFTP_PATH#$SFTP_PATH_MARKER}
434
435 expect "$SCRIPT_PATH" "$LOG_HTM" $SFTP_USER_NAME $SFTP_PASSWORD $SFTP_PORT $SFTP_PATH "$LOG_NAME.htm"
436
437 valPrint c "Report was uploaded, unless an error message appears above."
438}
439
440# Prints session summary when script is done
441function wrapupAndExit()
442{
443 # Get off progress line on console, drop down a line from last link in log, and close HTML table
444 valPrint ctr ""
445 valPrint h "</table><br />"
446
447 # If we didn't finish processing the last URL, then the iterator is one too high
448 if [ $FINISHED_LIST != "yes" ]; then
449 let LINK_NUM-=1
450 if [ $FINISHED_LIST == "no" ]; then
451 valPrint ctrh "The session was canceled by the user."
452 fi
453 fi
454
455 # Generate string with elapsed time
456 END_RUN=$(date +%s)
457 ELAPSED=$(echo $(($END_RUN - $START_RUN)) | awk '{printf "%d min. %d sec. elapsed", int($1/60), int($1%60)}')
458
459 # Do some math on results of session
460 LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
461 LINK_PROBLEMS=$((EI_LINKS+IW_LINKS+RD_LINKS+NG_LINKS))
462 LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
463 LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
464 TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
465 LINKS_CHECKED=$((LINKS_PROCESSED-LINK_ERRORS))
466
467 # Print summary header
468 valPrint ct "Summary ($ELAPSED):"
469 valPrint r "\b1 Summary \b0 ($ELAPSED)"
470 valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
471 valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT) (there $(pluralCheckWas $FILE_LINKS) $FILE_LINKS file $(pluralCheckNoun link $FILE_LINKS) and $PAGE_LINKS page $(pluralCheckNoun link $PAGE_LINKS))."
472
473 # Print processed link totals
474 if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
475 if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
476 if [ $SKIP_ARCHIVE_ORG -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVE_ORG Archive.org $(pluralCheckNoun link $SKIP_ARCHIVE_ORG) were not checked"; fi
477 if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS processed $(pluralCheckNoun link $LINK_PROBLEMS) had $(pluralCheckAn $LINK_PROBLEMS)$(pluralCheckNoun issue $LINK_PROBLEMS)"; fi
478 if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; valPrint h "&nbsp;&nbsp;(excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
479 if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi
480 if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h "&nbsp;&nbsp;(counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi
481
482 # Print excepted link totals
483 if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi
484 if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
485 if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
486 if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi
487
488 # Print errored link totals
489 if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"; fi
490 if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
491 if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
492 if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
493 if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
494 if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
495 if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
496
497 # Print checked link totals
498 if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS link $(pluralCheckNoun issue $LINK_PROBLEMS):"; fi
499 if [ $NG_LINKS -gt 0 ]; then valPrint ctrh "- $NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
500 if [ $RD_LINKS -gt 0 ]; then valPrint ctrh "- $RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
501 if [ $EI_LINKS -gt 0 ]; then valPrint ctrh "- $EI_LINKS $(pluralCheckNoun link $EI_LINKS) that could be intrawiki"; fi
502 if [ $IW_LINKS -gt 0 ]; then valPrint ctrh "- $IW_LINKS $(pluralCheckNoun link $IW_LINKS) that could be interwiki"; fi
503
504 # Close the log files' markup
505 valPrint trh "ValExtLinks says goodbye."
506 printRTFfooter
507 printHTMfooter
508
509 # Upload report if this was requested
510 if [ ! -z $UPLOAD_INFO ] && [ $FINISHED_LIST != "no" ]; then
511 uploadReport
512 fi
513
514 # Really quit now
515 valPrint c "ValExtLinks says goodbye."
516 exit 0
517}
518trap wrapupAndExit INT
519
520
521### INITIALIZATION ###
522# Print opening message to console and log files
523valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
524printTXTheader
525printRTFheader
526printHTMheader
527
528# Attempt to download file at LINKS_URL, then check that it succeeded
529valPrint t "Config:"
530valPrint r "\b1 Config \b0"
531valPrint hn "<h3>Config</h3>"
532valPrint cwtrh "Downloading list of external links from $LINKS_URL."
533LINKS_FILE_NAME=$(echo "$LINKS_URL" | sed 's/.*\///')
534LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
535curl --silent -o "$LINKS_FILE" $LINKS_URL
536if [ ! -f "$LINKS_FILE" ]; then
537 echo "The download of $LINKS_URL appears to have failed. Aborting."
538 wrapupAndExit
539fi
540
541# Attempt to download file at EXCEPT_URL, then check that it succeeded
542if [ ! -z $EXCEPT_URL ]; then
543 valPrint cwtrh "Downloading list of reporting exceptions from $EXCEPT_URL."
544 EXCEPT_DATA=$(curl --silent $EXCEPT_URL)
545 if [ -z "$EXCEPT_DATA" ]; then
546 echo "The download of the exceptions data from '$EXCEPT_URL' appears to have failed. Aborting."
547 wrapupAndExit
548 fi
549 EXCEPT_DATA=${EXCEPT_DATA%END LIST*}
550 EXCEPT_DATA=${EXCEPT_DATA#*BEGIN LIST}
551 EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
552
553 # Store on disk for debugging purposes
554 echo "$EXCEPT_DATA" > "$EXCEPT_FILE"
555
556 # Transfer to array for easy searching later
557 declare -a EXCEPT_ARRAY=($(echo "$EXCEPT_DATA"))
558fi
559
560# Pipe 'cat' to 'wc' because passing LINKS_FILE to 'wc' would print the file name after the line count
561LINK_COUNT_STRING=$(cat "$LINKS_FILE" | wc -l)
562
563# Number of URLs is number of lines minus one (first line is column header row for the CSV)
564LINK_COUNT=$(echo "${LINK_COUNT_STRING}" | tr -d '[:space:]')
565let LINK_COUNT-=1
566
567# Calculate number of URLs to consider
568if [ $URL_LIMIT -ne 0 ] && [ $URL_LIMIT -lt $LINK_COUNT ]; then
569 valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((URL_LIMIT-URL_START+1)) of them, from link $URL_START to $URL_LIMIT."
570elif [ $URL_START -ne 1 ]; then
571 valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((LINK_COUNT-URL_START+1)) of them, from link $URL_START to $LINK_COUNT."
572else
573 valPrint ctrh "Found $LINK_COUNT links to process. I will be considering all of them."
574fi
575
576# Print settings to console and log
577declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not report links that are in the exceptions list." "I will ignore URLs that simply have ending slashes added onto them." "I will ignore URLs that only upgrade from HTTP to HTTPS." "I will ignore youtu.be links that are merely being expanded." "I will not check the validity of Internet Archive snapshot URLs.")
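# The indices used below refer to whitespace-delimited elements of the array above, with each quoted phrase
# counting as a single element; e.g. element 10 is "and will" and element 22 is "also".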
578if [ $TAKE_PAGE_SHOT -eq 0 ]; then SETTINGS_MSG[10]="but will not"; fi
579if [ $RECORD_OK_LINKS -eq 0 ]; then SETTINGS_MSG[22]="not"; fi
580if [ $SUGGEST_SNAPSHOTS -eq 0 ]; then SETTINGS_MSG[26]="will not"; fi
581if [ -z $EXCEPT_URL ] || [ $RECORD_OK_LINKS -eq 1 ]; then SETTINGS_MSG[40]=""; fi
582if [ $SHOW_SLASH -eq 1 ]; then SETTINGS_MSG[41]=""; fi
583if [ $SHOW_HTTPS -eq 1 ]; then SETTINGS_MSG[42]=""; fi
584if [ $SHOW_YT_RD -eq 1 ]; then SETTINGS_MSG[43]=""; fi
585if [ $SKIP_ARCHIVE_LINKS -eq 0 ]; then SETTINGS_MSG[44]=""; fi
586SETTINGS_STR=${SETTINGS_MSG[@]}
587valPrint ctrh "$SETTINGS_STR"
588valPrint tr "A summary of my findings will be found at the bottom of the report."
589valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
590valPrint trh ""
591
592# Print legend to logs
593valPrint t "Legend:"
594valPrint r "\b1 Legend \b0"
595valPrint hn "<h3>Legend</h3>"
596valPrint trh "OK = URL seems to be working."
597valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it, because false negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to the script's author (see top of report). An NG link should be followed by a link to the Internet Archive's Wayback Machine which may help you repair the link, unless the Archive does not have any snapshots of the site. If the link cannot be repaired, you can delete it from the wiki page, or, if this would disrupt the surrounding material on the page, disable the link by wrapping the URL in nowiki tags."
598valPrint trh "RD = The server responding to this URL is saying that the page moved and you should instead use the supplied new URL. Some RD links represent minor adjustments in the organization of a web site, and some are soft 404s (the file/page has been removed and you are being redirected to something like the main page of the web site). You will have to look at the new URL yourself to determine if it represents an OK link and the link on the wiki should be updated to this one, or if the desired file/page is actually gone and we need to replace the wiki link with an Internet Archive snapshot link -- or disable the URL if it has not been archived."
599valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup."
600valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup."
601valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $HTTP_CODES)."
602valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$HTTP_CODES\"}}{\fldrslt here}} for code reference)."
603valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$HTTP_CODES\" target=\"_blank\">here</a> for code reference)."
604valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $CURL_CODES)."
605valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$CURL_CODES\"}}{\fldrslt here}} for code reference)."
606valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$CURL_CODES\" target=\"_blank\">here</a> for code reference)."
607valPrint trh "IA suggests = Last available snapshot suggested by the Internet Archive."
608valPrint trh "Try browsing = The Archive occasionally fails to return a snapshot URL even when one exists, so you will need to check for a snapshot manually using this link to the Wayback Machine before concluding that a site has not been archived."
609valPrint trh ""
610
611
612### MAIN LOOP ###
613valPrint t "Links:"
614valPrint r "\b1 Links \b0"
615valPrint hn "<h3>Links</h3>"
616START_RUN=$(date +%s)
617# Process each line of the .csv in LINKS_FILE
618for LINE in `cat "$LINKS_FILE"`; do
619 let LINK_NUM+=1
620
621 # First line is the column header row for the CSV, so let's verify that the format hasn't changed
622 if [ $SKIPPED_HEADER_ROW -eq 0 ]; then
623 if [ $LINE == "namespace,title,target" ]; then
624 SKIPPED_HEADER_ROW=1
625 LINK_NUM=0 # this line is the header, not a link, so reset the link counter
626 valPrint hn "<table>"
627 continue
628 else
629 valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
630 wrapupAndExit
631 fi
632 fi
633
634 # Skip this link if we are not at URL_START yet
635 if [ $LINK_NUM -lt $URL_START ]; then
636 continue
637 fi
638
639 # Stop if we are at the limit declared for testing purposes
640 if [ $URL_LIMIT -gt 0 ] && [ $LINK_NUM -gt $URL_LIMIT ]; then
641 FINISHED_LIST="limit"
642 wrapupAndExit
643 fi
644
645 # Print progress to screen
646 if [ $LINK_NUM -gt 1 ]; then
647 printf "\e[1A\n" # erase previous progress message so that new one appears in its place
648 fi
649 valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
650
651 # The number of the namespace is the element before the first comma on the line
652 NS_ID=${LINE%%,*}
653
654 # Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
655 NS_NAME=""
656 a=0
657 while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
658 if [ $NS_ID == "NULL" ]; then
659 break
660 elif [ $NS_ID -eq ${NS_IDS[$a]} ]; then
661 NS_NAME="${NS_NAMES[$a]}"
662 break
663 fi
664 let a+=1
665 done
666 if [ "$NS_NAME" == "" ]; then
667 if [ $NS_ID == "NULL" ]; then
668 valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace (and probably the page too) is 'NULL'. Probably the link is no longer in existence on the wiki."
669 else
670 valPrint trs "Skipping line $LINK_NUM ('$LINE') because I could not find a name for namespace ID $NS_ID."
671 fi
672 let SKIP_UNK_NS+=1
673 continue
674 fi
675
676 # The name of the page is everything between the namespace ID and the next comma on the line (commas
677 # in page names will break this)
678 PAGE_NAME=${LINE#$NS_ID,}
679 PAGE_NAME=${PAGE_NAME%%,*}
680
681 # We don't want to consider wiki pages ending in .js, as the MW parser cannot reliably isolate URLs
682 # in JavaScript code, so it returns erroneous links
683 PAGE_NAME_SUFFIX=$(echo $PAGE_NAME | sed 's/.*\.//')
684 if [ $PAGE_NAME_SUFFIX == "js" ]; then
685 valPrint trs "Skipping URL '${LINE#$NS_ID,$PAGE_NAME,}' on line $LINK_NUM because it was found on JavaScript page '$PAGE_NAME'."
686 let SKIP_JS_PAGE+=1
687 continue
688 fi
689
690 # Build longer wiki page URLs from namespace and page names
691 FULL_PAGE_PATH=https://$WIKI_PATH/$NS_NAME:$PAGE_NAME
692 LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
693 # Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it
694 # explicitly breaks the link
695 if [ $NS_ID -eq 0 ]; then
696 FULL_PAGE_PATH=https://$WIKI_PATH/$PAGE_NAME
697 LOCAL_PAGE_PATH=$PAGE_NAME
698 fi
699
700 # The URL being linked to is everything after the previous two fields (this allows commas to be in
701 # the URLs, but a comma in the previous field, the page name, will break this)
702 URL=${LINE#$NS_ID,$PAGE_NAME,}
703
704 # Scan for illegal characters
705 if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
706 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because it contains characters illegal in a URL."
707 let SKIP_BAD_URL+=1
708 continue
709 fi
710
711 # If we're skipping Archive.org links, check if this is one
712 if [ $SKIP_ARCHIVE_LINKS -eq 1 ] && [[ $URL == *web.archive.org* ]]; then
713 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have been asked not to check Wayback Machine links."
714 let SKIP_ARCHIVE_ORG+=1
715 continue
716 fi
717
718 # Now we need to know if the URL is for a file or a web page. First step is to determine if the
719 # URL ends in a suffix
720 HAS_SUFFIX=0
721
722 # If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
723 CLEAN_URL=${URL%%\?*}
724
725 # If the URL ends in something like "#section_15", strip everything from the '#' onward
726 CLEAN_URL=${CLEAN_URL%%\#*}
727
728 # 'sed' cannot handle Unicode in my Bash shell, so skip non-ASCII URLs and make the user check them
729 if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
730 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I cannot handle non-ASCII characters."
731 let SKIP_NON_ASCII+=1
732 continue
733 fi
734
735 # Isolate the characters after the last period and after the last slash
736 POST_DOT=$(echo "$CLEAN_URL" | sed 's/.*\.//')
737 POST_SLASH=$(echo "$CLEAN_URL" | sed 's/.*\///')
738
739 # If the last period comes after the last slash, then the URL ends in a suffix
740 POST_DOT_LENGTH=$(echo | awk -v input=$POST_DOT '{print length(input)}')
741 POST_SLASH_LENGTH=$(echo | awk -v input=$POST_SLASH '{print length(input)}')
742 if [ $POST_DOT_LENGTH -lt $POST_SLASH_LENGTH ]; then
743 HAS_SUFFIX=1
744 else
745 HAS_SUFFIX=0
746 fi
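# For illustration (hypothetical URLs): "http://example.com/docs/file.pdf" gives POST_DOT "pdf" (3) and
# POST_SLASH "file.pdf" (8), so HAS_SUFFIX is 1, while "http://example.com/r1.5/downloads" gives POST_DOT
# "5/downloads" (11) and POST_SLASH "downloads" (9), so HAS_SUFFIX stays 0.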
747
748 # Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
749 # known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES
750 IS_FILE=-1
751 if [ $HAS_SUFFIX -eq 0 ]; then
752 IS_FILE=0
753 else
754 # Turn off case sensitivity while we compare suffixes
755 shopt -s nocasematch
756
757 # Special case: URLs ending in something like "productID.304297400" are pages, not files, so if
758 # the URL's suffix is all numbers, we are looking at the end of a web page URL
759 if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
760 IS_FILE=0
761 fi
762
763 # Special case: URLs ending in parentheses, e.g. "ms537113(v=vs.85)", are pages
764 if [[ $POST_DOT =~ ^.*[\(\)]$ ]]; then
765 IS_FILE=0
766 fi
767
768 # Special case: URLs containing a '%', e.g. "10.1007%2FBF00329055", are pages
769 if [[ $POST_DOT == *%* ]]; then
770 IS_FILE=0
771 fi
772
773 # If we did not identify this URL as a web page above, we need to compare the suffix against known
774 # file extensions
775 if [ $IS_FILE -eq -1 ]; then
776 for EXTENSION in "${HTTP_FILES[@]}"; do
777 if [[ $EXTENSION == $POST_DOT ]]; then # double brackets respect the nocasematch setting
778 IS_FILE=1
779 break
780 fi
781 done
782 fi
783
784 # If we did not identify this URL as a file above, we need to compare the suffix against known
785 # pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
786 # needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
787 if [ $IS_FILE -eq -1 ]; then
788 for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
789 if [[ $PAGE_OR_TLD == $POST_DOT ]]; then
790 IS_FILE=0
791 break
792 fi
793 done
794 fi
795
796 # Turn case sensitivity back on in Bash
797 shopt -u nocasematch
798 fi
799
800 # If this suffix escaped identification as either a file, page or TLD, inform the user
801 STR_TYPE=""
802 if [ $IS_FILE -eq -1 ]; then
803 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown URL ending '$POST_DOT'. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
804 let SKIP_UNK_SUFFIX+=1
805 continue
806 elif [ $IS_FILE -eq 1 ]; then
807 STR_TYPE="file"
808 let FILE_LINKS+=1
809 elif [ $IS_FILE -eq 0 ]; then
810 STR_TYPE="page"
811 let PAGE_LINKS+=1
812 fi
813
814 # Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
815 # issue with sites that require HTTPS
816 CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{http_code}\n' $URL)
817 CURL_ERR=$(echo $?)
818 CURL_RESULT=$CURL_CODE
819
820 # Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
821 if [ $CURL_CODE == "000" ]; then
822 CURL_RESULT="$CURL_RESULT-$CURL_ERR"
823 fi
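# For instance, a reachable page typically yields a CURL_RESULT like "200", while an unresolvable host yields
# something like "000-6" ('curl' exit code 6).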
824
825 # Begin to determine our status code for this URL (EI, IW, OK, RD, or NG)
826 STATUS="??"
827 NEW_URL=""
828 INTERWIKI_INDEX=-1
829
830 # First make sure that this isn't an "external internal" link to our own wiki that can be replaced
831 # by "[[page_name]]". If it uses a special access URL beginning with "/w/", let it pass, as it
832 # probably cannot be replaced by "[[ ]]" markup
833 if [[ $URL == *$WIKI_PATH* ]] && [[ $URL != *$WIKI_PATH/w/* ]]; then
834 STATUS="EI"
835 let EI_LINKS+=1
836 fi
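# For illustration (hypothetical page): "https://wiki.oni2.net/Some_Page" would be flagged "EI" and the report
# would suggest replacing it with [[Some_Page]], whereas a special access URL under "/w/" is left alone.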
837
838 # If it's not, check if this is a link to a domain that we have an interwiki prefix for
839 if [ $STATUS == "??" ]; then
840 for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
841 if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]] && [[ $URL != *${INTERWIKI_DOMAINS[$i]}/w/* ]]; then
842 STATUS="IW"
843 let IW_LINKS+=1
844 INTERWIKI_INDEX=$i
845 break
846 fi
847 done
848 fi
849
850 # If we didn't match an interwiki domain, see if the status code is in our "OK" codes list
851 if [ $STATUS == "??" ]; then
852 for CODE in "${OK_CODES[@]}"; do
853 if [[ $CODE == $CURL_CODE ]]; then
854 STATUS="OK"
855 let OK_LINKS+=1
856 break
857 fi
858 done
859 fi
860
861 # If we didn't get a match with the "OK" codes, check it against the "RD" codes
862 if [ $STATUS == "??" ]; then
863 for CODE in "${RD_CODES[@]}"; do
864 if [[ $CODE == $CURL_CODE ]]; then
865 # Get URL header again in order to retrieve the URL we are being redirected to
866 NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{redirect_url}\n' $URL)
867
868 # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
869 # those changes out if the user didn't ask for them
870 URL_HTTP=$(echo $URL | sed -E 's/^https:/http:/')
871 NEW_URL_HTTP=$(echo $NEW_URL | sed -E 's/^https:/http:/')
872
873 # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
874 NEW_URL_LENGTH=$(echo | awk -v input=$NEW_URL_HTTP '{print length(input)}')
875 if [ $NEW_URL_LENGTH -lt $MIN_URL_LENGTH ]; then
876 NEW_URL_HTTP="[new URL not retrieved]"
877 fi
878
879 # Remove slash at end of new URL, if present, so we can filter out the redirects that
880 # merely add an ending slash if the user didn't ask for them
881 NEW_URL_NO_SLASH=$(echo $NEW_URL_HTTP | sed -E 's:/$::')
882
883 # Detect if this is a youtu.be link simply being expanded by YouTube to the full
884 # youtube.com address
885 YOUTU_BE=0
886 if [[ $URL_HTTP == http://youtu.be* ]] && [[ $NEW_URL_HTTP == http://www.youtube.com* ]]; then
887 YOUTU_BE=1
888 fi
889
890 # If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
891 # wants those to be reported)
892 if [ $SHOW_HTTPS -eq 0 ] && [ $URL_HTTP == $NEW_URL_HTTP ]; then
893 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show http->https upgrades, and I was redirected to '$NEW_URL'."
894 STATUS="OK"
895 let OK_LINKS+=1
896 let SKIP_HTTPS_UP+=1
897 # If the URLs match besides an added ending slash, then the link is OK (unless user wants
898 # those to be reported)
899 elif [ $SHOW_SLASH -eq 0 ] && [ $URL_HTTP == $NEW_URL_NO_SLASH ]; then
900 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show added trailing slashes, and I was redirected to '$NEW_URL'."
901 STATUS="OK"
902 let OK_LINKS+=1
903 let SKIP_SLASH_ADD+=1
904 elif [ $SHOW_YT_RD -eq 0 ] && [ $YOUTU_BE -eq 1 ]; then
905 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'."
906 STATUS="OK"
907 let OK_LINKS+=1
908 let SKIP_YOUTU_BE+=1
909 else
910 STATUS="RD"
911 let RD_LINKS+=1
912 fi
913 break
914 fi
915 done
916 fi
917
918 # If we didn't get a match with the "RD" codes, check it against the "NG" codes
919 if [ $STATUS == "??" ]; then
920 for CODE in "${NG_CODES[@]}"; do
921 if [[ $CODE == $CURL_CODE ]]; then
922 STATUS="NG"
923 let NG_LINKS+=1
924 break
925 fi
926 done
927 fi
928
929 # If we didn't match a known status code, advise the reader
930 if [ $STATUS == "??" ]; then
931 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown response code $CURL_CODE."
932 let SKIP_UNK_CODE+=1
933 continue
934 fi
935
936 # Check problem links against exceptions list before proceeding
937 FOUND_EXCEPT=0
938 if [ $STATUS != "OK" ] && [ ! -z $EXCEPT_URL ]; then
939 # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
940 EXPECT_CODE="$CURL_RESULT"
941 if [ $STATUS == "EI" ]; then
942 EXPECT_CODE="EI"
943 elif [ $STATUS == "IW" ]; then
944 EXPECT_CODE="IW"
945 fi
946
947 # Look for link in exceptions list and make sure the listed result code and wiki page also match
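# Each exception line is expected to take the form "code,URL,page name", e.g. the hypothetical line
# "404,http://example.com/old_page.htm,Main_Page"; a page name of "*" matches any page.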
948 for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
949 {
950 EXCEPT_LINE="${EXCEPT_ARRAY[$i]}"
951
952 # Match URL
953 EXCEPT_TARGET="${EXCEPT_LINE#*,}" # the URL named by this exception line
954 EXCEPT_TARGET="${EXCEPT_TARGET%,*}"
955 if [ "$EXCEPT_TARGET" != "$URL" ]; then
956 continue
957 fi
958
959 # Match containing page's name
960 EXCEPT_PAGE="${EXCEPT_LINE##*,}"
961 EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
962 if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == $LOCAL_PAGE_PATH ]; then
963 # Match result code
964 EXCEPT_CODE=${EXCEPT_LINE%%,*}
965 if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
966 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because its expected result, '$EXPECT_CODE', is in the exceptions list."
967 if [ $STATUS == "EI" ]; then
968 let SKIP_EXPECT_EI+=1
969 elif [ $STATUS == "IW" ]; then
970 let SKIP_EXPECT_IW+=1
971 else
972 let SKIP_EXPECT_NG+=1
973 fi
974 FOUND_EXCEPT=1
975 break
976 fi
977 fi
978 } done
979 fi
980 if [ $FOUND_EXCEPT -eq 1 ]; then
981 continue
982 fi
983
984 # If appropriate, record this link to the log, with clickable URLs when possible
985 if [ $STATUS != "OK" ] || [ $RECORD_OK_LINKS -eq 1 ]; then
986 # Prepare 'curl' result in parentheses to print after status code, unless this is an "IW" or "EI"
987 # link, in which case showing the status code doesn't make sense. Adjust spacing after string to
988 # ensure TXT and RTF reports have aligned columns of results.
989 CURL_STR_H=" ($CURL_RESULT)"
990 CURL_STR_T="$CURL_STR_H"
991 CURL_STR_R="$CURL_STR_H "
992 if [ $STATUS == "IW" ] || [ $STATUS == "EI" ]; then
993 CURL_STR_H=""
994 CURL_STR_T=" "
995 CURL_STR_R=" "
996 fi
997
998 # Record link and its wiki page in TXT, RTF, and HTML markup
999 valPrint t "${STATUS}${CURL_STR_T} $STR_TYPE $URL"
1000 valPrint t " linked from $FULL_PAGE_PATH"
1001 valPrint r "${STATUS}${CURL_STR_R}${RTF_TABS}$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
1002 valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
1003 valPrint hn "<tr><td style=\"white-space:nowrap\">${STATUS}${CURL_STR_H}</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
1004 valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"
1005
1006 # Place vertical space here since we won't be printing anything more about this link
1007 if [ $STATUS == "OK" ]; then valPrint tr ""; valPrint hs ""; fi
1008
1009 # Record redirect URL if one was given by a 3xx response page
1010 if [ $STATUS == "RD" ]; then
1011 valPrint ts " Server suggests $NEW_URL"
1012 valPrint rs " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
1013 valPrint hs "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
1014 fi
1015
1016 # Notify reader if we can use an intrawiki link for this URL
1017 if [ $STATUS == "EI" ]; then
1018 INTRA_PAGE=${URL#*://*/}
1019 valPrint ts " Just use [[$INTRA_PAGE]]"
1020 valPrint rs " Just use [[$INTRA_PAGE]]"
1021 valPrint hs "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
1022 fi
1023
1024 # Notify reader if we can use an interwiki prefix for this URL
1025 if [ $STATUS == "IW" ]; then
1026 INTER_PAGE=$(echo "$URL" | sed 's/.*\///')
1027 valPrint ts " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
1028 valPrint rs " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
1029 valPrint hs "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
1030 fi
1031
1032 # Query Internet Archive for latest "OK" snapshot for "NG" page
1033 if [ $STATUS == "NG" ] && [ $SUGGEST_SNAPSHOTS -eq 1 ]; then
1034 ARCHIVE_QUERY=$(curl --silent --max-time 10 "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")
1035
1036 # If a "closest" snapshot was received...
1037 if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
1038 # In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
1039 ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')
1040
1041 # ...isolate "url" property in the response that follows the "closest" tag
1042 SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
1043 SNAPSHOT_URL=${SNAPSHOT_URL#*\"url\": \"} # everything after '"url": "'
1044 SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'
1045
1046 # Remove the port 80 part that IA often adds to the URL, as it's superfluous
1047 SNAPSHOT_URL=$(echo $SNAPSHOT_URL | sed 's/:80//')
1048
1049 # Inform the user of the snapshot URL
1050 valPrint ts " IA suggests $SNAPSHOT_URL"
1051 valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
1052 valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
1053 else # ...otherwise give generic Wayback Machine link for this URL
1054 valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
1055 valPrint rs " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
1056 valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
1057 fi
1058 fi
1059 fi
1060
1061 # If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
1062 if [ $IS_FILE -eq 0 ] && [ $TAKE_PAGE_SHOT -eq 1 ] && [ $STATUS == "OK" ]; then
1063 # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
1064 SHOT_NAME=$(echo "$URL" | sed 's/https*\:\/\///' | sed 'y/:\//__/')
1065 SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"
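# e.g. the hypothetical URL "https://example.com/foo/bar" would be saved as "example.com_foo_bar.png"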
1066
1067 # Don't take screenshot if we already encountered this page and screenshotted it
1068 if [ ! -f "$SHOT_FILE" ]; then
1069 "$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 $URL > /dev/null 2>&1
1070 if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
1071 mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
1072 else
1073 valPrint trhs "Screenshot of URL $URL seems to have failed!"
1074 fi
1075 else
1076 valPrint trhs "Skipping screenshot of URL '$URL' because file '$SHOT_FILE' already exists."
1077 fi
1078 fi
1079done
1080FINISHED_LIST="yes"
1081wrapupAndExit