source: Validate External Links/validate_external_links.sh@ 1149

Last change on this file since 1149 was 1149, checked in by iritscen, 4 years ago

ValExtLinks: The messages about skipping URLs now show the wiki page's namespace. Added 504 to known response codes.

File size: 54.5 KB
Line 
1#!/bin/bash
2
3# Validate External Links by Iritscen
4#
5# Validates a list of external links in CSV format. The resulting logs are produced in three formats:
6# - TXT (for easy diffing with an earlier log)
7# - RTF (for reading as a local file with clickable links)
8# - HTML (for reading as a web page)
9# Call script with "--help" argument for documentation. Also see Read Me.rtf for critical notes.
10#
11# Recommended rule:
12# |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|
13#
14# Table of contents (sections of script in order of appearance, not execution):
15# • Globals
16# • Help Output
17# • Setup
18# • Utility Functions
19# • Summary Output
20# • Initialization
21# • Data Sourcing
22# • Config Output
23# • Legend Output
24# • Main Loop
25
# Make newline the only field separator, so unquoted expansions are not split on spaces or tabs
IFS=$'\n'

### GLOBALS ###
# Settings -- defaults that the script's command-line arguments can override
LINKS_URL=''             # 'curl' downloads the file with the links from here (file:// is allowed)
EXCEPT_URL=''            # 'curl' fetches the wiki page listing exceptions for NG results from here
OUTPUT_DIR=''            # existing folder in which a folder of reports and other output is made
RECORD_OK_LINKS=0        # log the response code even when it is one of the OK_CODES
SHOW_SLASH=0             # record an issue when a slash is added to the end of a URL
SHOW_HTTPS=0             # record an issue when "http" is upgraded to "https"
SHOW_YT_RD=0             # record the redirection of a youtu.be URL expanding to the full URL
SUGGEST_SNAPSHOTS_NG=0   # ask the Internet Archive for a possible snapshot URL for each NG page
SUGGEST_SNAPSHOTS_OK=0   # ask the Internet Archive for an existing snapshot of each OK page
CHECK_ARCHIVE_LINKS=0    # check URLs under the archive.org domain
TAKE_PAGE_SHOT=0         # take a screenshot of each OK page
TIMEOUT=10               # time to wait for a response when querying a site
CHROME_PATH=''           # path to a copy of Google Chrome with the command-line screenshot feature
URL_START=1              # start at this URL in LINKS_FILE
URL_LIMIT=0              # if non-zero, stop at this URL in LINKS_FILE
UPLOAD_INFO=''           # path to a local file with the login info needed to upload a report
48
# Fixed strings -- see the occurrences of these variables to learn their purpose
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/88.0.4324.146 Safari/537.36"
ARCHIVE_API="http://archive.org/wayback/available"
ARCHIVE_GENERIC="https://web.archive.org/web/*"
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
CHROME_SCREENSHOT="screenshot.png"
EXCEPT_FILE_NAME="exceptions.txt"
EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
WIKI_CURL="https://wiki.oni2.net/Validate_External_Links/Curl_codes"
WIKI_HTTP="https://wiki.oni2.net/Validate_External_Links/HTTP_codes"
WIKI_MAIN="https://wiki.oni2.net/Validate_External_Links"
WIKI_ME="http://iritscen.oni2.net"
# Directory containing this script. The expansions are quoted so that the result does not depend on
# the current IFS value and cannot be altered by glob characters in the script's path.
THIS_DIR=$(cd "$(dirname "$0")"; pwd)
# Directory the script was started from
WORKING_DIR=$(pwd)
WIKI_PATH="wiki.oni2.net"
64
# Parallel arrays: element N of NS_IDS is the ID of the OniGalore namespace named by element N of
# NS_NAMES
NS_IDS=(
  -2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  100 101 102 103 104 105 108 109 110 111
)
NS_NAMES=(
  "Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File"
  "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk"
  "Category" "Category_talk"
  "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk"
)

# These arrays tell the script which suffixes at the ends of URLs represent files and which are
# pages. This determines whether the script tries to take a screenshot of the URL or just gets its
# HTTP code.
HTTP_FILES=(
  3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg
  last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png py rar tga TRMA txt vbs wav wmv xaf xml zip
)
HTTP_TLDS_AND_PAGES=(
  action ars asp aspx cfm cgi com css de do full htm html it js jsp net org pgi php php3 phtml
  pl ru shtml stm uk x
)

# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and
# which are NG (no good). Pages that return OK codes will be screenshotted. Remember to update
# http_codes.txt if you add a new code.
OK_CODES=(200 401 405 406 418 501)
RD_CODES=(301 302 303 307 308)
NG_CODES=(000 400 403 404 410 429 500 502 503 504 530)

# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link
# using transcluded text, and if the transclusion fails, then the braces show up in the URL
ILLEGAL_CHARS="{ }"

# The shortest URL possible, used for sanity-checking some URLs: http://a.co
MIN_URL_LENGTH=11

# Parallel arrays: element N of INTERWIKI_PREFIXES is the prefix that can be used in place of a
# normal external link to the domain in element N of INTERWIKI_DOMAINS
INTERWIKI_PREFIXES=(
  commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies
  wikiversity wikivoyage wikt wp
)
INTERWIKI_DOMAINS=(
  commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org
  wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org
  wikiversity.org wikivoyage.org wiktionary.org wikipedia.org
)
92
# Variables for keeping track of main loop progress and findings
# Position in the list of links
LINK_NUM=0
# Tallies of links by finding (EI, IW, OK, RD, NG -- see the report legend)
EI_LINKS=0; IW_LINKS=0; OK_LINKS=0; RD_LINKS=0; NG_LINKS=0
# Tallies that the session summary adds up as links which could not be processed
SKIP_UNK_NS=0; SKIP_JS_PAGE=0; SKIP_BAD_URL=0
SKIP_NON_ASCII=0; SKIP_UNK_SUFFIX=0; SKIP_UNK_CODE=0
# Tallies that the session summary adds up as findings excepted from the report
SKIP_EXPECT_NG=0; SKIP_EXPECT_RD=0; SKIP_EXPECT_EI=0; SKIP_EXPECT_IW=0
# Tallies that the session summary adds up as trivial redirections
SKIP_HTTPS_UP=0; SKIP_SLASH_ADD=0; SKIP_YOUTU_BE=0
# Tally of Archive.org links that were not checked
SKIP_ARCHIVE_ORG=0
# Tallies of file links and page links
FILE_LINKS=0; PAGE_LINKS=0
# Set to 1 once the CSV's column header row has been seen
SKIPPED_HEADER_ROW=0
# "no" until the main loop changes it; the wrap-up code tests it against "yes" and "no"
FINISHED_LIST="no"
# Start and end of the run, in seconds since the epoch
START_RUN=0; END_RUN=0
120
121
### HELP OUTPUT ###
# Prints a pseudo-man page to stdout. The SYNOPSIS lists the two snapshot options under the names
# the argument parser actually accepts (--suggest-snapshots-ng and --suggest-snapshots-ok).
# Here is the 80-character rule for the page text:
# 234567890123456789012345678901234567890123456789012345678901234567890123456789
function printHelp()
{
  cat << EOF

NAME
       Validate External Links

SYNOPSIS
       validate_external_links.sh --help
       validate_external_links.sh --links URL --output DIR [--exceptions URL]
          [--record-ok-links] [--show-added-slashes] [--show-https-upgrades]
          [--show-yt-redirects] [--suggest-snapshots-ng]
          [--suggest-snapshots-ok] [--check-archive-links]
          [--take-screenshots FILE] [--timeout NUM] [--start-url NUM]
          [--end-url NUM] [--upload FILE]

DESCRIPTION
       This script parses a list of external links found in the OniGalore wiki
       (which is dumped by the Oni2.net server periodically in a particular
       format), validates them using the Unix tool 'curl', and produces a report
       of which links were "OK" (responded positively to an HTTP query), which
       were "RD" (responded with a 3xx redirect code), which could be "IW"
       (interwiki) links, which are "EI" (external internal) links and could be
       intrawiki links, and which were "NG" (no good; a negative response to the
       query). This report can then be automatically uploaded to the location of
       your choice. The script can also suggest Internet Archive snapshots for
       "NG" links, and take screenshots of "OK" links for visual verification by
       the reader that the page in question is the one intended to be displayed.

       You must pass this script the URL at which the list of links is found
       (--links) and the path where the directory of logs should be outputted
       (--output). All other arguments are optional.

OPTIONS
       --help                  Show this page.
       --links URL             (required) URL from which to download the CSV
                               file with external links. Note that this URL can
                               be a local file if you supply a file:// path.
       --output DIR            (required) Unix path to directory in which Val
                               should place its reports.
       --exceptions URL        In order to remove links from the report which
                               Val finds an issue with but which you regard as
                               OK, list those desired exceptions on a wiki page.
                               See the sample file "exceptions.pdf" for the
                               required format of the page. Note that this URL
                               can point to a local file if you supply a path
                               beginning with "file://".
       --record-ok-links       Log a link in the report even if its response
                               code is "OK".
       --show-added-slashes    Report on redirects that simply add a '/' to the
                               end of the URL.
       --show-https-upgrades   Report on redirects that simply upgrade a
                               "http://" URL to a "https://" URL.
       --show-yt-redirects     Report on redirects that expand a youtu.be URL.
       --suggest-snapshots-ng  Query the Internet Archive for a possible
                               snapshot URL for each "NG" page.
       --suggest-snapshots-ok  Query the Internet Archive for a snapshot of each
                               "OK" page just to make sure it's available. Note
                               that this will add a tremendous amount of time to
                               the script execution because there is a rate
                               limit to the Archive API. Note that this option
                               does nothing unless you also use the
                               --record-ok-links argument.
       --check-archive-links   Check links that are already pointing to a page
                               on the Internet Archive. In theory these links
                               should be totally stable and not need validation.
       --take-screenshots FILE Call the Google Chrome binary at this path to
                               take screenshots of each "OK" page.
       --timeout NUM           Wait this many seconds for a site to respond. The
                               default is 10. Important note: Val will attempt
                               to reach each URL three times, so the time taken
                               to ping an unresponsive site will be three times
                               this setting.
       --start-url NUM         Start at this link in the links CSV file.
       --end-url NUM           Stop at this link in the links CSV file.
       --upload FILE           Upload report using the credentials and path
                               given in this local text file. See sftp_login.txt
                               for template.

BUGS
       The script cannot properly parse any line in the external links file
       which contains a comma in the name of the wiki page containing a link.
       Commas in the link itself are not an issue.
EOF
}
209
210
### SETUP ###
# Page the help text through 'less' and quit successfully when the script was started with no
# arguments at all or with "--help" as its first argument
if [[ "$#" -eq 0 || "$1" == "--help" ]]; then
  printHelp | less
  exit 0
fi
217
# Parse the arguments passed to the script, storing each recognized option in its settings global.
# Exits with status 1 on an unrecognized argument, or when an option that takes a value is the last
# argument. In the latter case "shift 2" used to fail without shifting anything, which left the
# loop spinning forever on the same argument.
function parseArguments()
{
  while (( "$#" )); do
    # Options in this list consume the argument after them, so that argument must exist
    case "$1" in
      --links | --exceptions | --output | --take-screenshots | --timeout | --start-url | --end-url | --upload )
        if [ "$#" -lt 2 ]; then
          echo "Error: The argument $1 must be followed by a value. Aborting."
          exit 1
        fi;;
    esac

    case "$1" in
      --links )                LINKS_URL="$2"; shift 2;;
      --exceptions )           EXCEPT_URL="$2"; shift 2;;
      --output )               OUTPUT_DIR="$2"; shift 2;;
      --record-ok-links )      RECORD_OK_LINKS=1; shift;;
      --show-added-slashes )   SHOW_SLASH=1; shift;;
      --show-https-upgrades )  SHOW_HTTPS=1; shift;;
      --show-yt-redirects )    SHOW_YT_RD=1; shift;;
      --suggest-snapshots-ng ) SUGGEST_SNAPSHOTS_NG=1; shift;;
      --suggest-snapshots-ok ) SUGGEST_SNAPSHOTS_OK=1; shift;;
      --check-archive-links )  CHECK_ARCHIVE_LINKS=1; shift;;
      --take-screenshots )     TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
      --timeout )              TIMEOUT="$2"; shift 2;;
      --start-url )            URL_START="$2"; shift 2;;
      --end-url )              URL_LIMIT="$2"; shift 2;;
      --upload )               UPLOAD_INFO="$2"; shift 2;;
      * )                      echo "Invalid argument $1 detected. Aborting."; exit 1;;
    esac
  done
}
parseArguments "$@"
set -- # the former top-level loop consumed every argument, so leave none behind here either
239
# If the required arguments were not supplied, print an error and quit. The variables are quoted in
# every test below so that an empty value is tested as an empty string rather than vanishing from
# the command, and so that glob characters in a value are never expanded.
if [ -z "$LINKS_URL" ] || [ -z "$OUTPUT_DIR" ]; then
  echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
  exit 2
fi

# If user wants screenshots, make sure path to Chrome was passed in and is valid
if [ "$TAKE_PAGE_SHOT" -eq 1 ]; then
  if [ ! -f "$CHROME_PATH" ]; then
    echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
    exit 3
  fi
fi

# Check that UPLOAD_INFO exists, if this argument was supplied
if [ -n "$UPLOAD_INFO" ] && [ ! -f "$UPLOAD_INFO" ]; then
  echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
  exit 4
fi

# Check that OUTPUT_DIR is a directory
if [ ! -d "$OUTPUT_DIR" ]; then
  echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
  exit 5
fi
265
# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
# Matches the format of LINKS_DATE, which is taken from an HTTP Last-Modified header; that format
# zero-pads the hour, hence %H rather than the space-padded %k
NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %H:%M:%S GMT")
OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
SHOT_PATH="$OUTPUT_PATH/Screenshots"
LOG_NAME="ValExtLinks report"
LOG_NAME_TXT="$LOG_NAME.txt"
LOG_NAME_RTF="$LOG_NAME.rtf"
LOG_NAME_HTM="$LOG_NAME.htm"
LOG_PATH="$OUTPUT_PATH/$LOG_NAME"
LOG_PATH_TXT="$LOG_PATH.txt"
LOG_PATH_RTF="$LOG_PATH.rtf"
LOG_PATH_HTM="$LOG_PATH.htm"
mkdir "$OUTPUT_PATH"

# Check that 'mkdir' succeeded before trying to create anything inside the new folder. The message
# names OUTPUT_DIR, the directory in which the folder was supposed to be made.
if [ ! -d "$OUTPUT_PATH" ]; then
  echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_DIR. Aborting."
  exit 6
fi

# The screenshot folder is only needed when screenshots were requested
if [ "$TAKE_PAGE_SHOT" -eq 1 ]; then
  mkdir "$SHOT_PATH"
fi
290
# Get date on the file at LINKS_URL; it is printed to the logs later. The header name is matched in
# either capitalization because curl prints HTTP/2 header names in lowercase, and the carriage
# return that ends each header line is removed so that it does not end up in the reports. Only the
# date itself (the text after "Last-Modified:") is kept.
LINKS_DATE=$(curl --silent --head "$LINKS_URL" | tr -d '\r' | sed -n 's/^[Ll]ast-[Mm]odified:[[:space:]]*//p' | head -n 1)
if [ -z "$LINKS_DATE" ]; then
  echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
  exit 7
fi
298
299
### UTILITY FUNCTIONS ###
# Sends the report's plain-text preamble to the TXT log via valPrint: title, generation time
# (NICE_TIME), date of the source data (LINKS_DATE), author contact (WIKI_ME), then an empty line
function printTXTheader()
{
  local header_line
  for header_line in \
    "Validate External Links report" \
    "generated $NICE_TIME" \
    "from data of $LINKS_DATE" \
    "script by Iritscen (contact: $WIKI_ME)" \
    ""; do
    valPrint t "$header_line"
  done
}
310
# Sends the opening RTF markup to the RTF log via valPrint: the document header, a centered title
# block (title, NICE_TIME, LINKS_DATE and a contact hyperlink to WIKI_ME), and finally a paragraph
# definition without the centering code so that the text which follows is left-justified
function printRTFheader()
{
  local rtf_preamble="{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww12600\viewh12100\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0

\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
generated $NICE_TIME\\
from data of $LINKS_DATE\\
script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$WIKI_ME\"}}{\fldrslt contact}})
\\
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
\cf0 "

  valPrint r "$rtf_preamble"
}
329
# Sends the brace that closes the RTF document to the RTF log
function printRTFfooter()
{
  local rtf_close='}'
  valPrint r "$rtf_close"
}
335
# Sends the opening HTML markup to the HTML log via valPrint: the document head with the page
# title, then a heading block with the title, NICE_TIME, LINKS_DATE and a contact link to WIKI_ME
function printHTMheader()
{
  local html_preamble="<html>
<head>
<title>Validate External Links report</title>
</head>
<body>
<h2>Validate External Links report</h2>
<h3>generated $NICE_TIME<br />
from data of $LINKS_DATE<br />
script by Iritscen (<a href=\"$WIKI_ME\" target=\"_blank\">contact</a>)</h3>"

  valPrint h "$html_preamble"
}
349
# Sends the tags that close the HTML document's body and the document itself to the HTML log
function printHTMfooter()
{
  local html_close=$'</body>\n</html>'
  valPrint h "$html_close"
}
356
# The central logging function.
# Parameter 1 is a string of one-letter flags; parameter 2 is the text to print.
# Destination flags (any combination):
#   c = console, t = TXT log, r = RTF log, h = HTML log
# Modifier flags:
#   n = no line ending: the console and TXT log get no trailing newline; the RTF and HTML logs get
#       the text without their "\line" / "<br />" markup (the line in the file itself still ends)
#   s = extra line ending: a blank line (console, TXT), a second "\line" (RTF) or an empty table
#       row (HTML) follows the text
#   w = console only: do not pass the text through 'fmt' ("fmt" fits the output to an 80-column CLI
#       but can break special formatting)
# When several modifiers are given, the first match wins, in this order: n, w, s for the console;
# n, s for the TXT and RTF logs; s, n for the HTML log.
function valPrint()
{
  local flags="$1"
  local text="$2"

  # Console
  if [[ "$flags" == *c* ]]; then
    case "$flags" in
      *n* ) echo -n "$text";;
      *w* ) echo "$text";;
      *s* ) echo -e "$text\n";;
      * )   echo "$text" | fmt -w 80;;
    esac
  fi

  # TXT log
  if [[ "$flags" == *t* ]]; then
    case "$flags" in
      *n* ) echo -n "$text" >> "$LOG_PATH_TXT";;
      *s* ) echo -e "$text\n" >> "$LOG_PATH_TXT";;
      * )   echo "$text" >> "$LOG_PATH_TXT";;
    esac
  fi

  # RTF log
  if [[ "$flags" == *r* ]]; then
    case "$flags" in
      *n* ) echo "$text" >> "$LOG_PATH_RTF";;
      *s* ) echo "$text\line\line" >> "$LOG_PATH_RTF";;
      * )   echo "$text\line" >> "$LOG_PATH_RTF";;
    esac
  fi

  # HTML log
  if [[ "$flags" == *h* ]]; then
    case "$flags" in
      *s* ) echo "$text<tr><td>&nbsp;</td></tr>" >> "$LOG_PATH_HTM";;
      *n* ) echo "$text" >> "$LOG_PATH_HTM";;
      * )   echo "$text<br />" >> "$LOG_PATH_HTM";;
    esac
  fi
}
403
# Pluralize the string in parameter 1 if the number in parameter 2 is not 1. A noun ending in "x"
# gets "es"; any other noun gets "s". A missing or empty number is treated as 0 (plural) instead of
# making the test command print an error, which happened when a count had not been set yet.
function pluralCheckNoun()
{
  local noun="$1"
  local count="${2:-0}"

  if [ "$count" -ne 1 ]; then
    if [[ "$noun" =~ x$ ]]; then
      echo "${noun}es"
    else
      echo "${noun}s"
    fi
  else
    echo "$noun"
  fi
}
417
# Output "is" if parameter 1 is 1, otherwise "are". A missing or empty parameter is treated as 0
# rather than triggering an error message from the test command.
function pluralCheckIs()
{
  local count="${1:-0}"

  if [ "$count" -ne 1 ]; then
    echo "are"
  else
    echo "is"
  fi
}
427
# Output "was" if parameter 1 is 1, otherwise "were". A missing or empty parameter is treated as 0
# rather than triggering an error message from the test command.
function pluralCheckWas()
{
  local count="${1:-0}"

  if [ "$count" -ne 1 ]; then
    echo "were"
  else
    echo "was"
  fi
}
437
# Output "a " if parameter 1 is 1, otherwise nothing. A missing or empty parameter is treated as 0
# rather than triggering an error message from the test command.
function pluralCheckA()
{
  local count="${1:-0}"

  if [ "$count" -eq 1 ]; then
    echo "a "
  fi
}
445
# Output "an " if parameter 1 is 1, otherwise nothing. A missing or empty parameter is treated as 0
# rather than triggering an error message from the test command.
function pluralCheckAn()
{
  local count="${1:-0}"

  if [ "$count" -eq 1 ]; then
    echo "an "
  fi
}
453
# Upload the reports using info specified in the --upload argument. ONLY USE "valPrint c" here, as
# the reports being saved to disk have already been closed.
# The login file (UPLOAD_INFO) is searched for lines containing the markers "user:", "pw:", "port:"
# and "path:"; the marker is removed from the front of each matching line to get the value. The
# 'expect' script next to this script is then run once for each of the HTM, RTF and TXT reports.
function uploadReport()
{
  valPrint c "Uploading reports..."

  SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
  SFTP_USER_NAME_MARKER="user:"
  SFTP_PASSWORD_MARKER="pw:"
  SFTP_PORT_MARKER="port:"
  SFTP_PATH_MARKER="path:"
  SFTP_USER_NAME=$(grep -- "$SFTP_USER_NAME_MARKER" "$UPLOAD_INFO")
  SFTP_USER_NAME=${SFTP_USER_NAME#"$SFTP_USER_NAME_MARKER"}
  SFTP_PASSWORD=$(grep -- "$SFTP_PASSWORD_MARKER" "$UPLOAD_INFO")
  SFTP_PASSWORD=${SFTP_PASSWORD#"$SFTP_PASSWORD_MARKER"}
  SFTP_PORT=$(grep -- "$SFTP_PORT_MARKER" "$UPLOAD_INFO")
  SFTP_PORT=${SFTP_PORT#"$SFTP_PORT_MARKER"}
  SFTP_PATH=$(grep -- "$SFTP_PATH_MARKER" "$UPLOAD_INFO")
  SFTP_PATH=${SFTP_PATH#"$SFTP_PATH_MARKER"}

  local upload_status
  for SUFFIX in htm rtf txt; do
    # The values are quoted so that glob characters in them (e.g. in the password) reach 'expect'
    # unchanged
    expect "$SCRIPT_PATH" "$LOG_PATH.$SUFFIX" "$SFTP_USER_NAME" "$SFTP_PASSWORD" "$SFTP_PORT" "$SFTP_PATH" "$LOG_NAME.$SUFFIX"

    # Save the status right away; by the time the message is built, $? would hold the status of
    # the test command instead of the status of 'expect'
    upload_status=$?
    if [ "$upload_status" -ne 0 ]; then
      valPrint c "Error $upload_status occurred when attempting to upload $LOG_NAME.$SUFFIX!"
    else
      valPrint c "Report in $(echo "$SUFFIX" | tr '[:lower:]' '[:upper:]') format was uploaded."
    fi
  done
}
484
# Prints the session summary to the console and the logs, closes the logs' markup, uploads the
# reports if that was requested and the session was not cut short, and exits with status 0. Also
# installed below as the handler for the INT signal.
# This function can be called before the main loop has started (for instance when a download
# fails), so it copes with START_RUN still being 0, LINK_NUM still being 0 and LINK_COUNT not
# being set yet.
function wrapupAndExit()
{
  # Get off progress line on console, drop down a line from last link in log, and close HTML table
  valPrint ctr ""
  valPrint h "</table><br />"

  # If we didn't finish processing the last URL, then the iterator is one too high (unless it never
  # left zero, in which case there is nothing to take back)
  if [ "$FINISHED_LIST" != "yes" ]; then
    if [ "$LINK_NUM" -gt 0 ]; then
      LINK_NUM=$((LINK_NUM - 1))
    fi
    if [ "$FINISHED_LIST" == "no" ]; then
      valPrint ctrh "The session was canceled by the user."
    fi
  fi

  # Generate string with elapsed time; a START_RUN of 0 means that the main loop never started
  END_RUN=$(date +%s)
  local elapsed_secs=0
  if [ "$START_RUN" -gt 0 ]; then
    elapsed_secs=$((END_RUN - START_RUN))
  fi
  ELAPSED="$((elapsed_secs / 60)) min. $((elapsed_secs % 60)) sec. elapsed"

  # Do some math on results of session
  LINK_COUNT=${LINK_COUNT:-0}
  LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
  if [ "$LINKS_PROCESSED" -lt 0 ]; then
    LINKS_PROCESSED=0
  fi
  TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
  LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
  LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_RD+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
  LINK_PROBLEMS_TOTAL=$((NG_LINKS+RD_LINKS+EI_LINKS+IW_LINKS))
  LINK_PROBLEMS_NG=$((NG_LINKS-SKIP_EXPECT_NG))
  LINK_PROBLEMS_RD=$((RD_LINKS-SKIP_EXPECT_RD))
  LINK_PROBLEMS_EI=$((EI_LINKS-SKIP_EXPECT_EI))
  LINK_PROBLEMS_IW=$((IW_LINKS-SKIP_EXPECT_IW))
  LINK_PROBLEMS_NET=$((LINK_PROBLEMS_NG+LINK_PROBLEMS_RD+LINK_PROBLEMS_EI+LINK_PROBLEMS_IW))

  # Print something in the Links section if no link issues were printed. The HTML log gets its
  # note when no problems remain after exceptions; the TXT and RTF logs only when there were no
  # problems at all.
  if [ "$LINK_PROBLEMS_NET" -eq 0 ]; then
    valPrint h "<i>No link problems to report! See the <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for a list of links with issues that were not reported.</i>"
  fi
  if [ "$LINK_PROBLEMS_TOTAL" -eq 0 ]; then
    valPrint t "No link problems to report!"
    valPrint r "\i1 No link problems to report! \i0"
  fi

  ## SUMMARY OUTPUT ##
  valPrint ct "Summary ($ELAPSED):"
  valPrint r "\b1 Summary \b0 ($ELAPSED)"
  valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
  valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link "$LINK_COUNT") (there $(pluralCheckWas "$FILE_LINKS") $FILE_LINKS file $(pluralCheckNoun link "$FILE_LINKS") and $PAGE_LINKS page $(pluralCheckNoun link "$PAGE_LINKS"))."

  # Print processed link totals
  if [ "$LINKS_PROCESSED" -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link "$LINKS_PROCESSED"):"; fi
  if [ "$LINK_ERRORS" -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link "$LINK_ERRORS") could not be processed"; fi
  if [ "$SKIP_ARCHIVE_ORG" -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVE_ORG Archive.org $(pluralCheckNoun link "$SKIP_ARCHIVE_ORG") were not checked"; fi
  if [ "$LINK_PROBLEMS_TOTAL" -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_TOTAL processed $(pluralCheckNoun link "$LINK_PROBLEMS_TOTAL") had $(pluralCheckAn "$LINK_PROBLEMS_TOTAL")$(pluralCheckNoun issue "$LINK_PROBLEMS_TOTAL")"; fi
  if [ "$LINKS_EXCEPTED" -gt 0 ]; then
    valPrint ctr "  (excepted $LINKS_EXCEPTED link $(pluralCheckNoun issue "$LINKS_EXCEPTED") from report)"
    valPrint h "&nbsp;&nbsp;(excepted $LINKS_EXCEPTED $(pluralCheckNoun link "$LINKS_EXCEPTED") from report)"
  fi
  if [ "$OK_LINKS" -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link "$OK_LINKS") $(pluralCheckWas "$OK_LINKS") OK"; fi
  if [ "$TRIVIAL_RDS" -gt 0 ]; then
    valPrint ctr "  (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection "$TRIVIAL_RDS") as OK)"
    valPrint h "&nbsp;&nbsp;(counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection "$TRIVIAL_RDS") as OK)"
  fi

  # Print errored link totals
  if [ "$LINK_ERRORS" -gt 0 ]; then
    valPrint c "$LINK_ERRORS link $(pluralCheckNoun error "$LINK_ERRORS") (see RTF or TXT report for specific links):"
    valPrint h "$LINK_ERRORS link $(pluralCheckNoun error "$LINK_ERRORS") (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
    valPrint rt "$LINK_ERRORS link $(pluralCheckNoun error "$LINK_ERRORS"):"
  fi
  if [ "$SKIP_UNK_NS" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace "$SKIP_UNK_NS")"; fi
  if [ "$SKIP_JS_PAGE" -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link "$SKIP_JS_PAGE") on $(pluralCheckA "$SKIP_JS_PAGE")JavaScript $(pluralCheckNoun page "$SKIP_JS_PAGE")"; fi
  if [ "$SKIP_BAD_URL" -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL "$SKIP_BAD_URL")"; fi
  if [ "$SKIP_NON_ASCII" -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL "$SKIP_NON_ASCII")"; fi
  if [ "$SKIP_UNK_SUFFIX" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix "$SKIP_UNK_SUFFIX")"; fi
  if [ "$SKIP_UNK_CODE" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code "$SKIP_UNK_CODE")"; fi

  # Print excepted link totals
  if [ "$LINKS_EXCEPTED" -gt 0 ]; then
    valPrint c "$LINKS_EXCEPTED link $(pluralCheckNoun problem "$LINKS_EXCEPTED") excepted (see RTF or TXT report for specific links):"
    valPrint h "$LINKS_EXCEPTED link $(pluralCheckNoun problem "$LINKS_EXCEPTED") excepted (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
    valPrint rt "$LINKS_EXCEPTED link $(pluralCheckNoun problem "$LINKS_EXCEPTED") excepted:"
  fi
  if [ "$SKIP_EXPECT_NG" -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link "$NG_LINKS")"; fi
  if [ "$SKIP_EXPECT_RD" -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_RD/$RD_LINKS $(pluralCheckNoun redirection "$RD_LINKS")"; fi
  if [ "$SKIP_EXPECT_EI" -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link "$EI_LINKS")"; fi
  if [ "$SKIP_EXPECT_IW" -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link "$IW_LINKS")"; fi

  # Print checked link totals
  if [ "$LINK_PROBLEMS_NET" -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS_NET link $(pluralCheckNoun issue "$LINK_PROBLEMS_NET"):"; fi
  if [ "$LINK_PROBLEMS_NG" -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_NG NG $(pluralCheckNoun link "$LINK_PROBLEMS_NG")"; fi
  if [ "$LINK_PROBLEMS_RD" -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_RD $(pluralCheckNoun redirection "$LINK_PROBLEMS_RD")"; fi
  if [ "$LINK_PROBLEMS_EI" -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_EI $(pluralCheckNoun link "$LINK_PROBLEMS_EI") that could be intrawiki"; fi
  if [ "$LINK_PROBLEMS_IW" -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_IW $(pluralCheckNoun link "$LINK_PROBLEMS_IW") that could be interwiki"; fi

  # Close the log files' markup
  valPrint trh "ValExtLinks says goodbye."
  printRTFfooter
  printHTMfooter

  # Upload report if this was requested, but not when FINISHED_LIST is still "no"
  if [ -n "$UPLOAD_INFO" ] && [ "$FINISHED_LIST" != "no" ]; then
    uploadReport
  fi

  # Really quit now
  valPrint c "ValExtLinks says goodbye."
  exit 0
}
trap wrapupAndExit INT
586
587
### INITIALIZATION ###
# Show the script's name in bold on the console (unformatted by 'fmt'), then start each of the
# three log files with its header
valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
printTXTheader; printRTFheader; printHTMheader

## DATA SOURCING ##
# "Startup" section heading, in the markup of each log format
valPrint hn "<h3>Startup</h3>"
valPrint r "\b1 Startup \b0"
valPrint t "Startup:"
599
# Attempt to download file at LINKS_URL, then check that it succeeded. The file keeps the name that
# follows the last slash of the URL. Both curl's exit status and the presence of the file are
# checked, and --fail makes curl report an HTTP error status as a failure instead of saving the
# server's error page as if it were the list of links.
valPrint cwtrhn "Downloading list of external links from $LINKS_URL..."
LINKS_FILE_NAME=${LINKS_URL##*/}
LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
if ! curl --silent --fail -o "$LINKS_FILE" "$LINKS_URL" || [ ! -f "$LINKS_FILE" ]; then
  echo -e "\nThe download of $LINKS_URL appears to have failed. Aborting."
  wrapupAndExit
else
  valPrint ctrh " success."
fi
611
# Attempt to download file at EXCEPT_URL, then check that it succeeded. Only the text between the
# "BEGIN LIST" and "END LIST" markers is kept.
if [ -n "$EXCEPT_URL" ]; then
  valPrint cwtrhn "Downloading list of reporting exceptions from $EXCEPT_URL..."
  EXCEPT_DATA=$(curl --silent "$EXCEPT_URL")
  if [ -z "$EXCEPT_DATA" ]; then
    echo -e "\nThe download of the exceptions data from '$EXCEPT_URL' appears to have failed. Aborting."
    wrapupAndExit
  else
    valPrint ctrh " success."
  fi
  EXCEPT_DATA=${EXCEPT_DATA%END LIST*}
  EXCEPT_DATA=${EXCEPT_DATA#*BEGIN LIST}
  EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"

  # Store on disk for debugging purposes
  echo "$EXCEPT_DATA" > "$EXCEPT_FILE"

  # Transfer to array for easy searching later, one element per non-empty line. The lines are read
  # one at a time instead of being word-split from an unquoted expansion, so that an entry which
  # contains glob characters (URLs often contain '?') is never replaced by matching file names.
  EXCEPT_ARRAY=()
  while IFS= read -r EXCEPT_LINE || [ -n "$EXCEPT_LINE" ]; do
    if [ -n "$EXCEPT_LINE" ]; then
      EXCEPT_ARRAY+=("$EXCEPT_LINE")
    fi
  done <<< "$EXCEPT_DATA"
fi
632
# Count the lines of LINKS_FILE. 'grep -c' with an empty pattern prints just the number (no file
# name, no padding) and, unlike 'wc -l', also counts a last line that does not end in a newline.
LINK_COUNT_STRING=$(grep -c '' "$LINKS_FILE")

# Number of URLs is number of lines minus one (first line is column header row for the CSV)
LINK_COUNT=$((LINK_COUNT_STRING - 1))
valPrint ctrh "Found $LINK_COUNT links to process."
valPrint trh ""
641
## CONFIG OUTPUT ##
# "Config" section heading, in the markup of each log format
valPrint t "Config:"
valPrint r "\b1 Config \b0"
valPrint hn "<h3>Config</h3>"

# Prints the label in parameter 1, then on the same line the answer in parameter 3 if the flag in
# parameter 2 equals 1, or else the answer in parameter 4
function printConfigFlag()
{
  valPrint ctrhn "$1"
  if [ $2 -eq 1 ]; then
    valPrint ctrh "$3"
  else
    valPrint ctrh "$4"
  fi
}

# The range of links is spelled out whenever it is narrower than the whole list
valPrint ctrhn "Links to consider: "
if [ $URL_LIMIT -ne 0 ] && [ $URL_LIMIT -lt $LINK_COUNT ]; then
  valPrint ctrh "$((URL_LIMIT-URL_START+1)), from link $URL_START to $URL_LIMIT"
else
  if [ $URL_START -ne 1 ]; then
    valPrint ctrh "$((LINK_COUNT-URL_START+1)), from link $URL_START to $LINK_COUNT"
  else
    valPrint ctrh "$LINK_COUNT"
  fi
fi

valPrint ctrh "Site query timeout: $TIMEOUT seconds"

# The "Ignore ..." lines answer "No" when the matching "show" setting is switched on
printConfigFlag "Show OK links: " $RECORD_OK_LINKS "Yes" "No"
printConfigFlag "Take screenshots: " $TAKE_PAGE_SHOT "Yes" "No"
printConfigFlag "Suggest archive.org snapshots for NG pages: " $SUGGEST_SNAPSHOTS_NG "Yes" "No"
printConfigFlag "Suggest archive.org snapshots for OK pages: " $SUGGEST_SNAPSHOTS_OK "Yes" "No"
printConfigFlag "Ignore slash-adding redirects: " $SHOW_SLASH "No" "Yes"
printConfigFlag "Ignore HTTPS-upgrading redirects: " $SHOW_HTTPS "No" "Yes"
printConfigFlag "Ignore youtu.be redirects: " $SHOW_YT_RD "No" "Yes"
printConfigFlag "Check archive.org links: " $CHECK_ARCHIVE_LINKS "Yes" "No"

# Point the reader to the summary; the HTML log links to the element with the ID "summary"
valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
valPrint trh ""
685
## LEGEND OUTPUT ##
# Prints the "Legend" section to the three log files. Where an entry refers to a wiki page, the TXT
# log shows the bare URL while the RTF and HTML logs get a hyperlink in their own markup.
function printLegend()
{
  local guidance="(For guidance in fixing these links, see"
  local http_note="(xxx) = Unix tool 'curl' obtained this HTTP response status code"
  local curl_note="(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code"

  valPrint t "Legend:"
  valPrint r "\b1 Legend \b0"
  valPrint hn "<h3>Legend</h3>"

  valPrint t "$guidance $WIKI_MAIN.)"
  valPrint r "$guidance {\field{\*\fldinst{HYPERLINK \"$WIKI_MAIN\"}}{\fldrslt here}}.)"
  valPrint h "$guidance <a href=\"$WIKI_MAIN\" target=\"_blank\">here</a>.)"

  valPrint trh "OK = URL seems to be working"
  valPrint trh "NG = URL no longer seems to work"
  valPrint trh "RD = URL is redirecting to this new URL"
  valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup"
  valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup"

  valPrint t "$http_note (see here for code reference: $WIKI_HTTP)"
  valPrint r "$http_note (see {\field{\*\fldinst{HYPERLINK \"$WIKI_HTTP\"}}{\fldrslt here}} for code reference)"
  valPrint h "$http_note (see <a href=\"$WIKI_HTTP\" target=\"_blank\">here</a> for code reference)"

  valPrint t "$curl_note (see here for code reference: $WIKI_CURL)"
  valPrint r "$curl_note (see {\field{\*\fldinst{HYPERLINK \"$WIKI_CURL\"}}{\fldrslt here}} for code reference)"
  valPrint h "$curl_note (see <a href=\"$WIKI_CURL\" target=\"_blank\">here</a> for code reference)"

  valPrint trh "IA suggests = Last available snapshot returned by the Internet Archive"
  valPrint trh "Try browsing = The Archive failed to return a snapshot URL, so check for a snapshot manually using this link"
  valPrint trh ""
}
printLegend
707
708
709### MAIN LOOP ###
710valPrint t "Links:"
711valPrint r "\b1 Links \b0"
712valPrint hn "<h3>Links</h3>"
713START_RUN=$(date +%s)
714# Process each line of the .csv in LINKS_FILE
715for LINE in `cat "$LINKS_FILE"`; do
# Note when we started on this link (used further down to pace Internet Archive queries) and
# advance the running link counter
START_LINK=$(date +%s)
(( LINK_NUM += 1 ))

# First line is the column header row for the CSV, so let's verify that the format hasn't changed.
# $LINE is quoted so that glob characters in the line cannot be expanded into file names.
if (( SKIPPED_HEADER_ROW == 0 )); then
  if [[ "$LINE" == "namespace,title,target" ]]; then
    SKIPPED_HEADER_ROW=1
    LINK_NUM=0 # this line is not a link, so reset the link counter
    valPrint hn "<table>"
    continue
  else
    valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
    wrapupAndExit
  fi
fi

# Skip this link if we are not at URL_START yet
if (( LINK_NUM < URL_START )); then
  continue
fi

# Stop if we are at the limit declared for testing purposes
if (( URL_LIMIT > 0 && LINK_NUM > URL_LIMIT )); then
  FINISHED_LIST="limit"
  wrapupAndExit
fi

# Print progress to screen
if (( LINK_NUM > 1 )); then
  printf "\e[1A\n" # erase previous progress message so that new one appears in its place
fi
valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
748
# The number of the namespace is the element before the first comma on the line
NS_ID=${LINE%%,*}

# Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES. The scan is
# only attempted when the field is an integer: "NULL", an empty field or any other text can never
# match, and feeding those to a numeric test would only produce shell errors.
NS_NAME=""
a=0
if [[ "$NS_ID" =~ ^-?[0-9]+$ ]]; then
  while [[ -n "${NS_IDS[$a]}" ]]; do # an empty element means we are past the end of the array
    if [ "$NS_ID" -eq "${NS_IDS[$a]}" ]; then
      NS_NAME="${NS_NAMES[$a]}"
      break
    fi
    (( a += 1 ))
  done
fi

# No name found: tell the reader why this line is being skipped and move on to the next one
if [[ -z "$NS_NAME" ]]; then
  if [[ "$NS_ID" == "NULL" ]]; then
    valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace (and probably the page too) is 'NULL'. Probably the link is no longer in existence on the wiki."
  else
    valPrint trs "Skipping line $LINK_NUM ('$LINE') because I could not find a name for namespace ID $NS_ID."
  fi
  (( SKIP_UNK_NS += 1 ))
  (( PAGE_LINKS += 1 ))
  continue
fi
774
# The name of the page is everything between the namespace ID and the next comma on the line (commas
# in page names will break this)
PAGE_NAME=${LINE#"$NS_ID",}
PAGE_NAME=${PAGE_NAME%%,*}

# Build longer wiki page URLs from namespace and page names. Namespace "Main:" (ID 0) cannot be a
# part of the path; it's an implicit namespace, and naming it explicitly breaks the link
if [ "$NS_ID" -eq 0 ]; then
  FULL_PAGE_PATH=https://$WIKI_PATH/$PAGE_NAME
  LOCAL_PAGE_PATH=$PAGE_NAME
else
  FULL_PAGE_PATH=https://$WIKI_PATH/$NS_NAME:$PAGE_NAME
  LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
fi

# We don't want to consider wiki pages ending in .js, as the MW parser cannot reliably isolate URLS
# in JavaScript code, so it returns erroneous links. The suffix is whatever follows the last period
# (the whole name if there is no period); it is taken with parameter expansion so that glob
# characters in the page name are never expanded, and compared quoted so an empty suffix is harmless
PAGE_NAME_SUFFIX=${PAGE_NAME##*.}
if [[ "$PAGE_NAME_SUFFIX" == "js" ]]; then
  valPrint trs "Skipping URL '${LINE#$NS_ID,$PAGE_NAME,}' on line $LINK_NUM because it was found on JavaScript page '$LOCAL_PAGE_PATH'."
  (( SKIP_JS_PAGE += 1 ))
  (( PAGE_LINKS += 1 ))
  continue
fi
799
# Everything after the namespace ID and page name fields is the URL being linked to (this allows
# commas to be in the URLs, but a comma in the page name field will break this)
URL=${LINE#$NS_ID,$PAGE_NAME,}

# Reject URLs containing any of the characters listed in ILLEGAL_CHARS
if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
  valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because it contains characters illegal in a URL."
  SKIP_BAD_URL=$((SKIP_BAD_URL + 1))
  PAGE_LINKS=$((PAGE_LINKS + 1))
  continue
fi

# Pass over Archive.org links unless we were asked to check them
if [[ $CHECK_ARCHIVE_LINKS -eq 0 && $URL == *web.archive.org* ]]; then
  valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to check Wayback Machine links."
  SKIP_ARCHIVE_ORG=$((SKIP_ARCHIVE_ORG + 1))
  PAGE_LINKS=$((PAGE_LINKS + 1))
  continue
fi
819
# To tell files from web pages we first need to know whether the URL ends in a suffix
HAS_SUFFIX=0

# Cut the URL off at the first '?' or '#', whichever comes first, so that query strings such as
# ".php?foo=bar" and anchors such as "#section_15" are not mistaken for part of the ending
CLEAN_URL=${URL%%[?#]*}

# Non-ASCII URLs are not handled here; skip them and make the user check them
if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
  valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I cannot handle non-ASCII characters."
  SKIP_NON_ASCII=$((SKIP_NON_ASCII + 1))
  PAGE_LINKS=$((PAGE_LINKS + 1))
  continue
fi

# Isolate the characters after the last period and after the last slash
POST_DOT=${CLEAN_URL##*.}
POST_SLASH=${CLEAN_URL##*/}

# The URL ends in a suffix when the last period comes after the last slash, i.e. when the text
# following the period is shorter than the text following the slash
POST_DOT_LENGTH=${#POST_DOT}
POST_SLASH_LENGTH=${#POST_SLASH}
if (( POST_DOT_LENGTH < POST_SLASH_LENGTH )); then
  HAS_SUFFIX=1
else
  HAS_SUFFIX=0
fi
850
# Decide whether the URL points at a file (IS_FILE=1) or a web page (IS_FILE=0); -1 means "not yet
# known". A URL without a suffix is always a page; otherwise the suffix is compared against the
# known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES
IS_FILE=-1
if (( HAS_SUFFIX == 0 )); then
  IS_FILE=0
else
  # Suffix comparisons below are case-insensitive
  shopt -s nocasematch

  # Special cases that mark the end of a web page URL rather than a file name:
  # - an all-numeric suffix, e.g. "productID.304297400"
  # - a suffix ending in a parens, e.g. "ms537113(v=vs.85)"
  # - a suffix containing a '%', e.g. "10.1007%2FBF00329055"
  if [[ $POST_DOT =~ ^-?[0-9]+$ ]] || [[ $POST_DOT =~ ^.*[\(\)]$ ]] || [[ $POST_DOT == *%* ]]; then
    IS_FILE=0
  fi

  # Still undecided? Look for the suffix among the known file extensions
  if (( IS_FILE == -1 )); then
    for EXTENSION in "${HTTP_FILES[@]}"; do
      if [[ $EXTENSION == $POST_DOT ]]; then # double brackets respect the nocasematch setting
        IS_FILE=1
        break
      fi
    done
  fi

  # Still undecided? Look for the suffix among the known pages and TLDs; if it is not there either,
  # it is an unknown ending that the user needs to add to HTTP_FILES or HTTP_TLDS_AND_PAGES
  if (( IS_FILE == -1 )); then
    for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
      if [[ $PAGE_OR_TLD == $POST_DOT ]]; then
        IS_FILE=0
        break
      fi
    done
  fi

  # Restore case-sensitive matching in Bash
  shopt -u nocasematch
fi

# Label the link as a file or a page and count it; a suffix that escaped identification as either
# a file, page or TLD is reported to the user and the link is skipped
STR_TYPE=""
case $IS_FILE in
  -1)
    valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I encountered the unknown URL ending '$POST_DOT'. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
    (( SKIP_UNK_SUFFIX += 1 ))
    continue
    ;;
  1)
    STR_TYPE="file"
    (( FILE_LINKS += 1 ))
    ;;
  *)
    STR_TYPE="page"
    (( PAGE_LINKS += 1 ))
    ;;
esac
916
# Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
# issue with sites that require HTTPS. $AGENT must be in double quotes: inside single quotes the
# variable is not expanded and the literal text "$AGENT" would be sent as the user agent. $URL is
# quoted so that '?', '*' and '[' in it are never treated as a file name pattern.
CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time "$TIMEOUT" --retry 2 --write-out '%{http_code}\n' "$URL")
CURL_ERR=$? # exit status of 'curl' itself
CURL_RESULT=$CURL_CODE

# Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
if [[ "$CURL_CODE" == "000" ]]; then
  CURL_RESULT="$CURL_RESULT-$CURL_ERR"
fi
927
# Start working out the status code for this URL (EI, IW, OK, RD, or NG); "??" means undecided
STATUS="??"
NEW_URL=""
INTERWIKI_INDEX=-1

# An "external internal" link points at our own wiki and can be replaced by "[[page_name]]". URLs
# using a special access path beginning with "/w/" are let through, as they probably cannot be
# replaced by "[[ ]]" markup
if [[ $URL == *$WIKI_PATH* && $URL != *$WIKI_PATH/w/* ]]; then
  STATUS="EI"
  (( EI_LINKS += 1 ))
fi

# Otherwise see whether the URL is on a domain that we have an interwiki prefix for, remembering
# which one matched (archive.org links to a page from an interwiki domain do not count)
if [[ $STATUS == "??" && $URL != *web.archive.org* ]]; then
  i=0
  while (( i < ${#INTERWIKI_DOMAINS[@]} )); do
    if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* && $URL != *${INTERWIKI_DOMAINS[$i]}/w/* ]]; then
      STATUS="IW"
      (( IW_LINKS += 1 ))
      INTERWIKI_INDEX=$i
      break
    fi
    (( ++i ))
  done
fi
953
# If we didn't match an interwiki domain, see if the status code is in our "OK" codes list
if [[ "$STATUS" == "??" ]]; then
  for CODE in "${OK_CODES[@]}"; do
    if [[ "$CODE" == "$CURL_CODE" ]]; then
      STATUS="OK"
      (( OK_LINKS += 1 ))

      # If this is a YouTube link, we have to look at the actual page source to know if the video
      # is good or not. $AGENT is double-quoted so that its value, not the literal text "$AGENT",
      # is sent as the user agent; $URL is quoted to keep it from being expanded as a glob.
      if [[ "$URL" == *www.youtube.com* ]]; then
        PAGE_TEXT=$(curl --silent --insecure --user-agent "$AGENT" --max-time "$TIMEOUT" "$URL" | grep "\"simpleText\":\"Video unavailable\"")
        if [[ -n "$PAGE_TEXT" ]]; then
          # The page says the video is unavailable, so move this link from the OK to the NG tally
          STATUS="NG"
          (( OK_LINKS -= 1 ))
          (( NG_LINKS += 1 ))
        fi
      fi
      break
    fi
  done
fi
975
# If we didn't get a match with the "OK" codes, check it against the "RD" codes
if [[ "$STATUS" == "??" ]]; then
  for CODE in "${RD_CODES[@]}"; do
    if [[ "$CODE" != "$CURL_CODE" ]]; then
      continue # not this code; try the next one in RD_CODES
    fi

    # Get URL header again in order to retrieve the URL we are being redirected to. $AGENT is
    # double-quoted so that its value, not the literal text "$AGENT", is sent as the user agent.
    NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time "$TIMEOUT" --write-out '%{redirect_url}\n' "$URL")

    # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
    # those changes out if the user didn't ask for them
    URL_HTTP=${URL/#https:/http:}
    NEW_URL_HTTP=${NEW_URL/#https:/http:}

    # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
    NEW_URL_LENGTH=${#NEW_URL_HTTP}
    if (( NEW_URL_LENGTH < MIN_URL_LENGTH )); then
      NEW_URL_HTTP="[new URL not retrieved]"
    fi

    # Remove slash at end of new URL, if present, so we can filter out the redirects that
    # merely add an ending slash if the user didn't ask for them
    NEW_URL_NO_SLASH=${NEW_URL_HTTP%/}

    # Detect if this is a youtu.be link simply being expanded by YouTube to the full
    # youtube.com address
    YOUTU_BE=0
    if [[ "$URL_HTTP" == http://youtu.be* && "$NEW_URL_HTTP" == http://www.youtube.com* ]]; then
      YOUTU_BE=1
    fi

    # The URL comparisons below are quoted on both sides: they must be literal string comparisons,
    # and the "[new URL not retrieved]" placeholder would otherwise be open to glob expansion.
    if (( SHOW_HTTPS == 0 )) && [[ "$URL_HTTP" == "$NEW_URL_HTTP" ]]; then
      # If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
      # wants those to be reported)
      valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show http->https upgrades, and I was redirected to '$NEW_URL'."
      STATUS="OK"
      (( OK_LINKS += 1 ))
      (( SKIP_HTTPS_UP += 1 ))
    elif (( SHOW_SLASH == 0 )) && [[ "$URL_HTTP" == "$NEW_URL_NO_SLASH" ]]; then
      # If the URLs match besides an added ending slash, then the link is OK (unless user wants
      # those to be reported)
      valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show added trailing slashes, and I was redirected to '$NEW_URL'."
      STATUS="OK"
      (( OK_LINKS += 1 ))
      (( SKIP_SLASH_ADD += 1 ))
    elif (( YOUTU_BE == 1 )); then
      # We have to look at the actual page source to know if a YouTube video is good or not
      PAGE_TEXT=$(curl --silent --insecure --user-agent "$AGENT" --max-time "$TIMEOUT" "$NEW_URL" | grep "\"simpleText\":\"Video unavailable\"")
      if [[ -n "$PAGE_TEXT" ]]; then
        STATUS="NG"
        (( NG_LINKS += 1 ))
      elif (( SHOW_YT_RD == 0 )); then
        valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'."
        STATUS="OK"
        (( OK_LINKS += 1 ))
        (( SKIP_YOUTU_BE += 1 ))
      else
        STATUS="RD"
        (( RD_LINKS += 1 ))
      fi
    else
      # Any other redirect is reported as such
      STATUS="RD"
      (( RD_LINKS += 1 ))
    fi
    break
  done
fi
1044
# Neither "OK" nor "RD", so see whether the response code is one of the known "NG" codes
if [[ $STATUS == "??" ]]; then
  for CODE in "${NG_CODES[@]}"; do
    if [[ $CODE == $CURL_CODE ]]; then
      STATUS="NG"
      (( NG_LINKS += 1 ))
      break
    fi
  done
fi

# A response code found in none of the lists cannot be classified; advise the reader and move on
if [[ $STATUS == "??" ]]; then
  valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I encountered the unknown response code $CURL_CODE."
  (( SKIP_UNK_CODE += 1 ))
  continue
fi
1062
# Check problem links against exceptions list before proceeding. EXCEPT_URL is the setting that
# says an exceptions list was supplied, so the loop below keeps each entry's URL in its own
# variable (EXCEPT_LINK) instead of overwriting that setting.
FOUND_EXCEPT=0
if [[ "$STATUS" != "OK" && -n "$EXCEPT_URL" ]]; then
  # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
  EXPECT_CODE="$CURL_RESULT"
  if [[ "$STATUS" == "EI" || "$STATUS" == "IW" ]]; then
    EXPECT_CODE="$STATUS"
  fi

  # Look for link in exceptions list and make sure the listed result code and wiki page also match.
  # Each entry is parsed as "code,URL,page": the code is the text before the first comma, the page
  # is the text after the last comma (up to the first space), and the URL is what lies in between.
  for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
    EXCEPT_LINE="${EXCEPT_ARRAY[$i]}"

    # Undo any HTML-encoding from the wiki page; for now we just worry about the ampersand, as most
    # other HTML-encoded characters are not found in URLs
    EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/\&amp;/\&/g')

    # Match URL
    EXCEPT_LINK="${EXCEPT_LINE#*,}"
    EXCEPT_LINK="${EXCEPT_LINK%,*}"
    if [[ "$EXCEPT_LINK" != "$URL" ]]; then
      continue
    fi

    # Match containing page's name ("*" stands for any page)
    EXCEPT_PAGE="${EXCEPT_LINE##*,}"
    EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
    if [[ "$EXCEPT_PAGE" != "*" && "$EXCEPT_PAGE" != "$LOCAL_PAGE_PATH" ]]; then
      continue
    fi

    # Match result code
    EXCEPT_CODE=${EXCEPT_LINE%%,*}
    if [[ "$EXCEPT_CODE" == "$EXPECT_CODE" ]]; then
      valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because its expected result, '$EXPECT_CODE', is in the exceptions list."
      case "$STATUS" in
        EI) (( SKIP_EXPECT_EI += 1 )) ;;
        IW) (( SKIP_EXPECT_IW += 1 )) ;;
        RD) (( SKIP_EXPECT_RD += 1 )) ;;
        *)  (( SKIP_EXPECT_NG += 1 )) ;;
      esac
      FOUND_EXCEPT=1
      break
    fi
  done
fi
if (( FOUND_EXCEPT == 1 )); then
  continue
fi
1116
# If appropriate, record this link to the log, with clickable URLs when possible
if [[ "$STATUS" != "OK" ]] || (( RECORD_OK_LINKS == 1 )); then
  # Prepare 'curl' result in parentheses to print after status code, unless this is an "IW" or "EI"
  # link, in which case showing the status code doesn't make sense. Adjust spacing after string to
  # ensure TXT and RTF reports have aligned columns of results.
  CURL_STR_H=" ($CURL_RESULT)"
  CURL_STR_T="$CURL_STR_H"
  CURL_STR_R="$CURL_STR_H "
  if [[ "$STATUS" == "IW" || "$STATUS" == "EI" ]]; then
    CURL_STR_H=""
    CURL_STR_T=" "
    CURL_STR_R=" "
  fi

  # Record link and its wiki page in TXT, RTF, and HTML markup
  valPrint t "${STATUS}${CURL_STR_T} $STR_TYPE $URL"
  valPrint t " linked from $FULL_PAGE_PATH"
  valPrint r "${STATUS}${CURL_STR_R}${RTF_TABS}$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
  valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
  valPrint hn "<tr><td style=\"white-space:nowrap\">${STATUS}${CURL_STR_H}</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
  valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"

  # Place vertical space here since we won't be printing anything more about this link
  if [[ "$STATUS" == "OK" ]] && (( SUGGEST_SNAPSHOTS_OK == 0 )); then
    valPrint tr ""
    valPrint hs ""
  fi

  # Record redirect URL if one was given by a 3xx response page
  if [[ "$STATUS" == "RD" ]]; then
    valPrint ts " Server suggests $NEW_URL"
    valPrint rs " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
    valPrint hs "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
  fi

  # Notify reader if we can use an intrawiki link for this URL; the page name is whatever follows
  # the scheme and the first path component of the URL
  if [[ "$STATUS" == "EI" ]]; then
    INTRA_PAGE=${URL#*://*/}
    valPrint ts " Just use [[$INTRA_PAGE]]"
    valPrint rs " Just use [[$INTRA_PAGE]]"
    valPrint hs "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
  fi

  # Notify reader if we can use an interwiki prefix for this URL; the page name is whatever follows
  # the last slash of the URL
  if [[ "$STATUS" == "IW" ]]; then
    INTER_PAGE=${URL##*/}
    valPrint ts " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
    valPrint rs " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
    valPrint hs "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
  fi

  # Query Internet Archive for latest "OK" snapshot for "NG" page (or for an "OK" page, if the user
  # asked for that)
  if [[ ( $STATUS == "NG" && $SUGGEST_SNAPSHOTS_NG -eq 1 ) || ( $STATUS == "OK" && $SUGGEST_SNAPSHOTS_OK -eq 1 ) ]]; then

    # We need to watch out for the rate limit or we'll get locked out; look at how much time has
    # elapsed and then wait the remainder between that and how long of a wait we think is needed
    # to avoid the dreaded "Too Many Requests" response. 5 seconds is just a guess.
    CUR_TIME=$(date +%s)
    WAIT_REMAINDER=$((5 - CUR_TIME + START_LINK))
    if (( WAIT_REMAINDER > 0 )); then
      valPrint t "Waiting $WAIT_REMAINDER second(s) to conform to Archive.org rate limit."
      sleep "$WAIT_REMAINDER"
    fi

    # Issue query to the API
    ARCHIVE_QUERY=$(curl --silent --max-time "$TIMEOUT" "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")

    # Notify user if we hit the rate limit and just keep going. The asterisks must stay outside the
    # quotes: a fully quoted right-hand side is compared literally, so it would only match a
    # response consisting of exactly "*Too Many Requests*".
    if [[ "$ARCHIVE_QUERY" == *"Too Many Requests"* ]]; then
      valPrint t " IA has rate-limited us!"
      valPrint r " IA has rate-limited us!"
      valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td>(hit the API rate limit!)</td></tr>"
    # If a "closest" snapshot was received, inform user
    elif [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
      # In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
      ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')

      # ...isolate "url" property in the response that follows the "closest" tag
      SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
      SNAPSHOT_URL=${SNAPSHOT_URL#*\"url\": \"} # everything after '"url": "'
      SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'

      # Remove the port 80 part that IA often adds to the URL, as it's superfluous (only the first
      # ":80" in the string is removed)
      SNAPSHOT_URL=${SNAPSHOT_URL/:80/}

      # Inform the user of the snapshot URL
      valPrint ts " IA suggests $SNAPSHOT_URL"
      valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
      valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
    else # Otherwise give a generic Wayback Machine link for this URL, which might work
      valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
      valPrint rs " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
      valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
    fi
  fi
fi
1210
# If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
if (( IS_FILE == 0 && TAKE_PAGE_SHOT == 1 )) && [[ "$STATUS" == "OK" ]]; then
  # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
  SHOT_NAME=$(echo "$URL" | sed 's/https*\:\/\///' | sed 'y/:\//__/')
  SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"

  # Don't take screenshot if we already encountered this page and screenshotted it
  if [[ -f "$SHOT_FILE" ]]; then
    valPrint trhs "Skipping screenshot of URL '$URL' because file '$SHOT_FILE' already exists."
  else
    # $URL is quoted so that '?', '*' and '[' in it are passed to Chrome untouched instead of
    # being treated as a file name pattern
    "$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 "$URL" > /dev/null 2>&1

    # The screenshot is looked for at $WORKING_DIR/$CHROME_SCREENSHOT; if it is there, file it
    # under its own name ('mv -n' never overwrites an existing file)
    if [[ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]]; then
      mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
    else
      valPrint trhs "Screenshot of URL $URL seems to have failed!"
    fi
  fi
fi
1229done
# Every line of the list was processed (the URL_LIMIT cut-off sets "limit" instead), so record
# that and wrap up
FINISHED_LIST=yes
wrapupAndExit
Note: See TracBrowser for help on using the repository browser.