source: Validate External Links/validate_external_links.sh@ 1176

Last change on this file since 1176 was 1175, checked in by iritscen, 2 years ago

ValExtLinks: Added audit feature which tells the user if there are items in the exception list which are no longer present on the wiki or no longer return the given error code.

File size: 56.9 KB
Line 
1#!/bin/bash
2
3# Validate External Links by Iritscen
4#
5# Validates a list of external links in CSV format. The resulting logs are produced in three formats:
6# - TXT (for easy diffing with an earlier log)
7# - RTF (for reading as a local file with clickable links)
8# - HTML (for reading as a web page)
9# Call script with "--help" argument for documentation. Also see Read Me.rtf for critical notes.
10#
11# Recommended rule:
12# |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|
13#
14# Table of contents (sections of script in order of appearance, not execution):
15# • Globals
16# • Help Output
17# • Setup
18# • Utility Functions
19# • Summary Output
20# • Initialization
21# • Data Sourcing
22# • Config Output
23# • Legend Output
24# • Main Loop
25
# Split words on newlines only. Values that contain spaces (paths, page names) therefore survive the
# unquoted expansions used in parts of this script.
IFS=$'\n'

### GLOBALS ###
# Settings -- these will be changed from their defaults by the arguments passed in to the script
LINKS_URL=""           # download external link CSV from this location (can use "file://" protocol)
EXCEPT_URL=""          # location of wiki page with a list of exceptions for NG results
OUTPUT_DIR=""          # place reports and all other output in a folder inside this existing folder
RECORD_OK_LINKS=0      # record response code to the log even when it's a value in OK_CODES
SHOW_SLASH=0           # record issue when a slash is added to the end of a URL
SHOW_HTTPS=0           # record issue when "http" is upgraded to "https"
SHOW_YT_RD=0           # record redirection for a youtu.be URL expanding to the full URL
SUGGEST_SNAPSHOTS_NG=0 # query the Internet Archive for a possible snapshot URL for each NG page
SUGGEST_SNAPSHOTS_OK=0 # query the Internet Archive for an existing snapshot of each OK page
CHECK_ARCHIVE_LINKS=0  # check URLs on archive.org and archive.is
TAKE_PAGE_SHOT=0       # take a screenshot of each OK page
TIMEOUT=10             # time to wait for a response when querying a site
CHROME_PATH=""         # path to a copy of Google Chrome that has the command-line screenshot feature
URL_START=1            # start at this URL in LINKS_FILE
URL_LIMIT=0            # if non-zero, stop at this URL in LINKS_FILE
UPLOAD_INFO=""         # path to a file on your hard drive with the login info needed to upload a report

# Fixed strings -- see the occurrences of these variables to learn their purpose
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.134 Safari/537.36"
ARCHIVE_API="http://archive.org/wayback/available"
ARCHIVE_GENERIC="https://web.archive.org/web/*"
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
CHROME_SCREENSHOT="screenshot.png"
EXCEPT_FILE_NAME="exceptions.txt"
EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
WIKI_CURL="https://wiki.oni2.net/Validate_External_Links/Curl_codes"
WIKI_HTTP="https://wiki.oni2.net/Validate_External_Links/HTTP_codes"
WIKI_MAIN="https://wiki.oni2.net/Validate_External_Links"
WIKI_ME="http://iritscen.oni2.net"
# Directory holding this script. Quoted so that glob characters in the path are left alone, and joined
# with '&&' so that a failed 'cd' leaves THIS_DIR empty instead of silently reporting the current directory.
THIS_DIR=$(cd -- "$(dirname -- "$0")" && pwd)
WORKING_DIR=$(pwd)
WIKI_PATH="wiki.oni2.net"

# These are parallel arrays of the IDs and names of OniGalore's current namespaces
declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")

# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png psd py rar tga TRMA txt vbs wav wmv xaf xcf xlsx xml zip)
declare -a HTTP_TLDS_AND_PAGES=(abstract action ars asp aspx cfm cgi com css de do full htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x)

# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
# if you add a new code.
declare -a OK_CODES=(200 401 405 406 418 501)
declare -a RD_CODES=(301 302 303 307 308)
declare -a NG_CODES=(000 400 403 404 410 429 500 502 503 504 530)

# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
# transcluded text, and if the transclusion fails, then the braces show up in the URL
ILLEGAL_CHARS="{ }"

# The shortest URL possible, used for sanity-checking some URLs: http://a.co
MIN_URL_LENGTH=11

# These are parallel arrays giving the prefixes that can be used in place of normal external links to
# some wikis and other sites; based on https://wiki.oni2.net/Special:Interwiki
declare -a INTERWIKI_PREFIXES=(commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies wikiversity wikivoyage wikt wp)
declare -a INTERWIKI_DOMAINS=(commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org wikiversity.org wikivoyage.org wiktionary.org wikipedia.org)

# Variables for keeping track of main loop progress and findings
LINK_NUM=0
EI_LINKS=0
IW_LINKS=0
OK_LINKS=0
RD_LINKS=0
NG_LINKS=0
SKIP_UNK_NS=0
SKIP_JS_PAGE=0
SKIP_BAD_URL=0
SKIP_NON_ASCII=0
SKIP_UNK_SUFFIX=0
SKIP_UNK_CODE=0
SKIP_EXPECT_NG=0
SKIP_EXPECT_RD=0
SKIP_EXPECT_EI=0
SKIP_EXPECT_IW=0
SKIP_HTTPS_UP=0
SKIP_SLASH_ADD=0
SKIP_YOUTU_BE=0
SKIP_ARCHIVES=0
FILE_LINKS=0
PAGE_LINKS=0
SKIPPED_HEADER_ROW=0
FINISHED_LIST="no"
START_RUN=0
END_RUN=0
121
### HELP OUTPUT ###
# Prints a pseudo-man page to stdout. The page text is literal (quoted here-document, nothing is
# expanded). Here is the 80-character rule for the page text:
# 234567890123456789012345678901234567890123456789012345678901234567890123456789
# The SYNOPSIS must list exactly the flags accepted by the argument parser; in particular the snapshot
# options are "--suggest-snapshots-ng" and "--suggest-snapshots-ok" (there is no bare
# "--suggest-snapshots" flag).
function printHelp()
{
  cat << 'EOF'

NAME
       Validate External Links

SYNOPSIS
       validate_external_links.sh --help
       validate_external_links.sh --links URL --output DIR [--exceptions URL]
          [--record-ok-links] [--show-added-slashes] [--show-https-upgrades]
          [--show-yt-redirects] [--suggest-snapshots-ng]
          [--suggest-snapshots-ok] [--check-archive-links]
          [--take-screenshots FILE] [--timeout NUM] [--start-url NUM]
          [--end-url NUM] [--upload FILE]

DESCRIPTION
       This script parses a list of external links found in the OniGalore wiki
       (which is dumped by the Oni2.net server periodically in a particular
       format), validates them using the Unix tool 'curl', and produces a report
       of which links were "OK" (responded positively to an HTTP query), which
       were "RD" (responded with a 3xx redirect code), which could be "IW"
       (interwiki) links, which are "EI" (external internal) links and could be
       intrawiki links, and which were "NG" (no good; a negative response to the
       query). This report can then be automatically uploaded to the location of
       your choice. The script can also suggest Internet Archive snapshots for
       "NG" links, and take screenshots of "OK" links for visual verification by
       the reader that the page in question is the one intended to be displayed.

       You must pass this script the URL at which the list of links is found
       (--links) and the path where the directory of logs should be outputted
       (--output). All other arguments are optional.

OPTIONS
       --help                  Show this page.
       --links URL             (required) URL from which to download the CSV
                               file with external links. Note that this URL can
                               be a local file if you supply a file:// path.
       --output DIR            (required) Unix path to directory in which Val
                               should place its reports.
       --exceptions URL        In order to remove links from the report which
                               Val finds an issue with but which you regard as
                               OK, list those desired exceptions on a wiki page.
                               See the sample file "exceptions.pdf" for the
                               required format of the page. Note that this URL
                               can point to a local file if you supply a path
                               beginning with "file://".
       --record-ok-links       Log a link in the report even if its response
                               code is "OK".
       --show-added-slashes    Report on redirects that simply add a '/' to the
                               end of the URL.
       --show-https-upgrades   Report on redirects that simply upgrade a
                               "http://" URL to a "https://" URL.
       --show-yt-redirects     Report on redirects that expand a youtu.be URL.
       --suggest-snapshots-ng  Query the Internet Archive for a possible
                               snapshot URL for each "NG" page.
       --suggest-snapshots-ok  Query the Internet Archive for a snapshot of each
                               "OK" page just to make sure it's available. Note
                               that this will add a tremendous amount of time to
                               the script execution because there is a rate
                               limit to the Archive API. Note that this option
                               does nothing unless you also use the
                               --record-ok-links argument.
       --check-archive-links   Check links that are already pointing to a page
                               on the Internet Archive or archive.is (AKA
                               archive.today). In theory these links should be
                               totally stable and not need validation.
       --take-screenshots FILE Call the Google Chrome binary at this path to
                               take screenshots of each "OK" page.
       --timeout NUM           Wait this many seconds for a site to respond. The
                               default is 10. Important note: Val will attempt
                               to reach each URL three times, so the time taken
                               to ping an unresponsive site will be three times
                               this setting.
       --start-url NUM         Start at this link in the links CSV file.
       --end-url NUM           Stop at this link in the links CSV file.
       --upload FILE           Upload report using the credentials and path
                               given in this local text file. See sftp_login.txt
                               for template.

BUGS
       The script cannot properly parse any line in the external links file
       which contains a comma in the name of the wiki page containing a link.
       Commas in the link itself are not an issue.
EOF
}
210
211
### SETUP ###
# Show the paged help text and stop when the script is run with no arguments at all, or when the
# first argument asks for help
if (( $# == 0 )) || [[ "$1" == "--help" ]]; then
  printHelp | less
  exit 0
fi
218
# Parses the command-line arguments into the settings globals (LINKS_URL, EXCEPT_URL, OUTPUT_DIR,
# RECORD_OK_LINKS, SHOW_SLASH, SHOW_HTTPS, SHOW_YT_RD, SUGGEST_SNAPSHOTS_NG, SUGGEST_SNAPSHOTS_OK,
# CHECK_ARCHIVE_LINKS, TAKE_PAGE_SHOT, CHROME_PATH, TIMEOUT, URL_START, URL_LIMIT, UPLOAD_INFO).
# Exits with status 1 on an unknown argument or when an option that takes a value is the last word on
# the command line. The latter case matters: 'shift 2' fails and shifts nothing when only one argument
# is left, so without this check the loop below would never terminate.
function parseArguments()
{
  while (( $# )); do
    # Options that consume a value must actually be followed by one
    case "$1" in
      --links | --exceptions | --output | --take-screenshots | --timeout | --start-url | --end-url | --upload )
        if (( $# < 2 )); then
          echo "Argument '$1' requires a value. Aborting."
          exit 1
        fi;;
    esac

    case "$1" in
      --links )                LINKS_URL="$2";                      shift 2;;
      --exceptions )           EXCEPT_URL="$2";                     shift 2;;
      --output )               OUTPUT_DIR="$2";                     shift 2;;
      --record-ok-links )      RECORD_OK_LINKS=1;                   shift;;
      --show-added-slashes )   SHOW_SLASH=1;                        shift;;
      --show-https-upgrades )  SHOW_HTTPS=1;                        shift;;
      --show-yt-redirects )    SHOW_YT_RD=1;                        shift;;
      --suggest-snapshots-ng ) SUGGEST_SNAPSHOTS_NG=1;              shift;;
      --suggest-snapshots-ok ) SUGGEST_SNAPSHOTS_OK=1;              shift;;
      --check-archive-links )  CHECK_ARCHIVE_LINKS=1;               shift;;
      --take-screenshots )     TAKE_PAGE_SHOT=1; CHROME_PATH="$2";  shift 2;;
      --timeout )              TIMEOUT="$2";                        shift 2;;
      --start-url )            URL_START="$2";                      shift 2;;
      --end-url )              URL_LIMIT="$2";                      shift 2;;
      --upload )               UPLOAD_INFO="$2";                    shift 2;;
      * )                      echo "Invalid argument '$1' detected. Aborting."; exit 1;;
    esac
  done
}

# Parse all arguments, then drop them from the script's own positional parameters, as the former
# inline parsing loop did
parseArguments "$@"
shift "$#"
240
# If the required arguments were not supplied, point the user to the help page and quit.
# The expansions are quoted so that an empty value is tested as an empty string (rather than the test
# succeeding by accident on a missing operand) and so that '?' or '*' in a URL is never glob-expanded.
if [ -z "$LINKS_URL" ] || [ -z "$OUTPUT_DIR" ]; then
  echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
  exit 2
fi

# If user wants screenshots, make sure path to Chrome was passed in and is valid
if [ "$TAKE_PAGE_SHOT" -eq 1 ] && [ ! -f "$CHROME_PATH" ]; then
  echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
  exit 3
fi

# Check that UPLOAD_INFO exists, if this argument was supplied
if [ -n "$UPLOAD_INFO" ] && [ ! -f "$UPLOAD_INFO" ]; then
  echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
  exit 4
fi

# Check that OUTPUT_DIR is a directory
if [ ! -d "$OUTPUT_DIR" ]; then
  echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
  exit 5
fi
266
# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
SHOT_PATH="$OUTPUT_PATH/Screenshots"
LOG_NAME="ValExtLinks report"
LOG_NAME_TXT="$LOG_NAME.txt"
LOG_NAME_RTF="$LOG_NAME.rtf"
LOG_NAME_HTM="$LOG_NAME.htm"
LOG_PATH="$OUTPUT_PATH/$LOG_NAME"
LOG_PATH_TXT="$LOG_PATH.txt"
LOG_PATH_RTF="$LOG_PATH.rtf"
LOG_PATH_HTM="$LOG_PATH.htm"
mkdir "$OUTPUT_PATH"

# Check that 'mkdir' succeeded before creating anything inside the new folder. The message names
# OUTPUT_DIR, which is the directory the folder was supposed to be created in.
if [ ! -d "$OUTPUT_PATH" ]; then
  echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_DIR. Aborting."
  exit 6
fi

# The screenshot sub-folder is only needed when screenshots were requested
if [ "$TAKE_PAGE_SHOT" -eq 1 ]; then
  mkdir "$SHOT_PATH"
fi
291
# Get date on the file at LINKS_URL; it is printed in the report headers.
# - The header name is matched case-insensitively because curl prints HTTP/2 header names in lower
#   case ("last-modified:").
# - 'sed' removes the header name and any trailing whitespace, which includes the carriage return
#   that ends each HTTP header line.
LINKS_DATE=$(curl --silent --head "$LINKS_URL" \
  | grep -i -m 1 '^Last-Modified:' \
  | sed -e 's/^[^:]*:[[:space:]]*//' -e 's/[[:space:]]*$//')
if [ -z "$LINKS_DATE" ]; then
  echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
  exit 7
fi
299
300
301### UTILITY FUNCTIONS ###
# Emits the plain-text report header to the TXT log, one 'valPrint t' call per line: title,
# generation time (NICE_TIME), data date (LINKS_DATE), author contact (WIKI_ME), then a blank line
function printTXTheader()
{
  local header_line

  for header_line in \
    "Validate External Links report" \
    "generated $NICE_TIME" \
    "from data of $LINKS_DATE" \
    "script by Iritscen (contact: $WIKI_ME)" \
    ""
  do
    valPrint t "$header_line"
  done
}
311
# Emits the RTF prolog to the RTF log in a single 'valPrint r' call: document settings, a centered
# title block (title, NICE_TIME, LINKS_DATE, contact hyperlink to WIKI_ME), then a paragraph
# definition without '\qc' so that the text which follows is left-justified
function printRTFheader()
{
  # Tab stops and indent shared by the centered and the left-justified paragraph definitions
  local tab_stops='\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural'
  local rtf_text

  # One argument per line of RTF; a trailing '\' in the RTF is a line break within the title block
  printf -v rtf_text '%s\n' \
    '{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820' \
    '{\fonttbl\f0\fswiss\fcharset0 Helvetica;}' \
    '{\colortbl;\red255\green255\blue255;}' \
    '{\*\expandedcolortbl;;}' \
    '\margl1440\margr1440\vieww12600\viewh12100\viewkind0' \
    "${tab_stops}"'\qc\partightenfactor0' \
    '' \
    '\f0\fs28 \cf0 \b1 Validate External Links report \b0\' \
    "generated ${NICE_TIME}"'\' \
    "from data of ${LINKS_DATE}"'\' \
    'script by Iritscen ({\field{\*\fldinst{HYPERLINK "'"${WIKI_ME}"'"}}{\fldrslt contact}})' \
    '\' \
    "${tab_stops}"'\partightenfactor0' \
    '\cf0 '

  # 'printf' terminated the last line too; the text handed to valPrint must not end in a newline
  valPrint r "${rtf_text%$'\n'}"
}
330
# Closes the RTF markup of the RTF log file. The 'n' flag makes valPrint write the brace as-is; the
# default RTF mode would append "\line" after the brace that ends the document.
function printRTFfooter()
{
  valPrint rn "}"
}
336
# Emits the opening HTML markup to the HTML log in a single 'valPrint h' call: <head> with the page
# title, then the report title and a sub-heading holding NICE_TIME, LINKS_DATE and a contact link to
# WIKI_ME
function printHTMheader()
{
  local html_text

  # One argument per line of markup
  printf -v html_text '%s\n' \
    '<html>' \
    '<head>' \
    '<title>Validate External Links report</title>' \
    '</head>' \
    '<body>' \
    '<h2>Validate External Links report</h2>' \
    "<h3>generated ${NICE_TIME}<br />" \
    "from data of ${LINKS_DATE}<br />" \
    'script by Iritscen (<a href="'"${WIKI_ME}"'" target="_blank">contact</a>)</h3>'

  # 'printf' terminated the last line too; the text handed to valPrint must not end in a newline
  valPrint h "${html_text%$'\n'}"
}
350
# Closes the HTML markup of the HTML log file. The 'n' flag makes valPrint write the tags as-is; the
# default HTML mode would append "<br />" after the closing </html> tag.
function printHTMfooter()
{
  valPrint hn "</body>
</html>"
}
357
# The central logging function.
#   $1 - flag string; each letter it contains selects an output or a modifier:
#          c = console, t = TXT log, r = RTF log, h = HTML log
#          n = no line ending after the text (in the RTF and HTML logs: no "\line" / "<br />" suffix)
#          s = an extra blank line after the text (RTF: "\line\line"; HTML: an empty table row)
#          w = console only: do not pass the text through 'fmt' ("fmt" fits the output to an 80-column
#              CLI but can break special formatting and the 'n' option)
#   $2 - the text to print
# When several modifiers are present, the first one listed in each 'case' below wins.
function valPrint()
{
  local flags="$1" text="$2"

  # Console
  case "$flags" in
    *c*)
      case "$flags" in
        *n*) echo -n "$text";;
        *w*) echo "$text";;
        *s*) echo -e "$text\n";;
        *)   echo "$text" | fmt -w 80;;
      esac;;
  esac

  # Plain-text log
  case "$flags" in
    *t*)
      case "$flags" in
        *n*) echo -n "$text" >> "$LOG_PATH_TXT";;
        *s*) echo -e "$text\n" >> "$LOG_PATH_TXT";;
        *)   echo "$text" >> "$LOG_PATH_TXT";;
      esac;;
  esac

  # RTF log; line breaks are RTF "\line" controls, the file newline itself is always written
  case "$flags" in
    *r*)
      case "$flags" in
        *n*) echo "$text" >> "$LOG_PATH_RTF";;
        *s*) echo "$text\line\line" >> "$LOG_PATH_RTF";;
        *)   echo "$text\line" >> "$LOG_PATH_RTF";;
      esac;;
  esac

  # HTML log; note that here 's' takes precedence over 'n'
  case "$flags" in
    *h*)
      case "$flags" in
        *s*) echo "$text<tr><td>&nbsp;</td></tr>" >> "$LOG_PATH_HTM";;
        *n*) echo "$text" >> "$LOG_PATH_HTM";;
        *)   echo "$text<br />" >> "$LOG_PATH_HTM";;
      esac;;
  esac
}
404
# Prints the noun in parameter 1, pluralized if the number in parameter 2 is not 1. Nouns ending in
# "x" take "es" (suffix -> suffixes), all others take "s".
# A missing or empty count is treated as 0 (plural) instead of triggering a "unary operator expected"
# error from the test; this happens when the summary is printed before the link count is known.
function pluralCheckNoun()
{
  local noun="$1" count="${2:-0}"

  if [ "$count" -ne 1 ]; then
    if [[ "$noun" =~ x$ ]]; then
      printf '%s\n' "${noun}es"
    else
      printf '%s\n' "${noun}s"
    fi
  else
    printf '%s\n' "$noun"
  fi
}
418
# Prints "are" when the number in parameter 1 differs from 1, and "is" otherwise
function pluralCheckIs()
{
  if [ "$1" -ne 1 ]; then
    echo "are"
    return
  fi
  echo "is"
}
428
# Prints "were" when the number in parameter 1 differs from 1, and "was" otherwise
function pluralCheckWas()
{
  if [ "$1" -ne 1 ]; then
    echo "were"
    return
  fi
  echo "was"
}
438
# Prints the article "a " (with its trailing space) when the number in parameter 1 is exactly 1;
# prints nothing for any other number
function pluralCheckA()
{
  [ "$1" -eq 1 ] || return 0
  echo "a "
}
446
# Prints the article "an " (with its trailing space) when the number in parameter 1 is exactly 1;
# prints nothing for any other number
function pluralCheckAn()
{
  [ "$1" -eq 1 ] || return 0
  echo "an "
}
454
# Upload the reports using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
# reports being saved to disk have already been closed.
# Reads the lines beginning with "user:", "pw:", "port:" and "path:" from the file UPLOAD_INFO, then
# runs the 'expect' script EXPECT_SCRIPT_NAME (located next to this script) once for each of the HTM,
# RTF and TXT reports, printing a success or failure message for each.
# NOTE(review): the password is handed to 'expect' as a command-line argument, which other local users
# may be able to see in the process list; changing that requires changing the expect script too.
function uploadReport()
{
  local upload_status suffix_upper

  valPrint c "Uploading reports..."

  SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
  SFTP_USER_NAME_MARKER="user:"
  SFTP_PASSWORD_MARKER="pw:"
  SFTP_PORT_MARKER="port:"
  SFTP_PATH_MARKER="path:"
  SFTP_USER_NAME=$(grep -- "$SFTP_USER_NAME_MARKER" "$UPLOAD_INFO")
  SFTP_USER_NAME=${SFTP_USER_NAME#"$SFTP_USER_NAME_MARKER"}
  SFTP_PASSWORD=$(grep -- "$SFTP_PASSWORD_MARKER" "$UPLOAD_INFO")
  SFTP_PASSWORD=${SFTP_PASSWORD#"$SFTP_PASSWORD_MARKER"}
  SFTP_PORT=$(grep -- "$SFTP_PORT_MARKER" "$UPLOAD_INFO")
  SFTP_PORT=${SFTP_PORT#"$SFTP_PORT_MARKER"}
  SFTP_PATH=$(grep -- "$SFTP_PATH_MARKER" "$UPLOAD_INFO")
  SFTP_PATH=${SFTP_PATH#"$SFTP_PATH_MARKER"}

  for SUFFIX in htm rtf txt; do
    expect "$SCRIPT_PATH" "$LOG_PATH.$SUFFIX" "$SFTP_USER_NAME" "$SFTP_PASSWORD" "$SFTP_PORT" "$SFTP_PATH" "$LOG_NAME.$SUFFIX"
    # Save the status right away: "$?" is overwritten by the very next command, including '[' itself
    upload_status=$?

    if [ "$upload_status" -ne 0 ]; then
      valPrint c "Error $upload_status occurred when attempting to upload $LOG_NAME.$SUFFIX!"
    else
      suffix_upper=$(printf '%s' "$SUFFIX" | tr '[:lower:]' '[:upper:]')
      valPrint c "Report in $suffix_upper format was uploaded."
    fi
  done
}
485
# Prints the session summary to the console and the three logs, closes the logs' markup, uploads the
# reports if that was requested and the session was not canceled, and then exits the script with
# status 0. It is called directly when the script finishes or aborts after the logs were opened, and
# it is also installed below as the handler for Ctrl-C (SIGINT).
function wrapupAndExit()
{
  local elapsed_secs=0
  local audit_line audit_url audit_page audit_code

  # Get off progress line on console, drop down a line from last link in log, and close HTML table
  valPrint ctr ""
  valPrint h "</table><br />"

  # If we didn't finish processing the last URL, then the iterator is one too high
  if [ "$FINISHED_LIST" != "yes" ]; then
    LINK_NUM=$((LINK_NUM-1))
    if [ "$FINISHED_LIST" == "no" ]; then
      valPrint ctrh "The session was canceled by the user."
    fi
  fi

  # Generate string with elapsed time. START_RUN is still 0 when we get here before the run's start
  # time was recorded; report no elapsed time then, rather than the time since the Unix epoch.
  END_RUN=$(date +%s)
  if [ "$START_RUN" -gt 0 ]; then
    elapsed_secs=$((END_RUN-START_RUN))
  fi
  ELAPSED="$((elapsed_secs/60)) min. $((elapsed_secs%60)) sec. elapsed"

  # The link count is not known yet if we are aborting before the links file was read
  : "${LINK_COUNT:=0}"

  # Do some math on results of session
  LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
  TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
  LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
  LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_RD+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
  LINK_PROBLEMS_TOTAL=$((NG_LINKS+RD_LINKS+EI_LINKS+IW_LINKS))
  LINK_PROBLEMS_NG=$((NG_LINKS-SKIP_EXPECT_NG))
  LINK_PROBLEMS_RD=$((RD_LINKS-SKIP_EXPECT_RD))
  LINK_PROBLEMS_EI=$((EI_LINKS-SKIP_EXPECT_EI))
  LINK_PROBLEMS_IW=$((IW_LINKS-SKIP_EXPECT_IW))
  LINK_PROBLEMS_NET=$((LINK_PROBLEMS_NG+LINK_PROBLEMS_RD+LINK_PROBLEMS_EI+LINK_PROBLEMS_IW))

  # Print something in the Links section if no link issues were printed
  if [ "$LINK_PROBLEMS_NET" -eq 0 ]; then
    valPrint h "<i>No link problems to report! See the <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for a list of links with issues that were not reported.</i>"
  fi
  if [ "$LINK_PROBLEMS_TOTAL" -eq 0 ]; then
    valPrint t "No link problems to report!"
    valPrint r "\i1 No link problems to report! \i0"
  fi

  ## SUMMARY OUTPUT ##
  valPrint ct "Summary ($ELAPSED):"
  valPrint r "\b1 Summary \b0 ($ELAPSED)"
  valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
  valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link "$LINK_COUNT") (there $(pluralCheckWas "$FILE_LINKS") $FILE_LINKS file $(pluralCheckNoun link "$FILE_LINKS") and $PAGE_LINKS page $(pluralCheckNoun link "$PAGE_LINKS"))."

  # Print processed link totals
  if [ "$LINKS_PROCESSED" -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link "$LINKS_PROCESSED"):"; fi
  if [ "$LINK_ERRORS" -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link "$LINK_ERRORS") could not be processed"; fi
  if [ "$SKIP_ARCHIVES" -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVES archive.org/archive.is $(pluralCheckNoun link "$SKIP_ARCHIVES") $(pluralCheckWas "$SKIP_ARCHIVES") not checked"; fi
  if [ "$LINK_PROBLEMS_TOTAL" -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_TOTAL processed $(pluralCheckNoun link "$LINK_PROBLEMS_TOTAL") had $(pluralCheckAn "$LINK_PROBLEMS_TOTAL")$(pluralCheckNoun issue "$LINK_PROBLEMS_TOTAL")"; fi
  # The console/TXT/RTF line is indented with spaces, the HTML line with non-breaking spaces
  if [ "$LINKS_EXCEPTED" -gt 0 ]; then valPrint ctr "  (excepted $LINKS_EXCEPTED link $(pluralCheckNoun issue "$LINKS_EXCEPTED") from report)"; valPrint h "&nbsp;&nbsp;(excepted $LINKS_EXCEPTED link $(pluralCheckNoun issue "$LINKS_EXCEPTED") from report)"; fi
  if [ "$OK_LINKS" -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link "$OK_LINKS") $(pluralCheckWas "$OK_LINKS") OK"; fi
  if [ "$TRIVIAL_RDS" -gt 0 ]; then valPrint ctr "  (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection "$TRIVIAL_RDS") as OK)"; valPrint h "&nbsp;&nbsp;(counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection "$TRIVIAL_RDS") as OK)"; fi

  # Print errored link totals
  if [ "$LINK_ERRORS" -gt 0 ]; then
    valPrint c "$LINK_ERRORS link $(pluralCheckNoun error "$LINK_ERRORS") (see RTF or TXT report for specific links):"
    valPrint h "$LINK_ERRORS link $(pluralCheckNoun error "$LINK_ERRORS") (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
    valPrint rt "$LINK_ERRORS link $(pluralCheckNoun error "$LINK_ERRORS"):"
  fi
  if [ "$SKIP_UNK_NS" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace "$SKIP_UNK_NS")"; fi
  if [ "$SKIP_JS_PAGE" -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link "$SKIP_JS_PAGE") on $(pluralCheckA "$SKIP_JS_PAGE")JavaScript $(pluralCheckNoun page "$SKIP_JS_PAGE")"; fi
  if [ "$SKIP_BAD_URL" -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL "$SKIP_BAD_URL")"; fi
  if [ "$SKIP_NON_ASCII" -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL "$SKIP_NON_ASCII")"; fi
  if [ "$SKIP_UNK_SUFFIX" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix "$SKIP_UNK_SUFFIX")"; fi
  if [ "$SKIP_UNK_CODE" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code "$SKIP_UNK_CODE")"; fi

  # Print excepted link totals
  if [ "$LINKS_EXCEPTED" -gt 0 ]; then
    valPrint c "$LINKS_EXCEPTED link $(pluralCheckNoun problem "$LINKS_EXCEPTED") excepted (see RTF or TXT report for specific links):"
    valPrint h "$LINKS_EXCEPTED link $(pluralCheckNoun problem "$LINKS_EXCEPTED") excepted (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
    valPrint rt "$LINKS_EXCEPTED link $(pluralCheckNoun problem "$LINKS_EXCEPTED") excepted:"
  fi
  if [ "$SKIP_EXPECT_NG" -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link "$NG_LINKS")"; fi
  if [ "$SKIP_EXPECT_RD" -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_RD/$RD_LINKS $(pluralCheckNoun redirection "$RD_LINKS")"; fi
  if [ "$SKIP_EXPECT_EI" -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link "$EI_LINKS")"; fi
  if [ "$SKIP_EXPECT_IW" -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link "$IW_LINKS")"; fi

  # Perform exceptions audit: report every exception-list entry that was never matched to a link
  # (EXCEPT_FOUND is 0) or whose link did not return the listed code (EXCEPT_USED is 0). Each entry
  # is split as "code,URL,page"; local variables are used so that the EXCEPT_URL setting, which
  # holds the address of the exceptions list, is not overwritten.
  EXCEPTION_ISSUES=0
  valPrint ctrh "Exceptions list audit:"
  for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
    audit_line="${EXCEPT_ARRAY[$i]}"
    audit_line=$(echo "$audit_line" | sed 's/\&amp;/\&/g') # copied from exception-matching code

    if [ "${EXCEPT_FOUND[$i]}" -eq 0 ]; then
      audit_url="${audit_line#*,}"
      audit_url="${audit_url%,*}"
      audit_page="${audit_line##*,}"
      audit_page="${audit_page%% *}"
      if [ "$audit_page" == "*" ]; then
        valPrint tr "- The link '$audit_url' did not return an error or is not on any page."
      else
        valPrint tr "- The link '$audit_url' did not return an error or is not on page '$audit_page'."
      fi
      EXCEPTION_ISSUES=$((EXCEPTION_ISSUES+1))
    elif [ "${EXCEPT_USED[$i]}" -eq 0 ]; then
      audit_url="${audit_line#*,}"
      audit_url="${audit_url%,*}"
      audit_code=${audit_line%%,*}
      valPrint tr "- The link '$audit_url' did not return error code $audit_code."
      EXCEPTION_ISSUES=$((EXCEPTION_ISSUES+1))
    fi
  done
  if [ "$EXCEPTION_ISSUES" -eq 0 ]; then
    valPrint ctrh "- No issues found."
  else
    valPrint c "$EXCEPTION_ISSUES exception list $(pluralCheckNoun issue "$EXCEPTION_ISSUES") detected (see RTF or TXT report for details)."
    valPrint h "$EXCEPTION_ISSUES exception list $(pluralCheckNoun issue "$EXCEPTION_ISSUES") detected (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for details)."
  fi

  # Print checked link totals
  if [ "$LINK_PROBLEMS_NET" -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS_NET link $(pluralCheckNoun issue "$LINK_PROBLEMS_NET"):"; fi
  if [ "$LINK_PROBLEMS_NG" -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_NG NG $(pluralCheckNoun link "$LINK_PROBLEMS_NG")"; fi
  if [ "$LINK_PROBLEMS_RD" -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_RD $(pluralCheckNoun redirection "$LINK_PROBLEMS_RD")"; fi
  if [ "$LINK_PROBLEMS_EI" -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_EI $(pluralCheckNoun link "$LINK_PROBLEMS_EI") that could be intrawiki"; fi
  if [ "$LINK_PROBLEMS_IW" -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_IW $(pluralCheckNoun link "$LINK_PROBLEMS_IW") that could be interwiki"; fi

  # Close the log files' markup
  valPrint trh "ValExtLinks says goodbye."
  printRTFfooter
  printHTMfooter

  # Upload report if this was requested (and the session was not canceled by the user)
  if [ -n "$UPLOAD_INFO" ] && [ "$FINISHED_LIST" != "no" ]; then
    uploadReport
  fi

  # Really quit now
  valPrint c "ValExtLinks says goodbye."
  exit 0
}
trap wrapupAndExit INT
620
621
### INITIALIZATION ###
# Print opening message to console and log files
valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
printTXTheader
printRTFheader
printHTMheader

## DATA SOURCING ##
valPrint t "Startup:"
valPrint r "\b1 Startup \b0"
valPrint hn "<h3>Startup</h3>"

# Attempt to download file at LINKS_URL into the session folder, under the name that follows the last
# slash of the URL. The download counts as failed when 'curl' reports an error or when no file was
# written.
valPrint cwtrhn "Downloading list of external links from $LINKS_URL..."
LINKS_FILE_NAME=${LINKS_URL##*/}
LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
if ! curl --silent -o "$LINKS_FILE" "$LINKS_URL" || [ ! -f "$LINKS_FILE" ]; then
  echo -e "\nThe download of $LINKS_URL appears to have failed. Aborting."
  wrapupAndExit
else
  valPrint ctrh " success."
fi
645
# Attempt to download file at EXCEPT_URL, then check that it succeeded
if [ -n "$EXCEPT_URL" ]; then
  valPrint cwtrhn "Downloading list of reporting exceptions from $EXCEPT_URL..."
  EXCEPT_DATA=$(curl --silent "$EXCEPT_URL")
  if [ -z "$EXCEPT_DATA" ]; then
    echo -e "\nThe download of the exceptions data from '$EXCEPT_URL' appears to have failed. Aborting."
    wrapupAndExit
  else
    valPrint ctrh " success."
  fi

  # Keep only the text between the "BEGIN LIST" and "END LIST" markers of the page
  EXCEPT_DATA=${EXCEPT_DATA%END LIST*}
  EXCEPT_DATA=${EXCEPT_DATA#*BEGIN LIST}
  EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"

  # Store on disk for debugging purposes
  echo "$EXCEPT_DATA" > "$EXCEPT_FILE"

  # Transfer to array for easy searching later, one element per non-empty line, together with
  # parallel arrays for marking which exceptions get used later. The lines are read one at a time
  # instead of being word-split from an unquoted expansion, because exception entries contain glob
  # characters ('*' as the page wildcard, '?' in URLs) which must never be expanded to file names.
  declare -a EXCEPT_ARRAY=()
  declare -a EXCEPT_USED=()
  declare -a EXCEPT_FOUND=()
  while IFS= read -r EXCEPT_ENTRY; do
    if [ -n "$EXCEPT_ENTRY" ]; then
      EXCEPT_ARRAY+=("$EXCEPT_ENTRY")
      EXCEPT_USED+=(0)
      EXCEPT_FOUND+=(0)
    fi
  done <<< "$EXCEPT_DATA"
  unset EXCEPT_ENTRY
fi
674
# Count the lines of the links file. The file is fed to 'wc' on stdin because passing LINKS_FILE as an
# argument would print the file name after the line count.
LINK_COUNT_STRING=$(wc -l < "$LINKS_FILE")

# Number of URLs is number of lines minus one (first line is column header row for the CSV); any
# padding that 'wc' put around the number is removed first
LINK_COUNT=$(( ${LINK_COUNT_STRING//[[:space:]]/} - 1 ))
valPrint ctrh "Found $LINK_COUNT links to process."
valPrint trh ""

## CONFIG OUTPUT ##
valPrint t "Config:"
valPrint r "\b1 Config \b0"
valPrint hn "<h3>Config</h3>"

# Prints one configuration line to the console and all logs: the label in parameter 1, followed by
# the text in parameter 3 if the flag in parameter 2 is 1, or by the text in parameter 4 otherwise
function printConfigFlag()
{
  valPrint ctrhn "$1"
  if [ "$2" -eq 1 ]; then
    valPrint ctrh "$3"
  else
    valPrint ctrh "$4"
  fi
}

valPrint ctrhn "Links to consider: "
if [ "$URL_LIMIT" -ne 0 ] && [ "$URL_LIMIT" -lt "$LINK_COUNT" ]; then
  valPrint ctrh "$((URL_LIMIT-URL_START+1)), from link $URL_START to $URL_LIMIT"
elif [ "$URL_START" -ne 1 ]; then
  valPrint ctrh "$((LINK_COUNT-URL_START+1)), from link $URL_START to $LINK_COUNT"
else
  valPrint ctrh "$LINK_COUNT"
fi

valPrint ctrh "Site query timeout: $TIMEOUT seconds"

printConfigFlag "Show OK links: " "$RECORD_OK_LINKS" "Yes" "No"
printConfigFlag "Take screenshots: " "$TAKE_PAGE_SHOT" "Yes" "No"
printConfigFlag "Suggest archive.org snapshots for NG pages: " "$SUGGEST_SNAPSHOTS_NG" "Yes" "No"
printConfigFlag "Suggest archive.org snapshots for OK pages: " "$SUGGEST_SNAPSHOTS_OK" "Yes" "No"

# The "show" settings are reported from the opposite point of view, as "ignore" settings
printConfigFlag "Ignore slash-adding redirects: " "$SHOW_SLASH" "No" "Yes"
printConfigFlag "Ignore HTTPS-upgrading redirects: " "$SHOW_HTTPS" "No" "Yes"
printConfigFlag "Ignore youtu.be redirects: " "$SHOW_YT_RD" "No" "Yes"

printConfigFlag "Check archive.org and archive.is links: " "$CHECK_ARCHIVE_LINKS" "Yes" "No"

valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
valPrint trh ""
727
## LEGEND OUTPUT ##
# Explains to the reader the status abbreviations and result codes used in the list of links. Lines
# which contain a link are printed once per report format, because each format marks up a link
# differently; the remaining lines are identical in all formats.

# Markup that wraps a URL to produce a link with the text "here" in the RTF and HTML reports
LEGEND_RTF_PRE='{\field{\*\fldinst{HYPERLINK "'
LEGEND_RTF_POST='"}}{\fldrslt here}}'
LEGEND_HTML_PRE='<a href="'
LEGEND_HTML_POST='" target="_blank">here</a>'

valPrint t "Legend:"
valPrint r "\b1 Legend \b0"
valPrint hn "<h3>Legend</h3>"

# Where to find help and the exceptions list
valPrint t "(For guidance in fixing these links, see $WIKI_MAIN. The exceptions list is at $EXCEPT_URL.)"
valPrint r "(For guidance in fixing these links, see ${LEGEND_RTF_PRE}${WIKI_MAIN}${LEGEND_RTF_POST}. The exceptions list is ${LEGEND_RTF_PRE}${EXCEPT_URL}${LEGEND_RTF_POST}.)"
valPrint h "(For guidance in fixing these links, see ${LEGEND_HTML_PRE}${WIKI_MAIN}${LEGEND_HTML_POST}. The exceptions list is ${LEGEND_HTML_PRE}${EXCEPT_URL}${LEGEND_HTML_POST}.)"

# The status codes
for LEGEND_LINE in \
   "OK = URL seems to be working" \
   "NG = URL no longer seems to work" \
   "RD = URL is redirecting to this new URL" \
   "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup" \
   "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup"
do
   valPrint trh "$LEGEND_LINE"
done

# The HTTP response code or 'curl' exit code which is printed in parentheses after the status
valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $WIKI_HTTP)"
valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see ${LEGEND_RTF_PRE}${WIKI_HTTP}${LEGEND_RTF_POST} for code reference)"
valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see ${LEGEND_HTML_PRE}${WIKI_HTTP}${LEGEND_HTML_POST} for code reference)"
valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $WIKI_CURL)"
valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see ${LEGEND_RTF_PRE}${WIKI_CURL}${LEGEND_RTF_POST} for code reference)"
valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see ${LEGEND_HTML_PRE}${WIKI_CURL}${LEGEND_HTML_POST} for code reference)"

# The Internet Archive suggestions, followed by a blank line to end the legend
for LEGEND_LINE in \
   "IA suggests = Last available snapshot returned by the Internet Archive" \
   "Try browsing = The Archive failed to return a snapshot URL, so check for a snapshot manually using this link" \
   ""
do
   valPrint trh "$LEGEND_LINE"
done

# These were only needed to build the legend
unset LEGEND_RTF_PRE LEGEND_RTF_POST LEGEND_HTML_PRE LEGEND_HTML_POST LEGEND_LINE
749
750
### MAIN LOOP ###
# Reads the CSV in LINKS_FILE one row at a time (expected format: "namespace,title,target"). For
# each row, the loop:
#   1. resolves the namespace ID to a name and builds the path of the wiki page holding the link;
#   2. skips rows that cannot or should not be checked (unknown namespace, JavaScript page, illegal
#      or non-ASCII characters, archive links when not requested, unknown URL suffix);
#   3. queries the URL with 'curl' and classifies it as EI, IW, OK, RD or NG;
#   4. drops the result if the exceptions list says that it is expected;
#   5. writes the result to the reports through valPrint, optionally with an Internet Archive
#      snapshot suggestion;
#   6. optionally takes a screenshot of a working page with headless Chrome.
# The loop ends through wrapupAndExit, either at the URL limit or after the last row.
valPrint t "Links:"
valPrint r "\b1 Links \b0"
valPrint hn "<h3>Links</h3>"
START_RUN=$(date +%s)

# Process each line of the .csv in LINKS_FILE. The file is read through descriptor 3 so that no
# command inside the loop can consume it through standard input, and it is read with 'read -r'
# rather than by word-splitting the output of 'cat' so that a row containing '*', '?' or '[' is
# never subjected to pathname expansion. The "|| [ -n ... ]" clause picks up a final row that has
# no trailing newline.
while IFS= read -r LINE <&3 || [ -n "$LINE" ]; do
   # Blank lines are not rows and are not counted
   if [ -z "$LINE" ]; then
      continue
   fi

   START_LINK=$(date +%s)
   ((LINK_NUM+=1))

   # First line is the column header row for the CSV, so let's verify that the format hasn't changed
   if [ "$SKIPPED_HEADER_ROW" -eq 0 ]; then
      if [ "$LINE" == "namespace,title,target" ]; then
         SKIPPED_HEADER_ROW=1
         LINK_NUM=0 # this line is not a link, so reset the link counter
         valPrint hn "<table>"
         continue
      else
         valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
         wrapupAndExit
      fi
   fi

   # Skip this link if we are not at URL_START yet
   if [ "$LINK_NUM" -lt "$URL_START" ]; then
      continue
   fi

   # Stop if we are at the limit declared for testing purposes
   if [ "$URL_LIMIT" -gt 0 ] && [ "$LINK_NUM" -gt "$URL_LIMIT" ]; then
      FINISHED_LIST="limit"
      wrapupAndExit
   fi

   # Print progress to screen
   if [ "$LINK_NUM" -gt 1 ]; then
      printf "\e[1A\n" # erase previous progress message so that new one appears in its place
   fi
   valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."

   # The number of the namespace is the element before the first comma on the line
   NS_ID=${LINE%%,*}

   # Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
   NS_NAME=""
   a=0
   while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
      if [ "$NS_ID" == "NULL" ]; then
         break
      elif [ "$NS_ID" -eq "${NS_IDS[$a]}" ]; then
         NS_NAME="${NS_NAMES[$a]}"
         break
      fi
      ((a+=1))
   done
   if [ "$NS_NAME" == "" ]; then
      if [ "$NS_ID" == "NULL" ]; then
         valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace (and probably the page too) is 'NULL'. Probably the link is no longer in existence on the wiki."
      else
         valPrint trs "Skipping line $LINK_NUM ('$LINE') because I could not find a name for namespace ID $NS_ID."
      fi
      ((SKIP_UNK_NS+=1))
      ((PAGE_LINKS+=1))
      continue
   fi

   # The name of the page is everything between the namespace ID and the next comma on the line (commas
   # in page names will break this). The prefix being removed is quoted so that it is matched
   # literally and not as a glob pattern.
   PAGE_NAME=${LINE#"$NS_ID",}
   PAGE_NAME=${PAGE_NAME%%,*}

   # The URL being linked to is everything after the previous two fields (this allows commas to be in
   # the URLs, but a comma in the previous field, the page name, will break this)
   URL=${LINE#"$NS_ID,$PAGE_NAME,"}

   # Build longer wiki page URLs from namespace and page names
   FULL_PAGE_PATH="https://$WIKI_PATH/$NS_NAME:$PAGE_NAME"
   LOCAL_PAGE_PATH="$NS_NAME:$PAGE_NAME"
   # Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it
   # explicitly breaks the link
   if [ "$NS_ID" -eq 0 ]; then
      FULL_PAGE_PATH="https://$WIKI_PATH/$PAGE_NAME"
      LOCAL_PAGE_PATH="$PAGE_NAME"
   fi

   # We don't want to consider wiki pages ending in .js, as the MW parser cannot reliably isolate URLS
   # in JavaScript code, so it returns erroneous links
   PAGE_NAME_SUFFIX=${PAGE_NAME##*.}
   if [ "$PAGE_NAME_SUFFIX" == "js" ]; then
      valPrint trs "Skipping URL '$URL' on line $LINK_NUM because it was found on JavaScript page '$LOCAL_PAGE_PATH'."
      ((SKIP_JS_PAGE+=1))
      ((PAGE_LINKS+=1))
      continue
   fi

   # Scan for illegal characters
   if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
      valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because it contains characters illegal in a URL."
      ((SKIP_BAD_URL+=1))
      ((PAGE_LINKS+=1))
      continue
   fi

   # If we're skipping archive links, see if this is one
   if [ "$CHECK_ARCHIVE_LINKS" -eq 0 ] && [[ ( $URL == *web.archive.org* || $URL == *archive.is* ) ]]; then
      valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to check archive links."
      ((SKIP_ARCHIVES+=1))
      ((PAGE_LINKS+=1))
      continue
   fi

   # Now we need to know if the URL is for a file or a web page. First step is to determine if the
   # URL ends in a suffix
   HAS_SUFFIX=0

   # If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
   CLEAN_URL=${URL%%\?*}

   # If the URL ends in something like "#section_15", strip everything from the '#' onward
   CLEAN_URL=${CLEAN_URL%%\#*}

   # Non-ASCII URLs are not processed any further; skip them and make reader check them
   if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
      valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I cannot handle non-ASCII characters."
      ((SKIP_NON_ASCII+=1))
      ((PAGE_LINKS+=1))
      continue
   fi

   # Isolate the characters after the last period and after the last slash
   POST_DOT=${CLEAN_URL##*.}
   POST_SLASH=${CLEAN_URL##*/}

   # If the last period comes after the last slash, then the URL ends in a suffix
   POST_DOT_LENGTH=${#POST_DOT}
   POST_SLASH_LENGTH=${#POST_SLASH}
   if [ "$POST_DOT_LENGTH" -lt "$POST_SLASH_LENGTH" ]; then
      HAS_SUFFIX=1
   else
      HAS_SUFFIX=0
   fi

   # Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
   # known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES
   IS_FILE=-1
   if [ "$HAS_SUFFIX" -eq 0 ]; then
      IS_FILE=0
   else
      # Turn off case sensitivity while we compare suffixes
      shopt -s nocasematch

      # Special case: URLs ending in something like "productID.304297400" are pages, not files, so if
      # the URL's suffix is all numbers, we are looking at the end of a web page URL
      if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
         IS_FILE=0
      fi

      # Special case: URLs ending in a parens, e.g. "ms537113(v=vs.85)", are pages
      if [[ $POST_DOT =~ ^.*[\(\)]$ ]]; then
         IS_FILE=0
      fi

      # Special case: URLs containing a '%', e.g. "10.1007%2FBF00329055", are pages
      if [[ $POST_DOT == *%* ]]; then
         IS_FILE=0
      fi

      # If we did not identify this URL as a web page above, we need to compare the suffix against known
      # file extensions
      if [ "$IS_FILE" -eq -1 ]; then
         for EXTENSION in "${HTTP_FILES[@]}"; do
            if [[ $EXTENSION == $POST_DOT ]]; then # double brackets respect the nocasematch setting
               IS_FILE=1
               break
            fi
         done
      fi

      # If we did not identify this URL as a file above, we need to compare the suffix against known
      # pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
      # needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
      if [ "$IS_FILE" -eq -1 ]; then
         for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
            if [[ $PAGE_OR_TLD == $POST_DOT ]]; then
               IS_FILE=0
               break
            fi
         done
      fi

      # Turn case sensitivity back on in Bash
      shopt -u nocasematch
   fi

   # If this suffix escaped identification as either a file, page or TLD, inform the reader
   STR_TYPE=""
   if [ "$IS_FILE" -eq -1 ]; then
      valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I encountered the unknown URL suffix '$POST_DOT'. Please add this suffix to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
      ((SKIP_UNK_SUFFIX+=1))
      continue
   elif [ "$IS_FILE" -eq 1 ]; then
      STR_TYPE="file"
      ((FILE_LINKS+=1))
   else
      STR_TYPE="page"
      ((PAGE_LINKS+=1))
   fi

   # Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
   # issue with sites that require HTTPS. AGENT (expected to be set among the script's globals) is
   # double-quoted so that its value, and not the literal text "$AGENT", is sent as the user agent.
   CURL_CODE=$(curl -o /dev/null --silent --insecure --compressed --head --user-agent "$AGENT" --max-time "$TIMEOUT" --retry 2 --write-out '%{http_code}\n' "$URL")
   CURL_ERR=$?
   CURL_RESULT=$CURL_CODE

   # Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
   if [ "$CURL_CODE" == "000" ]; then
      CURL_RESULT="$CURL_RESULT-$CURL_ERR"
   fi

   # Begin to determine our status code for this URL (EI, IW, OK, RD, or NG). STATUS is always quoted
   # when tested below, because an unquoted "??" is a glob pattern that would expand to any
   # two-character file name in the working directory.
   STATUS="??"
   NEW_URL=""
   INTERWIKI_INDEX=-1

   # First make sure that this isn't an "external internal" link to our own wiki that can be replaced
   # by "[[page_name]]". If it uses a special access URL beginning with "/w/", let it pass, as it
   # probably cannot be replaced by "[[ ]]" markup
   if [[ $URL == *$WIKI_PATH* ]] && [[ $URL != *$WIKI_PATH/w/* ]]; then
      STATUS="EI"
      ((EI_LINKS+=1))
   fi

   # If it's not, check if this is a link to a domain that we have an interwiki prefix for (also make
   # sure that it's not an archive.org link to a page from an interwiki domain)
   if [ "$STATUS" == "??" ] && [[ $URL != *web.archive.org* ]]; then
      for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
         if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]] && [[ $URL != *${INTERWIKI_DOMAINS[$i]}/w/* ]]; then
            STATUS="IW"
            ((IW_LINKS+=1))
            INTERWIKI_INDEX=$i
            break
         fi
      done
   fi

   # If we didn't match an interwiki domain, see if the status code is in our "OK" codes list
   if [ "$STATUS" == "??" ]; then
      for CODE in "${OK_CODES[@]}"; do
         if [[ $CODE == $CURL_CODE ]]; then
            STATUS="OK"
            ((OK_LINKS+=1))

            # If this is a YouTube link, we have to look at the actual page source to know if the video
            # is good or not; override the link's info if it's actually NG
            if [[ $URL == *www.youtube.com* ]]; then
               PAGE_TEXT=$(curl --silent --insecure --user-agent "$AGENT" --max-time "$TIMEOUT" "$URL" | grep "\"simpleText\":\"Video unavailable\"")
               if [ -n "$PAGE_TEXT" ]; then
                  STATUS="NG"
                  CURL_RESULT=404
                  ((OK_LINKS-=1))
                  ((NG_LINKS+=1))
               fi
            fi
            break
         fi
      done
   fi

   # If we didn't get a match with the "OK" codes, check it against the "RD" codes
   if [ "$STATUS" == "??" ]; then
      for CODE in "${RD_CODES[@]}"; do
         if [[ $CODE == $CURL_CODE ]]; then
            # Get URL header again in order to retrieve the URL we are being redirected to
            NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time "$TIMEOUT" --write-out '%{redirect_url}\n' "$URL")

            # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
            # those changes out if the user didn't ask for them
            URL_HTTP=$(echo "$URL" | sed -E 's/^https:/http:/')
            NEW_URL_HTTP=$(echo "$NEW_URL" | sed -E 's/^https:/http:/')

            # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
            NEW_URL_LENGTH=${#NEW_URL_HTTP}
            if [ "$NEW_URL_LENGTH" -lt "$MIN_URL_LENGTH" ]; then
               NEW_URL_HTTP="[new URL not retrieved]"
            fi

            # Remove slash at end of new URL, if present, so we can filter out the redirects that
            # merely add an ending slash if the user didn't ask for them
            NEW_URL_NO_SLASH=$(echo "$NEW_URL_HTTP" | sed -E 's:/$::')

            # Detect if this is a youtu.be link simply being expanded by YouTube to the full
            # youtube.com address
            YOUTU_BE=0
            if [[ $URL_HTTP == http://youtu.be* ]] && [[ $NEW_URL_HTTP == http://www.youtube.com* ]]; then
               YOUTU_BE=1
            fi

            # If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
            # wants those to be reported)
            if [ "$SHOW_HTTPS" -eq 0 ] && [ "$URL_HTTP" == "$NEW_URL_HTTP" ]; then
               valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show http->https upgrades, and I was redirected to '$NEW_URL'."
               STATUS="OK"
               ((OK_LINKS+=1))
               ((SKIP_HTTPS_UP+=1))
            # If the URLs match besides an added ending slash, then the link is OK (unless user wants
            # those to be reported)
            elif [ "$SHOW_SLASH" -eq 0 ] && [ "$URL_HTTP" == "$NEW_URL_NO_SLASH" ]; then
               valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show added trailing slashes, and I was redirected to '$NEW_URL'."
               STATUS="OK"
               ((OK_LINKS+=1))
               ((SKIP_SLASH_ADD+=1))
            elif [ "$YOUTU_BE" -eq 1 ]; then
               # We have to look at the actual page source to know if a YouTube video is good or not
               PAGE_TEXT=$(curl --silent --insecure --user-agent "$AGENT" --max-time "$TIMEOUT" "$NEW_URL" | grep "\"simpleText\":\"Video unavailable\"")
               if [ -n "$PAGE_TEXT" ]; then
                  STATUS="NG"
                  ((NG_LINKS+=1))
               else
                  if [ "$SHOW_YT_RD" -eq 0 ]; then
                     valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'."
                     STATUS="OK"
                     ((OK_LINKS+=1))
                     ((SKIP_YOUTU_BE+=1))
                  else
                     STATUS="RD"
                     ((RD_LINKS+=1))
                  fi
               fi
            else
               STATUS="RD"
               ((RD_LINKS+=1))
            fi
            break
         fi
      done
   fi

   # If we didn't get a match with the "RD" codes, check it against the "NG" codes
   if [ "$STATUS" == "??" ]; then
      for CODE in "${NG_CODES[@]}"; do
         if [[ $CODE == $CURL_CODE ]]; then
            STATUS="NG"
            ((NG_LINKS+=1))
            break
         fi
      done
   fi

   # If we didn't match a known status code, advise the reader
   if [ "$STATUS" == "??" ]; then
      valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I encountered the unknown response code $CURL_CODE."
      ((SKIP_UNK_CODE+=1))
      continue
   fi

   # Check problem links against exceptions list before proceeding
   FOUND_EXCEPT=0
   if [ "$STATUS" != "OK" ] && [ -n "$EXCEPT_URL" ]; then
      # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
      EXPECT_CODE="$CURL_RESULT"
      if [ "$STATUS" == "EI" ]; then
         EXPECT_CODE="EI"
      elif [ "$STATUS" == "IW" ]; then
         EXPECT_CODE="IW"
      fi

      # Look for link in exceptions list and make sure the listed result code and wiki page also match.
      # Each entry is read as "code,URL,page": the code is everything before the first comma, the page
      # is everything after the last comma, and the URL is whatever lies between them.
      for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
         EXCEPT_LINE="${EXCEPT_ARRAY[$i]}"

         # Undo any HTML-encoding from the wiki page; for now we just worry about the ampersand, as most
         # other HTML-encoded characters are not found in URLs
         EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/\&amp;/\&/g')

         # Check for URL match. The entry's URL goes in its own variable, EXCEPT_LINK, so that the
         # EXCEPT_URL setting tested above is not overwritten while the list is being searched.
         EXCEPT_LINK="${EXCEPT_LINE#*,}"
         EXCEPT_LINK="${EXCEPT_LINK%,*}"
         if [ "$EXCEPT_LINK" != "$URL" ]; then
            continue
         fi

         # Check for page name match
         EXCEPT_PAGE="${EXCEPT_LINE##*,}"
         EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
         if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == "$LOCAL_PAGE_PATH" ]; then
            ((EXCEPT_FOUND[i]+=1))
            valPrint trs "Found exception '$URL' on page '$LOCAL_PAGE_PATH'."

            # Check for result code match
            EXCEPT_CODE=${EXCEPT_LINE%%,*}
            if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
               FOUND_EXCEPT=1
               ((EXCEPT_USED[i]+=1))
               valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because its expected result, '$EXPECT_CODE', is in the exceptions list."

               if [ "$STATUS" == "EI" ]; then
                  ((SKIP_EXPECT_EI+=1))
               elif [ "$STATUS" == "IW" ]; then
                  ((SKIP_EXPECT_IW+=1))
               elif [ "$STATUS" == "RD" ]; then
                  ((SKIP_EXPECT_RD+=1))
               else
                  ((SKIP_EXPECT_NG+=1))
               fi

               break
            fi
         fi
      done
   fi
   if [ "$FOUND_EXCEPT" -eq 1 ]; then
      continue
   fi

   # If appropriate, record this link to the log, with clickable URLs when possible
   if [ "$STATUS" != "OK" ] || [ "$RECORD_OK_LINKS" -eq 1 ]; then
      # Prepare 'curl' result in parentheses to print after status code, unless this is an "IW" or "EI"
      # link, in which case showing the status code doesn't make sense. Adjust spacing after string to
      # ensure TXT and RTF reports have aligned columns of results.
      CURL_STR_H=" ($CURL_RESULT)"
      CURL_STR_T="$CURL_STR_H"
      CURL_STR_R="$CURL_STR_H "
      if [ "$STATUS" == "IW" ] || [ "$STATUS" == "EI" ]; then
         CURL_STR_H=""
         CURL_STR_T=" "
         CURL_STR_R=" "
      fi

      # Record link and its wiki page in TXT, RTF, and HTML markup
      valPrint t "${STATUS}${CURL_STR_T} $STR_TYPE $URL"
      valPrint t " linked from $FULL_PAGE_PATH"
      valPrint r "${STATUS}${CURL_STR_R}${RTF_TABS}$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
      valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
      valPrint hn "<tr><td style=\"white-space:nowrap\">${STATUS}${CURL_STR_H}</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
      valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"

      # Place vertical space here since we won't be printing anything more about this link
      if [ "$STATUS" == "OK" ] && [ "$SUGGEST_SNAPSHOTS_OK" -eq 0 ]; then valPrint tr ""; valPrint hs ""; fi

      # Record redirect URL if one was given by a 3xx response page
      if [ "$STATUS" == "RD" ]; then
         valPrint ts " Server suggests $NEW_URL"
         valPrint rs " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
         valPrint hs "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
      fi

      # Notify reader if we can use an intrawiki link for this URL
      if [ "$STATUS" == "EI" ]; then
         INTRA_PAGE=${URL#*://*/}
         valPrint ts " Just use [[$INTRA_PAGE]]"
         valPrint rs " Just use [[$INTRA_PAGE]]"
         valPrint hs "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
      fi

      # Notify reader if we can use an interwiki prefix for this URL
      if [ "$STATUS" == "IW" ]; then
         INTER_PAGE=$(echo "$URL" | sed 's/.*\///')
         valPrint ts " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
         valPrint rs " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
         valPrint hs "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
      fi

      # Query Internet Archive for latest "OK" snapshot for "NG" page (or for an "OK" page, if the user
      # asked for that as well)
      if [[ ( $STATUS == "NG" && $SUGGEST_SNAPSHOTS_NG -eq 1 ) || ( $STATUS == "OK" && $SUGGEST_SNAPSHOTS_OK -eq 1 ) ]]; then

         # We need to watch out for the rate limit or we'll get locked out; look at how much time has
         # elapsed and then wait the remainder between that and how long of a wait we think is needed
         # to avoid the dreaded "Too Many Requests" response. 5 seconds is just a guess.
         CUR_TIME=$(date +%s)
         WAIT_REMAINDER=$((5 - CUR_TIME + START_LINK))
         if [ "$WAIT_REMAINDER" -gt 0 ]; then
            valPrint t "Waiting $WAIT_REMAINDER second(s) to conform to Archive.org rate limit."
            sleep "$WAIT_REMAINDER"
         fi

         # Issue query to the API
         ARCHIVE_QUERY=$(curl --silent --max-time "$TIMEOUT" "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")

         # Notify reader if we hit the rate limit and just keep going. The asterisks must stay outside
         # the quotes; if they are quoted, they are compared literally and this branch never runs.
         if [[ "$ARCHIVE_QUERY" == *"Too Many Requests"* ]]; then
            valPrint t " IA has rate-limited us!"
            valPrint r " IA has rate-limited us!"
            valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td>(hit the API rate limit!)</td></tr>"
         # If a "closest" snapshot was received, inform reader
         elif [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
            # In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
            ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')

            # ...isolate "url" property in the response that follows the "closest" tag
            SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
            SNAPSHOT_URL=${SNAPSHOT_URL#*\"url\": \"} # everything after '"url": "'
            SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'

            # Remove the port 80 part that IA often adds to the URL, as it's superfluous
            SNAPSHOT_URL=$(echo "$SNAPSHOT_URL" | sed 's/:80//')

            # Inform the reader of the snapshot URL
            valPrint ts " IA suggests $SNAPSHOT_URL"
            valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
            valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
         else # Otherwise give a generic Wayback Machine link for this URL, which might work
            valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
            valPrint rs " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
            valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
         fi
      fi
   fi

   # If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
   if [ "$IS_FILE" -eq 0 ] && [ "$TAKE_PAGE_SHOT" -eq 1 ] && [ "$STATUS" == "OK" ]; then
      # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
      SHOT_NAME=$(echo "$URL" | sed 's/https*\:\/\///' | sed 'y/:\//__/')
      SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"

      # Don't take screenshot if we already encountered this page and screenshotted it
      if [ ! -f "$SHOT_FILE" ]; then
         "$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 "$URL" > /dev/null 2>&1
         if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
            mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
         else
            valPrint trhs "Screenshot of URL $URL seems to have failed!"
         fi
      else
         valPrint trhs "Skipping screenshot of URL '$URL' because file '$SHOT_FILE' already exists."
      fi
   fi
done 3< "$LINKS_FILE"
FINISHED_LIST="yes"
wrapupAndExit
Note: See TracBrowser for help on using the repository browser.