source: Validate External Links/validate_external_links.sh@ 1141

Last change on this file since 1141 was 1141, checked in by iritscen, 4 years ago

Committing the changes to Val which I meant to commit over a week ago. I committed everything but the updated script itself. See last Val commit message for list of changes.

File size: 48.7 KB
RevLine 
[1064]1#!/bin/bash
2
3# Validate External Links by Iritscen
[1141]4#
5# Validates a list of external links in CSV format. The resulting logs are produced in three formats:
6# - TXT (for easy diffing with an earlier log)
7# - RTF (for reading as a local file with clickable links)
8# - HTML (for uploading as a web page).
9# Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
10#
[1064]11# Recommended rule:
[1118]12# |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|
[1141]13#
14# Table of contents (sections of script in order of appearance, not execution):
15# • Globals
16# • Help Output
17# • Setup
18# • Utility Functions
19# • Summary Output
20# • Initialization
21# • Data Sourcing
22# • Config Output
23# • Legend Output
24# • Main Loop
[1064]25
# Make newline the only field separator, so that unquoted expansions and the main loop split their
# input into whole lines rather than into space-separated words
IFS=$'\n'
29
### GLOBALS ###
# Settings -- these will be changed from their defaults by the arguments passed in to the script
LINKS_URL=""         # use 'curl' to download file with links from this location (can be file://)
EXCEPT_URL=""        # 'curl' will access this wiki page with a list of exceptions for NG results
OUTPUT_DIR=""        # place reports and all other output in a folder inside this existing folder
RECORD_OK_LINKS=0    # record response code to the log even when it's a value in OK_CODES
SHOW_SLASH=0         # record response code to the log when a slash is added to the end of a URL
SHOW_HTTPS=0         # record response code to the log when "http" is upgraded to "https"
SHOW_YT_RD=0         # record response code to the log when a youtu.be URL is expanded to the full URL
SUGGEST_SNAPSHOTS=0  # query the Internet Archive for a possible snapshot URL for each NG page
SKIP_ARCHIVE_LINKS=0 # don't check URLs under the archive.org domain
TAKE_PAGE_SHOT=0     # take a screenshot of each OK page
TIMEOUT=10           # time to wait for a response when querying a site
CHROME_PATH=""       # path to a copy of Google Chrome that has the command-line screenshot feature
URL_START=1          # start at this URL in LINKS_FILE (1 by default)
URL_LIMIT=0          # if non-zero, stop at this URL in LINKS_FILE
UPLOAD_INFO=""       # path to a file on your hard drive with the login info needed to upload a report

# Fixed strings -- see the occurrences of these variables to learn their purpose
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36 OPR/69.0.3686.77"
ARCHIVE_API="http://archive.org/wayback/available"
ARCHIVE_GENERIC="https://web.archive.org/web/*"
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
CHROME_SCREENSHOT="screenshot.png"
EXCEPT_FILE_NAME="exceptions.txt"
EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
WIKI_CURL="https://wiki.oni2.net/Validate_External_Links/Curl_codes"
WIKI_HTTP="https://wiki.oni2.net/Validate_External_Links/HTTP_codes"
WIKI_MAIN="https://wiki.oni2.net/Validate_External_Links"
WIKI_ME="http://iritscen.oni2.net"
# Folder that holds this script. The expansions are quoted so that a script path containing spaces
# or glob characters is taken literally, and '&&' keeps 'pwd' from reporting the caller's directory
# when the 'cd' fails (THIS_DIR is left empty in that case).
THIS_DIR=$(cd -- "$(dirname -- "$0")" && pwd)
WORKING_DIR=$(pwd)
WIKI_PATH="wiki.oni2.net"
63
# OniGalore's current namespaces as two parallel arrays: NS_IDS[i] is the numeric ID of the namespace
# whose name is NS_NAMES[i]
NS_IDS=(
  -2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  100 101 102 103 104 105 108 109 110 111
)
NS_NAMES=(
  "Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File"
  "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk"
  "Category" "Category_talk"
  "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk"
)

# URL suffixes that stand for files, and suffixes that stand for pages (or top-level domains). The
# kind of suffix decides whether the script tries to take a screenshot of the URL or just gets its
# HTTP code.
HTTP_FILES=(
  3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg
  last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png py rar tga TRMA txt vbs wav wmv xaf xml zip
)
HTTP_TLDS_AND_PAGES=(
  action ars asp aspx cfm cgi com css de full htm html it js jsp net org pgi php php3 phtml pl
  ru shtml stm uk x
)

# HTTP response codes sorted into OK (good), RD (redirections) and NG (no good). Pages that return
# OK codes will be screenshotted. Remember to update http_codes.txt if you add a new code.
OK_CODES=(200 401 405 406 418 501)
RD_CODES=(301 302 303 307 308)
NG_CODES=(000 400 403 404 410 500 502 503 530)

# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
# transcluded text, and if the transclusion fails, then the braces show up in the URL
ILLEGAL_CHARS="{ }"

# Length of the shortest URL possible (http://a.co), used for sanity-checking some URLs
MIN_URL_LENGTH=11

# Parallel arrays: INTERWIKI_PREFIXES[i] is the prefix that can be used in place of a normal external
# link to the site INTERWIKI_DOMAINS[i]
INTERWIKI_PREFIXES=(
  commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource
  wikispecies wikiversity wikivoyage wikt wp
)
INTERWIKI_DOMAINS=(
  commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org
  wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org
  wikiversity.org wikivoyage.org wiktionary.org wikipedia.org
)

# Main loop progress and findings
LINK_NUM=0
EI_LINKS=0 IW_LINKS=0 OK_LINKS=0 RD_LINKS=0 NG_LINKS=0
SKIP_UNK_NS=0 SKIP_JS_PAGE=0 SKIP_BAD_URL=0 SKIP_NON_ASCII=0 SKIP_UNK_SUFFIX=0 SKIP_UNK_CODE=0
SKIP_EXPECT_NG=0 SKIP_EXPECT_EI=0 SKIP_EXPECT_IW=0
SKIP_HTTPS_UP=0 SKIP_SLASH_ADD=0 SKIP_YOUTU_BE=0
SKIP_ARCHIVE_ORG=0
FILE_LINKS=0 PAGE_LINKS=0
SKIPPED_HEADER_ROW=0
FINISHED_LIST="no"
START_RUN=0 END_RUN=0
[1064]118
119
### HELP OUTPUT ###
# Prints a pseudo-man page to stdout. The page text is meant to stay within 80 columns:
# 234567890123456789012345678901234567890123456789012345678901234567890123456789
# The here-document delimiter is quoted, so the page is emitted literally with no expansion.
function printHelp()
{
  cat <<'HELP_PAGE'

NAME
       Validate External Links

SYNOPSIS
       validate_external_links.sh --help
       validate_external_links.sh --links URL --output DIR [--exceptions URL]
          [--record-ok-links] [--show-added-slashes] [--show-https-upgrades]
          [--show-yt-redirects] [--suggest-snapshots] [--skip-archive-links]
          [--take-screenshots FILE] [--timeout NUM] [--start-url NUM]
          [--end-url NUM] [--upload FILE]

DESCRIPTION
       This script parses a list of external links found in the OniGalore wiki
       (which is dumped by the Oni2.net domain periodically in a particular
       format), validates them using the Unix tool 'curl', and produces a report
       of which links were "OK" (responded positively to an HTTP query), which
       were "RD" (responded with a 3xx redirect code), which could be "IW"
       (interwiki) links, which are "EI" (external internal) links and could be
       intrawiki links, and which were "NG" (no good; a negative response to the
       query). This report can then be automatically uploaded to the location of
       your choice. The script can also suggest Internet Archive snapshots for
       "NG" links, and take screenshots of "OK" links for visual verification by
       the reader that the page in question is the one intended to be displayed.

       You must pass this script the URL at which the list of links is found
       (--links) and the path where the directory of logs should be outputted
       (--output). All other arguments are optional.

OPTIONS
       --help                  Show this page.
       --links URL             (required) URL from which to download the CSV
                               file with external links. Note that this URL can
                               be a local file if you supply a file:// path.
       --output DIR            (required) Unix path to directory in which Val
                               should place its reports.
       --exceptions URL        In order to remove links from the report which
                               Val finds an issue with but which you regard as
                               OK, list those desired exceptions on a wiki page.
                               See the sample file "exceptions.pdf" for the
                               required format of the page. Note that this URL
                               can point to a local file if you supply a path
                               beginning with "file://".
       --record-ok-links       Log a link in the report even if its response
                               code is "OK".
       --show-added-slashes    Report on redirects that simply add a '/' to the
                               end of the URL.
       --show-https-upgrades   Report on redirects that simply upgrade a
                               "http://" URL to a "https://" URL.
       --show-yt-redirects     Report on redirects that expand a youtu.be URL.
       --suggest-snapshots     Query the Internet Archive for a possible
                               snapshot URL for each "NG" page.
       --skip-archive-links    Don't check links that are already pointing to
                               a page on the Internet Archive.
       --take-screenshots FILE Call the Google Chrome binary at this path to
                               take screenshots of each "OK" page.
       --timeout NUM           Wait this many seconds for a site to respond. The
                               default is 10.
       --start-url NUM         Start at this link in the links CSV file.
       --end-url NUM           Stop at this link in the links CSV file.
       --upload FILE           Upload report using the credentials and path
                               given in this local text file. See sftp_login.txt
                               for template.

BUGS
       The script cannot properly parse any line in the external links file
       which contains a comma in the name of the wiki page containing a link.
       Commas in the link itself are not an issue.
HELP_PAGE
}
196
197
### SETUP ###
# If first argument is a help request, or if nothing was passed in at all, print help page and quit
if [ "$#" -eq 0 ] || [ "$1" == "--help" ]; then
  printHelp | less
  exit 0
fi

# Parse arguments as long as there are more arguments to process
while (( "$#" )); do
  # An option that takes a value must be followed by one. Without this check, 'shift 2' on a lone
  # trailing option fails without shifting anything and this loop never terminates.
  case "$1" in
    --links | --exceptions | --output | --take-screenshots | --timeout | --start-url | --end-url | --upload )
      if [ "$#" -lt 2 ]; then
        echo "Error: The argument $1 requires a value. Aborting."
        exit 1
      fi;;
  esac

  case "$1" in
    --links )               LINKS_URL="$2";                      shift 2;;
    --exceptions )          EXCEPT_URL="$2";                     shift 2;;
    --output )              OUTPUT_DIR="$2";                     shift 2;;
    --record-ok-links )     RECORD_OK_LINKS=1;                   shift;;
    --show-added-slashes )  SHOW_SLASH=1;                        shift;;
    --show-https-upgrades ) SHOW_HTTPS=1;                        shift;;
    --show-yt-redirects )   SHOW_YT_RD=1;                        shift;;
    --suggest-snapshots )   SUGGEST_SNAPSHOTS=1;                 shift;;
    --skip-archive-links )  SKIP_ARCHIVE_LINKS=1;                shift;;
    --take-screenshots )    TAKE_PAGE_SHOT=1; CHROME_PATH="$2";  shift 2;;
    --timeout )             TIMEOUT="$2";                        shift 2;;
    --start-url )           URL_START="$2";                      shift 2;;
    --end-url )             URL_LIMIT="$2";                      shift 2;;
    --upload )              UPLOAD_INFO="$2";                    shift 2;;
    * )                     echo "Invalid argument $1 detected. Aborting."; exit 1;;
  esac
done

# TIMEOUT, URL_START and URL_LIMIT are later used in integer comparisons and arithmetic, so refuse
# anything that is not a whole number now instead of failing in the middle of a run
for NUMERIC_ARG in "$TIMEOUT" "$URL_START" "$URL_LIMIT"; do
  case "$NUMERIC_ARG" in
    "" | *[!0-9]* )
      echo "Error: The --timeout, --start-url and --end-url arguments must be whole numbers, but I received \"$NUMERIC_ARG\". Aborting."
      exit 1;;
  esac
done
unset NUMERIC_ARG
225
# If the required arguments were not supplied, print an error and quit
if [ -z "$LINKS_URL" ] || [ -z "$OUTPUT_DIR" ]; then
  echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
  exit 2
fi

# If user wants screenshots, make sure path to Chrome was passed in and is valid
if [ "$TAKE_PAGE_SHOT" -eq 1 ]; then
  if [ ! -f "$CHROME_PATH" ]; then
    echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
    exit 3
  fi
fi

# Check that UPLOAD_INFO exists, if this argument was supplied
if [ -n "$UPLOAD_INFO" ] && [ ! -f "$UPLOAD_INFO" ]; then
  echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
  exit 4
fi

# Check that OUTPUT_DIR is a directory
if [ ! -d "$OUTPUT_DIR" ]; then
  echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
  exit 5
fi

# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
# NICE_TIME is meant to match the format of LINKS_DATE, which is extracted from the extlinks file's
# "Last-Modified" header; that format has a zero-padded hour, hence %H rather than the space-padded %k
NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %H:%M:%S GMT")
OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
SHOT_PATH="$OUTPUT_PATH/Screenshots"
LOG_NAME="ValExtLinks report"
LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt"
LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf"
LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm"
mkdir -- "$OUTPUT_PATH"

# Check that 'mkdir' succeeded before trying to create anything inside the new folder
if [ ! -d "$OUTPUT_PATH" ]; then
  echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_DIR. Aborting."
  exit 6
fi

# The screenshot folder is only needed when screenshots were requested
if [ "$TAKE_PAGE_SHOT" -eq 1 ]; then
  mkdir -- "$SHOT_PATH"
  if [ ! -d "$SHOT_PATH" ]; then
    echo "Error: I could not create the folder \"Screenshots\" inside the directory $OUTPUT_PATH. Aborting."
    exit 6
  fi
fi
272
# Get date on the file at LINKS_URL; it is printed to the logs by the header functions. The header
# name is matched case-insensitively because 'curl' prints it in lower case for HTTP/2 responses,
# and the carriage return that ends each HTTP header line is removed so it does not end up in the
# reports.
LINKS_DATE=$(curl --silent --head "$LINKS_URL" | tr -d '\r' | grep -i -m 1 "^Last-Modified:")
if [ -z "$LINKS_DATE" ]; then
  echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
  exit 7
fi
LINKS_DATE=${LINKS_DATE#*:}                             # drop the header name and its colon
LINKS_DATE=${LINKS_DATE#"${LINKS_DATE%%[![:space:]]*}"} # drop the whitespace before the date
280
281
### UTILITY FUNCTIONS ###
# Starts the TXT log with the report title, the generation time (NICE_TIME), the date of the link
# data (LINKS_DATE), the author's contact URL (WIKI_ME) and a blank line, one log line apiece
function printTXTheader()
{
  local headerLine
  for headerLine in \
    "Validate External Links report" \
    "generated $NICE_TIME" \
    "from data of $LINKS_DATE" \
    "script by Iritscen (contact: $WIKI_ME)" \
    ""
  do
    valPrint t "$headerLine"
  done
}
292
# Starts the RTF log: the RTF document preamble, a centered title block giving NICE_TIME, LINKS_DATE
# and a "contact" hyperlink to WIKI_ME, and finally a paragraph definition without the centering
# control word so that the text which follows is left-justified. The markup is assembled in a local
# variable first and then handed to valPrint in a single call.
function printRTFheader()
{
  local rtfPreamble="{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww12600\viewh12100\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0

\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
generated $NICE_TIME\\
from data of $LINKS_DATE\\
script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$WIKI_ME\"}}{\fldrslt contact}})
\\
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
\cf0 "

  valPrint r "$rtfPreamble"
}
311
# Ends the RTF log by writing the brace that closes the group opened at the top of the RTF header
function printRTFfooter()
{
  local closingBrace="}"
  valPrint r "$closingBrace"
}
317
# Starts the HTML log: opens the document and its body, then writes the report title as a heading
# followed by NICE_TIME, LINKS_DATE and a "contact" link to WIKI_ME. The markup is assembled in a
# local variable first and then handed to valPrint in a single call.
function printHTMheader()
{
  local htmlOpening="<html>
<head>
<title>Validate External Links report</title>
</head>
<body>
<h2>Validate External Links report</h2>
<h3>generated $NICE_TIME<br />
from data of $LINKS_DATE<br />
script by Iritscen (<a href=\"$WIKI_ME\" target=\"_blank\">contact</a>)</h3>"

  valPrint h "$htmlOpening"
}
331
# Ends the HTML log by closing the <body> and <html> elements opened by the HTML header
function printHTMfooter()
{
  local htmlClosing="</body>
</html>"
  valPrint h "$htmlClosing"
}
338
# The central logging function.
# Parameter 1 is a string composed of one or more of these characters:
#   'c' = print to the console        't' = append to the TXT log (LOG_TXT)
#   'r' = append to the RTF log (LOG_RTF)   'h' = append to the HTML log (LOG_HTM)
#   'n' = don't end the line: no newline on console/TXT, no "\line" in RTF, no "<br />" in HTML
#   's' = add an extra line of spacing after the text (an empty table row in the HTML log)
#   'w' = don't pass console output through 'fmt' ("fmt" fits the output to an 80-column CLI but
#         can break special formatting and the 'n' option)
# Parameter 2 is the text to print.
# The text is always written with printf '%s', never with 'echo' or 'echo -e', so that backslashes
# in it (e.g. in a logged URL) are not interpreted as escape sequences and text which looks like an
# option (e.g. "-n") is not swallowed.
function valPrint()
{
  local targets="$1"
  local text="$2"

  # Console: 'n' wins over 'w', which wins over 's'; anything else is wrapped to 80 columns
  if [[ "$targets" == *c* ]]; then
    if [[ "$targets" == *n* ]]; then
      printf '%s' "$text"
    elif [[ "$targets" == *w* ]]; then
      printf '%s\n' "$text"
    elif [[ "$targets" == *s* ]]; then
      printf '%s\n\n' "$text"
    else
      printf '%s\n' "$text" | fmt -w 80
    fi
  fi

  # TXT log
  if [[ "$targets" == *t* ]]; then
    if [[ "$targets" == *n* ]]; then
      printf '%s' "$text" >> "$LOG_TXT"
    elif [[ "$targets" == *s* ]]; then
      printf '%s\n\n' "$text" >> "$LOG_TXT"
    else
      printf '%s\n' "$text" >> "$LOG_TXT"
    fi
  fi

  # RTF log: the file always gets a newline; whether the reader sees a line break is decided by
  # the "\line" control word
  if [[ "$targets" == *r* ]]; then
    if [[ "$targets" == *n* ]]; then
      printf '%s\n' "$text" >> "$LOG_RTF"
    elif [[ "$targets" == *s* ]]; then
      printf '%s\n' "$text\line\line" >> "$LOG_RTF"
    else
      printf '%s\n' "$text\line" >> "$LOG_RTF"
    fi
  fi

  # HTML log: here 's' is checked before 'n'
  if [[ "$targets" == *h* ]]; then
    if [[ "$targets" == *s* ]]; then
      printf '%s\n' "$text<tr><td>&nbsp;</td></tr>" >> "$LOG_HTM"
    elif [[ "$targets" == *n* ]]; then
      printf '%s\n' "$text" >> "$LOG_HTM"
    else
      printf '%s\n' "$text<br />" >> "$LOG_HTM"
    fi
  fi
}
385
# Prints the noun in parameter 1, pluralized if the number in parameter 2 is not 1. A noun ending in
# "x" gets "es" (suffix -> suffixes); any other noun gets "s". The noun is quoted and written with
# printf so that it is printed exactly as given, without word splitting or file name expansion.
function pluralCheckNoun()
{
  local noun="$1"
  local count="$2"

  if [ "$count" -ne 1 ]; then
    if [[ "$noun" == *x ]]; then
      printf '%s\n' "${noun}es"
    else
      printf '%s\n' "${noun}s"
    fi
  else
    printf '%s\n' "$noun"
  fi
}
399
# Prints the form of "to be" that agrees with the count in parameter 1: "is" for exactly 1,
# otherwise "are"
function pluralCheckIs()
{
  local verb="is"
  if [ "$1" -ne 1 ]; then
    verb="are"
  fi
  echo "$verb"
}
409
# Prints the past-tense form of "to be" that agrees with the count in parameter 1: "was" for
# exactly 1, otherwise "were"
function pluralCheckWas()
{
  local verb="was"
  if [ "$1" -ne 1 ]; then
    verb="were"
  fi
  echo "$verb"
}
419
# Prints the article "a " (with its trailing space) when the count in parameter 1 is exactly 1;
# prints nothing for any other count
function pluralCheckA()
{
  if ! [ "$1" -eq 1 ]; then
    return 0
  fi
  echo "a "
}
427
# Prints the article "an " (with its trailing space) when the count in parameter 1 is exactly 1;
# prints nothing for any other count
function pluralCheckAn()
{
  if ! [ "$1" -eq 1 ]; then
    return 0
  fi
  echo "an "
}
435
# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
# reports being saved to disk have already been closed.
#
# The file at UPLOAD_INFO is expected to hold one "user:", "pw:", "port:" and "path:" line each.
# Only the first line that BEGINS with a marker is used for that field, so a marker-like string
# inside another field (e.g. a password containing "user:") cannot be mistaken for it. Everything
# after the marker is passed on unchanged.
#
# The values are handed to the 'expect' script at THIS_DIR/EXPECT_SCRIPT_NAME as seven separate,
# quoted arguments: local report path, user name, password, port, remote path, remote file name.
# NOTE(review): the password travels on the command line of 'expect' and so is visible in the
# process list while the upload runs; changing that would mean changing the expect script's
# interface, so it is only flagged here.
function uploadReport()
{
  valPrint c "Uploading HTML report..."

  SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
  SFTP_USER_NAME_MARKER="user:"
  SFTP_PASSWORD_MARKER="pw:"
  SFTP_PORT_MARKER="port:"
  SFTP_PATH_MARKER="path:"

  # For each field: take the first line starting with the marker, then strip the marker itself
  SFTP_USER_NAME=$(grep -m 1 -- "^$SFTP_USER_NAME_MARKER" "$UPLOAD_INFO")
  SFTP_USER_NAME=${SFTP_USER_NAME#"$SFTP_USER_NAME_MARKER"}
  SFTP_PASSWORD=$(grep -m 1 -- "^$SFTP_PASSWORD_MARKER" "$UPLOAD_INFO")
  SFTP_PASSWORD=${SFTP_PASSWORD#"$SFTP_PASSWORD_MARKER"}
  SFTP_PORT=$(grep -m 1 -- "^$SFTP_PORT_MARKER" "$UPLOAD_INFO")
  SFTP_PORT=${SFTP_PORT#"$SFTP_PORT_MARKER"}
  SFTP_PATH=$(grep -m 1 -- "^$SFTP_PATH_MARKER" "$UPLOAD_INFO")
  SFTP_PATH=${SFTP_PATH#"$SFTP_PATH_MARKER"}

  # Quoted so that each value stays a single argument and is not subject to file name expansion
  expect "$SCRIPT_PATH" "$LOG_HTM" "$SFTP_USER_NAME" "$SFTP_PASSWORD" "$SFTP_PORT" "$SFTP_PATH" "$LOG_NAME.htm"

  valPrint c "Report was uploaded, unless an error message appears above."
}
460
# Prints the session summary to the console and all three logs, closes the RTF and HTML markup,
# uploads the HTML report if --upload was given and the run was not canceled, then exits with
# status 0. Besides being called at the end of a run and when a run has to be aborted, it is
# installed below as the handler for the interrupt signal.
#
# Reads the progress/finding counters from the Globals section plus LINK_COUNT, URL_START,
# FINISHED_LIST, START_RUN and UPLOAD_INFO. FINISHED_LIST is compared against "yes" (the whole list
# was processed) and "no" (its initial value); any other value, such as "limit", is treated as an
# intentional early stop.
function wrapupAndExit()
{
  local elapsedSeconds=0

  # Get off progress line on console, drop down a line from last link in log, and close HTML table
  valPrint ctr ""
  valPrint h "</table><br />"

  # If we didn't finish processing the last URL, then the iterator is one too high
  if [ "$FINISHED_LIST" != "yes" ]; then
    LINK_NUM=$((LINK_NUM-1))
    if [ "$FINISHED_LIST" == "no" ]; then
      valPrint ctrh "The session was canceled by the user."
    fi
  fi

  # Generate string with elapsed time. START_RUN is still 0 if we got here before the main loop
  # started (e.g. a failed download), in which case no time was spent checking links.
  END_RUN=$(date +%s)
  if [ "$START_RUN" -gt 0 ]; then
    elapsedSeconds=$((END_RUN-START_RUN))
  fi
  ELAPSED=$(printf '%d min. %d sec. elapsed' "$((elapsedSeconds/60))" "$((elapsedSeconds%60))")

  # LINK_COUNT is not set yet if the links file was never downloaded
  LINK_COUNT=${LINK_COUNT:-0}

  # Do some math on results of session
  LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
  if [ "$LINKS_PROCESSED" -lt 0 ]; then
    LINKS_PROCESSED=0 # we stopped before reaching the first requested link
  fi
  LINK_PROBLEMS=$((EI_LINKS+IW_LINKS+RD_LINKS+NG_LINKS))
  LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
  LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
  TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
  LINKS_CHECKED=$((LINKS_PROCESSED-LINK_ERRORS))

  ## SUMMARY OUTPUT ##
  valPrint ct "Summary ($ELAPSED):"
  valPrint r "\b1 Summary \b0 ($ELAPSED)"
  valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
  valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link "$LINK_COUNT") (there $(pluralCheckWas "$FILE_LINKS") $FILE_LINKS file $(pluralCheckNoun link "$FILE_LINKS") and $PAGE_LINKS page $(pluralCheckNoun link "$PAGE_LINKS"))."

  # Print processed link totals
  if [ "$LINKS_PROCESSED" -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link "$LINKS_PROCESSED"):"; fi
  if [ "$LINK_ERRORS" -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link "$LINK_ERRORS") could not be processed"; fi
  if [ "$SKIP_ARCHIVE_ORG" -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVE_ORG Archive.org $(pluralCheckNoun link "$SKIP_ARCHIVE_ORG") $(pluralCheckWas "$SKIP_ARCHIVE_ORG") not checked"; fi
  if [ "$LINK_PROBLEMS" -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS processed $(pluralCheckNoun link "$LINK_PROBLEMS") had $(pluralCheckAn "$LINK_PROBLEMS")$(pluralCheckNoun issue "$LINK_PROBLEMS")"; fi
  if [ "$LINKS_EXCEPTED" -gt 0 ]; then valPrint ctr "  (excepted $LINKS_EXCEPTED $(pluralCheckNoun link "$LINKS_EXCEPTED") from report)"; valPrint h "&nbsp;&nbsp;(excepted $LINKS_EXCEPTED $(pluralCheckNoun link "$LINKS_EXCEPTED") from report)"; fi
  if [ "$OK_LINKS" -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link "$OK_LINKS") $(pluralCheckWas "$OK_LINKS") OK"; fi
  if [ "$TRIVIAL_RDS" -gt 0 ]; then valPrint ctr "  (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection "$TRIVIAL_RDS") as OK)"; valPrint h "&nbsp;&nbsp;(counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection "$TRIVIAL_RDS") as OK)"; fi

  # Print excepted link totals
  if [ "$LINKS_EXCEPTED" -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED $(pluralCheckNoun link "$LINKS_EXCEPTED") excepted (see RTF or TXT report for specific links):"; fi
  if [ "$SKIP_EXPECT_NG" -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link "$NG_LINKS")"; fi
  if [ "$SKIP_EXPECT_EI" -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link "$EI_LINKS")"; fi
  if [ "$SKIP_EXPECT_IW" -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link "$IW_LINKS")"; fi

  # Print errored link totals
  if [ "$LINK_ERRORS" -gt 0 ]; then valPrint ctrh "$LINK_ERRORS link $(pluralCheckNoun error "$LINK_ERRORS") (see RTF or TXT report for specific links):"; fi
  if [ "$SKIP_UNK_NS" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace "$SKIP_UNK_NS")"; fi
  if [ "$SKIP_JS_PAGE" -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link "$SKIP_JS_PAGE") on $(pluralCheckA "$SKIP_JS_PAGE")JavaScript $(pluralCheckNoun page "$SKIP_JS_PAGE")"; fi
  if [ "$SKIP_BAD_URL" -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL "$SKIP_BAD_URL")"; fi
  if [ "$SKIP_NON_ASCII" -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL "$SKIP_NON_ASCII")"; fi
  if [ "$SKIP_UNK_SUFFIX" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix "$SKIP_UNK_SUFFIX")"; fi
  if [ "$SKIP_UNK_CODE" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code "$SKIP_UNK_CODE")"; fi

  # Print checked link totals
  if [ "$LINK_PROBLEMS" -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS link $(pluralCheckNoun issue "$LINK_PROBLEMS"):"; fi
  if [ "$NG_LINKS" -gt 0 ]; then valPrint ctrh "- $NG_LINKS NG $(pluralCheckNoun link "$NG_LINKS")"; fi
  if [ "$RD_LINKS" -gt 0 ]; then valPrint ctrh "- $RD_LINKS $(pluralCheckNoun redirection "$RD_LINKS")"; fi
  if [ "$EI_LINKS" -gt 0 ]; then valPrint ctrh "- $EI_LINKS $(pluralCheckNoun link "$EI_LINKS") that could be intrawiki"; fi
  if [ "$IW_LINKS" -gt 0 ]; then valPrint ctrh "- $IW_LINKS $(pluralCheckNoun link "$IW_LINKS") that could be interwiki"; fi

  # Close the log files' markup
  valPrint trh "ValExtLinks says goodbye."
  printRTFfooter
  printHTMfooter

  # Upload report if this was requested
  if [ -n "$UPLOAD_INFO" ] && [ "$FINISHED_LIST" != "no" ]; then
    uploadReport
  fi

  # Really quit now
  valPrint c "ValExtLinks says goodbye."
  exit 0
}
trap wrapupAndExit INT
540
541
### INITIALIZATION ###
# Announce the script on the console, in bold where the terminal supports it; the title is not sent
# through 'fmt' because of the terminal control sequences around it
BOLD_ON=$(tput bold)
BOLD_OFF=$(tput sgr0)
valPrint cw "${BOLD_ON}--Validate External Links--${BOLD_OFF}"
unset BOLD_ON BOLD_OFF

# Begin each of the three log files with its header
printTXTheader
printRTFheader
printHTMheader
548
## DATA SOURCING ##
valPrint t "Startup:"
valPrint r "\b1 Startup \b0"
valPrint hn "<h3>Startup</h3>"

# Attempt to download file at LINKS_URL, then check that it succeeded. '--fail' makes 'curl' return
# an error for an HTTP error response instead of saving the server's error page as the links file.
valPrint cwtrhn "Downloading list of external links from $LINKS_URL..."
LINKS_FILE_NAME=${LINKS_URL##*/} # everything after the last slash
LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
if ! curl --silent --fail -o "$LINKS_FILE" "$LINKS_URL" || [ ! -f "$LINKS_FILE" ]; then
  printf '\n%s\n' "The download of $LINKS_URL appears to have failed. Aborting."
  wrapupAndExit
else
  valPrint ctrh " success."
fi

# Attempt to download file at EXCEPT_URL, then check that it succeeded
if [ -n "$EXCEPT_URL" ]; then
  valPrint cwtrhn "Downloading list of reporting exceptions from $EXCEPT_URL..."
  EXCEPT_DATA=$(curl --silent --fail "$EXCEPT_URL")
  if [ -z "$EXCEPT_DATA" ]; then
    printf '\n%s\n' "The download of the exceptions data from '$EXCEPT_URL' appears to have failed. Aborting."
    wrapupAndExit
  else
    valPrint ctrh " success."
  fi

  # Keep only what lies between the "BEGIN LIST" and "END LIST" markers on the page
  EXCEPT_DATA=${EXCEPT_DATA%END LIST*}
  EXCEPT_DATA=${EXCEPT_DATA#*BEGIN LIST}
  EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"

  # Store on disk for debugging purposes
  printf '%s\n' "$EXCEPT_DATA" > "$EXCEPT_FILE"

  # Transfer to array for easy searching later: one element per non-empty line. The lines are read
  # one at a time so that characters such as '?' and '*' in them are never treated as file name
  # patterns.
  EXCEPT_ARRAY=()
  while IFS= read -r EXCEPT_LINE; do
    if [ -n "$EXCEPT_LINE" ]; then
      EXCEPT_ARRAY+=("$EXCEPT_LINE")
    fi
  done <<< "$EXCEPT_DATA"
  unset EXCEPT_LINE
fi

# Feed LINKS_FILE to 'wc' on stdin because passing it as an argument would print the file name after
# the line count
LINK_COUNT_STRING=$(wc -l < "$LINKS_FILE")

# Number of URLs is number of lines minus one (first line is column header row for the CSV)
LINK_COUNT=$(printf '%s' "$LINK_COUNT_STRING" | tr -d '[:space:]')
LINK_COUNT=$((LINK_COUNT-1))
valPrint ctrh "Found $LINK_COUNT $(pluralCheckNoun link "$LINK_COUNT") to process."
valPrint trh ""
[1064]595
## CONFIG OUTPUT ##
valPrint t "Config:"
valPrint r "\b1 Config \b0"
valPrint hn "<h3>Config</h3>"

# Number of links that will be looked at, plus the range when only part of the list was requested
valPrint ctrhn "Links to consider: "
if [ "$URL_LIMIT" -ne 0 ] && [ "$URL_LIMIT" -lt "$LINK_COUNT" ]; then
  valPrint ctrh "$((URL_LIMIT-URL_START+1)), from link $URL_START to $URL_LIMIT"
elif [ "$URL_START" -ne 1 ]; then
  valPrint ctrh "$((LINK_COUNT-URL_START+1)), from link $URL_START to $LINK_COUNT"
else
  valPrint ctrh "$LINK_COUNT"
fi

valPrint ctrh "Site query timeout: $TIMEOUT seconds"

# Each on/off setting is reported as a label followed by "Yes" or "No". The first three labels
# state the setting directly.
valPrint ctrhn "Show OK links: "
case "$RECORD_OK_LINKS" in
  1 ) valPrint ctrh "Yes";;
  * ) valPrint ctrh "No";;
esac

valPrint ctrhn "Take screenshots: "
case "$TAKE_PAGE_SHOT" in
  1 ) valPrint ctrh "Yes";;
  * ) valPrint ctrh "No";;
esac

valPrint ctrhn "Suggest Archive.org snapshots: "
case "$SUGGEST_SNAPSHOTS" in
  1 ) valPrint ctrh "Yes";;
  * ) valPrint ctrh "No";;
esac

# The remaining labels are phrased as the opposite of their setting, so a setting of 1 prints "No"
valPrint ctrhn "Ignore slash-adding redirects: "
case "$SHOW_SLASH" in
  1 ) valPrint ctrh "No";;
  * ) valPrint ctrh "Yes";;
esac

valPrint ctrhn "Ignore HTTPS-upgrading redirects: "
case "$SHOW_HTTPS" in
  1 ) valPrint ctrh "No";;
  * ) valPrint ctrh "Yes";;
esac

valPrint ctrhn "Ignore youtu.be redirects: "
case "$SHOW_YT_RD" in
  1 ) valPrint ctrh "No";;
  * ) valPrint ctrh "Yes";;
esac

valPrint ctrhn "Check archive.org links: "
case "$SKIP_ARCHIVE_LINKS" in
  1 ) valPrint ctrh "No";;
  * ) valPrint ctrh "Yes";;
esac

# Point the reader to the summary; only the HTML log can link to it
valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
valPrint trh ""
636
## LEGEND OUTPUT ##
valPrint t "Legend:"
valPrint r "\b1 Legend \b0"
valPrint hn "<h3>Legend</h3>"

# Pointer to the guidance page; the TXT log spells the URL out, the RTF and HTML logs link to it
valPrint t "(For guidance in fixing these links, see $WIKI_MAIN.)"
valPrint r "(For guidance in fixing these links, see {\field{\*\fldinst{HYPERLINK \"$WIKI_MAIN\"}}{\fldrslt here}}.)"
valPrint h "(For guidance in fixing these links, see <a href=\"$WIKI_MAIN\" target=\"_blank\">here</a>.)"

# The two-letter result codes read the same in every log format
for LEGEND_ENTRY in \
  "OK = URL seems to be working" \
  "NG = URL no longer seems to work" \
  "RD = URL is redirecting to this new URL" \
  "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup" \
  "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup"
do
  valPrint trh "$LEGEND_ENTRY"
done

# The two kinds of numeric code share their wording across the formats; only the way the reference
# page is cited differs
LEGEND_ENTRY="(xxx) = Unix tool 'curl' obtained this HTTP response status code"
valPrint t "$LEGEND_ENTRY (see here for code reference: $WIKI_HTTP)"
valPrint r "$LEGEND_ENTRY (see {\field{\*\fldinst{HYPERLINK \"$WIKI_HTTP\"}}{\fldrslt here}} for code reference)"
valPrint h "$LEGEND_ENTRY (see <a href=\"$WIKI_HTTP\" target=\"_blank\">here</a> for code reference)"
LEGEND_ENTRY="(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code"
valPrint t "$LEGEND_ENTRY (see here for code reference: $WIKI_CURL)"
valPrint r "$LEGEND_ENTRY (see {\field{\*\fldinst{HYPERLINK \"$WIKI_CURL\"}}{\fldrslt here}} for code reference)"
valPrint h "$LEGEND_ENTRY (see <a href=\"$WIKI_CURL\" target=\"_blank\">here</a> for code reference)"
unset LEGEND_ENTRY

valPrint trh "IA suggests = Last available snapshot returned by the Internet Archive"
valPrint trh "Try browsing = The Archive failed to return a snapshot URL, so check for a snapshot manually using this link"
valPrint trh ""


### MAIN LOOP ###
# Head the section of the logs that lists the links, then note when link checking began so that the
# summary can report the elapsed time
valPrint t "Links:"
valPrint r "\b1 Links \b0"
valPrint hn "<h3>Links</h3>"
START_RUN=$(date "+%s")
[1064]665# Process each line of the .csv in LINKS_FILE
666for LINE in `cat "$LINKS_FILE"`; do
667 let LINK_NUM+=1
668
669 # First line is the column header row for the CSV, so let's verify that the format hasn't changed
670 if [ $SKIPPED_HEADER_ROW -eq 0 ]; then
671 if [ $LINE == "namespace,title,target" ]; then
672 SKIPPED_HEADER_ROW=1
673 LINK_NUM=0 # this line is it's not a link, so reset the link counter
674 valPrint hn "<table>"
675 continue
676 else
677 valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
678 wrapupAndExit
679 fi
680 fi
681
682 # Skip this link if we are not at URL_START yet
683 if [ $LINK_NUM -lt $URL_START ]; then
684 continue
685 fi
686
687 # Stop if we are at the limit declared for testing purposes
688 if [ $URL_LIMIT -gt 0 ] && [ $LINK_NUM -gt $URL_LIMIT ]; then
689 FINISHED_LIST="limit"
690 wrapupAndExit
691 fi
692
693 # Print progress to screen
694 if [ $LINK_NUM -gt 1 ]; then
695 printf "\e[1A\n" # erase previous progress message so that new one appears in its place
696 fi
697 valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
698
699 # The number of the namespace is the element before the first comma on the line
700 NS_ID=${LINE%%,*}
701
702 # Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
703 NS_NAME=""
704 a=0
[1069]705 while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
[1118]706 if [ $NS_ID == "NULL" ]; then
707 break
708 elif [ $NS_ID -eq ${NS_IDS[$a]} ]; then
[1064]709 NS_NAME="${NS_NAMES[$a]}"
710 break
711 fi
712 let a+=1
713 done
[1118]714 if [ "$NS_NAME" == "" ]; then
715 if [ $NS_ID == "NULL" ]; then
[1123]716 valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace (and probably the page too) is 'NULL'. Probably the link is no longer in existence on the wiki."
[1118]717 else
[1123]718 valPrint trs "Skipping line $LINK_NUM ('$LINE') because I could not find a name for namespace ID $NS_ID."
[1118]719 fi
[1064]720 let SKIP_UNK_NS+=1
721 continue
722 fi
723
724 # The name of the page is everything between the namespace ID and the next comma on the line (commas
725 # in page names will break this)
726 PAGE_NAME=${LINE#$NS_ID,}
727 PAGE_NAME=${PAGE_NAME%%,*}
728
[1135]729 # We don't want to consider wiki pages ending in .js, as the MW parser cannot reliably isolate URLS
730 # in JavaScript code, so it returns erroneous links
[1064]731 PAGE_NAME_SUFFIX=$(echo $PAGE_NAME | sed 's/.*\.//')
732 if [ $PAGE_NAME_SUFFIX == "js" ]; then
[1123]733 valPrint trs "Skipping URL '${LINE#$NS_ID,$PAGE_NAME,}' on line $LINK_NUM because it was found on JavaScript page '$PAGE_NAME'."
[1064]734 let SKIP_JS_PAGE+=1
735 continue
736 fi
737
[1070]738 # Build longer wiki page URLs from namespace and page names
[1122]739 FULL_PAGE_PATH=https://$WIKI_PATH/$NS_NAME:$PAGE_NAME
[1070]740 LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
741 # Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it
742 # explicitly breaks the link
743 if [ $NS_ID -eq 0 ]; then
[1122]744 FULL_PAGE_PATH=https://$WIKI_PATH/$PAGE_NAME
[1070]745 LOCAL_PAGE_PATH=$PAGE_NAME
746 fi
747
[1064]748 # The URL being linked to is everything after the previous two fields (this allows commas to be in
749 # the URLs, but a comma in the previous field, the page name, will break this)
750 URL=${LINE#$NS_ID,$PAGE_NAME,}
751
752 # Scan for illegal characters
753 if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
[1123]754 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because it contains characters illegal in a URL."
[1064]755 let SKIP_BAD_URL+=1
756 continue
757 fi
758
[1135]759 # If we're skipping Archive.org links, check if this is one
760 if [ $SKIP_ARCHIVE_LINKS -eq 1 ] && [[ $URL == *web.archive.org* ]]; then
761 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have been asked not to check Wayback Machine links."
762 let SKIP_ARCHIVE_ORG+=1
763 continue
764 fi
765
[1064]766 # Now we need to know if the URL is for a file or a web page. First step is to determine if the
767 # URL ends in a suffix
768 HAS_SUFFIX=0
769
770 # If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
[1070]771 CLEAN_URL=${URL%%\?*}
[1064]772
773 # If the URL ends in something like "#section_15", strip everything from the '#' onward
[1070]774 CLEAN_URL=${CLEAN_URL%%\#*}
[1064]775
[1135]776 # 'sed' cannot handle Unicode in my Bash shell, so skip non-ASCII URL and make user check it
[1070]777 if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
[1123]778 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I cannot handle non-ASCII characters."
[1064]779 let SKIP_NON_ASCII+=1
780 continue
781 fi
782
783 # Isolate the characters after the last period and after the last slash
[1070]784 POST_DOT=$(echo "$CLEAN_URL" | sed 's/.*\.//')
785 POST_SLASH=$(echo "$CLEAN_URL" | sed 's/.*\///')
[1064]786
787 # If the last period comes after the last slash, then the URL ends in a suffix
788 POST_DOT_LENGTH=$(echo | awk -v input=$POST_DOT '{print length(input)}')
789 POST_SLASH_LENGTH=$(echo | awk -v input=$POST_SLASH '{print length(input)}')
790 if [ $POST_DOT_LENGTH -lt $POST_SLASH_LENGTH ]; then
791 HAS_SUFFIX=1
792 else
793 HAS_SUFFIX=0
794 fi
795
796 # Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
797 # known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES
798 IS_FILE=-1
799 if [ $HAS_SUFFIX -eq 0 ]; then
800 IS_FILE=0
801 else
802 # Turn off case sensitivity while we compare suffixes
803 shopt -s nocasematch
804
[1127]805 # Special case: URLs ending in something like "productID.304297400" are pages, not files, so if
[1064]806 # the URL's suffix is all numbers, we are looking at the end of a web page URL
807 if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
808 IS_FILE=0
809 fi
[1127]810
811 # Special case: URLs ending in a parens, e.g. "ms537113(v=vs.85)", are pages
812 if [[ $POST_DOT =~ ^.*[\(\)]$ ]]; then
813 IS_FILE=0
814 fi
815
816 # Special case: URLs containing a '%', e.g. "10.1007%2FBF00329055", are pages
817 if [[ $POST_DOT == *%* ]]; then
818 IS_FILE=0
819 fi
[1064]820
821 # If we did not identify this URL as a web page above, we need to compare the suffix against known
822 # file extensions
823 if [ $IS_FILE -eq -1 ]; then
824 for EXTENSION in "${HTTP_FILES[@]}"; do
825 if [[ $EXTENSION == $POST_DOT ]]; then # double brackets respect the nocasematch setting
826 IS_FILE=1
827 break
828 fi
829 done
830 fi
831
832 # If we did not identify this URL as a file above, we need to compare the suffix against known
833 # pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
834 # needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
835 if [ $IS_FILE -eq -1 ]; then
836 for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
837 if [[ $PAGE_OR_TLD == $POST_DOT ]]; then
838 IS_FILE=0
839 break
840 fi
841 done
842 fi
843
844 # Turn case sensitivity back on in Bash
845 shopt -u nocasematch
846 fi
847
848 # If this suffix escaped identification as either a file, page or TLD, inform the user
849 STR_TYPE=""
850 if [ $IS_FILE -eq -1 ]; then
[1123]851 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown URL ending '$POST_DOT'. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
[1064]852 let SKIP_UNK_SUFFIX+=1
853 continue
854 elif [ $IS_FILE -eq 1 ]; then
855 STR_TYPE="file"
856 let FILE_LINKS+=1
857 elif [ $IS_FILE -eq 0 ]; then
858 STR_TYPE="page"
859 let PAGE_LINKS+=1
860 fi
861
862 # Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
863 # issue with sites that require HTTPS
[1141]864 CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent '$AGENT' --max-time $TIMEOUT --write-out '%{http_code}\n' $URL)
[1064]865 CURL_ERR=$(echo $?)
866 CURL_RESULT=$CURL_CODE
867
868 # Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
869 if [ $CURL_CODE == "000" ]; then
870 CURL_RESULT="$CURL_RESULT-$CURL_ERR"
871 fi
872
[1070]873 # Begin to determine our status code for this URL (EI, IW, OK, RD, or NG)
[1064]874 STATUS="??"
[1067]875 NEW_URL=""
[1064]876 INTERWIKI_INDEX=-1
877
[1070]878 # First make sure that this isn't an "external internal" link to our own wiki that can be replaced
879 # by "[[page_name]]". If it uses a special access URL beginning with "/w/", let it pass, as it
880 # probably cannot be replaced by "[[ ]]" markup
881 if [[ $URL == *$WIKI_PATH* ]] && [[ $URL != *$WIKI_PATH/w/* ]]; then
882 STATUS="EI"
883 let EI_LINKS+=1
884 fi
885
886 # If it's not, check if this is a link to a domain that we have an interwiki prefix for
887 if [ $STATUS == "??" ]; then
888 for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
889 if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]] && [[ $URL != *${INTERWIKI_DOMAINS[$i]}/w/* ]]; then
890 STATUS="IW"
891 let IW_LINKS+=1
892 INTERWIKI_INDEX=$i
893 break
894 fi
895 done
896 fi
897
[1069]898 # If we didn't match an interwiki domain, see if the status code is in our "OK" codes list
899 if [ $STATUS == "??" ]; then
900 for CODE in "${OK_CODES[@]}"; do
901 if [[ $CODE == $CURL_CODE ]]; then
902 STATUS="OK"
903 let OK_LINKS+=1
904 break
905 fi
906 done
907 fi
908
[1067]909 # If we didn't get a match with the "OK" codes, check it against the "RD" codes
[1064]910 if [ $STATUS == "??" ]; then
[1067]911 for CODE in "${RD_CODES[@]}"; do
912 if [[ $CODE == $CURL_CODE ]]; then
913 # Get URL header again in order to retrieve the URL we are being redirected to
[1141]914 NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent '$AGENT' --max-time $TIMEOUT --write-out '%{redirect_url}\n' $URL)
[1067]915
[1122]916 # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
917 # those changes out if the user didn't ask for them
918 URL_HTTP=$(echo $URL | sed -E 's/^https:/http:/')
919 NEW_URL_HTTP=$(echo $NEW_URL | sed -E 's/^https:/http:/')
[1070]920
921 # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
[1122]922 NEW_URL_LENGTH=$(echo | awk -v input=$NEW_URL_HTTP '{print length(input)}')
[1070]923 if [ $NEW_URL_LENGTH -lt $MIN_URL_LENGTH ]; then
[1122]924 NEW_URL_HTTP="[new URL not retrieved]"
[1070]925 fi
926
[1122]927 # Remove slash at end of new URL, if present, so we can filter out the redirects that
928 # merely add an ending slash if the user didn't ask for them
929 NEW_URL_NO_SLASH=$(echo $NEW_URL_HTTP | sed -E 's:/$::')
930
[1127]931 # Detect if this is a youtu.be link simply being expanded by YouTube to the full
932 # youtube.com address
933 YOUTU_BE=0
934 if [[ $URL_HTTP == http://youtu.be* ]] && [[ $NEW_URL_HTTP == http://www.youtube.com* ]]; then
935 YOUTU_BE=1
936 fi
937
[1122]938 # If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
939 # wants those to be reported)
940 if [ $SHOW_HTTPS -eq 0 ] && [ $URL_HTTP == $NEW_URL_HTTP ]; then
[1123]941 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show http->https upgrades, and I was redirected to '$NEW_URL'."
[1069]942 STATUS="OK"
943 let OK_LINKS+=1
[1122]944 let SKIP_HTTPS_UP+=1
945 # If the URLs match besides an added ending slash, then the link is OK (unless user wants
946 # those to be reported)
947 elif [ $SHOW_SLASH -eq 0 ] && [ $URL_HTTP == $NEW_URL_NO_SLASH ]; then
[1123]948 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show added trailing slashes, and I was redirected to '$NEW_URL'."
[1122]949 STATUS="OK"
950 let OK_LINKS+=1
951 let SKIP_SLASH_ADD+=1
[1127]952 elif [ $SHOW_YT_RD -eq 0 ] && [ $YOUTU_BE -eq 1 ]; then
953 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'."
954 STATUS="OK"
955 let OK_LINKS+=1
956 let SKIP_YOUTU_BE+=1
[1069]957 else
958 STATUS="RD"
959 let RD_LINKS+=1
960 fi
[1067]961 break
962 fi
963 done
964 fi
965
966 # If we didn't get a match with the "RD" codes, check it against the "NG" codes
967 if [ $STATUS == "??" ]; then
[1064]968 for CODE in "${NG_CODES[@]}"; do
969 if [[ $CODE == $CURL_CODE ]]; then
970 STATUS="NG"
971 let NG_LINKS+=1
972 break
973 fi
974 done
975 fi
976
977 # If we didn't match a known status code, advise the reader
978 if [ $STATUS == "??" ]; then
[1127]979 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown response code $CURL_CODE."
[1064]980 let SKIP_UNK_CODE+=1
981 continue
982 fi
983
[1136]984 # Check problem links against exceptions list before proceeding
985 FOUND_EXCEPT=0
[1070]986 if [ $STATUS != "OK" ] && [ ! -z $EXCEPT_URL ]; then
987 # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
988 EXPECT_CODE="$CURL_RESULT"
989 if [ $STATUS == "EI" ]; then
990 EXPECT_CODE="EI"
991 elif [ $STATUS == "IW" ]; then
992 EXPECT_CODE="IW"
993 fi
994
[1136]995 # Look for link in exceptions list and make sure the listed result code and wiki page also match
996 for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
997 {
998 EXCEPT_LINE="${EXCEPT_ARRAY[$i]}"
999
1000 # Match URL
1001 EXCEPT_URL="${EXCEPT_LINE#*,}"
1002 EXCEPT_URL="${EXCEPT_URL%,*}"
1003 if [ "$EXCEPT_URL" != "$URL" ]; then
[1070]1004 continue
1005 fi
[1136]1006
1007 # Match containing page's name
1008 EXCEPT_PAGE="${EXCEPT_LINE##*,}"
1009 EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
1010 if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == $LOCAL_PAGE_PATH ]; then
1011 # Match result code
1012 EXCEPT_CODE=${EXCEPT_LINE%%,*}
1013 if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
1014 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because its expected result, '$EXPECT_CODE', is in the exceptions list."
1015 if [ $STATUS == "EI" ]; then
1016 let SKIP_EXPECT_EI+=1
1017 elif [ $STATUS == "IW" ]; then
1018 let SKIP_EXPECT_IW+=1
1019 else
1020 let SKIP_EXPECT_NG+=1
1021 fi
1022 FOUND_EXCEPT=1
1023 break
1024 fi
1025 fi
1026 } done
[1064]1027 fi
[1136]1028 if [ $FOUND_EXCEPT -eq 1 ]; then
1029 continue
1030 fi
[1064]1031
1032 # If appropriate, record this link to the log, with clickable URLs when possible
1033 if [ $STATUS != "OK" ] || [ $RECORD_OK_LINKS -eq 1 ]; then
[1125]1034 # Prepare 'curl' result in parentheses to print after status code, unless this is an "IW" or "EI"
1035 # link, in which case showing the status code doesn't make sense. Adjust spacing after string to
1036 # ensure TXT and RTF reports have aligned columns of results.
1037 CURL_STR_H=" ($CURL_RESULT)"
1038 CURL_STR_T="$CURL_STR_H"
1039 CURL_STR_R="$CURL_STR_H "
[1070]1040 if [ $STATUS == "IW" ] || [ $STATUS == "EI" ]; then
[1125]1041 CURL_STR_H=""
1042 CURL_STR_T=" "
1043 CURL_STR_R=" "
[1064]1044 fi
1045
1046 # Record link and its wiki page in TXT, RTF, and HTML markup
[1125]1047 valPrint t "${STATUS}${CURL_STR_T} $STR_TYPE $URL"
[1064]1048 valPrint t " linked from $FULL_PAGE_PATH"
[1125]1049 valPrint r "${STATUS}${CURL_STR_R}${RTF_TABS}$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
[1064]1050 valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
[1125]1051 valPrint hn "<tr><td style=\"white-space:nowrap\">${STATUS}${CURL_STR_H}</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
[1064]1052 valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"
1053
[1123]1054 # Place vertical space here since we won't be printing anything more about this link
[1125]1055 if [ $STATUS == "OK" ]; then valPrint tr ""; valPrint hs ""; fi
[1123]1056
[1067]1057 # Record redirect URL if one was given by a 3xx response page
1058 if [ $STATUS == "RD" ]; then
[1119]1059 valPrint ts " Server suggests $NEW_URL"
1060 valPrint rs " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
1061 valPrint hs "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
[1067]1062 fi
1063
[1070]1064 # Notify reader if we can use an intrawiki link for this URL
1065 if [ $STATUS == "EI" ]; then
[1075]1066 INTRA_PAGE=${URL#*://*/}
[1119]1067 valPrint ts " Just use [[$INTRA_PAGE]]"
1068 valPrint rs " Just use [[$INTRA_PAGE]]"
1069 valPrint hs "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
[1070]1070 fi
1071
[1064]1072 # Notify reader if we can use an interwiki prefix for this URL
1073 if [ $STATUS == "IW" ]; then
[1075]1074 INTER_PAGE=$(echo "$URL" | sed 's/.*\///')
[1119]1075 valPrint ts " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
1076 valPrint rs " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
1077 valPrint hs "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
[1064]1078 fi
1079
1080 # Query Internet Archive for latest "OK" snapshot for "NG" page
1081 if [ $STATUS == "NG" ] && [ $SUGGEST_SNAPSHOTS -eq 1 ]; then
[1141]1082 ARCHIVE_QUERY=$(curl --silent --max-time $TIMEOUT "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")
[1064]1083
[1118]1084 # If a "closest" snapshot was received...
[1066]1085 if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
[1118]1086 # In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
1087 ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')
1088
1089 # ...isolate "url" property in the response that follows the "closest" tag
1090 SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
[1119]1091 SNAPSHOT_URL=${SNAPSHOT_URL#*\"url\": \"} # everything after '"url": "'
[1118]1092 SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'
1093
[1124]1094 # Remove the port 80 part that IA often adds to the URL, as it's superfluous
1095 SNAPSHOT_URL=$(echo $SNAPSHOT_URL | sed 's/:80//')
1096
[1118]1097 # Inform the user of the snapshot URL
[1119]1098 valPrint ts " IA suggests $SNAPSHOT_URL"
1099 valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
1100 valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
[1064]1101 else # ...otherwise give generic Wayback Machine link for this URL
[1119]1102 valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
1103 valPrint rs " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
1104 valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
[1064]1105 fi
1106 fi
1107 fi
1108
1109 # If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
1110 if [ $IS_FILE -eq 0 ] && [ $TAKE_PAGE_SHOT -eq 1 ] && [ $STATUS == "OK" ]; then
1111 # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
1112 SHOT_NAME=$(echo "$URL" | sed 's/https*\:\/\///' | sed 'y/:\//__/')
1113 SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"
1114
1115 # Don't take screenshot if we already encountered this page and screenshotted it
1116 if [ ! -f "$SHOT_FILE" ]; then
[1070]1117 "$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 $URL > /dev/null 2>&1
[1064]1118 if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
1119 mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
1120 else
[1119]1121 valPrint trhs "Screenshot of URL $URL seems to have failed!"
[1064]1122 fi
1123 else
[1123]1124 valPrint trhs "Skipping screenshot of URL '$URL' because file '$SHOT_FILE' already exists."
[1064]1125 fi
1126 fi
1127done
1128FINISHED_LIST="yes"
1129wrapupAndExit
Note: See TracBrowser for help on using the repository browser.