source: Validate External Links/validate_external_links.sh@1169

Last change on this file since 1169 was 1160, checked in by iritscen, 3 years ago

ValExtLinks: Added some entries to the lists of known file and page suffixes.

1#!/bin/bash
2
3# Validate External Links by Iritscen
4#
5# Validates a list of external links in CSV format. The resulting logs are produced in three formats:
6# - TXT (for easy diffing with an earlier log)
7# - RTF (for reading as a local file with clickable links)
8# - HTML (for reading as a web page)
9# Call script with "--help" argument for documentation. Also see Read Me.rtf for critical notes.
10#
11# Recommended rule:
12# |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|
13#
14# Table of contents (sections of script in order of appearance, not execution):
15# • Globals
16# • Help Output
17# • Setup
18# • Utility Functions
19# • Summary Output
20# • Initialization
21# • Data Sourcing
22# • Config Output
23# • Legend Output
24# • Main Loop
25
26# Set separator token to newline
27IFS="
28"
29
30### GLOBALS ###
31# Settings -- these will be changed from their defaults by the arguments passed in to the script
32LINKS_URL="" # use 'curl' to download file with links from this location (can be file://)
33EXCEPT_URL="" # 'curl' will access this wiki page with a list of exceptions for NG results
34OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
35RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES
36SHOW_SLASH=0 # record issue when a slash is added to the end of a URL
37SHOW_HTTPS=0 # record issue when "http" is upgraded to "https"
38SHOW_YT_RD=0 # record redirection for a youtu.be URL expanding to the full URL
39SUGGEST_SNAPSHOTS_NG=0 # query the Internet Archive for a possible snapshot URL for each NG page
40SUGGEST_SNAPSHOTS_OK=0 # query the Internet Archive for an existing snapshot of each OK page
41CHECK_ARCHIVE_LINKS=0 # check URLs on archive.org and archive.is
42TAKE_PAGE_SHOT=0 # take a screenshot of each OK page
43TIMEOUT=10 # time to wait for a response when querying a site
44CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature
45URL_START=1 # start at this URL in LINKS_FILE
46URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE
47UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report
48
49# Fixed strings -- see the occurrences of these variables to learn their purpose
50AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/88.0.4324.146 Safari/537.36"
51ARCHIVE_API="http://archive.org/wayback/available"
52ARCHIVE_GENERIC="https://web.archive.org/web/*"
53ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
54CHROME_SCREENSHOT="screenshot.png"
55EXCEPT_FILE_NAME="exceptions.txt"
56EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
57WIKI_CURL="https://wiki.oni2.net/Validate_External_Links/Curl_codes"
58WIKI_HTTP="https://wiki.oni2.net/Validate_External_Links/HTTP_codes"
59WIKI_MAIN="https://wiki.oni2.net/Validate_External_Links"
60WIKI_ME="http://iritscen.oni2.net"
61THIS_DIR=$(cd "$(dirname "$0")"; pwd) # quoted in case the script's path contains spaces
62WORKING_DIR=$(pwd)
63WIKI_PATH="wiki.oni2.net"
64
65# These are parallel arrays of the IDs and names of OniGalore's current namespaces
66declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
67declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")
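# For example, element 12 of each array pairs namespace ID 10 with the name "Template",
# so a CSV line starting with "10," is treated as a link found on a "Template:" page.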
68
69# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
70# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
71declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png psd py rar tga TRMA txt vbs wav wmv xaf xcf xml zip)
72declare -a HTTP_TLDS_AND_PAGES=(abstract action ars asp aspx cfm cgi com css de do full htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x)
73
74# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
75# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
76# if you add a new code.
77declare -a OK_CODES=(200 401 405 406 418 501)
78declare -a RD_CODES=(301 302 303 307 308)
79declare -a NG_CODES=(000 400 403 404 410 429 500 502 503 504 530)
80
81# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
82# transcluded text, and if the transclusion fails, then the braces show up in the URL
83ILLEGAL_CHARS="{ }"
84
85# The shortest URL possible, used for sanity-checking some URLs: http://a.co
86MIN_URL_LENGTH=11
87
88# These are parallel arrays giving the prefixes that can be used in place of normal external links to
89# some wikis and other sites; based on https://wiki.oni2.net/Special:Interwiki
90declare -a INTERWIKI_PREFIXES=(commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies wikiversity wikivoyage wikt wp)
91declare -a INTERWIKI_DOMAINS=(commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org wikiversity.org wikivoyage.org wiktionary.org wikipedia.org)
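# For example, the prefix "wp" pairs with "wikipedia.org", so an external link to a
# wikipedia.org article can be replaced with interwiki markup such as [[wp:Article_name]]
# (the article name here is just a placeholder).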
92
93# Variables for keeping track of main loop progress and findings
94LINK_NUM=0
95EI_LINKS=0
96IW_LINKS=0
97OK_LINKS=0
98RD_LINKS=0
99NG_LINKS=0
100SKIP_UNK_NS=0
101SKIP_JS_PAGE=0
102SKIP_BAD_URL=0
103SKIP_NON_ASCII=0
104SKIP_UNK_SUFFIX=0
105SKIP_UNK_CODE=0
106SKIP_EXPECT_NG=0
107SKIP_EXPECT_RD=0
108SKIP_EXPECT_EI=0
109SKIP_EXPECT_IW=0
110SKIP_HTTPS_UP=0
111SKIP_SLASH_ADD=0
112SKIP_YOUTU_BE=0
113SKIP_ARCHIVES=0
114FILE_LINKS=0
115PAGE_LINKS=0
116SKIPPED_HEADER_ROW=0
117FINISHED_LIST="no"
118START_RUN=0
119END_RUN=0
120
121
122### HELP OUTPUT ###
123# A pseudo-man page. Here is the 80-character rule for the page text:
124# 234567890123456789012345678901234567890123456789012345678901234567890123456789
125function printHelp()
126{
127 cat << EOF
128
129NAME
130 Validate External Links
131
132SYNOPSIS
133 validate_external_links.sh --help
134 validate_external_links.sh --links URL --output DIR [--exceptions URL]
135 [--record-ok-links] [--show-added-slashes] [--show-https-upgrades]
136 [--show-yt-redirects] [--suggest-snapshots-ng] [--suggest-snapshots-ok]
137 [--check-archive-links] [--take-screenshots FILE] [--timeout NUM]
138 [--start-url NUM] [--end-url NUM] [--upload FILE]
139
140DESCRIPTION
141 This script parses a list of external links found in the OniGalore wiki
142 (which is dumped by the Oni2.net server periodically in a particular
143 format), validates them using the Unix tool 'curl', and produces a report
144 of which links were "OK" (responded positively to an HTTP query), which
145 were "RD" (responded with a 3xx redirect code), which could be "IW"
146 (interwiki) links, which are "EI" (external internal) links and could be
147 intrawiki links, and which were "NG" (no good; a negative response to the
148 query). This report can then be automatically uploaded to the location of
149 your choice. The script can also suggest Internet Archive snapshots for
150 "NG" links, and take screenshots of "OK" links for visual verification by
151 the reader that the page in question is the one intended to be displayed.
152
153 You must pass this script the URL at which the list of links is found
154 (--links) and the path where the directory of logs should be created
155 (--output). All other arguments are optional.
156
157OPTIONS
158 --help Show this page.
159 --links URL (required) URL from which to download the CSV
160 file with external links. Note that this URL can
161 be a local file if you supply a file:// path.
162 --output DIR (required) Unix path to directory in which Val
163 should place its reports.
164 --exceptions URL In order to remove links from the report which
165 Val finds an issue with but which you regard as
166 OK, list those desired exceptions on a wiki page.
167 See the sample file "exceptions.pdf" for the
168 required format of the page. Note that this URL
169 can point to a local file if you supply a path
170 beginning with "file://".
171 --record-ok-links Log a link in the report even if its response
172 code is "OK".
173 --show-added-slashes Report on redirects that simply add a '/' to the
174 end of the URL.
175 --show-https-upgrades Report on redirects that simply upgrade a
176 "http://" URL to a "https://" URL.
177 --show-yt-redirects Report on redirects that expand a youtu.be URL.
178 --suggest-snapshots-ng Query the Internet Archive for a possible
179 snapshot URL for each "NG" page.
180 --suggest-snapshots-ok Query the Internet Archive for a snapshot of each
181 "OK" page just to make sure it's available. Note
182 that this will add a tremendous amount of time to
183 the script execution because there is a rate
184 limit to the Archive API. Note that this option
185 does nothing unless you also use the
186 --record-ok-links argument.
187 --check-archive-links Check links that are already pointing to a page
188 on the Internet Archive or archive.is (AKA
189 archive.today). In theory these links should be
190 totally stable and not need validation.
191 --take-screenshots FILE Call the Google Chrome binary at this path to
192 take screenshots of each "OK" page.
193 --timeout NUM Wait this many seconds for a site to respond. The
194 default is 10. Important note: Val will attempt
195 to reach each URL three times, so the time taken
196 to ping an unresponsive site will be three times
197 this setting.
198 --start-url NUM Start at this link in the links CSV file.
199 --end-url NUM Stop at this link in the links CSV file.
200 --upload FILE Upload report using the credentials and path
201 given in this local text file. See sftp_login.txt
202 for template.
203
204BUGS
205 The script cannot properly parse any line in the external links file
206 which contains a comma in the name of the wiki page containing a link.
207 Commas in the link itself are not an issue.
208EOF
209}
210
211
212### SETUP ###
213# If first argument is a help request, or if nothing was passed in at all, print help page and quit
214if [ "$#" -eq 0 ] || [ "$1" == "--help" ]; then
215 printHelp | less
216 exit 0
217fi
218
219# Parse arguments as long as there are more arguments to process
220while (( "$#" )); do
221 case "$1" in
222 --links ) LINKS_URL="$2"; shift 2;;
223 --exceptions ) EXCEPT_URL="$2"; shift 2;;
224 --output ) OUTPUT_DIR="$2"; shift 2;;
225 --record-ok-links ) RECORD_OK_LINKS=1; shift;;
226 --show-added-slashes ) SHOW_SLASH=1; shift;;
227 --show-https-upgrades ) SHOW_HTTPS=1; shift;;
228 --show-yt-redirects ) SHOW_YT_RD=1; shift;;
229 --suggest-snapshots-ng ) SUGGEST_SNAPSHOTS_NG=1; shift;;
230 --suggest-snapshots-ok ) SUGGEST_SNAPSHOTS_OK=1; shift;;
231 --check-archive-links ) CHECK_ARCHIVE_LINKS=1; shift;;
232 --take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
233 --timeout ) TIMEOUT=$2; shift 2;;
234 --start-url ) URL_START=$2; shift 2;;
235 --end-url ) URL_LIMIT=$2; shift 2;;
236 --upload ) UPLOAD_INFO=$2; shift 2;;
237 * ) echo "Invalid argument '$1' detected. Aborting."; exit 1;;
238 esac
239done
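# A minimal sketch of an invocation (paths and values here are placeholders, not real files):
#   ./validate_external_links.sh --links file:///path/to/extlinks.csv \
#     --output /path/to/reports --timeout 15 --suggest-snapshots-ng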
240
241# If the required arguments were not supplied, print help page and quit
242if [ -z "$LINKS_URL" ] || [ -z "$OUTPUT_DIR" ]; then
243 echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
244 exit 2
245fi
246
247# If user wants screenshots, make sure path to Chrome was passed in and is valid
248if [ $TAKE_PAGE_SHOT -eq 1 ]; then
249 if [ ! -f "$CHROME_PATH" ]; then
250 echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
251 exit 3
252 fi
253fi
254
255# Check that UPLOAD_INFO exists, if this argument was supplied
256if [ ! -z $UPLOAD_INFO ] && [ ! -f "$UPLOAD_INFO" ]; then
257 echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
258 exit 4
259fi
260
261# Check that OUTPUT_DIR is a directory
262if [ ! -d "$OUTPUT_DIR" ]; then
263 echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
264 exit 5
265fi
266
267# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
268SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
269NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
270OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
271OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
272SHOT_PATH="$OUTPUT_PATH/Screenshots"
273LOG_NAME="ValExtLinks report"
274LOG_NAME_TXT="$LOG_NAME.txt"
275LOG_NAME_RTF="$LOG_NAME.rtf"
276LOG_NAME_HTM="$LOG_NAME.htm"
277LOG_PATH="$OUTPUT_PATH/$LOG_NAME"
278LOG_PATH_TXT="$LOG_PATH.txt"
279LOG_PATH_RTF="$LOG_PATH.rtf"
280LOG_PATH_HTM="$LOG_PATH.htm"
281mkdir "$OUTPUT_PATH"
282if [ $TAKE_PAGE_SHOT -eq 1 ]; then
283 mkdir "$SHOT_PATH"
284fi
285
286# Check that 'mkdir' succeeded
287if [ ! -d "$OUTPUT_PATH" ]; then
288 echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_DIR. Aborting."
289 exit 6
290fi
291
292# Get date on the file at LINKS_URL and print to log
293LINKS_DATE=$(curl --silent --head $LINKS_URL | grep "Last-Modified")
294if [ -z "$LINKS_DATE" ]; then
295 echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
296 exit 7
297fi
298LINKS_DATE=${LINKS_DATE#Last-Modified: }
299
300
301### UTILITY FUNCTIONS ###
302# Writes a plain-text header to TXT log file
303function printTXTheader()
304{
305 valPrint t "Validate External Links report"
306 valPrint t "generated $NICE_TIME"
307 valPrint t "from data of $LINKS_DATE"
308 valPrint t "script by Iritscen (contact: $WIKI_ME)"
309 valPrint t ""
310}
311
312# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified
313function printRTFheader()
314{
315 valPrint r "{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
316{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
317{\colortbl;\red255\green255\blue255;}
318{\*\expandedcolortbl;;}
319\margl1440\margr1440\vieww12600\viewh12100\viewkind0
320\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0
321
322\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
323generated $NICE_TIME\\
324from data of $LINKS_DATE\\
325script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$WIKI_ME\"}}{\fldrslt contact}})
326\\
327\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
328\cf0 "
329}
330
331# Closes the RTF markup of the RTF log file
332function printRTFfooter()
333{
334 valPrint r "}"
335}
336
337# Writes the HTML header to HTML log file
338function printHTMheader()
339{
340 valPrint h "<html>
341<head>
342<title>Validate External Links report</title>
343</head>
344<body>
345<h2>Validate External Links report</h2>
346<h3>generated $NICE_TIME<br />
347from data of $LINKS_DATE<br />
348script by Iritscen (<a href=\"$WIKI_ME\" target=\"_blank\">contact</a>)</h3>"
349}
350
351# Closes the HTML markup of the HTML log file
352function printHTMfooter()
353{
354 valPrint h "</body>
355</html>"
356}
357
358# The central logging function. The first parameter is a string composed of one or more characters that
359# indicate which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
360# 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 's' means "Print an
361# extra newline at the end." 'w' means "Don't pass console output through 'fmt'" ("fmt" fits the output
362# to an 80-column CLI but can break special formatting and the 'n' option).
363function valPrint()
364{
365 if [[ "$1" == *c* ]]; then
366 if [[ "$1" == *n* ]]; then
367 echo -n "$2"
368 elif [[ "$1" == *w* ]]; then
369 echo "$2"
370 elif [[ "$1" == *s* ]]; then
371 echo -e "$2\n"
372 else
373 echo "$2" | fmt -w 80
374 fi
375 fi
376 if [[ "$1" == *t* ]]; then
377 if [[ "$1" == *n* ]]; then
378 echo -n "$2" >> "$LOG_PATH_TXT"
379 elif [[ "$1" == *s* ]]; then
380 echo -e "$2\n" >> "$LOG_PATH_TXT"
381 else
382 echo "$2" >> "$LOG_PATH_TXT"
383 fi
384 fi
385 if [[ "$1" == *r* ]]; then
386 if [[ "$1" == *n* ]]; then
387 echo "$2" >> "$LOG_PATH_RTF"
388 elif [[ "$1" == *s* ]]; then
389 echo "$2\line\line" >> "$LOG_PATH_RTF"
390 else
391 echo "$2\line" >> "$LOG_PATH_RTF"
392 fi
393 fi
394 if [[ "$1" == *h* ]]; then
395 if [[ "$1" == *s* ]]; then
396 echo "$2<tr><td>&nbsp;</td></tr>" >> "$LOG_PATH_HTM"
397 elif [[ "$1" == *n* ]]; then
398 echo "$2" >> "$LOG_PATH_HTM"
399 else
400 echo "$2<br />" >> "$LOG_PATH_HTM"
401 fi
402 fi
403}
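# For instance, 'valPrint ctrh "text"' writes "text" to the console and to all three logs,
# while 'valPrint hn "<table>"' writes only to the HTML log and skips the "<br />" that
# would otherwise be appended.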
404
405# Pluralize the string in parameter 1 if the number in parameter 2 is not 1
406function pluralCheckNoun()
407{
408 if [ $2 -ne 1 ]; then
409 if [[ $1 =~ x$ ]]; then
410 echo $1es
411 else
412 echo $1s
413 fi
414 else
415 echo $1
416 fi
417}
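# A few sample calls and their output (shown as comments, not executed):
#   pluralCheckNoun link 2 # -> "links"
#   pluralCheckNoun suffix 2 # -> "suffixes" (nouns ending in 'x' get "es")
#   pluralCheckNoun link 1 # -> "link"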
418
419# Output "is" if parameter 1 is 1, otherwise "are"
420function pluralCheckIs()
421{
422 if [ $1 -ne 1 ]; then
423 echo "are"
424 else
425 echo "is"
426 fi
427}
428
429# Output "was" if parameter 1 is 1, otherwise "were"
430function pluralCheckWas()
431{
432 if [ $1 -ne 1 ]; then
433 echo "were"
434 else
435 echo "was"
436 fi
437}
438
439# Output "a " if parameter 1 is 1, otherwise nothing
440function pluralCheckA()
441{
442 if [ $1 -eq 1 ]; then
443 echo "a "
444 fi
445}
446
447# Output "an " if parameter 1 is 1, otherwise nothing
448function pluralCheckAn()
449{
450 if [ $1 -eq 1 ]; then
451 echo "an "
452 fi
453}
454
455# Upload the reports using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
456# reports being saved to disk have already been closed.
457function uploadReport()
458{
459 valPrint c "Uploading reports..."
460
461 SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
462 SFTP_USER_NAME_MARKER="user:"
463 SFTP_PASSWORD_MARKER="pw:"
464 SFTP_PORT_MARKER="port:"
465 SFTP_PATH_MARKER="path:"
466 SFTP_USER_NAME=$(grep $SFTP_USER_NAME_MARKER $UPLOAD_INFO)
467 SFTP_USER_NAME=${SFTP_USER_NAME#$SFTP_USER_NAME_MARKER}
468 SFTP_PASSWORD=$(grep $SFTP_PASSWORD_MARKER $UPLOAD_INFO)
469 SFTP_PASSWORD=${SFTP_PASSWORD#$SFTP_PASSWORD_MARKER}
470 SFTP_PORT=$(grep $SFTP_PORT_MARKER $UPLOAD_INFO)
471 SFTP_PORT=${SFTP_PORT#$SFTP_PORT_MARKER}
472 SFTP_PATH=$(grep $SFTP_PATH_MARKER $UPLOAD_INFO)
473 SFTP_PATH=${SFTP_PATH#$SFTP_PATH_MARKER}
474
475 for SUFFIX in htm rtf txt; do
476 expect "$SCRIPT_PATH" "$LOG_PATH.$SUFFIX" $SFTP_USER_NAME $SFTP_PASSWORD $SFTP_PORT $SFTP_PATH "$LOG_NAME.$SUFFIX"
477
478 if [ "$?" -ne 0 ]; then
479 valPrint c "Error $? occurred when attempting to upload $LOG_NAME.$SUFFIX!"
480 else
481 valPrint c "Report in `echo $SUFFIX | tr '[:lower:]' '[:upper:]'` format was uploaded."
482 fi
483 done
484}
485
486# Prints session summary when script is done
487function wrapupAndExit()
488{
489 # Get off progress line on console, drop down a line from last link in log, and close HTML table
490 valPrint ctr ""
491 valPrint h "</table><br />"
492
493 # If we didn't finish processing the last URL, then the iterator is one too high
494 if [ $FINISHED_LIST != "yes" ]; then
495 let LINK_NUM-=1
496 if [ $FINISHED_LIST == "no" ]; then
497 valPrint ctrh "The session was canceled by the user."
498 fi
499 fi
500
501 # Generate string with elapsed time
502 END_RUN=$(date +%s)
503 ELAPSED=$(echo $(($END_RUN - $START_RUN)) | awk '{printf "%d min. %d sec. elapsed", int($1/60), int($1%60)}')
504
505 # Do some math on results of session
506 LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
507 TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
508 LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
509 LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_RD+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
510 LINK_PROBLEMS_TOTAL=$((NG_LINKS+RD_LINKS+EI_LINKS+IW_LINKS))
511 LINK_PROBLEMS_NG=$((NG_LINKS-SKIP_EXPECT_NG))
512 LINK_PROBLEMS_RD=$((RD_LINKS-SKIP_EXPECT_RD))
513 LINK_PROBLEMS_EI=$((EI_LINKS-SKIP_EXPECT_EI))
514 LINK_PROBLEMS_IW=$((IW_LINKS-SKIP_EXPECT_IW))
515 LINK_PROBLEMS_NET=$((LINK_PROBLEMS_NG+LINK_PROBLEMS_RD+LINK_PROBLEMS_EI+LINK_PROBLEMS_IW))
516
517 # Print something in the Links section if no link issues were printed
518 if [ $LINK_PROBLEMS_NET -eq 0 ]; then
519 valPrint h "<i>No link problems to report! See the <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for a list of links with issues that were not reported.</i>"
520 fi
521 if [ $LINK_PROBLEMS_TOTAL -eq 0 ]; then
522 valPrint t "No link problems to report!"
523 valPrint r "\i1 No link problems to report! \i0"
524 fi
525
526 ## SUMMARY OUTPUT ##
527 valPrint ct "Summary ($ELAPSED):"
528 valPrint r "\b1 Summary \b0 ($ELAPSED)"
529 valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
530 valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT) (there $(pluralCheckWas $FILE_LINKS) $FILE_LINKS file $(pluralCheckNoun link $FILE_LINKS) and $PAGE_LINKS page $(pluralCheckNoun link $PAGE_LINKS))."
531
532 # Print processed link totals
533 if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
534 if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
535 if [ $SKIP_ARCHIVES -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVES archive.org/archive.is $(pluralCheckNoun link $SKIP_ARCHIVES) were not checked"; fi
536 if [ $LINK_PROBLEMS_TOTAL -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_TOTAL processed $(pluralCheckNoun link $LINK_PROBLEMS_TOTAL) had $(pluralCheckAn $LINK_PROBLEMS_TOTAL)$(pluralCheckNoun issue $LINK_PROBLEMS_TOTAL)"; fi
537 if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED link $(pluralCheckNoun issue $LINKS_EXCEPTED) from report)"; valPrint h "&nbsp;&nbsp;(excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
538 if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi
539 if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h "&nbsp;&nbsp;(counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi
540
541 # Print errored link totals
542 if [ $LINK_ERRORS -gt 0 ]; then
543 valPrint c "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"
544 valPrint h "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
545 valPrint rt "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS):"
546 fi
547 if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
548 if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
549 if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
550 if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
551 if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
552 if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
553
554 # Print excepted link totals
555 if [ $LINKS_EXCEPTED -gt 0 ]; then
556 valPrint c "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"
557 valPrint h "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
558 valPrint rt "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted:"
559 fi
560 if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
561 if [ $SKIP_EXPECT_RD -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_RD/$RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
562 if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
563 if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi
564
565 # Print checked link totals
566 if [ $LINK_PROBLEMS_NET -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS_NET link $(pluralCheckNoun issue $LINK_PROBLEMS_NET):"; fi
567 if [ $LINK_PROBLEMS_NG -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_NG NG $(pluralCheckNoun link $LINK_PROBLEMS_NG)"; fi
568 if [ $LINK_PROBLEMS_RD -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_RD $(pluralCheckNoun redirection $LINK_PROBLEMS_RD)"; fi
569 if [ $LINK_PROBLEMS_EI -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_EI $(pluralCheckNoun link $LINK_PROBLEMS_EI) that could be intrawiki"; fi
570 if [ $LINK_PROBLEMS_IW -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_IW $(pluralCheckNoun link $LINK_PROBLEMS_IW) that could be interwiki"; fi
571
572 # Close the log files' markup
573 valPrint trh "ValExtLinks says goodbye."
574 printRTFfooter
575 printHTMfooter
576
577 # Upload report if this was requested
578 if [ ! -z $UPLOAD_INFO ] && [ $FINISHED_LIST != "no" ]; then
579 uploadReport
580 fi
581
582 # Really quit now
583 valPrint c "ValExtLinks says goodbye."
584 exit 0
585}
586trap wrapupAndExit INT
587
588
589### INITIALIZATION ###
590# Print opening message to console and log files
591valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
592printTXTheader
593printRTFheader
594printHTMheader
595
596## DATA SOURCING ##
597valPrint t "Startup:"
598valPrint r "\b1 Startup \b0"
599valPrint hn "<h3>Startup</h3>"
600
601# Attempt to download file at LINKS_URL, then check that it succeeded
602valPrint cwtrhn "Downloading list of external links from $LINKS_URL..."
603LINKS_FILE_NAME=$(echo "$LINKS_URL" | sed 's/.*\///')
604LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
605curl --silent -o "$LINKS_FILE" $LINKS_URL
606if [ ! -f "$LINKS_FILE" ]; then
607 echo -e "\nThe download of $LINKS_URL appears to have failed. Aborting."
608 wrapupAndExit
609else
610 valPrint ctrh " success."
611fi
612
613# Attempt to download file at EXCEPT_URL, then check that it succeeded
614if [ ! -z $EXCEPT_URL ]; then
615 valPrint cwtrhn "Downloading list of reporting exceptions from $EXCEPT_URL..."
616 EXCEPT_DATA=$(curl --silent $EXCEPT_URL)
617 if [ -z "$EXCEPT_DATA" ]; then
618 echo -e "\nThe download of the exceptions data from '$EXCEPT_URL' appears to have failed. Aborting."
619 wrapupAndExit
620 else
621 valPrint ctrh " success."
622 fi
623 EXCEPT_DATA=${EXCEPT_DATA%END LIST*}
624 EXCEPT_DATA=${EXCEPT_DATA#*BEGIN LIST}
625 EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
626
627 # Store on disk for debugging purposes
628 echo "$EXCEPT_DATA" > "$EXCEPT_FILE"
629
630 # Transfer to array for easy searching later
631 declare -a EXCEPT_ARRAY=($(echo "$EXCEPT_DATA"))
632fi
633
634# Pipe 'cat' to 'wc' because passing LINKS_FILE to 'wc' would print the file name after the line count
635LINK_COUNT_STRING=$(cat "$LINKS_FILE" | wc -l)
636
637# Number of URLs is number of lines minus one (first line is column header row for the CSV)
638LINK_COUNT=$(echo "${LINK_COUNT_STRING}" | tr -d '[:space:]')
639let LINK_COUNT-=1
640valPrint ctrh "Found $LINK_COUNT links to process."
641valPrint trh ""
642
643## CONFIG OUTPUT ##
644valPrint t "Config:"
645valPrint r "\b1 Config \b0"
646valPrint hn "<h3>Config</h3>"
647
648valPrint ctrhn "Links to consider: "
649if [ $URL_LIMIT -ne 0 ] && [ $URL_LIMIT -lt $LINK_COUNT ]; then
650 valPrint ctrh "$((URL_LIMIT-URL_START+1)), from link $URL_START to $URL_LIMIT"
651elif [ $URL_START -ne 1 ]; then
652 valPrint ctrh "$((LINK_COUNT-URL_START+1)), from link $URL_START to $LINK_COUNT"
653else
654 valPrint ctrh "$LINK_COUNT"
655fi
656
657valPrint ctrh "Site query timeout: $TIMEOUT seconds"
658
659valPrint ctrhn "Show OK links: "
660if [ $RECORD_OK_LINKS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
661
662valPrint ctrhn "Take screenshots: "
663if [ $TAKE_PAGE_SHOT -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
664
665valPrint ctrhn "Suggest archive.org snapshots for NG pages: "
666if [ $SUGGEST_SNAPSHOTS_NG -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
667
668valPrint ctrhn "Suggest archive.org snapshots for OK pages: "
669if [ $SUGGEST_SNAPSHOTS_OK -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
670
671valPrint ctrhn "Ignore slash-adding redirects: "
672if [ $SHOW_SLASH -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
673
674valPrint ctrhn "Ignore HTTPS-upgrading redirects: "
675if [ $SHOW_HTTPS -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
676
677valPrint ctrhn "Ignore youtu.be redirects: "
678if [ $SHOW_YT_RD -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
679
680valPrint ctrhn "Check archive.org and archive.is links: "
681if [ $CHECK_ARCHIVE_LINKS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
682
683valPrint tr "A summary of my findings will be found at the bottom of the report."
684valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
685valPrint trh ""
686
687## LEGEND OUTPUT ##
688valPrint t "Legend:"
689valPrint r "\b1 Legend \b0"
690valPrint hn "<h3>Legend</h3>"
691valPrint t "(For guidance in fixing these links, see $WIKI_MAIN.)"
692valPrint r "(For guidance in fixing these links, see {\field{\*\fldinst{HYPERLINK \"$WIKI_MAIN\"}}{\fldrslt here}}.)"
693valPrint h "(For guidance in fixing these links, see <a href=\"$WIKI_MAIN\" target=\"_blank\">here</a>.)"
694valPrint trh "OK = URL seems to be working"
695valPrint trh "NG = URL no longer seems to work"
696valPrint trh "RD = URL is redirecting to this new URL"
697valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup"
698valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup"
699valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $WIKI_HTTP)"
700valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$WIKI_HTTP\"}}{\fldrslt here}} for code reference)"
701valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$WIKI_HTTP\" target=\"_blank\">here</a> for code reference)"
702valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $WIKI_CURL)"
703valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$WIKI_CURL\"}}{\fldrslt here}} for code reference)"
704valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$WIKI_CURL\" target=\"_blank\">here</a> for code reference)"
705valPrint trh "IA suggests = Last available snapshot returned by the Internet Archive"
706valPrint trh "Try browsing = The Archive failed to return a snapshot URL, so check for a snapshot manually using this link"
707valPrint trh ""
708
709
710### MAIN LOOP ###
711valPrint t "Links:"
712valPrint r "\b1 Links \b0"
713valPrint hn "<h3>Links</h3>"
714START_RUN=$(date +%s)
715# Process each line of the .csv in LINKS_FILE
716for LINE in `cat "$LINKS_FILE"`; do
717 START_LINK=$(date +%s)
718 let LINK_NUM+=1
719
720 # First line is the column header row for the CSV, so let's verify that the format hasn't changed
721 if [ $SKIPPED_HEADER_ROW -eq 0 ]; then
722 if [ $LINE == "namespace,title,target" ]; then
723 SKIPPED_HEADER_ROW=1
724 LINK_NUM=0 # this line is not a link, so reset the link counter
725 valPrint hn "<table>"
726 continue
727 else
728 valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
729 wrapupAndExit
730 fi
731 fi
732
733 # Skip this link if we are not at URL_START yet
734 if [ $LINK_NUM -lt $URL_START ]; then
735 continue
736 fi
737
738 # Stop if we are at the limit declared for testing purposes
739 if [ $URL_LIMIT -gt 0 ] && [ $LINK_NUM -gt $URL_LIMIT ]; then
740 FINISHED_LIST="limit"
741 wrapupAndExit
742 fi
743
744 # Print progress to screen
745 if [ $LINK_NUM -gt 1 ]; then
746 printf "\e[1A\n" # erase previous progress message so that new one appears in its place
747 fi
748 valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
749
750 # The number of the namespace is the element before the first comma on the line
751 NS_ID=${LINE%%,*}
752
753 # Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
754 NS_NAME=""
755 a=0
756 while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
757 if [ $NS_ID == "NULL" ]; then
758 break
759 elif [ $NS_ID -eq ${NS_IDS[$a]} ]; then
760 NS_NAME="${NS_NAMES[$a]}"
761 break
762 fi
763 let a+=1
764 done
765 if [ "$NS_NAME" == "" ]; then
766 if [ $NS_ID == "NULL" ]; then
767 valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace (and probably the page too) is 'NULL'. Probably the link is no longer in existence on the wiki."
768 else
769 valPrint trs "Skipping line $LINK_NUM ('$LINE') because I could not find a name for namespace ID $NS_ID."
770 fi
771 let SKIP_UNK_NS+=1
772 let PAGE_LINKS+=1
773 continue
774 fi
775
776 # The name of the page is everything between the namespace ID and the next comma on the line (commas
777 # in page names will break this)
778 PAGE_NAME=${LINE#$NS_ID,}
779 PAGE_NAME=${PAGE_NAME%%,*}
780
781 # Build longer wiki page URLs from namespace and page names
782 FULL_PAGE_PATH=https://$WIKI_PATH/$NS_NAME:$PAGE_NAME
783 LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
784 # Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it
785 # explicitly breaks the link
786 if [ $NS_ID -eq 0 ]; then
787 FULL_PAGE_PATH=https://$WIKI_PATH/$PAGE_NAME
788 LOCAL_PAGE_PATH=$PAGE_NAME
789 fi
790
791 # We don't want to consider wiki pages ending in .js, as the MW parser cannot reliably isolate URLs
792 # in JavaScript code, so it returns erroneous links
793 PAGE_NAME_SUFFIX=$(echo $PAGE_NAME | sed 's/.*\.//')
794 if [ $PAGE_NAME_SUFFIX == "js" ]; then
795 valPrint trs "Skipping URL '${LINE#$NS_ID,$PAGE_NAME,}' on line $LINK_NUM because it was found on JavaScript page '$LOCAL_PAGE_PATH'."
796 let SKIP_JS_PAGE+=1
797 let PAGE_LINKS+=1
798 continue
799 fi
800
801 # The URL being linked to is everything after the previous two fields (this allows commas to be in
802 # the URLs, but a comma in the previous field, the page name, will break this)
803 URL=${LINE#$NS_ID,$PAGE_NAME,}
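 # To illustrate with a made-up line: "0,Some_Page,http://example.com/foo" yields
 # NS_ID "0", PAGE_NAME "Some_Page" and URL "http://example.com/foo"; everything after
 # the second comma, commas included, is kept as the URL.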
804
805 # Scan for illegal characters
806 if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
807 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because it contains characters illegal in a URL."
808 let SKIP_BAD_URL+=1
809 let PAGE_LINKS+=1
810 continue
811 fi
812
813 # If we're skipping archive links, see if this is one
814 if [ $CHECK_ARCHIVE_LINKS -eq 0 ] && [[ ( $URL == *web.archive.org* || $URL == *archive.is* ) ]]; then
815 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to check archive links."
816 let SKIP_ARCHIVES+=1
817 let PAGE_LINKS+=1
818 continue
819 fi
820
821 # Now we need to know if the URL is for a file or a web page. First step is to determine if the
822 # URL ends in a suffix
823 HAS_SUFFIX=0
824
825 # If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
826 CLEAN_URL=${URL%%\?*}
827
828 # If the URL ends in something like "#section_15", strip everything from the '#' onward
829 CLEAN_URL=${CLEAN_URL%%\#*}
830
831 # 'sed' cannot handle Unicode in my Bash shell, so skip non-ASCII URL and make user check it
832 if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
833 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I cannot handle non-ASCII characters."
834 let SKIP_NON_ASCII+=1
835 let PAGE_LINKS+=1
836 continue
837 fi
838
839 # Isolate the characters after the last period and after the last slash
840 POST_DOT=$(echo "$CLEAN_URL" | sed 's/.*\.//')
841 POST_SLASH=$(echo "$CLEAN_URL" | sed 's/.*\///')
842
843 # If the last period comes after the last slash, then the URL ends in a suffix
844 POST_DOT_LENGTH=$(echo | awk -v input=$POST_DOT '{print length(input)}')
845 POST_SLASH_LENGTH=$(echo | awk -v input=$POST_SLASH '{print length(input)}')
846 if [ $POST_DOT_LENGTH -lt $POST_SLASH_LENGTH ]; then
847 HAS_SUFFIX=1
848 else
849 HAS_SUFFIX=0
850 fi
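 # Worked example (URL is hypothetical): for "http://example.com/file.zip", POST_DOT is
 # "zip" and POST_SLASH is "file.zip", so HAS_SUFFIX becomes 1; for
 # "http://example.com/some/page" the last period comes before the last slash, POST_DOT
 # ("com/some/page") is longer than POST_SLASH ("page"), and HAS_SUFFIX stays 0.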
851
852 # Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
853 # known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES
854 IS_FILE=-1
855 if [ $HAS_SUFFIX -eq 0 ]; then
856 IS_FILE=0
857 else
858 # Turn off case sensitivity while we compare suffixes
859 shopt -s nocasematch
860
861 # Special case: URLs ending in something like "productID.304297400" are pages, not files, so if
862 # the URL's suffix is all numbers, we are looking at the end of a web page URL
863 if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
864 IS_FILE=0
865 fi
866
867 # Special case: URLs ending in a parens, e.g. "ms537113(v=vs.85)", are pages
868 if [[ $POST_DOT =~ ^.*[\(\)]$ ]]; then
869 IS_FILE=0
870 fi
871
872 # Special case: URLs containing a '%', e.g. "10.1007%2FBF00329055", are pages
873 if [[ $POST_DOT == *%* ]]; then
874 IS_FILE=0
875 fi
876
877 # If we did not identify this URL as a web page above, we need to compare the suffix against known
878 # file extensions
879 if [ $IS_FILE -eq -1 ]; then
880 for EXTENSION in "${HTTP_FILES[@]}"; do
881 if [[ $EXTENSION == $POST_DOT ]]; then # double brackets respect the nocasematch setting
882 IS_FILE=1
883 break
884 fi
885 done
886 fi
887
888 # If we did not identify this URL as a file above, we need to compare the suffix against known
889 # pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
890 # needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
891 if [ $IS_FILE -eq -1 ]; then
892 for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
893 if [[ $PAGE_OR_TLD == $POST_DOT ]]; then
894 IS_FILE=0
895 break
896 fi
897 done
898 fi
899
900 # Turn case sensitivity back on in Bash
901 shopt -u nocasematch
902 fi
903
904 # If this suffix escaped identification as either a file, page or TLD, inform the user
905 STR_TYPE=""
906 if [ $IS_FILE -eq -1 ]; then
907 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I encountered the unknown URL suffix '$POST_DOT'. Please add this suffix to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
908 let SKIP_UNK_SUFFIX+=1
909 continue
910 elif [ $IS_FILE -eq 1 ]; then
911 STR_TYPE="file"
912 let FILE_LINKS+=1
913 else
914 STR_TYPE="page"
915 let PAGE_LINKS+=1
916 fi
917
918 # Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
919 # issue with sites that require HTTPS
920 CURL_CODE=$(curl -o /dev/null --silent --insecure --compressed --head --user-agent "$AGENT" --max-time $TIMEOUT --retry 2 --write-out '%{http_code}\n' $URL)
921 CURL_ERR=$(echo $?)
922 CURL_RESULT=$CURL_CODE
923
924 # Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
925 if [ $CURL_CODE == "000" ]; then
926 CURL_RESULT="$CURL_RESULT-$CURL_ERR"
927 fi
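 # For example, a site that never answers within TIMEOUT typically yields CURL_CODE "000"
 # plus curl exit code 28 ("operation timed out"), so CURL_RESULT is logged as "000-28".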
928
929 # Begin to determine our status code for this URL (EI, IW, OK, RD, or NG)
930 STATUS="??"
931 NEW_URL=""
932 INTERWIKI_INDEX=-1
933
934 # First make sure that this isn't an "external internal" link to our own wiki that can be replaced
935 # by "[[page_name]]". If it uses a special access URL beginning with "/w/", let it pass, as it
936 # probably cannot be replaced by "[[ ]]" markup
937 if [[ $URL == *$WIKI_PATH* ]] && [[ $URL != *$WIKI_PATH/w/* ]]; then
938 STATUS="EI"
939 let EI_LINKS+=1
940 fi
941
942 # If it's not, check if this is a link to a domain that we have an interwiki prefix for (also make
943 # sure that it's not an archive.org link to a page from an interwiki domain)
944 if [ $STATUS == "??" ] && [[ $URL != *web.archive.org* ]]; then
945 for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
946 if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]] && [[ $URL != *${INTERWIKI_DOMAINS[$i]}/w/* ]]; then
947 STATUS="IW"
948 let IW_LINKS+=1
949 INTERWIKI_INDEX=$i
950 break
951 fi
952 done
953 fi
954
955 # If we didn't match an interwiki domain, see if the status code is in our "OK" codes list
956 if [ $STATUS == "??" ]; then
957 for CODE in "${OK_CODES[@]}"; do
958 if [[ $CODE == $CURL_CODE ]]; then
959 STATUS="OK"
960 let OK_LINKS+=1
961
962 # If this is a YouTube link, we have to look at the actual page source to know if the video
963 # is good or not; override the link's info if it's actually NG
964 if [[ $URL == *www.youtube.com* ]]; then
965 PAGE_TEXT=$(curl --silent --insecure --user-agent "$AGENT" --max-time $TIMEOUT $URL | grep "\"simpleText\":\"Video unavailable\"")
966 if [ ! -z "$PAGE_TEXT" ]; then
967 STATUS="NG"
968 CURL_RESULT=404
969 let OK_LINKS-=1
970 let NG_LINKS+=1
971 fi
972 fi
973 break
974 fi
975 done
976 fi
977
978 # If we didn't get a match with the "OK" codes, check it against the "RD" codes
979 if [ $STATUS == "??" ]; then
980 for CODE in "${RD_CODES[@]}"; do
981 if [[ $CODE == $CURL_CODE ]]; then
982 # Get URL header again in order to retrieve the URL we are being redirected to
983 NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time $TIMEOUT --write-out '%{redirect_url}\n' $URL)
984
985 # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
986 # those changes out if the user didn't ask for them
987 URL_HTTP=$(echo $URL | sed -E 's/^https:/http:/')
988 NEW_URL_HTTP=$(echo $NEW_URL | sed -E 's/^https:/http:/')
989
990 # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
991 NEW_URL_LENGTH=$(echo | awk -v input=$NEW_URL_HTTP '{print length(input)}')
992 if [ $NEW_URL_LENGTH -lt $MIN_URL_LENGTH ]; then
993 NEW_URL_HTTP="[new URL not retrieved]"
994 fi
995
996 # Remove slash at end of new URL, if present, so we can filter out the redirects that
997 # merely add an ending slash if the user didn't ask for them
998 NEW_URL_NO_SLASH=$(echo $NEW_URL_HTTP | sed -E 's:/$::')
999
1000 # Detect if this is a youtu.be link simply being expanded by YouTube to the full
1001 # youtube.com address
1002 YOUTU_BE=0
1003 if [[ $URL_HTTP == http://youtu.be* ]] && [[ $NEW_URL_HTTP == http://www.youtube.com* ]]; then
1004 YOUTU_BE=1
1005 fi
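 # e.g. "http://youtu.be/VIDEO_ID" (ID is a placeholder) redirecting to
 # "http://www.youtube.com/watch?v=VIDEO_ID" sets YOUTU_BE to 1 here.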
1006
1007 # If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
1008 # wants those to be reported)
1009 if [ $SHOW_HTTPS -eq 0 ] && [ $URL_HTTP == $NEW_URL_HTTP ]; then
1010 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show http->https upgrades, and I was redirected to '$NEW_URL'."
1011 STATUS="OK"
1012 let OK_LINKS+=1
1013 let SKIP_HTTPS_UP+=1
1014 # If the URLs match besides an added ending slash, then the link is OK (unless user wants
1015 # those to be reported)
1016 elif [ $SHOW_SLASH -eq 0 ] && [ $URL_HTTP == $NEW_URL_NO_SLASH ]; then
1017 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show added trailing slashes, and I was redirected to '$NEW_URL'."
1018 STATUS="OK"
1019 let OK_LINKS+=1
1020 let SKIP_SLASH_ADD+=1
1021 elif [ $YOUTU_BE -eq 1 ]; then
1022 # We have to look at the actual page source to know if a YouTube video is good or not
1023 PAGE_TEXT=$(curl --silent --insecure --user-agent "$AGENT" --max-time $TIMEOUT $NEW_URL | grep "\"simpleText\":\"Video unavailable\"")
1024 if [ ! -z "$PAGE_TEXT" ]; then
1025 STATUS="NG"
1026 let NG_LINKS+=1
1027 else
1028 if [ $SHOW_YT_RD -eq 0 ]; then
1029 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'."
1030 STATUS="OK"
1031 let OK_LINKS+=1
1032 let SKIP_YOUTU_BE+=1
1033 else
1034 STATUS="RD"
1035 let RD_LINKS+=1
1036 fi
1037 fi
1038 else
1039 STATUS="RD"
1040 let RD_LINKS+=1
1041 fi
1042 break
1043 fi
1044 done
1045 fi
1046
1047 # If we didn't get a match with the "RD" codes, check it against the "NG" codes
1048 if [ $STATUS == "??" ]; then
1049 for CODE in "${NG_CODES[@]}"; do
1050 if [[ $CODE == $CURL_CODE ]]; then
1051 STATUS="NG"
1052 let NG_LINKS+=1
1053 break
1054 fi
1055 done
1056 fi
1057
1058 # If we didn't match a known status code, advise the reader
1059 if [ $STATUS == "??" ]; then
1060 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I encountered the unknown response code $CURL_CODE."
1061 let SKIP_UNK_CODE+=1
1062 continue
1063 fi
1064
1065 # Check problem links against exceptions list before proceeding
1066 FOUND_EXCEPT=0
1067 if [ $STATUS != "OK" ] && [ ! -z $EXCEPT_URL ]; then
1068 # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
1069 EXPECT_CODE="$CURL_RESULT"
1070 if [ $STATUS == "EI" ]; then
1071 EXPECT_CODE="EI"
1072 elif [ $STATUS == "IW" ]; then
1073 EXPECT_CODE="IW"
1074 fi
1075
1076 # Look for link in exceptions list and make sure the listed result code and wiki page also match
1077 for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
1078 {
1079 EXCEPT_LINE="${EXCEPT_ARRAY[$i]}"
1080
1081 # Undo any HTML-encoding from the wiki page; for now we just worry about the ampersand, as most
1082 # other HTML-encoded characters are not found in URLs
1083 EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/\&amp;/\&/g')
1084
1085 # Match URL
1086 EXCEPT_URL="${EXCEPT_LINE#*,}"
1087 EXCEPT_URL="${EXCEPT_URL%,*}"
1088 if [ "$EXCEPT_URL" != "$URL" ]; then
1089 continue
1090 fi
1091
1092 # Match containing page's name
1093 EXCEPT_PAGE="${EXCEPT_LINE##*,}"
1094 EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
1095 if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == $LOCAL_PAGE_PATH ]; then
1096 # Match result code
1097 EXCEPT_CODE=${EXCEPT_LINE%%,*}
1098 if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
1099 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because its expected result, '$EXPECT_CODE', is in the exceptions list."
1100 if [ $STATUS == "EI" ]; then
1101 let SKIP_EXPECT_EI+=1
1102 elif [ $STATUS == "IW" ]; then
1103 let SKIP_EXPECT_IW+=1
1104 elif [ $STATUS == "RD" ]; then
1105 let SKIP_EXPECT_RD+=1
1106 else
1107 let SKIP_EXPECT_NG+=1
1108 fi
1109 FOUND_EXCEPT=1
1110 break
1111 fi
1112 fi
1113 } done
1114 fi
1115 if [ $FOUND_EXCEPT -eq 1 ]; then
1116 continue
1117 fi
1118
1119 # If appropriate, record this link to the log, with clickable URLs when possible
1120 if [ $STATUS != "OK" ] || [ $RECORD_OK_LINKS -eq 1 ]; then
1121 # Prepare 'curl' result in parentheses to print after status code, unless this is an "IW" or "EI"
1122 # link, in which case showing the status code doesn't make sense. Adjust spacing after string to
1123 # ensure TXT and RTF reports have aligned columns of results.
1124 CURL_STR_H=" ($CURL_RESULT)"
1125 CURL_STR_T="$CURL_STR_H"
1126 CURL_STR_R="$CURL_STR_H "
1127 if [ $STATUS == "IW" ] || [ $STATUS == "EI" ]; then
1128 CURL_STR_H=""
1129 CURL_STR_T=" "
1130 CURL_STR_R=" "
1131 fi
1132
1133 # Record link and its wiki page in TXT, RTF, and HTML markup
1134 valPrint t "${STATUS}${CURL_STR_T} $STR_TYPE $URL"
1135 valPrint t " linked from $FULL_PAGE_PATH"
1136 valPrint r "${STATUS}${CURL_STR_R}${RTF_TABS}$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
1137 valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
1138 valPrint hn "<tr><td style=\"white-space:nowrap\">${STATUS}${CURL_STR_H}</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
1139 valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"
1140
1141 # Place vertical space here since we won't be printing anything more about this link
1142 if [ $STATUS == "OK" ] && [ $SUGGEST_SNAPSHOTS_OK -eq 0 ]; then valPrint tr ""; valPrint hs ""; fi
1143
1144 # Record redirect URL if one was given by a 3xx response page
1145 if [ $STATUS == "RD" ]; then
1146 valPrint ts " Server suggests $NEW_URL"
1147 valPrint rs " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
1148 valPrint hs "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
1149 fi
1150
1151 # Notify reader if we can use an intrawiki link for this URL
1152 if [ $STATUS == "EI" ]; then
1153 INTRA_PAGE=${URL#*://*/}
1154 valPrint ts " Just use [[$INTRA_PAGE]]"
1155 valPrint rs " Just use [[$INTRA_PAGE]]"
1156 valPrint hs "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
1157 fi
1158
1159 # Notify reader if we can use an interwiki prefix for this URL
1160 if [ $STATUS == "IW" ]; then
1161 INTER_PAGE=$(echo "$URL" | sed 's/.*\///')
1162 valPrint ts " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
1163 valPrint rs " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
1164 valPrint hs "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
1165 fi
1166
1167 # Query Internet Archive for latest "OK" snapshot for "NG" page
1168 if [[ ( $STATUS == "NG" && $SUGGEST_SNAPSHOTS_NG -eq 1 ) || ( $STATUS == "OK" && $SUGGEST_SNAPSHOTS_OK -eq 1 ) ]]; then
1169
1170 # We need to watch out for the rate limit or we'll get locked out; look at how much time has
1171 # elapsed and then wait the remainder between that and how long of a wait we think is needed
1172 # to avoid the dreaded "Too Many Requests" response. 5 seconds is just a guess.
1173 CUR_TIME=$(date +%s)
1174 WAIT_REMAINDER=$((5 - $CUR_TIME + $START_LINK))
1175 if [ $WAIT_REMAINDER -gt 0 ]; then
1176 valPrint t "Waiting $WAIT_REMAINDER second(s) to conform to Archive.org rate limit."
1177 sleep $WAIT_REMAINDER
1178 fi
1179
1180 # Issue query to the API
1181 ARCHIVE_QUERY=$(curl --silent --max-time $TIMEOUT "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")
1182
1183 # Notify user if we hit the rate limit and just keep going
1184 if [[ "$ARCHIVE_QUERY" == *"Too Many Requests"* ]]; then
1185 valPrint t " IA has rate-limited us!"
1186 valPrint r " IA has rate-limited us!"
1187 valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td>(hit the API rate limit!)</td></tr>"
1188 # If a "closest" snapshot was received, inform user
1189 elif [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
1190 # In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
1191 ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')
1192
1193 # ...isolate "url" property in the response that follows the "closest" tag
1194 SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
1195 SNAPSHOT_URL=${SNAPSHOT_URL#*\"url\": \"} # everything after '"url": "'
1196 SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'
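 # The three substitutions above assume a response shaped roughly like
 # {"archived_snapshots": {"closest": {"url": "https://web.archive.org/web/<timestamp>/<URL>", ...}}}
 # and whittle it down to just that snapshot URL.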
1197
1198 # Remove the port 80 part that IA often adds to the URL, as it's superfluous
1199 SNAPSHOT_URL=$(echo $SNAPSHOT_URL | sed 's/:80//')
1200
1201 # Inform the user of the snapshot URL
1202 valPrint ts " IA suggests $SNAPSHOT_URL"
1203 valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
1204 valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
1205 else # Otherwise give a generic Wayback Machine link for this URL, which might work
1206 valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
1207 valPrint rs " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
1208 valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
1209 fi
1210 fi
1211 fi
1212
1213 # If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
1214 if [ $IS_FILE -eq 0 ] && [ $TAKE_PAGE_SHOT -eq 1 ] && [ $STATUS == "OK" ]; then
1215 # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
1216 SHOT_NAME=$(echo "$URL" | sed 's/https*\:\/\///' | sed 'y/:\//__/')
1217 SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"
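 # For instance (URL is hypothetical), "https://example.com/some/page" would be saved as
 # "example.com_some_page.png" inside the Screenshots folder.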
1218
1219 # Don't take screenshot if we already encountered this page and screenshotted it
1220 if [ ! -f "$SHOT_FILE" ]; then
1221 "$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 $URL > /dev/null 2>&1
1222 if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
1223 mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
1224 else
1225 valPrint trhs "Screenshot of URL $URL seems to have failed!"
1226 fi
1227 else
1228 valPrint trhs "Skipping screenshot of URL '$URL' because file '$SHOT_FILE' already exists."
1229 fi
1230 fi
1231done
1232FINISHED_LIST="yes"
1233wrapupAndExit