source: Validate External Links/validate_external_links.sh@ 1142

Last change on this file since 1142 was 1142, checked in by iritscen, 4 years ago

Val now tries each URL three times. This has proven more effective than giving Val a long timeout and trying each URL once. The summary report has been refined a bit; the most notable change is that the final number and breakdown of link issues leaves out the excepted links. Also stopped Val from getting confused by HTML-encoded '&'s in the exceptions list.

File size: 49.7 KB
RevLine 
[1064]1#!/bin/bash
2
3# Validate External Links by Iritscen
[1141]4#
5# Validates a list of external links in CSV format. The resulting logs are produced in three formats:
6# - TXT (for easy diffing with an earlier log)
7# - RTF (for reading as a local file with clickable links)
8# - HTML (for uploading as a web page).
[1142]9# Call script with "--help" argument for documentation. Also see Read Me.rtf for critical notes.
[1141]10#
[1064]11# Recommended rule:
[1118]12# |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|
[1141]13#
14# Table of contents (sections of script in order of appearance, not execution):
15# • Globals
16# • Help Output
17# • Setup
18# • Utility Functions
19# • Summary Output
20# • Initialization
21# • Data Sourcing
22# • Config Output
23# • Legend Output
24# • Main Loop
[1064]25
# Set separator token to newline, so that unquoted expansions are split on line breaks only
IFS=$'\n'

### GLOBALS ###
# Settings -- these will be changed from their defaults by the arguments passed in to the script
LINKS_URL=""          # use 'curl' to download file with links from this location (can be file://)
EXCEPT_URL=""         # 'curl' will access this wiki page with a list of exceptions for NG results
OUTPUT_DIR=""         # place reports and all other output in a folder inside this existing folder
RECORD_OK_LINKS=0     # record response code to the log even when it's a value in OK_CODES
SHOW_SLASH=0          # record response code to the log when a slash is added to the end of a URL
SHOW_HTTPS=0          # record response code to the log when "http" is upgraded to "https"
SHOW_YT_RD=0          # record response code to the log when a youtu.be URL is expanded to the full URL
SUGGEST_SNAPSHOTS=0   # query the Internet Archive for a possible snapshot URL for each NG page
SKIP_ARCHIVE_LINKS=0  # don't check URLs under the archive.org domain
TAKE_PAGE_SHOT=0      # take a screenshot of each OK page
TIMEOUT=10            # time to wait for a response when querying a site
CHROME_PATH=""        # path to a copy of Google Chrome that has the command-line screenshot feature
URL_START=1           # start at this URL in LINKS_FILE
URL_LIMIT=0           # if non-zero, stop at this URL in LINKS_FILE
UPLOAD_INFO=""        # path to a file on your hard drive with the login info needed to upload a report

# Fixed strings -- see the occurrences of these variables to learn their purpose
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 OPR/70.0.3728.154"
ARCHIVE_API="http://archive.org/wayback/available"
ARCHIVE_GENERIC="https://web.archive.org/web/*"
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
CHROME_SCREENSHOT="screenshot.png"
EXCEPT_FILE_NAME="exceptions.txt"
EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
WIKI_CURL="https://wiki.oni2.net/Validate_External_Links/Curl_codes"
WIKI_HTTP="https://wiki.oni2.net/Validate_External_Links/HTTP_codes"
WIKI_MAIN="https://wiki.oni2.net/Validate_External_Links"
WIKI_ME="http://iritscen.oni2.net"
# Directory holding this script. The expansions are quoted so that the result does not depend on the
# IFS override above or on glob characters in the path, and 'pwd' only runs if the 'cd' worked.
THIS_DIR=$(cd "$(dirname "$0")" && pwd)
WORKING_DIR=$(pwd)
WIKI_PATH="wiki.oni2.net"
63
# Parallel arrays: NS_IDS[i] is the numeric ID of the OniGalore namespace named NS_NAMES[i]
NS_IDS=(
   -2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
   100 101 102 103 104 105 108 109 110 111
)
NS_NAMES=(
   "Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File"
   "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk"
   "Category" "Category_talk"
   "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk"
)

# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
HTTP_FILES=(
   3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg
   last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png py rar tga TRMA txt vbs wav wmv xaf xml zip
)
HTTP_TLDS_AND_PAGES=(
   action ars asp aspx cfm cgi com css de full htm html it js jsp net org pgi php php3 phtml pl ru
   shtml stm uk x
)

# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
# if you add a new code.
OK_CODES=(200 401 405 406 418 501)
RD_CODES=(301 302 303 307 308)
NG_CODES=(000 400 403 404 410 500 502 503 530)

# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
# transcluded text, and if the transclusion fails, then the braces show up in the URL
ILLEGAL_CHARS="{ }"

# The shortest URL possible, used for sanity-checking some URLs: http://a.co
MIN_URL_LENGTH=11

# Parallel arrays: INTERWIKI_PREFIXES[i] is the prefix that can be used in place of a normal external
# link to the wiki or other site at INTERWIKI_DOMAINS[i]
INTERWIKI_PREFIXES=(
   commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies
   wikiversity wikivoyage wikt wp
)
INTERWIKI_DOMAINS=(
   commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org
   wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org
   wikiversity.org wikivoyage.org wiktionary.org wikipedia.org
)
[1064]91
# Variables for keeping track of main loop progress and findings

# Position in the links list and where the run stands
LINK_NUM=0
SKIPPED_HEADER_ROW=0
FINISHED_LIST="no"
START_RUN=0
END_RUN=0

# Tallies of links by outcome; these are reported in the session summary
OK_LINKS=0
RD_LINKS=0
NG_LINKS=0
EI_LINKS=0
IW_LINKS=0
FILE_LINKS=0
PAGE_LINKS=0

# Links that could not be processed; the summary reports these as link errors
SKIP_UNK_NS=0
SKIP_JS_PAGE=0
SKIP_BAD_URL=0
SKIP_NON_ASCII=0
SKIP_UNK_SUFFIX=0
SKIP_UNK_CODE=0

# Link issues that were excepted from the report
SKIP_EXPECT_NG=0
SKIP_EXPECT_RD=0
SKIP_EXPECT_EI=0
SKIP_EXPECT_IW=0

# Redirects that the summary counts as trivial (HTTPS upgrade, added slash, youtu.be expansion)
SKIP_HTTPS_UP=0
SKIP_SLASH_ADD=0
SKIP_YOUTU_BE=0

# Archive.org links that were not checked
SKIP_ARCHIVE_ORG=0
[1064]119
120
[1141]121### HELP OUTPUT ###
[1064]122# A pseudo-man page. Here is the 80-character rule for the page text:
123# 234567890123456789012345678901234567890123456789012345678901234567890123456789
124function printHelp()
125{
126 cat << EOF
127
128NAME
129 Validate External Links
130
131SYNOPSIS
132 validate_external_links.sh --help
[1070]133 validate_external_links.sh --links URL --output DIR [--exceptions URL]
[1136]134 [--record-ok-links] [--show-added-slashes] [--show-https-upgrades]
135 [--show-yt-redirects] [--suggest-snapshots] [--skip-archive-links]
[1141]136 [--take-screenshots FILE] [--timeout NUM] [--start-url NUM]
137 [--end-url NUM] [--upload FILE]
[1064]138
139DESCRIPTION
140 This script parses a list of external links found in the OniGalore wiki
141 (which is dumped by the Oni2.net domain periodically in a particular
142 format), validates them using the Unix tool 'curl', and produces a report
[1070]143 of which links were "OK" (responded positively to an HTTP query), which
144 were "RD" (responded with a 3xx redirect code), which could be "IW"
145 (interwiki) links, which are "EI" (external internal) links and could be
146 intrawiki links, and which were "NG" (no good; a negative response to the
[1069]147 query). This report can then be automatically uploaded to the location of
[1064]148 your choice. The script can also suggest Internet Archive snapshots for
[1070]149 "NG" links, and take screenshots of "OK" links for visual verification by
150 the reader that the page in question is the one intended to be displayed.
[1064]151
152 You must pass this script the URL at which the list of links is found
[1070]153 (--links) and the path where the directory of logs should be outputted
154 (--output). All other arguments are optional.
[1064]155
156OPTIONS
[1075]157 --help Show this page.
158 --links URL (required) URL from which to download the CSV
159 file with external links. Note that this URL can
160 be a local file if you supply a file:// path.
161 --output DIR (required) Unix path to directory in which Val
162 should place its reports.
163 --exceptions URL In order to remove links from the report which
[1136]164 Val finds an issue with but which you regard as
165 OK, list those desired exceptions on a wiki page.
166 See the sample file "exceptions.pdf" for the
167 required format of the page. Note that this URL
168 can point to a local file if you supply a path
169 beginning with "file://".
[1075]170 --record-ok-links Log a link in the report even if its response
171 code is "OK".
[1122]172 --show-added-slashes Report on redirects that simply add a '/' to the
173 end of the URL.
[1127]174 --show-https-upgrades Report on redirects that simply upgrade a
[1122]175 "http://" URL to a "https://" URL.
[1127]176 --show-yt-redirects Report on redirects that expand a youtu.be URL.
[1075]177 --suggest-snapshots Query the Internet Archive for a possible
178 snapshot URL for each "NG" page.
[1135]179 --skip-archive-links Don't check links that are already pointing to
180 a page on the Internet Archive.
[1075]181 --take-screenshots FILE Call the Google Chrome binary at this path to
182 take screenshots of each "OK" page.
[1141]183 --timeout NUM Wait this many seconds for a site to respond. The
[1142]184 default is 10. Important note: Val will attempt
185 to reach each URL three times, so the time taken
186 to ping an unresponsive site will be three times
187 this setting.
[1075]188 --start-url NUM Start at this link in the links CSV file.
189 --end-url NUM Stop at this link in the links CSV file.
190 --upload FILE Upload report using the credentials and path
191 given in this local text file. See sftp_login.txt
192 for template.
[1064]193
194BUGS
195 The script cannot properly parse any line in the external links file
196 which contains a comma in the name of the wiki page containing a link.
197 Commas in the link itself are not an issue.
198EOF
199}
200
201
### SETUP ###
# If first argument is a help request, or if nothing was passed in at all, print help page and quit
if [ "$#" -eq 0 ] || [ "$1" == "--help" ]; then
   printHelp | less
   exit 0
fi

# Parse arguments as long as there are more arguments to process
while (( "$#" )); do
   # An option that takes a value must actually be followed by one. Without this check, 'shift 2'
   # fails when only one argument is left, nothing gets consumed, and this loop never ends.
   case "$1" in
      --links | --exceptions | --output | --take-screenshots | --timeout | --start-url | --end-url | --upload )
         if [ "$#" -lt 2 ]; then
            echo "Error: The argument $1 requires a value. Aborting."
            exit 1
         fi;;
   esac

   case "$1" in
      --links )               LINKS_URL="$2";                      shift 2;;
      --exceptions )          EXCEPT_URL="$2";                     shift 2;;
      --output )              OUTPUT_DIR="$2";                     shift 2;;
      --record-ok-links )     RECORD_OK_LINKS=1;                   shift;;
      --show-added-slashes )  SHOW_SLASH=1;                        shift;;
      --show-https-upgrades ) SHOW_HTTPS=1;                        shift;;
      --show-yt-redirects )   SHOW_YT_RD=1;                        shift;;
      --suggest-snapshots )   SUGGEST_SNAPSHOTS=1;                 shift;;
      --skip-archive-links )  SKIP_ARCHIVE_LINKS=1;                shift;;
      --take-screenshots )    TAKE_PAGE_SHOT=1; CHROME_PATH="$2";  shift 2;;
      --timeout )             TIMEOUT="$2";                        shift 2;;
      --start-url )           URL_START="$2";                      shift 2;;
      --end-url )             URL_LIMIT="$2";                      shift 2;;
      --upload )              UPLOAD_INFO="$2";                    shift 2;;
      * )                     echo "Invalid argument $1 detected. Aborting."; exit 1;;
   esac
done
229
# If the required arguments were not supplied, print an error and quit. The variables are quoted so
# that the tests do not depend on how '[' happens to treat a missing operand.
if [ -z "$LINKS_URL" ] || [ -z "$OUTPUT_DIR" ]; then
   echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
   exit 2
fi

# If user wants screenshots, make sure path to Chrome was passed in and is valid
if [ "$TAKE_PAGE_SHOT" -eq 1 ] && [ ! -f "$CHROME_PATH" ]; then
   echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
   exit 3
fi

# Check that UPLOAD_INFO exists, if this argument was supplied
if [ -n "$UPLOAD_INFO" ] && [ ! -f "$UPLOAD_INFO" ]; then
   echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
   exit 4
fi

# Check that OUTPUT_DIR is a directory
if [ ! -d "$OUTPUT_DIR" ]; then
   echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
   exit 5
fi
255
# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
SHOT_PATH="$OUTPUT_PATH/Screenshots"
LOG_NAME="ValExtLinks report"
LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt"
LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf"
LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm"
mkdir "$OUTPUT_PATH"

# Check that 'mkdir' succeeded before putting anything inside the new folder. The directory named in
# the message is OUTPUT_DIR, the parent in which the folder was supposed to be created.
if [ ! -d "$OUTPUT_PATH" ]; then
   echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_DIR. Aborting."
   exit 6
fi

# Screenshots get their own subfolder, which is only needed if screenshots were requested
if [ "$TAKE_PAGE_SHOT" -eq 1 ]; then
   mkdir "$SHOT_PATH"
fi
276
# Get date on the file at LINKS_URL; it is printed in the report headers. The header name is matched
# case-insensitively because HTTP/2 responses carry lowercase field names ("last-modified").
LINKS_DATE=$(curl --silent --head "$LINKS_URL" | grep -i "Last-Modified" | head -n 1)
if [ -z "$LINKS_DATE" ]; then
   echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
   exit 7
fi
LINKS_DATE=${LINKS_DATE#*:}     # drop the header name, whatever its capitalization
LINKS_DATE=${LINKS_DATE# }      # drop the space that follows the colon
LINKS_DATE=${LINKS_DATE%$'\r'}  # HTTP header lines end in CRLF; don't carry the CR into the logs
284
285
286### UTILITY FUNCTIONS ###
# Writes the plain-text report header to the TXT log file via valPrint: the report title, the
# generation time (NICE_TIME), the date of the links data (LINKS_DATE), the author credit with the
# contact URL (WIKI_ME), and a closing blank line.
function printTXTheader()
{
   local header_line
   for header_line in \
      "Validate External Links report" \
      "generated $NICE_TIME" \
      "from data of $LINKS_DATE" \
      "script by Iritscen (contact: $WIKI_ME)" \
      ""
   do
      valPrint t "$header_line"
   done
}
296
# Writes the RTF header to the RTF log file via valPrint: the document preamble, then a centered
# title block (title, NICE_TIME, LINKS_DATE, contact hyperlink to WIKI_ME), then a paragraph
# definition that returns the text to left-justified. The RTF markup is kept in single quotes so
# that its backslashes are literal; only the three variables are spliced in.
function printRTFheader()
{
   # Tab stops shared by the centered and the left-justified paragraph definitions
   local tab_stops='\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640'
   local rtf_header='{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww12600\viewh12100\viewkind0
\pard'"$tab_stops"'\li360\pardirnatural\qc\partightenfactor0

\f0\fs28 \cf0 \b1 Validate External Links report \b0\
generated '"$NICE_TIME"'\
from data of '"$LINKS_DATE"'\
script by Iritscen ({\field{\*\fldinst{HYPERLINK "'"$WIKI_ME"'"}}{\fldrslt contact}})
\
\pard'"$tab_stops"'\li360\pardirnatural\partightenfactor0
\cf0 '
   valPrint r "$rtf_header"
}
315
# Closes the RTF markup of the RTF log file by writing the brace that balances the one opened at the
# start of the text written by printRTFheader
function printRTFfooter()
{
   local closing_brace="}"
   valPrint r "$closing_brace"
}
321
# Writes the HTML header to the HTML log file via valPrint: the opening document tags, the report
# title as page title and heading, and a sub-heading giving NICE_TIME, LINKS_DATE and a contact
# link to WIKI_ME. The <body> and <html> elements are left open.
function printHTMheader()
{
   local html_header='<html>
<head>
<title>Validate External Links report</title>
</head>
<body>
<h2>Validate External Links report</h2>
<h3>generated '"$NICE_TIME"'<br />
from data of '"$LINKS_DATE"'<br />
script by Iritscen (<a href="'"$WIKI_ME"'" target="_blank">contact</a>)</h3>'
   valPrint h "$html_header"
}
335
# Closes the HTML markup of the HTML log file by writing the closing </body> and </html> tags
function printHTMfooter()
{
   local closing_tags='</body>
</html>'
   valPrint h "$closing_tags"
}
342
# The central logging function.
# Parameter 1 is a string composed of one or more characters that select outputs and options:
#   'c' = console, 't' = the TXT log, 'r' = the RTF log, 'h' = the HTML log
#   'n' = don't end the line (console/TXT: no newline; RTF/HTML: no "\line" or "<br />" markup)
#   's' = print an extra blank line after the message (an empty table row in the HTML log)
#   'w' = don't pass console output through 'fmt' ("fmt" fits the output to an 80-column CLI but
#         can break special formatting and the 'n' option)
# Parameter 2 is the message. It is always written literally: 'printf' is used instead of 'echo' so
# that backslashes in the message are not interpreted in some modes but not others, and so that a
# message that looks like an 'echo' option (e.g. "-n") is still printed.
# Reads the globals LOG_TXT, LOG_RTF and LOG_HTM for the paths of the log files, which are appended to.
function valPrint()
{
   local targets="$1"
   local message="$2"

   # Console
   if [[ "$targets" == *c* ]]; then
      if [[ "$targets" == *n* ]]; then
         printf '%s' "$message"
      elif [[ "$targets" == *w* ]]; then
         printf '%s\n' "$message"
      elif [[ "$targets" == *s* ]]; then
         printf '%s\n\n' "$message"
      else
         printf '%s\n' "$message" | fmt -w 80
      fi
   fi

   # Plain-text log
   if [[ "$targets" == *t* ]]; then
      if [[ "$targets" == *n* ]]; then
         printf '%s' "$message" >> "$LOG_TXT"
      elif [[ "$targets" == *s* ]]; then
         printf '%s\n\n' "$message" >> "$LOG_TXT"
      else
         printf '%s\n' "$message" >> "$LOG_TXT"
      fi
   fi

   # RTF log; a visible line break is the "\line" control word, not the newline in the file
   if [[ "$targets" == *r* ]]; then
      if [[ "$targets" == *n* ]]; then
         printf '%s\n' "$message" >> "$LOG_RTF"
      elif [[ "$targets" == *s* ]]; then
         printf '%s\\line\\line\n' "$message" >> "$LOG_RTF"
      else
         printf '%s\\line\n' "$message" >> "$LOG_RTF"
      fi
   fi

   # HTML log; note that 's' takes precedence over 'n' here
   if [[ "$targets" == *h* ]]; then
      if [[ "$targets" == *s* ]]; then
         printf '%s<tr><td>&nbsp;</td></tr>\n' "$message" >> "$LOG_HTM"
      elif [[ "$targets" == *n* ]]; then
         printf '%s\n' "$message" >> "$LOG_HTM"
      else
         printf '%s<br />\n' "$message" >> "$LOG_HTM"
      fi
   fi
}
389
# Pluralize the string in parameter 1 if the number in parameter 2 is not 1, and print the result.
# A noun ending in "x" gets "es" appended; any other noun gets "s". The noun is quoted throughout so
# that it is printed exactly as passed in (no word splitting or filename expansion).
function pluralCheckNoun()
{
   local noun="$1"
   local count="$2"

   if [ "$count" -ne 1 ]; then
      if [[ "$noun" =~ x$ ]]; then
         printf '%s\n' "${noun}es"
      else
         printf '%s\n' "${noun}s"
      fi
   else
      printf '%s\n' "$noun"
   fi
}
403
# Output "is" if parameter 1 is 1, otherwise "are". The parameter is quoted so that the test always
# receives it as a single operand.
function pluralCheckIs()
{
   local count="$1"

   if [ "$count" -ne 1 ]; then
      echo "are"
   else
      echo "is"
   fi
}
413
# Output "was" if parameter 1 is 1, otherwise "were". The parameter is quoted so that the test always
# receives it as a single operand.
function pluralCheckWas()
{
   local count="$1"

   if [ "$count" -ne 1 ]; then
      echo "were"
   else
      echo "was"
   fi
}
423
# Output "a " (with the trailing space) if parameter 1 is 1, otherwise nothing. The parameter is
# quoted so that the test always receives it as a single operand.
function pluralCheckA()
{
   local count="$1"

   if [ "$count" -eq 1 ]; then
      echo "a "
   fi
}
431
# Output "an " (with the trailing space) if parameter 1 is 1, otherwise nothing. The parameter is
# quoted so that the test always receives it as a single operand.
function pluralCheckAn()
{
   local count="$1"

   if [ "$count" -eq 1 ]; then
      echo "an "
   fi
}
439
# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
# reports being saved to disk have already been closed.
# The file at UPLOAD_INFO is searched for lines beginning with the markers "user:", "pw:", "port:" and
# "path:"; the rest of the first matching line is the value. The values are handed, together with the
# path of the HTML report and its file name, to the 'expect' script EXPECT_SCRIPT_NAME located in
# THIS_DIR. Every value is quoted so that it reaches 'expect' exactly as written in the file, even if
# it contains spaces or glob characters such as '*'.
# NOTE(review): the password is passed on the command line of 'expect', where other local users may
# be able to see it in the process list -- consider whether that is acceptable.
function uploadReport()
{
   valPrint c "Uploading HTML report..."

   SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
   SFTP_USER_NAME_MARKER="user:"
   SFTP_PASSWORD_MARKER="pw:"
   SFTP_PORT_MARKER="port:"
   SFTP_PATH_MARKER="path:"

   # The markers are anchored to the start of the line (which is also where the prefix removal below
   # expects them) and only the first match is used, so that a marker occurring inside another
   # line's value cannot be picked up
   SFTP_USER_NAME=$(grep -m 1 -- "^$SFTP_USER_NAME_MARKER" "$UPLOAD_INFO")
   SFTP_USER_NAME=${SFTP_USER_NAME#"$SFTP_USER_NAME_MARKER"}
   SFTP_PASSWORD=$(grep -m 1 -- "^$SFTP_PASSWORD_MARKER" "$UPLOAD_INFO")
   SFTP_PASSWORD=${SFTP_PASSWORD#"$SFTP_PASSWORD_MARKER"}
   SFTP_PORT=$(grep -m 1 -- "^$SFTP_PORT_MARKER" "$UPLOAD_INFO")
   SFTP_PORT=${SFTP_PORT#"$SFTP_PORT_MARKER"}
   SFTP_PATH=$(grep -m 1 -- "^$SFTP_PATH_MARKER" "$UPLOAD_INFO")
   SFTP_PATH=${SFTP_PATH#"$SFTP_PATH_MARKER"}

   expect "$SCRIPT_PATH" "$LOG_HTM" "$SFTP_USER_NAME" "$SFTP_PASSWORD" "$SFTP_PORT" "$SFTP_PATH" "$LOG_NAME.htm"

   valPrint c "Report was uploaded, unless an error message appears above."
}
464
# Prints session summary when script is done, closes the markup of the RTF and HTML logs, uploads the
# HTML report if that was requested (and the session was not canceled), and exits with status 0.
# Also installed as the handler for the interrupt signal; see the 'trap' after the function.
# Reads the counters declared under GLOBALS plus FINISHED_LIST, URL_START, START_RUN, LINK_COUNT and
# UPLOAD_INFO. Safe to call before the main loop has started (START_RUN is still 0 then and
# LINK_COUNT may not have been set).
function wrapupAndExit()
{
   # Get off progress line on console, drop down a line from last link in log, and close HTML table
   valPrint ctr ""
   valPrint h "</table><br />"

   # If we didn't finish processing the last URL, then the iterator is one too high
   if [ "$FINISHED_LIST" != "yes" ]; then
      LINK_NUM=$((LINK_NUM-1))
      if [ "$FINISHED_LIST" == "no" ]; then
         valPrint ctrh "The session was canceled by the user."
      fi
   fi

   # Generate string with elapsed time. START_RUN is only set once the main loop begins, so if we are
   # quitting before that point then no time has elapsed (rather than all the time since the epoch)
   END_RUN=$(date +%s)
   local elapsed_sec=0
   if [ "$START_RUN" -gt 0 ]; then
      elapsed_sec=$((END_RUN-START_RUN))
   fi
   ELAPSED="$((elapsed_sec/60)) min. $((elapsed_sec%60)) sec. elapsed"

   # LINK_COUNT is not set until the links file has been downloaded and its lines counted
   LINK_COUNT=${LINK_COUNT:-0}

   # Do some math on results of session
   LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
   TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
   LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
   LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_RD+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
   LINK_PROBLEMS_TOTAL=$((NG_LINKS+RD_LINKS+EI_LINKS+IW_LINKS))
   LINK_PROBLEMS_NG=$((NG_LINKS-SKIP_EXPECT_NG))
   LINK_PROBLEMS_RD=$((RD_LINKS-SKIP_EXPECT_RD))
   LINK_PROBLEMS_EI=$((EI_LINKS-SKIP_EXPECT_EI))
   LINK_PROBLEMS_IW=$((IW_LINKS-SKIP_EXPECT_IW))
   LINK_PROBLEMS_NET=$((LINK_PROBLEMS_NG+LINK_PROBLEMS_RD+LINK_PROBLEMS_EI+LINK_PROBLEMS_IW))

   ## SUMMARY OUTPUT ##
   valPrint ct "Summary ($ELAPSED):"
   valPrint r "\b1 Summary \b0 ($ELAPSED)"
   valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
   valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT) (there $(pluralCheckWas $FILE_LINKS) $FILE_LINKS file $(pluralCheckNoun link $FILE_LINKS) and $PAGE_LINKS page $(pluralCheckNoun link $PAGE_LINKS))."

   # Print processed link totals
   if [ "$LINKS_PROCESSED" -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
   if [ "$LINK_ERRORS" -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
   if [ "$SKIP_ARCHIVE_ORG" -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVE_ORG Archive.org $(pluralCheckNoun link $SKIP_ARCHIVE_ORG) $(pluralCheckWas $SKIP_ARCHIVE_ORG) not checked"; fi
   if [ "$LINK_PROBLEMS_TOTAL" -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_TOTAL processed $(pluralCheckNoun link $LINK_PROBLEMS_TOTAL) had $(pluralCheckAn $LINK_PROBLEMS_TOTAL)$(pluralCheckNoun issue $LINK_PROBLEMS_TOTAL)"; fi
   if [ "$LINKS_EXCEPTED" -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED link $(pluralCheckNoun issue $LINKS_EXCEPTED) from report)"; valPrint h "&nbsp;&nbsp;(excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
   if [ "$OK_LINKS" -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi
   if [ "$TRIVIAL_RDS" -gt 0 ]; then valPrint ctr " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h "&nbsp;&nbsp;(counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi

   # Print errored link totals
   if [ "$LINK_ERRORS" -gt 0 ]; then valPrint ctrh "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"; fi
   if [ "$SKIP_UNK_NS" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
   if [ "$SKIP_JS_PAGE" -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
   if [ "$SKIP_BAD_URL" -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
   if [ "$SKIP_NON_ASCII" -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
   if [ "$SKIP_UNK_SUFFIX" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
   if [ "$SKIP_UNK_CODE" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi

   # Print excepted link totals. IW links are the ones that could be interwiki links (EI links are
   # the ones that could be intrawiki), matching the breakdown printed further down.
   if [ "$LINKS_EXCEPTED" -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi
   if [ "$SKIP_EXPECT_NG" -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
   if [ "$SKIP_EXPECT_RD" -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_RD/$RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
   if [ "$SKIP_EXPECT_EI" -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
   if [ "$SKIP_EXPECT_IW" -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential interwiki $(pluralCheckNoun link $IW_LINKS)"; fi

   # Print checked link totals
   if [ "$LINK_PROBLEMS_NET" -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS_NET link $(pluralCheckNoun issue $LINK_PROBLEMS_NET):"; fi
   if [ "$LINK_PROBLEMS_NG" -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_NG NG $(pluralCheckNoun link $LINK_PROBLEMS_NG)"; fi
   if [ "$LINK_PROBLEMS_RD" -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_RD $(pluralCheckNoun redirection $LINK_PROBLEMS_RD)"; fi
   if [ "$LINK_PROBLEMS_EI" -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_EI $(pluralCheckNoun link $LINK_PROBLEMS_EI) that could be intrawiki"; fi
   if [ "$LINK_PROBLEMS_IW" -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_IW $(pluralCheckNoun link $LINK_PROBLEMS_IW) that could be interwiki"; fi

   # Close the log files' markup
   valPrint trh "ValExtLinks says goodbye."
   printRTFfooter
   printHTMfooter

   # Upload report if this was requested
   if [ -n "$UPLOAD_INFO" ] && [ "$FINISHED_LIST" != "no" ]; then
      uploadReport
   fi

   # Really quit now
   valPrint c "ValExtLinks says goodbye."
   exit 0
}
trap wrapupAndExit INT
549
550
### INITIALIZATION ###
# Print opening message to console and log files
valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
printTXTheader
printRTFheader
printHTMheader

## DATA SOURCING ##
valPrint t "Startup:"
valPrint r "\b1 Startup \b0"
valPrint hn "<h3>Startup</h3>"

# Give LINK_COUNT a value now, because wrapupAndExit prints it even if we abort during a download
LINK_COUNT=0

# Attempt to download file at LINKS_URL, then check that it succeeded. '--fail' keeps 'curl' from
# saving a server's error page as if it were the links file.
valPrint cwtrhn "Downloading list of external links from $LINKS_URL..."
LINKS_FILE_NAME="${LINKS_URL##*/}"
LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
if ! curl --silent --fail -o "$LINKS_FILE" "$LINKS_URL" || [ ! -f "$LINKS_FILE" ]; then
   echo -e "\nThe download of $LINKS_URL appears to have failed. Aborting."
   wrapupAndExit
else
   valPrint ctrh " success."
fi

# Attempt to download file at EXCEPT_URL, then check that it succeeded
if [ -n "$EXCEPT_URL" ]; then
   valPrint cwtrhn "Downloading list of reporting exceptions from $EXCEPT_URL..."
   EXCEPT_DATA=$(curl --silent "$EXCEPT_URL")
   if [ -z "$EXCEPT_DATA" ]; then
      echo -e "\nThe download of the exceptions data from '$EXCEPT_URL' appears to have failed. Aborting."
      wrapupAndExit
   else
      valPrint ctrh " success."
   fi

   # Keep only what lies between the "BEGIN LIST" and "END LIST" markers on the page
   EXCEPT_DATA=${EXCEPT_DATA%END LIST*}
   EXCEPT_DATA=${EXCEPT_DATA#*BEGIN LIST}
   EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"

   # Store on disk for debugging purposes
   printf '%s\n' "$EXCEPT_DATA" > "$EXCEPT_FILE"

   # Transfer to array for easy searching later. The unquoted expansion is deliberate: it splits the
   # data into one element per non-empty line (IFS is a newline). Filename expansion is switched off
   # meanwhile so that entries containing '*', '?' or '[' are stored as written.
   set -f
   declare -a EXCEPT_ARRAY=($EXCEPT_DATA)
   set +f
fi

# Redirect the file into 'wc' because passing LINKS_FILE as an argument would print the file name
# after the line count
LINK_COUNT_STRING=$(wc -l < "$LINKS_FILE")

# Number of URLs is number of lines minus one (first line is column header row for the CSV)
LINK_COUNT=$(echo "${LINK_COUNT_STRING}" | tr -d '[:space:]')
LINK_COUNT=$((LINK_COUNT-1))
valPrint ctrh "Found $LINK_COUNT links to process."
valPrint trh ""
[1064]604
## CONFIG OUTPUT ##
# Echo the settings in effect for this session to the console and all three logs
valPrint t "Config:"
valPrint r "\b1 Config \b0"
valPrint hn "<h3>Config</h3>"

# How many links will be looked at, and which range of the list if it was narrowed by arguments
valPrint ctrhn "Links to consider: "
if [ $URL_LIMIT -ne 0 ] && [ $URL_LIMIT -lt $LINK_COUNT ]; then
   LINKS_TO_CONSIDER="$((URL_LIMIT-URL_START+1)), from link $URL_START to $URL_LIMIT"
elif [ $URL_START -ne 1 ]; then
   LINKS_TO_CONSIDER="$((LINK_COUNT-URL_START+1)), from link $URL_START to $LINK_COUNT"
else
   LINKS_TO_CONSIDER="$LINK_COUNT"
fi
valPrint ctrh "$LINKS_TO_CONSIDER"

valPrint ctrh "Site query timeout: $TIMEOUT seconds"

# Features that are on when their setting is 1
valPrint ctrhn "Show OK links: "
if [ $RECORD_OK_LINKS -eq 1 ]; then
   valPrint ctrh "Yes"
else
   valPrint ctrh "No"
fi

valPrint ctrhn "Take screenshots: "
if [ $TAKE_PAGE_SHOT -eq 1 ]; then
   valPrint ctrh "Yes"
else
   valPrint ctrh "No"
fi

valPrint ctrhn "Suggest archive.org snapshots: "
if [ $SUGGEST_SNAPSHOTS -eq 1 ]; then
   valPrint ctrh "Yes"
else
   valPrint ctrh "No"
fi

# The remaining questions are phrased the opposite way around from their settings, so a setting of 1
# is reported as "No"
valPrint ctrhn "Ignore slash-adding redirects: "
if [ $SHOW_SLASH -eq 1 ]; then
   valPrint ctrh "No"
else
   valPrint ctrh "Yes"
fi

valPrint ctrhn "Ignore HTTPS-upgrading redirects: "
if [ $SHOW_HTTPS -eq 1 ]; then
   valPrint ctrh "No"
else
   valPrint ctrh "Yes"
fi

valPrint ctrhn "Ignore youtu.be redirects: "
if [ $SHOW_YT_RD -eq 1 ]; then
   valPrint ctrh "No"
else
   valPrint ctrh "Yes"
fi

valPrint ctrhn "Check archive.org links: "
if [ $SKIP_ARCHIVE_LINKS -eq 1 ]; then
   valPrint ctrh "No"
else
   valPrint ctrh "Yes"
fi

# Point the reader to the summary; only the HTML log can link to it
valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
valPrint trh ""
645
## LEGEND OUTPUT ##
# Explain the abbreviations used in the report. Entries that point to a reference page are written
# three times, because each log format spells a link differently (bare URL in TXT, HYPERLINK field
# in RTF, anchor tag in HTML); the wording shared by the three variants is kept in a variable.
valPrint t "Legend:"
valPrint r "\b1 Legend \b0"
valPrint hn "<h3>Legend</h3>"

LEGEND_GUIDANCE="(For guidance in fixing these links, see"
valPrint t "$LEGEND_GUIDANCE $WIKI_MAIN.)"
valPrint r "$LEGEND_GUIDANCE {\field{\*\fldinst{HYPERLINK \"$WIKI_MAIN\"}}{\fldrslt here}}.)"
valPrint h "$LEGEND_GUIDANCE <a href=\"$WIKI_MAIN\" target=\"_blank\">here</a>.)"

for LEGEND_ENTRY in \
   "OK = URL seems to be working" \
   "NG = URL no longer seems to work" \
   "RD = URL is redirecting to this new URL" \
   "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup" \
   "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup"
do
   valPrint trh "$LEGEND_ENTRY"
done

LEGEND_HTTP_CODE="(xxx) = Unix tool 'curl' obtained this HTTP response status code (see"
valPrint t "$LEGEND_HTTP_CODE here for code reference: $WIKI_HTTP)"
valPrint r "$LEGEND_HTTP_CODE {\field{\*\fldinst{HYPERLINK \"$WIKI_HTTP\"}}{\fldrslt here}} for code reference)"
valPrint h "$LEGEND_HTTP_CODE <a href=\"$WIKI_HTTP\" target=\"_blank\">here</a> for code reference)"

LEGEND_CURL_CODE="(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see"
valPrint t "$LEGEND_CURL_CODE here for code reference: $WIKI_CURL)"
valPrint r "$LEGEND_CURL_CODE {\field{\*\fldinst{HYPERLINK \"$WIKI_CURL\"}}{\fldrslt here}} for code reference)"
valPrint h "$LEGEND_CURL_CODE <a href=\"$WIKI_CURL\" target=\"_blank\">here</a> for code reference)"

for LEGEND_ENTRY in \
   "IA suggests = Last available snapshot returned by the Internet Archive" \
   "Try browsing = The Archive failed to return a snapshot URL, so check for a snapshot manually using this link" \
   ""
do
   valPrint trh "$LEGEND_ENTRY"
done
667
668
669### MAIN LOOP ###
[1120]670valPrint t "Links:"
671valPrint r "\b1 Links \b0"
672valPrint hn "<h3>Links</h3>"
[1118]673START_RUN=$(date +%s)
[1064]674# Process each line of the .csv in LINKS_FILE
675for LINE in `cat "$LINKS_FILE"`; do
# Advance the running line count; it is zeroed again below once the header row has been consumed
(( LINK_NUM += 1 ))

# Until the CSV's column header row has been seen, the current line must be that row; checking its
# exact text also verifies that the format of the file hasn't changed
if (( SKIPPED_HEADER_ROW == 0 )); then
  if [[ $LINE == "namespace,title,target" ]]; then
    SKIPPED_HEADER_ROW=1
    LINK_NUM=0 # the header row is not a link, so reset the link counter
    valPrint hn "<table>"
    continue
  else
    valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
    wrapupAndExit
  fi
fi

# Links numbered lower than URL_START are passed over without being evaluated
if (( LINK_NUM < URL_START )); then
  continue
fi

# A non-zero URL_LIMIT (declared for testing purposes) ends the run once that link number is passed
if (( URL_LIMIT > 0 && LINK_NUM > URL_LIMIT )); then
  FINISHED_LIST="limit"
  wrapupAndExit
fi

# Print progress to screen; from the second link onward, first move the cursor up a line so that
# the new progress message appears in place of the previous one
if (( LINK_NUM > 1 )); then
  printf "\e[1A\n"
fi
valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
707
# The number of the namespace is the element before the first comma on the line
NS_ID=${LINE%%,*}

# Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES (the two
# arrays are read as parallel lists, so a match at index 'a' in one selects index 'a' in the other).
# A namespace of "NULL" can never match, so the search is not attempted for it.
NS_NAME=""
if [ "$NS_ID" != "NULL" ]; then
  a=0
  while [ -n "${NS_IDS[$a]}" ]; do # an empty element means we have run off the end of the array
    # The comparison is numeric; stderr is silenced because an empty or non-numeric first field
    # (a malformed line) makes 'test' complain on every pass, and that case is already reported
    # below as an unknown namespace
    if [ "$NS_ID" -eq "${NS_IDS[$a]}" ] 2>/dev/null; then
      NS_NAME="${NS_NAMES[$a]}"
      break
    fi
    a=$((a + 1))
  done
fi

# Without a namespace name we cannot build the page's address, so log the reason and skip the line
if [ -z "$NS_NAME" ]; then
  if [ "$NS_ID" == "NULL" ]; then
    valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace (and probably the page too) is 'NULL'. Probably the link is no longer in existence on the wiki."
  else
    valPrint trs "Skipping line $LINK_NUM ('$LINE') because I could not find a name for namespace ID $NS_ID."
  fi
  let SKIP_UNK_NS+=1
  continue
fi
732
# The name of the page is everything between the namespace ID and the next comma on the line (commas
# in page names will break this). The prefix being removed is quoted so that it is matched as
# literal text rather than as a glob pattern.
PAGE_NAME=${LINE#"$NS_ID,"}
PAGE_NAME=${PAGE_NAME%%,*}

# We don't want to consider wiki pages ending in .js, as the MW parser cannot reliably isolate URLS
# in JavaScript code, so it returns erroneous links. The suffix is whatever follows the last period
# in the page name (or the whole name if it has no period).
PAGE_NAME_SUFFIX=${PAGE_NAME##*.}
if [ "$PAGE_NAME_SUFFIX" == "js" ]; then
  JS_PAGE_URL=${LINE#"$NS_ID,$PAGE_NAME,"}
  valPrint trs "Skipping URL '$JS_PAGE_URL' on line $LINK_NUM because it was found on JavaScript page '$PAGE_NAME'."
  let SKIP_JS_PAGE+=1
  continue
fi

# Build longer wiki page URLs from namespace and page names. Namespace "Main:" (ID 0) cannot be a
# part of the path; it's an implicit namespace, and naming it explicitly breaks the link.
if [ "$NS_ID" -eq 0 ]; then
  FULL_PAGE_PATH=https://$WIKI_PATH/$PAGE_NAME
  LOCAL_PAGE_PATH=$PAGE_NAME
else
  FULL_PAGE_PATH=https://$WIKI_PATH/$NS_NAME:$PAGE_NAME
  LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
fi
756
# Whatever remains of the line once the namespace ID and page name fields are removed is the URL
# being linked to. Commas inside the URL are therefore harmless, but a comma inside the page name
# will break this.
URL=${LINE#$NS_ID,$PAGE_NAME,}

# Skip the URL if any character from ILLEGAL_CHARS appears anywhere in it
if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
  valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because it contains characters illegal in a URL."
  (( SKIP_BAD_URL += 1 ))
  continue
fi

# When Archive.org links are to be skipped, pass over any URL that mentions the Wayback Machine's
# host name
if [ $SKIP_ARCHIVE_LINKS -eq 1 ]; then
  if [[ $URL == *web.archive.org* ]]; then
    valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have been asked not to check Wayback Machine links."
    (( SKIP_ARCHIVE_ORG += 1 ))
    continue
  fi
fi
774
# Now we need to know if the URL is for a file or a web page. First step is to determine if the
# URL ends in a suffix. Work on a copy of the URL with any query string and fragment removed:
# ".php?foo=bar" loses everything from the '?' onward, "#section_15" everything from the '#' onward.
CLEAN_URL=${URL%%\?*}
CLEAN_URL=${CLEAN_URL%%\#*}

# URLs containing non-ASCII characters are not evaluated (this rule dates from when 'sed' was used
# below and could not handle Unicode); they are logged so that the user can check them
if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
  valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I cannot handle non-ASCII characters."
  let SKIP_NON_ASCII+=1
  continue
fi

# Isolate the characters after the last period and after the last slash. Parameter expansion is
# used instead of 'sed' and 'awk' so that no subprocesses are spawned for each link and so that the
# text is never reinterpreted (e.g. backslash escapes in an 'awk -v' assignment).
POST_DOT=${CLEAN_URL##*.}
POST_SLASH=${CLEAN_URL##*/}
POST_DOT_LENGTH=${#POST_DOT}
POST_SLASH_LENGTH=${#POST_SLASH}

# If the last period comes after the last slash (i.e. less text follows the period than follows
# the slash), then the URL ends in a suffix
if (( POST_DOT_LENGTH < POST_SLASH_LENGTH )); then
  HAS_SUFFIX=1
else
  HAS_SUFFIX=0
fi
804
# Decide whether the URL points to a file (IS_FILE=1) or a web page (IS_FILE=0); -1 means the
# question is still open. A URL without a suffix is a page; otherwise the suffix is compared against
# the known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES.
IS_FILE=-1
if [ $HAS_SUFFIX -eq 0 ]; then
  IS_FILE=0
else
  # Suffix comparisons are made without regard to case
  shopt -s nocasematch

  # Special cases in which the "suffix" is really the end of a web page URL:
  # - it is all numbers, as in "productID.304297400"
  # - it ends in a parenthesis, as in "ms537113(v=vs.85)"
  # - it contains a '%', as in "10.1007%2FBF00329055"
  if [[ $POST_DOT =~ ^-?[0-9]+$ || $POST_DOT =~ ^.*[\(\)]$ || $POST_DOT == *%* ]]; then
    IS_FILE=0
  fi

  # Still undecided? Then look for the suffix among the known file extensions
  if [ $IS_FILE -eq -1 ]; then
    for EXTENSION in "${HTTP_FILES[@]}"; do
      [[ $EXTENSION == $POST_DOT ]] || continue # double brackets respect the nocasematch setting
      IS_FILE=1
      break
    done
  fi

  # Still undecided? Then look for the suffix among the known pages and TLDs; if it is not there
  # either, it is an unknown file suffix or TLD that the user needs to add to one of the two arrays
  if [ $IS_FILE -eq -1 ]; then
    for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
      [[ $PAGE_OR_TLD == $POST_DOT ]] || continue
      IS_FILE=0
      break
    done
  fi

  # Restore case-sensitive matching in Bash
  shopt -u nocasematch
fi

# Name the type of link for the report and count it, or inform the user if the suffix escaped
# identification as either a file, page or TLD
STR_TYPE=""
case $IS_FILE in
  -1)
    valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown URL ending '$POST_DOT'. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
    (( SKIP_UNK_SUFFIX += 1 ))
    continue
    ;;
  1)
    STR_TYPE="file"
    (( FILE_LINKS += 1 ))
    ;;
  0)
    STR_TYPE="page"
    (( PAGE_LINKS += 1 ))
    ;;
esac
870
# Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
# issue with sites that require HTTPS. Only the headers are requested (--head), the body is thrown
# away, and the HTTP status code is all that is printed (--write-out). AGENT must be inside double
# quotes: inside single quotes the shell does not expand it, and the literal text "$AGENT" would be
# sent as the user agent. The URL is quoted so the shell cannot glob characters such as '?' in it.
CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time "$TIMEOUT" --retry 2 --write-out '%{http_code}\n' "$URL")
CURL_ERR=$? # exit status of 'curl' itself, as passed through by the command substitution
CURL_RESULT=$CURL_CODE

# Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
if [ "$CURL_CODE" == "000" ]; then
  CURL_RESULT="$CURL_RESULT-$CURL_ERR"
fi

# Begin to determine our status code for this URL (EI, IW, OK, RD, or NG); "??" means "not yet
# determined"
STATUS="??"
NEW_URL=""
INTERWIKI_INDEX=-1
886
# An "external internal" link is one to our own wiki, which could be replaced by "[[page_name]]".
# URLs that use a special access path beginning with "/w/" are let through, as they probably cannot
# be replaced by "[[ ]]" markup.
if [[ $URL == *$WIKI_PATH* && $URL != *$WIKI_PATH/w/* ]]; then
  STATUS="EI"
  (( EI_LINKS += 1 ))
fi

# Failing that, see whether the URL is on a domain that we have an interwiki prefix for (again
# excepting "/w/" paths), remembering which domain matched
if [ $STATUS == "??" ]; then
  i=0
  while (( i < ${#INTERWIKI_DOMAINS[@]} )); do
    IW_DOMAIN=${INTERWIKI_DOMAINS[$i]}
    if [[ $URL == *$IW_DOMAIN* && $URL != *$IW_DOMAIN/w/* ]]; then
      STATUS="IW"
      (( IW_LINKS += 1 ))
      INTERWIKI_INDEX=$i
      break
    fi
    (( ++i ))
  done
fi

# Failing that too, the link is "OK" if the response code is in our "OK" codes list
if [ $STATUS == "??" ]; then
  for CODE in "${OK_CODES[@]}"; do
    [[ $CODE == $CURL_CODE ]] || continue
    STATUS="OK"
    (( OK_LINKS += 1 ))
    break
  done
fi
917
# If we didn't get a match with the "OK" codes, check it against the "RD" codes
if [ "$STATUS" == "??" ]; then
  for CODE in "${RD_CODES[@]}"; do
    if [[ "$CODE" == "$CURL_CODE" ]]; then
      # Get URL header again in order to retrieve the URL we are being redirected to. AGENT is in
      # double quotes so that its value is sent (single quotes would send the literal text
      # "$AGENT"), and the URL is quoted to protect it from globbing.
      NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time "$TIMEOUT" --write-out '%{redirect_url}\n' "$URL")

      # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
      # those changes out if the user didn't ask for them
      URL_HTTP=${URL/#https:/http:}
      NEW_URL_HTTP=${NEW_URL/#https:/http:}

      # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config, so
      # anything shorter than MIN_URL_LENGTH is replaced with a placeholder for the comparisons
      NEW_URL_LENGTH=${#NEW_URL_HTTP}
      if [ "$NEW_URL_LENGTH" -lt "$MIN_URL_LENGTH" ]; then
        NEW_URL_HTTP="[new URL not retrieved]"
      fi

      # Remove slash at end of new URL, if present, so we can filter out the redirects that
      # merely add an ending slash if the user didn't ask for them
      NEW_URL_NO_SLASH=${NEW_URL_HTTP%/}

      # Detect if this is a youtu.be link simply being expanded by YouTube to the full
      # youtube.com address
      YOUTU_BE=0
      if [[ $URL_HTTP == http://youtu.be* && $NEW_URL_HTTP == http://www.youtube.com* ]]; then
        YOUTU_BE=1
      fi

      # The operands of the string comparisons below are quoted: the placeholder text above is a
      # valid bracket glob, so left unquoted it could be expanded to a file name by the shell

      # If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
      # wants those to be reported)
      if [ "$SHOW_HTTPS" -eq 0 ] && [ "$URL_HTTP" == "$NEW_URL_HTTP" ]; then
        valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show http->https upgrades, and I was redirected to '$NEW_URL'."
        STATUS="OK"
        let OK_LINKS+=1
        let SKIP_HTTPS_UP+=1
      # If the URLs match besides an added ending slash, then the link is OK (unless user wants
      # those to be reported)
      elif [ "$SHOW_SLASH" -eq 0 ] && [ "$URL_HTTP" == "$NEW_URL_NO_SLASH" ]; then
        valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show added trailing slashes, and I was redirected to '$NEW_URL'."
        STATUS="OK"
        let OK_LINKS+=1
        let SKIP_SLASH_ADD+=1
      # If this is merely a youtu.be link being expanded, then the link is OK (unless user wants
      # those to be reported)
      elif [ "$SHOW_YT_RD" -eq 0 ] && [ "$YOUTU_BE" -eq 1 ]; then
        valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'."
        STATUS="OK"
        let OK_LINKS+=1
        let SKIP_YOUTU_BE+=1
      # Otherwise this is a redirect that should be reported
      else
        STATUS="RD"
        let RD_LINKS+=1
      fi
      break
    fi
  done
fi
974
# With no match among the "RD" codes, the last list to try is the "NG" codes
if [ $STATUS == "??" ]; then
  for CODE in "${NG_CODES[@]}"; do
    [[ $CODE == $CURL_CODE ]] || continue
    STATUS="NG"
    (( NG_LINKS += 1 ))
    break
  done

  # A response code that is in none of our lists cannot be classified, so advise the reader and
  # move on to the next link
  if [ $STATUS == "??" ]; then
    valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown response code $CURL_CODE."
    (( SKIP_UNK_CODE += 1 ))
    continue
  fi
fi
992
# Check problem links against exceptions list before proceeding. EXCEPT_URL is the setting that
# names the exceptions page; it is only tested for emptiness here and must not be modified, so the
# URL field of each exception is held in EXCEPT_LINK instead.
FOUND_EXCEPT=0
if [ "$STATUS" != "OK" ] && [ -n "$EXCEPT_URL" ]; then
  # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
  EXPECT_CODE="$CURL_RESULT"
  if [ "$STATUS" == "EI" ] || [ "$STATUS" == "IW" ]; then
    EXPECT_CODE="$STATUS"
  fi

  # Look for link in exceptions list and make sure the listed result code and wiki page also match.
  # Each exception is parsed as "code,URL,page": the code is the text before the first comma, the
  # page is the text after the last comma, and the URL (which may itself contain commas) is
  # everything in between.
  for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
    # Undo any HTML-encoding from the wiki page; for now we just worry about the ampersand, as most
    # other HTML-encoded characters are not found in URLs. The 'g' flag matters because a URL with
    # several query parameters contains several encoded ampersands.
    EXCEPT_LINE=$(printf '%s\n' "${EXCEPT_ARRAY[$i]}" | sed 's/&amp;/\&/g')

    # Match URL
    EXCEPT_LINK="${EXCEPT_LINE#*,}"
    EXCEPT_LINK="${EXCEPT_LINK%,*}"
    if [ "$EXCEPT_LINK" != "$URL" ]; then
      continue
    fi

    # Match containing page's name ("*" stands for any page); only the text up to the first space
    # of the page field is considered
    EXCEPT_PAGE="${EXCEPT_LINE##*,}"
    EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
    if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == "$LOCAL_PAGE_PATH" ]; then
      # Match result code
      EXCEPT_CODE=${EXCEPT_LINE%%,*}
      if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
        valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because its expected result, '$EXPECT_CODE', is in the exceptions list."
        if [ "$STATUS" == "EI" ]; then
          let SKIP_EXPECT_EI+=1
        elif [ "$STATUS" == "IW" ]; then
          let SKIP_EXPECT_IW+=1
        elif [ "$STATUS" == "RD" ]; then
          let SKIP_EXPECT_RD+=1
        else
          let SKIP_EXPECT_NG+=1
        fi
        FOUND_EXCEPT=1
        break
      fi
    fi
  done
fi

# An excepted link is not reported any further
if [ "$FOUND_EXCEPT" -eq 1 ]; then
  continue
fi
[1064]1046
# If appropriate, record this link to the log, with clickable URLs when possible. Links that are
# "OK" are only recorded when RECORD_OK_LINKS is on.
if [ "$STATUS" != "OK" ] || [ "$RECORD_OK_LINKS" -eq 1 ]; then
  # Prepare 'curl' result in parentheses to print after status code, unless this is an "IW" or "EI"
  # link, in which case showing the status code doesn't make sense. Adjust spacing after string to
  # ensure TXT and RTF reports have aligned columns of results.
  CURL_STR_H=" ($CURL_RESULT)"
  CURL_STR_T="$CURL_STR_H"
  CURL_STR_R="$CURL_STR_H "
  if [ "$STATUS" == "IW" ] || [ "$STATUS" == "EI" ]; then
    CURL_STR_H=""
    CURL_STR_T=" "
    CURL_STR_R=" "
  fi

  # Record link and its wiki page in TXT, RTF, and HTML markup
  valPrint t "${STATUS}${CURL_STR_T} $STR_TYPE $URL"
  valPrint t " linked from $FULL_PAGE_PATH"
  valPrint r "${STATUS}${CURL_STR_R}${RTF_TABS}$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
  valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
  valPrint hn "<tr><td style=\"white-space:nowrap\">${STATUS}${CURL_STR_H}</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
  valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"

  # What follows depends on the status; the statuses are mutually exclusive, so at most one of
  # the branches below applies to a given link
  case "$STATUS" in
    OK)
      # Place vertical space here since we won't be printing anything more about this link
      valPrint tr ""
      valPrint hs ""
      ;;
    RD)
      # Record redirect URL if one was given by a 3xx response page
      valPrint ts " Server suggests $NEW_URL"
      valPrint rs " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
      valPrint hs "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
      ;;
    EI)
      # Notify reader if we can use an intrawiki link for this URL; the page is whatever follows
      # the first slash after the "://" of the URL
      INTRA_PAGE=${URL#*://*/}
      valPrint ts " Just use [[$INTRA_PAGE]]"
      valPrint rs " Just use [[$INTRA_PAGE]]"
      valPrint hs "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
      ;;
    IW)
      # Notify reader if we can use an interwiki prefix for this URL; the page is whatever follows
      # the last slash of the URL
      INTER_PAGE=${URL##*/}
      valPrint ts " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
      valPrint rs " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
      valPrint hs "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
      ;;
    NG)
      # Query Internet Archive for latest "OK" snapshot for "NG" page
      if [ "$SUGGEST_SNAPSHOTS" -eq 1 ]; then
        ARCHIVE_QUERY=$(curl --silent --max-time "$TIMEOUT" "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")

        # If a "closest" snapshot was received...
        if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
          # In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
          ARCHIVE_QUERY=$(printf '%s\n' "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')

          # ...isolate "url" property in the response that follows the "closest" tag
          SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
          SNAPSHOT_URL=${SNAPSHOT_URL#*\"url\": \"}     # everything after '"url": "'
          SNAPSHOT_URL=${SNAPSHOT_URL%%\"*}             # everything before '"'

          # Remove the port 80 part that IA often adds to the URL, as it's superfluous. The match is
          # anchored to the end of the host part (a following '/', '?', '#' or the end of the URL)
          # so that a genuine port such as ":8080" or ":8000" is not mangled.
          SNAPSHOT_URL=$(printf '%s\n' "$SNAPSHOT_URL" | sed -E 's,:80([/?#]|$),\1,')

          # Inform the user of the snapshot URL
          valPrint ts " IA suggests $SNAPSHOT_URL"
          valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
          valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
        else # ...otherwise give generic Wayback Machine link for this URL
          valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
          valPrint rs " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
          valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
        fi
      fi
      ;;
  esac
fi
1123
# Screenshots are taken with headless Google Chrome, and only of links that are pages, that came
# back "OK", and only when TAKE_PAGE_SHOT is on
if [ $IS_FILE -eq 0 ] && [ $TAKE_PAGE_SHOT -eq 1 ] && [ $STATUS == "OK" ]; then
  # Derive the image's name from the URL: drop "http(s)://", then turn every remaining colon or
  # slash into an underscore
  SHOT_NAME=$(echo "$URL" | sed -e 's/https*\:\/\///' -e 'y/:\//__/')
  SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"

  if [ -f "$SHOT_FILE" ]; then
    # This page was already encountered and screenshotted, so don't do it again
    valPrint trhs "Skipping screenshot of URL '$URL' because file '$SHOT_FILE' already exists."
  else
    "$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 $URL > /dev/null 2>&1

    # The image is looked for at $WORKING_DIR/$CHROME_SCREENSHOT; if it is there, move it to its
    # final name without overwriting an existing file
    if [ ! -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
      valPrint trhs "Screenshot of URL $URL seems to have failed!"
    else
      mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
    fi
  fi
fi
1142done
# The loop ran through every line of the links file, so flag the list as finished (as opposed to
# "limit", which is set when the run is cut short by URL_LIMIT) and hand over to the wrap-up routine
FINISHED_LIST='yes'
wrapupAndExit
Note: See TracBrowser for help on using the repository browser.