source: Validate External Links/validate_external_links.sh@ 1183

Last change on this file since 1183 was 1183, checked in by iritscen, 19 months ago

ValExtLinks now knows to recommend that a template or image intrawiki link start with a ':' so it's a clickable link.

File size: 59.7 KB
[1064]1#!/bin/bash
2
[1177]3# Validate External Links by Iritscen (iritscen@yahoo.com)
[1141]4#
5# Validates a list of external links in CSV format. The resulting logs are produced in three formats:
6# - TXT (for easy diffing with an earlier log)
7# - RTF (for reading as a local file with clickable links)
[1144]8# - HTML (for reading as a web page)
[1142]9# Call script with "--help" argument for documentation. Also see Read Me.rtf for critical notes.
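# Example invocation (the URL and path below are only illustrative placeholders):
#   ./validate_external_links.sh --links "https://example.com/dump/extlinks.csv" --output "$HOME/ValReports" --suggest-snapshots-ng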
[1141]10#
[1064]11# Recommended rule:
[1118]12# |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|
[1141]13#
14# Table of contents (sections of script in order of appearance, not execution):
15# • Globals
16# • Help Output
17# • Setup
18# • Utility Functions
19# • Summary Output
20# • Initialization
21# • Data Sourcing
22# • Config Output
23# • Legend Output
24# • Main Loop
[1064]25
26# Set separator token to newline
27IFS="
28"
29
30### GLOBALS ###
31# Settings -- these will be changed from their defaults by the arguments passed in to the script
[1175]32LINKS_URL="" # download external link CSV from this location (can use "file://" protocol)
33EXCEPT_URL="" # location of wiki page with a list of exceptions for NG results
[1177]34OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
[1147]35RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES
36SHOW_SLASH=0 # record issue when a slash is added to the end of a URL
37SHOW_HTTPS=0 # record issue when "http" is upgraded to "https"
38SHOW_YT_RD=0 # record redirection for a youtu.be URL expanding to the full URL
39SUGGEST_SNAPSHOTS_NG=0 # query the Internet Archive for a possible snapshot URL for each NG page
40SUGGEST_SNAPSHOTS_OK=0 # query the Internet Archive for an existing snapshot of each OK page
[1158]41CHECK_ARCHIVE_LINKS=0 # check URLs on archive.org and archive.is
[1147]42TAKE_PAGE_SHOT=0 # take a screenshot of each OK page
43TIMEOUT=10 # time to wait for a response when querying a site
44CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature
45URL_START=1 # start at this URL in LINKS_FILE
46URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE
47UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report
[1064]48
49# Fixed strings -- see the occurrences of these variables to learn their purpose
[1178]50AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
[1064]51ARCHIVE_API="http://archive.org/wayback/available"
52ARCHIVE_GENERIC="https://web.archive.org/web/*"
53ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
54CHROME_SCREENSHOT="screenshot.png"
[1136]55EXCEPT_FILE_NAME="exceptions.txt"
[1064]56EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
[1141]57WIKI_CURL="https://wiki.oni2.net/Validate_External_Links/Curl_codes"
58WIKI_HTTP="https://wiki.oni2.net/Validate_External_Links/HTTP_codes"
59WIKI_MAIN="https://wiki.oni2.net/Validate_External_Links"
60WIKI_ME="http://iritscen.oni2.net"
[1064]61THIS_DIR=$(cd $(dirname $0); pwd)
62WORKING_DIR=$(pwd)
63WIKI_PATH="wiki.oni2.net"
64
65# These are parallel arrays of the IDs and names of OniGalore's current namespaces
66declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
67declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")
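# (For instance, namespace ID 10 pairs with "Template" and ID 6 with "File" in the arrays above.)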
68
69# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
[1070]70# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
[1175]71declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png psd py rar tga TRMA txt vbs wav wmv xaf xcf xlsx xml zip)
[1160]72declare -a HTTP_TLDS_AND_PAGES=(abstract action ars asp aspx cfm cgi com css de do full htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x)
[1064]73
[1067]74# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
[1182]75# are NG (no good). Pages that return OK codes will be screenshotted when screenshots are asked for.
76# Remember to update http_codes.txt if you add a new code.
[1127]77declare -a OK_CODES=(200 401 405 406 418 501)
[1067]78declare -a RD_CODES=(301 302 303 307 308)
[1178]79declare -a NG_CODES=(000 400 403 404 410 429 500 502 503 504 520 530)
[1064]80
81# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
82# transcluded text, and if the transclusion fails, then the braces show up in the URL
83ILLEGAL_CHARS="{ }"
84
[1070]85# The shortest URL possible, used for sanity-checking some URLs: http://a.co
86MIN_URL_LENGTH=11
87
[1064]88# These are parallel arrays giving the prefixes that can be used in place of normal external links to
[1157]89# some wikis and other sites; based on https://wiki.oni2.net/Special:Interwiki
[1070]90declare -a INTERWIKI_PREFIXES=(commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies wikiversity wikivoyage wikt wp)
91declare -a INTERWIKI_DOMAINS=(commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org wikiversity.org wikivoyage.org wiktionary.org wikipedia.org)
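# (For instance, "wp" pairs with wikipedia.org, so a link such as https://en.wikipedia.org/wiki/Example
# could be written as the interwiki link [[wp:Example]]; the page name here is only an illustration.)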
[1064]92
93# Variables for keeping track of main loop progress and findings
94LINK_NUM=0
[1070]95EI_LINKS=0
96IW_LINKS=0
[1064]97OK_LINKS=0
[1067]98RD_LINKS=0
[1064]99NG_LINKS=0
[1177]100SKIP_PARSE_FAIL=0
101SKIP_UNK_PROT=0
[1064]102SKIP_UNK_NS=0
103SKIP_JS_PAGE=0
104SKIP_BAD_URL=0
105SKIP_NON_ASCII=0
106SKIP_UNK_SUFFIX=0
107SKIP_UNK_CODE=0
[1070]108SKIP_EXPECT_NG=0
[1142]109SKIP_EXPECT_RD=0
[1070]110SKIP_EXPECT_EI=0
111SKIP_EXPECT_IW=0
[1122]112SKIP_HTTPS_UP=0
113SKIP_SLASH_ADD=0
[1127]114SKIP_YOUTU_BE=0
[1158]115SKIP_ARCHIVES=0
[1064]116FILE_LINKS=0
117PAGE_LINKS=0
118SKIPPED_HEADER_ROW=0
119FINISHED_LIST="no"
[1118]120START_RUN=0
121END_RUN=0
[1064]122
123
[1141]124### HELP OUTPUT ###
[1064]125# A pseudo-man page. Here is the 80-character rule for the page text:
[1178]126# 345678901234567890123456789012345678901234567890123456789012345678901234567890
[1064]127function printHelp()
128{
129 cat << EOF
130
131NAME
132 Validate External Links
133
134SYNOPSIS
135 validate_external_links.sh --help
[1070]136 validate_external_links.sh --links URL --output DIR [--exceptions URL]
[1136]137 [--record-ok-links] [--show-added-slashes] [--show-https-upgrades]
[1144]138 [--show-yt-redirects] [--suggest-snapshots-ng] [--suggest-snapshots-ok]
[1141]139 [--check-archive-links] [--take-screenshots FILE] [--timeout NUM]
140 [--start-url NUM] [--end-url NUM] [--upload FILE]
[1064]141
142DESCRIPTION
143 This script parses a list of external links found in the OniGalore wiki
[1147]144 (which is dumped by the Oni2.net server periodically in a particular
[1064]145 format), validates them using the Unix tool 'curl', and produces a report
[1070]146 of which links were "OK" (responded positively to an HTTP query), which
147 were "RD" (responded with a 3xx redirect code), which could be "IW"
148 (interwiki) links, which are "EI" (external internal) links and could be
149 intrawiki links, and which were "NG" (no good; a negative response to the
[1069]150 query). This report can then be automatically uploaded to the location of
[1064]151 your choice. The script can also suggest Internet Archive snapshots for
[1070]152 "NG" links, and take screenshots of "OK" links for visual verification by
153 the reader that the page in question is the one intended to be displayed.
[1064]154
155 You must pass this script the URL at which the list of links is found
[1070]156 (--links) and the path where the directory of logs should be created
157 (--output). All other arguments are optional.
[1064]158
159OPTIONS
[1075]160 --help Show this page.
161 --links URL (required) URL from which to download the CSV
162 file with external links. Note that this URL can
163 be a local file if you supply a file:// path.
164 --output DIR (required) Unix path to directory in which Val
165 should place its reports.
166 --exceptions URL In order to remove from the report links that
[1136]167 Val flags as problems but which you regard as
168 OK, list those exceptions on a wiki page.
169 See the sample file "exceptions.pdf" for the
170 required format of the page. Note that this URL
171 can point to a local file if you supply a path
172 beginning with "file://".
[1075]173 --record-ok-links Log a link in the report even if its response
174 code is "OK".
[1122]175 --show-added-slashes Report on redirects that simply add a '/' to the
176 end of the URL.
[1127]177 --show-https-upgrades Report on redirects that simply upgrade a
[1122]178 "http://" URL to a "https://" URL.
[1127]179 --show-yt-redirects Report on redirects that expand a youtu.be URL.
[1147]180 --suggest-snapshots-ng Query the Internet Archive for a possible
[1075]181 snapshot URL for each "NG" page.
[1147]182 --suggest-snapshots-ok Query the Internet Archive for a snapshot of each
183 "OK" page just to make sure it's available. Note
184 that this will add a tremendous amount of time to
185 the script execution because there is a rate
186 limit to the Archive API. Note that this option
187 does nothing unless you also use the
188 --record-ok-links argument.
[1144]189 --check-archive-links Check links that are already pointing to a page
[1158]190 on the Internet Archive or archive.is (AKA
191 archive.today). In theory these links should be
192 totally stable and not need validation.
[1075]193 --take-screenshots FILE Call the Google Chrome binary at this path to
194 take screenshots of each "OK" page.
[1141]195 --timeout NUM Wait this many seconds for a site to respond. The
[1142]196 default is 10. Important note: Val will attempt
197 to reach each URL three times, so the time taken
198 to ping an unresponsive site will be three times
199 this setting.
[1075]200 --start-url NUM Start at this link in the links CSV file.
201 --end-url NUM Stop at this link in the links CSV file.
202 --upload FILE Upload report using the credentials and path
203 given in this local text file. See sftp_login.txt
204 for template.
[1064]205
206BUGS
207 The script cannot properly parse any line in the external links file
208 which contains a comma in the name of the wiki page containing a link.
209 Commas in the link itself are not an issue.
210EOF
211}
212
213
214### SETUP ###
215# If first argument is a help request, or if nothing was passed in at all, print help page and quit
216if [ "$#" -eq 0 ] || [ "$1" == "--help" ]; then
217 printHelp | less
218 exit 0
219fi
220
221# Parse arguments as long as there are more arguments to process
222while (( "$#" )); do
223 case "$1" in
[1147]224 --links ) LINKS_URL="$2"; shift 2;;
225 --exceptions ) EXCEPT_URL="$2"; shift 2;;
226 --output ) OUTPUT_DIR="$2"; shift 2;;
227 --record-ok-links ) RECORD_OK_LINKS=1; shift;;
228 --show-added-slashes ) SHOW_SLASH=1; shift;;
229 --show-https-upgrades ) SHOW_HTTPS=1; shift;;
230 --show-yt-redirects ) SHOW_YT_RD=1; shift;;
231 --suggest-snapshots-ng ) SUGGEST_SNAPSHOTS_NG=1; shift;;
232 --suggest-snapshots-ok ) SUGGEST_SNAPSHOTS_OK=1; shift;;
233 --check-archive-links ) CHECK_ARCHIVE_LINKS=1; shift;;
234 --take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
235 --timeout ) TIMEOUT=$2; shift 2;;
236 --start-url ) URL_START=$2; shift 2;;
237 --end-url ) URL_LIMIT=$2; shift 2;;
238 --upload ) UPLOAD_INFO=$2; shift 2;;
[1157]239 * ) echo "Invalid argument '$1' detected. Aborting."; exit 1;;
[1064]240 esac
241done
242
243# If the required arguments were not supplied, print help page and quit
244if [ -z $LINKS_URL ] || [ -z $OUTPUT_DIR ]; then
[1070]245 echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
[1064]246 exit 2
247fi
248
[1070]249# If user wants screenshots, make sure path to Chrome was passed in and is valid
250if [ $TAKE_PAGE_SHOT -eq 1 ]; then
251 if [ ! -f "$CHROME_PATH" ]; then
252 echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
253 exit 3
254 fi
255fi
256
[1064]257# Check that UPLOAD_INFO exists, if this argument was supplied
258if [ ! -z $UPLOAD_INFO ] && [ ! -f "$UPLOAD_INFO" ]; then
259 echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
[1070]260 exit 4
[1064]261fi
262
263# Check that OUTPUT_DIR is a directory
264if [ ! -d "$OUTPUT_DIR" ]; then
265 echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
[1070]266 exit 5
[1064]267fi
268
269# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
270SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
271NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
272OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
273OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
274SHOT_PATH="$OUTPUT_PATH/Screenshots"
275LOG_NAME="ValExtLinks report"
[1144]276LOG_NAME_TXT="$LOG_NAME.txt"
277LOG_NAME_RTF="$LOG_NAME.rtf"
278LOG_NAME_HTM="$LOG_NAME.htm"
279LOG_PATH="$OUTPUT_PATH/$LOG_NAME"
280LOG_PATH_TXT="$LOG_PATH.txt"
281LOG_PATH_RTF="$LOG_PATH.rtf"
282LOG_PATH_HTM="$LOG_PATH.htm"
[1064]283mkdir "$OUTPUT_PATH"
284if [ $TAKE_PAGE_SHOT -eq 1 ]; then
285 mkdir "$SHOT_PATH"
286fi
287
288# Check that 'mkdir' succeeded
289if [ ! -d "$OUTPUT_PATH" ]; then
290 echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_PATH. Aborting."
[1070]291 exit 6
[1064]292fi
293
294# Get date on the file at LINKS_URL and print to log
295LINKS_DATE=$(curl --silent --head $LINKS_URL | grep "Last-Modified")
296if [ -z "$LINKS_DATE" ]; then
297 echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
[1070]298 exit 7
[1064]299fi
300LINKS_DATE=${LINKS_DATE#Last-Modified: }
301
302
303### UTILITY FUNCTIONS ###
304# Writes a plain-text header to TXT log file
305function printTXTheader()
306{
307 valPrint t "Validate External Links report"
308 valPrint t "generated $NICE_TIME"
309 valPrint t "from data of $LINKS_DATE"
[1141]310 valPrint t "script by Iritscen (contact: $WIKI_ME)"
[1064]311 valPrint t ""
312}
313
314# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified
315function printRTFheader()
316{
317 valPrint r "{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
318{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
319{\colortbl;\red255\green255\blue255;}
320{\*\expandedcolortbl;;}
321\margl1440\margr1440\vieww12600\viewh12100\viewkind0
322\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0
323
324\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
325generated $NICE_TIME\\
326from data of $LINKS_DATE\\
[1141]327script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$WIKI_ME\"}}{\fldrslt contact}})
[1064]328\\
329\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
330\cf0 "
331}
332
333# Closes the RTF markup of the RTF log file
334function printRTFfooter()
335{
336 valPrint r "}"
337}
338
339# Writes the HTML header to HTML log file
340function printHTMheader()
341{
342 valPrint h "<html>
343<head>
344<title>Validate External Links report</title>
345</head>
346<body>
347<h2>Validate External Links report</h2>
348<h3>generated $NICE_TIME<br />
349from data of $LINKS_DATE<br />
[1141]350script by Iritscen (<a href=\"$WIKI_ME\" target=\"_blank\">contact</a>)</h3>"
[1064]351}
352
353# Closes the HTML markup of the HTML log file
354function printHTMfooter()
355{
356 valPrint h "</body>
357</html>"
358}
359
360# The central logging function. The first parameter is a string composed of one or more characters that
[1070]361# indicate which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
[1141]362# 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 's' means "Print an
363# extra newline at the end." 'w' means "Don't pass console output through 'fmt'" ("fmt" fits the output
[1119]364# to an 80-column CLI but can break special formatting and the 'n' option).
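# (Illustrative use: valPrint ctrh "Hello" sends "Hello" to the console and to all three log files.)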
[1064]365function valPrint()
366{
367 if [[ "$1" == *c* ]]; then
368 if [[ "$1" == *n* ]]; then
369 echo -n "$2"
370 elif [[ "$1" == *w* ]]; then
371 echo "$2"
[1119]372 elif [[ "$1" == *s* ]]; then
373 echo -e "$2\n"
[1064]374 else
375 echo "$2" | fmt -w 80
376 fi
377 fi
378 if [[ "$1" == *t* ]]; then
379 if [[ "$1" == *n* ]]; then
[1144]380 echo -n "$2" >> "$LOG_PATH_TXT"
[1119]381 elif [[ "$1" == *s* ]]; then
[1144]382 echo -e "$2\n" >> "$LOG_PATH_TXT"
[1064]383 else
[1144]384 echo "$2" >> "$LOG_PATH_TXT"
[1064]385 fi
386 fi
387 if [[ "$1" == *r* ]]; then
388 if [[ "$1" == *n* ]]; then
[1144]389 echo "$2" >> "$LOG_PATH_RTF"
[1119]390 elif [[ "$1" == *s* ]]; then
[1144]391 echo "$2\line\line" >> "$LOG_PATH_RTF"
[1064]392 else
[1144]393 echo "$2\line" >> "$LOG_PATH_RTF"
[1064]394 fi
395 fi
396 if [[ "$1" == *h* ]]; then
[1119]397 if [[ "$1" == *s* ]]; then
[1144]398 echo "$2<tr><td>&nbsp;</td></tr>" >> "$LOG_PATH_HTM"
[1119]399 elif [[ "$1" == *n* ]]; then
[1144]400 echo "$2" >> "$LOG_PATH_HTM"
[1064]401 else
[1144]402 echo "$2<br />" >> "$LOG_PATH_HTM"
[1064]403 fi
404 fi
405}
406
407# Pluralize the string in parameter 1 if the number in parameter 2 is not 1
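# (e.g., "pluralCheckNoun link 2" echoes "links" and "pluralCheckNoun suffix 2" echoes "suffixes")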
408function pluralCheckNoun()
409{
410 if [ $2 -ne 1 ]; then
411 if [[ $1 =~ x$ ]]; then
412 echo $1es
413 else
414 echo $1s
415 fi
416 else
417 echo $1
418 fi
419}
420
[1067]421# Output "is" if parameter 1 is 1, otherwise "are"
422function pluralCheckIs()
423{
424 if [ $1 -ne 1 ]; then
425 echo "are"
426 else
427 echo "is"
428 fi
429}
430
[1064]431# Output "was" if parameter 1 is 1, otherwise "were"
432function pluralCheckWas()
433{
434 if [ $1 -ne 1 ]; then
435 echo "were"
436 else
437 echo "was"
438 fi
439}
440
[1067]441# Output "a " if parameter 1 is 1, otherwise nothing
442function pluralCheckA()
443{
444 if [ $1 -eq 1 ]; then
445 echo "a "
446 fi
447}
448
449# Output "an " if parameter 1 is 1, otherwise nothing
450function pluralCheckAn()
451{
452 if [ $1 -eq 1 ]; then
453 echo "an "
454 fi
455}
456
[1144]457# Upload the reports using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
[1064]458# reports being saved to disk have already been closed.
459function uploadReport()
460{
[1144]461 valPrint c "Uploading reports..."
[1064]462
463 SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
464 SFTP_USER_NAME_MARKER="user:"
465 SFTP_PASSWORD_MARKER="pw:"
466 SFTP_PORT_MARKER="port:"
467 SFTP_PATH_MARKER="path:"
468 SFTP_USER_NAME=$(grep $SFTP_USER_NAME_MARKER $UPLOAD_INFO)
469 SFTP_USER_NAME=${SFTP_USER_NAME#$SFTP_USER_NAME_MARKER}
470 SFTP_PASSWORD=$(grep $SFTP_PASSWORD_MARKER $UPLOAD_INFO)
471 SFTP_PASSWORD=${SFTP_PASSWORD#$SFTP_PASSWORD_MARKER}
472 SFTP_PORT=$(grep $SFTP_PORT_MARKER $UPLOAD_INFO)
473 SFTP_PORT=${SFTP_PORT#$SFTP_PORT_MARKER}
474 SFTP_PATH=$(grep $SFTP_PATH_MARKER $UPLOAD_INFO)
475 SFTP_PATH=${SFTP_PATH#$SFTP_PATH_MARKER}
476
[1144]477 for SUFFIX in htm rtf txt; do
478 expect "$SCRIPT_PATH" "$LOG_PATH.$SUFFIX" $SFTP_USER_NAME $SFTP_PASSWORD $SFTP_PORT $SFTP_PATH "$LOG_NAME.$SUFFIX"
[1064]479
[1144]480 if [ "$?" -ne 0 ]; then
481 valPrint c "Error $? occurred when attempting to upload $LOG_NAME.$SUFFIX!"
482 else
483 valPrint c "Report in `echo $SUFFIX | tr [:lower:] [:upper:]` format was uploaded."
484 fi
485 done
[1064]486}
487
488# Prints session summary when script is done
489function wrapupAndExit()
490{
491 # Get off progress line on console, drop down a line from last link in log, and close HTML table
492 valPrint ctr ""
493 valPrint h "</table><br />"
494
495 # If we didn't finish processing the last URL, then the iterator is one too high
496 if [ $FINISHED_LIST != "yes" ]; then
497 let LINK_NUM-=1
498 if [ $FINISHED_LIST == "no" ]; then
499 valPrint ctrh "The session was canceled by the user."
500 fi
501 fi
502
[1118]503 # Generate string with elapsed time
504 END_RUN=$(date +%s)
505 ELAPSED=$(echo $(($END_RUN - $START_RUN)) | awk '{printf "%d min. %d sec. elapsed", int($1/60), int($1%60)}')
506
[1122]507 # Do some math on results of session
[1064]508 LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
[1142]509 TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
[1177]510 LINK_ERRORS=$((SKIP_PARSE_FAIL+SKIP_UNK_PROT+SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
[1142]511 LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_RD+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
512 LINK_PROBLEMS_TOTAL=$((NG_LINKS+RD_LINKS+EI_LINKS+IW_LINKS))
513 LINK_PROBLEMS_NG=$((NG_LINKS-SKIP_EXPECT_NG))
514 LINK_PROBLEMS_RD=$((RD_LINKS-SKIP_EXPECT_RD))
515 LINK_PROBLEMS_EI=$((EI_LINKS-SKIP_EXPECT_EI))
516 LINK_PROBLEMS_IW=$((IW_LINKS-SKIP_EXPECT_IW))
517 LINK_PROBLEMS_NET=$((LINK_PROBLEMS_NG+LINK_PROBLEMS_RD+LINK_PROBLEMS_EI+LINK_PROBLEMS_IW))
[1122]518
[1144]519 # Print something in the Links section if no link issues were printed
520 if [ $LINK_PROBLEMS_NET -eq 0 ]; then
521 valPrint h "<i>No link problems to report! See the <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for a list of links with issues that were not reported.</i>"
522 fi
523 if [ $LINK_PROBLEMS_TOTAL -eq 0 ]; then
524 valPrint t "No link problems to report!"
525 valPrint r "\i1 No link problems to report! \i0"
526 fi
527
[1141]528 ## SUMMARY OUTPUT ##
[1118]529 valPrint ct "Summary ($ELAPSED):"
530 valPrint r "\b1 Summary \b0 ($ELAPSED)"
531 valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
[1123]532 valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT) (there $(pluralCheckWas $FILE_LINKS) $FILE_LINKS file $(pluralCheckNoun link $FILE_LINKS) and $PAGE_LINKS page $(pluralCheckNoun link $PAGE_LINKS))."
[1122]533
534 # Print processed link totals
535 if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
536 if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
[1178]537 if [ $SKIP_ARCHIVES -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVES archive.org/archive.is $(pluralCheckNoun link $SKIP_ARCHIVES) $(pluralCheckWas $SKIP_ARCHIVES) not checked"; fi
[1142]538 if [ $LINK_PROBLEMS_TOTAL -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_TOTAL processed $(pluralCheckNoun link $LINK_PROBLEMS_TOTAL) had $(pluralCheckAn $LINK_PROBLEMS_TOTAL)$(pluralCheckNoun issue $LINK_PROBLEMS_TOTAL)"; fi
539 if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED link $(pluralCheckNoun issue $LINKS_EXCEPTED) from report)"; valPrint h "&nbsp;&nbsp;(excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
[1123]540 if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi
541 if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h "&nbsp;&nbsp;(counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi
[1122]542
543 # Print errored link totals
[1144]544 if [ $LINK_ERRORS -gt 0 ]; then
545 valPrint c "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"
546 valPrint h "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
547 valPrint rt "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS):"
548 fi
[1177]549 if [ $SKIP_PARSE_FAIL -gt 0 ]; then valPrint ctrh "- $SKIP_PARSE_FAIL line-parsing $(pluralCheckNoun failure $SKIP_PARSE_FAIL)"; fi
550 if [ $SKIP_UNK_PROT -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_PROT unknown $(pluralCheckNoun protocol $SKIP_UNK_PROT)"; fi
[1122]551 if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
[1070]552 if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
[1064]553 if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
554 if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
555 if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
556 if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
[1122]557
[1142]558 # Print excepted link totals
[1144]559 if [ $LINKS_EXCEPTED -gt 0 ]; then
560 valPrint c "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"
561 valPrint h "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
562 valPrint rt "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted:"
563 fi
[1142]564 if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
565 if [ $SKIP_EXPECT_RD -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_RD/$RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
566 if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
567 if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi
568
[1175]569 # Perform exceptions audit
570 EXCEPTION_ISSUES=0
571 valPrint ctrh "Exceptions list audit:"
572 for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
573 EXCEPT_LINE="${EXCEPT_ARRAY[$i]}"
574 EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/\&amp;/\&/g') # copied from exception-matching code
575
576 if [ ${EXCEPT_FOUND[$i]} -eq 0 ]; then
577 EXCEPT_URL="${EXCEPT_LINE#*,}"
578 EXCEPT_URL="${EXCEPT_URL%,*}"
579 EXCEPT_PAGE="${EXCEPT_LINE##*,}"
580 EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
581 if [ "$EXCEPT_PAGE" == "*" ]; then
582 valPrint tr "- The link '$EXCEPT_URL' did not return an error or is not on any page."
583 else
584 valPrint tr "- The link '$EXCEPT_URL' did not return an error or is not on page '$EXCEPT_PAGE'."
585 fi
586 let EXCEPTION_ISSUES+=1
587 elif [ ${EXCEPT_USED[$i]} -eq 0 ]; then
588 EXCEPT_URL="${EXCEPT_LINE#*,}"
589 EXCEPT_URL="${EXCEPT_URL%,*}"
590 EXCEPT_CODE=${EXCEPT_LINE%%,*}
591 valPrint tr "- The link '$EXCEPT_URL' did not return error code $EXCEPT_CODE."
592 let EXCEPTION_ISSUES+=1
593 fi
594 done
595 if [ $EXCEPTION_ISSUES -eq 0 ]; then
596 valPrint ctrh "- No issues found."
597 else
598 valPrint c "$EXCEPTION_ISSUES exception list $(pluralCheckNoun issue $EXCEPTION_ISSUES) detected (see RTF or TXT report for details)."
599 valPrint h "$EXCEPTION_ISSUES exception list $(pluralCheckNoun issue $EXCEPTION_ISSUES) detected (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for details)."
600 fi
601
[1122]602 # Print checked link totals
[1142]603 if [ $LINK_PROBLEMS_NET -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS_NET link $(pluralCheckNoun issue $LINK_PROBLEMS_NET):"; fi
604 if [ $LINK_PROBLEMS_NG -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_NG NG $(pluralCheckNoun link $LINK_PROBLEMS_NG)"; fi
605 if [ $LINK_PROBLEMS_RD -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_RD $(pluralCheckNoun redirection $LINK_PROBLEMS_RD)"; fi
606 if [ $LINK_PROBLEMS_EI -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_EI $(pluralCheckNoun link $LINK_PROBLEMS_EI) that could be intrawiki"; fi
607 if [ $LINK_PROBLEMS_IW -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_IW $(pluralCheckNoun link $LINK_PROBLEMS_IW) that could be interwiki"; fi
[1122]608
609 # Close the log files' markup
[1070]610 valPrint trh "ValExtLinks says goodbye."
[1064]611 printRTFfooter
612 printHTMfooter
613
614 # Upload report if this was requested
615 if [ ! -z $UPLOAD_INFO ] && [ $FINISHED_LIST != "no" ]; then
616 uploadReport
617 fi
618
619 # Really quit now
620 valPrint c "ValExtLinks says goodbye."
621 exit 0
622}
623trap wrapupAndExit INT
624
625
626### INITIALIZATION ###
627# Print opening message to console and log files
628valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
629printTXTheader
630printRTFheader
631printHTMheader
632
[1141]633## DATA SOURCING ##
634valPrint t "Startup:"
635valPrint r "\b1 Startup \b0"
636valPrint hn "<h3>Startup</h3>"
637
[1064]638# Attempt to download file at LINKS_URL, then check that it succeeded
[1141]639valPrint cwtrhn "Downloading list of external links from $LINKS_URL..."
[1064]640LINKS_FILE_NAME=$(echo "$LINKS_URL" | sed 's/.*\///')
641LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
642curl --silent -o "$LINKS_FILE" $LINKS_URL
643if [ ! -f "$LINKS_FILE" ]; then
[1141]644 echo -e "\nThe download of $LINKS_URL appears to have failed. Aborting."
[1064]645 wrapupAndExit
[1141]646else
647 valPrint ctrh " success."
[1064]648fi
649
650# Attempt to download file at EXCEPT_URL, then check that it succeeded
651if [ ! -z $EXCEPT_URL ]; then
[1141]652 valPrint cwtrhn "Downloading list of reporting exceptions from $EXCEPT_URL..."
[1136]653 EXCEPT_DATA=$(curl --silent $EXCEPT_URL)
654 if [ -z "$EXCEPT_DATA" ]; then
[1141]655 echo -e "\nThe download of the exceptions data from '$EXCEPT_URL' appears to have failed. Aborting."
[1064]656 wrapupAndExit
[1141]657 else
658 valPrint ctrh " success."
[1064]659 fi
[1136]660 EXCEPT_DATA=${EXCEPT_DATA%END LIST*}
661 EXCEPT_DATA=${EXCEPT_DATA#*BEGIN LIST}
662 EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
663
664 # Store on disk for debugging purposes
665 echo "$EXCEPT_DATA" > "$EXCEPT_FILE"
666
667 # Transfer to array for easy searching later
668 declare -a EXCEPT_ARRAY=($(echo "$EXCEPT_DATA"))
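# (Each line of the exceptions list is expected to look like "code,URL,page" -- illustratively,
# "404,http://example.com/dead_page.html,Oni" -- where the code is the curl/HTTP result or "EI"/"IW",
# the URL may contain a '*' wildcard, and the page may be '*' to mean "any page". This matches the
# parsing done in the main loop and in the exceptions audit.)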
[1175]669
670 # Create parallel arrays for marking which exceptions get used later
671 declare -a EXCEPT_USED=()
672 declare -a EXCEPT_FOUND=()
673 for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
674 EXCEPT_USED+=(0)
675 EXCEPT_FOUND+=(0)
676 done
[1064]677fi
678
679# Pipe 'cat' to 'wc' because passing LINKS_FILE to 'wc' would print the file name after the line count
680LINK_COUNT_STRING=$(cat "$LINKS_FILE" | wc -l)
681
682# Number of URLs is number of lines minus one (first line is column header row for the CSV)
683LINK_COUNT=$(echo "${LINK_COUNT_STRING}" | tr -d '[:space:]')
684let LINK_COUNT-=1
[1141]685valPrint ctrh "Found $LINK_COUNT links to process."
686valPrint trh ""
[1064]687
[1141]688## CONFIG OUTPUT ##
689valPrint t "Config:"
690valPrint r "\b1 Config \b0"
691valPrint hn "<h3>Config</h3>"
692
693valPrint ctrhn "Links to consider: "
[1064]694if [ $URL_LIMIT -ne 0 ] && [ $URL_LIMIT -lt $LINK_COUNT ]; then
[1141]695 valPrint ctrh "$((URL_LIMIT-URL_START+1)), from link $URL_START to $URL_LIMIT"
[1064]696elif [ $URL_START -ne 1 ]; then
[1141]697 valPrint ctrh "$((LINK_COUNT-URL_START+1)), from link $URL_START to $LINK_COUNT"
[1064]698else
[1141]699 valPrint ctrh "$LINK_COUNT"
[1064]700fi
701
[1141]702valPrint ctrh "Site query timeout: $TIMEOUT seconds"
703
704valPrint ctrhn "Show OK links: "
705if [ $RECORD_OK_LINKS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
706
707valPrint ctrhn "Take screenshots: "
708if [ $TAKE_PAGE_SHOT -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
709
[1147]710valPrint ctrhn "Suggest archive.org snapshots for NG pages: "
711if [ $SUGGEST_SNAPSHOTS_NG -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
[1141]712
[1147]713valPrint ctrhn "Suggest archive.org snapshots for OK pages: "
714if [ $SUGGEST_SNAPSHOTS_OK -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
715
[1141]716valPrint ctrhn "Ignore slash-adding redirects: "
717if [ $SHOW_SLASH -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
718
719valPrint ctrhn "Ignore HTTPS-upgrading redirects: "
720if [ $SHOW_HTTPS -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
721
722valPrint ctrhn "Ignore youtu.be redirects: "
723if [ $SHOW_YT_RD -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
724
[1158]725valPrint ctrhn "Check archive.org and archive.is links: "
[1144]726if [ $CHECK_ARCHIVE_LINKS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
[1141]727
[1064]728valPrint tr "A summary of my findings will be found at the bottom of the report."
729valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
730valPrint trh ""
731
[1141]732## LEGEND OUTPUT ##
[1064]733valPrint t "Legend:"
734valPrint r "\b1 Legend \b0"
735valPrint hn "<h3>Legend</h3>"
[1175]736valPrint t "(For guidance in fixing these links, see $WIKI_MAIN. The exceptions list is at $EXCEPT_URL.)"
737valPrint r "(For guidance in fixing these links, see {\field{\*\fldinst{HYPERLINK \"$WIKI_MAIN\"}}{\fldrslt here}}. The exceptions list is {\field{\*\fldinst{HYPERLINK \"$EXCEPT_URL\"}}{\fldrslt here}}.)"
738valPrint h "(For guidance in fixing these links, see <a href=\"$WIKI_MAIN\" target=\"_blank\">here</a>. The exceptions list is <a href=\"$EXCEPT_URL\" target=\"_blank\">here</a>.)"
[1141]739valPrint trh "OK = URL seems to be working"
740valPrint trh "NG = URL no longer seems to work"
741valPrint trh "RD = URL is redirecting to this new URL"
742valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup"
743valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup"
744valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $WIKI_HTTP)"
745valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$WIKI_HTTP\"}}{\fldrslt here}} for code reference)"
746valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$WIKI_HTTP\" target=\"_blank\">here</a> for code reference)"
747valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $WIKI_CURL)"
748valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$WIKI_CURL\"}}{\fldrslt here}} for code reference)"
749valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$WIKI_CURL\" target=\"_blank\">here</a> for code reference)"
750valPrint trh "IA suggests = Last available snapshot returned by the Internet Archive"
751valPrint trh "Try browsing = The Archive failed to return a snapshot URL, so check for a snapshot manually using this link"
[1064]752valPrint trh ""
753
754
755### MAIN LOOP ###
[1120]756valPrint t "Links:"
757valPrint r "\b1 Links \b0"
758valPrint hn "<h3>Links</h3>"
[1118]759START_RUN=$(date +%s)
[1064]760# Process each line of the .csv in LINKS_FILE
761for LINE in `cat "$LINKS_FILE"`; do
[1147]762 START_LINK=$(date +%s)
[1064]763 let LINK_NUM+=1
764
765 # First line is the column header row for the CSV, so let's verify that the format hasn't changed
766 if [ $SKIPPED_HEADER_ROW -eq 0 ]; then
767 if [ $LINE == "namespace,title,target" ]; then
768 SKIPPED_HEADER_ROW=1
[1148]769 LINK_NUM=0 # this line is not a link, so reset the link counter
[1064]770 valPrint hn "<table>"
771 continue
772 else
773 valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
774 wrapupAndExit
775 fi
776 fi
777
778 # Skip this link if we are not at URL_START yet
779 if [ $LINK_NUM -lt $URL_START ]; then
780 continue
781 fi
[1183]782
[1064]783 # Stop if we are at the limit declared for testing purposes
784 if [ $URL_LIMIT -gt 0 ] && [ $LINK_NUM -gt $URL_LIMIT ]; then
785 FINISHED_LIST="limit"
786 wrapupAndExit
787 fi
[1177]788
789 # Parse line into namespace ID number, containing wiki page, and external link URL
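# (illustrative line: "0,Some_Page,http://example.com/info.html" yields NS_ID "0", PAGE_NAME
# "Some_Page", and URL "http://example.com/info.html")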
790 NS_ID=${LINE%%,*}
791 PAGE_NAME=${LINE#$NS_ID,}
792 PAGE_NAME=${PAGE_NAME%%,*} # a comma in the page name will break this
793 URL=${LINE#$NS_ID,$PAGE_NAME,} # commas can be in this
794 if [ -z "$NS_ID" ] || [ -z "$PAGE_NAME" ] || [ -z "$URL" ]; then
795 valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace, wiki page or link URL could not be read."
796 let SKIP_PARSE_FAIL+=1
797 continue
798 fi
799
800 # Skip any link that isn't "http://" or "https://"
801 if [[ ! $URL =~ ^https?:// ]]; then
802 valPrint trs "Skipping line $LINK_NUM ('$LINE') because the protocol isn't 'http://' or 'https://'."
803 let SKIP_UNK_PROT+=1
804 continue
805 fi
[1064]806
807 # Print progress to screen
808 if [ $LINK_NUM -gt 1 ]; then
809 printf "\e[1A\n" # erase previous progress message so that new one appears in its place
810 fi
811 valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
812
813 # Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
814 NS_NAME=""
815 a=0
[1069]816 while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
[1118]817 if [ $NS_ID == "NULL" ]; then
818 break
819 elif [ $NS_ID -eq ${NS_IDS[$a]} ]; then
[1064]820 NS_NAME="${NS_NAMES[$a]}"
821 break
822 fi
823 let a+=1
824 done
[1118]825 if [ "$NS_NAME" == "" ]; then
826 if [ $NS_ID == "NULL" ]; then
[1123]827 valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace (and probably the page too) is 'NULL'. Probably the link is no longer in existence on the wiki."
[1118]828 else
[1123]829 valPrint trs "Skipping line $LINK_NUM ('$LINE') because I could not find a name for namespace ID $NS_ID."
[1118]830 fi
[1064]831 let SKIP_UNK_NS+=1
[1148]832 let PAGE_LINKS+=1
[1064]833 continue
834 fi
835
[1070]836 # Build longer wiki page URLs from namespace and page names
[1122]837 FULL_PAGE_PATH=https://$WIKI_PATH/$NS_NAME:$PAGE_NAME
[1070]838 LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
839 # Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it
840 # explicitly breaks the link
841 if [ $NS_ID -eq 0 ]; then
[1122]842 FULL_PAGE_PATH=https://$WIKI_PATH/$PAGE_NAME
[1070]843 LOCAL_PAGE_PATH=$PAGE_NAME
844 fi
845
[1149]846 # We don't want to consider wiki pages ending in .js, as the MW parser cannot reliably isolate URLs
847 # in JavaScript code, so it returns erroneous links
848 PAGE_NAME_SUFFIX=$(echo $PAGE_NAME | sed 's/.*\.//')
849 if [ $PAGE_NAME_SUFFIX == "js" ]; then
850 valPrint trs "Skipping URL '${LINE#$NS_ID,$PAGE_NAME,}' on line $LINK_NUM because it was found on JavaScript page '$LOCAL_PAGE_PATH'."
851 let SKIP_JS_PAGE+=1
852 let PAGE_LINKS+=1
853 continue
854 fi
855
[1064]856 # Scan for illegal characters
857 if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
[1149]858 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because it contains characters illegal in a URL."
[1064]859 let SKIP_BAD_URL+=1
[1148]860 let PAGE_LINKS+=1
[1064]861 continue
862 fi
863
[1158]864 # If we're skipping archive links, see if this is one
865 if [ $CHECK_ARCHIVE_LINKS -eq 0 ] && [[ ( $URL == *web.archive.org* || $URL == *archive.is* ) ]]; then
866 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to check archive links."
867 let SKIP_ARCHIVES+=1
[1148]868 let PAGE_LINKS+=1
[1135]869 continue
870 fi
871
[1064]872 # Now we need to know if the URL is for a file or a web page. First step is to determine if the
873 # URL ends in a suffix
874 HAS_SUFFIX=0
875
876 # If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
[1070]877 CLEAN_URL=${URL%%\?*}
[1064]878
879 # If the URL ends in something like "#section_15", strip everything from the '#' onward
[1070]880 CLEAN_URL=${CLEAN_URL%%\#*}
[1064]881
[1175]882 # 'sed' cannot handle Unicode in my Bash shell, so skip non-ASCII URLs and make the reader check them
[1070]883 if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
[1149]884 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I cannot handle non-ASCII characters."
[1064]885 let SKIP_NON_ASCII+=1
[1148]886 let PAGE_LINKS+=1
[1064]887 continue
888 fi
889
890 # Isolate the characters after the last period and after the last slash
[1070]891 POST_DOT=$(echo "$CLEAN_URL" | sed 's/.*\.//')
892 POST_SLASH=$(echo "$CLEAN_URL" | sed 's/.*\///')
[1064]893
894 # If the last period comes after the last slash, then the URL ends in a suffix
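# (illustrative: for a URL ending in ".../docs/manual.pdf", POST_DOT is "pdf" and POST_SLASH is
# "manual.pdf", so POST_DOT is the shorter string and the URL is judged to have a suffix)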
895 POST_DOT_LENGTH=$(echo | awk -v input=$POST_DOT '{print length(input)}')
896 POST_SLASH_LENGTH=$(echo | awk -v input=$POST_SLASH '{print length(input)}')
897 if [ $POST_DOT_LENGTH -lt $POST_SLASH_LENGTH ]; then
898 HAS_SUFFIX=1
899 else
900 HAS_SUFFIX=0
901 fi
902
903 # Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
904 # known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES
905 IS_FILE=-1
906 if [ $HAS_SUFFIX -eq 0 ]; then
907 IS_FILE=0
908 else
909 # Turn off case sensitivity while we compare suffixes
910 shopt -s nocasematch
911
[1127]912 # Special case: URLs ending in something like "productID.304297400" are pages, not files, so if
[1064]913 # the URL's suffix is all numbers, we are looking at the end of a web page URL
914 if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
915 IS_FILE=0
916 fi
[1127]917
918 # Special case: URLs ending in a parens, e.g. "ms537113(v=vs.85)", are pages
919 if [[ $POST_DOT =~ ^.*[\(\)]$ ]]; then
920 IS_FILE=0
921 fi
922
923 # Special case: URLs containing a '%', e.g. "10.1007%2FBF00329055", are pages
924 if [[ $POST_DOT == *%* ]]; then
925 IS_FILE=0
926 fi
[1064]927
928 # If we did not identify this URL as a web page above, we need to compare the suffix against known
929 # file extensions
930 if [ $IS_FILE -eq -1 ]; then
931 for EXTENSION in "${HTTP_FILES[@]}"; do
932 if [[ $EXTENSION == $POST_DOT ]]; then # double brackets respect the nocasematch setting
933 IS_FILE=1
934 break
935 fi
936 done
937 fi
938
939 # If we did not identify this URL as a file above, we need to compare the suffix against known
940 # pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
941 # needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
942 if [ $IS_FILE -eq -1 ]; then
943 for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
944 if [[ $PAGE_OR_TLD == $POST_DOT ]]; then
945 IS_FILE=0
946 break
947 fi
948 done
949 fi
950
951 # Turn case sensitivity back on in Bash
952 shopt -u nocasematch
953 fi
954
[1175]955 # If this suffix escaped identification as either a file, page or TLD, inform the reader
[1064]956 STR_TYPE=""
957 if [ $IS_FILE -eq -1 ]; then
[1160]958 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I encountered the unknown URL suffix '$POST_DOT'. Please add this suffix to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
[1064]959 let SKIP_UNK_SUFFIX+=1
960 continue
961 elif [ $IS_FILE -eq 1 ]; then
962 STR_TYPE="file"
963 let FILE_LINKS+=1
[1148]964 else
[1064]965 STR_TYPE="page"
966 let PAGE_LINKS+=1
967 fi
968
969 # Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
970 # issue with sites that require HTTPS
[1158]971 CURL_CODE=$(curl -o /dev/null --silent --insecure --compressed --head --user-agent "$AGENT" --max-time $TIMEOUT --retry 2 --write-out '%{http_code}\n' $URL)
[1064]972 CURL_ERR=$(echo $?)
973 CURL_RESULT=$CURL_CODE
974
975 # Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
976 if [ $CURL_CODE == "000" ]; then
977 CURL_RESULT="$CURL_RESULT-$CURL_ERR"
978 fi
979
[1070]980 # Begin to determine our status code for this URL (EI, IW, OK, RD, or NG)
[1064]981 STATUS="??"
[1067]982 NEW_URL=""
[1064]983 INTERWIKI_INDEX=-1
984
[1070]985 # First make sure that this isn't an "external internal" link to our own wiki that can be replaced
986 # by "[[page_name]]". If it uses a special access URL beginning with "/w/", let it pass, as it
987 # probably cannot be replaced by "[[ ]]" markup
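# (illustrative: https://wiki.oni2.net/Some_Page would be flagged EI, while a URL containing
# wiki.oni2.net/w/, such as an index.php query, is left alone)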
988 if [[ $URL == *$WIKI_PATH* ]] && [[ $URL != *$WIKI_PATH/w/* ]]; then
989 STATUS="EI"
990 let EI_LINKS+=1
991 fi
992
[1144]993 # If it's not, check if this is a link to a domain that we have an interwiki prefix for (also make
994 # sure that it's not an archive.org link to a page from an interwiki domain)
995 if [ $STATUS == "??" ] && [[ $URL != *web.archive.org* ]]; then
[1070]996 for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
997 if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]] && [[ $URL != *${INTERWIKI_DOMAINS[$i]}/w/* ]]; then
998 STATUS="IW"
999 let IW_LINKS+=1
1000 INTERWIKI_INDEX=$i
1001 break
1002 fi
1003 done
1004 fi
1005
[1069]1006 # If we didn't match an interwiki domain, see if the status code is in our "OK" codes list
1007 if [ $STATUS == "??" ]; then
1008 for CODE in "${OK_CODES[@]}"; do
1009 if [[ $CODE == $CURL_CODE ]]; then
1010 STATUS="OK"
1011 let OK_LINKS+=1
[1148]1012
1013 # If this is a YouTube link, we have to look at the actual page source to know if the video
[1157]1014 # is good or not; override the link's info if it's actually NG
[1148]1015 if [[ $URL == *www.youtube.com* ]]; then
[1182]1016 PAGE_TEXT=$(curl --silent --insecure --user-agent "$AGENT" --max-time $TIMEOUT $URL)
1017 CURL_ERR=$(echo $?)
1018 if [ "$CURL_ERR" != "0" ]; then
[1148]1019 STATUS="NG"
[1182]1020 CURL_RESULT="000-$CURL_ERR"
[1148]1021 let OK_LINKS-=1
1022 let NG_LINKS+=1
[1182]1023 elif [[ "$PAGE_TEXT" =~ "simpleText\":\"Video unavailable" ]]; then
1024 STATUS="NG"
1025 CURL_CODE="404"
1026 CURL_RESULT=$CURL_CODE
1027 let OK_LINKS-=1
1028 let NG_LINKS+=1
[1148]1029 fi
1030 fi
[1182]1031
1032 # If this is a OneDrive link, we have to look at the actual page source to know if the file
1033 # is really still at this URL; override the link's info if it's actually NG or RD
1034 if [[ $URL == *skydrive.live.com* ]]; then
1035 PAGE_TEXT=$(curl --silent --insecure --user-agent "$AGENT" --max-time $TIMEOUT $URL)
1036 CURL_ERR=$(echo $?)
1037 if [ "$CURL_ERR" != "0" ]; then
1038 STATUS="NG"
1039 CURL_RESULT="000-$CURL_ERR"
1040 let OK_LINKS-=1
1041 let NG_LINKS+=1
1042 elif [[ "$PAGE_TEXT" =~ "<h1>Sorry, something went wrong" ]]; then
1043 STATUS="NG"
1044 CURL_CODE="404"
1045 CURL_RESULT=$CURL_CODE
1046 let OK_LINKS-=1
1047 let NG_LINKS+=1
1048 elif [[ "$PAGE_TEXT" =~ "<h2>Object moved to" ]]; then
1049 STATUS="??" # have to send the code through the next block to treat the redirect properly
1050 CURL_CODE="301"
1051 CURL_RESULT=$CURL_CODE
1052 let OK_LINKS-=1
1053 fi
1054 fi
1055
[1069]1056 break
1057 fi
1058 done
1059 fi
1060
[1067]1061 # If we didn't get a match with the "OK" codes, check it against the "RD" codes
[1064]1062 if [ $STATUS == "??" ]; then
[1067]1063 for CODE in "${RD_CODES[@]}"; do
1064 if [[ $CODE == $CURL_CODE ]]; then
[1182]1065 # Get URL header again in order to retrieve the URL we are being redirected to, but if this
1066 # is a OneDrive link, we already have the new URL in $PAGE_TEXT
1067 if [[ $URL == *skydrive.live.com* ]]; then
1068 NEW_URL=${PAGE_TEXT##*href=\"}
1069 NEW_URL=${NEW_URL%\">here*}
1070 else
1071 NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time $TIMEOUT --write-out '%{redirect_url}\n' $URL)
1072 fi
[1067]1073
[1122]1074 # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
1075 # those changes out if the user didn't ask for them
1076 URL_HTTP=$(echo $URL | sed -E 's/^https:/http:/')
1077 NEW_URL_HTTP=$(echo $NEW_URL | sed -E 's/^https:/http:/')
[1070]1078
1079 # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
[1122]1080 NEW_URL_LENGTH=$(echo | awk -v input=$NEW_URL_HTTP '{print length(input)}')
[1070]1081 if [ $NEW_URL_LENGTH -lt $MIN_URL_LENGTH ]; then
[1122]1082 NEW_URL_HTTP="[new URL not retrieved]"
[1070]1083 fi
1084
[1122]1085 # Remove slash at end of new URL, if present, so we can filter out the redirects that
1086 # merely add an ending slash if the user didn't ask for them
1087 NEW_URL_NO_SLASH=$(echo $NEW_URL_HTTP | sed -E 's:/$::')
1088
[1127]1089 # Detect if this is a youtu.be link simply being expanded by YouTube to the full
1090 # youtube.com address
1091 YOUTU_BE=0
1092 if [[ $URL_HTTP == http://youtu.be* ]] && [[ $NEW_URL_HTTP == http://www.youtube.com* ]]; then
1093 YOUTU_BE=1
1094 fi
1095
[1122]1096 # If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
1097 # wants those to be reported)
1098 if [ $SHOW_HTTPS -eq 0 ] && [ $URL_HTTP == $NEW_URL_HTTP ]; then
[1149]1099 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show http->https upgrades, and I was redirected to '$NEW_URL'."
[1069]1100 STATUS="OK"
1101 let OK_LINKS+=1
[1122]1102 let SKIP_HTTPS_UP+=1
1103 # If the URLs match besides an added ending slash, then the link is OK (unless user wants
1104 # those to be reported)
1105 elif [ $SHOW_SLASH -eq 0 ] && [ $URL_HTTP == $NEW_URL_NO_SLASH ]; then
[1149]1106 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show added trailing slashes, and I was redirected to '$NEW_URL'."
[1122]1107 STATUS="OK"
1108 let OK_LINKS+=1
1109 let SKIP_SLASH_ADD+=1
[1148]1110 elif [ $YOUTU_BE -eq 1 ]; then
1111 # We have to look at the actual page source to know if a YouTube video is good or not
1112 PAGE_TEXT=$(curl --silent --insecure --user-agent "$AGENT" --max-time $TIMEOUT $NEW_URL | grep "\"simpleText\":\"Video unavailable\"")
1113 if [ ! -z "$PAGE_TEXT" ]; then
1114 STATUS="NG"
1115 let NG_LINKS+=1
1116 else
1117 if [ $SHOW_YT_RD -eq 0 ]; then
[1149]1118 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'."
[1148]1119 STATUS="OK"
1120 let OK_LINKS+=1
1121 let SKIP_YOUTU_BE+=1
1122 else
1123 STATUS="RD"
1124 let RD_LINKS+=1
1125 fi
1126 fi
[1069]1127 else
1128 STATUS="RD"
1129 let RD_LINKS+=1
1130 fi
[1067]1131 break
1132 fi
1133 done
1134 fi
1135
1136 # If we didn't get a match with the "RD" codes, check it against the "NG" codes
1137 if [ $STATUS == "??" ]; then
[1064]1138 for CODE in "${NG_CODES[@]}"; do
1139 if [[ $CODE == $CURL_CODE ]]; then
1140 STATUS="NG"
1141 let NG_LINKS+=1
1142 break
1143 fi
1144 done
1145 fi
1146
1147 # If we didn't match a known status code, advise the reader
1148 if [ $STATUS == "??" ]; then
[1149]1149 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I encountered the unknown response code $CURL_CODE."
[1064]1150 let SKIP_UNK_CODE+=1
1151 continue
1152 fi
1153
[1136]1154 # Check problem links against exceptions list before proceeding
1155 FOUND_EXCEPT=0
[1175]1156 if [ $STATUS != "OK" ] && [ ! -z "$EXCEPT_URL" ]; then
[1070]1157 # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
1158 EXPECT_CODE="$CURL_RESULT"
1159 if [ $STATUS == "EI" ]; then
1160 EXPECT_CODE="EI"
1161 elif [ $STATUS == "IW" ]; then
1162 EXPECT_CODE="IW"
1163 fi
1164
[1136]1165 # Look for link in exceptions list and make sure the listed result code and wiki page also match
1166 for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
1167 {
1168 EXCEPT_LINE="${EXCEPT_ARRAY[$i]}"
[1182]1169
[1142]1170 # Undo any HTML-encoding from the wiki page; for now we just worry about the ampersand, as most
1171 # other HTML-encoded characters are not found in URLs
[1146]1172 EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/\&amp;/\&/g')
[1142]1173
[1175]1174 # Check for URL match
[1136]1175 EXCEPT_URL="${EXCEPT_LINE#*,}"
1176 EXCEPT_URL="${EXCEPT_URL%,*}"
[1178]1177 if [[ "$EXCEPT_URL" =~ \* ]]; then # if this exception URL contains the '*' wildcard, use pattern-matching with it
[1182]1178 if [[ ! "$URL" == $EXCEPT_URL ]]; then
[1178]1179 continue
1180 fi
1181 else
1182 if [ "$EXCEPT_URL" != "$URL" ]; then # otherwise just use a straight string comparison
1183 continue
1184 fi
[1070]1185 fi
[1136]1186
[1175]1187 # Check for page name match
[1136]1188 EXCEPT_PAGE="${EXCEPT_LINE##*,}"
1189 EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
[1175]1190 if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == "$LOCAL_PAGE_PATH" ]; then
1191 let EXCEPT_FOUND[$i]+=1
1192 valPrint trs "Found exception '$URL' on page '$LOCAL_PAGE_PATH'."
1193
1194 # Check for result code match
[1136]1195 EXCEPT_CODE=${EXCEPT_LINE%%,*}
1196 if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
[1175]1197 FOUND_EXCEPT=1
1198 let EXCEPT_USED[$i]+=1
[1149]1199 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because its expected result, '$EXPECT_CODE', is in the exceptions list."
[1175]1200
[1136]1201 if [ $STATUS == "EI" ]; then
1202 let SKIP_EXPECT_EI+=1
1203 elif [ $STATUS == "IW" ]; then
1204 let SKIP_EXPECT_IW+=1
[1142]1205 elif [ $STATUS == "RD" ]; then
1206 let SKIP_EXPECT_RD+=1
[1136]1207 else
1208 let SKIP_EXPECT_NG+=1
1209 fi
[1175]1210
[1136]1211 break
1212 fi
1213 fi
1214 } done
[1064]1215 fi
[1136]1216 if [ $FOUND_EXCEPT -eq 1 ]; then
1217 continue
1218 fi
[1064]1219
1220 # If appropriate, record this link to the log, with clickable URLs when possible
1221 if [ $STATUS != "OK" ] || [ $RECORD_OK_LINKS -eq 1 ]; then
[1125]1222 # Prepare 'curl' result in parentheses to print after status code, unless this is an "IW" or "EI"
1223 # link, in which case showing the status code doesn't make sense. Adjust spacing after string to
1224 # ensure TXT and RTF reports have aligned columns of results.
1225 CURL_STR_H=" ($CURL_RESULT)"
1226 CURL_STR_T="$CURL_STR_H"
1227 CURL_STR_R="$CURL_STR_H "
[1070]1228 if [ $STATUS == "IW" ] || [ $STATUS == "EI" ]; then
[1125]1229 CURL_STR_H=""
1230 CURL_STR_T=" "
1231 CURL_STR_R=" "
[1064]1232 fi
1233
1234 # Record link and its wiki page in TXT, RTF, and HTML markup
[1125]1235 valPrint t "${STATUS}${CURL_STR_T} $STR_TYPE $URL"
[1064]1236 valPrint t " linked from $FULL_PAGE_PATH"
[1125]1237 valPrint r "${STATUS}${CURL_STR_R}${RTF_TABS}$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
[1064]1238 valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
[1125]1239 valPrint hn "<tr><td style=\"white-space:nowrap\">${STATUS}${CURL_STR_H}</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
[1064]1240 valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"
1241
[1123]1242 # Place vertical space here since we won't be printing anything more about this link
[1147]1243 if [ $STATUS == "OK" ] && [ $SUGGEST_SNAPSHOTS_OK -eq 0 ]; then valPrint tr ""; valPrint hs ""; fi
[1123]1244
[1067]1245 # Record redirect URL if one was given by a 3xx response page
1246 if [ $STATUS == "RD" ]; then
[1119]1247 valPrint ts " Server suggests $NEW_URL"
1248 valPrint rs " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
1249 valPrint hs "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
[1067]1250 fi
1251
[1070]1252 # Notify reader if we can use an intrawiki link for this URL
1253 if [ $STATUS == "EI" ]; then
[1075]1254 INTRA_PAGE=${URL#*://*/}
[1183]1255 # If INTRA_PAGE starts with Category:, File: or Image:, prefix it with a ':' to make it a wikilink
1256 if [[ $INTRA_PAGE == Category:* ]] || [[ $INTRA_PAGE == File:* ]] || [[ $INTRA_PAGE == Image:* ]]; then
1257 INTRA_PAGE=:${INTRA_PAGE}
1258 fi
[1119]1259 valPrint ts " Just use [[$INTRA_PAGE]]"
1260 valPrint rs " Just use [[$INTRA_PAGE]]"
1261 valPrint hs "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
[1070]1262 fi
1263
[1064]1264 # Notify reader if we can use an interwiki prefix for this URL
1265 if [ $STATUS == "IW" ]; then
[1075]1266 INTER_PAGE=$(echo "$URL" | sed 's/.*\///')
[1119]1267 valPrint ts " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
1268 valPrint rs " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
1269 valPrint hs "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
[1064]1270 fi
1271
1272 # Query Internet Archive for latest "OK" snapshot for "NG" page
[1147]1273 if [[ ( $STATUS == "NG" && $SUGGEST_SNAPSHOTS_NG -eq 1 ) || ( $STATUS == "OK" && $SUGGEST_SNAPSHOTS_OK -eq 1 ) ]]; then
1274
1275 # We need to watch out for the rate limit or we'll get locked out; look at how much time has
1276 # elapsed and then wait the remainder between that and how long of a wait we think is needed
1277 # to avoid the dreaded "Too Many Requests" response. 5 seconds is just a guess.
1278 CUR_TIME=$(date +%s)
1279 WAIT_REMAINDER=$((5 - $CUR_TIME + $START_LINK))
1280 if [ $WAIT_REMAINDER -gt 0 ]; then
1281 valPrint t "Waiting $WAIT_REMAINDER second(s) to conform to Archive.org rate limit."
1282 sleep $WAIT_REMAINDER
1283 fi
1284
1285 # Issue query to the API
[1141]1286 ARCHIVE_QUERY=$(curl --silent --max-time $TIMEOUT "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")
[1064]1287
[1175]1288 # Notify reader if we hit the rate limit and just keep going
[1147]1289 if [[ "$ARCHIVE_QUERY" == "*Too Many Requests*" ]]; then
1290 valPrint t " IA has rate-limited us!"
1291 valPrint r " IA has rate-limited us!"
1292 valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td>(hit the API rate limit!)</td></tr>"
[1175]1293 # If a "closest" snapshot was received, inform reader
[1147]1294 elif [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
[1118]1295 # In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
1296 ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')
1297
1298 # ...isolate "url" property in the response that follows the "closest" tag
1299 SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
[1119]1300 SNAPSHOT_URL=${SNAPSHOT_URL#*\"url\": \"} # everything after '"url": "'
[1118]1301 SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'
1302
[1124]1303 # Remove the port 80 part that IA often adds to the URL, as it's superfluous
1304 SNAPSHOT_URL=$(echo $SNAPSHOT_URL | sed 's/:80//')
1305
[1175]1306 # Inform the reader of the snapshot URL
[1119]1307 valPrint ts " IA suggests $SNAPSHOT_URL"
1308 valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
1309 valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
[1147]1310 else # Otherwise give a generic Wayback Machine link for this URL, which might work
[1119]1311 valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
1312 valPrint rs " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
1313 valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
[1064]1314 fi
1315 fi
1316 fi
1317
1318 # If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
1319 if [ $IS_FILE -eq 0 ] && [ $TAKE_PAGE_SHOT -eq 1 ] && [ $STATUS == "OK" ]; then
1320 # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
1321 SHOT_NAME=$(echo "$URL" | sed 's/https*\:\/\///' | sed 'y/:\//__/')
1322 SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"
1323
1324 # Don't take screenshot if we already encountered this page and screenshotted it
1325 if [ ! -f "$SHOT_FILE" ]; then
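# Headless Chrome writes its output as "screenshot.png" (CHROME_SCREENSHOT) in the working
# directory, so the shot is moved into SHOT_PATH under a sanitized name afterward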
[1070]1326 "$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 $URL > /dev/null 2>&1
[1064]1327 if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
1328 mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
1329 else
[1119]1330 valPrint trhs "Screenshot of URL $URL seems to have failed!"
[1064]1331 fi
1332 else
[1123]1333 valPrint trhs "Skipping screenshot of URL '$URL' because file '$SHOT_FILE' already exists."
[1064]1334 fi
1335 fi
1336done
1337FINISHED_LIST="yes"
1338wrapupAndExit