source: Validate External Links/validate_external_links.sh

Last change on this file was 1193, checked in by iritscen, 3 weeks ago

ValExtLinks now recognizes private YT videos.

File size: 61.2 KB
Line 
1#!/bin/bash
2
3# Validate External Links by Iritscen (iritscen@yahoo.com)
4#
5# Validates a list of external links in CSV format. The resulting logs are produced in three formats:
6# - TXT (for easy diffing with an earlier log)
7# - RTF (for reading as a local file with clickable links)
8# - HTML (for reading as a web page)
9# Call script with "--help" argument for documentation. Also see Read Me.rtf for critical notes.
10#
11# Recommended rule:
12# |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|
13#
14# Table of contents (sections of script in order of appearance, not execution):
15# • Globals
16# • Help Output
17# • Setup
18# • Utility Functions
19# • Summary Output
20# • Initialization
21# • Data Sourcing
22# • Config Output
23# • Legend Output
24# • Main Loop
25
# Make newline the only field separator, so unquoted expansions are split per line and values that
# contain spaces stay in one piece
IFS=$'\n'
29
### GLOBALS ###
# Settings -- these will be changed from their defaults by the arguments passed in to the script
LINKS_URL=""           # download external link CSV from this location (can use "file://" protocol)
EXCEPT_URL=""          # location of wiki page with a list of exceptions for NG results
OUTPUT_DIR=""          # place reports and all other output in a folder inside this existing folder
RECORD_OK_LINKS=0      # record response code to the log even when it's a value in OK_CODES
ONLY_200_OK=0          # only treat code 200 as "OK" and not any other code in OK_CODES
SHOW_SLASH=0           # record issue when a slash is added to the end of a URL
SHOW_HTTPS=0           # record issue when "http" is upgraded to "https"
SHOW_YT_RD=0           # record redirection for a youtu.be URL expanding to the full URL
SUGGEST_SNAPSHOTS_NG=0 # query the Internet Archive for a possible snapshot URL for each NG page
SUGGEST_SNAPSHOTS_OK=0 # query the Internet Archive for an existing snapshot of each OK page
CHECK_ARCHIVE_LINKS=0  # check URLs on archive.org and archive.is
TAKE_PAGE_SHOT=0       # take a screenshot of each OK page
TIMEOUT=10             # time to wait for a response when querying a site
CHROME_PATH=""         # path to a copy of Google Chrome that has the command-line screenshot feature
URL_START=1            # start at this URL in LINKS_FILE
URL_LIMIT=0            # if non-zero, stop at this URL in LINKS_FILE
UPLOAD_INFO=""         # path to a file on your hard drive with the login info needed to upload a report

# Fixed strings -- see the occurrences of these variables to learn their purpose
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
ARCHIVE_API="http://archive.org/wayback/available"
ARCHIVE_GENERIC="https://web.archive.org/web/*"
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
CHROME_SCREENSHOT="screenshot.png"
EXCEPT_FILE_NAME="exceptions.txt"
EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
WIKI_CURL="https://wiki.oni2.net/Validate_External_Links/Curl_codes"
WIKI_HTTP="https://wiki.oni2.net/Validate_External_Links/HTTP_codes"
WIKI_MAIN="https://wiki.oni2.net/Validate_External_Links"
WIKI_ME="http://iritscen.oni2.net"
# Absolute path of the folder holding this script. The expansions are quoted so glob characters in
# the path are taken literally, 'pwd' only runs if 'cd' worked (otherwise THIS_DIR would silently
# become the caller's directory), and cd's own output (printed when CDPATH is in play) is discarded.
THIS_DIR=$(cd "$(dirname "$0")" > /dev/null && pwd)
WORKING_DIR=$(pwd)
WIKI_PATH="wiki.oni2.net"
65
# Namespace IDs of OniGalore and, at the same index in the second array, the matching namespace names
NS_IDS=(
  -2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  100 101 102 103 104 105 108 109 110 111
)
NS_NAMES=(
  "Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File"
  "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk"
  "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk"
  "XML" "XML_talk"
)

# URL suffixes that denote a file versus suffixes that denote a page (or a top-level domain). The
# distinction decides whether the script tries to take a screenshot of the URL (when screenshots
# are requested).
HTTP_FILES=(
  3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg
  last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png psd py rar tga tif TRMA txt vbs wav wmv xaf xcf
  xlsx xml zip
)
HTTP_TLDS_AND_PAGES=(
  abstract action ars asp aspx cfm cgi com css de do full htm html htmldem it js jsp net org pgi
  php php3 phtml pl ru shtml stm uk x
)

# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and
# which are NG (no good). Pages that return OK codes will be screenshotted when screenshots are
# asked for. Remember to update http_codes.txt if you add a new code.
OK_CODES=(200 202 401 405 406 418 501)
RD_CODES=(301 302 303 307 308)
NG_CODES=(000 400 403 404 410 429 500 502 503 504 520 530)

# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
# transcluded text, and if the transclusion fails, then the braces show up in the URL
ILLEGAL_CHARS="{ }"

# Length of the shortest URL possible (http://a.co), used for sanity-checking some URLs
MIN_URL_LENGTH=11

# Interwiki prefixes that can stand in for a normal external link and, at the same index in the
# second array, the domain each prefix maps to; based on https://wiki.oni2.net/Special:Interwiki
INTERWIKI_PREFIXES=(
  commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies
  wikiversity wikivoyage wikt wp
)
INTERWIKI_DOMAINS=(
  commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org
  wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org
  wikiversity.org wikivoyage.org wiktionary.org wikipedia.org
)
94
# Counters for main loop progress and findings; every one of them starts at zero
for counter in \
  LINK_NUM EI_LINKS IW_LINKS OK_LINKS RD_LINKS NG_LINKS \
  SKIP_PARSE_FAIL SKIP_UNK_PROT SKIP_UNK_NS SKIP_JS_PAGE SKIP_BAD_URL SKIP_NON_ASCII \
  SKIP_UNK_SUFFIX SKIP_UNK_CODE \
  SKIP_EXPECT_NG SKIP_EXPECT_RD SKIP_EXPECT_EI SKIP_EXPECT_IW \
  SKIP_HTTPS_UP SKIP_SLASH_ADD SKIP_YOUTU_BE SKIP_ARCHIVES \
  FILE_LINKS PAGE_LINKS SKIPPED_HEADER_ROW \
  START_RUN END_RUN; do
  printf -v "$counter" '%d' 0
done
unset counter

# Only "yes" and "no" are tested for in the part of the script visible here
FINISHED_LIST="no"
124
125
### HELP OUTPUT ###
# A pseudo-man page, written to stdout. The SYNOPSIS must list exactly the options accepted by the
# argument parser in the Setup section. Here is the 80-character rule for the page text:
# 345678901234567890123456789012345678901234567890123456789012345678901234567890
function printHelp()
{
  # The delimiter is quoted so that nothing in the page text is ever expanded by the shell
  cat << 'EOF'

NAME
       Validate External Links

SYNOPSIS
       validate_external_links.sh --help
       validate_external_links.sh --links URL --output DIR [--exceptions URL]
          [--record-ok-links] [--only-200-ok] [--show-added-slashes]
          [--show-https-upgrades] [--show-yt-redirects]
          [--suggest-snapshots-ng] [--suggest-snapshots-ok]
          [--check-archive-links] [--take-screenshots FILE] [--timeout NUM]
          [--start-url NUM] [--end-url NUM] [--upload FILE]

DESCRIPTION
       This script parses a list of external links found in the OniGalore wiki
       (which is dumped by the Oni2.net server periodically in a particular
       format), validates them using the Unix tool 'curl', and produces a report
       of which links were "OK" (responded positively to an HTTP query), which
       were "RD" (responded with a 3xx redirect code), which could be "IW"
       (interwiki) links, which are "EI" (external internal) links and could be
       intrawiki links, and which were "NG" (no good; a negative response to the
       query). This report can then be automatically uploaded to the location of
       your choice. The script can also suggest Internet Archive snapshots for
       "NG" links, and take screenshots of "OK" links for visual verification by
       the reader that the page in question is the one intended to be displayed.

       You must pass this script the URL at which the list of links is found
       (--links) and the path where the directory of logs should be outputted
       (--output). All other arguments are optional.

OPTIONS
       --help                  Show this page.
       --links URL             (required) URL from which to download the CSV
                               file with external links. Note that this URL can
                               be a local file if you supply a file:// path.
       --output DIR            (required) Unix path to directory in which Val
                               should place its reports.
       --exceptions URL        In order to remove links from the report which
                               Val finds an issue with but which you regard as
                               OK, list those desired exceptions on a wiki page.
                               See the sample file "exceptions.pdf" for the
                               required format of the page. Note that this URL
                               can point to a local file if you supply a path
                               beginning with "file://".
       --record-ok-links       Log a link in the report even if its response
                               code is "OK".
       --only-200-ok           Only treat response code 200 as "OK". Normally
                               several additional codes are treated as "OK" (see
                               the array OK_CODES in script) because they are
                               typically not an indicator of a bad link.
       --show-added-slashes    Report on redirects that simply add a '/' to the
                               end of the URL.
       --show-https-upgrades   Report on redirects that simply upgrade a
                               "http://" URL to a "https://" URL.
       --show-yt-redirects     Report on redirects that expand a youtu.be URL.
       --suggest-snapshots-ng  Query the Internet Archive for a possible
                               snapshot URL for each "NG" page.
       --suggest-snapshots-ok  Query the Internet Archive for a snapshot of each
                               "OK" page just to make sure it's available. Note
                               that this will add a tremendous amount of time to
                               the script execution because there is a rate
                               limit to the Archive API. Note that this option
                               does nothing unless you also use the
                               --record-ok-links argument.
       --check-archive-links   Check links that are already pointing to a page
                               on the Internet Archive or archive.is (AKA
                               archive.today). In theory these links should be
                               totally stable and not need validation.
       --take-screenshots FILE Call the Google Chrome binary at this path to
                               take screenshots of each "OK" page.
       --timeout NUM           Wait this many seconds for a site to respond. The
                               default is 10. Important note: Val will attempt
                               to reach each URL three times, so the time taken
                               to ping an unresponsive site will be three times
                               this setting.
       --start-url NUM         Start at this link in the links CSV file.
       --end-url NUM           Stop at this link in the links CSV file.
       --upload FILE           Upload report using the credentials and path
                               given in this local text file. See sftp_login.txt
                               for template.

BUGS
       The script cannot properly parse any line in the external links file
       which contains a comma in the name of the wiki page containing a link.
       Commas in the link itself are not an issue.
EOF
}
218
219
### SETUP ###
# With no arguments at all, or with a help request as the first argument, page the help text and
# leave successfully
if (( $# == 0 )) || [[ "$1" == "--help" ]]; then
  printHelp | less
  exit 0
fi
226
# Parse arguments as long as there are more arguments to process
while (( $# )); do
  # An option that takes a value must actually be followed by one. Without this check, "shift 2"
  # on a lone trailing option fails without shifting anything, and the loop would never end.
  case "$1" in
    --links | --exceptions | --output | --take-screenshots | --timeout | --start-url | --end-url | --upload )
      if (( $# < 2 )); then
        echo "Argument '$1' requires a value. Aborting."
        exit 1
      fi;;
  esac

  case "$1" in
    --links )                LINKS_URL="$2"; shift 2;;
    --exceptions )           EXCEPT_URL="$2"; shift 2;;
    --output )               OUTPUT_DIR="$2"; shift 2;;
    --record-ok-links )      RECORD_OK_LINKS=1; shift;;
    --only-200-ok )          ONLY_200_OK=1; shift;;
    --show-added-slashes )   SHOW_SLASH=1; shift;;
    --show-https-upgrades )  SHOW_HTTPS=1; shift;;
    --show-yt-redirects )    SHOW_YT_RD=1; shift;;
    --suggest-snapshots-ng ) SUGGEST_SNAPSHOTS_NG=1; shift;;
    --suggest-snapshots-ok ) SUGGEST_SNAPSHOTS_OK=1; shift;;
    --check-archive-links )  CHECK_ARCHIVE_LINKS=1; shift;;
    --take-screenshots )     TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
    --timeout )              TIMEOUT="$2"; shift 2;;
    --start-url )            URL_START="$2"; shift 2;;
    --end-url )              URL_LIMIT="$2"; shift 2;;
    --upload )               UPLOAD_INFO="$2"; shift 2;;
    * )                      echo "Invalid argument '$1' detected. Aborting."; exit 1;;
  esac
done
249
# If the required arguments were not supplied, point the user to the help page and quit. The tests
# use [[ ]] so that a URL or path containing glob characters is never expanded inside the test.
if [[ -z "$LINKS_URL" || -z "$OUTPUT_DIR" ]]; then
  echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
  exit 2
fi

# If user wants screenshots, make sure path to Chrome was passed in and is valid
if [[ "$TAKE_PAGE_SHOT" -eq 1 && ! -f "$CHROME_PATH" ]]; then
  echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
  exit 3
fi

# Check that UPLOAD_INFO exists, if this argument was supplied
if [[ -n "$UPLOAD_INFO" && ! -f "$UPLOAD_INFO" ]]; then
  echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
  exit 4
fi

# Check that OUTPUT_DIR is a directory
if [[ ! -d "$OUTPUT_DIR" ]]; then
  echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
  exit 5
fi
275
# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
SHOT_PATH="$OUTPUT_PATH/Screenshots"
LOG_NAME="ValExtLinks report"
LOG_NAME_TXT="$LOG_NAME.txt"
LOG_NAME_RTF="$LOG_NAME.rtf"
LOG_NAME_HTM="$LOG_NAME.htm"
LOG_PATH="$OUTPUT_PATH/$LOG_NAME"
LOG_PATH_TXT="$LOG_PATH.txt"
LOG_PATH_RTF="$LOG_PATH.rtf"
LOG_PATH_HTM="$LOG_PATH.htm"
mkdir "$OUTPUT_PATH"

# Check that 'mkdir' succeeded; the message names OUTPUT_DIR, the parent we tried to create it in
if [ ! -d "$OUTPUT_PATH" ]; then
  echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_DIR. Aborting."
  exit 6
fi

# Only now that the session folder is known to exist, add the screenshot folder inside it
if [ "$TAKE_PAGE_SHOT" -eq 1 ]; then
  mkdir "$SHOT_PATH"
fi
300
# Get date on the file at LINKS_URL and print to log. The header name is matched without regard to
# case because header names are case-insensitive (and arrive in lowercase over HTTP/2), and the
# carriage return that ends each header line is removed so it does not leak into the reports.
LINKS_DATE=$(curl --silent --head "$LINKS_URL" | grep -i "^Last-Modified:" | head -n 1 | tr -d '\r')
if [ -z "$LINKS_DATE" ]; then
  echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
  exit 7
fi
# Drop the header name, the colon and the single space that conventionally follows it
LINKS_DATE=${LINKS_DATE#*:}
LINKS_DATE=${LINKS_DATE# }
308
309
310### UTILITY FUNCTIONS ###
# Starts the TXT log with its banner: report title, generation time, date of the source data and
# author contact, followed by an empty line
printTXTheader()
{
  local banner_line
  for banner_line in \
    "Validate External Links report" \
    "generated $NICE_TIME" \
    "from data of $LINKS_DATE" \
    "script by Iritscen (contact: $WIKI_ME)" \
    ""; do
    valPrint t "$banner_line"
  done
}
320
# Starts the RTF log: document preamble, a centered banner (title, generation time, date of the
# source data, author contact link), then a paragraph reset to the default (non-centered) alignment
printRTFheader()
{
  # Both paragraph definitions share the same run of tab stops
  local tab_stops='\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640'
  local preamble="{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww12600\viewh12100\viewkind0
\pard${tab_stops}\li360\pardirnatural\qc\partightenfactor0
"
  local banner="\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
generated $NICE_TIME\\
from data of $LINKS_DATE\\
script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$WIKI_ME\"}}{\fldrslt contact}})
\\
\pard${tab_stops}\li360\pardirnatural\partightenfactor0
\cf0 "

  # The preamble ends with a newline, so joining with one more leaves a blank line before the banner
  valPrint r "$preamble
$banner"
}
339
# Ends the RTF log by writing the brace that closes the group opened in the RTF header
printRTFfooter()
{
  local closing_brace='}'
  valPrint r "$closing_brace"
}
345
# Starts the HTML log: opens the document and body, then writes the banner (title, generation
# time, date of the source data, author contact link)
printHTMheader()
{
  local report_title="Validate External Links report"
  local page="<html>
<head>
<title>$report_title</title>
</head>
<body>
<h2>$report_title</h2>
<h3>generated $NICE_TIME<br />
from data of $LINKS_DATE<br />
script by Iritscen (<a href=\"$WIKI_ME\" target=\"_blank\">contact</a>)</h3>"

  valPrint h "$page"
}
359
# Ends the HTML log by closing the body and the document
printHTMfooter()
{
  local closing_tags="</body>
</html>"
  valPrint h "$closing_tags"
}
366
# The central logging function.
#   $1 - one or more letters choosing the outputs and the line ending:
#          c = console, t = TXT log, r = RTF log, h = HTML log
#          n = no newline at the end of the line
#          s = an extra newline at the end
#          w = console only: don't pass the text through 'fmt' ("fmt" fits the output to an
#              80-column CLI but can break special formatting and the 'n' option)
#   $2 - the text to print
# When several ending letters are given, the first match wins: n, w, s on the console; n, s in the
# TXT and RTF logs; s, n in the HTML log. In the RTF and HTML logs 'n' only leaves out the markup
# line break; the text is still followed by a newline in the file itself.
valPrint()
{
  local targets="$1"
  local message="$2"

  if [[ "$targets" == *c* ]]; then
    case "$targets" in
      *n* ) echo -n "$message";;
      *w* ) echo "$message";;
      *s* ) echo -e "$message\n";;
      * )   echo "$message" | fmt -w 80;;
    esac
  fi

  if [[ "$targets" == *t* ]]; then
    case "$targets" in
      *n* ) echo -n "$message" >> "$LOG_PATH_TXT";;
      *s* ) echo -e "$message\n" >> "$LOG_PATH_TXT";;
      * )   echo "$message" >> "$LOG_PATH_TXT";;
    esac
  fi

  if [[ "$targets" == *r* ]]; then
    case "$targets" in
      *n* ) echo "$message" >> "$LOG_PATH_RTF";;
      *s* ) echo "$message\line\line" >> "$LOG_PATH_RTF";;
      * )   echo "$message\line" >> "$LOG_PATH_RTF";;
    esac
  fi

  if [[ "$targets" == *h* ]]; then
    case "$targets" in
      *s* ) echo "$message<tr><td>&nbsp;</td></tr>" >> "$LOG_PATH_HTM";;
      *n* ) echo "$message" >> "$LOG_PATH_HTM";;
      * )   echo "$message<br />" >> "$LOG_PATH_HTM";;
    esac
  fi
}
413
# Pluralize the string in parameter 1 if the number in parameter 2 is not 1. Nouns ending in "x"
# get "es", all others get "s". The result is written to stdout.
# The noun is quoted everywhere so that glob characters in it are printed as they are instead of
# being expanded against the files in the current directory.
function pluralCheckNoun()
{
  if [ "$2" -ne 1 ]; then
    if [[ "$1" =~ x$ ]]; then
      printf '%s\n' "${1}es"
    else
      printf '%s\n' "${1}s"
    fi
  else
    printf '%s\n' "$1"
  fi
}
427
# Prints the present-tense verb that agrees with the count in parameter 1: "is" for exactly 1,
# "are" for any other number
pluralCheckIs()
{
  local verb="is"
  [ "$1" -ne 1 ] && verb="are"
  echo "$verb"
}
437
# Prints the past-tense verb that agrees with the count in parameter 1: "was" for exactly 1,
# "were" for any other number
pluralCheckWas()
{
  local verb="was"
  [ "$1" -ne 1 ] && verb="were"
  echo "$verb"
}
447
# Prints the article "a " (with its trailing space) when the count in parameter 1 is exactly 1;
# prints nothing for any other count
pluralCheckA()
{
  [ "$1" -eq 1 ] || return 0
  echo "a "
}
455
# Prints the article "an " (with its trailing space) when the count in parameter 1 is exactly 1;
# prints nothing for any other count
pluralCheckAn()
{
  [ "$1" -eq 1 ] || return 0
  echo "an "
}
463
# Upload the reports using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
# reports being saved to disk have already been closed.
# The file at UPLOAD_INFO is read for one line each starting with "user:", "pw:", "port:" and
# "path:"; whatever follows the marker is handed to the 'expect' script next to this script, once
# per report format. Success or failure of each upload is reported on the console.
function uploadReport()
{
  valPrint c "Uploading reports..."

  SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
  SFTP_USER_NAME_MARKER="user:"
  SFTP_PASSWORD_MARKER="pw:"
  SFTP_PORT_MARKER="port:"
  SFTP_PATH_MARKER="path:"

  # Take the first line that begins with each marker. The marker is only ever stripped from the
  # start of the line, so a line that merely contains it further in must not be picked up.
  SFTP_USER_NAME=$(grep -m 1 -- "^$SFTP_USER_NAME_MARKER" "$UPLOAD_INFO")
  SFTP_USER_NAME=${SFTP_USER_NAME#"$SFTP_USER_NAME_MARKER"}
  SFTP_PASSWORD=$(grep -m 1 -- "^$SFTP_PASSWORD_MARKER" "$UPLOAD_INFO")
  SFTP_PASSWORD=${SFTP_PASSWORD#"$SFTP_PASSWORD_MARKER"}
  SFTP_PORT=$(grep -m 1 -- "^$SFTP_PORT_MARKER" "$UPLOAD_INFO")
  SFTP_PORT=${SFTP_PORT#"$SFTP_PORT_MARKER"}
  SFTP_PATH=$(grep -m 1 -- "^$SFTP_PATH_MARKER" "$UPLOAD_INFO")
  SFTP_PATH=${SFTP_PATH#"$SFTP_PATH_MARKER"}

  local upload_status
  for SUFFIX in htm rtf txt; do
    # Every value is quoted: a password may contain glob characters, and an empty field must
    # still occupy its position in the argument list
    expect "$SCRIPT_PATH" "$LOG_PATH.$SUFFIX" "$SFTP_USER_NAME" "$SFTP_PASSWORD" "$SFTP_PORT" "$SFTP_PATH" "$LOG_NAME.$SUFFIX"

    # Save the status right away; by the time the message is built, $? holds the result of the
    # test below rather than that of 'expect'
    upload_status=$?
    if [ "$upload_status" -ne 0 ]; then
      valPrint c "Error $upload_status occurred when attempting to upload $LOG_NAME.$SUFFIX!"
    else
      valPrint c "Report in $(echo "$SUFFIX" | tr '[:lower:]' '[:upper:]') format was uploaded."
    fi
  done
}
494
# Prints session summary when script is done, closes the markup of the log files, uploads the
# reports if that was requested and the session was not canceled, and exits with status 0.
# Also installed as the handler for SIGINT (see the 'trap' after the function), and called by the
# startup code when a download fails, i.e. possibly before any link was processed.
function wrapupAndExit()
{
  # Get off progress line on console, drop down a line from last link in log, and close HTML table
  valPrint ctr ""
  valPrint h "</table><br />"

  # If we didn't finish processing the last URL, then the iterator is one too high
  if [ "$FINISHED_LIST" != "yes" ]; then
    LINK_NUM=$((LINK_NUM - 1))
    if [ "$FINISHED_LIST" == "no" ]; then
      valPrint ctrh "The session was canceled by the user."
    fi
  fi

  # Generate string with elapsed time. START_RUN is initialized to 0 and is presumably set once the
  # run proper begins; if we got here before that, report no elapsed time instead of the number of
  # seconds since the epoch.
  END_RUN=$(date +%s)
  if [ "$START_RUN" -eq 0 ]; then
    START_RUN=$END_RUN
  fi
  ELAPSED=$(echo $(($END_RUN - $START_RUN)) | awk '{printf "%d min. %d sec. elapsed", int($1/60), int($1%60)}')

  # LINK_COUNT is not set yet if the links file could not be downloaded
  LINK_COUNT=${LINK_COUNT:-0}

  # Do some math on results of session
  LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
  if [ $LINKS_PROCESSED -lt 0 ]; then
    # Nothing was processed at all, so the correction of LINK_NUM above overshot
    LINKS_PROCESSED=0
  fi
  TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
  LINK_ERRORS=$((SKIP_PARSE_FAIL+SKIP_UNK_PROT+SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
  LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_RD+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
  LINK_PROBLEMS_TOTAL=$((NG_LINKS+RD_LINKS+EI_LINKS+IW_LINKS))
  LINK_PROBLEMS_NG=$((NG_LINKS-SKIP_EXPECT_NG))
  LINK_PROBLEMS_RD=$((RD_LINKS-SKIP_EXPECT_RD))
  LINK_PROBLEMS_EI=$((EI_LINKS-SKIP_EXPECT_EI))
  LINK_PROBLEMS_IW=$((IW_LINKS-SKIP_EXPECT_IW))
  LINK_PROBLEMS_NET=$((LINK_PROBLEMS_NG+LINK_PROBLEMS_RD+LINK_PROBLEMS_EI+LINK_PROBLEMS_IW))

  # Print something in the Links section if no link issues were printed
  if [ $LINK_PROBLEMS_NET -eq 0 ]; then
    valPrint h "<i>No link problems to report! See the <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for a list of links with issues that were not reported.</i>"
  fi
  if [ $LINK_PROBLEMS_TOTAL -eq 0 ]; then
    valPrint t "No link problems to report!"
    valPrint r "\i1 No link problems to report! \i0"
  fi

  ## SUMMARY OUTPUT ##
  valPrint ct "Summary ($ELAPSED):"
  valPrint r "\b1 Summary \b0 ($ELAPSED)"
  valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
  valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT) (there $(pluralCheckWas $FILE_LINKS) $FILE_LINKS file $(pluralCheckNoun link $FILE_LINKS) and $PAGE_LINKS page $(pluralCheckNoun link $PAGE_LINKS))."

  # Print processed link totals
  if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
  if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
  if [ $SKIP_ARCHIVES -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVES archive.org/archive.is $(pluralCheckNoun link $SKIP_ARCHIVES) $(pluralCheckWas $SKIP_ARCHIVES) not checked"; fi
  if [ $LINK_PROBLEMS_TOTAL -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_TOTAL processed $(pluralCheckNoun link $LINK_PROBLEMS_TOTAL) had $(pluralCheckAn $LINK_PROBLEMS_TOTAL)$(pluralCheckNoun issue $LINK_PROBLEMS_TOTAL)"; fi
  if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED link $(pluralCheckNoun issue $LINKS_EXCEPTED) from report)"; valPrint h "&nbsp;&nbsp;(excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
  if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi
  if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h "&nbsp;&nbsp;(counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi

  # Print errored link totals
  if [ $LINK_ERRORS -gt 0 ]; then
    valPrint c "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"
    valPrint h "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
    valPrint rt "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS):"
  fi
  if [ $SKIP_PARSE_FAIL -gt 0 ]; then valPrint ctrh "- $SKIP_PARSE_FAIL line-parsing $(pluralCheckNoun failure $SKIP_PARSE_FAIL)"; fi
  if [ $SKIP_UNK_PROT -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_PROT unknown $(pluralCheckNoun protocol $SKIP_UNK_PROT)"; fi
  if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
  if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
  if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
  if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
  if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
  if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi

  # Print excepted link totals
  if [ $LINKS_EXCEPTED -gt 0 ]; then
    valPrint c "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"
    valPrint h "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
    valPrint rt "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted:"
  fi
  if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
  if [ $SKIP_EXPECT_RD -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_RD/$RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
  if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
  if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi

  # Perform exceptions audit: report every exception whose entry in EXCEPT_FOUND or EXCEPT_USED is
  # still 0. Each exception line is taken apart as "code,URL,page".
  # NOTE: this overwrites the EXCEPT_URL setting, which is harmless only because we exit below.
  EXCEPTION_ISSUES=0
  valPrint ctrh "Exceptions list audit:"
  for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
    EXCEPT_LINE="${EXCEPT_ARRAY[$i]}"
    EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/\&amp;/\&/g') # copied from exception-matching code

    if [ "${EXCEPT_FOUND[$i]}" -eq 0 ]; then
      EXCEPT_URL="${EXCEPT_LINE#*,}"
      EXCEPT_URL="${EXCEPT_URL%,*}"
      EXCEPT_PAGE="${EXCEPT_LINE##*,}"
      EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
      if [ "$EXCEPT_PAGE" == "*" ]; then
        valPrint tr "- The link '$EXCEPT_URL' did not return an error or is not on any page."
      else
        valPrint tr "- The link '$EXCEPT_URL' did not return an error or is not on page '$EXCEPT_PAGE'."
      fi
      EXCEPTION_ISSUES=$((EXCEPTION_ISSUES + 1))
    elif [ "${EXCEPT_USED[$i]}" -eq 0 ]; then
      EXCEPT_URL="${EXCEPT_LINE#*,}"
      EXCEPT_URL="${EXCEPT_URL%,*}"
      EXCEPT_CODE=${EXCEPT_LINE%%,*}
      valPrint tr "- The link '$EXCEPT_URL' did not return error code $EXCEPT_CODE."
      EXCEPTION_ISSUES=$((EXCEPTION_ISSUES + 1))
    fi
  done
  if [ $EXCEPTION_ISSUES -eq 0 ]; then
    valPrint ctrh "- No issues found."
  else
    valPrint c "$EXCEPTION_ISSUES exception list $(pluralCheckNoun issue $EXCEPTION_ISSUES) detected (see RTF or TXT report for details)."
    valPrint h "$EXCEPTION_ISSUES exception list $(pluralCheckNoun issue $EXCEPTION_ISSUES) detected (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for details)."
  fi

  # Print checked link totals
  if [ $LINK_PROBLEMS_NET -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS_NET link $(pluralCheckNoun issue $LINK_PROBLEMS_NET):"; fi
  if [ $LINK_PROBLEMS_NG -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_NG NG $(pluralCheckNoun link $LINK_PROBLEMS_NG)"; fi
  if [ $LINK_PROBLEMS_RD -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_RD $(pluralCheckNoun redirection $LINK_PROBLEMS_RD)"; fi
  if [ $LINK_PROBLEMS_EI -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_EI $(pluralCheckNoun link $LINK_PROBLEMS_EI) that could be intrawiki"; fi
  if [ $LINK_PROBLEMS_IW -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_IW $(pluralCheckNoun link $LINK_PROBLEMS_IW) that could be interwiki"; fi

  # Close the log files' markup
  valPrint trh "ValExtLinks says goodbye."
  printRTFfooter
  printHTMfooter

  # Upload report if this was requested
  if [ -n "$UPLOAD_INFO" ] && [ "$FINISHED_LIST" != "no" ]; then
    uploadReport
  fi

  # Really quit now
  valPrint c "ValExtLinks says goodbye."
  exit 0
}
trap wrapupAndExit INT
631
632
### INITIALIZATION ###
# Greet the user on the console (in bold, bypassing 'fmt'), then start each of the three log files
# with its header
valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
for header_writer in printTXTheader printRTFheader printHTMheader; do
  "$header_writer"
done
unset header_writer
639
## DATA SOURCING ##
valPrint t "Startup:"
valPrint r "\b1 Startup \b0"
valPrint hn "<h3>Startup</h3>"

# Attempt to download file at LINKS_URL, then check that it succeeded. The file is saved in the
# session folder under the name that follows the last slash of the URL. The URL is quoted so that
# characters such as '?', '*' and '[' in it are never treated as a glob pattern.
valPrint cwtrhn "Downloading list of external links from $LINKS_URL..."
LINKS_FILE_NAME=$(echo "$LINKS_URL" | sed 's/.*\///')
LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
curl --silent -o "$LINKS_FILE" "$LINKS_URL"
if [ ! -f "$LINKS_FILE" ]; then
  echo -e "\nThe download of $LINKS_URL appears to have failed. Aborting."
  wrapupAndExit
else
  valPrint ctrh " success."
fi
656
# Attempt to download file at EXCEPT_URL, then check that it succeeded
if [ -n "$EXCEPT_URL" ]; then
  valPrint cwtrhn "Downloading list of reporting exceptions from $EXCEPT_URL..."
  EXCEPT_DATA=$(curl --silent "$EXCEPT_URL")
  if [ -z "$EXCEPT_DATA" ]; then
    echo -e "\nThe download of the exceptions data from '$EXCEPT_URL' appears to have failed. Aborting."
    wrapupAndExit
  else
    valPrint ctrh " success."
  fi

  # Keep only what lies between the "BEGIN LIST" and "END LIST" markers of the page
  EXCEPT_DATA=${EXCEPT_DATA%END LIST*}
  EXCEPT_DATA=${EXCEPT_DATA#*BEGIN LIST}
  EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"

  # Store on disk for debugging purposes
  echo "$EXCEPT_DATA" > "$EXCEPT_FILE"

  # Transfer to array for easy searching later, one element per non-empty line (IFS is a newline).
  # The expansion has to stay unquoted to be split, so globbing is switched off around it:
  # exception lines can contain '*' and '?' and must reach the array exactly as written.
  set -f
  # shellcheck disable=SC2206 -- splitting on IFS is intended here
  declare -a EXCEPT_ARRAY=($EXCEPT_DATA)
  set +f

  # Create parallel arrays for marking which exceptions get used later
  declare -a EXCEPT_USED=()
  declare -a EXCEPT_FOUND=()
  for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
    EXCEPT_USED+=(0)
    EXCEPT_FOUND+=(0)
  done
fi
685
# Count the lines of the links file. 'wc' reads the file from its standard input, because given
# the file as an argument it would print the file name after the line count.
LINK_COUNT_STRING=$(wc -l < "$LINKS_FILE")

# Remove any whitespace around the number, then leave out the first line of the CSV, which is the
# column header row and not a URL
LINK_COUNT=${LINK_COUNT_STRING//[[:space:]]/}
LINK_COUNT=$((LINK_COUNT - 1))
valPrint ctrh "Found $LINK_COUNT links to process."
valPrint trh ""
694
## CONFIG OUTPUT ##
valPrint t "Config:"
valPrint r "\b1 Config \b0"
valPrint hn "<h3>Config</h3>"

# Prints one setting of the config summary to the console and all three logs: the label in $1,
# followed by the text in $3 when the flag in $2 is 1 and by the text in $4 otherwise
printConfigChoice()
{
  valPrint ctrhn "$1"
  if [ "$2" -eq 1 ]; then
    valPrint ctrh "$3"
  else
    valPrint ctrh "$4"
  fi
}

# Number of links that will be looked at, with the range when it is not simply the whole list
valPrint ctrhn "Links to consider: "
if [ "$URL_LIMIT" -ne 0 ] && [ "$URL_LIMIT" -lt "$LINK_COUNT" ]; then
  LINK_RANGE_DESC="$((URL_LIMIT-URL_START+1)), from link $URL_START to $URL_LIMIT"
elif [ "$URL_START" -ne 1 ]; then
  LINK_RANGE_DESC="$((LINK_COUNT-URL_START+1)), from link $URL_START to $LINK_COUNT"
else
  LINK_RANGE_DESC="$LINK_COUNT"
fi
valPrint ctrh "$LINK_RANGE_DESC"
unset LINK_RANGE_DESC

valPrint ctrh "Site query timeout: $TIMEOUT seconds"

printConfigChoice "Show OK links: " "$RECORD_OK_LINKS" "Yes" "No"
printConfigChoice "Treat these response codes as OK: " "$ONLY_200_OK" "200" "${OK_CODES[*]}"
printConfigChoice "Take screenshots: " "$TAKE_PAGE_SHOT" "Yes" "No"
printConfigChoice "Suggest archive.org snapshots for NG pages: " "$SUGGEST_SNAPSHOTS_NG" "Yes" "No"
printConfigChoice "Suggest archive.org snapshots for OK pages: " "$SUGGEST_SNAPSHOTS_OK" "Yes" "No"

# The next three settings are worded as "Ignore ...", the opposite of their SHOW_* flags
printConfigChoice "Ignore slash-adding redirects: " "$SHOW_SLASH" "No" "Yes"
printConfigChoice "Ignore HTTPS-upgrading redirects: " "$SHOW_HTTPS" "No" "Yes"
printConfigChoice "Ignore youtu.be redirects: " "$SHOW_YT_RD" "No" "Yes"

printConfigChoice "Check archive.org and archive.is links: " "$CHECK_ARCHIVE_LINKS" "Yes" "No"

valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
valPrint trh ""
741
## LEGEND OUTPUT ##
# Section heading, in the markup of each report format
valPrint t "Legend:"
valPrint r "\b1 Legend \b0"
valPrint hn "<h3>Legend</h3>"

# Pointers to the guidance page and the exceptions list; each format gets its own link markup
valPrint t "(For guidance in fixing these links, see $WIKI_MAIN. The exceptions list is at $EXCEPT_URL.)"
valPrint r "(For guidance in fixing these links, see {\field{\*\fldinst{HYPERLINK \"$WIKI_MAIN\"}}{\fldrslt here}}. The exceptions list is {\field{\*\fldinst{HYPERLINK \"$EXCEPT_URL\"}}{\fldrslt here}}.)"
valPrint h "(For guidance in fixing these links, see <a href=\"$WIKI_MAIN\" target=\"_blank\">here</a>. The exceptions list is <a href=\"$EXCEPT_URL\" target=\"_blank\">here</a>.)"

# Meanings of the status codes; these entries have no links, so the same text goes to every format
for LEGEND_ENTRY in \
  "OK = URL seems to be working" \
  "NG = URL no longer seems to work" \
  "RD = URL is redirecting to this new URL" \
  "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup" \
  "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup"; do
  valPrint trh "$LEGEND_ENTRY"
done

# Meanings of the parenthesized 'curl' results, with links to the code references
valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $WIKI_HTTP)"
valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$WIKI_HTTP\"}}{\fldrslt here}} for code reference)"
valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$WIKI_HTTP\" target=\"_blank\">here</a> for code reference)"
valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $WIKI_CURL)"
valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$WIKI_CURL\"}}{\fldrslt here}} for code reference)"
valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$WIKI_CURL\" target=\"_blank\">here</a> for code reference)"

# Meanings of the Internet Archive suggestion lines, followed by a blank line to close the section
for LEGEND_ENTRY in \
  "IA suggests = Last available snapshot returned by the Internet Archive" \
  "Try browsing = The Archive failed to return a snapshot URL, so check for a snapshot manually using this link" \
  ""; do
  valPrint trh "$LEGEND_ENTRY"
done
763
764
765### MAIN LOOP ###
766valPrint t "Links:"
767valPrint r "\b1 Links \b0"
768valPrint hn "<h3>Links</h3>"
769START_RUN=$(date +%s)
770# Process each line of the .csv in LINKS_FILE
771for LINE in `cat "$LINKS_FILE"`; do
# Note when we started on this line of the CSV (read later on when pacing queries to archive.org)
# and advance the link counter
START_LINK=$(date +%s)
LINK_NUM=$((LINK_NUM + 1))

# First line is the column header row for the CSV, so let's verify that the format hasn't changed
if [ "$SKIPPED_HEADER_ROW" -eq 0 ]; then
  # LINE is quoted so that glob characters in the row cannot be expanded into file names, which
  # would hand the test too many arguments
  if [[ "$LINE" == "namespace,title,target" ]]; then
    SKIPPED_HEADER_ROW=1
    LINK_NUM=0 # this line is not a link, so reset the link counter
    valPrint hn "<table>"
    continue
  else
    valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
    wrapupAndExit
  fi
fi

# Skip this link if we are not at URL_START yet
if [ "$LINK_NUM" -lt "$URL_START" ]; then
  continue
fi

# Stop if we are at the limit declared for testing purposes (a limit of 0 means "no limit")
if [ "$URL_LIMIT" -gt 0 ] && [ "$LINK_NUM" -gt "$URL_LIMIT" ]; then
  FINISHED_LIST="limit"
  wrapupAndExit
fi
798
# Parse line into namespace ID number, containing wiki page, and external link URL. The fields
# already extracted are quoted when used as the prefix to strip, so that they are matched literally;
# unquoted, a '[', '*', '?' or '\' in a page name would be treated as a pattern, the prefix would
# fail to match, and URL would end up holding the whole line.
NS_ID=${LINE%%,*}
PAGE_NAME=${LINE#"$NS_ID",}
PAGE_NAME=${PAGE_NAME%%,*} # a comma in the page name will break this
URL=${LINE#"$NS_ID,$PAGE_NAME,"} # commas can be in this
if [[ -z "$NS_ID" || -z "$PAGE_NAME" || -z "$URL" ]]; then
  valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace, wiki page or link URL could not be read."
  SKIP_PARSE_FAIL=$((SKIP_PARSE_FAIL + 1))
  continue
fi
809
# Skip any link that isn't "http://" or "https://". The regex has to spell out the optional 's' and
# the "://"; a pattern of "^http*" means "htt" followed by any number of 'p's, which lets through
# anything that merely begins with "htt".
if [[ ! "$URL" =~ ^https?:// ]]; then
  valPrint trs "Skipping line $LINK_NUM ('$LINE') because the protocol isn't 'http://' or 'https://'."
  SKIP_UNK_PROT=$((SKIP_UNK_PROT + 1))
  continue
fi
816
# Show progress on screen; from the second link onward, first send the "cursor up one line" escape
# sequence so that the new progress message takes the place of the previous one
if [ "$LINK_NUM" -gt 1 ]; then
  printf "\e[1A\n"
fi
valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."

# Translate the namespace ID into the namespace's name by walking NS_IDS until a match is found,
# then reading the same position in the parallel array NS_NAMES. A "NULL" ID is never looked up.
NS_NAME=""
a=0
if [[ $NS_ID != "NULL" ]]; then
  while [ "x${NS_IDS[$a]}" != "x" ]; do # an empty element means we ran off the end of the array
    if [ "$NS_ID" -eq "${NS_IDS[$a]}" ]; then
      NS_NAME="${NS_NAMES[$a]}"
      break
    fi
    a=$((a + 1))
  done
fi

# Without a namespace name we cannot build the page's address, so explain why and move on
if [[ -z "$NS_NAME" ]]; then
  if [[ $NS_ID == "NULL" ]]; then
    valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace (and probably the page too) is 'NULL'. Probably the link is no longer in existence on the wiki."
  else
    valPrint trs "Skipping line $LINK_NUM ('$LINE') because I could not find a name for namespace ID $NS_ID."
  fi
  SKIP_UNK_NS=$((SKIP_UNK_NS + 1))
  PAGE_LINKS=$((PAGE_LINKS + 1))
  continue
fi
845
# Work out the page's name as used in wiki markup. Namespace ID 0 is the implicit "Main:" namespace,
# which must not be named explicitly because doing so breaks the link; every other namespace is
# prefixed to the page name.
if [ "$NS_ID" -eq 0 ]; then
  LOCAL_PAGE_PATH=$PAGE_NAME
else
  LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
fi

# The full address of the page is the wiki's path followed by that name
FULL_PAGE_PATH=https://$WIKI_PATH/$LOCAL_PAGE_PATH
855
# We don't want to consider wiki pages ending in .js, as the MW parser cannot reliably isolate URLs
# in JavaScript code, so it returns erroneous links. The suffix is whatever follows the last period
# in the page name (or the whole name if it has no period); it is taken with parameter expansion and
# compared inside [[ ]] so that an empty suffix or glob characters in the name cannot break the test.
PAGE_NAME_SUFFIX=${PAGE_NAME##*.}
if [[ "$PAGE_NAME_SUFFIX" == "js" ]]; then
  valPrint trs "Skipping URL '$URL' on line $LINK_NUM because it was found on JavaScript page '$LOCAL_PAGE_PATH'."
  SKIP_JS_PAGE=$((SKIP_JS_PAGE + 1))
  PAGE_LINKS=$((PAGE_LINKS + 1))
  continue
fi
865
# Reject any URL containing one of the characters listed in ILLEGAL_CHARS
if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
  valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because it contains characters illegal in a URL."
  SKIP_BAD_URL=$((SKIP_BAD_URL + 1))
  PAGE_LINKS=$((PAGE_LINKS + 1))
  continue
fi

# Unless we were asked to check archive links, pass over anything on a known archive domain
if [ "$CHECK_ARCHIVE_LINKS" -eq 0 ]; then
  case "$URL" in
    *web.archive.org* | *archive.is* | *archive.ph*)
      valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to check archive links."
      SKIP_ARCHIVES=$((SKIP_ARCHIVES + 1))
      PAGE_LINKS=$((PAGE_LINKS + 1))
      continue
      ;;
  esac
fi
881
# Now we need to know if the URL is for a file or a web page. First step is to determine if the
# URL ends in a suffix
HAS_SUFFIX=0

# If the URL ends in a query string like ".php?foo=bar", strip everything from the '?' onward
CLEAN_URL=${URL%%\?*}

# If the URL ends in an anchor link like "#section_15", strip everything from the '#' onward
CLEAN_URL=${CLEAN_URL%%\#*}

# Non-ASCII URLs are not handled by this script, so skip them and make the reader check them
if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
  valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I cannot handle non-ASCII characters."
  SKIP_NON_ASCII=$((SKIP_NON_ASCII + 1))
  PAGE_LINKS=$((PAGE_LINKS + 1))
  continue
fi

# Isolate the characters after the last period and after the last slash. Parameter expansion is used
# instead of 'sed' and 'awk' so that no subprocesses are spawned for each link, and so that the
# strings are never exposed to pathname expansion or to awk's backslash-escape processing.
POST_DOT=${CLEAN_URL##*.}
POST_SLASH=${CLEAN_URL##*/}

# If the last period comes after the last slash, then the URL ends in a suffix
POST_DOT_LENGTH=${#POST_DOT}
POST_SLASH_LENGTH=${#POST_SLASH}
if [ "$POST_DOT_LENGTH" -lt "$POST_SLASH_LENGTH" ]; then
  HAS_SUFFIX=1
else
  HAS_SUFFIX=0
fi
912
# Decide whether the URL is for a file (IS_FILE=1) or a web page (IS_FILE=0) by comparing its suffix
# against the known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES. A value of -1 means
# that the question has not been settled.
if [ "$HAS_SUFFIX" -eq 0 ]; then
  # With no suffix at all, this can only be a page
  IS_FILE=0
else
  IS_FILE=-1

  # The suffix comparisons below should ignore case
  shopt -s nocasematch

  # Three kinds of suffix mark the end of a web page URL rather than a file name:
  # - all numbers, as in "productID.304297400"
  # - ending in a parens, as in "ms537113(v=vs.85)"
  # - containing a '%', as in "10.1007%2FBF00329055"
  if [[ $POST_DOT =~ ^-?[0-9]+$ ]] || [[ $POST_DOT =~ ^.*[\(\)]$ ]] || [[ $POST_DOT == *%* ]]; then
    IS_FILE=0
  fi

  # If that did not settle it, look for the suffix among the known file extensions
  if [ $IS_FILE -eq -1 ]; then
    for EXTENSION in "${HTTP_FILES[@]}"; do
      if [[ $EXTENSION == $POST_DOT ]]; then # double brackets respect the nocasematch setting
        IS_FILE=1
        break
      fi
    done
  fi

  # If it is not a known file extension either, look for it among the known pages and TLDs; if it
  # is not found there, IS_FILE stays at -1, meaning that the user needs to add this suffix to the
  # HTTP_FILES or HTTP_TLDS_AND_PAGES array
  if [ $IS_FILE -eq -1 ]; then
    for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
      if [[ $PAGE_OR_TLD == $POST_DOT ]]; then
        IS_FILE=0
        break
      fi
    done
  fi

  # Restore case sensitivity in Bash
  shopt -u nocasematch
fi
964
# Turn the verdict on the URL's suffix into a label for the report and count the link under that
# type; a suffix that could not be identified as a file, page or TLD is reported to the reader and
# the link is skipped
STR_TYPE=""
case "$IS_FILE" in
  -1)
    valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I encountered the unknown URL suffix '$POST_DOT'. Please add this suffix to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
    SKIP_UNK_SUFFIX=$((SKIP_UNK_SUFFIX + 1))
    continue
    ;;
  1)
    STR_TYPE="file"
    FILE_LINKS=$((FILE_LINKS + 1))
    ;;
  *)
    STR_TYPE="page"
    PAGE_LINKS=$((PAGE_LINKS + 1))
    ;;
esac
978
# Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
# issue with sites that require HTTPS. AGENT must be in double quotes in order to be expanded (in
# single quotes, the literal text "$AGENT" is sent as the user agent), and the URL is quoted so that
# characters like '?', '*' and '[' in it are never treated as a file name pattern.
CURL_CODE=$(curl -o /dev/null --silent --insecure --compressed --head --user-agent "$AGENT" --max-time "$TIMEOUT" --retry 2 --write-out '%{http_code}\n' "$URL")
CURL_ERR=$? # exit status of the 'curl' call above
CURL_RESULT=$CURL_CODE

# Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
if [[ "$CURL_CODE" == "000" ]]; then
  CURL_RESULT="$CURL_RESULT-$CURL_ERR"
fi
989
# Begin to determine our status code for this URL (EI, IW, OK, RD, or NG); "??" means undecided
STATUS="??"
NEW_URL=""
INTERWIKI_INDEX=-1

if [[ $URL == *$WIKI_PATH* && $URL != *$WIKI_PATH/w/* ]]; then
  # This is an "external internal" link to our own wiki that can be replaced by "[[page_name]]".
  # Special access URLs beginning with "/w/" are let through, as they probably cannot be replaced
  # by "[[ ]]" markup.
  STATUS="EI"
  EI_LINKS=$((EI_LINKS + 1))
elif [[ $URL != *web.archive.org* ]]; then
  # Otherwise, see if this is a link to a domain that we have an interwiki prefix for (but not an
  # archive.org link to a page from an interwiki domain), again letting "/w/" URLs through
  for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
    IW_DOMAIN=${INTERWIKI_DOMAINS[$i]}
    if [[ $URL == *$IW_DOMAIN* && $URL != *$IW_DOMAIN/w/* ]]; then
      STATUS="IW"
      IW_LINKS=$((IW_LINKS + 1))
      INTERWIKI_INDEX=$i # remembered so that the matching prefix can be suggested in the report
      break
    fi
  done
fi
1015
# If we didn't match an interwiki domain, see if the status code is in our "OK" codes list
if [[ "$STATUS" == "??" ]]; then
  for CODE in "${OK_CODES[@]}"; do
    # When only 200 counts as OK, pass over every other code in the list
    if [ "$ONLY_200_OK" -eq 1 ] && [ "$CODE" -ne 200 ]; then
      continue
    fi

    if [[ "$CODE" == "$CURL_CODE" ]]; then
      STATUS="OK"
      OK_LINKS=$((OK_LINKS + 1))

      # If this is a YouTube link, we have to look at the actual page source to know if the video
      # is good or not; override the link's info if it's actually NG. Also see RD_CODES section
      # below for duplicative code.
      if [[ "$URL" == *www.youtube.com* ]]; then
        # AGENT is double-quoted so that its value, not the literal text "$AGENT", is sent
        PAGE_TEXT=$(curl --silent --insecure --user-agent "$AGENT" --max-time "$TIMEOUT" "$URL")
        CURL_ERR=$?
        if [ "$CURL_ERR" != "0" ]; then
          # The page could not be fetched at all, so report the 'curl' exit code
          STATUS="NG"
          CURL_RESULT="000-$CURL_ERR"
          OK_LINKS=$((OK_LINKS - 1))
          NG_LINKS=$((NG_LINKS + 1))
        # Either message marks a dead video. Both tests are joined with a logical OR inside one
        # conditional; joining two conditionals with a single '|' makes a pipeline, whose result is
        # that of the last test only, so the first message would never be detected.
        elif [[ "$PAGE_TEXT" == *'simpleText":"Video unavailable'* || "$PAGE_TEXT" == *'simpleText":"Private video'* ]]; then
          STATUS="NG"
          CURL_CODE="404"
          CURL_RESULT=$CURL_CODE
          OK_LINKS=$((OK_LINKS - 1))
          NG_LINKS=$((NG_LINKS + 1))
        fi
      fi

      # If this is a OneDrive link, we have to look at the actual page source to know if the file
      # is really still at this URL; override the link's info if it's actually NG or RD
      if [[ "$URL" == *skydrive.live.com* ]]; then
        PAGE_TEXT=$(curl --silent --insecure --user-agent "$AGENT" --max-time "$TIMEOUT" "$URL")
        CURL_ERR=$?
        if [ "$CURL_ERR" != "0" ]; then
          STATUS="NG"
          CURL_RESULT="000-$CURL_ERR"
          OK_LINKS=$((OK_LINKS - 1))
          NG_LINKS=$((NG_LINKS + 1))
        elif [[ "$PAGE_TEXT" == *"<h1>Sorry, something went wrong"* ]]; then
          STATUS="NG"
          CURL_CODE="404"
          CURL_RESULT=$CURL_CODE
          OK_LINKS=$((OK_LINKS - 1))
          NG_LINKS=$((NG_LINKS + 1))
        elif [[ "$PAGE_TEXT" == *"<h2>Object moved to"* ]]; then
          STATUS="??" # have to send the code through the next block to treat the redirect properly
          CURL_CODE="301"
          CURL_RESULT=$CURL_CODE
          OK_LINKS=$((OK_LINKS - 1))
        fi
      fi

      break
    fi
  done
fi
1075
# If we didn't get a match with the "OK" codes, check it against the "RD" codes
if [[ "$STATUS" == "??" ]]; then
  for CODE in "${RD_CODES[@]}"; do
    if [[ "$CODE" == "$CURL_CODE" ]]; then
      # Get URL header again in order to retrieve the URL we are being redirected to, but if this
      # is a OneDrive link, we already have the new URL in $PAGE_TEXT
      if [[ "$URL" == *skydrive.live.com* ]]; then
        NEW_URL=${PAGE_TEXT##*href=\"}
        NEW_URL=${NEW_URL%\">here*}
      else
        # AGENT is double-quoted so that its value, not the literal text "$AGENT", is sent
        NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time "$TIMEOUT" --write-out '%{redirect_url}\n' "$URL")
      fi

      # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
      # those changes out if the user didn't ask for them
      URL_HTTP=${URL/#https:/http:}
      NEW_URL_HTTP=${NEW_URL/#https:/http:}

      # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
      NEW_URL_LENGTH=${#NEW_URL_HTTP}
      if [ "$NEW_URL_LENGTH" -lt "$MIN_URL_LENGTH" ]; then
        NEW_URL_HTTP="[new URL not retrieved]"
      fi

      # Remove slash at end of new URL, if present, so we can filter out the redirects that
      # merely add an ending slash if the user didn't ask for them
      NEW_URL_NO_SLASH=${NEW_URL_HTTP%/}

      # Detect if this is a youtu.be link simply being expanded by YouTube to the full
      # youtube.com address
      YOUTU_BE=0
      if [[ "$URL_HTTP" == http://youtu.be* && "$NEW_URL_HTTP" == http://www.youtube.com* ]]; then
        YOUTU_BE=1
      fi

      # The URL comparisons below quote both sides so that they are compared as plain strings; the
      # "[new URL not retrieved]" placeholder would otherwise be read as a bracket pattern.

      # If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
      # wants those to be reported)
      if [ "$SHOW_HTTPS" -eq 0 ] && [[ "$URL_HTTP" == "$NEW_URL_HTTP" ]]; then
        valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show http->https upgrades, and I was redirected to '$NEW_URL'."
        STATUS="OK"
        OK_LINKS=$((OK_LINKS + 1))
        SKIP_HTTPS_UP=$((SKIP_HTTPS_UP + 1))
      # If the URLs match besides an added ending slash, then the link is OK (unless user wants
      # those to be reported)
      elif [ "$SHOW_SLASH" -eq 0 ] && [[ "$URL_HTTP" == "$NEW_URL_NO_SLASH" ]]; then
        valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show added trailing slashes, and I was redirected to '$NEW_URL'."
        STATUS="OK"
        OK_LINKS=$((OK_LINKS + 1))
        SKIP_SLASH_ADD=$((SKIP_SLASH_ADD + 1))
      elif [ "$YOUTU_BE" -eq 1 ]; then
        # We have to look at the actual page source to know if a YouTube video is good or not.
        # Also see OK_CODES section above for duplicative code. PAGE_TEXT ends up non-empty only
        # if the page contains one of the messages that mark a dead video.
        PAGE_TEXT=$(curl --silent --insecure --user-agent "$AGENT" --max-time "$TIMEOUT" "$NEW_URL" | grep -e "simpleText\":\"Video unavailable" -e "simpleText\":\"Private video")
        if [[ -n "$PAGE_TEXT" ]]; then
          STATUS="NG"
          NG_LINKS=$((NG_LINKS + 1))
        elif [ "$SHOW_YT_RD" -eq 0 ]; then
          valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'."
          STATUS="OK"
          OK_LINKS=$((OK_LINKS + 1))
          SKIP_YOUTU_BE=$((SKIP_YOUTU_BE + 1))
        else
          STATUS="RD"
          RD_LINKS=$((RD_LINKS + 1))
        fi
      else
        STATUS="RD"
        RD_LINKS=$((RD_LINKS + 1))
      fi
      break
    fi
  done
fi
1151
# A status that is still undecided after the "RD" codes gets compared to the "NG" codes
if [[ $STATUS == "??" ]]; then
  for CODE in "${NG_CODES[@]}"; do
    [[ $CODE == $CURL_CODE ]] || continue
    STATUS="NG"
    NG_LINKS=$((NG_LINKS + 1))
    break
  done

  # When the --only-200-ok argument was received, every "OK" code besides 200 counts as "NG" too
  if [ "$ONLY_200_OK" -eq 1 ]; then
    for CODE in "${OK_CODES[@]}"; do
      if [ "$CODE" -eq 200 ]; then
        continue
      fi
      [[ $CODE == $CURL_CODE ]] || continue
      STATUS="NG"
      NG_LINKS=$((NG_LINKS + 1))
      break
    done
  fi
fi

# A response code found in none of the lists cannot be judged, so advise the reader and move on
if [[ $STATUS == "??" ]]; then
  valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I encountered the unknown response code $CURL_CODE."
  SKIP_UNK_CODE=$((SKIP_UNK_CODE + 1))
  continue
fi
1182
# Check problem links against exceptions list before proceeding. EXCEPT_URL is the setting that
# holds the location of the exceptions list, and a non-empty value is what tells us that exceptions
# are in use; the URL field of each exception is therefore kept in its own variable,
# EXCEPT_TARGET, so that the setting is never overwritten while the list is searched.
FOUND_EXCEPT=0
if [[ "$STATUS" != "OK" && -n "$EXCEPT_URL" ]]; then
  # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
  EXPECT_CODE="$CURL_RESULT"
  if [[ "$STATUS" == "EI" || "$STATUS" == "IW" ]]; then
    EXPECT_CODE="$STATUS"
  fi

  # Look for link in exceptions list and make sure the listed result code and wiki page also match.
  # As parsed below, each entry is the result code up to the first comma, the wiki page after the
  # last comma, and the URL in between.
  for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
    # Undo any HTML-encoding from the wiki page; for now we just worry about the ampersand, as most
    # other HTML-encoded characters are not found in URLs
    EXCEPT_LINE=$(printf '%s\n' "${EXCEPT_ARRAY[$i]}" | sed 's/\&amp;/\&/g')

    # Check for URL match
    EXCEPT_TARGET="${EXCEPT_LINE#*,}"
    EXCEPT_TARGET="${EXCEPT_TARGET%,*}"
    if [[ "$EXCEPT_TARGET" == *\** ]]; then
      # This exception URL contains the '*' wildcard, so use pattern-matching with it (it is
      # deliberately left unquoted on the right side of the comparison)
      if [[ "$URL" != $EXCEPT_TARGET ]]; then
        continue
      fi
    elif [[ "$EXCEPT_TARGET" != "$URL" ]]; then # otherwise just use a straight string comparison
      continue
    fi

    # Check for page name match; anything after the first space in the page field is discarded
    EXCEPT_PAGE="${EXCEPT_LINE##*,}"
    EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
    if [[ "$EXCEPT_PAGE" == "*" || "$EXCEPT_PAGE" == "$LOCAL_PAGE_PATH" ]]; then
      EXCEPT_FOUND[$i]=$((EXCEPT_FOUND[$i] + 1))
      valPrint trs "Found exception '$URL' on page '$LOCAL_PAGE_PATH'."

      # Check for result code match
      EXCEPT_CODE=${EXCEPT_LINE%%,*}
      if [[ "$EXCEPT_CODE" == "$EXPECT_CODE" ]]; then
        FOUND_EXCEPT=1
        EXCEPT_USED[$i]=$((EXCEPT_USED[$i] + 1))
        valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because its expected result, '$EXPECT_CODE', is in the exceptions list."

        # Count the skip under the status that the link would have been reported with
        case "$STATUS" in
          EI) SKIP_EXPECT_EI=$((SKIP_EXPECT_EI + 1)) ;;
          IW) SKIP_EXPECT_IW=$((SKIP_EXPECT_IW + 1)) ;;
          RD) SKIP_EXPECT_RD=$((SKIP_EXPECT_RD + 1)) ;;
          *) SKIP_EXPECT_NG=$((SKIP_EXPECT_NG + 1)) ;;
        esac

        break
      fi
    fi
  done
fi
if [ "$FOUND_EXCEPT" -eq 1 ]; then
  continue
fi
1248
# If appropriate, record this link to the log, with clickable URLs when possible
if [[ "$STATUS" != "OK" ]] || [ "$RECORD_OK_LINKS" -eq 1 ]; then
  # Prepare 'curl' result in parentheses to print after status code, unless this is an "IW" or "EI"
  # link, in which case showing the status code doesn't make sense. Adjust spacing after string to
  # ensure TXT and RTF reports have aligned columns of results.
  CURL_STR_H=" ($CURL_RESULT)"
  CURL_STR_T="$CURL_STR_H"
  CURL_STR_R="$CURL_STR_H "
  if [[ "$STATUS" == "IW" || "$STATUS" == "EI" ]]; then
    CURL_STR_H=""
    CURL_STR_T=" "
    CURL_STR_R=" "
  fi

  # Record link and its wiki page in TXT, RTF, and HTML markup
  valPrint t "${STATUS}${CURL_STR_T} $STR_TYPE $URL"
  valPrint t " linked from $FULL_PAGE_PATH"
  valPrint r "${STATUS}${CURL_STR_R}${RTF_TABS}$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
  valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
  valPrint hn "<tr><td style=\"white-space:nowrap\">${STATUS}${CURL_STR_H}</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
  valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"

  # Place vertical space here since we won't be printing anything more about this link
  if [[ "$STATUS" == "OK" ]] && [ "$SUGGEST_SNAPSHOTS_OK" -eq 0 ]; then
    valPrint tr ""
    valPrint hs ""
  fi

  # Record redirect URL if one was given by a 3xx response page
  if [[ "$STATUS" == "RD" ]]; then
    valPrint ts " Server suggests $NEW_URL"
    valPrint rs " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
    valPrint hs "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
  fi

  # Notify reader if we can use an intrawiki link for this URL
  if [[ "$STATUS" == "EI" ]]; then
    # The page name is what is left after removing the protocol, domain and first slash
    INTRA_PAGE=${URL#*://*/}
    # If INTRA_PAGE starts with Category:, File: or Image:, prefix it with a ':' to make it a wikilink
    if [[ "$INTRA_PAGE" == Category:* || "$INTRA_PAGE" == File:* || "$INTRA_PAGE" == Image:* ]]; then
      INTRA_PAGE=:${INTRA_PAGE}
    fi
    valPrint ts " Just use [[$INTRA_PAGE]]"
    valPrint rs " Just use [[$INTRA_PAGE]]"
    valPrint hs "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
  fi

  # Notify reader if we can use an interwiki prefix for this URL
  if [[ "$STATUS" == "IW" ]]; then
    # The page name is whatever follows the last slash in the URL
    INTER_PAGE=${URL##*/}
    valPrint ts " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
    valPrint rs " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
    valPrint hs "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
  fi

  # Query Internet Archive for latest "OK" snapshot for "NG" page (or for an "OK" page, if the user
  # asked for that)
  if [[ ( $STATUS == "NG" && $SUGGEST_SNAPSHOTS_NG -eq 1 ) || ( $STATUS == "OK" && $SUGGEST_SNAPSHOTS_OK -eq 1 ) ]]; then

    # We need to watch out for the rate limit or we'll get locked out; look at how much time has
    # elapsed and then wait the remainder between that and how long of a wait we think is needed
    # to avoid the dreaded "Too Many Requests" response. 5 seconds is just a guess.
    CUR_TIME=$(date +%s)
    WAIT_REMAINDER=$((5 - CUR_TIME + START_LINK))
    if [ "$WAIT_REMAINDER" -gt 0 ]; then
      valPrint t "Waiting $WAIT_REMAINDER second(s) to conform to Archive.org rate limit."
      sleep "$WAIT_REMAINDER"
    fi

    # Issue query to the API
    ARCHIVE_QUERY=$(curl --silent --max-time "$TIMEOUT" "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")

    # Notify reader if we hit the rate limit and just keep going. Only the message text is quoted,
    # with the '*' wildcards outside the quotes; if the wildcards are quoted too, they are compared
    # literally and the rate limit is never noticed.
    if [[ "$ARCHIVE_QUERY" == *"Too Many Requests"* ]]; then
      valPrint t " IA has rate-limited us!"
      valPrint r " IA has rate-limited us!"
      valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td>(hit the API rate limit!)</td></tr>"
    # If a "closest" snapshot was received, inform reader
    elif [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
      # In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
      ARCHIVE_QUERY=$(printf '%s\n' "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')

      # ...isolate "url" property in the response that follows the "closest" tag
      SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
      SNAPSHOT_URL=${SNAPSHOT_URL#*\"url\": \"} # everything after '"url": "'
      SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'

      # Remove the port 80 part that IA often adds to the URL, as it's superfluous
      SNAPSHOT_URL=${SNAPSHOT_URL/:80/}

      # Inform the reader of the snapshot URL
      valPrint ts " IA suggests $SNAPSHOT_URL"
      valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
      valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
    else # Otherwise give a generic Wayback Machine link for this URL, which might work
      valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
      valPrint rs " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
      valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
    fi
  fi
fi
1346
# If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
if [ "$IS_FILE" -eq 0 ] && [ "$TAKE_PAGE_SHOT" -eq 1 ] && [[ "$STATUS" == "OK" ]]; then
  # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
  SHOT_NAME=$(printf '%s\n' "$URL" | sed -e 's/https*\:\/\///' -e 'y/:\//__/')
  SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"

  # Don't take screenshot if we already encountered this page and screenshotted it
  if [ ! -f "$SHOT_FILE" ]; then
    # The URL is quoted so that Chrome receives it exactly as written, even if it contains
    # characters such as '?', '*' or '[' that the shell would otherwise treat as a file name pattern
    "$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 "$URL" > /dev/null 2>&1
    # The screenshot is looked for under the name CHROME_SCREENSHOT in the working directory and,
    # if present, moved to its final name without overwriting an existing file
    if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
      mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
    else
      valPrint trhs "Screenshot of URL $URL seems to have failed!"
    fi
  else
    valPrint trhs "Skipping screenshot of URL '$URL' because file '$SHOT_FILE' already exists."
  fi
fi
1365done
1366FINISHED_LIST="yes"
1367wrapupAndExit
Note: See TracBrowser for help on using the repository browser.