#!/bin/bash

# Validate External Links by Iritscen
# Provided with a list of external links in an expected CSV format, this script validates them. The
# resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF (for
# reading as a local file with clickable links), and HTML (for uploading as a web page). Call script
# with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
# Recommended rule:
# |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|

# Set separator token to newline only, so that unquoted expansions and command substitutions
# split on lines rather than on spaces or tabs (several places below rely on this)
IFS="
"
### GLOBALS ###
# Settings -- these will be changed from their defaults by the arguments passed in to the script
LINKS_URL=""         # use 'curl' to download file with links from this location (can be file://)
EXCEPT_URL=""        # 'curl' will access this wiki page with a list of exceptions for NG results
OUTPUT_DIR=""        # place reports and all other output in a folder inside this existing folder
RECORD_OK_LINKS=0    # record response code to the log even when it's a value in OK_CODES
SHOW_SLASH=0         # record response code to the log when a slash is added to the end of a URL
SHOW_HTTPS=0         # record response code to the log when "http" is upgraded to "https"
SHOW_YT_RD=0         # record response code to the log when a youtu.be URL is expanded to the full URL
SUGGEST_SNAPSHOTS=0  # query the Internet Archive for a possible snapshot URL for each NG page
SKIP_ARCHIVE_LINKS=0 # don't check URLs under the archive.org domain
TAKE_PAGE_SHOT=0     # take a screenshot of each OK page
CHROME_PATH=""       # path to a copy of Google Chrome that has the command-line screenshot feature
URL_START=1          # start at this URL in LINKS_FILE (1 by default)
URL_LIMIT=0          # if non-zero, stop at this URL in LINKS_FILE
UPLOAD_INFO=""       # path to a file on your hard drive with the login info needed to upload a report

# Fixed strings -- see the occurrences of these variables to learn their purpose
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36 OPR/69.0.3686.77"
ARCHIVE_API="http://archive.org/wayback/available"
ARCHIVE_GENERIC="https://web.archive.org/web/*"
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
CHROME_SCREENSHOT="screenshot.png"
CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt"
EXCEPT_FILE_NAME="exceptions.txt"
EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt"
MY_WIKI_PAGE="https://wiki.oni2.net/User:Iritscen"
# Quote the 'dirname'/'$0' expansions so a script path containing spaces still resolves
THIS_DIR=$(cd "$(dirname "$0")"; pwd)
WORKING_DIR=$(pwd)
WIKI_PATH="wiki.oni2.net"
# These are parallel arrays of the IDs and names of OniGalore's current namespaces
# (NS_IDS[i] is the MediaWiki numeric ID for the namespace named NS_NAMES[i])
declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")

# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png py rar tga TRMA txt vbs wav wmv xaf xml zip)
declare -a HTTP_TLDS_AND_PAGES=(action ars asp aspx cfm cgi com css de htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x)

# These arrays tells us which HTTP response codes are OK (good), which are RD (redirections), and which
# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
# if you add a new code.
declare -a OK_CODES=(200 401 405 406 418 501)
declare -a RD_CODES=(301 302 303 307 308)
declare -a NG_CODES=(000 400 403 404 410 500 502 503 530)

# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
# transcluded text, and if the transclusion fails, then the braces show up in the URL
ILLEGAL_CHARS="{ }"

# The shortest URL possible, used for sanity-checking some URLs: http://a.co
MIN_URL_LENGTH=11

# These are parallel arrays giving the prefixes that can be used in place of normal external links to
# some wikis and other sites (INTERWIKI_PREFIXES[i] corresponds to INTERWIKI_DOMAINS[i])
declare -a INTERWIKI_PREFIXES=(commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies wikiversity wikivoyage wikt wp)
declare -a INTERWIKI_DOMAINS=(commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org wikiversity.org wikivoyage.org wiktionary.org wikipedia.org)

# Variables for keeping track of main loop progress and findings
LINK_NUM=0           # iterator over the links in LINKS_FILE
EI_LINKS=0           # count of "external internal" links found
IW_LINKS=0           # count of possible interwiki links found
OK_LINKS=0           # count of links that responded positively
RD_LINKS=0           # count of links that responded with a redirect code
NG_LINKS=0           # count of links that responded negatively
SKIP_UNK_NS=0        # links skipped: unknown/missing namespace
SKIP_JS_PAGE=0       # links skipped: found on a JavaScript page
SKIP_BAD_URL=0       # links skipped: URL contains an illegal character
SKIP_NON_ASCII=0     # links skipped: URL contains non-ASCII characters
SKIP_UNK_SUFFIX=0    # links skipped: URL suffix not in the known file/page lists
SKIP_UNK_CODE=0      # links skipped: response code not in OK/RD/NG lists
SKIP_EXPECT_NG=0     # NG links suppressed because they are in the exceptions list
SKIP_EXPECT_EI=0     # EI links suppressed because they are in the exceptions list
SKIP_EXPECT_IW=0     # IW links suppressed because they are in the exceptions list
SKIP_HTTPS_UP=0      # redirects suppressed: mere http->https upgrade
SKIP_SLASH_ADD=0     # redirects suppressed: mere trailing-slash addition
SKIP_YOUTU_BE=0      # redirects suppressed: mere youtu.be expansion
SKIP_ARCHIVE_ORG=0   # links not checked because they point to archive.org
FILE_LINKS=0         # count of links judged to be files
PAGE_LINKS=0         # count of links judged to be pages
SKIPPED_HEADER_ROW=0 # whether the CSV's first (header) row has been skipped yet
FINISHED_LIST="no"   # "no" -> interrupted; "yes" -> all links processed
START_RUN=0          # epoch time when the main loop started
END_RUN=0            # epoch time when the main loop ended
### HELP ###
# A pseudo-man page. Here is the 80-character rule for the page text:
# 234567890123456789012345678901234567890123456789012345678901234567890123456789
# Writes the full usage documentation to stdout (paged through 'less' by the caller).
function printHelp()
{
cat << EOF

NAME
Validate External Links

SYNOPSIS
validate_external_links.sh --help
validate_external_links.sh --links URL --output DIR [--exceptions URL]
[--record-ok-links] [--show-added-slashes] [--show-https-upgrades]
[--show-yt-redirects] [--suggest-snapshots] [--skip-archive-links]
[--take-screenshots FILE] [--start-url NUM] [--end-url NUM]
[--upload FILE]

DESCRIPTION
This script parses a list of external links found in the OniGalore wiki
(which is dumped by the Oni2.net domain periodically in a particular
format), validates them using the Unix tool 'curl', and produces a report
of which links were "OK" (responded positively to an HTTP query), which
were "RD" (responded with a 3xx redirect code), which could be "IW"
(interwiki) links, which are "EI" (external internal) links and could be
intrawiki links, and which were "NG" (no good; a negative response to the
query). This report can then be automatically uploaded to the location of
your choice. The script can also suggest Internet Archive snapshots for
"NG" links, and take screenshots of "OK" links for visual verification by
the reader that the page in question is the one intended to be displayed.

You must pass this script the URL at which the list of links is found
(--links) and the path where the directory of logs should be outputted
(--output). All other arguments are optional.

OPTIONS
--help Show this page.
--links URL (required) URL from which to download the CSV
file with external links. Note that this URL can
be a local file if you supply a file:// path.
--output DIR (required) Unix path to directory in which Val
should place its reports.
--exceptions URL In order to remove links from the report which
Val finds an issue with but which you regard as
OK, list those desired exceptions on a wiki page.
See the sample file "exceptions.pdf" for the
required format of the page. Note that this URL
can point to a local file if you supply a path
beginning with "file://".
--record-ok-links Log a link in the report even if its response
code is "OK".
--show-added-slashes Report on redirects that simply add a '/' to the
end of the URL.
--show-https-upgrades Report on redirects that simply upgrade a
"http://" URL to a "https://" URL.
--show-yt-redirects Report on redirects that expand a youtu.be URL.
--suggest-snapshots Query the Internet Archive for a possible
snapshot URL for each "NG" page.
--skip-archive-links Don't check links that are already pointing to
a page on the Internet Archive.
--take-screenshots FILE Call the Google Chrome binary at this path to
take screenshots of each "OK" page.
--start-url NUM Start at this link in the links CSV file.
--end-url NUM Stop at this link in the links CSV file.
--upload FILE Upload report using the credentials and path
given in this local text file. See sftp_login.txt
for template.

BUGS
The script cannot properly parse any line in the external links file
which contains a comma in the name of the wiki page containing a link.
Commas in the link itself are not an issue.
EOF
}
### SETUP ###
# If first argument is a help request, or if nothing was passed in at all, print help page and quit
if [ "$#" -eq 0 ] || [ "$1" == "--help" ]; then
	printHelp | less
	exit 0
fi

# Parse arguments as long as there are more arguments to process.
# Every value argument ($2) is quoted so that paths/URLs containing spaces survive intact.
while (( "$#" )); do
	case "$1" in
		--links )               LINKS_URL="$2";                     shift 2;;
		--exceptions )          EXCEPT_URL="$2";                    shift 2;;
		--output )              OUTPUT_DIR="$2";                    shift 2;;
		--record-ok-links )     RECORD_OK_LINKS=1;                  shift;;
		--show-added-slashes )  SHOW_SLASH=1;                       shift;;
		--show-https-upgrades ) SHOW_HTTPS=1;                       shift;;
		--show-yt-redirects )   SHOW_YT_RD=1;                       shift;;
		--suggest-snapshots )   SUGGEST_SNAPSHOTS=1;                shift;;
		--skip-archive-links )  SKIP_ARCHIVE_LINKS=1;               shift;;
		--take-screenshots )    TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
		--start-url )           URL_START="$2";                     shift 2;;
		--end-url )             URL_LIMIT="$2";                     shift 2;;
		--upload )              UPLOAD_INFO="$2";                   shift 2;;
		* )                     echo "Invalid argument $1 detected. Aborting."; exit 1;;
	esac
done
# If the required arguments were not supplied, print help page and quit.
# The expansions are quoted so the tests stay well-formed even when a value contains spaces.
if [ -z "$LINKS_URL" ] || [ -z "$OUTPUT_DIR" ]; then
	echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
	exit 2
fi

# If user wants screenshots, make sure path to Chrome was passed in and is valid
if [ $TAKE_PAGE_SHOT -eq 1 ]; then
	if [ ! -f "$CHROME_PATH" ]; then
		echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
		exit 3
	fi
fi

# Check that UPLOAD_INFO exists, if this argument was supplied
if [ ! -z "$UPLOAD_INFO" ] && [ ! -f "$UPLOAD_INFO" ]; then
	echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
	exit 4
fi

# Check that OUTPUT_DIR is a directory
if [ ! -d "$OUTPUT_DIR" ]; then
	echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
	exit 5
fi
# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
SHOT_PATH="$OUTPUT_PATH/Screenshots"
LOG_NAME="ValExtLinks report"
LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt"
LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf"
LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm"
mkdir "$OUTPUT_PATH"
if [ $TAKE_PAGE_SHOT -eq 1 ]; then
	mkdir "$SHOT_PATH"
fi

# Check that 'mkdir' succeeded
if [ ! -d "$OUTPUT_PATH" ]; then
	# Report the parent directory the folder was to be created in (OUTPUT_DIR), not the full
	# path of the folder itself -- the previous message printed OUTPUT_PATH here, which
	# redundantly included OUTPUT_FOLDER
	echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_DIR. Aborting."
	exit 6
fi

# Get date on the file at LINKS_URL and print to log (URL quoted in case it contains spaces)
LINKS_DATE=$(curl --silent --head "$LINKS_URL" | grep "Last-Modified")
if [ -z "$LINKS_DATE" ]; then
	echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
	exit 7
fi
LINKS_DATE=${LINKS_DATE#Last-Modified: }
### UTILITY FUNCTIONS ###
# Writes a plain-text header to TXT log file: title, generation time, source-data date,
# and author contact line, followed by a blank line
function printTXTheader()
{
	valPrint t "Validate External Links report"
	valPrint t "generated $NICE_TIME"
	valPrint t "from data of $LINKS_DATE"
	valPrint t "script by Iritscen (contact: $MY_WIKI_PAGE)"
	valPrint t ""
}
# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified.
# The literal RTF markup (font table, color table, margins) must be emitted exactly as-is for the
# file to open correctly, so it is passed to valPrint as one multi-line string.
function printRTFheader()
{
	valPrint r "{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww12600\viewh12100\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0

\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
generated $NICE_TIME\\
from data of $LINKS_DATE\\
script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$MY_WIKI_PAGE\"}}{\fldrslt contact}})
\\
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
\cf0 "
}
# Closes the RTF markup of the RTF log file (the closing brace matches the opening
# "{\rtf1..." group written by printRTFheader)
function printRTFfooter()
{
	valPrint r "}"
}
# Writes the HTML header to HTML log file: document skeleton plus the report title,
# generation time, source-data date, and author contact link
function printHTMheader()
{
	valPrint h "<html>
<head>
<title>Validate External Links report</title>
</head>
<body>
<h2>Validate External Links report</h2>
<h3>generated $NICE_TIME<br />
from data of $LINKS_DATE<br />
script by Iritscen (<a href=\"$MY_WIKI_PAGE\" target=\"_blank\">contact</a>)</h3>"
}
# Closes the HTML markup of the HTML log file (matches the <body>/<html> tags opened by
# printHTMheader)
function printHTMfooter()
{
	valPrint h "</body>
</html>"
}
# The central logging function. The first parameter is a string composed of one or more characters that
# indicate which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
# 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 's' means "Print an
# extra newline at the end." 'w' means "Don't pass console output through 'fmt'" ("fmt" fits the output
# to an 80-column CLI but can break special formatting and the 'n' option).
# 'printf' is used instead of 'echo' so a message that happens to look like an echo flag (e.g. "-n")
# is still printed literally; '%b' reproduces the escape-processing that 'echo -e' performed.
function valPrint()
{
	if [[ "$1" == *c* ]]; then
		if [[ "$1" == *n* ]]; then
			printf '%s' "$2"
		elif [[ "$1" == *w* ]]; then
			printf '%s\n' "$2"
		elif [[ "$1" == *s* ]]; then
			# %b interprets backslash escapes in the message; the extra \n gives the blank line
			printf '%b\n\n' "$2"
		else
			printf '%s\n' "$2" | fmt -w 80
		fi
	fi
	if [[ "$1" == *t* ]]; then
		if [[ "$1" == *n* ]]; then
			printf '%s' "$2" >> "$LOG_TXT"
		elif [[ "$1" == *s* ]]; then
			printf '%b\n\n' "$2" >> "$LOG_TXT"
		else
			printf '%s\n' "$2" >> "$LOG_TXT"
		fi
	fi
	if [[ "$1" == *r* ]]; then
		# RTF ignores raw newlines, so line breaks are made with the \line control word;
		# the 'n' case writes no \line at all
		if [[ "$1" == *n* ]]; then
			printf '%s\n' "$2" >> "$LOG_RTF"
		elif [[ "$1" == *s* ]]; then
			printf '%s\n' "$2\line\line" >> "$LOG_RTF"
		else
			printf '%s\n' "$2\line" >> "$LOG_RTF"
		fi
	fi
	if [[ "$1" == *h* ]]; then
		# HTML ignores raw newlines, so line breaks are made with <br />; the 's' case adds an
		# empty table row because link output is written inside a <table>
		if [[ "$1" == *s* ]]; then
			printf '%s\n' "$2<tr><td>&nbsp;</td></tr>" >> "$LOG_HTM"
		elif [[ "$1" == *n* ]]; then
			printf '%s\n' "$2" >> "$LOG_HTM"
		else
			printf '%s\n' "$2<br />" >> "$LOG_HTM"
		fi
	fi
}
# Pluralize the string in parameter 1 if the number in parameter 2 is not 1.
# Nouns ending in 'x' get "es" appended; all others get "s".
# Expansions are quoted so a multi-word noun is echoed verbatim regardless of IFS.
function pluralCheckNoun()
{
	if [ "$2" -ne 1 ]; then
		if [[ "$1" =~ x$ ]]; then
			echo "${1}es"
		else
			echo "${1}s"
		fi
	else
		echo "$1"
	fi
}
# Output "is" if parameter 1 is 1, otherwise "are"
function pluralCheckIs()
{
	local verb="are"
	if [ "$1" -eq 1 ]; then
		verb="is"
	fi
	echo "$verb"
}
# Output "was" if parameter 1 is 1, otherwise "were"
function pluralCheckWas()
{
	local verb="were"
	if [ "$1" -eq 1 ]; then
		verb="was"
	fi
	echo "$verb"
}
# Output "a " if parameter 1 is 1, otherwise nothing
function pluralCheckA()
{
	[ "$1" -ne 1 ] && return 0
	echo "a "
}
# Output "an " if parameter 1 is 1, otherwise nothing
function pluralCheckAn()
{
	[ "$1" -ne 1 ] && return 0
	echo "an "
}
# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
# reports being saved to disk have already been closed.
# The credentials file is expected to contain lines prefixed "user:", "pw:", "port:" and "path:";
# each value is extracted by grepping for its marker and stripping the marker prefix.
function uploadReport()
{
	valPrint c "Uploading HTML report..."

	SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
	SFTP_USER_NAME_MARKER="user:"
	SFTP_PASSWORD_MARKER="pw:"
	SFTP_PORT_MARKER="port:"
	SFTP_PATH_MARKER="path:"
	# Markers and the credentials-file path are quoted so 'grep' receives them intact even if
	# the path contains spaces
	SFTP_USER_NAME=$(grep "$SFTP_USER_NAME_MARKER" "$UPLOAD_INFO")
	SFTP_USER_NAME=${SFTP_USER_NAME#$SFTP_USER_NAME_MARKER}
	SFTP_PASSWORD=$(grep "$SFTP_PASSWORD_MARKER" "$UPLOAD_INFO")
	SFTP_PASSWORD=${SFTP_PASSWORD#$SFTP_PASSWORD_MARKER}
	SFTP_PORT=$(grep "$SFTP_PORT_MARKER" "$UPLOAD_INFO")
	SFTP_PORT=${SFTP_PORT#$SFTP_PORT_MARKER}
	SFTP_PATH=$(grep "$SFTP_PATH_MARKER" "$UPLOAD_INFO")
	SFTP_PATH=${SFTP_PATH#$SFTP_PATH_MARKER}

	# Quote every argument so a credential value containing whitespace stays one argument
	expect "$SCRIPT_PATH" "$LOG_HTM" "$SFTP_USER_NAME" "$SFTP_PASSWORD" "$SFTP_PORT" "$SFTP_PATH" "$LOG_NAME.htm"

	valPrint c "Report was uploaded, unless an error message appears above."
}
# Prints session summary when script is done: tallies the session's results, writes the summary
# to all logs, closes the logs' markup, optionally uploads the HTML report, and exits.
# Also installed below as the INT (Ctrl-C) trap handler, so a canceled session still produces
# complete, well-formed log files.
function wrapupAndExit()
{
	# Get off progress line on console, drop down a line from last link in log, and close HTML table
	valPrint ctr ""
	valPrint h "</table><br />"

	# If we didn't finish processing the last URL, then the iterator is one too high
	if [ $FINISHED_LIST != "yes" ]; then
		let LINK_NUM-=1
		if [ $FINISHED_LIST == "no" ]; then
			valPrint ctrh "The session was canceled by the user."
		fi
	fi

	# Generate string with elapsed time (whole minutes plus leftover seconds)
	END_RUN=$(date +%s)
	ELAPSED=$(echo $(($END_RUN - $START_RUN)) | awk '{printf "%d min. %d sec. elapsed", int($1/60), int($1%60)}')

	# Do some math on results of session
	LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
	LINK_PROBLEMS=$((EI_LINKS+IW_LINKS+RD_LINKS+NG_LINKS))
	LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
	LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
	TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
	# NOTE(review): LINKS_CHECKED is computed but not printed in this function -- presumably
	# used elsewhere; confirm before removing
	LINKS_CHECKED=$((LINKS_PROCESSED-LINK_ERRORS))

	# Print summary header
	valPrint ct "Summary ($ELAPSED):"
	valPrint r "\b1 Summary \b0 ($ELAPSED)"
	valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
	valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT) (there $(pluralCheckWas $FILE_LINKS) $FILE_LINKS file $(pluralCheckNoun link $FILE_LINKS) and $PAGE_LINKS page $(pluralCheckNoun link $PAGE_LINKS))."

	# Print processed link totals (each line is suppressed when its count is zero)
	if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
	if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
	if [ $SKIP_ARCHIVE_ORG -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVE_ORG Archive.org $(pluralCheckNoun link $SKIP_ARCHIVE_ORG) were not checked"; fi
	if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS processed $(pluralCheckNoun link $LINK_PROBLEMS) had $(pluralCheckAn $LINK_PROBLEMS)$(pluralCheckNoun issue $LINK_PROBLEMS)"; fi
	if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr "  (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; valPrint h "&nbsp;&nbsp;(excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
	if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi
	if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr "  (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h "&nbsp;&nbsp;(counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi

	# Print excepted link totals
	if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi
	if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
	if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
	if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi

	# Print errored link totals
	if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"; fi
	if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
	if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
	if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
	if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
	if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
	if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi

	# Print checked link totals
	if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS link $(pluralCheckNoun issue $LINK_PROBLEMS):"; fi
	if [ $NG_LINKS -gt 0 ]; then valPrint ctrh "- $NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
	if [ $RD_LINKS -gt 0 ]; then valPrint ctrh "- $RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
	if [ $EI_LINKS -gt 0 ]; then valPrint ctrh "- $EI_LINKS $(pluralCheckNoun link $EI_LINKS) that could be intrawiki"; fi
	if [ $IW_LINKS -gt 0 ]; then valPrint ctrh "- $IW_LINKS $(pluralCheckNoun link $IW_LINKS) that could be interwiki"; fi

	# Close the log files' markup
	valPrint trh "ValExtLinks says goodbye."
	printRTFfooter
	printHTMfooter

	# Upload report if this was requested
	if [ ! -z $UPLOAD_INFO ] && [ $FINISHED_LIST != "no" ]; then
		uploadReport
	fi

	# Really quit now
	valPrint c "ValExtLinks says goodbye."
	exit 0
}
# Install wrapupAndExit as the Ctrl-C handler so an interrupted session still writes a summary
trap wrapupAndExit INT
### INITIALIZATION ###
# Print opening message to console and log files
valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
printTXTheader
printRTFheader
printHTMheader

# Attempt to download file at LINKS_URL, then check that it succeeded
valPrint t "Config:"
valPrint r "\b1 Config \b0"
valPrint hn "<h3>Config</h3>"
valPrint cwtrh "Downloading list of external links from $LINKS_URL."
# Derive the local file name from the last path component of the URL
LINKS_FILE_NAME=$(echo "$LINKS_URL" | sed 's/.*\///')
LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
# Quote the URL so 'curl' receives it intact even if it contains spaces
curl --silent -o "$LINKS_FILE" "$LINKS_URL"
if [ ! -f "$LINKS_FILE" ]; then
	echo "The download of $LINKS_URL appears to have failed. Aborting."
	wrapupAndExit
fi
# Attempt to download file at EXCEPT_URL, then check that it succeeded.
# The URL is quoted so the test and 'curl' receive it intact even if it contains spaces.
if [ ! -z "$EXCEPT_URL" ]; then
	valPrint cwtrh "Downloading list of reporting exceptions from $EXCEPT_URL."
	EXCEPT_DATA=$(curl --silent "$EXCEPT_URL")
	if [ -z "$EXCEPT_DATA" ]; then
		echo "The download of the exceptions data from '$EXCEPT_URL' appears to have failed. Aborting."
		wrapupAndExit
	fi
	# Trim the page down to the text between the "BEGIN LIST" and "END LIST" markers
	EXCEPT_DATA=${EXCEPT_DATA%END LIST*}
	EXCEPT_DATA=${EXCEPT_DATA#*BEGIN LIST}
	EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"

	# Store on disk for debugging purposes
	echo "$EXCEPT_DATA" > "$EXCEPT_FILE"

	# Transfer to array for easy searching later (IFS is newline, so each line becomes one element)
	declare -a EXCEPT_ARRAY=($(echo "$EXCEPT_DATA"))
fi
# Redirect the file into 'wc' rather than passing it as an argument, because passing LINKS_FILE
# to 'wc' would print the file name after the line count
LINK_COUNT_STRING=$(wc -l < "$LINKS_FILE")

# Number of URLs is number of lines minus one (first line is column header row for the CSV)
LINK_COUNT=$(echo "${LINK_COUNT_STRING}" | tr -d '[:space:]')
let LINK_COUNT-=1

# Calculate number of URLs to consider
if [ $URL_LIMIT -ne 0 ] && [ $URL_LIMIT -lt $LINK_COUNT ]; then
	valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((URL_LIMIT-URL_START+1)) of them, from link $URL_START to $URL_LIMIT."
elif [ $URL_START -ne 1 ]; then
	valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((LINK_COUNT-URL_START+1)) of them, from link $URL_START to $LINK_COUNT."
else
	valPrint ctrh "Found $LINK_COUNT links to process. I will be considering all of them."
fi
# Print settings to console and log.
# SETTINGS_MSG is declared as a mix of unquoted single words and quoted multi-word phrases, so
# each word/phrase becomes one array element. The assignments below then overwrite specific
# elements BY INDEX to flip the wording (or blank out a whole sentence) according to the current
# settings -- if the message text above is edited, these indices must be recounted.
declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not report links that are in the exceptions list." "I will ignore URLs that simply have ending slashes added onto them." "I will ignore URLs that only upgrade from HTTP to HTTPS." "I will ignore youtu.be links that are merely being expanded." "I will not check the validity of Internet Archive snapshot URLs.")
if [ $TAKE_PAGE_SHOT -eq 0 ]; then SETTINGS_MSG[10]="but will not"; fi
if [ $RECORD_OK_LINKS -eq 0 ]; then SETTINGS_MSG[22]="not"; fi
if [ $SUGGEST_SNAPSHOTS -eq 0 ]; then SETTINGS_MSG[26]="will not"; fi
if [ -z $EXCEPT_URL ] || [ $RECORD_OK_LINKS -eq 1 ]; then SETTINGS_MSG[40]=""; fi
if [ $SHOW_SLASH -eq 1 ]; then SETTINGS_MSG[41]=""; fi
if [ $SHOW_HTTPS -eq 1 ]; then SETTINGS_MSG[42]=""; fi
if [ $SHOW_YT_RD -eq 1 ]; then SETTINGS_MSG[43]=""; fi
if [ $SKIP_ARCHIVE_LINKS -eq 0 ]; then SETTINGS_MSG[44]=""; fi
# Joining with [@] collapses the array back into one space-separated sentence
SETTINGS_STR=${SETTINGS_MSG[@]}
valPrint ctrh "$SETTINGS_STR"
valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
valPrint trh ""
# Print legend to logs
# (valPrint's flag letters select output targets; judging from the markup in
# the strings below, 't' lines go to the plain-TXT report, 'r' lines carry RTF
# control words for the RTF report, and 'h' lines carry HTML for the HTML
# report — NOTE(review): inferred from usage; confirm against valPrint itself)
valPrint t "Legend:"
valPrint r "\b1 Legend \b0"
valPrint hn "<h3>Legend</h3>"
valPrint trh "OK = URL seems to be working."
valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it, because false negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to the script's author (see top of report). An NG link should be followed by a link to the Internet Archive's Wayback Machine which may help you repair the link, unless the Archive does not have any snapshots of the site. If the link cannot be repaired, you can delete it from the wiki page, or, if this would disrupt the surrounding material on the page, disable the link by wrapping the URL in nowiki tags."
valPrint trh "RD = The server responding to this URL is saying that the page moved and you should instead use the supplied new URL. Some RD links represent minor adjustments in the organization of a web site, and some are soft 404s (the file/page has been removed and you are being redirected to something like the main page of the web site). You will have to look at the new URL yourself to determine if it represents an OK link and the link on the wiki should be updated to this one, or if the desired file/page is actually gone and we need to replace the wiki link with an Internet Archive snapshot link -- or disable the URL if it has not been archived."
valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup."
valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup."
# The "(xxx)" and "(000-xx)" entries are emitted once per format because each
# format embeds its hyperlink to the code-reference page differently
valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $HTTP_CODES)."
valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$HTTP_CODES\"}}{\fldrslt here}} for code reference)."
valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$HTTP_CODES\" target=\"_blank\">here</a> for code reference)."
valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $CURL_CODES)."
valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$CURL_CODES\"}}{\fldrslt here}} for code reference)."
valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$CURL_CODES\" target=\"_blank\">here</a> for code reference)."
valPrint trh "IA suggests = Last available snapshot suggested by the Internet Archive."
valPrint trh "Try browsing = The Archive occasionally fails to return a snapshot URL even when one exists, so you will need to check for a snapshot manually using this link to the Wayback Machine before concluding that a site has not been archived."
valPrint trh ""


### MAIN LOOP ###
# Write the "Links" section header to each report format (TXT, RTF, HTML)
valPrint t "Links:"
valPrint r "\b1 Links \b0"
valPrint hn "<h3>Links</h3>"
# Record the wall-clock start time in seconds-since-epoch — presumably used
# later (e.g. by wrapupAndExit) to report the run's duration; confirm there
START_RUN=$(date +%s)
# Process each line of the .csv in LINKS_FILE
# (IFS is set to a newline at the top of the script, so this unquoted `cat`
# expansion iterates line-by-line; each line is "namespace_ID,page_name,URL")
for LINE in `cat "$LINKS_FILE"`; do
  let LINK_NUM+=1

  # First line is the column header row for the CSV, so let's verify that the format hasn't changed
  if [ $SKIPPED_HEADER_ROW -eq 0 ]; then
    # "$LINE" is quoted to guard against accidental pathname expansion
    if [ "$LINE" == "namespace,title,target" ]; then
      SKIPPED_HEADER_ROW=1
      LINK_NUM=0 # this line is the header, not a link, so reset the link counter
      valPrint hn "<table>"
      continue
    else
      valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
      wrapupAndExit
    fi
  fi

  # Skip this link if we are not at URL_START yet
  if [ $LINK_NUM -lt $URL_START ]; then
    continue
  fi

  # Stop if we are at the limit declared for testing purposes
  if [ $URL_LIMIT -gt 0 ] && [ $LINK_NUM -gt $URL_LIMIT ]; then
    FINISHED_LIST="limit"
    wrapupAndExit
  fi

  # Print progress to screen
  if [ $LINK_NUM -gt 1 ]; then
    printf "\e[1A\n" # erase previous progress message so that new one appears in its place
  fi
  valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."

  # The number of the namespace is the element before the first comma on the line
  NS_ID=${LINE%%,*}

  # Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
  NS_NAME=""
  a=0
  while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
    if [ $NS_ID == "NULL" ]; then
      break
    elif [ $NS_ID -eq ${NS_IDS[$a]} ]; then
      NS_NAME="${NS_NAMES[$a]}"
      break
    fi
    let a+=1
  done
  if [ "$NS_NAME" == "" ]; then
    if [ $NS_ID == "NULL" ]; then
      valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace (and probably the page too) is 'NULL'. Probably the link is no longer in existence on the wiki."
    else
      valPrint trs "Skipping line $LINK_NUM ('$LINE') because I could not find a name for namespace ID $NS_ID."
    fi
    let SKIP_UNK_NS+=1
    continue
  fi

  # The name of the page is everything between the namespace ID and the next comma on the line (commas
  # in page names will break this)
  PAGE_NAME=${LINE#$NS_ID,}
  PAGE_NAME=${PAGE_NAME%%,*}

  # We don't want to consider wiki pages ending in .js, as the MW parser cannot reliably isolate URLS
  # in JavaScript code, so it returns erroneous links
  PAGE_NAME_SUFFIX=$(echo $PAGE_NAME | sed 's/.*\.//')
  if [ $PAGE_NAME_SUFFIX == "js" ]; then
    valPrint trs "Skipping URL '${LINE#$NS_ID,$PAGE_NAME,}' on line $LINK_NUM because it was found on JavaScript page '$PAGE_NAME'."
    let SKIP_JS_PAGE+=1
    continue
  fi

  # Build longer wiki page URLs from namespace and page names
  FULL_PAGE_PATH=https://$WIKI_PATH/$NS_NAME:$PAGE_NAME
  LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
  # Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it
  # explicitly breaks the link
  if [ $NS_ID -eq 0 ]; then
    FULL_PAGE_PATH=https://$WIKI_PATH/$PAGE_NAME
    LOCAL_PAGE_PATH=$PAGE_NAME
  fi

  # The URL being linked to is everything after the previous two fields (this allows commas to be in
  # the URLs, but a comma in the previous field, the page name, will break this)
  URL=${LINE#$NS_ID,$PAGE_NAME,}

  # Scan for illegal characters
  if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
    valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because it contains characters illegal in a URL."
    let SKIP_BAD_URL+=1
    continue
  fi

  # If we're skipping Archive.org links, check if this is one
  if [ $SKIP_ARCHIVE_LINKS -eq 1 ] && [[ $URL == *web.archive.org* ]]; then
    valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have been asked not to check Wayback Machine links."
    let SKIP_ARCHIVE_ORG+=1
    continue
  fi

  # Now we need to know if the URL is for a file or a web page. First step is to determine if the
  # URL ends in a suffix
  HAS_SUFFIX=0

  # If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
  CLEAN_URL=${URL%%\?*}

  # If the URL ends in something like "#section_15", strip everything from the '#' onward
  CLEAN_URL=${CLEAN_URL%%\#*}

  # 'sed' cannot handle Unicode in my Bash shell, so skip non-ASCII URL and make user check it
  if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
    valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I cannot handle non-ASCII characters."
    let SKIP_NON_ASCII+=1
    continue
  fi

  # Isolate the characters after the last period and after the last slash
  POST_DOT=$(echo "$CLEAN_URL" | sed 's/.*\.//')
  POST_SLASH=$(echo "$CLEAN_URL" | sed 's/.*\///')

  # If the last period comes after the last slash, then the URL ends in a suffix
  POST_DOT_LENGTH=$(echo | awk -v input=$POST_DOT '{print length(input)}')
  POST_SLASH_LENGTH=$(echo | awk -v input=$POST_SLASH '{print length(input)}')
  if [ $POST_DOT_LENGTH -lt $POST_SLASH_LENGTH ]; then
    HAS_SUFFIX=1
  else
    HAS_SUFFIX=0
  fi

  # Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
  # known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES
  IS_FILE=-1
  if [ $HAS_SUFFIX -eq 0 ]; then
    IS_FILE=0
  else
    # Turn off case sensitivity while we compare suffixes
    shopt -s nocasematch

    # Special case: URLs ending in something like "productID.304297400" are pages, not files, so if
    # the URL's suffix is all numbers, we are looking at the end of a web page URL
    if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
      IS_FILE=0
    fi

    # Special case: URLs ending in a parens, e.g. "ms537113(v=vs.85)", are pages
    if [[ $POST_DOT =~ ^.*[\(\)]$ ]]; then
      IS_FILE=0
    fi

    # Special case: URLs containing a '%', e.g. "10.1007%2FBF00329055", are pages
    if [[ $POST_DOT == *%* ]]; then
      IS_FILE=0
    fi

    # If we did not identify this URL as a web page above, we need to compare the suffix against known
    # file extensions
    if [ $IS_FILE -eq -1 ]; then
      for EXTENSION in "${HTTP_FILES[@]}"; do
        if [[ $EXTENSION == $POST_DOT ]]; then # double brackets respect the nocasematch setting
          IS_FILE=1
          break
        fi
      done
    fi

    # If we did not identify this URL as a file above, we need to compare the suffix against known
    # pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
    # needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
    if [ $IS_FILE -eq -1 ]; then
      for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
        if [[ $PAGE_OR_TLD == $POST_DOT ]]; then
          IS_FILE=0
          break
        fi
      done
    fi

    # Turn case sensitivity back on in Bash
    shopt -u nocasematch
  fi

  # If this suffix escaped identification as either a file, page or TLD, inform the user
  STR_TYPE=""
  if [ $IS_FILE -eq -1 ]; then
    valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown URL ending '$POST_DOT'. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
    let SKIP_UNK_SUFFIX+=1
    continue
  elif [ $IS_FILE -eq 1 ]; then
    STR_TYPE="file"
    let FILE_LINKS+=1
  elif [ $IS_FILE -eq 0 ]; then
    STR_TYPE="page"
    let PAGE_LINKS+=1
  fi

  # Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
  # issue with sites that require HTTPS
  # (fixed: "$AGENT" was previously single-quoted, which sent the literal five
  # characters "$AGENT" as the user-agent header instead of the configured value)
  CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{http_code}\n' $URL)
  CURL_ERR=$? # the command substitution's exit status is curl's exit status
  CURL_RESULT=$CURL_CODE

  # Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
  if [ $CURL_CODE == "000" ]; then
    CURL_RESULT="$CURL_RESULT-$CURL_ERR"
  fi

  # Begin to determine our status code for this URL (EI, IW, OK, RD, or NG)
  STATUS="??"
  NEW_URL=""
  INTERWIKI_INDEX=-1

  # First make sure that this isn't an "external internal" link to our own wiki that can be replaced
  # by "[[page_name]]". If it uses a special access URL beginning with "/w/", let it pass, as it
  # probably cannot be replaced by "[[ ]]" markup
  if [[ $URL == *$WIKI_PATH* ]] && [[ $URL != *$WIKI_PATH/w/* ]]; then
    STATUS="EI"
    let EI_LINKS+=1
  fi

  # If it's not, check if this is a link to a domain that we have an interwiki prefix for
  if [ $STATUS == "??" ]; then
    for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
      if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]] && [[ $URL != *${INTERWIKI_DOMAINS[$i]}/w/* ]]; then
        STATUS="IW"
        let IW_LINKS+=1
        INTERWIKI_INDEX=$i
        break
      fi
    done
  fi

  # If we didn't match an interwiki domain, see if the status code is in our "OK" codes list
  if [ $STATUS == "??" ]; then
    for CODE in "${OK_CODES[@]}"; do
      if [[ $CODE == $CURL_CODE ]]; then
        STATUS="OK"
        let OK_LINKS+=1
        break
      fi
    done
  fi

  # If we didn't get a match with the "OK" codes, check it against the "RD" codes
  if [ $STATUS == "??" ]; then
    for CODE in "${RD_CODES[@]}"; do
      if [[ $CODE == $CURL_CODE ]]; then
        # Get URL header again in order to retrieve the URL we are being redirected to
        # (fixed: "$AGENT" was single-quoted here as well)
        NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{redirect_url}\n' $URL)

        # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
        # those changes out if the user didn't ask for them
        URL_HTTP=$(echo $URL | sed -E 's/^https:/http:/')
        NEW_URL_HTTP=$(echo $NEW_URL | sed -E 's/^https:/http:/')

        # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
        NEW_URL_LENGTH=$(echo | awk -v input=$NEW_URL_HTTP '{print length(input)}')
        if [ $NEW_URL_LENGTH -lt $MIN_URL_LENGTH ]; then
          NEW_URL_HTTP="[new URL not retrieved]"
        fi

        # Remove slash at end of new URL, if present, so we can filter out the redirects that
        # merely add an ending slash if the user didn't ask for them
        NEW_URL_NO_SLASH=$(echo $NEW_URL_HTTP | sed -E 's:/$::')

        # Detect if this is a youtu.be link simply being expanded by YouTube to the full
        # youtube.com address
        YOUTU_BE=0
        if [[ $URL_HTTP == http://youtu.be* ]] && [[ $NEW_URL_HTTP == http://www.youtube.com* ]]; then
          YOUTU_BE=1
        fi

        # If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
        # wants those to be reported)
        if [ $SHOW_HTTPS -eq 0 ] && [ $URL_HTTP == $NEW_URL_HTTP ]; then
          valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show http->https upgrades, and I was redirected to '$NEW_URL'."
          STATUS="OK"
          let OK_LINKS+=1
          let SKIP_HTTPS_UP+=1
        # If the URLs match besides an added ending slash, then the link is OK (unless user wants
        # those to be reported)
        elif [ $SHOW_SLASH -eq 0 ] && [ $URL_HTTP == $NEW_URL_NO_SLASH ]; then
          valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show added trailing slashes, and I was redirected to '$NEW_URL'."
          STATUS="OK"
          let OK_LINKS+=1
          let SKIP_SLASH_ADD+=1
        elif [ $SHOW_YT_RD -eq 0 ] && [ $YOUTU_BE -eq 1 ]; then
          valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'."
          STATUS="OK"
          let OK_LINKS+=1
          let SKIP_YOUTU_BE+=1
        else
          STATUS="RD"
          let RD_LINKS+=1
        fi
        break
      fi
    done
  fi

  # If we didn't get a match with the "RD" codes, check it against the "NG" codes
  if [ $STATUS == "??" ]; then
    for CODE in "${NG_CODES[@]}"; do
      if [[ $CODE == $CURL_CODE ]]; then
        STATUS="NG"
        let NG_LINKS+=1
        break
      fi
    done
  fi

  # If we didn't match a known status code, advise the reader
  if [ $STATUS == "??" ]; then
    valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown response code $CURL_CODE."
    let SKIP_UNK_CODE+=1
    continue
  fi

  # Check problem links against exceptions list before proceeding
  FOUND_EXCEPT=0
  if [ $STATUS != "OK" ] && [ ! -z "$EXCEPT_URL" ]; then
    # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
    EXPECT_CODE="$CURL_RESULT"
    if [ $STATUS == "EI" ]; then
      EXPECT_CODE="EI"
    elif [ $STATUS == "IW" ]; then
      EXPECT_CODE="IW"
    fi

    # Look for link in exceptions list and make sure the listed result code and wiki page also match
    for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
    {
      EXCEPT_LINE="${EXCEPT_ARRAY[$i]}"

      # Match URL
      # (fixed: this used to be parsed into EXCEPT_URL, clobbering the global
      # setting that holds the location of the exceptions page itself)
      EXCEPT_ENTRY_URL="${EXCEPT_LINE#*,}"
      EXCEPT_ENTRY_URL="${EXCEPT_ENTRY_URL%,*}"
      if [ "$EXCEPT_ENTRY_URL" != "$URL" ]; then
        continue
      fi

      # Match containing page's name
      EXCEPT_PAGE="${EXCEPT_LINE##*,}"
      EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
      if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == $LOCAL_PAGE_PATH ]; then
        # Match result code
        EXCEPT_CODE=${EXCEPT_LINE%%,*}
        if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
          valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because its expected result, '$EXPECT_CODE', is in the exceptions list."
          if [ $STATUS == "EI" ]; then
            let SKIP_EXPECT_EI+=1
          elif [ $STATUS == "IW" ]; then
            let SKIP_EXPECT_IW+=1
          else
            let SKIP_EXPECT_NG+=1
          fi
          FOUND_EXCEPT=1
          break
        fi
      fi
    } done
  fi
  if [ $FOUND_EXCEPT -eq 1 ]; then
    continue
  fi

  # If appropriate, record this link to the log, with clickable URLs when possible
  if [ $STATUS != "OK" ] || [ $RECORD_OK_LINKS -eq 1 ]; then
    # Prepare 'curl' result in parentheses to print after status code, unless this is an "IW" or "EI"
    # link, in which case showing the status code doesn't make sense. Adjust spacing after string to
    # ensure TXT and RTF reports have aligned columns of results.
    # NOTE(review): the alignment padding inside these strings may have been
    # collapsed to single spaces by document formatting; verify spacing against
    # the original file before relying on column alignment.
    CURL_STR_H=" ($CURL_RESULT)"
    CURL_STR_T="$CURL_STR_H"
    CURL_STR_R="$CURL_STR_H "
    if [ $STATUS == "IW" ] || [ $STATUS == "EI" ]; then
      CURL_STR_H=""
      CURL_STR_T=" "
      CURL_STR_R=" "
    fi

    # Record link and its wiki page in TXT, RTF, and HTML markup
    valPrint t "${STATUS}${CURL_STR_T} $STR_TYPE $URL"
    valPrint t " linked from $FULL_PAGE_PATH"
    valPrint r "${STATUS}${CURL_STR_R}${RTF_TABS}$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
    valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
    valPrint hn "<tr><td style=\"white-space:nowrap\">${STATUS}${CURL_STR_H}</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
    valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"

    # Place vertical space here since we won't be printing anything more about this link
    if [ $STATUS == "OK" ]; then valPrint tr ""; valPrint hs ""; fi

    # Record redirect URL if one was given by a 3xx response page
    if [ $STATUS == "RD" ]; then
      valPrint ts " Server suggests $NEW_URL"
      valPrint rs " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
      valPrint hs "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
    fi

    # Notify reader if we can use an intrawiki link for this URL
    if [ $STATUS == "EI" ]; then
      INTRA_PAGE=${URL#*://*/}
      valPrint ts " Just use [[$INTRA_PAGE]]"
      valPrint rs " Just use [[$INTRA_PAGE]]"
      valPrint hs "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
    fi

    # Notify reader if we can use an interwiki prefix for this URL
    if [ $STATUS == "IW" ]; then
      INTER_PAGE=$(echo "$URL" | sed 's/.*\///')
      valPrint ts " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
      valPrint rs " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
      valPrint hs "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
    fi

    # Query Internet Archive for latest "OK" snapshot for "NG" page
    if [ $STATUS == "NG" ] && [ $SUGGEST_SNAPSHOTS -eq 1 ]; then
      ARCHIVE_QUERY=$(curl --silent --max-time 10 "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")

      # If a "closest" snapshot was received...
      if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
        # In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
        ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')

        # ...isolate "url" property in the response that follows the "closest" tag
        SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
        SNAPSHOT_URL=${SNAPSHOT_URL#*\"url\": \"} # everything after '"url": "'
        SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'

        # Remove the port 80 part that IA often adds to the URL, as it's superfluous
        SNAPSHOT_URL=$(echo $SNAPSHOT_URL | sed 's/:80//')

        # Inform the user of the snapshot URL
        valPrint ts " IA suggests $SNAPSHOT_URL"
        valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
        valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
      else # ...otherwise give generic Wayback Machine link for this URL
        valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
        valPrint rs " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
        valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
      fi
    fi
  fi

  # If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
  if [ $IS_FILE -eq 0 ] && [ $TAKE_PAGE_SHOT -eq 1 ] && [ $STATUS == "OK" ]; then
    # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
    SHOT_NAME=$(echo "$URL" | sed 's/https*\:\/\///' | sed 'y/:\//__/')
    SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"

    # Don't take screenshot if we already encountered this page and screenshotted it
    if [ ! -f "$SHOT_FILE" ]; then
      "$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 $URL > /dev/null 2>&1
      if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
        mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
      else
        valPrint trhs "Screenshot of URL $URL seems to have failed!"
      fi
    else
      valPrint trhs "Skipping screenshot of URL '$URL' because file '$SHOT_FILE' already exists."
    fi
  fi
done
FINISHED_LIST="yes"
wrapupAndExit