1 | #!/bin/bash
2 |
3 | # Validate External Links by Iritscen
4 | #
5 | # Validates a list of external links in CSV format. The resulting logs are produced in three formats:
6 | # - TXT (for easy diffing with an earlier log)
7 | # - RTF (for reading as a local file with clickable links)
8 | # - HTML (for reading as a web page)
9 | # Call script with "--help" argument for documentation. Also see Read Me.rtf for critical notes.
10 | #
11 | # Recommended rule:
12 | # |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|
13 | #
14 | # Table of contents (sections of script in order of appearance, not execution):
15 | # • Globals
16 | # • Help Output
17 | # • Setup
18 | # • Utility Functions
19 | # • Summary Output
20 | # • Initialization
21 | # • Data Sourcing
22 | # • Config Output
23 | # • Legend Output
24 | # • Main Loop
25 |
# Split words on newlines only, so that unquoted values containing spaces stay in one piece
IFS=$'\n'
29 |
### GLOBALS ###
# Settings -- the values below are only defaults; the arguments passed in to the script change them

# use 'curl' to download file with links from this location (can be file://)
LINKS_URL=""
# 'curl' will access this wiki page with a list of exceptions for NG results
EXCEPT_URL=""
# place reports and all other output in a folder inside this existing folder
OUTPUT_DIR=""
# record response code to the log even when it's a value in OK_CODES
RECORD_OK_LINKS=0
# record issue when a slash is added to the end of a URL
SHOW_SLASH=0
# record issue when "http" is upgraded to "https"
SHOW_HTTPS=0
# record redirection for a youtu.be URL expanding to the full URL
SHOW_YT_RD=0
# query the Internet Archive for a possible snapshot URL for each NG page
SUGGEST_SNAPSHOTS_NG=0
# query the Internet Archive for an existing snapshot of each OK page
SUGGEST_SNAPSHOTS_OK=0
# check URLs under the archive.org domain
CHECK_ARCHIVE_LINKS=0
# take a screenshot of each OK page
TAKE_PAGE_SHOT=0
# time to wait for a response when querying a site
TIMEOUT=10
# path to a copy of Google Chrome that has the command-line screenshot feature
CHROME_PATH=""
# start at this URL in LINKS_FILE
URL_START=1
# if non-zero, stop at this URL in LINKS_FILE
URL_LIMIT=0
# path to a file on your hard drive with the login info needed to upload a report
UPLOAD_INFO=""
48 |
# Fixed strings -- see the occurrences of these variables to learn their purpose
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/88.0.4324.146 Safari/537.36"
ARCHIVE_API="http://archive.org/wayback/available"
ARCHIVE_GENERIC="https://web.archive.org/web/*"
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
CHROME_SCREENSHOT="screenshot.png"
EXCEPT_FILE_NAME="exceptions.txt"
EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
WIKI_CURL="https://wiki.oni2.net/Validate_External_Links/Curl_codes"
WIKI_HTTP="https://wiki.oni2.net/Validate_External_Links/HTTP_codes"
WIKI_MAIN="https://wiki.oni2.net/Validate_External_Links"
WIKI_ME="http://iritscen.oni2.net"
# Directory holding this script. Both substitutions are quoted so a path with spaces or glob
# characters stays intact, and 'pwd' only runs when the 'cd' worked (otherwise THIS_DIR is empty
# instead of silently naming the wrong directory).
THIS_DIR=$(cd -- "$(dirname -- "$0")" && pwd)
# Directory the script was started from
WORKING_DIR=$(pwd)
WIKI_PATH="wiki.oni2.net"
64 |
# Parallel arrays: NS_IDS[i] is the ID of the OniGalore namespace whose name is NS_NAMES[i]
NS_IDS=(
  -2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  100 101 102 103 104 105 108 109 110 111
)
NS_NAMES=(
  "Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File"
  "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk"
  "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2"
  "Oni2_talk" "XML" "XML_talk"
)

# URL suffixes that denote a file (first array) versus a page (second array). The distinction
# decides whether the script tries to take a screenshot of the URL or just gets its HTTP code.
HTTP_FILES=(
  3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg
  last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png py rar tga TRMA txt vbs wav wmv xaf xml zip
)
HTTP_TLDS_AND_PAGES=(
  action ars asp aspx cfm cgi com css de do full htm html it js jsp net org pgi php php3 phtml
  pl ru shtml stm uk x
)

# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and
# which are NG (no good). Pages that return OK codes will be screenshotted. Remember to update
# http_codes.txt if you add a new code.
OK_CODES=(200 401 405 406 418 501)
RD_CODES=(301 302 303 307 308)
NG_CODES=(000 400 403 404 410 429 500 502 503 504 530)
80 |
# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
# transcluded text, and if the transclusion fails, then the braces show up in the URL
ILLEGAL_CHARS="{ }"

# The shortest URL possible, used for sanity-checking some URLs: http://a.co
# NOTE(review): no assignment followed this comment; the variable name is assumed -- confirm it
# matches what the URL checks in the main loop read. The value is the length of "http://a.co".
MIN_URL_LENGTH=11

# These are parallel arrays giving the prefixes that can be used in place of normal external links to
# some wikis and other sites; based on https://wiki.oni2.net/Special:Interwiki
declare -a INTERWIKI_PREFIXES=(commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies wikiversity wikivoyage wikt wp)
declare -a INTERWIKI_DOMAINS=(commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org wikiversity.org wikivoyage.org wiktionary.org wikipedia.org)
92 |
# Variables for keeping track of main loop progress and findings
LINK_NUM=0
EI_LINKS=0
IW_LINKS=0
OK_LINKS=0
RD_LINKS=0
NG_LINKS=0
SKIP_UNK_NS=0
SKIP_JS_PAGE=0
SKIP_BAD_URL=0
# The counters below are compared with '-gt' in the session summary. They must start out as
# integers, because a numeric test on an unset variable is an error.
SKIP_NON_ASCII=0
SKIP_UNK_SUFFIX=0
SKIP_UNK_CODE=0
SKIP_EXPECT_NG=0
SKIP_EXPECT_RD=0
SKIP_EXPECT_EI=0
SKIP_EXPECT_IW=0
SKIP_ARCHIVE_ORG=0
FILE_LINKS=0
PAGE_LINKS=0
# The main loop tests this with '-eq 0' on the first line of the links file, before anything
# else could have set it
SKIPPED_HEADER_ROW=0
FINISHED_LIST="no"
START_RUN=0
END_RUN=0
120 |
121 |
122 | ### HELP OUTPUT ###
# A pseudo-man page. Here is the 80-character rule for the page text:
# 234567890123456789012345678901234567890123456789012345678901234567890123456789
# Writes the help page to stdout; takes no parameters. The synopsis lists the same option names
# that the argument parser accepts (--suggest-snapshots-ng and --suggest-snapshots-ok; there is no
# plain --suggest-snapshots). The here-document delimiter is quoted so that nothing in the page
# text can ever be expanded by the shell.
function printHelp()
{
  cat << 'EOF'

NAME
       Validate External Links

SYNOPSIS
       validate_external_links.sh --help
       validate_external_links.sh --links URL --output DIR [--exceptions URL]
          [--record-ok-links] [--show-added-slashes] [--show-https-upgrades]
          [--show-yt-redirects] [--suggest-snapshots-ng]
          [--suggest-snapshots-ok] [--check-archive-links]
          [--take-screenshots FILE] [--timeout NUM] [--start-url NUM]
          [--end-url NUM] [--upload FILE]

DESCRIPTION
       This script parses a list of external links found in the OniGalore wiki
       (which is dumped by the Oni2.net server periodically in a particular
       format), validates them using the Unix tool 'curl', and produces a report
       of which links were "OK" (responded positively to an HTTP query), which
       were "RD" (responded with a 3xx redirect code), which could be "IW"
       (interwiki) links, which are "EI" (external internal) links and could be
       intrawiki links, and which were "NG" (no good; a negative response to the
       query). This report can then be automatically uploaded to the location of
       your choice. The script can also suggest Internet Archive snapshots for
       "NG" links, and take screenshots of "OK" links for visual verification by
       the reader that the page in question is the one intended to be displayed.

       You must pass this script the URL at which the list of links is found
       (--links) and the path where the directory of logs should be outputted
       (--output). All other arguments are optional.

OPTIONS
       --help                  Show this page.
       --links URL             (required) URL from which to download the CSV
                               file with external links. Note that this URL can
                               be a local file if you supply a file:// path.
       --output DIR            (required) Unix path to directory in which Val
                               should place its reports.
       --exceptions URL        In order to remove links from the report which
                               Val finds an issue with but which you regard as
                               OK, list those desired exceptions on a wiki page.
                               See the sample file "exceptions.pdf" for the
                               required format of the page. Note that this URL
                               can point to a local file if you supply a path
                               beginning with "file://".
       --record-ok-links       Log a link in the report even if its response
                               code is "OK".
       --show-added-slashes    Report on redirects that simply add a '/' to the
                               end of the URL.
       --show-https-upgrades   Report on redirects that simply upgrade a
                               "http://" URL to a "https://" URL.
       --show-yt-redirects     Report on redirects that expand a youtu.be URL.
       --suggest-snapshots-ng  Query the Internet Archive for a possible
                               snapshot URL for each "NG" page.
       --suggest-snapshots-ok  Query the Internet Archive for a snapshot of each
                               "OK" page just to make sure it's available. Note
                               that this will add a tremendous amount of time to
                               the script execution because there is a rate
                               limit to the Archive API. Note that this option
                               does nothing unless you also use the
                               --record-ok-links argument.
       --check-archive-links   Check links that are already pointing to a page
                               on the Internet Archive. In theory these links
                               should be totally stable and not need validation.
       --take-screenshots FILE Call the Google Chrome binary at this path to
                               take screenshots of each "OK" page.
       --timeout NUM           Wait this many seconds for a site to respond. The
                               default is 10. Important note: Val will attempt
                               to reach each URL three times, so the time taken
                               to ping an unresponsive site will be three times
                               this setting.
       --start-url NUM         Start at this link in the links CSV file.
       --end-url NUM           Stop at this link in the links CSV file.
       --upload FILE           Upload report using the credentials and path
                               given in this local text file. See sftp_login.txt
                               for template.

BUGS
       The script cannot properly parse any line in the external links file
       which contains a comma in the name of the wiki page containing a link.
       Commas in the link itself are not an issue.
EOF
}
209 |
210 |
211 | ### SETUP ###
# Show the help page through the pager and stop when the script was started without any
# arguments, or when its first argument is a request for help
if [[ $# -eq 0 || "$1" == "--help" ]]; then
  printHelp | less
  exit 0
fi
217 |
# Aborts with exit status 1 when an option that takes a value is the last argument.
# Parameter 1 is the option name; parameter 2 is the number of arguments still unprocessed
# (counting the option itself). Without this check 'shift 2' fails on a lone trailing option,
# nothing gets consumed, and the parsing loop below never ends.
function requireArgValue()
{
  if [ "$2" -lt 2 ]; then
    echo "Argument '$1' requires a value. Aborting."
    exit 1
  fi
}

# Parse arguments as long as there are more arguments to process. Each recognized option updates
# the matching settings global; an unrecognized argument aborts the script with exit status 1.
function parseArguments()
{
  while (( $# )); do
    case "$1" in
      --links )                requireArgValue "$1" "$#"; LINKS_URL="$2"; shift 2;;
      --exceptions )           requireArgValue "$1" "$#"; EXCEPT_URL="$2"; shift 2;;
      --output )               requireArgValue "$1" "$#"; OUTPUT_DIR="$2"; shift 2;;
      --record-ok-links )      RECORD_OK_LINKS=1; shift;;
      --show-added-slashes )   SHOW_SLASH=1; shift;;
      --show-https-upgrades )  SHOW_HTTPS=1; shift;;
      --show-yt-redirects )    SHOW_YT_RD=1; shift;;
      --suggest-snapshots-ng ) SUGGEST_SNAPSHOTS_NG=1; shift;;
      --suggest-snapshots-ok ) SUGGEST_SNAPSHOTS_OK=1; shift;;
      --check-archive-links )  CHECK_ARCHIVE_LINKS=1; shift;;
      --take-screenshots )     requireArgValue "$1" "$#"; TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
      --timeout )              requireArgValue "$1" "$#"; TIMEOUT="$2"; shift 2;;
      --start-url )            requireArgValue "$1" "$#"; URL_START="$2"; shift 2;;
      --end-url )              requireArgValue "$1" "$#"; URL_LIMIT="$2"; shift 2;;
      --upload )               requireArgValue "$1" "$#"; UPLOAD_INFO="$2"; shift 2;;
      * )                      echo "Invalid argument '$1' detected. Aborting."; exit 1;;
    esac
  done
}
parseArguments "$@"
239 |
# If the required arguments were not supplied, print an error and quit. The operands are quoted:
# an unquoted empty value makes '[ -z ]' test the literal string "-z", which is only right by
# accident, and a URL containing '?', '*' or '[' could be expanded as a file name pattern.
if [ -z "$LINKS_URL" ] || [ -z "$OUTPUT_DIR" ]; then
  echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
  exit 2
fi

# If user wants screenshots, make sure path to Chrome was passed in and is valid
if [ "$TAKE_PAGE_SHOT" -eq 1 ]; then
  if [ ! -f "$CHROME_PATH" ]; then
    echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
    exit 3
  fi
fi

# Check that UPLOAD_INFO exists, if this argument was supplied
if [ -n "$UPLOAD_INFO" ] && [ ! -f "$UPLOAD_INFO" ]; then
  echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
  exit 4
fi

# Check that OUTPUT_DIR is a directory
if [ ! -d "$OUTPUT_DIR" ]; then
  echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
  exit 5
fi
265 |
# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
# Full path of this session's folder; every other output path below hangs off it
OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
SHOT_PATH="$OUTPUT_PATH/Screenshots"
LOG_NAME="ValExtLinks report"
LOG_NAME_TXT="$LOG_NAME.txt"
LOG_NAME_RTF="$LOG_NAME.rtf"
LOG_NAME_HTM="$LOG_NAME.htm"
# Path of the report inside the session folder, minus the suffix that selects the format
LOG_PATH="$OUTPUT_PATH/$LOG_NAME"
LOG_PATH_TXT="$LOG_PATH.txt"
LOG_PATH_RTF="$LOG_PATH.rtf"
LOG_PATH_HTM="$LOG_PATH.htm"
mkdir "$OUTPUT_PATH"

# Check that 'mkdir' succeeded; the message names the parent directory the folder belongs in
if [ ! -d "$OUTPUT_PATH" ]; then
  echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_DIR. Aborting."
  exit 6
fi

# Only now that the session folder is known to exist, add the screenshot folder inside it
if [ "$TAKE_PAGE_SHOT" -eq 1 ]; then
  mkdir "$SHOT_PATH"
fi
290 |
# Get date on the file at LINKS_URL and print to log. Header names are case-insensitive (HTTP/2
# responses print them in lower case) and header lines end in CR LF, so the match ignores case
# and the carriage return is removed before the value reaches the reports.
LINKS_DATE=$(curl --silent --head "$LINKS_URL" | grep -i "^Last-Modified:" | tr -d '\r')
if [ -z "$LINKS_DATE" ]; then
  echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
  exit 7
fi
# Keep only the value: drop the header name up to its colon, then the single space after it
LINKS_DATE=${LINKS_DATE#*:}
LINKS_DATE=${LINKS_DATE# }
298 |
299 |
# Writes a plain-text header to TXT log file: title, generation time, date of the source data,
# author contact, and a blank line to finish
function printTXTheader()
{
  local headerLine
  for headerLine in \
    "Validate External Links report" \
    "generated $NICE_TIME" \
    "from data of $LINKS_DATE" \
    "script by Iritscen (contact: $WIKI_ME)" \
    ""; do
    valPrint t "$headerLine"
  done
}
310 |
# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified.
# The markup is assembled in a local variable first and handed to valPrint in a single call.
function printRTFheader()
{
  local rtfHeader="{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww12600\viewh12100\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0

\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
generated $NICE_TIME\\
from data of $LINKS_DATE\\
script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$WIKI_ME\"}}{\fldrslt contact}})
\\
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
\cf0 "
  valPrint r "$rtfHeader"
}
329 |
# Closes the RTF markup of the RTF log file by writing the brace that ends the document group
function printRTFfooter()
{
  local closingBrace='}'
  valPrint r "$closingBrace"
}
335 |
# Writes the HTML header to HTML log file: the document head, the report title, and a
# sub-heading with the generation time, the date of the source data and the author contact link
function printHTMheader()
{
  local htmHeader="<html>
<head>
<title>Validate External Links report</title>
</head>
<body>
<h2>Validate External Links report</h2>
<h3>generated $NICE_TIME<br />
from data of $LINKS_DATE<br />
script by Iritscen (<a href=\"$WIKI_ME\" target=\"_blank\">contact</a>)</h3>"
  valPrint h "$htmHeader"
}
349 |
# Closes the HTML markup of the HTML log file by ending the body and the document
function printHTMfooter()
{
  local htmFooter="</body>
</html>"
  valPrint h "$htmFooter"
}
356 |
# The central logging function. The first parameter is a string composed of one or more characters that
# indicate which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
# 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 's' means "Print an
# extra newline at the end." 'w' means "Don't pass console output through 'fmt'" ("fmt" fits the output
# to an 80-column CLI but can break special formatting and the 'n' option).
# The second parameter is the text to print. Logs are appended to LOG_PATH_TXT, LOG_PATH_RTF and
# LOG_PATH_HTM. Plain text goes through 'printf' so that a message such as "-n" is printed rather
# than taken as an option; the 's' forms for console and TXT still use 'echo -e', so backslash
# escapes in the text are interpreted there.
function valPrint()
{
  local targets="$1" text="$2"
  local noNewline=0 extraNewline=0 noWrap=0
  if [[ "$targets" == *n* ]]; then noNewline=1; fi
  if [[ "$targets" == *s* ]]; then extraNewline=1; fi
  if [[ "$targets" == *w* ]]; then noWrap=1; fi

  # Console: 'n' wins over 'w', which wins over 's'; only the default case is wrapped by 'fmt'
  if [[ "$targets" == *c* ]]; then
    if (( noNewline )); then
      printf '%s' "$text"
    elif (( noWrap )); then
      printf '%s\n' "$text"
    elif (( extraNewline )); then
      echo -e "$text\n"
    else
      printf '%s\n' "$text" | fmt -w 80
    fi
  fi

  # TXT log
  if [[ "$targets" == *t* ]]; then
    if (( noNewline )); then
      printf '%s' "$text" >> "$LOG_PATH_TXT"
    elif (( extraNewline )); then
      echo -e "$text\n" >> "$LOG_PATH_TXT"
    else
      printf '%s\n' "$text" >> "$LOG_PATH_TXT"
    fi
  fi

  # RTF log: a visible line break is the "\line" control word, so 'n' only omits that word; the
  # newline written to the file is still emitted
  if [[ "$targets" == *r* ]]; then
    if (( noNewline )); then
      printf '%s\n' "$text" >> "$LOG_PATH_RTF"
    elif (( extraNewline )); then
      printf '%s\\line\\line\n' "$text" >> "$LOG_PATH_RTF"
    else
      printf '%s\\line\n' "$text" >> "$LOG_PATH_RTF"
    fi
  fi

  # HTML log: here 's' wins over 'n'. The spacer row holds the &nbsp; entity rather than a bare
  # blank, so the cell keeps its height and the file needs no particular character encoding.
  if [[ "$targets" == *h* ]]; then
    if (( extraNewline )); then
      printf '%s<tr><td>&nbsp;</td></tr>\n' "$text" >> "$LOG_PATH_HTM"
    elif (( noNewline )); then
      printf '%s\n' "$text" >> "$LOG_PATH_HTM"
    else
      printf '%s<br />\n' "$text" >> "$LOG_PATH_HTM"
    fi
  fi
}
403 |
# Pluralize the string in parameter 1 if the number in parameter 2 is not 1. Nouns ending in
# "x", "s", "sh" or "ch" take "es" ("suffix" -> "suffixes", "address" -> "addresses"); every
# other noun takes a plain "s". The result is written to stdout.
function pluralCheckNoun()
{
  local noun="$1" count="$2"
  if [ "$count" -ne 1 ]; then
    if [[ "$noun" =~ (x|s|sh|ch)$ ]]; then
      printf '%s\n' "${noun}es"
    else
      printf '%s\n' "${noun}s"
    fi
  else
    printf '%s\n' "$noun"
  fi
}
417 |
# Output "is" if parameter 1 is 1, otherwise "are"
function pluralCheckIs()
{
  local verb="is"
  if [ "$1" -ne 1 ]; then
    verb="are"
  fi
  echo "$verb"
}
427 |
# Output "was" if parameter 1 is 1, otherwise "were"
function pluralCheckWas()
{
  local verb="was"
  if [ "$1" -ne 1 ]; then
    verb="were"
  fi
  echo "$verb"
}
437 |
# Output "a " if parameter 1 is 1, otherwise nothing
function pluralCheckA()
{
  # Any count other than 1 needs no article; leave quietly and successfully
  [ "$1" -eq 1 ] || return 0
  echo "a "
}
445 |
# Output "an " if parameter 1 is 1, otherwise nothing
function pluralCheckAn()
{
  # Any count other than 1 needs no article; leave quietly and successfully
  [ "$1" -eq 1 ] || return 0
  echo "an "
}
453 |
# Upload the reports using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
# reports being saved to disk have already been closed.
# Reports the outcome for each of the HTM, RTF and TXT reports on the console.
function uploadReport()
{
  valPrint c "Uploading reports..."

  SFTP_PORT_MARKER="port:"
  SFTP_PATH_MARKER="path:"

  local uploadStatus suffixUpper
  for SUFFIX in htm rtf txt; do
    # NOTE(review): no upload command is present in this loop, so the status captured here
    # belongs to whatever ran last -- confirm where the transfer is meant to be invoked.
    # The status is saved at once because the test below would otherwise overwrite $? before
    # the error message could show it.
    uploadStatus=$?

    if [ "$uploadStatus" -ne 0 ]; then
      valPrint c "Error $uploadStatus occurred when attempting to upload $LOG_NAME.$SUFFIX!"
    else
      # The character classes are quoted so they cannot be expanded as file name patterns
      suffixUpper=$(printf '%s' "$SUFFIX" | tr '[:lower:]' '[:upper:]')
      valPrint c "Report in $suffixUpper format was uploaded."
    fi
  done
}
484 |
# Prints session summary when script is done. Closes the link table, derives the summary totals
# from the running counters, writes the summary to the console and the three reports, closes the
# RTF and HTML markup, uploads the reports when --upload was given and the run was not canceled,
# and exits with status 0. It is also installed below as the handler for the INT signal.
function wrapupAndExit()
{
  # Get off progress line on console, drop down a line from last link in log, and close HTML table
  valPrint ctr ""
  valPrint h "</table><br />"

  # If we didn't finish processing the last URL, then the iterator is one too high
  if [ "$FINISHED_LIST" != "yes" ]; then
    LINK_NUM=$((LINK_NUM - 1))
    if [ "$FINISHED_LIST" == "no" ]; then
      valPrint ctrh "The session was canceled by the user."
    fi
  fi

  # Generate string with elapsed time
  END_RUN=$(date +%s)
  local elapsedSecs=$((END_RUN - START_RUN))
  ELAPSED="$((elapsedSecs / 60)) min. $((elapsedSecs % 60)) sec. elapsed"

  # Do some math on results of session. Each total is the sum of the counters that the summary
  # lists beneath it, so the numbers in the report add up. Names that were never set count as 0
  # in shell arithmetic, which keeps the numeric tests below valid even on an early abort.
  LINK_COUNT=${LINK_COUNT:-0}
  LINKS_PROCESSED=$((LINK_NUM - URL_START + 1))
  if [ "$LINKS_PROCESSED" -lt 0 ]; then
    LINKS_PROCESSED=0 # the run ended before the first link was reached
  fi
  # NOTE(review): the counters for trivial redirections are not visible in this part of the
  # script; the three names below are assumed -- confirm against the main loop. A wrong guess
  # only hides the "trivial redirections" line.
  TRIVIAL_RDS=$((SKIP_SLASH_ADD + SKIP_HTTPS_UP + SKIP_YOUTU_BE))
  LINK_ERRORS=$((SKIP_UNK_NS + SKIP_JS_PAGE + SKIP_BAD_URL + SKIP_NON_ASCII + SKIP_UNK_SUFFIX + SKIP_UNK_CODE))
  LINKS_EXCEPTED=$((SKIP_EXPECT_NG + SKIP_EXPECT_RD + SKIP_EXPECT_EI + SKIP_EXPECT_IW))
  LINK_PROBLEMS_TOTAL=$((NG_LINKS + RD_LINKS + EI_LINKS + IW_LINKS))
  LINK_PROBLEMS_NG=$((NG_LINKS - SKIP_EXPECT_NG))
  LINK_PROBLEMS_RD=$((RD_LINKS - SKIP_EXPECT_RD))
  LINK_PROBLEMS_EI=$((EI_LINKS - SKIP_EXPECT_EI))
  LINK_PROBLEMS_IW=$((IW_LINKS - SKIP_EXPECT_IW))
  LINK_PROBLEMS_NET=$((LINK_PROBLEMS_NG + LINK_PROBLEMS_RD + LINK_PROBLEMS_EI + LINK_PROBLEMS_IW))
  # Arithmetic leaves these untouched when they are already set; when they never were, give them
  # the value 0 for the numeric tests and messages below
  SKIP_ARCHIVE_ORG=$((SKIP_ARCHIVE_ORG + 0))
  SKIP_NON_ASCII=$((SKIP_NON_ASCII + 0))
  SKIP_UNK_SUFFIX=$((SKIP_UNK_SUFFIX + 0))
  SKIP_UNK_CODE=$((SKIP_UNK_CODE + 0))
  SKIP_EXPECT_NG=$((SKIP_EXPECT_NG + 0))
  SKIP_EXPECT_RD=$((SKIP_EXPECT_RD + 0))
  SKIP_EXPECT_EI=$((SKIP_EXPECT_EI + 0))
  SKIP_EXPECT_IW=$((SKIP_EXPECT_IW + 0))

  # Print something in the Links section if no link issues were printed
  if [ "$LINK_PROBLEMS_NET" -eq 0 ]; then
    valPrint h "<i>No link problems to report! See the <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for a list of links with issues that were not reported.</i>"
  fi
  if [ "$LINK_PROBLEMS_TOTAL" -eq 0 ]; then
    valPrint t "No link problems to report!"
    valPrint r "\i1 No link problems to report! \i0"
  fi

  ## SUMMARY OUTPUT ##
  valPrint ct "Summary ($ELAPSED):"
  valPrint r "\b1 Summary \b0 ($ELAPSED)"
  valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
  valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT) (there $(pluralCheckWas $FILE_LINKS) $FILE_LINKS file $(pluralCheckNoun link $FILE_LINKS) and $PAGE_LINKS page $(pluralCheckNoun link $PAGE_LINKS))."

  # Print processed link totals. The indented sub-notes are written separately for HTML, where
  # the indent must be spelled with &nbsp; because a browser collapses leading blanks.
  if [ "$LINKS_PROCESSED" -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
  if [ "$LINK_ERRORS" -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
  if [ "$SKIP_ARCHIVE_ORG" -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVE_ORG Archive.org $(pluralCheckNoun link $SKIP_ARCHIVE_ORG) were not checked"; fi
  if [ "$LINK_PROBLEMS_TOTAL" -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_TOTAL processed $(pluralCheckNoun link $LINK_PROBLEMS_TOTAL) had $(pluralCheckAn $LINK_PROBLEMS_TOTAL)$(pluralCheckNoun issue $LINK_PROBLEMS_TOTAL)"; fi
  if [ "$LINKS_EXCEPTED" -gt 0 ]; then
    valPrint ctr " (excepted $LINKS_EXCEPTED link $(pluralCheckNoun issue $LINKS_EXCEPTED) from report)"
    valPrint h "&nbsp;&nbsp;(excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"
  fi
  if [ "$OK_LINKS" -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi
  if [ "$TRIVIAL_RDS" -gt 0 ]; then
    valPrint ctr " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"
    valPrint h "&nbsp;&nbsp;(counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"
  fi

  # Print errored link totals
  if [ "$LINK_ERRORS" -gt 0 ]; then
    valPrint c "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"
    valPrint h "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
    valPrint rt "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS):"
  fi
  if [ "$SKIP_UNK_NS" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
  if [ "$SKIP_JS_PAGE" -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
  if [ "$SKIP_BAD_URL" -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
  if [ "$SKIP_NON_ASCII" -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
  if [ "$SKIP_UNK_SUFFIX" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
  if [ "$SKIP_UNK_CODE" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi

  # Print excepted link totals
  if [ "$LINKS_EXCEPTED" -gt 0 ]; then
    valPrint c "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"
    valPrint h "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
    valPrint rt "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted:"
  fi
  if [ "$SKIP_EXPECT_NG" -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
  if [ "$SKIP_EXPECT_RD" -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_RD/$RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
  if [ "$SKIP_EXPECT_EI" -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
  if [ "$SKIP_EXPECT_IW" -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi

  # Print checked link totals
  if [ "$LINK_PROBLEMS_NET" -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS_NET link $(pluralCheckNoun issue $LINK_PROBLEMS_NET):"; fi
  if [ "$LINK_PROBLEMS_NG" -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_NG NG $(pluralCheckNoun link $LINK_PROBLEMS_NG)"; fi
  if [ "$LINK_PROBLEMS_RD" -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_RD $(pluralCheckNoun redirection $LINK_PROBLEMS_RD)"; fi
  if [ "$LINK_PROBLEMS_EI" -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_EI $(pluralCheckNoun link $LINK_PROBLEMS_EI) that could be intrawiki"; fi
  if [ "$LINK_PROBLEMS_IW" -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_IW $(pluralCheckNoun link $LINK_PROBLEMS_IW) that could be interwiki"; fi

  # Close the log files' markup
  valPrint trh "ValExtLinks says goodbye."
  printRTFfooter
  printHTMfooter

  # Upload report if this was requested
  if [ -n "$UPLOAD_INFO" ] && [ "$FINISHED_LIST" != "no" ]; then
    uploadReport
  fi

  # Really quit now
  valPrint c "ValExtLinks says goodbye."
  exit 0
}
# A Ctrl-C during the run still produces the summary and closes the reports
trap wrapupAndExit INT
586 |
587 |
588 | ### INITIALIZATION ###
# Announce the run on the console in bold, then open each of the three reports with its header
valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
for HEADER_PRINTER in printTXTheader printRTFheader printHTMheader; do
  "$HEADER_PRINTER"
done
unset -v HEADER_PRINTER

## DATA SOURCING ##
# Section heading, written in each report's own markup
valPrint hn "<h3>Startup</h3>"
valPrint r "\b1 Startup \b0"
valPrint t "Startup:"
599 |
# Attempt to download file at LINKS_URL, then check that it succeeded
valPrint cwtrhn "Downloading list of external links from $LINKS_URL..."
# The file keeps the name it has in the URL (everything after the last slash)
LINKS_FILE_NAME=${LINKS_URL##*/}
# The download is saved inside this session's output folder; this is the path the rest of the
# script reads the links from
LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
# The URL is quoted so that characters such as '?' or '*' in it are never treated as a pattern
curl --silent -o "$LINKS_FILE" "$LINKS_URL"
if [ ! -f "$LINKS_FILE" ]; then
  echo -e "\nThe download of $LINKS_URL appears to have failed. Aborting."
  wrapupAndExit
else
  valPrint ctrh " success."
fi
611 |
# Attempt to download file at EXCEPT_URL, then check that it succeeded
if [ -n "$EXCEPT_URL" ]; then
  valPrint cwtrhn "Downloading list of reporting exceptions from $EXCEPT_URL..."
  # The URL is quoted so that characters such as '?' or '*' in it are never treated as a pattern
  EXCEPT_DATA=$(curl --silent "$EXCEPT_URL")
  if [ -z "$EXCEPT_DATA" ]; then
    echo -e "\nThe download of the exceptions data from '$EXCEPT_URL' appears to have failed. Aborting."
    wrapupAndExit
  else
    valPrint ctrh " success."
  fi

  # Store on disk for debugging purposes, inside this session's output folder
  EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
  printf '%s\n' "$EXCEPT_DATA" > "$EXCEPT_FILE"

  # Transfer to array for easy searching later: one element per non-empty line. Reading line by
  # line keeps '*', '?' and '[' in an entry literal; building the array from an unquoted
  # expansion would let them match file names.
  declare -a EXCEPT_ARRAY=()
  while IFS= read -r EXCEPT_LINE; do
    if [ -n "$EXCEPT_LINE" ]; then
      EXCEPT_ARRAY+=("$EXCEPT_LINE")
    fi
  done <<< "$EXCEPT_DATA"
  unset -v EXCEPT_LINE
fi
632 |
# Feed the file to 'wc' on its standard input; given the file as an argument instead, 'wc' would
# print the file name after the line count
LINK_COUNT_STRING=$(wc -l < "$LINKS_FILE")

# Strip any padding from the count, then subtract one: the first line of the CSV is the column
# header row, so the number of URLs is the number of lines minus one
LINK_COUNT=$(( $(printf '%s' "$LINK_COUNT_STRING" | tr -d '[:space:]') - 1 ))
valPrint ctrh "Found $LINK_COUNT links to process."
valPrint trh ""
641 |
## CONFIG OUTPUT ##
# Prints the label in parameter 1 without a line break, followed by parameter 3 when the flag in
# parameter 2 is 1, or by parameter 4 otherwise. Both parts go to the console and all reports.
function printConfigAnswer()
{
  valPrint ctrhn "$1"
  if [ "$2" -eq 1 ]; then
    valPrint ctrh "$3"
  else
    valPrint ctrh "$4"
  fi
}

valPrint t "Config:"
valPrint r "\b1 Config \b0"
valPrint hn "<h3>Config</h3>"

# How many links will be looked at, and which range of the list when it is not the whole list
valPrint ctrhn "Links to consider: "
if [ $URL_LIMIT -ne 0 ] && [ $URL_LIMIT -lt $LINK_COUNT ]; then
  valPrint ctrh "$((URL_LIMIT-URL_START+1)), from link $URL_START to $URL_LIMIT"
elif [ $URL_START -ne 1 ]; then
  valPrint ctrh "$((LINK_COUNT-URL_START+1)), from link $URL_START to $LINK_COUNT"
else
  valPrint ctrh "$LINK_COUNT"
fi

valPrint ctrh "Site query timeout: $TIMEOUT seconds"

# Options that are switched on by their flag
printConfigAnswer "Show OK links: " "$RECORD_OK_LINKS" "Yes" "No"
printConfigAnswer "Take screenshots: " "$TAKE_PAGE_SHOT" "Yes" "No"
printConfigAnswer "Suggest archive.org snapshots for NG pages: " "$SUGGEST_SNAPSHOTS_NG" "Yes" "No"
printConfigAnswer "Suggest archive.org snapshots for OK pages: " "$SUGGEST_SNAPSHOTS_OK" "Yes" "No"

# These three are worded as "Ignore ...", so a set SHOW_* flag means the answer is "No"
printConfigAnswer "Ignore slash-adding redirects: " "$SHOW_SLASH" "No" "Yes"
printConfigAnswer "Ignore HTTPS-upgrading redirects: " "$SHOW_HTTPS" "No" "Yes"
printConfigAnswer "Ignore youtu.be redirects: " "$SHOW_YT_RD" "No" "Yes"

printConfigAnswer "Check archive.org links: " "$CHECK_ARCHIVE_LINKS" "Yes" "No"

# Point the reader at the summary; only the HTML report can link to it
valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
valPrint trh ""
685 |
## LEGEND OUTPUT ##
# Prints one legend entry that refers to a code reference page. Parameter 1 is the entry text and
# parameter 2 the URL of the page: the TXT report spells the URL out, while the RTF and HTML
# reports turn the word "here" into a link.
function printLegendCodeRef()
{
  valPrint t "$1 (see here for code reference: $2)"
  valPrint r "$1 (see {\field{\*\fldinst{HYPERLINK \"$2\"}}{\fldrslt here}} for code reference)"
  valPrint h "$1 (see <a href=\"$2\" target=\"_blank\">here</a> for code reference)"
}

valPrint t "Legend:"
valPrint r "\b1 Legend \b0"
valPrint hn "<h3>Legend</h3>"
valPrint t "(For guidance in fixing these links, see $WIKI_MAIN.)"
valPrint r "(For guidance in fixing these links, see {\field{\*\fldinst{HYPERLINK \"$WIKI_MAIN\"}}{\fldrslt here}}.)"
valPrint h "(For guidance in fixing these links, see <a href=\"$WIKI_MAIN\" target=\"_blank\">here</a>.)"

# The two-letter result labels
for LEGEND_LINE in \
  "OK = URL seems to be working" \
  "NG = URL no longer seems to work" \
  "RD = URL is redirecting to this new URL" \
  "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup" \
  "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup"; do
  valPrint trh "$LEGEND_LINE"
done

# The bracketed codes, each with its reference page
printLegendCodeRef "(xxx) = Unix tool 'curl' obtained this HTTP response status code" "$WIKI_HTTP"
printLegendCodeRef "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code" "$WIKI_CURL"

# The Internet Archive hints, then a blank line to end the section
for LEGEND_LINE in \
  "IA suggests = Last available snapshot returned by the Internet Archive" \
  "Try browsing = The Archive failed to return a snapshot URL, so check for a snapshot manually using this link" \
  ""; do
  valPrint trh "$LEGEND_LINE"
done
unset -v LEGEND_LINE
707 |
708 |
### MAIN LOOP ###
# Section heading for the per-link results, written in each report's own markup
valPrint hn "<h3>Links</h3>"
valPrint r "\b1 Links \b0"
valPrint t "Links:"
# Note when link processing began (seconds since the epoch) for the elapsed time in the summary
START_RUN="$(date +%s)"
714 | # Process each line of the .csv in LINKS_FILE
715 | for LINE in `cat "$LINKS_FILE"`; do
716 | START_LINK=$(date +%s)
717 | let LINK_NUM+=1
718 |
# The first row of the CSV is the column header, so verify that the export format hasn't changed
# before trusting the field positions used further down. This check runs only until the header has
# been consumed, which is recorded in SKIPPED_HEADER_ROW.
if [ "$SKIPPED_HEADER_ROW" -eq 0 ]; then
   if [ "$LINE" == "namespace,title,target" ]; then
      SKIPPED_HEADER_ROW=1 # without this, the next row would be tested as a header too and abort the run
      LINK_NUM=0 # this line is not a link, so reset the link counter
      valPrint hn "<table>"
      continue
   else
      valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
      wrapupAndExit
   fi
fi
731 |
# Ignore every link that comes before the requested starting position, URL_START
[ $LINK_NUM -lt $URL_START ] && continue

# A positive URL_LIMIT caps the run for testing purposes; wrap up once we have gone past it
if [ $URL_LIMIT -gt 0 ]; then
   if [ $LINK_NUM -gt $URL_LIMIT ]; then
      FINISHED_LIST="limit"
      wrapupAndExit
   fi
fi

# Show progress on screen. From the second link onward, first emit the cursor-up sequence so that
# the new message appears in place of the previous one.
[ $LINK_NUM -gt 1 ] && printf "\e[1A\n"
valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
748 |
# The number of the namespace is the element before the first comma on the line
NS_ID=${LINE%%,*}

# Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES (the two arrays
# are parallel). Only an integer ID is handed to the numeric comparison: "NULL", an empty field or
# any other garbage can never match a namespace, and feeding it to -eq would only produce an error.
NS_NAME=""
if [[ $NS_ID =~ ^-?[0-9]+$ ]]; then
   for ((a = 0; a < ${#NS_IDS[@]}; ++a)); do
      if [ "$NS_ID" -eq "${NS_IDS[$a]}" ]; then
         NS_NAME="${NS_NAMES[$a]}"
         break
      fi
   done
fi

# No name found: report why, count the line as skipped and move on to the next one
if [ -z "$NS_NAME" ]; then
   if [ "$NS_ID" == "NULL" ]; then
      valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace (and probably the page too) is 'NULL'. Probably the link is no longer in existence on the wiki."
   else
      valPrint trs "Skipping line $LINK_NUM ('$LINE') because I could not find a name for namespace ID $NS_ID."
   fi
   let SKIP_UNK_NS+=1
   let PAGE_LINKS+=1
   continue
fi
774 |
775 | # The name of the page is everything between the namespace ID and the next comma on the line (commas
776 | # in page names will break this)
778 | PAGE_NAME=${PAGE_NAME%%,*}
779 |
780 | # Build longer wiki page URLs from namespace and page names
783 | # Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it
784 | # explicitly breaks the link
785 | if [ $NS_ID -eq 0 ]; then
788 | fi
789 |
# We don't want to consider wiki pages ending in .js, as the MW parser cannot reliably isolate URLs
# in JavaScript code, so it returns erroneous links. The test is made against the literal ".js"
# ending of the page name; comparing "whatever follows the last period" against "js" would also
# catch a page that is simply titled "js" and contains no period at all.
PAGE_NAME_SUFFIX=${PAGE_NAME##*.}
if [[ $PAGE_NAME == *.js ]]; then
   # The prefix being stripped is quoted so that glob characters in a page name are taken literally
   valPrint trs "Skipping URL '${LINE#"$NS_ID,$PAGE_NAME,"}' on line $LINK_NUM because it was found on JavaScript page '$LOCAL_PAGE_PATH'."
   let SKIP_JS_PAGE+=1
   let PAGE_LINKS+=1
   continue
fi
799 |
800 | # The URL being linked to is everything after the previous two fields (this allows commas to be in
801 | # the URLs, but a comma in the previous field, the page name, will break this)
803 |
# Reject any URL that contains one of the characters listed in ILLEGAL_CHARS
case $URL in
   *[$ILLEGAL_CHARS]*)
      valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because it contains characters illegal in a URL."
      SKIP_BAD_URL=$((SKIP_BAD_URL + 1))
      PAGE_LINKS=$((PAGE_LINKS + 1))
      continue
      ;;
esac

# Wayback Machine links are only evaluated when CHECK_ARCHIVE_LINKS is turned on
if [[ $URL == *web.archive.org* ]]; then
   if [ $CHECK_ARCHIVE_LINKS -eq 0 ]; then
      valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to check Wayback Machine links."
      SKIP_ARCHIVE_ORG=$((SKIP_ARCHIVE_ORG + 1))
      PAGE_LINKS=$((PAGE_LINKS + 1))
      continue
   fi
fi
819 |
# Now we need to know if the URL is for a file or a web page. First step is to determine if the
# URL ends in a suffix
HAS_SUFFIX=0

# If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
CLEAN_URL=${URL%%\?*}

# If the URL ends in something like "#section_15", strip everything from the '#' onward
CLEAN_URL=${CLEAN_URL%%\#*}

# URLs containing non-ASCII characters are not evaluated; skip them and make the user check them
if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
   valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I cannot handle non-ASCII characters."
   let SKIP_NON_ASCII+=1
   let PAGE_LINKS+=1
   continue
fi

# Isolate the characters after the last period and after the last slash. Parameter expansion is used
# instead of piping through 'sed' so that no subprocess is needed; if the character is absent, the
# whole string is kept, exactly as the 'sed' substitution would leave it.
POST_DOT=${CLEAN_URL##*.}
POST_SLASH=${CLEAN_URL##*/}

# If the last period comes after the last slash, then the URL ends in a suffix. ${#var} gives the
# true length; passing the text through "awk -v" would expand backslash escapes before counting.
POST_DOT_LENGTH=${#POST_DOT}
POST_SLASH_LENGTH=${#POST_SLASH}
if [ "$POST_DOT_LENGTH" -lt "$POST_SLASH_LENGTH" ]; then
   HAS_SUFFIX=1
else
   HAS_SUFFIX=0
fi
850 |
# Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
# known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES. IS_FILE ends up as 1 (file),
# 0 (page) or stays at -1 (ending not recognized).
IS_FILE=-1
if [ "$HAS_SUFFIX" -eq 0 ]; then
   IS_FILE=0
else
   # Turn off case sensitivity while we compare suffixes
   shopt -s nocasematch

   # Special cases that mark the end of a web page URL rather than a file name:
   # - a suffix that is all numbers, e.g. "productID.304297400"
   # - a suffix ending in a parens, e.g. "ms537113(v=vs.85)"
   # - a suffix containing a '%', e.g. "10.1007%2FBF00329055"
   if [[ $POST_DOT =~ ^-?[0-9]+$ ]] || [[ $POST_DOT =~ ^.*[\(\)]$ ]] || [[ $POST_DOT == *%* ]]; then
      IS_FILE=0
   fi

   # If we did not identify this URL as a web page above, we need to compare the suffix against known
   # file extensions. The suffix comes from the URL, so it is quoted to be compared as plain text and
   # not as a glob pattern; double brackets still respect the nocasematch setting.
   if [ "$IS_FILE" -eq -1 ]; then
      for EXTENSION in "${HTTP_FILES[@]}"; do
         if [[ $EXTENSION == "$POST_DOT" ]]; then
            IS_FILE=1
            break
         fi
      done
   fi

   # If we did not identify this URL as a file above, we need to compare the suffix against known
   # pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
   # needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
   if [ "$IS_FILE" -eq -1 ]; then
      for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
         if [[ $PAGE_OR_TLD == "$POST_DOT" ]]; then
            IS_FILE=0
            break
         fi
      done
   fi

   # Turn case sensitivity back on in Bash
   shopt -u nocasematch
fi
902 |
# Turn the classification into the type string printed in the reports and bump the matching
# counter. An ending that was recognized as neither file, page nor TLD is reported and skipped.
STR_TYPE=""
case $IS_FILE in
   -1)
      valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I encountered the unknown URL ending '$POST_DOT'. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
      SKIP_UNK_SUFFIX=$((SKIP_UNK_SUFFIX + 1))
      continue
      ;;
   1)
      STR_TYPE="file"
      FILE_LINKS=$((FILE_LINKS + 1))
      ;;
   *)
      STR_TYPE="page"
      PAGE_LINKS=$((PAGE_LINKS + 1))
      ;;
esac
916 |
# Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
# issue with sites that require HTTPS. AGENT must be in double quotes: inside single quotes the
# shell does not expand it, and the server would be sent the literal text "$AGENT" as the user
# agent. URL is quoted so that characters such as '?', '*' and '[' are not treated as a glob.
CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time "$TIMEOUT" --retry 2 --write-out '%{http_code}\n' "$URL")
# Exit status of the command substitution above, i.e. of 'curl' itself
CURL_ERR=$?
922 |
923 | # Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
924 | if [ $CURL_CODE == "000" ]; then
926 | fi
927 |
928 | # Begin to determine our status code for this URL (EI, IW, OK, RD, or NG)
929 | STATUS="??"
930 | NEW_URL=""
932 |
933 | # First make sure that this isn't an "external internal" link to our own wiki that can be replaced
934 | # by "[[page_name]]". If it uses a special access URL beginning with "/w/", let it pass, as it
935 | # probably cannot be replaced by "[[ ]]" markup
936 | if [[ $URL == *$WIKI_PATH* ]] && [[ $URL != *$WIKI_PATH/w/* ]]; then
937 | STATUS="EI"
938 | let EI_LINKS+=1
939 | fi
940 |
941 | # If it's not, check if this is a link to a domain that we have an interwiki prefix for (also make
942 | # sure that it's not an archive.org link to a page from an interwiki domain)
943 | if [ $STATUS == "??" ] && [[ $URL != *web.archive.org* ]]; then
944 | for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
945 | if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]] && [[ $URL != *${INTERWIKI_DOMAINS[$i]}/w/* ]]; then
946 | STATUS="IW"
947 | let IW_LINKS+=1
949 | break
950 | fi
951 | done
952 | fi
953 |
# If we didn't match an interwiki domain, see if the status code is in our "OK" codes list.
# STATUS is quoted because its placeholder value "??" is itself a glob pattern: left unquoted, it
# would expand to any two-character file names in the current directory and break the test.
if [ "$STATUS" == "??" ]; then
   for CODE in "${OK_CODES[@]}"; do
      if [[ $CODE == "$CURL_CODE" ]]; then
         STATUS="OK"
         let OK_LINKS+=1

         # If this is a YouTube link, we have to look at the actual page source to know if the video
         # is good or not; override the link's info if it's actually NG. AGENT is double-quoted so
         # that its value, not the literal text "$AGENT", is sent as the user agent.
         if [[ $URL == *www.youtube.com* ]]; then
            PAGE_TEXT=$(curl --silent --insecure --user-agent "$AGENT" --max-time "$TIMEOUT" "$URL" | grep "\"simpleText\":\"Video unavailable\"")
            if [ -n "$PAGE_TEXT" ]; then
               STATUS="NG"
               CURL_RESULT=404
               let OK_LINKS-=1
               let NG_LINKS+=1
            fi
         fi
         break
      fi
   done
fi
976 |
# If we didn't get a match with the "OK" codes, check it against the "RD" codes. STATUS is quoted
# because its placeholder value "??" is itself a glob pattern: left unquoted, it would expand to
# any two-character file names in the current directory and break the test.
if [ "$STATUS" == "??" ]; then
   for CODE in "${RD_CODES[@]}"; do
      if [[ $CODE == "$CURL_CODE" ]]; then
         # Get URL header again in order to retrieve the URL we are being redirected to. AGENT is
         # double-quoted so that its value, not the literal text "$AGENT", is sent as the user agent.
         NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time "$TIMEOUT" --write-out '%{redirect_url}\n' "$URL")

         # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
         # those changes out if the user didn't ask for them
         URL_HTTP=${URL/#https:/http:}
         NEW_URL_HTTP=${NEW_URL/#https:/http:}

         # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
         NEW_URL_LENGTH=${#NEW_URL_HTTP}
         if [ "$NEW_URL_LENGTH" -lt "$MIN_URL_LENGTH" ]; then
            NEW_URL_HTTP="[new URL not retrieved]"
         fi

         # Remove slash at end of new URL, if present, so we can filter out the redirects that
         # merely add an ending slash if the user didn't ask for them
         NEW_URL_NO_SLASH=${NEW_URL_HTTP%/}

         # Detect if this is a youtu.be link simply being expanded by YouTube to the full
         # youtube.com address
         YOUTU_BE=0
         if [[ $URL_HTTP == http://youtu.be* ]] && [[ $NEW_URL_HTTP == http://www.youtube.com* ]]; then
            YOUTU_BE=1
         fi

         # If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
         # wants those to be reported). The operands are quoted because NEW_URL_HTTP may hold the
         # bracketed placeholder set above, which would otherwise be treated as a glob.
         if [ "$SHOW_HTTPS" -eq 0 ] && [ "$URL_HTTP" == "$NEW_URL_HTTP" ]; then
            valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show http->https upgrades, and I was redirected to '$NEW_URL'."
            STATUS="OK"
            let OK_LINKS+=1
            let SKIP_HTTPS_UP+=1
         # If the URLs match besides an added ending slash, then the link is OK (unless user wants
         # those to be reported)
         elif [ "$SHOW_SLASH" -eq 0 ] && [ "$URL_HTTP" == "$NEW_URL_NO_SLASH" ]; then
            valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show added trailing slashes, and I was redirected to '$NEW_URL'."
            STATUS="OK"
            let OK_LINKS+=1
            let SKIP_SLASH_ADD+=1
         elif [ "$YOUTU_BE" -eq 1 ]; then
            # We have to look at the actual page source to know if a YouTube video is good or not
            PAGE_TEXT=$(curl --silent --insecure --user-agent "$AGENT" --max-time "$TIMEOUT" "$NEW_URL" | grep "\"simpleText\":\"Video unavailable\"")
            if [ -n "$PAGE_TEXT" ]; then
               STATUS="NG"
               let NG_LINKS+=1
            else
               if [ "$SHOW_YT_RD" -eq 0 ]; then
                  valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'."
                  STATUS="OK"
                  let OK_LINKS+=1
                  let SKIP_YOUTU_BE+=1
               else
                  STATUS="RD"
                  let RD_LINKS+=1
               fi
            fi
         else
            STATUS="RD"
            let RD_LINKS+=1
         fi
         break
      fi
   done
fi
1045 |
# If we didn't get a match with the "RD" codes, check it against the "NG" codes. STATUS is quoted
# because its placeholder value "??" is itself a glob pattern: left unquoted, it would expand to
# any two-character file names in the current directory and break the test.
if [ "$STATUS" == "??" ]; then
   for CODE in "${NG_CODES[@]}"; do
      if [[ $CODE == "$CURL_CODE" ]]; then
         STATUS="NG"
         let NG_LINKS+=1
         break
      fi
   done
fi

# If we didn't match a known status code, advise the reader
if [ "$STATUS" == "??" ]; then
   valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I encountered the unknown response code $CURL_CODE."
   let SKIP_UNK_CODE+=1
   continue
fi
1063 |
1064 | # Check problem links against exceptions list before proceeding
1066 | if [ $STATUS != "OK" ] && [ ! -z $EXCEPT_URL ]; then
1067 | # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
1069 | if [ $STATUS == "EI" ]; then
1071 | elif [ $STATUS == "IW" ]; then
1073 | fi
1074 |
1075 | # Look for link in exceptions list and make sure the listed result code and wiki page also match
1076 | for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
1077 | {
1079 |
# Undo any HTML-encoding from the wiki page; for now we just worry about the ampersand, as most
# other HTML-encoded characters are not found in URLs. The pattern is the entity "&amp;" and the
# replacement is a literal '&' (escaped, since a bare '&' in a 'sed' replacement means "the match").
EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/&amp;/\&/g')
1083 |
1084 | # Match URL
1086 | EXCEPT_URL="${EXCEPT_URL%,*}"
1087 | if [ "$EXCEPT_URL" != "$URL" ]; then
1088 | continue
1089 | fi
1090 |
1091 | # Match containing page's name
1094 | if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == $LOCAL_PAGE_PATH ]; then
1095 | # Match result code
1097 | if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
1098 | valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because its expected result, '$EXPECT_CODE', is in the exceptions list."
1099 | if [ $STATUS == "EI" ]; then
1100 | let SKIP_EXPECT_EI+=1
1101 | elif [ $STATUS == "IW" ]; then
1102 | let SKIP_EXPECT_IW+=1
1103 | elif [ $STATUS == "RD" ]; then
1104 | let SKIP_EXPECT_RD+=1
1105 | else
1106 | let SKIP_EXPECT_NG+=1
1107 | fi
1109 | break
1110 | fi
1111 | fi
1112 | } done
1113 | fi
1114 | if [ $FOUND_EXCEPT -eq 1 ]; then
1115 | continue
1116 | fi
1117 |
1118 | # If appropriate, record this link to the log, with clickable URLs when possible
1119 | if [ $STATUS != "OK" ] || [ $RECORD_OK_LINKS -eq 1 ]; then
1120 | # Prepare 'curl' result in parentheses to print after status code, unless this is an "IW" or "EI"
1121 | # link, in which case showing the status code doesn't make sense. Adjust spacing after string to
1122 | # ensure TXT and RTF reports have aligned columns of results.
1125 | CURL_STR_R="$CURL_STR_H "
1126 | if [ $STATUS == "IW" ] || [ $STATUS == "EI" ]; then
1127 | CURL_STR_H=""
1128 | CURL_STR_T=" "
1129 | CURL_STR_R=" "
1130 | fi
1131 |
1132 | # Record link and its wiki page in TXT, RTF, and HTML markup
1133 | valPrint t "${STATUS}${CURL_STR_T} $STR_TYPE $URL"
1134 | valPrint t " linked from $FULL_PAGE_PATH"
1135 | valPrint r "${STATUS}${CURL_STR_R}${RTF_TABS}$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
1136 | valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
1137 | valPrint hn "<tr><td style=\"white-space:nowrap\">${STATUS}${CURL_STR_H}</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
1138 | valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"
1139 |
1140 | # Place vertical space here since we won't be printing anything more about this link
1141 | if [ $STATUS == "OK" ] && [ $SUGGEST_SNAPSHOTS_OK -eq 0 ]; then valPrint tr ""; valPrint hs ""; fi
1142 |
1143 | # Record redirect URL if one was given by a 3xx response page
1144 | if [ $STATUS == "RD" ]; then
1145 | valPrint ts " Server suggests $NEW_URL"
1146 | valPrint rs " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
1147 | valPrint hs "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
1148 | fi
1149 |
1150 | # Notify reader if we can use an intrawiki link for this URL
1151 | if [ $STATUS == "EI" ]; then
1152 | INTRA_PAGE=${URL#*://*/}
1153 | valPrint ts " Just use [[$INTRA_PAGE]]"
1154 | valPrint rs " Just use [[$INTRA_PAGE]]"
1155 | valPrint hs "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
1156 | fi
1157 |
1158 | # Notify reader if we can use an interwiki prefix for this URL
1159 | if [ $STATUS == "IW" ]; then
1160 | INTER_PAGE=$(echo "$URL" | sed 's/.*\///')
1161 | valPrint ts " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
1162 | valPrint rs " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
1163 | valPrint hs "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
1164 | fi
1165 |
1166 | # Query Internet Archive for latest "OK" snapshot for "NG" page
1167 | if [[ ( $STATUS == "NG" && $SUGGEST_SNAPSHOTS_NG -eq 1 ) || ( $STATUS == "OK" && $SUGGEST_SNAPSHOTS_OK -eq 1 ) ]]; then
1168 |
1169 | # We need to watch out for the rate limit or we'll get locked out; look at how much time has
1170 | # elapsed and then wait the remainder between that and how long of a wait we think is needed
1171 | # to avoid the dreaded "Too Many Requests" response. 5 seconds is just a guess.
1172 | CUR_TIME=$(date +%s)
1174 | if [ $WAIT_REMAINDER -gt 0 ]; then
1175 | valPrint t "Waiting $WAIT_REMAINDER second(s) to conform to Archive.org rate limit."
1176 | sleep $WAIT_REMAINDER
1177 | fi
1178 |
# Issue query to the API
ARCHIVE_QUERY=$(curl --silent --max-time "$TIMEOUT" "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")

# Notify user if we hit the rate limit and just keep going. The asterisks must stay outside the
# quotes to act as wildcards; with the whole pattern quoted, only a response consisting of the
# literal text "*Too Many Requests*" would match and the rate limit would never be noticed.
if [[ $ARCHIVE_QUERY == *"Too Many Requests"* ]]; then
   valPrint t " IA has rate-limited us!"
   valPrint r " IA has rate-limited us!"
   valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td>(hit the API rate limit!)</td></tr>"
# If a "closest" snapshot was received, inform user
elif [[ $ARCHIVE_QUERY == *\"closest\":* ]]; then
   # In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
   ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')

   # ...isolate "url" property in the response that follows the "closest" tag
   SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
   SNAPSHOT_URL=${SNAPSHOT_URL#*\"url\": \"} # everything after '"url": "'
   SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'

   # Remove the port 80 part that IA often adds to the URL, as it's superfluous. The match requires
   # a following slash or the end of the URL so that a port such as ":8080" is left intact.
   SNAPSHOT_URL=$(echo "$SNAPSHOT_URL" | sed -E 's#:80(/|$)#\1#')

   # Inform the user of the snapshot URL
   valPrint ts " IA suggests $SNAPSHOT_URL"
   valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
   valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
else # Otherwise give a generic Wayback Machine link for this URL, which might work
   valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
   valPrint rs " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
   valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
fi
1209 | fi
1210 | fi
1211 |
1212 | # If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
1213 | if [ $IS_FILE -eq 0 ] && [ $TAKE_PAGE_SHOT -eq 1 ] && [ $STATUS == "OK" ]; then
1214 | # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
1215 | SHOT_NAME=$(echo "$URL" | sed 's/https*\:\/\///' | sed 'y/:\//__/')
1217 |
1218 | # Don't take screenshot if we already encountered this page and screenshotted it
1219 | if [ ! -f "$SHOT_FILE" ]; then
1220 | "$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 $URL > /dev/null 2>&1
1221 | if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
1223 | else
1224 | valPrint trhs "Screenshot of URL $URL seems to have failed!"
1225 | fi
1226 | else
1227 | valPrint trhs "Skipping screenshot of URL '$URL' because file '$SHOT_FILE' already exists."
1228 | fi
1229 | fi
1230 | done
1231 | FINISHED_LIST="yes"
1232 | wrapupAndExit