source: Validate External Links/validate_external_links.sh@ 1145

Last change on this file since 1145 was 1145, checked in by iritscen, 4 years ago

ValExtLinks: Added .do to recognized page suffixes.

File size: 51.4 KB
Line 
1#!/bin/bash
2
3# Validate External Links by Iritscen
4#
5# Validates a list of external links in CSV format. The resulting logs are produced in three formats:
6# - TXT (for easy diffing with an earlier log)
7# - RTF (for reading as a local file with clickable links)
8# - HTML (for reading as a web page)
9# Call script with "--help" argument for documentation. Also see Read Me.rtf for critical notes.
10#
11# Recommended rule:
12# |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|
13#
14# Table of contents (sections of script in order of appearance, not execution):
15# • Globals
16# • Help Output
17# • Setup
18# • Utility Functions
19# • Summary Output
20# • Initialization
21# • Data Sourcing
22# • Config Output
23# • Legend Output
24# • Main Loop
25
# Split words on newlines only, so that unquoted values containing spaces or tabs stay in one piece
IFS=$'\n'
29
30### GLOBALS ###
31# Settings -- these will be changed from their defaults by the arguments passed in to the script
32LINKS_URL="" # use 'curl' to download file with links from this location (can be file://)
33EXCEPT_URL="" # 'curl' will access this wiki page with a list of exceptions for NG results
34OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
35RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES
36SHOW_SLASH=0 # record issue when a slash is added to the end of a URL
37SHOW_HTTPS=0 # record issue when "http" is upgraded to "https"
38SHOW_YT_RD=0 # record redirection for a youtu.be URL expanding to the full URL
39SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
40CHECK_ARCHIVE_LINKS=0 # check URLs under the archive.org domain
41TAKE_PAGE_SHOT=0 # take a screenshot of each OK page
42TIMEOUT=10 # time to wait for a response when querying a site
43CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature
44URL_START=1 # start at this URL in LINKS_FILE
45URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE
46UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report
47
48# Fixed strings -- see the occurrences of these variables to learn their purpose
49AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 OPR/70.0.3728.154"
50ARCHIVE_API="http://archive.org/wayback/available"
51ARCHIVE_GENERIC="https://web.archive.org/web/*"
52ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
53CHROME_SCREENSHOT="screenshot.png"
54EXCEPT_FILE_NAME="exceptions.txt"
55EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
56WIKI_CURL="https://wiki.oni2.net/Validate_External_Links/Curl_codes"
57WIKI_HTTP="https://wiki.oni2.net/Validate_External_Links/HTTP_codes"
58WIKI_MAIN="https://wiki.oni2.net/Validate_External_Links"
59WIKI_ME="http://iritscen.oni2.net"
60THIS_DIR=$(cd $(dirname $0); pwd)
61WORKING_DIR=$(pwd)
62WIKI_PATH="wiki.oni2.net"
63
64# These are parallel arrays of the IDs and names of OniGalore's current namespaces
65declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
66declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")
67
68# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
69# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
70declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png py rar tga TRMA txt vbs wav wmv xaf xml zip)
71declare -a HTTP_TLDS_AND_PAGES=(action ars asp aspx cfm cgi com css de do full htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x)
72
73# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
74# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
75# if you add a new code.
76declare -a OK_CODES=(200 401 405 406 418 501)
77declare -a RD_CODES=(301 302 303 307 308)
78declare -a NG_CODES=(000 400 403 404 410 500 502 503 530)
79
80# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
81# transcluded text, and if the transclusion fails, then the braces show up in the URL
82ILLEGAL_CHARS="{ }"
83
84# The shortest URL possible, used for sanity-checking some URLs: http://a.co
85MIN_URL_LENGTH=11
86
87# These are parallel arrays giving the prefixes that can be used in place of normal external links to
88# some wikis and other sites
89declare -a INTERWIKI_PREFIXES=(commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies wikiversity wikivoyage wikt wp)
90declare -a INTERWIKI_DOMAINS=(commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org wikiversity.org wikivoyage.org wiktionary.org wikipedia.org)
91
92# Variables for keeping track of main loop progress and findings
93LINK_NUM=0
94EI_LINKS=0
95IW_LINKS=0
96OK_LINKS=0
97RD_LINKS=0
98NG_LINKS=0
99SKIP_UNK_NS=0
100SKIP_JS_PAGE=0
101SKIP_BAD_URL=0
102SKIP_NON_ASCII=0
103SKIP_UNK_SUFFIX=0
104SKIP_UNK_CODE=0
105SKIP_EXPECT_NG=0
106SKIP_EXPECT_RD=0
107SKIP_EXPECT_EI=0
108SKIP_EXPECT_IW=0
109SKIP_HTTPS_UP=0
110SKIP_SLASH_ADD=0
111SKIP_YOUTU_BE=0
112SKIP_ARCHIVE_ORG=0
113FILE_LINKS=0
114PAGE_LINKS=0
115SKIPPED_HEADER_ROW=0
116FINISHED_LIST="no"
117START_RUN=0
118END_RUN=0
119
120
121### HELP OUTPUT ###
# Prints the pseudo-man page to stdout. The heredoc delimiter is quoted because the page contains
# nothing that needs expanding. Keep the page text within this 80-character rule:
# 234567890123456789012345678901234567890123456789012345678901234567890123456789
printHelp() {
  cat <<'EOF'

NAME
 Validate External Links

SYNOPSIS
 validate_external_links.sh --help
 validate_external_links.sh --links URL --output DIR [--exceptions URL]
 [--record-ok-links] [--show-added-slashes] [--show-https-upgrades]
 [--show-yt-redirects] [--suggest-snapshots] [--check-archive-links]
 [--take-screenshots FILE] [--timeout NUM] [--start-url NUM]
 [--end-url NUM] [--upload FILE]

DESCRIPTION
 This script parses a list of external links found in the OniGalore wiki
 (which is dumped by the Oni2.net domain periodically in a particular
 format), validates them using the Unix tool 'curl', and produces a report
 of which links were "OK" (responded positively to an HTTP query), which
 were "RD" (responded with a 3xx redirect code), which could be "IW"
 (interwiki) links, which are "EI" (external internal) links and could be
 intrawiki links, and which were "NG" (no good; a negative response to the
 query). This report can then be automatically uploaded to the location of
 your choice. The script can also suggest Internet Archive snapshots for
 "NG" links, and take screenshots of "OK" links for visual verification by
 the reader that the page in question is the one intended to be displayed.

 You must pass this script the URL at which the list of links is found
 (--links) and the path where the directory of logs should be outputted
 (--output). All other arguments are optional.

OPTIONS
 --help Show this page.
 --links URL (required) URL from which to download the CSV
 file with external links. Note that this URL can
 be a local file if you supply a file:// path.
 --output DIR (required) Unix path to directory in which Val
 should place its reports.
 --exceptions URL In order to remove links from the report which
 Val finds an issue with but which you regard as
 OK, list those desired exceptions on a wiki page.
 See the sample file "exceptions.pdf" for the
 required format of the page. Note that this URL
 can point to a local file if you supply a path
 beginning with "file://".
 --record-ok-links Log a link in the report even if its response
 code is "OK".
 --show-added-slashes Report on redirects that simply add a '/' to the
 end of the URL.
 --show-https-upgrades Report on redirects that simply upgrade a
 "http://" URL to a "https://" URL.
 --show-yt-redirects Report on redirects that expand a youtu.be URL.
 --suggest-snapshots Query the Internet Archive for a possible
 snapshot URL for each "NG" page.
 --check-archive-links Check links that are already pointing to a page
 on the Internet Archive. In theory these links
 should be totally stable and not need validation.
 --take-screenshots FILE Call the Google Chrome binary at this path to
 take screenshots of each "OK" page.
 --timeout NUM Wait this many seconds for a site to respond. The
 default is 10. Important note: Val will attempt
 to reach each URL three times, so the time taken
 to ping an unresponsive site will be three times
 this setting.
 --start-url NUM Start at this link in the links CSV file.
 --end-url NUM Stop at this link in the links CSV file.
 --upload FILE Upload report using the credentials and path
 given in this local text file. See sftp_login.txt
 for template.

BUGS
 The script cannot properly parse any line in the external links file
 which contains a comma in the name of the wiki page containing a link.
 Commas in the link itself are not an issue.
EOF
}
201
202
203### SETUP ###
# If first argument is a help request, or if nothing was passed in at all, print help page and quit
if [ "$#" -eq 0 ] || [ "$1" == "--help" ]; then
  printHelp | less
  exit 0
fi

# Parse arguments as long as there are more arguments to process
while (( "$#" )); do
  # Options that take a value must actually be followed by one. Without this check, 'shift 2' fails
  # when only one argument is left, nothing is shifted, and this loop would never terminate.
  case "$1" in
    --links | --exceptions | --output | --take-screenshots | --timeout | --start-url | --end-url | --upload )
      if [ "$#" -lt 2 ]; then
        echo "Argument $1 requires a value. Aborting."
        exit 1
      fi;;
  esac

  case "$1" in
    --links )               LINKS_URL="$2";                      shift 2;;
    --exceptions )          EXCEPT_URL="$2";                     shift 2;;
    --output )              OUTPUT_DIR="$2";                     shift 2;;
    --record-ok-links )     RECORD_OK_LINKS=1;                   shift;;
    --show-added-slashes )  SHOW_SLASH=1;                        shift;;
    --show-https-upgrades ) SHOW_HTTPS=1;                        shift;;
    --show-yt-redirects )   SHOW_YT_RD=1;                        shift;;
    --suggest-snapshots )   SUGGEST_SNAPSHOTS=1;                 shift;;
    --check-archive-links ) CHECK_ARCHIVE_LINKS=1;               shift;;
    --take-screenshots )    TAKE_PAGE_SHOT=1; CHROME_PATH="$2";  shift 2;;
    --timeout )             TIMEOUT="$2";                        shift 2;;
    --start-url )           URL_START="$2";                      shift 2;;
    --end-url )             URL_LIMIT="$2";                      shift 2;;
    --upload )              UPLOAD_INFO="$2";                    shift 2;;
    * )                     echo "Invalid argument $1 detected. Aborting."; exit 1;;
  esac
done

# The numeric settings are used in integer comparisons and arithmetic later on, so reject anything
# that is not a whole number now instead of failing with cryptic test errors further down
for NUM_ARG in "$TIMEOUT" "$URL_START" "$URL_LIMIT"; do
  if [[ ! "$NUM_ARG" =~ ^[0-9]+$ ]]; then
    echo "Error: --timeout, --start-url and --end-url each require a whole number, but I received \"$NUM_ARG\". Aborting."
    exit 1
  fi
done

# If the required arguments were not supplied, print help page and quit
if [ -z "$LINKS_URL" ] || [ -z "$OUTPUT_DIR" ]; then
  echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
  exit 2
fi

# If user wants screenshots, make sure path to Chrome was passed in and is valid
if [ "$TAKE_PAGE_SHOT" -eq 1 ]; then
  if [ ! -f "$CHROME_PATH" ]; then
    echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
    exit 3
  fi
fi

# Check that UPLOAD_INFO exists, if this argument was supplied
if [ -n "$UPLOAD_INFO" ] && [ ! -f "$UPLOAD_INFO" ]; then
  echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
  exit 4
fi

# Check that OUTPUT_DIR is a directory
if [ ! -d "$OUTPUT_DIR" ]; then
  echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
  exit 5
fi

# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is extracted from the extlinks file
OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
SHOT_PATH="$OUTPUT_PATH/Screenshots"
LOG_NAME="ValExtLinks report"
LOG_NAME_TXT="$LOG_NAME.txt"
LOG_NAME_RTF="$LOG_NAME.rtf"
LOG_NAME_HTM="$LOG_NAME.htm"
LOG_PATH="$OUTPUT_PATH/$LOG_NAME"
LOG_PATH_TXT="$LOG_PATH.txt"
LOG_PATH_RTF="$LOG_PATH.rtf"
LOG_PATH_HTM="$LOG_PATH.htm"
mkdir "$OUTPUT_PATH"

# Check that 'mkdir' succeeded before trying to create anything inside the new folder
if [ ! -d "$OUTPUT_PATH" ]; then
  echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_DIR. Aborting."
  exit 6
fi
if [ "$TAKE_PAGE_SHOT" -eq 1 ]; then
  mkdir "$SHOT_PATH"
fi

# Get date on the file at LINKS_URL and print to log. The header name is matched case-insensitively
# because servers speaking HTTP/2 send it in lowercase, and the carriage return that terminates
# every HTTP header line is removed so that it does not end up in the reports.
LINKS_DATE=$(curl --silent --head "$LINKS_URL" | grep -i "^Last-Modified:" | tr -d '\r')
if [ -z "$LINKS_DATE" ]; then
  echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
  exit 7
fi
LINKS_DATE=${LINKS_DATE#*: } # strip the header name, whatever its capitalization
289
290
291### UTILITY FUNCTIONS ###
# Sends the plain-text report header to the TXT log, one valPrint call per line. The header ends
# with an empty line.
printTXTheader() {
  local header_line
  for header_line in \
    "Validate External Links report" \
    "generated $NICE_TIME" \
    "from data of $LINKS_DATE" \
    "script by Iritscen (contact: $WIKI_ME)" \
    ""; do
    valPrint t "$header_line"
  done
}
301
# Opens the RTF log: emits the RTF preamble, a centered title block (report name, generation time,
# date of the source data, contact link), and finally switches back to left-justified text. The
# group opened here is closed by printRTFfooter.
printRTFheader() {
  local rtf_header
  rtf_header="{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww12600\viewh12100\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0

\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
generated $NICE_TIME\\
from data of $LINKS_DATE\\
script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$WIKI_ME\"}}{\fldrslt contact}})
\\
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
\cf0 "
  valPrint r "$rtf_header"
}
320
# Ends the RTF log by writing the brace that closes the group opened in the RTF header
printRTFfooter() {
  local closing_brace='}'
  valPrint r "$closing_brace"
}
326
# Opens the HTML log: document head, page title, and a heading block giving the generation time,
# the date of the source data and a contact link. The markup is closed by printHTMfooter.
printHTMheader() {
  local htm_header
  htm_header="<html>
<head>
<title>Validate External Links report</title>
</head>
<body>
<h2>Validate External Links report</h2>
<h3>generated $NICE_TIME<br />
from data of $LINKS_DATE<br />
script by Iritscen (<a href=\"$WIKI_ME\" target=\"_blank\">contact</a>)</h3>"
  valPrint h "$htm_header"
}
340
# Ends the HTML log by closing the <body> and <html> elements opened in the HTML header
printHTMfooter() {
  valPrint h $'</body>\n</html>'
}
347
# The central logging function.
# Parameter 1: a string of one or more flag characters:
#   'c' = print to the console        't' = append to the TXT log
#   'r' = append to the RTF log       'h' = append to the HTML log
#   'n' = don't end the line ("no newline")
#   's' = end the line and add one extra blank line ("space")
#   'w' = don't pass console output through 'fmt' ("fmt" fits the output to an 80-column CLI but
#         can break special formatting and the 'n' option)
# Parameter 2: the message. It is always written literally: 'printf' is used instead of 'echo' so
# that backslash sequences inside logged text (e.g. in a URL) are never interpreted as escapes and
# a message such as "-n" cannot be mistaken for an option.
# For the RTF and HTML logs, "line" refers to the rendered document: 'n' merely omits the \line or
# <br /> markup, while the file itself still receives a newline, which those formats ignore.
function valPrint()
{
  # Console: 'n' takes precedence over 'w', which takes precedence over 's'
  if [[ "$1" == *c* ]]; then
    if [[ "$1" == *n* ]]; then
      printf '%s' "$2"
    elif [[ "$1" == *w* ]]; then
      printf '%s\n' "$2"
    elif [[ "$1" == *s* ]]; then
      printf '%s\n\n' "$2"
    else
      printf '%s\n' "$2" | fmt -w 80
    fi
  fi

  # TXT log
  if [[ "$1" == *t* ]]; then
    if [[ "$1" == *n* ]]; then
      printf '%s' "$2" >> "$LOG_PATH_TXT"
    elif [[ "$1" == *s* ]]; then
      printf '%s\n\n' "$2" >> "$LOG_PATH_TXT"
    else
      printf '%s\n' "$2" >> "$LOG_PATH_TXT"
    fi
  fi

  # RTF log: rendered line breaks are made with \line
  if [[ "$1" == *r* ]]; then
    if [[ "$1" == *n* ]]; then
      printf '%s\n' "$2" >> "$LOG_PATH_RTF"
    elif [[ "$1" == *s* ]]; then
      printf '%s\n' "${2}\line\line" >> "$LOG_PATH_RTF"
    else
      printf '%s\n' "${2}\line" >> "$LOG_PATH_RTF"
    fi
  fi

  # HTML log: 's' is checked first and appends an empty table row, as the link listing is a table
  if [[ "$1" == *h* ]]; then
    if [[ "$1" == *s* ]]; then
      printf '%s\n' "${2}<tr><td>&nbsp;</td></tr>" >> "$LOG_PATH_HTM"
    elif [[ "$1" == *n* ]]; then
      printf '%s\n' "$2" >> "$LOG_PATH_HTM"
    else
      printf '%s\n' "${2}<br />" >> "$LOG_PATH_HTM"
    fi
  fi
}
394
# Pluralize the string in parameter 1 if the number in parameter 2 is not 1. Nouns ending in 'x'
# receive "es" (suffix -> suffixes); all others receive "s". The noun is printed exactly as it was
# passed in: it is quoted so that it is neither split into words nor treated as a file name pattern.
function pluralCheckNoun()
{
  if [ "$2" -ne 1 ]; then
    if [[ "$1" =~ x$ ]]; then
      printf '%s\n' "${1}es"
    else
      printf '%s\n' "${1}s"
    fi
  else
    printf '%s\n' "$1"
  fi
}
408
# Prints the present-tense verb that agrees with the count in parameter 1: "is" for exactly 1,
# "are" for anything else
pluralCheckIs() {
  local verb="is"
  if [ "$1" -ne 1 ]; then
    verb="are"
  fi
  echo "$verb"
}
418
# Prints the past-tense verb that agrees with the count in parameter 1: "was" for exactly 1,
# "were" for anything else
pluralCheckWas() {
  local verb="was"
  if [ "$1" -ne 1 ]; then
    verb="were"
  fi
  echo "$verb"
}
428
# Prints the article "a " (with its trailing space) when the count in parameter 1 is exactly 1;
# prints nothing for any other count
pluralCheckA() {
  local article="a "
  if [ "$1" -eq 1 ]; then
    echo "$article"
  fi
}
436
# Prints the article "an " (with its trailing space) when the count in parameter 1 is exactly 1;
# prints nothing for any other count
pluralCheckAn() {
  local article="an "
  if [ "$1" -eq 1 ]; then
    echo "$article"
  fi
}
444
# Upload the reports using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
# reports being saved to disk have already been closed.
# Reads UPLOAD_INFO for lines beginning with "user:", "pw:", "port:" and "path:", then runs the
# 'expect' script named by EXPECT_SCRIPT_NAME (looked up in THIS_DIR) once per report format,
# passing it the local report path, the four login values and the report's file name.
function uploadReport()
{
  valPrint c "Uploading reports..."

  SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
  SFTP_USER_NAME_MARKER="user:"
  SFTP_PASSWORD_MARKER="pw:"
  SFTP_PORT_MARKER="port:"
  SFTP_PATH_MARKER="path:"

  # Each marker is anchored to the start of the line so that, for instance, a path or password which
  # happens to contain "user:" is not picked up as a second user name
  SFTP_USER_NAME=$(grep -- "^$SFTP_USER_NAME_MARKER" "$UPLOAD_INFO")
  SFTP_USER_NAME=${SFTP_USER_NAME#"$SFTP_USER_NAME_MARKER"}
  SFTP_PASSWORD=$(grep -- "^$SFTP_PASSWORD_MARKER" "$UPLOAD_INFO")
  SFTP_PASSWORD=${SFTP_PASSWORD#"$SFTP_PASSWORD_MARKER"}
  SFTP_PORT=$(grep -- "^$SFTP_PORT_MARKER" "$UPLOAD_INFO")
  SFTP_PORT=${SFTP_PORT#"$SFTP_PORT_MARKER"}
  SFTP_PATH=$(grep -- "^$SFTP_PATH_MARKER" "$UPLOAD_INFO")
  SFTP_PATH=${SFTP_PATH#"$SFTP_PATH_MARKER"}

  local upload_result
  for SUFFIX in htm rtf txt; do
    # The login values are quoted so that an empty one still occupies its position in the argument list
    expect "$SCRIPT_PATH" "$LOG_PATH.$SUFFIX" "$SFTP_USER_NAME" "$SFTP_PASSWORD" "$SFTP_PORT" "$SFTP_PATH" "$LOG_NAME.$SUFFIX"

    # Save the exit status right away; by the time the message is built, $? would hold the status
    # of the test command below instead of that of 'expect'
    upload_result=$?
    if [ "$upload_result" -ne 0 ]; then
      valPrint c "Error $upload_result occurred when attempting to upload $LOG_NAME.$SUFFIX!"
    else
      valPrint c "Report in $(echo "$SUFFIX" | tr '[:lower:]' '[:upper:]') format was uploaded."
    fi
  done
}
475
# Prints the session summary to the console and the logs, closes the logs' markup, uploads the
# reports if requested, and exits the script with status 0. Called when the link list is finished,
# when the --end-url limit is reached, when startup fails, and (through the trap installed below)
# when the user presses Ctrl-C. FINISHED_LIST tells these cases apart: "yes" = whole list done,
# "limit" = stopped at --end-url, "no" = anything else.
function wrapupAndExit()
{
  # Get off progress line on console, drop down a line from last link in log, and close HTML table
  valPrint ctr ""
  valPrint h "</table><br />"

  # If we didn't finish processing the last URL, then the iterator is one too high
  if [ "$FINISHED_LIST" != "yes" ]; then
    LINK_NUM=$((LINK_NUM-1))
    if [ "$FINISHED_LIST" == "no" ]; then
      valPrint ctrh "The session was canceled by the user."
    fi
  fi

  # Generate string with elapsed time. START_RUN is still 0 if we are quitting before the main loop
  # began, in which case nothing has elapsed (rather than all the time since 1970).
  END_RUN=$(date +%s)
  local run_start=$START_RUN
  if [ "$run_start" -eq 0 ]; then
    run_start=$END_RUN
  fi
  local elapsed_secs=$((END_RUN-run_start))
  ELAPSED="$((elapsed_secs/60)) min. $((elapsed_secs%60)) sec. elapsed"

  # The link count is not known yet if we are quitting before the links file was read
  local link_count=${LINK_COUNT:-0}

  # Do some math on results of session
  LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
  if [ "$LINKS_PROCESSED" -lt 0 ]; then
    LINKS_PROCESSED=0 # we quit before reaching the first link that was to be processed
  fi
  TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
  LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
  LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_RD+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
  LINK_PROBLEMS_TOTAL=$((NG_LINKS+RD_LINKS+EI_LINKS+IW_LINKS))
  LINK_PROBLEMS_NG=$((NG_LINKS-SKIP_EXPECT_NG))
  LINK_PROBLEMS_RD=$((RD_LINKS-SKIP_EXPECT_RD))
  LINK_PROBLEMS_EI=$((EI_LINKS-SKIP_EXPECT_EI))
  LINK_PROBLEMS_IW=$((IW_LINKS-SKIP_EXPECT_IW))
  LINK_PROBLEMS_NET=$((LINK_PROBLEMS_NG+LINK_PROBLEMS_RD+LINK_PROBLEMS_EI+LINK_PROBLEMS_IW))

  # Print something in the Links section if no link issues were printed
  if [ "$LINK_PROBLEMS_NET" -eq 0 ]; then
    valPrint h "<i>No link problems to report! See the <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for a list of links with issues that were not reported.</i>"
  fi
  if [ "$LINK_PROBLEMS_TOTAL" -eq 0 ]; then
    valPrint t "No link problems to report!"
    valPrint r "\i1 No link problems to report! \i0"
  fi

  ## SUMMARY OUTPUT ##
  valPrint ct "Summary ($ELAPSED):"
  valPrint r "\b1 Summary \b0 ($ELAPSED)"
  valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
  valPrint ctrh "I finished processing $LINKS_PROCESSED of $link_count $(pluralCheckNoun link $link_count) (there $(pluralCheckWas $FILE_LINKS) $FILE_LINKS file $(pluralCheckNoun link $FILE_LINKS) and $PAGE_LINKS page $(pluralCheckNoun link $PAGE_LINKS))."

  # Print processed link totals
  if [ "$LINKS_PROCESSED" -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
  if [ "$LINK_ERRORS" -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
  if [ "$SKIP_ARCHIVE_ORG" -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVE_ORG Archive.org $(pluralCheckNoun link $SKIP_ARCHIVE_ORG) $(pluralCheckWas $SKIP_ARCHIVE_ORG) not checked"; fi
  if [ "$LINK_PROBLEMS_TOTAL" -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_TOTAL processed $(pluralCheckNoun link $LINK_PROBLEMS_TOTAL) had $(pluralCheckAn $LINK_PROBLEMS_TOTAL)$(pluralCheckNoun issue $LINK_PROBLEMS_TOTAL)"; fi
  # LINKS_EXCEPTED counts excepted issues, so every report format words this line the same way
  if [ "$LINKS_EXCEPTED" -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED link $(pluralCheckNoun issue $LINKS_EXCEPTED) from report)"; valPrint h "&nbsp;&nbsp;(excepted $LINKS_EXCEPTED link $(pluralCheckNoun issue $LINKS_EXCEPTED) from report)"; fi
  if [ "$OK_LINKS" -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi
  if [ "$TRIVIAL_RDS" -gt 0 ]; then valPrint ctr " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h "&nbsp;&nbsp;(counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi

  # Print errored link totals
  if [ "$LINK_ERRORS" -gt 0 ]; then
    valPrint c "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"
    valPrint h "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
    valPrint rt "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS):"
  fi
  if [ "$SKIP_UNK_NS" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
  if [ "$SKIP_JS_PAGE" -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
  if [ "$SKIP_BAD_URL" -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
  if [ "$SKIP_NON_ASCII" -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
  if [ "$SKIP_UNK_SUFFIX" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
  if [ "$SKIP_UNK_CODE" -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi

  # Print excepted link totals
  if [ "$LINKS_EXCEPTED" -gt 0 ]; then
    valPrint c "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"
    valPrint h "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
    valPrint rt "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted:"
  fi
  if [ "$SKIP_EXPECT_NG" -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
  if [ "$SKIP_EXPECT_RD" -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_RD/$RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
  if [ "$SKIP_EXPECT_EI" -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
  if [ "$SKIP_EXPECT_IW" -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi

  # Print checked link totals
  if [ "$LINK_PROBLEMS_NET" -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS_NET link $(pluralCheckNoun issue $LINK_PROBLEMS_NET):"; fi
  if [ "$LINK_PROBLEMS_NG" -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_NG NG $(pluralCheckNoun link $LINK_PROBLEMS_NG)"; fi
  if [ "$LINK_PROBLEMS_RD" -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_RD $(pluralCheckNoun redirection $LINK_PROBLEMS_RD)"; fi
  if [ "$LINK_PROBLEMS_EI" -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_EI $(pluralCheckNoun link $LINK_PROBLEMS_EI) that could be intrawiki"; fi
  if [ "$LINK_PROBLEMS_IW" -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_IW $(pluralCheckNoun link $LINK_PROBLEMS_IW) that could be interwiki"; fi

  # Close the log files' markup
  valPrint trh "ValExtLinks says goodbye."
  printRTFfooter
  printHTMfooter

  # Upload report if this was requested, unless the session ended abnormally
  if [ -n "$UPLOAD_INFO" ] && [ "$FINISHED_LIST" != "no" ]; then
    uploadReport
  fi

  # Really quit now
  valPrint c "ValExtLinks says goodbye."
  exit 0
}
# Print the summary instead of dying silently when the user presses Ctrl-C
trap wrapupAndExit INT
577
578
579### INITIALIZATION ###
# Print opening message to console and log files
valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
printTXTheader
printRTFheader
printHTMheader

## DATA SOURCING ##
valPrint t "Startup:"
valPrint r "\b1 Startup \b0"
valPrint hn "<h3>Startup</h3>"

# Attempt to download file at LINKS_URL, then check that it succeeded. The local copy is named after
# whatever follows the last slash in the URL.
valPrint cwtrhn "Downloading list of external links from $LINKS_URL..."
LINKS_FILE_NAME=${LINKS_URL##*/}
LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
curl --silent -o "$LINKS_FILE" "$LINKS_URL"
if [ ! -f "$LINKS_FILE" ]; then
  echo -e "\nThe download of $LINKS_URL appears to have failed. Aborting."
  wrapupAndExit
else
  valPrint ctrh " success."
fi

# Attempt to download file at EXCEPT_URL, then check that it succeeded
if [ -n "$EXCEPT_URL" ]; then
  valPrint cwtrhn "Downloading list of reporting exceptions from $EXCEPT_URL..."
  EXCEPT_DATA=$(curl --silent "$EXCEPT_URL")
  if [ -z "$EXCEPT_DATA" ]; then
    echo -e "\nThe download of the exceptions data from '$EXCEPT_URL' appears to have failed. Aborting."
    wrapupAndExit
  else
    valPrint ctrh " success."
  fi

  # Keep only what lies between the "BEGIN LIST" and "END LIST" markers on the page
  EXCEPT_DATA=${EXCEPT_DATA%END LIST*}
  EXCEPT_DATA=${EXCEPT_DATA#*BEGIN LIST}
  EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"

  # Store on disk for debugging purposes
  echo "$EXCEPT_DATA" > "$EXCEPT_FILE"

  # Transfer to array for easy searching later, one element per non-empty line (IFS is a newline).
  # File name expansion is switched off while splitting, since exception lines contain URLs and
  # characters like '?' and '*' in them would otherwise be treated as wildcards.
  set -f
  declare -a EXCEPT_ARRAY=($EXCEPT_DATA)
  set +f
fi

# Count the lines in LINKS_FILE. Unlike 'wc -l', which counts newline characters, 'grep -c' also
# counts a final line that does not end in a newline.
LINK_COUNT_STRING=$(grep -c '' "$LINKS_FILE")

# Number of URLs is number of lines minus one (first line is column header row for the CSV)
LINK_COUNT=$((LINK_COUNT_STRING-1))
valPrint ctrh "Found $LINK_COUNT links to process."
valPrint trh ""
632
## CONFIG OUTPUT ##
# Prints the label in parameter 1 to the console and all logs without ending the line, then ends
# the line with parameter 3 if the setting in parameter 2 equals 1, or with parameter 4 if not
printConfigFlag() {
  valPrint ctrhn "$1"
  if [ "$2" -eq 1 ]; then
    valPrint ctrh "$3"
  else
    valPrint ctrh "$4"
  fi
}

valPrint t "Config:"
valPrint r "\b1 Config \b0"
valPrint hn "<h3>Config</h3>"

# Describe the range of links that will be looked at, which depends on --start-url and --end-url
valPrint ctrhn "Links to consider: "
if [ $URL_LIMIT -ne 0 ] && [ $URL_LIMIT -lt $LINK_COUNT ]; then
  LINKS_TO_CONSIDER="$((URL_LIMIT-URL_START+1)), from link $URL_START to $URL_LIMIT"
elif [ $URL_START -ne 1 ]; then
  LINKS_TO_CONSIDER="$((LINK_COUNT-URL_START+1)), from link $URL_START to $LINK_COUNT"
else
  LINKS_TO_CONSIDER="$LINK_COUNT"
fi
valPrint ctrh "$LINKS_TO_CONSIDER"

valPrint ctrh "Site query timeout: $TIMEOUT seconds"

# Settings which are reported as they are
printConfigFlag "Show OK links: " "$RECORD_OK_LINKS" "Yes" "No"
printConfigFlag "Take screenshots: " "$TAKE_PAGE_SHOT" "Yes" "No"
printConfigFlag "Suggest archive.org snapshots: " "$SUGGEST_SNAPSHOTS" "Yes" "No"

# The "show" settings are reported as their opposite, "ignore", so the answers are swapped
printConfigFlag "Ignore slash-adding redirects: " "$SHOW_SLASH" "No" "Yes"
printConfigFlag "Ignore HTTPS-upgrading redirects: " "$SHOW_HTTPS" "No" "Yes"
printConfigFlag "Ignore youtu.be redirects: " "$SHOW_YT_RD" "No" "Yes"

printConfigFlag "Check archive.org links: " "$CHECK_ARCHIVE_LINKS" "Yes" "No"

valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
valPrint trh ""
673
674## LEGEND OUTPUT ##
675valPrint t "Legend:"
676valPrint r "\b1 Legend \b0"
677valPrint hn "<h3>Legend</h3>"
678valPrint t "(For guidance in fixing these links, see $WIKI_MAIN.)"
679valPrint r "(For guidance in fixing these links, see {\field{\*\fldinst{HYPERLINK \"$WIKI_MAIN\"}}{\fldrslt here}}.)"
680valPrint h "(For guidance in fixing these links, see <a href=\"$WIKI_MAIN\" target=\"_blank\">here</a>.)"
681valPrint trh "OK = URL seems to be working"
682valPrint trh "NG = URL no longer seems to work"
683valPrint trh "RD = URL is redirecting to this new URL"
684valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup"
685valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup"
686valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $WIKI_HTTP)"
687valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$WIKI_HTTP\"}}{\fldrslt here}} for code reference)"
688valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$WIKI_HTTP\" target=\"_blank\">here</a> for code reference)"
689valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $WIKI_CURL)"
690valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$WIKI_CURL\"}}{\fldrslt here}} for code reference)"
691valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$WIKI_CURL\" target=\"_blank\">here</a> for code reference)"
692valPrint trh "IA suggests = Last available snapshot returned by the Internet Archive"
693valPrint trh "Try browsing = The Archive failed to return a snapshot URL, so check for a snapshot manually using this link"
694valPrint trh ""
695
696
697### MAIN LOOP ###
698valPrint t "Links:"
699valPrint r "\b1 Links \b0"
700valPrint hn "<h3>Links</h3>"
701START_RUN=$(date +%s)
702# Process each line of the .csv in LINKS_FILE
703for LINE in `cat "$LINKS_FILE"`; do
# Count this line of the CSV; the counter doubles as the link number shown in progress and skip
# messages
LINK_NUM=$((LINK_NUM + 1))

# First line is the column header row for the CSV, so let's verify that the format hasn't changed.
# All operands are quoted so that an unexpected line (glob characters, odd tokens) is compared as
# plain text instead of being expanded by the shell.
if [ "$SKIPPED_HEADER_ROW" -eq 0 ]; then
  if [[ "$LINE" == "namespace,title,target" ]]; then
    SKIPPED_HEADER_ROW=1
    LINK_NUM=0 # this line is not a link, so reset the link counter
    valPrint hn "<table>"
    continue
  else
    valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
    wrapupAndExit
  fi
fi

# Skip this link if we are not at URL_START yet
if [ "$LINK_NUM" -lt "$URL_START" ]; then
  continue
fi

# Stop if we are at the limit declared for testing purposes (a limit of 0 means "no limit")
if [ "$URL_LIMIT" -gt 0 ] && [ "$LINK_NUM" -gt "$URL_LIMIT" ]; then
  FINISHED_LIST="limit"
  wrapupAndExit
fi

# Print progress to screen
if [ "$LINK_NUM" -gt 1 ]; then
  printf "\e[1A\n" # erase previous progress message so that new one appears in its place
fi
valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
735
# The number of the namespace is the element before the first comma on the line
NS_ID=${LINE%%,*}

# Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES. The search is
# only attempted when NS_ID is an integer: "NULL" or a malformed value simply leaves NS_NAME empty
# and is reported below, rather than making 'test' print "integer expression expected" once for
# every known namespace.
NS_NAME=""
a=0
if [[ "$NS_ID" =~ ^-?[0-9]+$ ]]; then
  while [ "x${NS_IDS[$a]}" != "x" ]; do # an empty element means we ran off the end of the array
    if [ "$NS_ID" -eq "${NS_IDS[$a]}" ]; then
      NS_NAME="${NS_NAMES[$a]}"
      break
    fi
    a=$((a + 1))
  done
fi

# No name means we cannot build the wiki page's path, so report the line and move on
if [ "$NS_NAME" == "" ]; then
  if [ "$NS_ID" == "NULL" ]; then
    valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace (and probably the page too) is 'NULL'. Probably the link is no longer in existence on the wiki."
  else
    valPrint trs "Skipping line $LINK_NUM ('$LINE') because I could not find a name for namespace ID $NS_ID."
  fi
  SKIP_UNK_NS=$((SKIP_UNK_NS + 1))
  continue
fi
760
# The name of the page is everything between the namespace ID and the next comma on the line (commas
# in page names will break this). The prefix being stripped is quoted so that it is removed as
# literal text rather than interpreted as a glob pattern.
PAGE_NAME=${LINE#"$NS_ID,"}
PAGE_NAME=${PAGE_NAME%%,*}

# We don't want to consider wiki pages ending in .js, as the MW parser cannot reliably isolate URLS
# in JavaScript code, so it returns erroneous links. The suffix is whatever follows the last period
# (or the whole name if it has no period).
PAGE_NAME_SUFFIX=${PAGE_NAME##*.}
if [ "$PAGE_NAME_SUFFIX" == "js" ]; then
  JS_PAGE_URL=${LINE#"$NS_ID,$PAGE_NAME,"}
  valPrint trs "Skipping URL '$JS_PAGE_URL' on line $LINK_NUM because it was found on JavaScript page '$PAGE_NAME'."
  SKIP_JS_PAGE=$((SKIP_JS_PAGE + 1))
  continue
fi

# Build longer wiki page URLs from namespace and page names. Namespace "Main:" (ID 0) cannot be a
# part of the path; it's an implicit namespace, and naming it explicitly breaks the link.
if [ "$NS_ID" -eq 0 ]; then
  LOCAL_PAGE_PATH="$PAGE_NAME"
else
  LOCAL_PAGE_PATH="$NS_NAME:$PAGE_NAME"
fi
FULL_PAGE_PATH="https://$WIKI_PATH/$LOCAL_PAGE_PATH"

# The URL being linked to is everything after the previous two fields (this allows commas to be in
# the URLs, but a comma in the previous field, the page name, will break this). The quotes are
# essential: unquoted, a page name containing a glob character such as '[' or '\' would be read as
# a pattern, fail to match itself, and leave the entire line in URL.
URL=${LINE#"$NS_ID,$PAGE_NAME,"}
788
# Scan for illegal characters
if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
  valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because it contains characters illegal in a URL."
  SKIP_BAD_URL=$((SKIP_BAD_URL + 1))
  continue
fi

# If we're skipping Archive.org links, see if this is one
if [ "$CHECK_ARCHIVE_LINKS" -eq 0 ] && [[ $URL == *web.archive.org* ]]; then
  valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to check Wayback Machine links."
  SKIP_ARCHIVE_ORG=$((SKIP_ARCHIVE_ORG + 1))
  continue
fi

# Now we need to know if the URL is for a file or a web page. First step is to determine if the
# URL ends in a suffix
HAS_SUFFIX=0

# If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
CLEAN_URL=${URL%%\?*}

# If the URL ends in something like "#section_15", strip everything from the '#' onward
CLEAN_URL=${CLEAN_URL%%\#*}

# Non-ASCII URLs are not processed; they are skipped so that the user checks them manually
if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
  valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I cannot handle non-ASCII characters."
  SKIP_NON_ASCII=$((SKIP_NON_ASCII + 1))
  continue
fi

# Isolate the characters after the last period and after the last slash. If the URL has no period
# (or no slash), the corresponding variable holds the whole cleaned URL. This is done with the
# shell's own parameter expansion, so no 'sed' or 'awk' process is spawned per link.
POST_DOT=${CLEAN_URL##*.}
POST_SLASH=${CLEAN_URL##*/}

# If the last period comes after the last slash, then the URL ends in a suffix; in that case the
# text after the period is shorter than the text after the slash
POST_DOT_LENGTH=${#POST_DOT}
POST_SLASH_LENGTH=${#POST_SLASH}
if [ "$POST_DOT_LENGTH" -lt "$POST_SLASH_LENGTH" ]; then
  HAS_SUFFIX=1
else
  HAS_SUFFIX=0
fi
832
# Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
# known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES. IS_FILE ends up as 1 (file),
# 0 (page) or -1 (suffix not recognized).
IS_FILE=-1
if [ "$HAS_SUFFIX" -eq 0 ]; then
  IS_FILE=0
else
  # Turn off case sensitivity while we compare suffixes
  shopt -s nocasematch

  # Special cases in which the "suffix" is really the tail end of a web page URL:
  # - it is all numbers, e.g. "productID.304297400"
  # - it ends in a parens, e.g. "ms537113(v=vs.85)"
  # - it contains a '%', e.g. "10.1007%2FBF00329055"
  if [[ $POST_DOT =~ ^-?[0-9]+$ ]] || [[ $POST_DOT =~ ^.*[\(\)]$ ]] || [[ $POST_DOT == *%* ]]; then
    IS_FILE=0
  fi

  # If we did not identify this URL as a web page above, we need to compare the suffix against known
  # file extensions. The suffix is quoted so that it is compared as literal text (a suffix like "*"
  # must not match every extension); double brackets still respect the nocasematch setting.
  if [ "$IS_FILE" -eq -1 ]; then
    for EXTENSION in "${HTTP_FILES[@]}"; do
      if [[ $EXTENSION == "$POST_DOT" ]]; then
        IS_FILE=1
        break
      fi
    done
  fi

  # If we did not identify this URL as a file above, we need to compare the suffix against known
  # pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
  # needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
  if [ "$IS_FILE" -eq -1 ]; then
    for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
      if [[ $PAGE_OR_TLD == "$POST_DOT" ]]; then
        IS_FILE=0
        break
      fi
    done
  fi

  # Turn case sensitivity back on in Bash
  shopt -u nocasematch
fi

# Record what kind of link this is; if the suffix escaped identification as either a file, page or
# TLD, inform the user and skip the link
STR_TYPE=""
case "$IS_FILE" in
  -1)
    valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown URL ending '$POST_DOT'. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
    SKIP_UNK_SUFFIX=$((SKIP_UNK_SUFFIX + 1))
    continue
    ;;
  1)
    STR_TYPE="file"
    FILE_LINKS=$((FILE_LINKS + 1))
    ;;
  0)
    STR_TYPE="page"
    PAGE_LINKS=$((PAGE_LINKS + 1))
    ;;
esac
898
# Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
# issue with sites that require HTTPS. Only the headers are requested (--head), and the HTTP status
# code is all that is captured. The user agent must be in double quotes: in single quotes the
# literal text "$AGENT" would be sent to the server instead of the variable's value.
# NOTE(review): AGENT is assumed to be set in the script's globals, outside this section.
CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time "$TIMEOUT" --retry 2 --write-out '%{http_code}\n' "$URL")
CURL_ERR=$?
CURL_RESULT=$CURL_CODE

# Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
if [ "$CURL_CODE" == "000" ]; then
  CURL_RESULT="$CURL_RESULT-$CURL_ERR"
fi

# Begin to determine our status code for this URL (EI, IW, OK, RD, or NG); "??" means undecided
STATUS="??"
NEW_URL=""
INTERWIKI_INDEX=-1

# First make sure that this isn't an "external internal" link to our own wiki that can be replaced
# by "[[page_name]]". If it uses a special access URL beginning with "/w/", let it pass, as it
# probably cannot be replaced by "[[ ]]" markup
if [[ $URL == *"$WIKI_PATH"* ]] && [[ $URL != *"$WIKI_PATH"/w/* ]]; then
  STATUS="EI"
  EI_LINKS=$((EI_LINKS + 1))
fi

# If it's not, check if this is a link to a domain that we have an interwiki prefix for (also make
# sure that it's not an archive.org link to a page from an interwiki domain). The index of the
# matching domain is remembered so the matching prefix can be suggested later.
if [ "$STATUS" == "??" ] && [[ $URL != *web.archive.org* ]]; then
  for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
    if [[ $URL == *"${INTERWIKI_DOMAINS[$i]}"* ]] && [[ $URL != *"${INTERWIKI_DOMAINS[$i]}"/w/* ]]; then
      STATUS="IW"
      IW_LINKS=$((IW_LINKS + 1))
      INTERWIKI_INDEX=$i
      break
    fi
  done
fi

# If we didn't match an interwiki domain, see if the status code is in our "OK" codes list
if [ "$STATUS" == "??" ]; then
  for CODE in "${OK_CODES[@]}"; do
    if [[ $CODE == "$CURL_CODE" ]]; then
      STATUS="OK"
      OK_LINKS=$((OK_LINKS + 1))
      break
    fi
  done
fi
946
# If we didn't get a match with the "OK" codes, check it against the "RD" codes
if [ "$STATUS" == "??" ]; then
  for CODE in "${RD_CODES[@]}"; do
    if [[ $CODE == "$CURL_CODE" ]]; then
      # Get URL header again in order to retrieve the URL we are being redirected to. The user
      # agent is double-quoted so that the variable's value, not the text "$AGENT", is sent.
      # NOTE(review): AGENT is assumed to be set in the script's globals, outside this section.
      NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time "$TIMEOUT" --write-out '%{redirect_url}\n' "$URL")

      # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
      # those changes out if the user didn't ask for them
      URL_HTTP=${URL/#https:/http:}
      NEW_URL_HTTP=${NEW_URL/#https:/http:}

      # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
      NEW_URL_LENGTH=${#NEW_URL_HTTP}
      if [ "$NEW_URL_LENGTH" -lt "$MIN_URL_LENGTH" ]; then
        NEW_URL_HTTP="[new URL not retrieved]"
      fi

      # Remove slash at end of new URL, if present, so we can filter out the redirects that
      # merely add an ending slash if the user didn't ask for them
      NEW_URL_NO_SLASH=${NEW_URL_HTTP%/}

      # Detect if this is a youtu.be link simply being expanded by YouTube to the full
      # youtube.com address
      YOUTU_BE=0
      if [[ $URL_HTTP == http://youtu.be* ]] && [[ $NEW_URL_HTTP == http://www.youtube.com* ]]; then
        YOUTU_BE=1
      fi

      # The comparisons below quote both sides: the placeholder "[new URL not retrieved]" would
      # otherwise be read by the shell as a glob bracket expression.
      if [ "$SHOW_HTTPS" -eq 0 ] && [ "$URL_HTTP" == "$NEW_URL_HTTP" ]; then
        # If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
        # wants those to be reported)
        valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show http->https upgrades, and I was redirected to '$NEW_URL'."
        STATUS="OK"
        OK_LINKS=$((OK_LINKS + 1))
        SKIP_HTTPS_UP=$((SKIP_HTTPS_UP + 1))
      elif [ "$SHOW_SLASH" -eq 0 ] && [ "$URL_HTTP" == "$NEW_URL_NO_SLASH" ]; then
        # If the URLs match besides an added ending slash, then the link is OK (unless user wants
        # those to be reported)
        valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show added trailing slashes, and I was redirected to '$NEW_URL'."
        STATUS="OK"
        OK_LINKS=$((OK_LINKS + 1))
        SKIP_SLASH_ADD=$((SKIP_SLASH_ADD + 1))
      elif [ "$SHOW_YT_RD" -eq 0 ] && [ "$YOUTU_BE" -eq 1 ]; then
        # A youtu.be short link expanding to youtube.com is also OK unless the user wants it shown
        valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'."
        STATUS="OK"
        OK_LINKS=$((OK_LINKS + 1))
        SKIP_YOUTU_BE=$((SKIP_YOUTU_BE + 1))
      else
        # Any other redirect gets reported
        STATUS="RD"
        RD_LINKS=$((RD_LINKS + 1))
      fi
      break
    fi
  done
fi

# If we didn't get a match with the "RD" codes, check it against the "NG" codes
if [ "$STATUS" == "??" ]; then
  for CODE in "${NG_CODES[@]}"; do
    if [[ $CODE == "$CURL_CODE" ]]; then
      STATUS="NG"
      NG_LINKS=$((NG_LINKS + 1))
      break
    fi
  done
fi

# If we didn't match a known status code, advise the reader
if [ "$STATUS" == "??" ]; then
  valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown response code $CURL_CODE."
  SKIP_UNK_CODE=$((SKIP_UNK_CODE + 1))
  continue
fi
1021
# Check problem links against exceptions list before proceeding. EXCEPT_URL is the user's setting
# that says where the exceptions list lives; it is only tested for emptiness here and must not be
# modified, so the URL taken from each exception line is kept in its own variable, EXCEPT_LINK.
FOUND_EXCEPT=0
if [ "$STATUS" != "OK" ] && [ -n "$EXCEPT_URL" ]; then
  # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
  EXPECT_CODE="$CURL_RESULT"
  if [ "$STATUS" == "EI" ] || [ "$STATUS" == "IW" ]; then
    EXPECT_CODE="$STATUS"
  fi

  # Look for link in exceptions list and make sure the listed result code and wiki page also match.
  # Each line is read as "code,URL,page": the code is the text before the first comma, the page is
  # the text after the last comma, and the URL is everything in between.
  for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
    EXCEPT_LINE="${EXCEPT_ARRAY[$i]}"

    # Undo any HTML-encoding from the wiki page; for now we just worry about the ampersand, as most
    # other HTML-encoded characters are not found in URLs. Every occurrence is decoded, since a URL
    # may carry several parameters; 'sed' is only invoked when there is something to decode.
    if [[ $EXCEPT_LINE == *"&amp;"* ]]; then
      EXCEPT_LINE=$(printf '%s\n' "$EXCEPT_LINE" | sed 's/&amp;/\&/g')
    fi

    # Match URL
    EXCEPT_LINK="${EXCEPT_LINE#*,}"
    EXCEPT_LINK="${EXCEPT_LINK%,*}"
    if [ "$EXCEPT_LINK" != "$URL" ]; then
      continue
    fi

    # Match containing page's name ("*" in the list means any page); anything after the first
    # space in the page field is ignored
    EXCEPT_PAGE="${EXCEPT_LINE##*,}"
    EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
    if [ "$EXCEPT_PAGE" != "*" ] && [ "$EXCEPT_PAGE" != "$LOCAL_PAGE_PATH" ]; then
      continue
    fi

    # Match result code
    EXCEPT_CODE=${EXCEPT_LINE%%,*}
    if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
      valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because its expected result, '$EXPECT_CODE', is in the exceptions list."
      case "$STATUS" in
        EI) SKIP_EXPECT_EI=$((SKIP_EXPECT_EI + 1)) ;;
        IW) SKIP_EXPECT_IW=$((SKIP_EXPECT_IW + 1)) ;;
        RD) SKIP_EXPECT_RD=$((SKIP_EXPECT_RD + 1)) ;;
        *) SKIP_EXPECT_NG=$((SKIP_EXPECT_NG + 1)) ;;
      esac
      FOUND_EXCEPT=1
      break
    fi
  done
fi
if [ "$FOUND_EXCEPT" -eq 1 ]; then
  continue
fi
1075
# If appropriate, record this link to the log, with clickable URLs when possible. Links that are
# "OK" are only recorded when RECORD_OK_LINKS is set.
if [ "$STATUS" != "OK" ] || [ "$RECORD_OK_LINKS" -eq 1 ]; then
  # Prepare 'curl' result in parentheses to print after status code, unless this is an "IW" or "EI"
  # link, in which case showing the status code doesn't make sense. Adjust spacing after string to
  # ensure TXT and RTF reports have aligned columns of results.
  if [ "$STATUS" == "IW" ] || [ "$STATUS" == "EI" ]; then
    CURL_STR_H=""
    CURL_STR_T=" "
    CURL_STR_R=" "
  else
    CURL_STR_H=" ($CURL_RESULT)"
    CURL_STR_T="$CURL_STR_H"
    CURL_STR_R="$CURL_STR_H "
  fi

  # Record link and its wiki page in TXT, RTF, and HTML markup
  valPrint t "${STATUS}${CURL_STR_T} $STR_TYPE $URL"
  valPrint t " linked from $FULL_PAGE_PATH"
  valPrint r "${STATUS}${CURL_STR_R}${RTF_TABS}$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
  valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
  valPrint hn "<tr><td style=\"white-space:nowrap\">${STATUS}${CURL_STR_H}</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
  valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"

  # The remaining output depends on the status, and exactly one of these applies to a link
  case "$STATUS" in
    OK)
      # Place vertical space here since we won't be printing anything more about this link
      valPrint tr ""
      valPrint hs ""
      ;;
    RD)
      # Record redirect URL if one was given by a 3xx response page
      valPrint ts " Server suggests $NEW_URL"
      valPrint rs " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
      valPrint hs "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
      ;;
    EI)
      # Notify reader if we can use an intrawiki link for this URL; the page is whatever follows
      # the first slash after the "://"
      INTRA_PAGE=${URL#*://*/}
      valPrint ts " Just use [[$INTRA_PAGE]]"
      valPrint rs " Just use [[$INTRA_PAGE]]"
      valPrint hs "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
      ;;
    IW)
      # Notify reader if we can use an interwiki prefix for this URL; the page is whatever follows
      # the last slash
      INTER_PAGE=${URL##*/}
      valPrint ts " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
      valPrint rs " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
      valPrint hs "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
      ;;
    NG)
      # Query Internet Archive for latest "OK" snapshot for "NG" page
      if [ "$SUGGEST_SNAPSHOTS" -eq 1 ]; then
        # NOTE(review): the URL is placed in the query string as-is, without percent-encoding, so
        # a '&' or '#' inside it may be interpreted as part of the API request rather than the URL.
        ARCHIVE_QUERY=$(curl --silent --max-time "$TIMEOUT" "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")

        # If a "closest" snapshot was received...
        if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
          # In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
          ARCHIVE_QUERY=$(printf '%s\n' "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')

          # ...isolate "url" property in the response that follows the "closest" tag
          SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
          SNAPSHOT_URL=${SNAPSHOT_URL#*\"url\": \"} # everything after '"url": "'
          SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'

          # Remove the port 80 part that IA often adds to the URL, as it's superfluous. Only a
          # ":80" that ends the host part (followed by '/' or the end of the URL) is removed, so
          # that other ports beginning with those digits, such as ":8080", are left intact.
          SNAPSHOT_URL=$(printf '%s\n' "$SNAPSHOT_URL" | sed -E 's#:80(/|$)#\1#')

          # Inform the user of the snapshot URL
          valPrint ts " IA suggests $SNAPSHOT_URL"
          valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
          valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
        else # ...otherwise give generic Wayback Machine link for this URL
          valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
          valPrint rs " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
          valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
        fi
      fi
      ;;
  esac
fi
1152
# If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
if [ "$IS_FILE" -eq 0 ] && [ "$TAKE_PAGE_SHOT" -eq 1 ] && [ "$STATUS" == "OK" ]; then
  # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
  SHOT_NAME=$(printf '%s\n' "$URL" | sed -e 's#https*://##' -e 'y#:/#__#')
  SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"

  # Don't take screenshot if we already encountered this page and screenshotted it
  if [ -f "$SHOT_FILE" ]; then
    valPrint trhs "Skipping screenshot of URL '$URL' because file '$SHOT_FILE' already exists."
  else
    # The URL is quoted so that it reaches Chrome exactly as written, without the shell trying to
    # expand characters like '?' or '*' in it as a file name pattern
    "$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 "$URL" > /dev/null 2>&1

    # The image is looked for under the name CHROME_SCREENSHOT in WORKING_DIR and, if found, moved
    # to its final name; -n keeps 'mv' from overwriting an existing file
    if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
      mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
    else
      valPrint trhs "Screenshot of URL $URL seems to have failed!"
    fi
  fi
fi
1171done
1172FINISHED_LIST="yes"
1173wrapupAndExit
Note: See TracBrowser for help on using the repository browser.