source: Validate External Links/validate_external_links.sh@ 1127

Last change on this file was in r1127, checked in by iritscen, 5 years ago

Val now counts redirects from youtu.be to youtube.com as OK links. These links will only be reported if the --show-yt-redirects argument is used. Renamed --show-https-upgrade to --show-https-upgrades for consistency. Also sorted the file and page suffix arrays and added some more items to them. Now handling status codes 400, 418, 502 and 530. Fixed incorrect nbsps in the HTML report. Val is no longer confused by URLs that end in '(' or ')' or that contain a '%' near the end.

File size: 46.9 KB
[1064]1#!/bin/bash
2
3# Validate External Links by Iritscen
4# Provided with a list of external links found in the OniGalore wiki, this script validates them.
5# The resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF
6# (for reading as a local file with clickable links), and HTML (for uploading as a web page).
7# Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
8# Recommended rule:
[1118]9# |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|
[1064]10
11# Set separator token to newline
12IFS="
13"
14
15### GLOBALS ###
16# Settings -- these will be changed from their defaults by the arguments passed in to the script
17LINKS_URL="" # use 'curl' to download file with links from this location (can be file://)
18EXCEPT_URL="" # ditto above for file with exceptions to NG results
19OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
[1070]20RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES
[1122]21SHOW_SLASH=0 # record response code to the log when a slash is added to the end of a URL
22SHOW_HTTPS=0 # record response code to the log when "http" is upgraded to "https"
[1127]23SHOW_YT_RD=0 # record response code to the log when a youtu.be URL is expanded to the full URL
[1064]24SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
25TAKE_PAGE_SHOT=0 # take a screenshot of each OK page
[1070]26CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature
[1064]27URL_START=1 # start at this URL in LINKS_FILE (1 by default)
28URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE
29UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report
30
31# Fixed strings -- see the occurrences of these variables to learn their purpose
[1118]32AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36 OPR/67.0.3575.53"
[1064]33ARCHIVE_API="http://archive.org/wayback/available"
34ARCHIVE_GENERIC="https://web.archive.org/web/*"
35ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
36CHROME_SCREENSHOT="screenshot.png"
[1066]37CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt"
[1064]38EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
[1066]39HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt"
[1122]40MY_WIKI_PAGE="https://wiki.oni2.net/User:Iritscen"
[1064]41THIS_DIR=$(cd $(dirname $0); pwd)
42WORKING_DIR=$(pwd)
43WIKI_PATH="wiki.oni2.net"
44
45# These are parallel arrays of the IDs and names of OniGalore's current namespaces
46declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 101 102 103 104 105 108 109 110 111)
47declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")
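# (Illustrative note) Because the two arrays are parallel, an ID is translated by finding its index in
# NS_IDS and reading the same index in NS_NAMES; e.g. ID 0 maps to "Main" and ID 10 maps to "Template".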
48
49# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
[1070]50# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
[1127]51declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png py rar tga TRMA txt vbs wav wmv xaf xml zip)
52declare -a HTTP_TLDS_AND_PAGES=(action ars asp aspx cfm cgi com css de htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x)
[1064]53
[1067]54# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
55# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
56# if you add a new code.
[1127]57declare -a OK_CODES=(200 401 405 406 418 501)
[1067]58declare -a RD_CODES=(301 302 303 307 308)
[1127]59declare -a NG_CODES=(000 400 403 404 410 500 502 503 530)
[1064]60
61# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
62# transcluded text, and if the transclusion fails, then the braces show up in the URL
63ILLEGAL_CHARS="{ }"
64
[1070]65# The shortest URL possible, used for sanity-checking some URLs: http://a.co
66MIN_URL_LENGTH=11
67
[1064]68# These are parallel arrays giving the prefixes that can be used in place of normal external links to
69# some wikis and other sites
[1070]70declare -a INTERWIKI_PREFIXES=(commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies wikiversity wikivoyage wikt wp)
71declare -a INTERWIKI_DOMAINS=(commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org wikiversity.org wikivoyage.org wiktionary.org wikipedia.org)
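# (Illustrative note) These arrays are also parallel, so a matched domain is rewritten with the prefix at
# the same index; e.g. a wikipedia.org link can be expressed as [[wp:Page_name]] because "wp" and
# "wikipedia.org" share the last index.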
[1064]72
73# Variables for keeping track of main loop progress and findings
74LINK_NUM=0
[1070]75EI_LINKS=0
76IW_LINKS=0
[1064]77OK_LINKS=0
[1067]78RD_LINKS=0
[1064]79NG_LINKS=0
80SKIP_UNK_NS=0
81SKIP_JS_PAGE=0
82SKIP_BAD_URL=0
83SKIP_NON_ASCII=0
84SKIP_UNK_SUFFIX=0
85SKIP_UNK_CODE=0
[1070]86SKIP_EXPECT_NG=0
87SKIP_EXPECT_EI=0
88SKIP_EXPECT_IW=0
[1122]89SKIP_HTTPS_UP=0
90SKIP_SLASH_ADD=0
[1127]91SKIP_YOUTU_BE=0
[1064]92FILE_LINKS=0
93PAGE_LINKS=0
94SKIPPED_HEADER_ROW=0
95FINISHED_LIST="no"
[1118]96START_RUN=0
97END_RUN=0
[1064]98
99
100### HELP ###
101# A pseudo-man page. Here is the 80-character rule for the page text:
102# 234567890123456789012345678901234567890123456789012345678901234567890123456789
103function printHelp()
104{
105 cat << EOF
106
107NAME
108 Validate External Links
109
110SYNOPSIS
111 validate_external_links.sh --help
[1070]112 validate_external_links.sh --links URL --output DIR [--exceptions URL]
[1075]113 [--record-ok-links] [--suggest-snapshots] [--take-screenshots FILE]
[1070]114 [--start-url NUM] [--end-url NUM] [--upload FILE]
[1064]115
116DESCRIPTION
117 This script parses a list of external links found in the OniGalore wiki
118 (which is dumped by the Oni2.net domain periodically in a particular
119 format), validates them using the Unix tool 'curl', and produces a report
[1070]120 of which links were "OK" (responded positively to an HTTP query), which
121 were "RD" (responded with a 3xx redirect code), which could be "IW"
122 (interwiki) links, which are "EI" (external internal) links and could be
123 intrawiki links, and which were "NG" (no good; a negative response to the
[1069]124 query). This report can then be automatically uploaded to the location of
[1064]125 your choice. The script can also suggest Internet Archive snapshots for
[1070]126 "NG" links, and take screenshots of "OK" links for visual verification by
127 the reader that the page in question is the one intended to be displayed.
[1064]128
129 You must pass this script the URL at which the list of links is found
[1070]130 (--links) and the path where the directory of logs should be outputted
131 (--output). All other arguments are optional.
[1064]132
133OPTIONS
[1075]134 --help Show this page.
135 --links URL (required) URL from which to download the CSV
136 file with external links. Note that this URL can
137 be a local file if you supply a file:// path.
138 --output DIR (required) Unix path to directory in which Val
139 should place its reports.
140 --exceptions URL In order to remove links from the report which
141 Val finds an issue with, but which you regard as
142 OK, list those desired exceptions in this file.
143 See the sample file exceptions.txt for details.
144 Note that this URL can point to a local file if
145 you supply a file:// path.
146 --record-ok-links Log a link in the report even if its response
147 code is "OK".
[1122]148 --show-added-slashes Report on redirects that simply add a '/' to the
149 end of the URL.
[1127]150 --show-https-upgrades Report on redirects that simply upgrade a
[1122]151 "http://" URL to a "https://" URL.
[1127]152 --show-yt-redirects Report on redirects that expand a youtu.be URL.
[1075]153 --suggest-snapshots Query the Internet Archive for a possible
154 snapshot URL for each "NG" page.
155 --take-screenshots FILE Call the Google Chrome binary at this path to
156 take screenshots of each "OK" page.
157 --start-url NUM Start at this link in the links CSV file.
158 --end-url NUM Stop at this link in the links CSV file.
159 --upload FILE Upload report using the credentials and path
160 given in this local text file. See sftp_login.txt
161 for template.
[1064]162
163BUGS
164 The script cannot properly parse any line in the external links file
165 which contains a comma in the name of the wiki page containing a link.
166 Commas in the link itself are not an issue.
167EOF
168}
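# Example invocation (added for illustration; the URLs and paths below are hypothetical placeholders,
# but the option names are the ones parsed in the SETUP section):
#   ./validate_external_links.sh --links http://example.com/extlinks.csv --output ~/val_reports \
#     --exceptions file:///Users/me/exceptions.txt --suggest-snapshots --show-yt-redirects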
169
170
171### SETUP ###
172# If first argument is a help request, or if nothing was passed in at all, print help page and quit
173if [ "$#" -eq 0 ] || [ "$1" == "--help" ]; then
174 printHelp | less
175 exit 0
176fi
177
178# Parse arguments as long as there are more arguments to process
179while (( "$#" )); do
180 case "$1" in
[1127]181 --links ) LINKS_URL="$2"; shift 2;;
182 --exceptions ) EXCEPT_URL="$2"; shift 2;;
183 --output ) OUTPUT_DIR="$2"; shift 2;;
184 --record-ok-links ) RECORD_OK_LINKS=1; shift;;
185 --show-added-slashes ) SHOW_SLASH=1; shift;;
186 --show-https-upgrades ) SHOW_HTTPS=1; shift;;
187 --show-yt-redirects ) SHOW_YT_RD=1; shift;;
188 --suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;;
189 --take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
190 --start-url ) URL_START=$2; shift 2;;
191 --end-url ) URL_LIMIT=$2; shift 2;;
192 --upload ) UPLOAD_INFO=$2; shift 2;;
193 * ) echo "Invalid argument $1 detected. Aborting."; exit 1;;
[1064]194 esac
195done
196
197# If the required arguments were not supplied, print help page and quit
198if [ -z $LINKS_URL ] || [ -z $OUTPUT_DIR ]; then
[1070]199 echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
[1064]200 exit 2
201fi
202
[1070]203# If user wants screenshots, make sure path to Chrome was passed in and is valid
204if [ $TAKE_PAGE_SHOT -eq 1 ]; then
205 if [ ! -f "$CHROME_PATH" ]; then
206 echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
207 exit 3
208 fi
209fi
210
[1064]211# Check that UPLOAD_INFO exists, if this argument was supplied
212if [ ! -z $UPLOAD_INFO ] && [ ! -f "$UPLOAD_INFO" ]; then
213 echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
[1070]214 exit 4
[1064]215fi
216
217# Check that OUTPUT_DIR is a directory
218if [ ! -d "$OUTPUT_DIR" ]; then
219 echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
[1070]220 exit 5
[1064]221fi
222
223# Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
224SAFE_TIME=$(date "+%Y-%m-%d--%H-%M-%S") # for use in path names
225NICE_TIME=$(TZ=":UTC" date "+%a, %d %b %Y %k:%M:%S GMT") # matches format of LINKS_DATE, which is retrieved from the HTTP headers of the extlinks file
226OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
227OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
228SHOT_PATH="$OUTPUT_PATH/Screenshots"
229LOG_NAME="ValExtLinks report"
230LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt"
231LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf"
232LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm"
233mkdir "$OUTPUT_PATH"
234if [ $TAKE_PAGE_SHOT -eq 1 ]; then
235 mkdir "$SHOT_PATH"
236fi
237
238# Check that 'mkdir' succeeded
239if [ ! -d "$OUTPUT_PATH" ]; then
240 echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_PATH. Aborting."
[1070]241 exit 6
[1064]242fi
243
244# Get date on the file at LINKS_URL and print to log
245LINKS_DATE=$(curl --silent --head $LINKS_URL | grep "Last-Modified")
246if [ -z "$LINKS_DATE" ]; then
247 echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
[1070]248 exit 7
[1064]249fi
250LINKS_DATE=${LINKS_DATE#Last-Modified: }
251
252
253### UTILITY FUNCTIONS ###
254# Writes a plain-text header to TXT log file
255function printTXTheader()
256{
257 valPrint t "Validate External Links report"
258 valPrint t "generated $NICE_TIME"
259 valPrint t "from data of $LINKS_DATE"
260 valPrint t "script by Iritscen (contact: $MY_WIKI_PAGE)"
261 valPrint t ""
262}
263
264# Writes the RTF header to RTF log file, then a centered title, then sets text to left-justified
265function printRTFheader()
266{
267 valPrint r "{\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf820
268{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
269{\colortbl;\red255\green255\blue255;}
270{\*\expandedcolortbl;;}
271\margl1440\margr1440\vieww12600\viewh12100\viewkind0
272\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\qc\partightenfactor0
273
274\f0\fs28 \cf0 \b1 Validate External Links report \b0\\
275generated $NICE_TIME\\
276from data of $LINKS_DATE\\
277script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$MY_WIKI_PAGE\"}}{\fldrslt contact}})
278\\
279\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
280\cf0 "
281}
282
283# Closes the RTF markup of the RTF log file
284function printRTFfooter()
285{
286 valPrint r "}"
287}
288
289# Writes the HTML header to HTML log file
290function printHTMheader()
291{
292 valPrint h "<html>
293<head>
294<title>Validate External Links report</title>
295</head>
296<body>
297<h2>Validate External Links report</h2>
298<h3>generated $NICE_TIME<br />
299from data of $LINKS_DATE<br />
300script by Iritscen (<a href=\"$MY_WIKI_PAGE\" target=\"_blank\">contact</a>)</h3>"
301}
302
303# Closes the HTML markup of the HTML log file
304function printHTMfooter()
305{
306 valPrint h "</body>
307</html>"
308}
309
310# The central logging function. The first parameter is a string composed of one or more characters that
[1070]311# indicate which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
[1119]312# 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 's' means "Print an extra newline at the end." 'w' means "Don't pass console output through 'fmt'" ("fmt" fits the output
313# to an 80-column CLI but can break special formatting and the 'n' option).
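# Usage sketch (added for illustration, based on the flag letters described above):
#   valPrint ctrh "Message"   # prints to the console and appends to the TXT, RTF and HTML logs
#   valPrint cn "Working..."  # prints to the console only, without a trailing newline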
[1064]314function valPrint()
315{
316 if [[ "$1" == *c* ]]; then
317 if [[ "$1" == *n* ]]; then
318 echo -n "$2"
319 elif [[ "$1" == *w* ]]; then
320 echo "$2"
[1119]321 elif [[ "$1" == *s* ]]; then
322 echo -e "$2\n"
[1064]323 else
324 echo "$2" | fmt -w 80
325 fi
326 fi
327 if [[ "$1" == *t* ]]; then
328 if [[ "$1" == *n* ]]; then
329 echo -n "$2" >> "$LOG_TXT"
[1119]330 elif [[ "$1" == *s* ]]; then
331 echo -e "$2\n" >> "$LOG_TXT"
[1064]332 else
333 echo "$2" >> "$LOG_TXT"
334 fi
335 fi
336 if [[ "$1" == *r* ]]; then
337 if [[ "$1" == *n* ]]; then
338 echo "$2" >> "$LOG_RTF"
[1119]339 elif [[ "$1" == *s* ]]; then
340 echo "$2\line\line" >> "$LOG_RTF"
[1064]341 else
[1119]342 echo "$2\line" >> "$LOG_RTF"
[1064]343 fi
344 fi
345 if [[ "$1" == *h* ]]; then
[1119]346 if [[ "$1" == *s* ]]; then
347 echo "$2<tr><td>&nbsp;</td></tr>" >> "$LOG_HTM"
348 elif [[ "$1" == *n* ]]; then
[1064]349 echo "$2" >> "$LOG_HTM"
350 else
351 echo "$2<br />" >> "$LOG_HTM"
352 fi
353 fi
354}
355
356# Pluralize the string in parameter 1 if the number in parameter 2 is not 1
357function pluralCheckNoun()
358{
359 if [ $2 -ne 1 ]; then
360 if [[ $1 =~ x$ ]]; then
361 echo $1es
362 else
363 echo $1s
364 fi
365 else
366 echo $1
367 fi
368}
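# (Illustrative examples) "$(pluralCheckNoun link 1)" yields "link", "$(pluralCheckNoun link 2)" yields
# "links", and a noun ending in "x", such as "box", would become "boxes".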
369
[1067]370# Output "is" if parameter 1 is 1, otherwise "are"
371function pluralCheckIs()
372{
373 if [ $1 -ne 1 ]; then
374 echo "are"
375 else
376 echo "is"
377 fi
378}
379
[1064]380# Output "was" if parameter 1 is 1, otherwise "were"
381function pluralCheckWas()
382{
383 if [ $1 -ne 1 ]; then
384 echo "were"
385 else
386 echo "was"
387 fi
388}
389
[1067]390# Output "a " if parameter 1 is 1, otherwise nothing
391function pluralCheckA()
392{
393 if [ $1 -eq 1 ]; then
394 echo "a "
395 fi
396}
397
398# Output "an " if parameter 1 is 1, otherwise nothing
399function pluralCheckAn()
400{
401 if [ $1 -eq 1 ]; then
402 echo "an "
403 fi
404}
405
[1064]406# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
407# reports being saved to disk have already been closed.
408function uploadReport()
409{
410 valPrint c "Uploading HTML report..."
411
412 SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
413 SFTP_USER_NAME_MARKER="user:"
414 SFTP_PASSWORD_MARKER="pw:"
415 SFTP_PORT_MARKER="port:"
416 SFTP_PATH_MARKER="path:"
417 SFTP_USER_NAME=$(grep $SFTP_USER_NAME_MARKER $UPLOAD_INFO)
418 SFTP_USER_NAME=${SFTP_USER_NAME#$SFTP_USER_NAME_MARKER}
419 SFTP_PASSWORD=$(grep $SFTP_PASSWORD_MARKER $UPLOAD_INFO)
420 SFTP_PASSWORD=${SFTP_PASSWORD#$SFTP_PASSWORD_MARKER}
421 SFTP_PORT=$(grep $SFTP_PORT_MARKER $UPLOAD_INFO)
422 SFTP_PORT=${SFTP_PORT#$SFTP_PORT_MARKER}
423 SFTP_PATH=$(grep $SFTP_PATH_MARKER $UPLOAD_INFO)
424 SFTP_PATH=${SFTP_PATH#$SFTP_PATH_MARKER}
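# (Inferred from the markers above; shown for illustration with placeholder values) The upload-info
# file is expected to contain one line per marker, e.g.:
#   user:myusername
#   pw:mypassword
#   port:22
#   path:public_html/reports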
425
426 expect "$SCRIPT_PATH" "$LOG_HTM" $SFTP_USER_NAME $SFTP_PASSWORD $SFTP_PORT $SFTP_PATH "$LOG_NAME.htm"
427
428 valPrint c "Report was uploaded, unless an error message appears above."
429}
430
431# Prints session summary when script is done
432function wrapupAndExit()
433{
434 # Get off progress line on console, drop down a line from last link in log, and close HTML table
435 valPrint ctr ""
436 valPrint h "</table><br />"
437
438 # If we didn't finish processing the last URL, then the iterator is one too high
439 if [ $FINISHED_LIST != "yes" ]; then
440 let LINK_NUM-=1
441 if [ $FINISHED_LIST == "no" ]; then
442 valPrint ctrh "The session was canceled by the user."
443 fi
444 fi
445
[1118]446 # Generate string with elapsed time
447 END_RUN=$(date +%s)
448 ELAPSED=$(echo $(($END_RUN - $START_RUN)) | awk '{printf "%d min. %d sec. elapsed", int($1/60), int($1%60)}')
449
[1122]450 # Do some math on results of session
[1064]451 LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
[1122]452 LINK_PROBLEMS=$((EI_LINKS+IW_LINKS+RD_LINKS+NG_LINKS))
453 LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
454 LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
[1127]455 TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
[1122]456 LINKS_CHECKED=$((LINKS_PROCESSED-LINK_ERRORS))
457
458 # Print summary header
[1118]459 valPrint ct "Summary ($ELAPSED):"
460 valPrint r "\b1 Summary \b0 ($ELAPSED)"
461 valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
[1123]462 valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT) (there $(pluralCheckWas $FILE_LINKS) $FILE_LINKS file $(pluralCheckNoun link $FILE_LINKS) and $PAGE_LINKS page $(pluralCheckNoun link $PAGE_LINKS))."
[1122]463
464 # Print processed link totals
465 if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
466 if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
[1123]467 if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS processed $(pluralCheckNoun link $LINK_PROBLEMS) had $(pluralCheckAn $LINK_PROBLEMS)$(pluralCheckNoun issue $LINK_PROBLEMS)"; fi
[1127]468 if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; valPrint h "&nbsp;&nbsp;(excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
[1123]469 if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi
470 if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h "&nbsp;&nbsp;(counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi
[1122]471
472 # Print excepted link totals
473 if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi
474 if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
475 if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
476 if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi
477
478 # Print errored link totals
479 if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"; fi
480 if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
[1070]481 if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
[1064]482 if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
483 if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
484 if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
485 if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
[1122]486
487 # Print checked link totals
[1123]488 if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS link $(pluralCheckNoun issue $LINK_PROBLEMS):"; fi
[1122]489 if [ $NG_LINKS -gt 0 ]; then valPrint ctrh "- $NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
490 if [ $RD_LINKS -gt 0 ]; then valPrint ctrh "- $RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
491 if [ $EI_LINKS -gt 0 ]; then valPrint ctrh "- $EI_LINKS $(pluralCheckNoun link $EI_LINKS) that could be intrawiki"; fi
492 if [ $IW_LINKS -gt 0 ]; then valPrint ctrh "- $IW_LINKS $(pluralCheckNoun link $IW_LINKS) that could be interwiki"; fi
493
494 # Close the log files' markup
[1070]495 valPrint trh "ValExtLinks says goodbye."
[1064]496 printRTFfooter
497 printHTMfooter
498
499 # Upload report if this was requested
500 if [ ! -z $UPLOAD_INFO ] && [ $FINISHED_LIST != "no" ]; then
501 uploadReport
502 fi
503
504 # Really quit now
505 valPrint c "ValExtLinks says goodbye."
506 exit 0
507}
508trap wrapupAndExit INT
509
510
511### INITIALIZATION ###
512# Print opening message to console and log files
513valPrint cw "$(tput bold)--Validate External Links--$(tput sgr0)"
514printTXTheader
515printRTFheader
516printHTMheader
517
518# Attempt to download file at LINKS_URL, then check that it succeeded
[1120]519valPrint t "Config:"
520valPrint r "\b1 Config \b0"
521valPrint hn "<h3>Config</h3>"
[1069]522valPrint cwtrh "Downloading list of external links from $LINKS_URL."
[1064]523LINKS_FILE_NAME=$(echo "$LINKS_URL" | sed 's/.*\///')
524LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
525curl --silent -o "$LINKS_FILE" $LINKS_URL
526if [ ! -f "$LINKS_FILE" ]; then
527 echo "The download of $LINKS_URL appears to have failed. Aborting."
528 wrapupAndExit
529fi
530
531# Attempt to download file at EXCEPT_URL, then check that it succeeded
532if [ ! -z $EXCEPT_URL ]; then
[1070]533 valPrint cwtrh "Downloading list of reporting exceptions from $EXCEPT_URL."
[1064]534 EXCEPT_FILE_NAME=$(echo "$EXCEPT_URL" | sed 's/.*\///')
535 EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
536 curl --silent -o "$EXCEPT_FILE" $EXCEPT_URL
537 if [ ! -f "$EXCEPT_FILE" ]; then
538 echo "The download of $EXCEPT_URL appears to have failed. Aborting."
539 wrapupAndExit
540 fi
541fi
542
543# Pipe 'cat' to 'wc' because passing LINKS_FILE to 'wc' would print the file name after the line count
544LINK_COUNT_STRING=$(cat "$LINKS_FILE" | wc -l)
545
546# Number of URLs is number of lines minus one (first line is column header row for the CSV)
547LINK_COUNT=$(echo "${LINK_COUNT_STRING}" | tr -d '[:space:]')
548let LINK_COUNT-=1
549
550# Calculate number of URLs to consider
551if [ $URL_LIMIT -ne 0 ] && [ $URL_LIMIT -lt $LINK_COUNT ]; then
552 valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((URL_LIMIT-URL_START+1)) of them, from link $URL_START to $URL_LIMIT."
553elif [ $URL_START -ne 1 ]; then
554 valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((LINK_COUNT-URL_START+1)) of them, from link $URL_START to $LINK_COUNT."
555else
556 valPrint ctrh "Found $LINK_COUNT links to process. I will be considering all of them."
557fi
558
559# Print settings to console and log
[1070]560declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not report links that are listed in the exceptions file.")
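# (Illustrative note) Every unquoted word above is its own array element while quoted phrases occupy a
# single index, so the assignments below can flip individual phrases; e.g. SETTINGS_MSG[10] is the
# phrase "and will" and SETTINGS_MSG[22] is the word "also".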
[1064]561if [ $TAKE_PAGE_SHOT -eq 0 ]; then SETTINGS_MSG[10]="but will not"; fi
562if [ $RECORD_OK_LINKS -eq 0 ]; then SETTINGS_MSG[22]="not"; fi
563if [ $SUGGEST_SNAPSHOTS -eq 0 ]; then SETTINGS_MSG[26]="will not"; fi
564if [ -z $EXCEPT_URL ] || [ $RECORD_OK_LINKS -eq 1 ]; then SETTINGS_MSG[40]=""; fi
565SETTINGS_STR=${SETTINGS_MSG[@]}
566valPrint ctrh "$SETTINGS_STR"
567valPrint tr "A summary of my findings will be found at the bottom of the report."
568valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
569valPrint trh ""
570
571# Print legend to logs
572valPrint t "Legend:"
573valPrint r "\b1 Legend \b0"
574valPrint hn "<h3>Legend</h3>"
575valPrint trh "OK = URL seems to be working."
[1067]576valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it, because false negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to Iritscen. An NG link should be followed by a link to the Internet Archive's Wayback Machine which may help you repair the link. If the link cannot be repaired, you can disable it on the wiki (which prevents it from showing up in future ValExtLinks reports) by wrapping it in nowiki tags."
577valPrint trh "RD = The server responding to this URL is saying that the page moved and you should instead use the supplied new URL. Some RD links represent minor adjustments in the organization of a web site, and some are soft 404s (the file/page has been removed and you are being redirected to something like the main page of the web site). You will have to look at the new URL yourself to determine if it represents an OK link and the link on the wiki should be updated to this one, or if the desired file/page is actually gone and we need to replace the wiki link with an Internet Archive snapshot link -- or disable the URL if it has not been archived."
[1070]578valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup."
579valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup."
[1064]580valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $HTTP_CODES)."
581valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$HTTP_CODES\"}}{\fldrslt here}} for code reference)."
582valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$HTTP_CODES\" target=\"_blank\">here</a> for code reference)."
583valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $CURL_CODES)."
584valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$CURL_CODES\"}}{\fldrslt here}} for code reference)."
585valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$CURL_CODES\" target=\"_blank\">here</a> for code reference)."
586valPrint trh "IA suggests = Last available snapshot suggested by the Internet Archive."
[1070]587valPrint trh "Try browsing = The Archive occasionally fails to return a snapshot URL even when one exists, so you will need to check for a snapshot manually using this link to the Wayback Machine before concluding that a site has not been archived."
[1064]588valPrint trh ""
589
590
591### MAIN LOOP ###
[1120]592valPrint t "Links:"
593valPrint r "\b1 Links \b0"
594valPrint hn "<h3>Links</h3>"
[1118]595START_RUN=$(date +%s)
[1064]596# Process each line of the .csv in LINKS_FILE
597for LINE in `cat "$LINKS_FILE"`; do
598 let LINK_NUM+=1
599
600 # First line is the column header row for the CSV, so let's verify that the format hasn't changed
601 if [ $SKIPPED_HEADER_ROW -eq 0 ]; then
602 if [ $LINE == "namespace,title,target" ]; then
603 SKIPPED_HEADER_ROW=1
604 LINK_NUM=0 # this line is not a link, so reset the link counter
605 valPrint hn "<table>"
606 continue
607 else
608 valPrint ctrh "Did not find expected header row in $LINKS_FILE. Aborting."
609 wrapupAndExit
610 fi
611 fi
612
613 # Skip this link if we are not at URL_START yet
614 if [ $LINK_NUM -lt $URL_START ]; then
615 continue
616 fi
617
618 # Stop if we are at the limit declared for testing purposes
619 if [ $URL_LIMIT -gt 0 ] && [ $LINK_NUM -gt $URL_LIMIT ]; then
620 FINISHED_LIST="limit"
621 wrapupAndExit
622 fi
623
624 # Print progress to screen
625 if [ $LINK_NUM -gt 1 ]; then
626 printf "\e[1A\n" # erase previous progress message so that new one appears in its place
627 fi
628 valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
629
630 # The number of the namespace is the element before the first comma on the line
631 NS_ID=${LINE%%,*}
632
633 # Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
634 NS_NAME=""
635 a=0
[1069]636 while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
[1118]637 if [ $NS_ID == "NULL" ]; then
638 break
639 elif [ $NS_ID -eq ${NS_IDS[$a]} ]; then
[1064]640 NS_NAME="${NS_NAMES[$a]}"
641 break
642 fi
643 let a+=1
644 done
[1118]645 if [ "$NS_NAME" == "" ]; then
646 if [ $NS_ID == "NULL" ]; then
[1123]647 valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace (and probably the page too) is 'NULL'. Probably the link is no longer in existence on the wiki."
[1118]648 else
[1123]649 valPrint trs "Skipping line $LINK_NUM ('$LINE') because I could not find a name for namespace ID $NS_ID."
[1118]650 fi
[1064]651 let SKIP_UNK_NS+=1
652 continue
653 fi
654
655 # The name of the page is everything between the namespace ID and the next comma on the line (commas
656 # in page names will break this)
657 PAGE_NAME=${LINE#$NS_ID,}
658 PAGE_NAME=${PAGE_NAME%%,*}
659
660 # We don't want to consider wiki pages ending in .js, as the parser cannot reliably isolate URLs in
661 # JavaScript code, so it will return erroneous links
662 PAGE_NAME_SUFFIX=$(echo $PAGE_NAME | sed 's/.*\.//')
663 if [ $PAGE_NAME_SUFFIX == "js" ]; then
[1123]664 valPrint trs "Skipping URL '${LINE#$NS_ID,$PAGE_NAME,}' on line $LINK_NUM because it was found on JavaScript page '$PAGE_NAME'."
[1064]665 let SKIP_JS_PAGE+=1
666 continue
667 fi
668
[1070]669 # Build the full URL and local path of the wiki page from the namespace and page names
[1122]670 FULL_PAGE_PATH=https://$WIKI_PATH/$NS_NAME:$PAGE_NAME
[1070]671 LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
672 # Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it
673 # explicitly breaks the link
674 if [ $NS_ID -eq 0 ]; then
[1122]675 FULL_PAGE_PATH=https://$WIKI_PATH/$PAGE_NAME
[1070]676 LOCAL_PAGE_PATH=$PAGE_NAME
677 fi
678
[1064]679 # The URL being linked to is everything after the previous two fields (this allows commas to be in
680 # the URLs, but a comma in the previous field, the page name, will break this)
681 URL=${LINE#$NS_ID,$PAGE_NAME,}
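# (Illustrative example) Given a CSV line such as "0,Main_Page,http://example.com/some,page.html",
# NS_ID is "0", PAGE_NAME is "Main_Page" and URL is "http://example.com/some,page.html", which is why
# commas inside the URL are harmless but a comma inside the page name is not.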
682
683 # Scan for illegal characters
684 if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
[1123]685 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because it contains characters illegal in a URL."
[1064]686 let SKIP_BAD_URL+=1
687 continue
688 fi
689
690 # Now we need to know if the URL is for a file or a web page. First step is to determine if the
691 # URL ends in a suffix
692 HAS_SUFFIX=0
693
694 # If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
[1070]695 CLEAN_URL=${URL%%\?*}
[1064]696
697 # If the URL ends in something like "#section_15", strip everything from the '#' onward
[1070]698 CLEAN_URL=${CLEAN_URL%%\#*}
[1064]699
700 # 'sed' cannot handle Unicode in my Bash shell, so skip this URL and make the user check it manually
[1070]701 if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
[1123]702 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I cannot handle non-ASCII characters."
[1064]703 let SKIP_NON_ASCII+=1
704 continue
705 fi
706
707 # Isolate the characters after the last period and after the last slash
[1070]708 POST_DOT=$(echo "$CLEAN_URL" | sed 's/.*\.//')
709 POST_SLASH=$(echo "$CLEAN_URL" | sed 's/.*\///')
[1064]710
711 # If the last period comes after the last slash, then the URL ends in a suffix
712 POST_DOT_LENGTH=$(echo | awk -v input=$POST_DOT '{print length(input)}')
713 POST_SLASH_LENGTH=$(echo | awk -v input=$POST_SLASH '{print length(input)}')
714 if [ $POST_DOT_LENGTH -lt $POST_SLASH_LENGTH ]; then
715 HAS_SUFFIX=1
716 else
717 HAS_SUFFIX=0
718 fi
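# (Illustrative example) For "http://example.com/docs/manual.pdf", POST_DOT is "pdf" and POST_SLASH is
# "manual.pdf", so the last period follows the last slash and HAS_SUFFIX becomes 1; for
# "http://example.com/wiki/Main", POST_DOT is "com/wiki/Main", which is longer than POST_SLASH ("Main"),
# so HAS_SUFFIX stays 0.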
719
720 # Now we need to know if the URL is for a file. If the URL ends in a suffix, compare it against the
721 # known URL endings supplied in HTTP_FILES and HTTP_TLDS_AND_PAGES
722 IS_FILE=-1
723 if [ $HAS_SUFFIX -eq 0 ]; then
724 IS_FILE=0
725 else
726 # Turn off case sensitivity while we compare suffixes
727 shopt -s nocasematch
728
[1127]729 # Special case: URLs ending in something like "productID.304297400" are pages, not files, so if
[1064]730 # the URL's suffix is all numbers, we are looking at the end of a web page URL
731 if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
732 IS_FILE=0
733 fi
[1127]734
735 # Special case: URLs ending in a parenthesis, e.g. "ms537113(v=vs.85)", are pages
736 if [[ $POST_DOT =~ ^.*[\(\)]$ ]]; then
737 IS_FILE=0
738 fi
739
740 # Special case: URLs containing a '%', e.g. "10.1007%2FBF00329055", are pages
741 if [[ $POST_DOT == *%* ]]; then
742 IS_FILE=0
743 fi
[1064]744
745 # If we did not identify this URL as a web page above, we need to compare the suffix against known
746 # file extensions
747 if [ $IS_FILE -eq -1 ]; then
748 for EXTENSION in "${HTTP_FILES[@]}"; do
749 if [[ $EXTENSION == $POST_DOT ]]; then # double brackets respect the nocasematch setting
750 IS_FILE=1
751 break
752 fi
753 done
754 fi
755
756 # If we did not identify this URL as a file above, we need to compare the suffix against known
757 # pages and TLDs in order to rule out that this is an unknown file suffix or TLD that the user
758 # needs to add to the HTTP_FILES or HTTP_TLDS_AND_PAGES array
759 if [ $IS_FILE -eq -1 ]; then
760 for PAGE_OR_TLD in "${HTTP_TLDS_AND_PAGES[@]}"; do
761 if [[ $PAGE_OR_TLD == $POST_DOT ]]; then
762 IS_FILE=0
763 break
764 fi
765 done
766 fi
767
768 # Turn case sensitivity back on in Bash
769 shopt -u nocasematch
770 fi
771
772 # If this suffix escaped identification as either a file, page or TLD, inform the user
773 STR_TYPE=""
774 if [ $IS_FILE -eq -1 ]; then
[1123]775 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown URL ending '$POST_DOT'. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
[1064]776 let SKIP_UNK_SUFFIX+=1
777 continue
778 elif [ $IS_FILE -eq 1 ]; then
779 STR_TYPE="file"
780 let FILE_LINKS+=1
781 elif [ $IS_FILE -eq 0 ]; then
782 STR_TYPE="page"
783 let PAGE_LINKS+=1
784 fi
785
786 # Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
787 # issue with sites that require HTTPS
[1123]788 CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{http_code}\n' $URL)
[1064]789 CURL_ERR=$(echo $?)
790 CURL_RESULT=$CURL_CODE
791
792 # Tack 'curl' exit code onto result string if 'curl' returned "000" (no HTTP response)
793 if [ $CURL_CODE == "000" ]; then
794 CURL_RESULT="$CURL_RESULT-$CURL_ERR"
795 fi
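# (Illustrative example) A site that never answers typically yields CURL_CODE "000" plus curl exit code
# 28 (its timeout code), so the logged result would read "000-28".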
796
[1070]797 # Begin to determine our status code for this URL (EI, IW, OK, RD, or NG)
[1064]798 STATUS="??"
[1067]799 NEW_URL=""
[1064]800 INTERWIKI_INDEX=-1
801
[1070]802 # First make sure that this isn't an "external internal" link to our own wiki that can be replaced
803 # by "[[page_name]]". If it uses a special access URL beginning with "/w/", let it pass, as it
804 # probably cannot be replaced by "[[ ]]" markup
805 if [[ $URL == *$WIKI_PATH* ]] && [[ $URL != *$WIKI_PATH/w/* ]]; then
806 STATUS="EI"
807 let EI_LINKS+=1
808 fi
809
810 # If it's not, check if this is a link to a domain that we have an interwiki prefix for
811 if [ $STATUS == "??" ]; then
812 for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
813 if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]] && [[ $URL != *${INTERWIKI_DOMAINS[$i]}/w/* ]]; then
814 STATUS="IW"
815 let IW_LINKS+=1
816 INTERWIKI_INDEX=$i
817 break
818 fi
819 done
820 fi
821
[1069]822 # If we didn't match an interwiki domain, see if the status code is in our "OK" codes list
823 if [ $STATUS == "??" ]; then
824 for CODE in "${OK_CODES[@]}"; do
825 if [[ $CODE == $CURL_CODE ]]; then
826 STATUS="OK"
827 let OK_LINKS+=1
828 break
829 fi
830 done
831 fi
832
[1067]833 # If we didn't get a match with the "OK" codes, check it against the "RD" codes
[1064]834 if [ $STATUS == "??" ]; then
[1067]835 for CODE in "${RD_CODES[@]}"; do
836 if [[ $CODE == $CURL_CODE ]]; then
837 # Get URL header again in order to retrieve the URL we are being redirected to
[1123]838 NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{redirect_url}\n' $URL)
[1067]839
[1122]840 # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
841 # those changes out if the user didn't ask for them
842 URL_HTTP=$(echo $URL | sed -E 's/^https:/http:/')
843 NEW_URL_HTTP=$(echo $NEW_URL | sed -E 's/^https:/http:/')
[1070]844
845 # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
[1122]846 NEW_URL_LENGTH=$(echo | awk -v input=$NEW_URL_HTTP '{print length(input)}')
[1070]847 if [ $NEW_URL_LENGTH -lt $MIN_URL_LENGTH ]; then
[1122]848 NEW_URL_HTTP="[new URL not retrieved]"
[1070]849 fi
850
[1122]851 # Remove slash at end of new URL, if present, so we can filter out the redirects that
852 # merely add an ending slash if the user didn't ask for them
853 NEW_URL_NO_SLASH=$(echo $NEW_URL_HTTP | sed -E 's:/$::')
854
[1127]855 # Detect if this is a youtu.be link simply being expanded by YouTube to the full
856 # youtube.com address
857 YOUTU_BE=0
858 if [[ $URL_HTTP == http://youtu.be* ]] && [[ $NEW_URL_HTTP == http://www.youtube.com* ]]; then
859 YOUTU_BE=1
860 fi
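# (Illustrative examples of the filtering below) "http://example.com" redirecting to
# "https://example.com" or to "http://example.com/" is counted as OK unless the matching --show-*
# option was passed, as is "http://youtu.be/abc123" expanding to a "http://www.youtube.com/..." URL.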
861
[1122]862 # If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
863 # wants those to be reported)
864 if [ $SHOW_HTTPS -eq 0 ] && [ $URL_HTTP == $NEW_URL_HTTP ]; then
[1123]865 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show http->https upgrades, and I was redirected to '$NEW_URL'."
[1069]866 STATUS="OK"
867 let OK_LINKS+=1
[1122]868 let SKIP_HTTPS_UP+=1
869 # If the URLs match besides an added ending slash, then the link is OK (unless user wants
870 # those to be reported)
871 elif [ $SHOW_SLASH -eq 0 ] && [ $URL_HTTP == $NEW_URL_NO_SLASH ]; then
[1123]872 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show added trailing slashes, and I was redirected to '$NEW_URL'."
[1122]873 STATUS="OK"
874 let OK_LINKS+=1
875 let SKIP_SLASH_ADD+=1
[1127]876 elif [ $SHOW_YT_RD -eq 0 ] && [ $YOUTU_BE -eq 1 ]; then
877 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'."
878 STATUS="OK"
879 let OK_LINKS+=1
880 let SKIP_YOUTU_BE+=1
[1069]881 else
882 STATUS="RD"
883 let RD_LINKS+=1
884 fi
[1067]885 break
886 fi
887 done
888 fi
889
890 # If we didn't get a match with the "RD" codes, check it against the "NG" codes
891 if [ $STATUS == "??" ]; then
[1064]892 for CODE in "${NG_CODES[@]}"; do
893 if [[ $CODE == $CURL_CODE ]]; then
894 STATUS="NG"
895 let NG_LINKS+=1
896 break
897 fi
898 done
899 fi
900
901 # If we didn't match a known status code, advise the reader
902 if [ $STATUS == "??" ]; then
[1127]903 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown response code $CURL_CODE."
[1064]904 let SKIP_UNK_CODE+=1
905 continue
906 fi
907
[1070]908 # Check problem links against exceptions file before proceeding
909 if [ $STATUS != "OK" ] && [ ! -z $EXCEPT_URL ]; then
910 # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
911 EXPECT_CODE="$CURL_RESULT"
912 if [ $STATUS == "EI" ]; then
913 EXPECT_CODE="EI"
914 elif [ $STATUS == "IW" ]; then
915 EXPECT_CODE="IW"
916 fi
917
918 # Look for link in exceptions file and make sure its listed result code and wiki page also match
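# (Format inferred from the parsing below, with placeholder values) Each exceptions-file line appears
# to be "code,URL,page", e.g. "404,http://example.com/old_page.html,Main_Page", where the page may be
# "*" to match any page and the code may also be "EI" or "IW".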
[1064]919 GREP_RESULT=$(grep --max-count=1 "$URL" "$EXCEPT_FILE")
[1070]920 EXCEPT_PAGE=${GREP_RESULT##*,}
921 if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == $LOCAL_PAGE_PATH ]; then
922 EXCEPT_CODE=${GREP_RESULT%%,*}
923 if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
[1123]924 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because its expected result, $EXPECT_CODE, is listed in the exceptions file."
[1070]925 if [ $STATUS == "EI" ]; then
926 let SKIP_EXPECT_EI+=1
927 elif [ $STATUS == "IW" ]; then
928 let SKIP_EXPECT_IW+=1
929 else
930 let SKIP_EXPECT_NG+=1
931 fi
932 continue
933 fi
[1064]934 fi
935 fi
936
937 # If appropriate, record this link to the log, with clickable URLs when possible
938 if [ $STATUS != "OK" ] || [ $RECORD_OK_LINKS -eq 1 ]; then
[1125]939 # Prepare 'curl' result in parentheses to print after status code, unless this is an "IW" or "EI"
940 # link, in which case showing the status code doesn't make sense. Adjust spacing after string to
941 # ensure TXT and RTF reports have aligned columns of results.
942 CURL_STR_H=" ($CURL_RESULT)"
943 CURL_STR_T="$CURL_STR_H"
944 CURL_STR_R="$CURL_STR_H "
[1070]945 if [ $STATUS == "IW" ] || [ $STATUS == "EI" ]; then
[1125]946 CURL_STR_H=""
947 CURL_STR_T=" "
948 CURL_STR_R=" "
[1064]949 fi
950
951 # Record link and its wiki page in TXT, RTF, and HTML markup
[1125]952 valPrint t "${STATUS}${CURL_STR_T} $STR_TYPE $URL"
[1064]953 valPrint t " linked from $FULL_PAGE_PATH"
[1125]954 valPrint r "${STATUS}${CURL_STR_R}${RTF_TABS}$STR_TYPE {\field{\*\fldinst{HYPERLINK \"$URL\"}}{\fldrslt $URL}}"
[1064]955 valPrint r " linked from {\field{\*\fldinst{HYPERLINK \"$FULL_PAGE_PATH\"}}{\fldrslt $LOCAL_PAGE_PATH}}"
[1125]956 valPrint hn "<tr><td style=\"white-space:nowrap\">${STATUS}${CURL_STR_H}</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
[1064]957 valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"
958
[1123]959 # Place vertical space here since we won't be printing anything more about this link
[1125]960 if [ $STATUS == "OK" ]; then valPrint tr ""; valPrint hs ""; fi
[1123]961
[1067]962 # Record redirect URL if one was given by a 3xx response page
963 if [ $STATUS == "RD" ]; then
[1119]964 valPrint ts " Server suggests $NEW_URL"
965 valPrint rs " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
966 valPrint hs "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
[1067]967 fi
968
[1070]969 # Notify reader if we can use an intrawiki link for this URL
970 if [ $STATUS == "EI" ]; then
[1075]971 INTRA_PAGE=${URL#*://*/}
[1119]972 valPrint ts " Just use [[$INTRA_PAGE]]"
973 valPrint rs " Just use [[$INTRA_PAGE]]"
974 valPrint hs "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$INTRA_PAGE]]</td></tr>"
[1070]975 fi
976
[1064]977 # Notify reader if we can use an interwiki prefix for this URL
978 if [ $STATUS == "IW" ]; then
[1075]979 INTER_PAGE=$(echo "$URL" | sed 's/.*\///')
[1119]980 valPrint ts " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
981 valPrint rs " You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]"
982 valPrint hs "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$INTER_PAGE]]</td></tr>"
[1064]983 fi
984
985 # Query Internet Archive for latest "OK" snapshot for "NG" page
986 if [ $STATUS == "NG" ] && [ $SUGGEST_SNAPSHOTS -eq 1 ]; then
987 ARCHIVE_QUERY=$(curl --silent --max-time 10 "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")
988
[1118]989 # If a "closest" snapshot was received...
[1066]990 if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
[1118]991 # In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
992 ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')
993
994 # ...isolate "url" property in the response that follows the "closest" tag
995 SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
[1119]996 SNAPSHOT_URL=${SNAPSHOT_URL#*\"url\": \"} # everything after '"url": "'
[1118]997 SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'
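# (Hypothetical response sketch) The expansions above assume the reply contains something like
# ..."closest": {..., "url": "http://web.archive.org/web/<timestamp>/<page URL>", ...}... and peel away
# everything surrounding that "url" value.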
998
[1124]999 # Remove the port 80 part that IA often adds to the URL, as it's superfluous
1000 SNAPSHOT_URL=$(echo $SNAPSHOT_URL | sed 's/:80//')
1001
[1118]1002 # Inform the user of the snapshot URL
[1119]1003 valPrint ts " IA suggests $SNAPSHOT_URL"
1004 valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
1005 valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
[1064]1006 else # ...otherwise give generic Wayback Machine link for this URL
[1119]1007 valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
1008 valPrint rs " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
1009 valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"
[1064]1010 fi
1011 fi
1012 fi
1013
1014 # If this is a page and it seems to be there, take screenshot of it using headless Google Chrome
1015 if [ $IS_FILE -eq 0 ] && [ $TAKE_PAGE_SHOT -eq 1 ] && [ $STATUS == "OK" ]; then
1016 # Sanitize URL by removing "http(s)://" and converting any other colons or slashes to underscores
1017 SHOT_NAME=$(echo "$URL" | sed 's/https*\:\/\///' | sed 'y/:\//__/')
1018 SHOT_FILE="$SHOT_PATH/$SHOT_NAME.png"
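# (Illustrative example) "https://example.com/wiki/Some:Page" becomes the file name
# "example.com_wiki_Some_Page.png" once the protocol is stripped and the remaining ':' and '/'
# characters are transliterated to underscores.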
1019
1020 # Don't take screenshot if we already encountered this page and screenshotted it
1021 if [ ! -f "$SHOT_FILE" ]; then
[1070]1022 "$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 $URL > /dev/null 2>&1
[1064]1023 if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
1024 mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
1025 else
[1119]1026 valPrint trhs "Screenshot of URL $URL seems to have failed!"
[1064]1027 fi
1028 else
[1123]1029 valPrint trhs "Skipping screenshot of URL '$URL' because file '$SHOT_FILE' already exists."
[1064]1030 fi
1031 fi
1032done
1033FINISHED_LIST="yes"
1034wrapupAndExit