Changeset 1127 for Validate External Links/validate_external_links.sh
- Timestamp: Mar 28, 2020, 3:08:29 AM (5 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
Validate External Links/validate_external_links.sh
r1125 r1127 21 21 SHOW_SLASH=0 # record response code to the log when a slash is added to the end of a URL 22 22 SHOW_HTTPS=0 # record response code to the log when "http" is upgraded to "https" 23 SHOW_YT_RD=0 # record response code to the log when a youtu.be URL is expanded to the full URL 23 24 SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page 24 25 TAKE_PAGE_SHOT=0 # take a screenshot of each OK page … … 48 49 # These arrays tell the script which suffixes at the ends of URLs represent files and which are pages. 49 50 # This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code. 50 declare -a HTTP_FILES=( txt zip wmv jpg png m4a bsl rar oni mp3 mov ONWC vbs TRMA mp4 doc avi log gif pdf dmg exe cpp tga 7z wav east BINA xml dll dae xaf fbx 3ds blend flv csv)51 declare -a HTTP_TLDS_AND_PAGES=( com net org uk ru de it htm html php pl asp aspx shtml pgi cgi php3 x jsp phtml cfm css action stm js)51 declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png py rar tga TRMA txt vbs wav wmv xaf xml zip) 52 declare -a HTTP_TLDS_AND_PAGES=(action ars asp aspx cfm cgi com css de htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x) 52 53 53 54 # These arrays tells us which HTTP response codes are OK (good), which are RD (redirections), and which 54 55 # are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt 55 56 # if you add a new code. 56 declare -a OK_CODES=(200 401 405 406 501)57 declare -a OK_CODES=(200 401 405 406 418 501) 57 58 declare -a RD_CODES=(301 302 303 307 308) 58 declare -a NG_CODES=(000 40 3 404 410 500 503)59 declare -a NG_CODES=(000 400 403 404 410 500 502 503 530) 59 60 60 61 # Characters not allowed in a URL. 
Curly braces are sometimes used on the wiki to build a link using … … 88 89 SKIP_HTTPS_UP=0 89 90 SKIP_SLASH_ADD=0 91 SKIP_YOUTU_BE=0 90 92 FILE_LINKS=0 91 93 PAGE_LINKS=0 … … 146 148 --show-added-slashes Report on redirects that simply add a '/' to the 147 149 end of the URL. 148 --show-https-upgrade 150 --show-https-upgrades Report on redirects that simply upgrade a 149 151 "http://" URL to a "https://" URL. 152 --show-yt-redirects Report on redirects that expand a youtu.be URL. 150 153 --suggest-snapshots Query the Internet Archive for a possible 151 154 snapshot URL for each "NG" page. … … 176 179 while (( "$#" )); do 177 180 case "$1" in 178 --links ) LINKS_URL="$2"; shift 2;; 179 --exceptions ) EXCEPT_URL="$2"; shift 2;; 180 --output ) OUTPUT_DIR="$2"; shift 2;; 181 --record-ok-links ) RECORD_OK_LINKS=1; shift;; 182 --show-added-slashes ) SHOW_SLASH=1; shift;; 183 --show-https-upgrade ) SHOW_HTTPS=1; shift;; 184 --suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;; 185 --take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;; 186 --start-url ) URL_START=$2; shift 2;; 187 --end-url ) URL_LIMIT=$2; shift 2;; 188 --upload ) UPLOAD_INFO=$2; shift 2;; 189 * ) echo "Invalid argument $1 detected. Aborting."; exit 1;; 181 --links ) LINKS_URL="$2"; shift 2;; 182 --exceptions ) EXCEPT_URL="$2"; shift 2;; 183 --output ) OUTPUT_DIR="$2"; shift 2;; 184 --record-ok-links ) RECORD_OK_LINKS=1; shift;; 185 --show-added-slashes ) SHOW_SLASH=1; shift;; 186 --show-https-upgrades ) SHOW_HTTPS=1; shift;; 187 --show-yt-redirects ) SHOW_YT_RD=1; shift;; 188 --suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;; 189 --take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;; 190 --start-url ) URL_START=$2; shift 2;; 191 --end-url ) URL_LIMIT=$2; shift 2;; 192 --upload ) UPLOAD_INFO=$2; shift 2;; 193 * ) echo "Invalid argument $1 detected. 
Aborting."; exit 1;; 190 194 esac 191 195 done … … 449 453 LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE)) 450 454 LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_EI+SKIP_EXPECT_IW)) 451 TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP ))455 TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE)) 452 456 LINKS_CHECKED=$((LINKS_PROCESSED-LINK_ERRORS)) 453 457 … … 462 466 if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi 463 467 if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS processed $(pluralCheckNoun link $LINK_PROBLEMS) had $(pluralCheckAn $LINK_PROBLEMS)$(pluralCheckNoun issue $LINK_PROBLEMS)"; fi 464 if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; valPrint h " nbsp;nbsp;(excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi468 if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; valPrint h " (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi 465 469 if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi 466 470 if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi … … 723 727 shopt -s nocasematch 724 728 725 # Special case: URLs ending in something like " /productID.304297400" are pages, not files, so if729 # Special case: URLs ending in something like "productID.304297400" are pages, not files, so if 726 730 # the URL's suffix is all numbers, we are looking at the end of a web page URL 727 731 if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then 732 IS_FILE=0 733 
fi 734 735 # Special case: URLs ending in a parens, e.g. "ms537113(v=vs.85)", are pages 736 if [[ $POST_DOT =~ ^.*[\(\)]$ ]]; then 737 IS_FILE=0 738 fi 739 740 # Special case: URLs containing a '%', e.g. "10.1007%2FBF00329055", are pages 741 if [[ $POST_DOT == *%* ]]; then 728 742 IS_FILE=0 729 743 fi … … 838 852 # merely add an ending slash if the user didn't ask for them 839 853 NEW_URL_NO_SLASH=$(echo $NEW_URL_HTTP | sed -E 's:/$::') 854 855 # Detect if this is a youtu.be link simply being expanded by YouTube to the full 856 # youtube.com address 857 YOUTU_BE=0 858 if [[ $URL_HTTP == http://youtu.be* ]] && [[ $NEW_URL_HTTP == http://www.youtube.com* ]]; then 859 YOUTU_BE=1 860 fi 840 861 841 862 # If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user … … 853 874 let OK_LINKS+=1 854 875 let SKIP_SLASH_ADD+=1 876 elif [ $SHOW_YT_RD -eq 0 ] && [ $YOUTU_BE -eq 1 ]; then 877 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'." 878 STATUS="OK" 879 let OK_LINKS+=1 880 let SKIP_YOUTU_BE+=1 855 881 else 856 882 STATUS="RD" … … 875 901 # If we didn't match a known status code, advise the reader 876 902 if [ $STATUS == "??" ]; then 877 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown re turncode $CURL_CODE."903 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown response code $CURL_CODE." 878 904 let SKIP_UNK_CODE+=1 879 905 continue
Note: See TracChangeset for help on using the changeset viewer.