Ignore:
Timestamp:
Mar 28, 2020, 3:08:29 AM (5 years ago)
Author:
iritscen
Message:

Val now counts redirects from youtu.be to youtube.com as OK links. These links will be reported on if the argument --show-yt-redirects is used. Renamed --show-https-upgrade to --show-https-upgrades for consistency. Also sorted the file and page suffix arrays and added some more items to them. Now handling status codes 400, 418, 502 and 530. Fixed incorrect nbsps in HTML report. Val is no longer confused by URLs ending in '(' or ')', or which contain a '%' towards the end.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • Validate External Links/validate_external_links.sh

    r1125 r1127  
    2121SHOW_SLASH=0        # record response code to the log when a slash is added to the end of a URL
    2222SHOW_HTTPS=0        # record response code to the log when "http" is upgraded to "https"
     23SHOW_YT_RD=0        # record response code to the log when a youtu.be URL is expanded to the full URL
    2324SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
    2425TAKE_PAGE_SHOT=0    # take a screenshot of each OK page
     
    4849# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
    4950# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
    50 declare -a HTTP_FILES=(txt zip wmv jpg png m4a bsl rar oni mp3 mov ONWC vbs TRMA mp4 doc avi log gif pdf dmg exe cpp tga 7z wav east BINA xml dll dae xaf fbx 3ds blend flv csv)
    51 declare -a HTTP_TLDS_AND_PAGES=(com net org uk ru de it htm html php pl asp aspx shtml pgi cgi php3 x jsp phtml cfm css action stm js)
     51declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png py rar tga TRMA txt vbs wav wmv xaf xml zip)
     52declare -a HTTP_TLDS_AND_PAGES=(action ars asp aspx cfm cgi com css de htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x)
    5253
    5354# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
    5455# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
    5556# if you add a new code.
    56 declare -a OK_CODES=(200 401 405 406 501)
     57declare -a OK_CODES=(200 401 405 406 418 501)
    5758declare -a RD_CODES=(301 302 303 307 308)
    58 declare -a NG_CODES=(000 403 404 410 500 503)
     59declare -a NG_CODES=(000 400 403 404 410 500 502 503 530)
    5960
    6061# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
     
    8889SKIP_HTTPS_UP=0
    8990SKIP_SLASH_ADD=0
     91SKIP_YOUTU_BE=0
    9092FILE_LINKS=0
    9193PAGE_LINKS=0
     
    146148       --show-added-slashes    Report on redirects that simply add a '/' to the
    147149                               end of the URL.
    148        --show-https-upgrade    Report on redirects that simply upgrade a
     150       --show-https-upgrades   Report on redirects that simply upgrade a
    149151                               "http://" URL to a "https://" URL.
     152       --show-yt-redirects     Report on redirects that expand a youtu.be URL.
    150153       --suggest-snapshots     Query the Internet Archive for a possible
    151154                               snapshot URL for each "NG" page.
     
    176179while (( "$#" )); do
    177180   case "$1" in
    178       --links )              LINKS_URL="$2";                     shift 2;;
    179       --exceptions )         EXCEPT_URL="$2";                    shift 2;;
    180       --output )             OUTPUT_DIR="$2";                    shift 2;;
    181       --record-ok-links )    RECORD_OK_LINKS=1;                  shift;;
    182       --show-added-slashes ) SHOW_SLASH=1;                       shift;;
    183       --show-https-upgrade ) SHOW_HTTPS=1;                       shift;;
    184       --suggest-snapshots )  SUGGEST_SNAPSHOTS=1;                shift;;
    185       --take-screenshots )   TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
    186       --start-url )          URL_START=$2;                       shift 2;;
    187       --end-url )            URL_LIMIT=$2;                       shift 2;;
    188       --upload )             UPLOAD_INFO=$2;                     shift 2;;
    189       * )                    echo "Invalid argument $1 detected. Aborting."; exit 1;;
     181      --links )               LINKS_URL="$2";                     shift 2;;
     182      --exceptions )          EXCEPT_URL="$2";                    shift 2;;
     183      --output )              OUTPUT_DIR="$2";                    shift 2;;
     184      --record-ok-links )     RECORD_OK_LINKS=1;                  shift;;
     185      --show-added-slashes )  SHOW_SLASH=1;                       shift;;
     186      --show-https-upgrades ) SHOW_HTTPS=1;                       shift;;
     187      --show-yt-redirects )   SHOW_YT_RD=1;                       shift;;
     188      --suggest-snapshots )   SUGGEST_SNAPSHOTS=1;                shift;;
     189      --take-screenshots )    TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
     190      --start-url )           URL_START=$2;                       shift 2;;
     191      --end-url )             URL_LIMIT=$2;                       shift 2;;
     192      --upload )              UPLOAD_INFO=$2;                     shift 2;;
     193      * )                     echo "Invalid argument $1 detected. Aborting."; exit 1;;
    190194  esac
    191195done
     
    449453   LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
    450454   LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
    451    TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP))
     455   TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
    452456   LINKS_CHECKED=$((LINKS_PROCESSED-LINK_ERRORS))
    453457
     
    462466   if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
    463467   if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS processed $(pluralCheckNoun link $LINK_PROBLEMS) had $(pluralCheckAn $LINK_PROBLEMS)$(pluralCheckNoun issue $LINK_PROBLEMS)"; fi
    464    if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr "  (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; valPrint h "nbsp;nbsp;(excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
     468   if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr "  (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; valPrint h "&nbsp;&nbsp;(excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
    465469   if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi
    466470   if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr "  (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h "&nbsp;&nbsp;(counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi
     
    723727      shopt -s nocasematch
    724728
    725       # Special case: URLs ending in something like "/productID.304297400" are pages, not files, so if
     729      # Special case: URLs ending in something like "productID.304297400" are pages, not files, so if
    726730      # the URL's suffix is all numbers, we are looking at the end of a web page URL
    727731      if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
     732         IS_FILE=0
     733      fi
     734
     735      # Special case: URLs ending in a parenthesis, e.g. "ms537113(v=vs.85)", are pages
     736      if [[ $POST_DOT =~ ^.*[\(\)]$ ]]; then
     737         IS_FILE=0
     738      fi
     739
     740      # Special case: URLs containing a '%', e.g. "10.1007%2FBF00329055", are pages
     741      if [[ $POST_DOT == *%* ]]; then
    728742         IS_FILE=0
    729743      fi
     
    838852            # merely add an ending slash if the user didn't ask for them
    839853            NEW_URL_NO_SLASH=$(echo $NEW_URL_HTTP | sed -E 's:/$::')
     854
     855            # Detect if this is a youtu.be link simply being expanded by YouTube to the full
     856            # youtube.com address
     857            YOUTU_BE=0
     858            if [[ $URL_HTTP == http://youtu.be* ]] && [[ $NEW_URL_HTTP == http://www.youtube.com* ]]; then
     859               YOUTU_BE=1
     860            fi
    840861
    841862            # If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
     
    853874               let OK_LINKS+=1
    854875               let SKIP_SLASH_ADD+=1
     876            elif [ $SHOW_YT_RD -eq 0 ] && [ $YOUTU_BE -eq 1 ]; then
     877               valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'."
     878               STATUS="OK"
     879               let OK_LINKS+=1
     880               let SKIP_YOUTU_BE+=1
    855881            else
    856882               STATUS="RD"
     
    875901   # If we didn't match a known status code, advise the reader
    876902   if [ $STATUS == "??" ]; then
    877       valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown return code $CURL_CODE."
     903      valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown response code $CURL_CODE."
    878904      let SKIP_UNK_CODE+=1
    879905      continue
Note: See TracChangeset for help on using the changeset viewer.