Ignore:
Timestamp:
Feb 5, 2021, 12:15:20 AM (4 years ago)
Author:
iritscen
Message:

ValExtLinks: Val can now recognize bad YouTube links (no thanks to YouTube). Fixed some math errors. Added error 429 to known codes.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • Validate External Links/validate_external_links.sh

    r1147 r1148  
    4848
    4949# Fixed strings -- see the occurrences of these variables to learn their purpose
    50 AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 OPR/70.0.3728.154"
     50AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/88.0.4324.146 Safari/537.36"
    5151ARCHIVE_API="http://archive.org/wayback/available"
    5252ARCHIVE_GENERIC="https://web.archive.org/web/*"
     
    7777declare -a OK_CODES=(200 401 405 406 418 501)
    7878declare -a RD_CODES=(301 302 303 307 308)
    79 declare -a NG_CODES=(000 400 403 404 410 500 502 503 530)
     79declare -a NG_CODES=(000 400 403 404 410 429 500 502 503 530)
    8080
    8181# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
     
    721721      if [ $LINE == "namespace,title,target" ]; then
    722722         SKIPPED_HEADER_ROW=1
    723          LINK_NUM=0 # this line is it's not a link, so reset the link counter
     723         LINK_NUM=0 # this line is not a link, so reset the link counter
    724724         valPrint hn "<table>"
    725725         continue
     
    769769      fi
    770770      let SKIP_UNK_NS+=1
     771      let PAGE_LINKS+=1
    771772      continue
    772773   fi
     
    783784      valPrint trs "Skipping URL '${LINE#$NS_ID,$PAGE_NAME,}' on line $LINK_NUM because it was found on JavaScript page '$PAGE_NAME'."
    784785      let SKIP_JS_PAGE+=1
     786      let PAGE_LINKS+=1
    785787      continue
    786788   fi
     
    804806      valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because it contains characters illegal in a URL."
    805807      let SKIP_BAD_URL+=1
     808      let PAGE_LINKS+=1
    806809      continue
    807810   fi
     
    811814      valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to check Wayback Machine links."
    812815      let SKIP_ARCHIVE_ORG+=1
     816      let PAGE_LINKS+=1
    813817      continue
    814818   fi
     
    828832      valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I cannot handle non-ASCII characters."
    829833      let SKIP_NON_ASCII+=1
     834      let PAGE_LINKS+=1
    830835      continue
    831836   fi
     
    905910      STR_TYPE="file"
    906911      let FILE_LINKS+=1
    907    elif [ $IS_FILE -eq 0 ]; then
     912   else
    908913      STR_TYPE="page"
    909914      let PAGE_LINKS+=1
     
    953958            STATUS="OK"
    954959            let OK_LINKS+=1
     960
     961            # If this is a YouTube link, we have to look at the actual page source to know if the video
     962            # is good or not
     963            if [[ $URL == *www.youtube.com* ]]; then
     964               PAGE_TEXT=$(curl --silent --insecure --user-agent '$AGENT' --max-time $TIMEOUT $URL | grep "\"simpleText\":\"Video unavailable\"")
     965               if [ ! -z "$PAGE_TEXT" ]; then
     966                  STATUS="NG"
     967                  let OK_LINKS-=1
     968                  let NG_LINKS+=1
     969               fi
     970            fi
    955971            break
    956972         fi
     
    10011017               let OK_LINKS+=1
    10021018               let SKIP_SLASH_ADD+=1
    1003             elif [ $SHOW_YT_RD -eq 0 ] && [ $YOUTU_BE -eq 1 ]; then
    1004                valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'."
    1005                STATUS="OK"
    1006                let OK_LINKS+=1
    1007                let SKIP_YOUTU_BE+=1
     1019            elif [ $YOUTU_BE -eq 1 ]; then
     1020               # We have to look at the actual page source to know if a YouTube video is good or not
     1021               PAGE_TEXT=$(curl --silent --insecure --user-agent '$AGENT' --max-time $TIMEOUT $NEW_URL | grep "\"simpleText\":\"Video unavailable\"")
     1022               if [ ! -z "$PAGE_TEXT" ]; then
     1023                  STATUS="NG"
     1024                  let NG_LINKS+=1
     1025               else
     1026                  if [ $SHOW_YT_RD -eq 0 ]; then
     1027                     valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'."
     1028                     STATUS="OK"
     1029                     let OK_LINKS+=1
     1030                     let SKIP_YOUTU_BE+=1
     1031                  else
     1032                     STATUS="RD"
     1033                     let RD_LINKS+=1
     1034                  fi
     1035               fi
    10081036            else
    10091037               STATUS="RD"
Note: See TracChangeset for help on using the changeset viewer.