Ignore:
Timestamp:
Jun 13, 2021, 10:50:43 PM (4 years ago)
Author:
iritscen
Message:

ValExtLinks: Added argument to 'curl' that prevents some sites from rejecting it. Val now skips archive.is links too when skipping archive.org links.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • Validate External Links/validate_external_links.sh

    r1157 r1158  
    39 39 SUGGEST_SNAPSHOTS_NG=0 # query the Internet Archive for a possible snapshot URL for each NG page
    40 40 SUGGEST_SNAPSHOTS_OK=0 # query the Internet Archive for an existing snapshot of each OK page
    41    CHECK_ARCHIVE_LINKS=0  # check URLs under the archive.org domain
       41 CHECK_ARCHIVE_LINKS=0  # check URLs on archive.org and archive.is
    42 42 TAKE_PAGE_SHOT=0       # take a screenshot of each OK page
    43 43 TIMEOUT=10             # time to wait for a response when querying a site
     
    111 111 SKIP_SLASH_ADD=0
    112 112 SKIP_YOUTU_BE=0
    113     SKIP_ARCHIVE_ORG=0
        113 SKIP_ARCHIVES=0
    114 114 FILE_LINKS=0
    115 115 PAGE_LINKS=0
     
    186 186                                --record-ok-links argument.
    187 187        --check-archive-links   Check links that are already pointing to a page
    188                                    on the Internet Archive. In theory these links
    189                                    should be totally stable and not need validation.
        188                                on the Internet Archive or archive.is (AKA
        189                                archive.today). In theory these links should be
        190                                totally stable and not need validation.
    190 191        --take-screenshots FILE Call the Google Chrome binary at this path to
    191 192                                take screenshots of each "OK" page.
     
    532 533    if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
    533 534    if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
    534        if [ $SKIP_ARCHIVE_ORG -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVE_ORG Archive.org $(pluralCheckNoun link $SKIP_ARCHIVE_ORG) were not checked"; fi
        535    if [ $SKIP_ARCHIVES -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVES archive.org/archive.is $(pluralCheckNoun link $SKIP_ARCHIVES) were not checked"; fi
    535 536    if [ $LINK_PROBLEMS_TOTAL -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_TOTAL processed $(pluralCheckNoun link $LINK_PROBLEMS_TOTAL) had $(pluralCheckAn $LINK_PROBLEMS_TOTAL)$(pluralCheckNoun issue $LINK_PROBLEMS_TOTAL)"; fi
    536 537    if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr "  (excepted $LINKS_EXCEPTED link $(pluralCheckNoun issue $LINKS_EXCEPTED) from report)"; valPrint h "  (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
     
    677 678 if [ $SHOW_YT_RD -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
    678 679
    679     valPrint ctrhn "Check archive.org links: "
        680 valPrint ctrhn "Check archive.org and archive.is links: "
    680 681 if [ $CHECK_ARCHIVE_LINKS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
    681 682
     
    810 811    fi
    811 812
    812        # If we're skipping Archive.org links, see if this is one
    813        if [ $CHECK_ARCHIVE_LINKS -eq 0 ] && [[ $URL == *web.archive.org* ]]; then
    814           valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to check Wayback Machine links."
    815           let SKIP_ARCHIVE_ORG+=1
        813    # If we're skipping archive links, see if this is one
        814    if [ $CHECK_ARCHIVE_LINKS -eq 0 ] && [[ ( $URL == *web.archive.org* || $URL == *archive.is* ) ]]; then
        815       valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to check archive links."
        816       let SKIP_ARCHIVES+=1
    816 817       let PAGE_LINKS+=1
    817 818       continue
     
    917 918    # Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
    918 919    # issue with sites that require HTTPS
    919        CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent '$AGENT' --max-time $TIMEOUT --retry 2 --write-out '%{http_code}\n' $URL)
        920    CURL_CODE=$(curl -o /dev/null --silent --insecure --compressed --head --user-agent '$AGENT' --max-time $TIMEOUT --retry 2 --write-out '%{http_code}\n' $URL)
    920 921    CURL_ERR=$(echo $?)
    921 922    CURL_RESULT=$CURL_CODE
Note: See TracChangeset for help on using the changeset viewer.