Ignore:
Timestamp:
Mar 20, 2020, 11:13:48 PM (5 years ago)
Author:
iritscen
Message:

Val now links to wiki pages using HTTPS instead of HTTP. Fixed the code that exempts trivial redirects (those that merely add a trailing slash to the URL or upgrade "http://" to "https://") from being listed. The new arguments --show-added-slashes and --show-https-upgrade allow one to turn off these exemptions. Reworked the summary section extensively to make it more readable.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • Validate External Links/validate_external_links.sh

    r1120 r1122  
    1919OUTPUT_DIR=""       # place reports and all other output in a folder inside this existing folder
    2020RECORD_OK_LINKS=0   # record response code to the log even when it's a value in OK_CODES
     21SHOW_SLASH=0        # record response code to the log when a slash is added to the end of a URL
     22SHOW_HTTPS=0        # record response code to the log when "http" is upgraded to "https"
    2123SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
    2224TAKE_PAGE_SHOT=0    # take a screenshot of each OK page
     
    3537EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
    3638HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt"
    37 MY_WIKI_PAGE="http://wiki.oni2.net/User:Iritscen"
     39MY_WIKI_PAGE="https://wiki.oni2.net/User:Iritscen"
    3840THIS_DIR=$(cd $(dirname $0); pwd)
    3941WORKING_DIR=$(pwd)
     
    8486SKIP_EXPECT_EI=0
    8587SKIP_EXPECT_IW=0
     88SKIP_HTTPS_UP=0
     89SKIP_SLASH_ADD=0
    8690FILE_LINKS=0
    8791PAGE_LINKS=0
     
    140144       --record-ok-links       Log a link in the report even if its response
    141145                               code is "OK".
     146       --show-added-slashes    Report on redirects that simply add a '/' to the
     147                               end of the URL.
     148       --show-https-upgrade    Report on redirects that simply upgrade a
     149                               "http://" URL to a "https://" URL.
    142150       --suggest-snapshots     Query the Internet Archive for a possible
    143151                               snapshot URL for each "NG" page.
     
    168176while (( "$#" )); do
    169177   case "$1" in
    170       --links )             LINKS_URL="$2";                     shift 2;;
    171       --exceptions )        EXCEPT_URL="$2";                    shift 2;;
    172       --output )            OUTPUT_DIR="$2";                    shift 2;;
    173       --record-ok-links )   RECORD_OK_LINKS=1;                  shift;;
    174       --suggest-snapshots ) SUGGEST_SNAPSHOTS=1;                shift;;
    175       --take-screenshots )  TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
    176       --start-url )         URL_START=$2;                       shift 2;;
    177       --end-url )           URL_LIMIT=$2;                       shift 2;;
    178       --upload )            UPLOAD_INFO=$2;                     shift 2;;
    179       * )                   echo "Invalid argument $1 detected. Aborting."; exit 1;;
     178      --links )              LINKS_URL="$2";                     shift 2;;
     179      --exceptions )         EXCEPT_URL="$2";                    shift 2;;
     180      --output )             OUTPUT_DIR="$2";                    shift 2;;
     181      --record-ok-links )    RECORD_OK_LINKS=1;                  shift;;
     182      --show-added-slashes ) SHOW_SLASH=1;                       shift;;
     183      --show-https-upgrade ) SHOW_HTTPS=1;                       shift;;
     184      --suggest-snapshots )  SUGGEST_SNAPSHOTS=1;                shift;;
     185      --take-screenshots )   TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
     186      --start-url )          URL_START=$2;                       shift 2;;
     187      --end-url )            URL_LIMIT=$2;                       shift 2;;
     188      --upload )             UPLOAD_INFO=$2;                     shift 2;;
     189      * )                    echo "Invalid argument $1 detected. Aborting."; exit 1;;
    180190  esac
    181191done
     
    434444   ELAPSED=$(echo $(($END_RUN - $START_RUN)) | awk '{printf "%d min. %d sec. elapsed", int($1/60), int($1%60)}')
    435445
    436    # Output results of session and close the log file's markup
     446   # Do some math on results of session
    437447   LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
    438    LINKS_SKIPPED=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
    439    LINKS_CHECKED=$((LINKS_PROCESSED-LINKS_SKIPPED))
     448   LINK_PROBLEMS=$((EI_LINKS+IW_LINKS+RD_LINKS+NG_LINKS))
     449   LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
     450   LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
     451   TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP))
     452   LINKS_CHECKED=$((LINKS_PROCESSED-LINK_ERRORS))
     453
     454   # Print summary header
    440455   valPrint ct "Summary ($ELAPSED):"
    441456   valPrint r "\b1 Summary \b0 ($ELAPSED)"
    442457   valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
    443    valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT)."
    444    valPrint ctrh "I skipped $LINKS_SKIPPED $(pluralCheckNoun link $LINKS_SKIPPED), and found $FILE_LINKS $(pluralCheckNoun file $FILE_LINKS) and $PAGE_LINKS $(pluralCheckNoun page $PAGE_LINKS)."
    445    if [ $LINKS_SKIPPED -gt 0 ]; then valPrint ctrh "Skip breakdown: "; fi
    446    if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
     458   valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT) (there were $FILE_LINKS file $(pluralCheckNoun link $FILE_LINKS) and $PAGE_LINKS page $(pluralCheckNoun link $PAGE_LINKS))."
     459
     460   # Print processed link totals
     461   if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
     462   if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
     463   if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS processed $(pluralCheckNoun link $LINK_PROBLEMS) had issues"; fi
     464   if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "  (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
     465   if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) were OK"; fi
     466   if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctrh "  (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi
     467
     468   # Print excepted link totals
     469   if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi
     470   if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
     471   if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
     472   if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi
     473
     474   # Print errored link totals
     475   if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"; fi
     476   if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
    447477   if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
    448478   if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
     
    450480   if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
    451481   if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
    452    valPrint ctrh "Out of the $LINKS_CHECKED links checked, $EI_LINKS could be $(pluralCheckAn $EI_LINKS)intrawiki $(pluralCheckNoun link $EI_LINKS), $IW_LINKS could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS), $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG."
    453    if [ $SKIP_EXPECT_NG -gt 0 ]; then
    454       valPrint ctrh "$SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS) went unlisted due to being found in the exceptions file."
    455    fi
    456    if [ $SKIP_EXPECT_EI -gt 0 ]; then
    457       valPrint ctrh "$SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS) went unlisted due to being found in the exceptions file."
    458    fi
    459    if [ $SKIP_EXPECT_IW -gt 0 ]; then
    460       valPrint ctrh "$SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS) went unlisted due to being found in the exceptions file."
    461    fi
     482
     483   # Print checked link totals
     484   if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS link $(pluralCheckNoun issues $LINKS_CHECKED):"; fi
     485   if [ $NG_LINKS -gt 0 ]; then valPrint ctrh "- $NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
     486   if [ $RD_LINKS -gt 0 ]; then valPrint ctrh "- $RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
     487   if [ $EI_LINKS -gt 0 ]; then valPrint ctrh "- $EI_LINKS $(pluralCheckNoun link $EI_LINKS) that could be intrawiki"; fi
     488   if [ $IW_LINKS -gt 0 ]; then valPrint ctrh "- $IW_LINKS $(pluralCheckNoun link $IW_LINKS) that could be interwiki"; fi
     489
     490   # Close the log files' markup
    462491   valPrint trh "ValExtLinks says goodbye."
    463492   printRTFfooter
     
    635664
    636665   # Build longer wiki page URLs from namespace and page names
    637    FULL_PAGE_PATH=http://$WIKI_PATH/$NS_NAME:$PAGE_NAME
     666   FULL_PAGE_PATH=https://$WIKI_PATH/$NS_NAME:$PAGE_NAME
    638667   LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
    639668   # Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it
    640669   # explicitly breaks the link
    641670   if [ $NS_ID -eq 0 ]; then
    642       FULL_PAGE_PATH=http://$WIKI_PATH/$PAGE_NAME
     671      FULL_PAGE_PATH=https://$WIKI_PATH/$PAGE_NAME
    643672      LOCAL_PAGE_PATH=$PAGE_NAME
    644673   fi
     
    795824            NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent '"$AGENT"' --max-time 10 --write-out '%{redirect_url}\n' $URL)
    796825
    797             # Filter out cases where the redirect URL is just the original URL with https:// instead of
    798             # http://, or with an added '/' at the end. These corrections happen a lot and are not
    799             # important to us.
    800             URL_NO_PROTOCOL=${URL#http://}
    801             URL_NO_PROTOCOL=${URL_NO_PROTOCOL%/}
    802             NEW_URL_NO_PROTOCOL=${NEW_URL#https://}
    803             NEW_URL_NO_PROTOCOL=${NEW_URL_NO_PROTOCOL%/}
     826            # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
     827            # those changes out if the user didn't ask for them
     828            URL_HTTP=$(echo $URL | sed -E 's/^https:/http:/')
     829            NEW_URL_HTTP=$(echo $NEW_URL | sed -E 's/^https:/http:/')
    804830
    805831            # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
    806             NEW_URL_LENGTH=$(echo | awk -v input=$NEW_URL_NO_PROTOCOL '{print length(input)}')
     832            NEW_URL_LENGTH=$(echo | awk -v input=$NEW_URL_HTTP '{print length(input)}')
    807833            if [ $NEW_URL_LENGTH -lt $MIN_URL_LENGTH ]; then
    808                NEW_URL_NO_PROTOCOL="[new URL not retrieved]"
     834               NEW_URL_HTTP="[new URL not retrieved]"
    809835            fi
    810836
    811             # If the URLs match after the above filters were applied, then the link is OK
    812             if [ $URL_NO_PROTOCOL == $NEW_URL_NO_PROTOCOL ]; then
     837            # Remove slash at end of new URL, if present, so we can filter out the redirects that
     838            # merely add an ending slash if the user didn't ask for them
     839            NEW_URL_NO_SLASH=$(echo $NEW_URL_HTTP | sed -E 's:/$::')
     840
     841            # If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
     842            # wants those to be reported)
     843            if [ $SHOW_HTTPS -eq 0 ] && [ $URL_HTTP == $NEW_URL_HTTP ]; then
     844               valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because we have not been asked to show http->https upgrades, and we were redirected to $NEW_URL."
    813845               STATUS="OK"
    814846               let OK_LINKS+=1
     847               let SKIP_HTTPS_UP+=1
     848            # If the URLs match besides an added ending slash, then the link is OK (unless user wants
     849            # those to be reported)
     850            elif [ $SHOW_SLASH -eq 0 ] && [ $URL_HTTP == $NEW_URL_NO_SLASH ]; then
     851               valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because we have not been asked to show added trailing slashes, and we were redirected to $NEW_URL."
     852               STATUS="OK"
     853               let OK_LINKS+=1
     854               let SKIP_SLASH_ADD+=1
    815855            else
    816856               STATUS="RD"
Note: See TracChangeset for help on using the changeset viewer.