Changeset 1142


Ignore:
Timestamp:
Sep 4, 2020, 5:07:08 AM (4 years ago)
Author:
iritscen
Message:

Val now tries each URL three times. This has proven more effective than giving Val a long timeout and trying each URL once. The summary report has been refined a bit; the most notable change is that the final number and breakdown of link issues leave out the excepted links. Also stopped Val from getting confused by HTML-encoded '&'s in the exceptions list.

Location:
Validate External Links
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • Validate External Links/validate_external_links.command

    r1139 r1142  
    2323
    2424# Run with start/end URLs, record OK codes, and don't upload
    25 #bash "$VALEXTLINKS" --links "$LINKS_ONLINE" --exceptions "$EXCEPT_ONLINE" --output "$REPORT_DIR" --record-ok-links --suggest-snapshots --start-url 1920 --end-url 1930
     25#bash "$VALEXTLINKS" --links "$LINKS_ONLINE" --exceptions "$EXCEPT_ONLINE" --output "$REPORT_DIR" --record-ok-links --suggest-snapshots --start-url 2560 --end-url 2570
    2626
    2727# Run with local extlinks and exceptions, start/end URLs, record OK codes, and don't upload
     
    3232
    3333# Normal run with no upload
    34 #bash "$VALEXTLINKS" --links "$LINKS_ONLINE" --exceptions "$EXCEPT_ONLINE" --output "$REPORT_DIR" --suggest-snapshots --skip-archive-links --timeout 20
     34#bash "$VALEXTLINKS" --links "$LINKS_ONLINE" --exceptions "$EXCEPT_ONLINE" --output "$REPORT_DIR" --suggest-snapshots --skip-archive-links --timeout 10
    3535
    3636# Normal run
    37 bash "$VALEXTLINKS" --links "$LINKS_ONLINE" --exceptions "$EXCEPT_ONLINE" --output "$REPORT_DIR" --suggest-snapshots --skip-archive-links --timeout 20 --upload "$UPLOAD_INFO"
     37bash "$VALEXTLINKS" --links "$LINKS_ONLINE" --exceptions "$EXCEPT_ONLINE" --output "$REPORT_DIR" --suggest-snapshots --skip-archive-links --timeout 10 --upload "$UPLOAD_INFO"
  • Validate External Links/validate_external_links.sh

    r1141 r1142  
    77# - RTF (for reading as a local file with clickable links)
    88# - HTML (for uploading as a web page).
    9 # Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
     9# Call script with "--help" argument for documentation. Also see Read Me.rtf for critical notes.
    1010#
    1111# Recommended rule:
     
    4242TIMEOUT=10           # time to wait for a response when querying a site
    4343CHROME_PATH=""       # path to a copy of Google Chrome that has the command-line screenshot feature
    44 URL_START=1          # start at this URL in LINKS_FILE (1 by default)
     44URL_START=1          # start at this URL in LINKS_FILE
    4545URL_LIMIT=0          # if non-zero, stop at this URL in LINKS_FILE
    4646UPLOAD_INFO=""       # path to a file on your hard drive with the login info needed to upload a report
    4747
    4848# Fixed strings -- see the occurrences of these variables to learn their purpose
    49 AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36 OPR/69.0.3686.77"
     49AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 OPR/70.0.3728.154"
    5050ARCHIVE_API="http://archive.org/wayback/available"
    5151ARCHIVE_GENERIC="https://web.archive.org/web/*"
     
    104104SKIP_UNK_CODE=0
    105105SKIP_EXPECT_NG=0
     106SKIP_EXPECT_RD=0
    106107SKIP_EXPECT_EI=0
    107108SKIP_EXPECT_IW=0
     
    181182                               take screenshots of each "OK" page.
    182183       --timeout NUM           Wait this many seconds for a site to respond. The
    183                                default is 10.
     184                               default is 10. Important note: Val will attempt
     185                               to reach each URL three times, so the time taken
     186                               to ping an unresponsive site will be three times
     187                               this setting.
    184188       --start-url NUM         Start at this link in the links CSV file.
    185189       --end-url NUM           Stop at this link in the links CSV file.
     
    480484   # Do some math on results of session
    481485   LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
    482    LINK_PROBLEMS=$((EI_LINKS+IW_LINKS+RD_LINKS+NG_LINKS))
     486   TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
    483487   LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
    484    LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
    485    TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
    486    LINKS_CHECKED=$((LINKS_PROCESSED-LINK_ERRORS))
     488   LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_RD+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
     489   LINK_PROBLEMS_TOTAL=$((NG_LINKS+RD_LINKS+EI_LINKS+IW_LINKS))
     490   LINK_PROBLEMS_NG=$((NG_LINKS-SKIP_EXPECT_NG))
     491   LINK_PROBLEMS_RD=$((RD_LINKS-SKIP_EXPECT_RD))
     492   LINK_PROBLEMS_EI=$((EI_LINKS-SKIP_EXPECT_EI))
     493   LINK_PROBLEMS_IW=$((IW_LINKS-SKIP_EXPECT_IW))
     494   LINK_PROBLEMS_NET=$((LINK_PROBLEMS_NG+LINK_PROBLEMS_RD+LINK_PROBLEMS_EI+LINK_PROBLEMS_IW))
    487495
    488496   ## SUMMARY OUTPUT ##
     
    496504   if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
    497505   if [ $SKIP_ARCHIVE_ORG -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVE_ORG Archive.org $(pluralCheckNoun link $SKIP_ARCHIVE_ORG) were not checked"; fi
    498    if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS processed $(pluralCheckNoun link $LINK_PROBLEMS) had $(pluralCheckAn $LINK_PROBLEMS)$(pluralCheckNoun issue $LINK_PROBLEMS)"; fi
    499    if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr "  (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; valPrint h "  (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
     506   if [ $LINK_PROBLEMS_TOTAL -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_TOTAL processed $(pluralCheckNoun link $LINK_PROBLEMS_TOTAL) had $(pluralCheckAn $LINK_PROBLEMS_TOTAL)$(pluralCheckNoun issue $LINK_PROBLEMS_TOTAL)"; fi
     507   if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr "  (excepted $LINKS_EXCEPTED link $(pluralCheckNoun issue $LINKS_EXCEPTED) from report)"; valPrint h "  (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
    500508   if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi
    501509   if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr "  (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h "  (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi
    502 
    503    # Print excepted link totals
    504    if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi
    505    if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
    506    if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
    507    if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi
    508510
    509511   # Print errored link totals
     
    516518   if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
    517519
     520   # Print excepted link totals
     521   if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi
     522   if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
     523   if [ $SKIP_EXPECT_RD -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_RD/$RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
     524   if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
     525   if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi
     526
    518527   # Print checked link totals
    519    if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS link $(pluralCheckNoun issue $LINK_PROBLEMS):"; fi
    520    if [ $NG_LINKS -gt 0 ]; then valPrint ctrh "- $NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
    521    if [ $RD_LINKS -gt 0 ]; then valPrint ctrh "- $RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
    522    if [ $EI_LINKS -gt 0 ]; then valPrint ctrh "- $EI_LINKS $(pluralCheckNoun link $EI_LINKS) that could be intrawiki"; fi
    523    if [ $IW_LINKS -gt 0 ]; then valPrint ctrh "- $IW_LINKS $(pluralCheckNoun link $IW_LINKS) that could be interwiki"; fi
     528   if [ $LINK_PROBLEMS_NET -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS_NET link $(pluralCheckNoun issue $LINK_PROBLEMS_NET):"; fi
     529   if [ $LINK_PROBLEMS_NG -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_NG NG $(pluralCheckNoun link $LINK_PROBLEMS_NG)"; fi
     530   if [ $LINK_PROBLEMS_RD -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_RD $(pluralCheckNoun redirection $LINK_PROBLEMS_RD)"; fi
     531   if [ $LINK_PROBLEMS_EI -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_EI $(pluralCheckNoun link $LINK_PROBLEMS_EI) that could be intrawiki"; fi
     532   if [ $LINK_PROBLEMS_IW -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_IW $(pluralCheckNoun link $LINK_PROBLEMS_IW) that could be interwiki"; fi
    524533
    525534   # Close the log files' markup
     
    616625if [ $TAKE_PAGE_SHOT -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
    617626
    618 valPrint ctrhn "Suggest Archive.org snapshots: "
     627valPrint ctrhn "Suggest archive.org snapshots: "
    619628if [ $SUGGEST_SNAPSHOTS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
    620629
     
    862871   # Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
    863872   # issue with sites that require HTTPS
    864    CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent '$AGENT' --max-time $TIMEOUT --write-out '%{http_code}\n' $URL)
     873   CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent '$AGENT' --max-time $TIMEOUT --retry 2 --write-out '%{http_code}\n' $URL)
    865874   CURL_ERR=$(echo $?)
    866875   CURL_RESULT=$CURL_CODE
     
    9981007         EXCEPT_LINE="${EXCEPT_ARRAY[$i]}"
    9991008
     1009         # Undo any HTML-encoding from the wiki page; for now we just worry about the ampersand, as most
     1010         # other HTML-encoded characters are not found in URLs
     1011         EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/\&amp;/\&/')
     1012
    10001013         # Match URL
    10011014         EXCEPT_URL="${EXCEPT_LINE#*,}"
     
    10171030               elif [ $STATUS == "IW" ]; then
    10181031                  let SKIP_EXPECT_IW+=1
     1032               elif [ $STATUS == "RD" ]; then
     1033                  let SKIP_EXPECT_RD+=1
    10191034               else
    10201035                  let SKIP_EXPECT_NG+=1
Note: See TracChangeset for help on using the changeset viewer.