Changeset 1142
- Timestamp: Sep 4, 2020, 5:07:08 AM (4 years ago)
- Location: Validate External Links
- Files: 3 edited
Legend:
- Unmodified
- Added
- Removed
-
Validate External Links/validate_external_links.command
r1139 r1142 23 23 24 24 # Run with start/end URLs, record OK codes, and don't upload 25 #bash "$VALEXTLINKS" --links "$LINKS_ONLINE" --exceptions "$EXCEPT_ONLINE" --output "$REPORT_DIR" --record-ok-links --suggest-snapshots --start-url 1920 --end-url 1930 25 #bash "$VALEXTLINKS" --links "$LINKS_ONLINE" --exceptions "$EXCEPT_ONLINE" --output "$REPORT_DIR" --record-ok-links --suggest-snapshots --start-url 2560 --end-url 2570 26 26 27 27 # Run with local extlinks and exceptions, start/end URLs, record OK codes, and don't upload … … 32 32 33 33 # Normal run with no upload 34 #bash "$VALEXTLINKS" --links "$LINKS_ONLINE" --exceptions "$EXCEPT_ONLINE" --output "$REPORT_DIR" --suggest-snapshots --skip-archive-links --timeout 20 34 #bash "$VALEXTLINKS" --links "$LINKS_ONLINE" --exceptions "$EXCEPT_ONLINE" --output "$REPORT_DIR" --suggest-snapshots --skip-archive-links --timeout 10 35 35 36 36 # Normal run 37 bash "$VALEXTLINKS" --links "$LINKS_ONLINE" --exceptions "$EXCEPT_ONLINE" --output "$REPORT_DIR" --suggest-snapshots --skip-archive-links --timeout 20 --upload "$UPLOAD_INFO" 37 bash "$VALEXTLINKS" --links "$LINKS_ONLINE" --exceptions "$EXCEPT_ONLINE" --output "$REPORT_DIR" --suggest-snapshots --skip-archive-links --timeout 10 --upload "$UPLOAD_INFO" -
Validate External Links/validate_external_links.sh
r1141 r1142 7 7 # - RTF (for reading as a local file with clickable links) 8 8 # - HTML (for uploading as a web page). 9 # Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes. 9 # Call script with "--help" argument for documentation. Also see Read Me.rtf for critical notes. 10 10 # 11 11 # Recommended rule: … … 42 42 TIMEOUT=10 # time to wait for a response when querying a site 43 43 CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature 44 URL_START=1 # start at this URL in LINKS_FILE (1 by default) 44 URL_START=1 # start at this URL in LINKS_FILE 45 45 URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE 46 46 UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report 47 47 48 48 # Fixed strings -- see the occurrences of these variables to learn their purpose 49 AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36 OPR/69.0.3686.77" 49 AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 OPR/70.0.3728.154" 50 50 ARCHIVE_API="http://archive.org/wayback/available" 51 51 ARCHIVE_GENERIC="https://web.archive.org/web/*" … … 104 104 SKIP_UNK_CODE=0 105 105 SKIP_EXPECT_NG=0 106 SKIP_EXPECT_RD=0 106 107 SKIP_EXPECT_EI=0 107 108 SKIP_EXPECT_IW=0 … … 181 182 take screenshots of each "OK" page. 182 183 --timeout NUM Wait this many seconds for a site to respond. The 183 default is 10. 184 default is 10. Important note: Val will attempt 185 to reach each URL three times, so the time taken 186 to ping an unresponsive site will be three times 187 this setting. 184 188 --start-url NUM Start at this link in the links CSV file. 185 189 --end-url NUM Stop at this link in the links CSV file.
… … 480 484 # Do some math on results of session 481 485 LINKS_PROCESSED=$((LINK_NUM-URL_START+1)) 482 LINK_PROBLEMS=$((EI_LINKS+IW_LINKS+RD_LINKS+NG_LINKS))486 TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE)) 483 487 LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE)) 484 LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_EI+SKIP_EXPECT_IW)) 485 TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE)) 486 LINKS_CHECKED=$((LINKS_PROCESSED-LINK_ERRORS)) 488 LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_RD+SKIP_EXPECT_EI+SKIP_EXPECT_IW)) 489 LINK_PROBLEMS_TOTAL=$((NG_LINKS+RD_LINKS+EI_LINKS+IW_LINKS)) 490 LINK_PROBLEMS_NG=$((NG_LINKS-SKIP_EXPECT_NG)) 491 LINK_PROBLEMS_RD=$((RD_LINKS-SKIP_EXPECT_RD)) 492 LINK_PROBLEMS_EI=$((EI_LINKS-SKIP_EXPECT_EI)) 493 LINK_PROBLEMS_IW=$((IW_LINKS-SKIP_EXPECT_IW)) 494 LINK_PROBLEMS_NET=$((LINK_PROBLEMS_NG+LINK_PROBLEMS_RD+LINK_PROBLEMS_EI+LINK_PROBLEMS_IW)) 487 495 488 496 ## SUMMARY OUTPUT ## … … 496 504 if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi 497 505 if [ $SKIP_ARCHIVE_ORG -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVE_ORG Archive.org $(pluralCheckNoun link $SKIP_ARCHIVE_ORG) were not checked"; fi 498 if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS processed $(pluralCheckNoun link $LINK_PROBLEMS) had $(pluralCheckAn $LINK_PROBLEMS)$(pluralCheckNoun issue $LINK_PROBLEMS)"; fi499 if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED $(pluralCheckNoun link$LINKS_EXCEPTED) from report)"; valPrint h " (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi506 if [ $LINK_PROBLEMS_TOTAL -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_TOTAL processed $(pluralCheckNoun link $LINK_PROBLEMS_TOTAL) had $(pluralCheckAn $LINK_PROBLEMS_TOTAL)$(pluralCheckNoun issue $LINK_PROBLEMS_TOTAL)"; fi 507 if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr " 
(excepted $LINKS_EXCEPTED link $(pluralCheckNoun issue $LINKS_EXCEPTED) from report)"; valPrint h " (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi 500 508 if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi 501 509 if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi 502 503 # Print excepted link totals504 if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi505 if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi506 if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi507 if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi508 510 509 511 # Print errored link totals … … 516 518 if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi 517 519 520 # Print excepted link totals 521 if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi 522 if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi 523 if [ $SKIP_EXPECT_RD -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_RD/$RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi 524 if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi 525 if [ $SKIP_EXPECT_IW -gt 0 ]; then 
valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi 526 518 527 # Print checked link totals 519 if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS link $(pluralCheckNoun issue $LINK_PROBLEMS):"; fi520 if [ $ NG_LINKS -gt 0 ]; then valPrint ctrh "- $NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi521 if [ $ RD_LINKS -gt 0 ]; then valPrint ctrh "- $RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi522 if [ $ EI_LINKS -gt 0 ]; then valPrint ctrh "- $EI_LINKS $(pluralCheckNoun link $EI_LINKS) that could be intrawiki"; fi523 if [ $ IW_LINKS -gt 0 ]; then valPrint ctrh "- $IW_LINKS $(pluralCheckNoun link $IW_LINKS) that could be interwiki"; fi528 if [ $LINK_PROBLEMS_NET -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS_NET link $(pluralCheckNoun issue $LINK_PROBLEMS_NET):"; fi 529 if [ $LINK_PROBLEMS_NG -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_NG NG $(pluralCheckNoun link $LINK_PROBLEMS_NG)"; fi 530 if [ $LINK_PROBLEMS_RD -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_RD $(pluralCheckNoun redirection $LINK_PROBLEMS_RD)"; fi 531 if [ $LINK_PROBLEMS_EI -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_EI $(pluralCheckNoun link $LINK_PROBLEMS_EI) that could be intrawiki"; fi 532 if [ $LINK_PROBLEMS_IW -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_IW $(pluralCheckNoun link $LINK_PROBLEMS_IW) that could be interwiki"; fi 524 533 525 534 # Close the log files' markup … … 616 625 if [ $TAKE_PAGE_SHOT -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi 617 626 618 valPrint ctrhn "Suggest Archive.org snapshots: "627 valPrint ctrhn "Suggest archive.org snapshots: " 619 628 if [ $SUGGEST_SNAPSHOTS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi 620 629 … … 862 871 # Get response code using 'curl' to see if this link is valid; the --insecure option avoids an 863 872 # issue with sites that require HTTPS 864 CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent '$AGENT' --max-time 
$TIMEOUT --write-out '%{http_code}\n' $URL) 873 CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent '$AGENT' --max-time $TIMEOUT --retry 2 --write-out '%{http_code}\n' $URL) 865 874 CURL_ERR=$(echo $?) 866 875 CURL_RESULT=$CURL_CODE … … 998 1007 EXCEPT_LINE="${EXCEPT_ARRAY[$i]}" 999 1008 1009 # Undo any HTML-encoding from the wiki page; for now we just worry about the ampersand, as most 1010 # other HTML-encoded characters are not found in URLs 1011 EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/\&amp;/\&/') 1012 1000 1013 # Match URL 1001 1014 EXCEPT_URL="${EXCEPT_LINE#*,}" … … 1017 1030 elif [ $STATUS == "IW" ]; then 1018 1031 let SKIP_EXPECT_IW+=1 1032 elif [ $STATUS == "RD" ]; then 1033 let SKIP_EXPECT_RD+=1 1019 1034 else 1020 1035 let SKIP_EXPECT_NG+=1
Note: See TracChangeset for help on using the changeset viewer.