Changeset 1122 for Validate External Links/validate_external_links.sh
- Timestamp: Mar 20, 2020, 11:13:48 PM (5 years ago)
- Files: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
Validate External Links/validate_external_links.sh
r1120 r1122 19 19 OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder 20 20 RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES 21 SHOW_SLASH=0 # record response code to the log when a slash is added to the end of a URL 22 SHOW_HTTPS=0 # record response code to the log when "http" is upgraded to "https" 21 23 SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page 22 24 TAKE_PAGE_SHOT=0 # take a screenshot of each OK page … … 35 37 EXPECT_SCRIPT_NAME="val_expect_sftp.txt" 36 38 HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt" 37 MY_WIKI_PAGE="http ://wiki.oni2.net/User:Iritscen"39 MY_WIKI_PAGE="https://wiki.oni2.net/User:Iritscen" 38 40 THIS_DIR=$(cd $(dirname $0); pwd) 39 41 WORKING_DIR=$(pwd) … … 84 86 SKIP_EXPECT_EI=0 85 87 SKIP_EXPECT_IW=0 88 SKIP_HTTPS_UP=0 89 SKIP_SLASH_ADD=0 86 90 FILE_LINKS=0 87 91 PAGE_LINKS=0 … … 140 144 --record-ok-links Log a link in the report even if its response 141 145 code is "OK". 146 --show-added-slashes Report on redirects that simply add a '/' to the 147 end of the URL. 148 --show-https-upgrade Report on redirects that simply upgrade a 149 "http://" URL to a "https://" URL. 142 150 --suggest-snapshots Query the Internet Archive for a possible 143 151 snapshot URL for each "NG" page. … … 168 176 while (( "$#" )); do 169 177 case "$1" in 170 --links ) LINKS_URL="$2"; shift 2;; 171 --exceptions ) EXCEPT_URL="$2"; shift 2;; 172 --output ) OUTPUT_DIR="$2"; shift 2;; 173 --record-ok-links ) RECORD_OK_LINKS=1; shift;; 174 --suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;; 175 --take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;; 176 --start-url ) URL_START=$2; shift 2;; 177 --end-url ) URL_LIMIT=$2; shift 2;; 178 --upload ) UPLOAD_INFO=$2; shift 2;; 179 * ) echo "Invalid argument $1 detected. 
Aborting."; exit 1;; 178 --links ) LINKS_URL="$2"; shift 2;; 179 --exceptions ) EXCEPT_URL="$2"; shift 2;; 180 --output ) OUTPUT_DIR="$2"; shift 2;; 181 --record-ok-links ) RECORD_OK_LINKS=1; shift;; 182 --show-added-slashes ) SHOW_SLASH=1; shift;; 183 --show-https-upgrade ) SHOW_HTTPS=1; shift;; 184 --suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;; 185 --take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;; 186 --start-url ) URL_START=$2; shift 2;; 187 --end-url ) URL_LIMIT=$2; shift 2;; 188 --upload ) UPLOAD_INFO=$2; shift 2;; 189 * ) echo "Invalid argument $1 detected. Aborting."; exit 1;; 180 190 esac 181 191 done … … 434 444 ELAPSED=$(echo $(($END_RUN - $START_RUN)) | awk '{printf "%d min. %d sec. elapsed", int($1/60), int($1%60)}') 435 445 436 # Output results of session and close the log file's markup446 # Do some math on results of session 437 447 LINKS_PROCESSED=$((LINK_NUM-URL_START+1)) 438 LINKS_SKIPPED=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE)) 439 LINKS_CHECKED=$((LINKS_PROCESSED-LINKS_SKIPPED)) 448 LINK_PROBLEMS=$((EI_LINKS+IW_LINKS+RD_LINKS+NG_LINKS)) 449 LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE)) 450 LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_EI+SKIP_EXPECT_IW)) 451 TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP)) 452 LINKS_CHECKED=$((LINKS_PROCESSED-LINK_ERRORS)) 453 454 # Print summary header 440 455 valPrint ct "Summary ($ELAPSED):" 441 456 valPrint r "\b1 Summary \b0 ($ELAPSED)" 442 457 valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>" 443 valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT)." 444 valPrint ctrh "I skipped $LINKS_SKIPPED $(pluralCheckNoun link $LINKS_SKIPPED), and found $FILE_LINKS $(pluralCheckNoun file $FILE_LINKS) and $PAGE_LINKS $(pluralCheckNoun page $PAGE_LINKS)." 
445 if [ $LINKS_SKIPPED -gt 0 ]; then valPrint ctrh "Skip breakdown: "; fi 446 if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi 458 valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT) (there were $FILE_LINKS file $(pluralCheckNoun link $FILE_LINKS) and $PAGE_LINKS page $(pluralCheckNoun link $PAGE_LINKS))." 459 460 # Print processed link totals 461 if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi 462 if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi 463 if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS processed $(pluralCheckNoun link $LINK_PROBLEMS) had issues"; fi 464 if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh " (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi 465 if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) were OK"; fi 466 if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctrh " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi 467 468 # Print excepted link totals 469 if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi 470 if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi 471 if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi 472 if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi 473 474 # Print errored link totals 475 if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "$LINK_ERRORS link $(pluralCheckNoun error 
$LINK_ERRORS) (see RTF or TXT report for specific links):"; fi 476 if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi 447 477 if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi 448 478 if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi … … 450 480 if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi 451 481 if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi 452 valPrint ctrh "Out of the $LINKS_CHECKED links checked, $EI_LINKS could be $(pluralCheckAn $EI_LINKS)intrawiki $(pluralCheckNoun link $EI_LINKS), $IW_LINKS could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS), $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG." 453 if [ $SKIP_EXPECT_NG -gt 0 ]; then 454 valPrint ctrh "$SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS) went unlisted due to being found in the exceptions file." 455 fi 456 if [ $SKIP_EXPECT_EI -gt 0 ]; then 457 valPrint ctrh "$SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS) went unlisted due to being found in the exceptions file." 458 fi 459 if [ $SKIP_EXPECT_IW -gt 0 ]; then 460 valPrint ctrh "$SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS) went unlisted due to being found in the exceptions file." 
461 fi 482 483 # Print checked link totals 484 if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS link $(pluralCheckNoun issues $LINKS_CHECKED):"; fi 485 if [ $NG_LINKS -gt 0 ]; then valPrint ctrh "- $NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi 486 if [ $RD_LINKS -gt 0 ]; then valPrint ctrh "- $RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi 487 if [ $EI_LINKS -gt 0 ]; then valPrint ctrh "- $EI_LINKS $(pluralCheckNoun link $EI_LINKS) that could be intrawiki"; fi 488 if [ $IW_LINKS -gt 0 ]; then valPrint ctrh "- $IW_LINKS $(pluralCheckNoun link $IW_LINKS) that could be interwiki"; fi 489 490 # Close the log files' markup 462 491 valPrint trh "ValExtLinks says goodbye." 463 492 printRTFfooter … … 635 664 636 665 # Build longer wiki page URLs from namespace and page names 637 FULL_PAGE_PATH=http ://$WIKI_PATH/$NS_NAME:$PAGE_NAME666 FULL_PAGE_PATH=https://$WIKI_PATH/$NS_NAME:$PAGE_NAME 638 667 LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME 639 668 # Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it 640 669 # explicitly breaks the link 641 670 if [ $NS_ID -eq 0 ]; then 642 FULL_PAGE_PATH=http ://$WIKI_PATH/$PAGE_NAME671 FULL_PAGE_PATH=https://$WIKI_PATH/$PAGE_NAME 643 672 LOCAL_PAGE_PATH=$PAGE_NAME 644 673 fi … … 795 824 NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent '"$AGENT"' --max-time 10 --write-out '%{redirect_url}\n' $URL) 796 825 797 # Filter out cases where the redirect URL is just the original URL with https:// instead of 798 # http://, or with an added '/' at the end. These corrections happen a lot and are not 799 # important to us. 
800 URL_NO_PROTOCOL=${URL#http://} 801 URL_NO_PROTOCOL=${URL_NO_PROTOCOL%/} 802 NEW_URL_NO_PROTOCOL=${NEW_URL#https://} 803 NEW_URL_NO_PROTOCOL=${NEW_URL_NO_PROTOCOL%/} 826 # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter 827 # those changes out if the user didn't ask for them 828 URL_HTTP=$(echo $URL | sed -E 's/^https:/http:/') 829 NEW_URL_HTTP=$(echo $NEW_URL | sed -E 's/^https:/http:/') 804 830 805 831 # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config 806 NEW_URL_LENGTH=$(echo | awk -v input=$NEW_URL_NO_PROTOCOL '{print length(input)}') 832 NEW_URL_LENGTH=$(echo | awk -v input=$NEW_URL_HTTP '{print length(input)}') 807 833 if [ $NEW_URL_LENGTH -lt $MIN_URL_LENGTH ]; then 808 NEW_URL_NO_PROTOCOL="[new URL not retrieved]" 834 NEW_URL_HTTP="[new URL not retrieved]" 809 835 fi 810 836 811 # If the URLs match after the above filters were applied, then the link is OK 812 if [ $URL_NO_PROTOCOL == $NEW_URL_NO_PROTOCOL ]; then 837 # Remove slash at end of new URL, if present, so we can filter out the redirects that 838 # merely add an ending slash if the user didn't ask for them 839 NEW_URL_NO_SLASH=$(echo $NEW_URL_HTTP | sed -E 's:/$::') 840 841 # If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user 842 # wants those to be reported) 843 if [ $SHOW_HTTPS -eq 0 ] && [ $URL_HTTP == $NEW_URL_HTTP ]; then 844 valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because we have not been asked to show http->https upgrades, and we were redirected to $NEW_URL."
813 845 STATUS="OK" 814 846 let OK_LINKS+=1 847 let SKIP_HTTPS_UP+=1 848 # If the URLs match besides an added ending slash, then the link is OK (unless user wants 849 # those to be reported) 850 elif [ $SHOW_SLASH -eq 0 ] && [ $URL_HTTP == $NEW_URL_NO_SLASH ]; then 851 valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because we have not been asked to show added trailing slashes, and we were redirected to $NEW_URL." 852 STATUS="OK" 853 let OK_LINKS+=1 854 let SKIP_SLASH_ADD+=1 815 855 else 816 856 STATUS="RD"
Note: See TracChangeset for help on using the changeset viewer.