Changeset 1148 for Validate External Links/validate_external_links.sh
- Timestamp:
- Feb 5, 2021, 12:15:20 AM (4 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
Validate External Links/validate_external_links.sh
r1147 r1148 48 48 49 49 # Fixed strings -- see the occurrences of these variables to learn their purpose 50 AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_ 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 OPR/70.0.3728.154"50 AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/88.0.4324.146 Safari/537.36" 51 51 ARCHIVE_API="http://archive.org/wayback/available" 52 52 ARCHIVE_GENERIC="https://web.archive.org/web/*" … … 77 77 declare -a OK_CODES=(200 401 405 406 418 501) 78 78 declare -a RD_CODES=(301 302 303 307 308) 79 declare -a NG_CODES=(000 400 403 404 410 500 502 503 530)79 declare -a NG_CODES=(000 400 403 404 410 429 500 502 503 530) 80 80 81 81 # Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using … … 721 721 if [ $LINE == "namespace,title,target" ]; then 722 722 SKIPPED_HEADER_ROW=1 723 LINK_NUM=0 # this line is it'snot a link, so reset the link counter723 LINK_NUM=0 # this line is not a link, so reset the link counter 724 724 valPrint hn "<table>" 725 725 continue … … 769 769 fi 770 770 let SKIP_UNK_NS+=1 771 let PAGE_LINKS+=1 771 772 continue 772 773 fi … … 783 784 valPrint trs "Skipping URL '${LINE#$NS_ID,$PAGE_NAME,}' on line $LINK_NUM because it was found on JavaScript page '$PAGE_NAME'." 784 785 let SKIP_JS_PAGE+=1 786 let PAGE_LINKS+=1 785 787 continue 786 788 fi … … 804 806 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because it contains characters illegal in a URL." 805 807 let SKIP_BAD_URL+=1 808 let PAGE_LINKS+=1 806 809 continue 807 810 fi … … 811 814 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to check Wayback Machine links." 812 815 let SKIP_ARCHIVE_ORG+=1 816 let PAGE_LINKS+=1 813 817 continue 814 818 fi … … 828 832 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I cannot handle non-ASCII characters." 
829 833 let SKIP_NON_ASCII+=1 834 let PAGE_LINKS+=1 830 835 continue 831 836 fi … … 905 910 STR_TYPE="file" 906 911 let FILE_LINKS+=1 907 el if [ $IS_FILE -eq 0 ]; then912 else 908 913 STR_TYPE="page" 909 914 let PAGE_LINKS+=1 … … 953 958 STATUS="OK" 954 959 let OK_LINKS+=1 960 961 # If this is a YouTube link, we have to look at the actual page source to know if the video 962 # is good or not 963 if [[ $URL == *www.youtube.com* ]]; then 964 PAGE_TEXT=$(curl --silent --insecure --user-agent '$AGENT' --max-time $TIMEOUT $URL | grep "\"simpleText\":\"Video unavailable\"") 965 if [ ! -z "$PAGE_TEXT" ]; then 966 STATUS="NG" 967 let OK_LINKS-=1 968 let NG_LINKS+=1 969 fi 970 fi 955 971 break 956 972 fi … … 1001 1017 let OK_LINKS+=1 1002 1018 let SKIP_SLASH_ADD+=1 1003 elif [ $SHOW_YT_RD -eq 0 ] && [ $YOUTU_BE -eq 1 ]; then 1004 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'." 1005 STATUS="OK" 1006 let OK_LINKS+=1 1007 let SKIP_YOUTU_BE+=1 1019 elif [ $YOUTU_BE -eq 1 ]; then 1020 # We have to look at the actual page source to know if a YouTube video is good or not 1021 PAGE_TEXT=$(curl --silent --insecure --user-agent '$AGENT' --max-time $TIMEOUT $NEW_URL | grep "\"simpleText\":\"Video unavailable\"") 1022 if [ ! -z "$PAGE_TEXT" ]; then 1023 STATUS="NG" 1024 let NG_LINKS+=1 1025 else 1026 if [ $SHOW_YT_RD -eq 0 ]; then 1027 valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'." 1028 STATUS="OK" 1029 let OK_LINKS+=1 1030 let SKIP_YOUTU_BE+=1 1031 else 1032 STATUS="RD" 1033 let RD_LINKS+=1 1034 fi 1035 fi 1008 1036 else 1009 1037 STATUS="RD"
Note: See TracChangeset for help on using the changeset viewer.