Changeset 1175 for Validate External Links/validate_external_links.sh
- Timestamp:
- Aug 23, 2022, 4:15:48 PM (2 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
Validate External Links/validate_external_links.sh
r1160 r1175 30 30 ### GLOBALS ### 31 31 # Settings -- these will be changed from their defaults by the arguments passed in to the script 32 LINKS_URL="" # use 'curl' to download file with links from this location (can be file://)33 EXCEPT_URL="" # 'curl' will access thiswiki page with a list of exceptions for NG results32 LINKS_URL="" # download external link CSV from this location (can use "file://" protocol) 33 EXCEPT_URL="" # location of wiki page with a list of exceptions for NG results 34 34 OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder 35 35 RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES … … 48 48 49 49 # Fixed strings -- see the occurrences of these variables to learn their purpose 50 AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/88.0.4324.146Safari/537.36"50 AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.134 Safari/537.36" 51 51 ARCHIVE_API="http://archive.org/wayback/available" 52 52 ARCHIVE_GENERIC="https://web.archive.org/web/*" … … 69 69 # These arrays tell the script which suffixes at the ends of URLs represent files and which are pages. 70 70 # This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code. 
71 declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png psd py rar tga TRMA txt vbs wav wmv xaf xcf x ml zip)71 declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png psd py rar tga TRMA txt vbs wav wmv xaf xcf xlsx xml zip) 72 72 declare -a HTTP_TLDS_AND_PAGES=(abstract action ars asp aspx cfm cgi com css de do full htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x) 73 73 … … 563 563 if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi 564 564 565 # Perform exceptions audit 566 EXCEPTION_ISSUES=0 567 valPrint ctrh "Exceptions list audit:" 568 for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do 569 EXCEPT_LINE="${EXCEPT_ARRAY[$i]}" 570 EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/&amp;/\&/g') # copied from exception-matching code 571 572 if [ ${EXCEPT_FOUND[$i]} -eq 0 ]; then 573 EXCEPT_URL="${EXCEPT_LINE#*,}" 574 EXCEPT_URL="${EXCEPT_URL%,*}" 575 EXCEPT_PAGE="${EXCEPT_LINE##*,}" 576 EXCEPT_PAGE="${EXCEPT_PAGE%% *}" 577 if [ "$EXCEPT_PAGE" == "*" ]; then 578 valPrint tr "- The link '$EXCEPT_URL' did not return an error or is not on any page." 579 else 580 valPrint tr "- The link '$EXCEPT_URL' did not return an error or is not on page '$EXCEPT_PAGE'." 581 fi 582 let EXCEPTION_ISSUES+=1 583 elif [ ${EXCEPT_USED[$i]} -eq 0 ]; then 584 EXCEPT_URL="${EXCEPT_LINE#*,}" 585 EXCEPT_URL="${EXCEPT_URL%,*}" 586 EXCEPT_CODE=${EXCEPT_LINE%%,*} 587 valPrint tr "- The link '$EXCEPT_URL' did not return error code $EXCEPT_CODE." 588 let EXCEPTION_ISSUES+=1 589 fi 590 done 591 if [ $EXCEPTION_ISSUES -eq 0 ]; then 592 valPrint ctrh "- No issues found." 
593 else 594 valPrint c "$EXCEPTION_ISSUES exception list $(pluralCheckNoun issue $EXCEPTION_ISSUES) detected (see RTF or TXT report for details)." 595 valPrint h "$EXCEPTION_ISSUES exception list $(pluralCheckNoun issue $EXCEPTION_ISSUES) detected (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for details)." 596 fi 597 565 598 # Print checked link totals 566 599 if [ $LINK_PROBLEMS_NET -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS_NET link $(pluralCheckNoun issue $LINK_PROBLEMS_NET):"; fi … … 630 663 # Transfer to array for easy searching later 631 664 declare -a EXCEPT_ARRAY=($(echo "$EXCEPT_DATA")) 665 666 # Create parallel arrays for marking which exceptions get used later 667 declare -a EXCEPT_USED=() 668 declare -a EXCEPT_FOUND=() 669 for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do 670 EXCEPT_USED+=(0) 671 EXCEPT_FOUND+=(0) 672 done 632 673 fi 633 674 … … 689 730 valPrint r "\b1 Legend \b0" 690 731 valPrint hn "<h3>Legend</h3>" 691 valPrint t "(For guidance in fixing these links, see $WIKI_MAIN. )"692 valPrint r "(For guidance in fixing these links, see {\field{\*\fldinst{HYPERLINK \"$WIKI_MAIN\"}}{\fldrslt here}}. )"693 valPrint h "(For guidance in fixing these links, see <a href=\"$WIKI_MAIN\" target=\"_blank\">here</a>. )"732 valPrint t "(For guidance in fixing these links, see $WIKI_MAIN. The exceptions list is at $EXCEPT_URL.)" 733 valPrint r "(For guidance in fixing these links, see {\field{\*\fldinst{HYPERLINK \"$WIKI_MAIN\"}}{\fldrslt here}}. The exceptions list is {\field{\*\fldinst{HYPERLINK \"$EXCEPT_URL\"}}{\fldrslt here}}.)" 734 valPrint h "(For guidance in fixing these links, see <a href=\"$WIKI_MAIN\" target=\"_blank\">here</a>. 
The exceptions list is <a href=\"$EXCEPT_URL\" target=\"_blank\">here</a>.)" 694 735 valPrint trh "OK = URL seems to be working" 695 736 valPrint trh "NG = URL no longer seems to work" … … 829 870 CLEAN_URL=${CLEAN_URL%%\#*} 830 871 831 # 'sed' cannot handle Unicode in my Bash shell, so skip non-ASCII URL and make user check it872 # 'sed' cannot handle Unicode in my Bash shell, so skip non-ASCII URL and make reader check it 832 873 if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then 833 874 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I cannot handle non-ASCII characters." … … 902 943 fi 903 944 904 # If this suffix escaped identification as either a file, page or TLD, inform the user945 # If this suffix escaped identification as either a file, page or TLD, inform the reader 905 946 STR_TYPE="" 906 947 if [ $IS_FILE -eq -1 ]; then … … 1065 1106 # Check problem links against exceptions list before proceeding 1066 1107 FOUND_EXCEPT=0 1067 if [ $STATUS != "OK" ] && [ ! -z $EXCEPT_URL]; then1108 if [ $STATUS != "OK" ] && [ ! -z "$EXCEPT_URL" ]; then 1068 1109 # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW" 1069 1110 EXPECT_CODE="$CURL_RESULT" … … 1083 1124 EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/&amp;/\&/g') 1084 1125 1085 # Match URL1126 # Check for URL match 1086 1127 EXCEPT_URL="${EXCEPT_LINE#*,}" 1087 1128 EXCEPT_URL="${EXCEPT_URL%,*}" … … 1090 1131 fi 1091 1132 1092 # Match containing page's name1133 # Check for page name match 1093 1134 EXCEPT_PAGE="${EXCEPT_LINE##*,}" 1094 1135 EXCEPT_PAGE="${EXCEPT_PAGE%% *}" 1095 if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == $LOCAL_PAGE_PATH ]; then 1096 # Match result code 1136 if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == "$LOCAL_PAGE_PATH" ]; then 1137 let EXCEPT_FOUND[$i]+=1 1138 valPrint trs "Found exception '$URL' on page '$LOCAL_PAGE_PATH'." 
1139 1140 # Check for result code match 1097 1141 EXCEPT_CODE=${EXCEPT_LINE%%,*} 1098 1142 if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then 1143 FOUND_EXCEPT=1 1144 let EXCEPT_USED[$i]+=1 1099 1145 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because its expected result, '$EXPECT_CODE', is in the exceptions list." 1146 1100 1147 if [ $STATUS == "EI" ]; then 1101 1148 let SKIP_EXPECT_EI+=1 … … 1107 1154 let SKIP_EXPECT_NG+=1 1108 1155 fi 1109 FOUND_EXCEPT=1 1156 1110 1157 break 1111 1158 fi … … 1181 1228 ARCHIVE_QUERY=$(curl --silent --max-time $TIMEOUT "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES") 1182 1229 1183 # Notify user if we hit the rate limit and just keep going1230 # Notify reader if we hit the rate limit and just keep going 1184 1231 if [[ "$ARCHIVE_QUERY" == "*Too Many Requests*" ]]; then 1185 1232 valPrint t " IA has rate-limited us!" 1186 1233 valPrint r " IA has rate-limited us!" 1187 1234 valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td>(hit the API rate limit!)</td></tr>" 1188 # If a "closest" snapshot was received, inform user1235 # If a "closest" snapshot was received, inform reader 1189 1236 elif [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then 1190 1237 # In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it … … 1199 1246 SNAPSHOT_URL=$(echo $SNAPSHOT_URL | sed 's/:80//') 1200 1247 1201 # Inform the user of the snapshot URL1248 # Inform the reader of the snapshot URL 1202 1249 valPrint ts " IA suggests $SNAPSHOT_URL" 1203 1250 valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
Note:
See TracChangeset
for help on using the changeset viewer.