Ignore:
Timestamp:
Aug 23, 2022, 4:15:48 PM (2 years ago)
Author:
iritscen
Message:

ValExtLinks: Added an audit feature that tells the user whether any items in the exceptions list are no longer present on the wiki or no longer return the listed error code.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • Validate External Links/validate_external_links.sh

    r1160 r1175  
    3030### GLOBALS ###
    3131# Settings -- these will be changed from their defaults by the arguments passed in to the script
    32 LINKS_URL=""           # use 'curl' to download file with links from this location (can be file://)
    33 EXCEPT_URL=""          # 'curl' will access this wiki page with a list of exceptions for NG results
     32LINKS_URL=""           # download external link CSV from this location (can use "file://" protocol)
     33EXCEPT_URL=""          # location of wiki page with a list of exceptions for NG results
    3434OUTPUT_DIR=""          # place reports and all other output in a folder inside this existing folder
    3535RECORD_OK_LINKS=0      # record response code to the log even when it's a value in OK_CODES
     
    4848
    4949# Fixed strings -- see the occurrences of these variables to learn their purpose
    50 AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/88.0.4324.146 Safari/537.36"
     50AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.134 Safari/537.36"
    5151ARCHIVE_API="http://archive.org/wayback/available"
    5252ARCHIVE_GENERIC="https://web.archive.org/web/*"
     
    6969# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
    7070# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
    71 declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png psd py rar tga TRMA txt vbs wav wmv xaf xcf xml zip)
     71declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png psd py rar tga TRMA txt vbs wav wmv xaf xcf xlsx xml zip)
    7272declare -a HTTP_TLDS_AND_PAGES=(abstract action ars asp aspx cfm cgi com css de do full htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x)
    7373
     
    563563   if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi
    564564
     565   # Perform exceptions audit
     566   EXCEPTION_ISSUES=0
     567   valPrint ctrh "Exceptions list audit:"
     568   for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
     569      EXCEPT_LINE="${EXCEPT_ARRAY[$i]}"
     570      EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/\&amp;/\&/g') # copied from exception-matching code
     571
     572      if [ ${EXCEPT_FOUND[$i]} -eq 0 ]; then
     573         EXCEPT_URL="${EXCEPT_LINE#*,}"
     574         EXCEPT_URL="${EXCEPT_URL%,*}"
     575         EXCEPT_PAGE="${EXCEPT_LINE##*,}"
     576         EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
     577         if [ "$EXCEPT_PAGE" == "*" ]; then
     578            valPrint tr "- The link '$EXCEPT_URL' did not return an error or is not on any page."
     579         else
     580            valPrint tr "- The link '$EXCEPT_URL' did not return an error or is not on page '$EXCEPT_PAGE'."
     581         fi
     582         let EXCEPTION_ISSUES+=1
     583      elif [ ${EXCEPT_USED[$i]} -eq 0 ]; then
     584         EXCEPT_URL="${EXCEPT_LINE#*,}"
     585         EXCEPT_URL="${EXCEPT_URL%,*}"
     586         EXCEPT_CODE=${EXCEPT_LINE%%,*}
     587         valPrint tr "- The link '$EXCEPT_URL' did not return error code $EXCEPT_CODE."
     588         let EXCEPTION_ISSUES+=1
     589      fi
     590   done
     591   if [ $EXCEPTION_ISSUES -eq 0 ]; then
     592      valPrint ctrh "- No issues found."
     593   else
     594      valPrint c "$EXCEPTION_ISSUES exception list $(pluralCheckNoun issue $EXCEPTION_ISSUES) detected (see RTF or TXT report for details)."
     595      valPrint h "$EXCEPTION_ISSUES exception list $(pluralCheckNoun issue $EXCEPTION_ISSUES) detected (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for details)."
     596   fi
     597
    565598   # Print checked link totals
    566599   if [ $LINK_PROBLEMS_NET -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS_NET link $(pluralCheckNoun issue $LINK_PROBLEMS_NET):"; fi
     
    630663   # Transfer to array for easy searching later
    631664   declare -a EXCEPT_ARRAY=($(echo "$EXCEPT_DATA"))
     665
     666   # Create parallel arrays for marking which exceptions get used later
     667   declare -a EXCEPT_USED=()
     668   declare -a EXCEPT_FOUND=()
     669   for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
     670      EXCEPT_USED+=(0)
     671      EXCEPT_FOUND+=(0)
     672   done
    632673fi
    633674
     
    689730valPrint r "\b1 Legend \b0"
    690731valPrint hn "<h3>Legend</h3>"
    691 valPrint t "(For guidance in fixing these links, see $WIKI_MAIN.)"
    692 valPrint r "(For guidance in fixing these links, see {\field{\*\fldinst{HYPERLINK \"$WIKI_MAIN\"}}{\fldrslt here}}.)"
    693 valPrint h "(For guidance in fixing these links, see <a href=\"$WIKI_MAIN\" target=\"_blank\">here</a>.)"
     732valPrint t "(For guidance in fixing these links, see $WIKI_MAIN. The exceptions list is at $EXCEPT_URL.)"
     733valPrint r "(For guidance in fixing these links, see {\field{\*\fldinst{HYPERLINK \"$WIKI_MAIN\"}}{\fldrslt here}}. The exceptions list is {\field{\*\fldinst{HYPERLINK \"$EXCEPT_URL\"}}{\fldrslt here}}.)"
     734valPrint h "(For guidance in fixing these links, see <a href=\"$WIKI_MAIN\" target=\"_blank\">here</a>. The exceptions list is <a href=\"$EXCEPT_URL\" target=\"_blank\">here</a>.)"
    694735valPrint trh "OK = URL seems to be working"
    695736valPrint trh "NG = URL no longer seems to work"
     
    829870   CLEAN_URL=${CLEAN_URL%%\#*}
    830871
    831    # 'sed' cannot handle Unicode in my Bash shell, so skip non-ASCII URL and make user check it
     872   # 'sed' cannot handle Unicode in my Bash shell, so skip non-ASCII URL and make reader check it
    832873   if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
    833874      valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I cannot handle non-ASCII characters."
     
    902943   fi
    903944
    904    # If this suffix escaped identification as either a file, page or TLD, inform the user
     945   # If this suffix escaped identification as either a file, page or TLD, inform the reader
    905946   STR_TYPE=""
    906947   if [ $IS_FILE -eq -1 ]; then
     
    10651106   # Check problem links against exceptions list before proceeding
    10661107   FOUND_EXCEPT=0
    1067    if [ $STATUS != "OK" ] && [ ! -z $EXCEPT_URL ]; then
     1108   if [ $STATUS != "OK" ] && [ ! -z "$EXCEPT_URL" ]; then
    10681109      # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
    10691110      EXPECT_CODE="$CURL_RESULT"
     
    10831124         EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/\&amp;/\&/g')
    10841125
    1085          # Match URL
     1126         # Check for URL match
    10861127         EXCEPT_URL="${EXCEPT_LINE#*,}"
    10871128         EXCEPT_URL="${EXCEPT_URL%,*}"
     
    10901131         fi
    10911132
    1092          # Match containing page's name
     1133         # Check for page name match
    10931134         EXCEPT_PAGE="${EXCEPT_LINE##*,}"
    10941135         EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
    1095          if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == $LOCAL_PAGE_PATH ]; then
    1096             # Match result code
     1136         if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == "$LOCAL_PAGE_PATH" ]; then
     1137            let EXCEPT_FOUND[$i]+=1
     1138            valPrint trs "Found exception '$URL' on page '$LOCAL_PAGE_PATH'."
     1139
     1140            # Check for result code match
    10971141            EXCEPT_CODE=${EXCEPT_LINE%%,*}
    10981142            if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
     1143               FOUND_EXCEPT=1
     1144               let EXCEPT_USED[$i]+=1
    10991145               valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because its expected result, '$EXPECT_CODE', is in the exceptions list."
     1146
    11001147               if [ $STATUS == "EI" ]; then
    11011148                  let SKIP_EXPECT_EI+=1
     
    11071154                  let SKIP_EXPECT_NG+=1
    11081155               fi
    1109                FOUND_EXCEPT=1
     1156
    11101157               break
    11111158            fi
     
    11811228         ARCHIVE_QUERY=$(curl --silent --max-time $TIMEOUT "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")
    11821229
    1183          # Notify user if we hit the rate limit and just keep going
     1230         # Notify reader if we hit the rate limit and just keep going
    11841231         if [[ "$ARCHIVE_QUERY" == "*Too Many Requests*" ]]; then
    11851232            valPrint t "  IA has rate-limited us!"
    11861233            valPrint r "                IA has rate-limited us!"
    11871234            valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td>(hit the API rate limit!)</td></tr>"
    1188          # If a "closest" snapshot was received, inform user
     1235         # If a "closest" snapshot was received, inform reader
    11891236         elif [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
    11901237            # In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
     
    11991246            SNAPSHOT_URL=$(echo $SNAPSHOT_URL | sed 's/:80//')
    12001247
    1201             # Inform the user of the snapshot URL
     1248            # Inform the reader of the snapshot URL
    12021249            valPrint ts "  IA suggests $SNAPSHOT_URL"
    12031250            valPrint rs "               IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
Note: See TracChangeset for help on using the changeset viewer.