Ignore:
Timestamp:
Aug 1, 2017, 7:09:42 PM (7 years ago)
Author:
iritscen
Message:

Val now understands HTTP redirect responses and will report the URL we're redirected to. Also now tallies IW links.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • Validate External Links/validate_external_links.sh

    r1066 r1067  
    4949declare -a HTTP_TLDS_AND_PAGES=(com net org uk ru de it htm html php pl asp aspx shtml pgi cgi php3 x jsp phtml cfm css action stm js)
    5050
    51 # These arrays tells us which HTTP response codes are OK (good) and which are NG (no good). Pages that
    52 # return NG codes will not be screenshotted. Remember to update http_codes.txt if you add a new code.
    53 declare -a OK_CODES=(200 301 307 401 405 406 501)
    54 declare -a NG_CODES=(000 302 403 404 410 500 503)
     51# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
     52# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
     53# if you add a new code.
     54declare -a OK_CODES=(200 401 405 406 501)
     55declare -a RD_CODES=(301 302 303 307 308)
     56declare -a NG_CODES=(000 403 404 410 500 503)
    5557
    5658# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
     
    6668LINK_NUM=0
    6769OK_LINKS=0
     70RD_LINKS=0
     71IW_LINKS=0
    6872NG_LINKS=0
    6973SKIP_UNK_NS=0
     
    320324}
    321325
     326# Output "is" if parameter 1 is 1, otherwise "are"
     327function pluralCheckIs()
     328{
     329   if [ $1 -ne 1 ]; then
     330      echo "are"
     331   else
     332      echo "is"
     333   fi
     334}
     335
    322336# Output "was" if parameter 1 is 1, otherwise "were"
    323337function pluralCheckWas()
     
    327341   else
    328342      echo "was"
     343   fi
     344}
     345
     346# Output "a " if parameter 1 is 1, otherwise nothing
     347function pluralCheckA()
     348{
     349   if [ $1 -eq 1 ]; then
     350      echo "a "
     351   fi
     352}
     353
     354# Output "an " if parameter 1 is 1, otherwise nothing
     355function pluralCheckAn()
     356{
     357   if [ $1 -eq 1 ]; then
     358      echo "an "
    329359   fi
    330360}
     
    386416   if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
    387417   if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
    388    valPrint ctrh "Out of the $LINKS_CHECKED links checked, $OK_LINKS $(pluralCheckWas $OK_LINKS) OK and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG."
     418   valPrint ctrh "Out of the $LINKS_CHECKED links checked, $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG."
     419   if [ $IW_LINKS -gt 0 ]; then
     420      valPrint ctrh "$IW_LINKS/$OK_LINKS OK $(pluralCheckNoun link $OK_LINKS) $(pluralCheckIs $IW_LINKS) $(pluralCheckAn $IW_LINKS)external $(pluralCheckNoun link $IW_LINKS) that could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS)."
     421   fi
    389422   if [ $SKIP_EXCEPT -gt 0 ]; then
    390423      valPrint ctrh "$SKIP_EXCEPT/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS) went unlisted due to being found in the exceptions file."
     
    467500valPrint hn "<h3>Legend</h3>"
    468501valPrint trh "OK = URL seems to be working."
    469 valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it. False negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to Iritscen."
     502valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it, because false negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to Iritscen. An NG link should be followed by a link to the Internet Archive's Wayback Machine which may help you repair the link. If the link cannot be repaired, you can disable it on the wiki (which prevents it from showing up in future ValExtLinks reports) by wrapping it in nowiki tags."
     503valPrint trh "RD = The server responding to this URL is saying that the page moved and you should instead use the supplied new URL. Some RD links represent minor adjustments in the organization of a web site, and some are soft 404s (the file/page has been removed and you are being redirected to something like the main page of the web site). You will have to look at the new URL yourself to determine if it represents an OK link and the link on the wiki should be updated to this one, or if the desired file/page is actually gone and we need to replace the wiki link with an Internet Archive snapshot link -- or disable the URL if it has not been archived."
    470504valPrint trh "IW = URL is working but should be converted to interwiki link using the suggested markup."
    471505valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $HTTP_CODES)."
     
    659693   # Determine if this code is in our "OK" list
    660694   STATUS="??"
     695   NEW_URL=""
    661696   INTERWIKI_INDEX=-1
    662697   for CODE in "${OK_CODES[@]}"; do
     
    668703            if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]]; then
    669704               STATUS="IW"
     705               let IW_LINKS+=1
    670706               INTERWIKI_INDEX=$i
    671707               break
     
    681717   done
    682718
    683    # If we didn't get a match with the "OK" codes, check it against the "NG" codes
     719   # If we didn't get a match with the "OK" codes, check it against the "RD" codes
     720   if [ $STATUS == "??" ]; then
     721      for CODE in "${RD_CODES[@]}"; do
     722         if [[ $CODE == $CURL_CODE ]]; then
     723            STATUS="RD"
     724            let RD_LINKS+=1
     725
     726            # Get URL header again in order to retrieve the URL we are being redirected to
     727            NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent '"$AGENT"' --max-time 10 --write-out '%{redirect_url}\n' $URL)
     728
     729            break
     730         fi
     731      done
     732   fi
     733
     734   # If we didn't get a match with the "RD" codes, check it against the "NG" codes
    684735   if [ $STATUS == "??" ]; then
    685736      for CODE in "${NG_CODES[@]}"; do
     
    720771      fi
    721772
    722       # Stupid hack since the text "IW" is narrower than "OK" or "NG" and it takes an extra tab to get
    723       # to the desired level of indentation in the RTF log
     773      # Stupid hack since the text "IW" is narrower than "OK", "RD", or "NG" and it takes an extra tab
     774      # to get to the desired level of indentation in the RTF log
    724775      RTF_TABS="        "
    725776      if [ $STATUS == "IW" ]; then
     
    734785      valPrint hn "<tr><td style=\"white-space:nowrap\">$STATUS ($CURL_RESULT)</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>"
    735786      valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"
     787
     788      # Record redirect URL if one was given by a 3xx response page
     789      if [ $STATUS == "RD" ]; then
     790         valPrint t "  Server suggests $NEW_URL"
     791         valPrint r "   Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}"
     792         valPrint hn "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
     793      fi
    736794
    737795      # Notify reader if we can use an interwiki prefix for this URL
Note: See TracChangeset for help on using the changeset viewer.