Changeset 1069


Ignore:
Timestamp:
Aug 2, 2017, 6:26:48 AM (7 years ago)
Author:
iritscen
Message:

ValExtLinks: IW links are now reported as a separate category from OK links. RD links that merely redirect from http:// to https:// are now regarded as OK.

Location:
Validate External Links
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • Validate External Links/http_codes.txt

    r1068 r1069  
    5454
    5555--5xx: Server Errors--
    56 The server failed to fulfil an apparently valid request.
     56The server failed to fulfill an apparently valid request.
    5757
    5858500 Internal Server Error
  • Validate External Links/validate_external_links.sh

    r1067 r1069  
    104104       (which is dumped by the Oni2.net domain periodically in a particular
    105105       format), validates them using the Unix tool 'curl', and produces a report
    106        of which links were OK (responded to an HTTP query) and which were NG (no
    107        good). This report can then be automatically uploaded to the location of
     106       of which links were OK (responded positively to an HTTP query), which
     107       were RD (responded with a 3xx redirect code), which could be IW (inter-
     108       wiki) links, and which were NG (no good; a negative response to the
     109       query). This report can then be automatically uploaded to the location of
    108110       your choice. The script can also suggest Internet Archive snapshots for
    109111       NG links, and take screenshots of OK links for visual verification by the
     
    121123       --output DIR        Place the folder which will contain the reports and
    122124                           optional screenshots at this path (required)
    123        --exceptions DIR    Don't log an NG link if it is listed in the file
    124                            provided at this path as long as the response code is
    125                            the same as the one associated with the link
    126        --record-ok-links   Log a link in the report whether its response code is
    127                            in the OK_CODES or the NG_CODES array
     125       --exceptions URL    In order to remove links from the list which show as
     126                           NG but which you regard as OK, prepare a plain-text
     127                           file where each line contains a response code being
     128                           returned and the URL returning it, separated by a
     129                           comma, e.g. "403,http://www.example.com" (note that
     130                           this can be a local file if you use the
     131                           file:// protocol)
     132       --record-ok-links   Log a link in the report even if its response code is
     133                           OK
    128134       --suggest-snapshots Query the Internet Archive for a possible snapshot
    129135                           URL for each NG page
     
    416422   if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
    417423   if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
    418    valPrint ctrh "Out of the $LINKS_CHECKED links checked, $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG."
    419    if [ $IW_LINKS -gt 0 ]; then
    420       valPrint ctrh "$IW_LINKS/$OK_LINKS OK $(pluralCheckNoun link $OK_LINKS) $(pluralCheckIs $IW_LINKS) $(pluralCheckAn $IW_LINKS)external $(pluralCheckNoun link $IW_LINKS) that could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS)."
    421    fi
     424   valPrint ctrh "Out of the $LINKS_CHECKED links checked, $IW_LINKS could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS), $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG."
    422425   if [ $SKIP_EXCEPT -gt 0 ]; then
    423426      valPrint ctrh "$SKIP_EXCEPT/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS) went unlisted due to being found in the exceptions file."
     
    446449
    447450# Attempt to download file at LINKS_URL, then check that it succeeded
    448 valPrint ctrh "Downloading list of external links from $LINKS_URL."
     451valPrint cwtrh "Downloading list of external links from $LINKS_URL."
    449452LINKS_FILE_NAME=$(echo "$LINKS_URL" | sed 's/.*\///')
    450453LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
     
    457460# Attempt to download file at EXCEPT_URL, then check that it succeeded
    458461if [ ! -z $EXCEPT_URL ]; then
    459    valPrint ctrh "Downloading list of NG exceptions from $EXCEPT_URL."
     462   valPrint cwtrh "Downloading list of NG exceptions from $EXCEPT_URL."
    460463   EXCEPT_FILE_NAME=$(echo "$EXCEPT_URL" | sed 's/.*\///')
    461464   EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
     
    555558   NS_NAME=""
    556559   a=0
    557    while [ "x${NS_IDS[$a]}" != "x" ] # once this evaluates to "x", the array is done
    558    do
     560   while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
    559561      if [ $NS_ID -eq ${NS_IDS[$a]} ]; then
    560562         NS_NAME="${NS_NAMES[$a]}"
     
    691693   fi
    692694
    693    # Determine if this code is in our "OK" list
     695   # Determine our status code for this URL (IW, OK, RD, or NG)
    694696   STATUS="??"
    695697   NEW_URL=""
    696698   INTERWIKI_INDEX=-1
    697    for CODE in "${OK_CODES[@]}"; do
    698       if [[ $CODE == $CURL_CODE ]]; then
    699          let OK_LINKS+=1
    700 
    701          # Determine if this is a link to a domain that we have an interwiki prefix for
    702          for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
    703             if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]]; then
    704                STATUS="IW"
    705                let IW_LINKS+=1
    706                INTERWIKI_INDEX=$i
    707                break
    708             fi
    709          done
    710 
    711          # If this link is OK and no interwiki advisory is needed, just mark as "OK"
    712          if [ $INTERWIKI_INDEX == -1 ]; then
     699   # First check if this is a link to a domain that we have an interwiki prefix for
     700   for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
     701      if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]]; then
     702         STATUS="IW"
     703         let IW_LINKS+=1
     704         INTERWIKI_INDEX=$i
     705         break
     706      fi
     707   done
     708
     709   # If we didn't match an interwiki domain, see if the status code is in our "OK" codes list
     710   if [ $STATUS == "??" ]; then
     711      for CODE in "${OK_CODES[@]}"; do
     712         if [[ $CODE == $CURL_CODE ]]; then
    713713            STATUS="OK"
     714            let OK_LINKS+=1
     715            break
    714716         fi
    715          break
    716       fi
    717    done
     717      done
     718   fi
    718719
    719720   # If we didn't get a match with the "OK" codes, check it against the "RD" codes
     
    721722      for CODE in "${RD_CODES[@]}"; do
    722723         if [[ $CODE == $CURL_CODE ]]; then
    723             STATUS="RD"
    724             let RD_LINKS+=1
    725 
    726724            # Get URL header again in order to retrieve the URL we are being redirected to
    727725            NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent '"$AGENT"' --max-time 10 --write-out '%{redirect_url}\n' $URL)
    728726
     727            # Check if the redirect URL is just the original URL with https:// instead of http://
     728            # (this happens a lot and is not an important correction to us); if so, just make it "OK"
     729            URL_NO_PROTOCOL=${URL#*://}
     730            NEW_URL_NO_PROTOCOL=${NEW_URL#*://}
     731            if [ $URL_NO_PROTOCOL == $NEW_URL_NO_PROTOCOL ]; then
     732               STATUS="OK"
     733               let OK_LINKS+=1
     734            else
     735               STATUS="RD"
     736               let RD_LINKS+=1
     737            fi
    729738            break
    730739         fi
Note: See TracChangeset for help on using the changeset viewer.