Changeset 1067 for Validate External Links/validate_external_links.sh
- Timestamp: Aug 1, 2017, 7:09:42 PM (7 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
Validate External Links/validate_external_links.sh
r1066 r1067 49 49 declare -a HTTP_TLDS_AND_PAGES=(com net org uk ru de it htm html php pl asp aspx shtml pgi cgi php3 x jsp phtml cfm css action stm js) 50 50 51 # These arrays tells us which HTTP response codes are OK (good) and which are NG (no good). Pages that 52 # return NG codes will not be screenshotted. Remember to update http_codes.txt if you add a new code. 53 declare -a OK_CODES=(200 301 307 401 405 406 501) 54 declare -a NG_CODES=(000 302 403 404 410 500 503) 51 # These arrays tells us which HTTP response codes are OK (good), which are RD (redirections), and which 52 # are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt 53 # if you add a new code. 54 declare -a OK_CODES=(200 401 405 406 501) 55 declare -a RD_CODES=(301 302 303 307 308) 56 declare -a NG_CODES=(000 403 404 410 500 503) 55 57 56 58 # Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using … … 66 68 LINK_NUM=0 67 69 OK_LINKS=0 70 RD_LINKS=0 71 IW_LINKS=0 68 72 NG_LINKS=0 69 73 SKIP_UNK_NS=0 … … 320 324 } 321 325 326 # Output "is" if parameter 1 is 1, otherwise "are" 327 function pluralCheckIs() 328 { 329 if [ $1 -ne 1 ]; then 330 echo "are" 331 else 332 echo "is" 333 fi 334 } 335 322 336 # Output "was" if parameter 1 is 1, otherwise "were" 323 337 function pluralCheckWas() … … 327 341 else 328 342 echo "was" 343 fi 344 } 345 346 # Output "a " if parameter 1 is 1, otherwise nothing 347 function pluralCheckA() 348 { 349 if [ $1 -eq 1 ]; then 350 echo "a " 351 fi 352 } 353 354 # Output "an " if parameter 1 is 1, otherwise nothing 355 function pluralCheckAn() 356 { 357 if [ $1 -eq 1 ]; then 358 echo "an " 329 359 fi 330 360 } … … 386 416 if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi 387 417 if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi 388 
valPrint ctrh "Out of the $LINKS_CHECKED links checked, $OK_LINKS $(pluralCheckWas $OK_LINKS) OK and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG." 418 valPrint ctrh "Out of the $LINKS_CHECKED links checked, $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG." 419 if [ $IW_LINKS -gt 0 ]; then 420 valPrint ctrh "$IW_LINKS/$OK_LINKS OK $(pluralCheckNoun link $OK_LINKS) $(pluralCheckIs $IW_LINKS) $(pluralCheckAn $IW_LINKS)external $(pluralCheckNoun link $IW_LINKS) that could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS)." 421 fi 389 422 if [ $SKIP_EXCEPT -gt 0 ]; then 390 423 valPrint ctrh "$SKIP_EXCEPT/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS) went unlisted due to being found in the exceptions file." … … 467 500 valPrint hn "<h3>Legend</h3>" 468 501 valPrint trh "OK = URL seems to be working." 469 valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it. False negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to Iritscen." 502 valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it, because false negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to Iritscen. An NG link should be followed by a link to the Internet Archive's Wayback Machine which may help you repair the link. If the link cannot be repaired, you can disable it on the wiki (which prevents it from showing up in future ValExtLinks reports) by wrapping it in nowiki tags." 503 valPrint trh "RD = The server responding to this URL is saying that the page moved and you should instead use the supplied new URL. 
Some RD links represent minor adjustments in the organization of a web site, and some are soft 404s (the file/page has been removed and you are being redirected to something like the main page of the web site). You will have to look at the new URL yourself to determine if it represents an OK link and the link on the wiki should be updated to this one, or if the desired file/page is actually gone and we need to replace the wiki link with an Internet Archive snapshot link -- or disable the URL if it has not been archived." 470 504 valPrint trh "IW = URL is working but should be converted to interwiki link using the suggested markup." 471 505 valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $HTTP_CODES)." … … 659 693 # Determine if this code is in our "OK" list 660 694 STATUS="??" 695 NEW_URL="" 661 696 INTERWIKI_INDEX=-1 662 697 for CODE in "${OK_CODES[@]}"; do … … 668 703 if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]]; then 669 704 STATUS="IW" 705 let IW_LINKS+=1 670 706 INTERWIKI_INDEX=$i 671 707 break … … 681 717 done 682 718 683 # If we didn't get a match with the "OK" codes, check it against the "NG" codes 719 # If we didn't get a match with the "OK" codes, check it against the "RD" codes 720 if [ $STATUS == "??" ]; then 721 for CODE in "${RD_CODES[@]}"; do 722 if [[ $CODE == $CURL_CODE ]]; then 723 STATUS="RD" 724 let RD_LINKS+=1 725 726 # Get URL header again in order to retrieve the URL we are being redirected to 727 NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent '"$AGENT"' --max-time 10 --write-out '%{redirect_url}\n' $URL) 728 729 break 730 fi 731 done 732 fi 733 734 # If we didn't get a match with the "RD" codes, check it against the "NG" codes 684 735 if [ $STATUS == "??" 
]; then 685 736 for CODE in "${NG_CODES[@]}"; do … … 720 771 fi 721 772 722 # Stupid hack since the text "IW" is narrower than "OK" or "NG" and it takes an extra tab to get723 # to the desired level of indentation in the RTF log773 # Stupid hack since the text "IW" is narrower than "OK", "RD", or "NG" and it takes an extra tab 774 # to get to the desired level of indentation in the RTF log 724 775 RTF_TABS=" " 725 776 if [ $STATUS == "IW" ]; then … … 734 785 valPrint hn "<tr><td style=\"white-space:nowrap\">$STATUS ($CURL_RESULT)</td><td align=\"right\">$STR_TYPE</td><td><a href=\"$URL\" target=\"_blank\">$URL</a></td></tr>" 735 786 valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>" 787 788 # Record redirect URL if one was given by a 3xx response page 789 if [ $STATUS == "RD" ]; then 790 valPrint t " Server suggests $NEW_URL" 791 valPrint r " Server suggests {\field{\*\fldinst{HYPERLINK \"$NEW_URL\"}}{\fldrslt $NEW_URL}}" 792 valPrint hn "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>" 793 fi 736 794 737 795 # Notify reader if we can use an interwiki prefix for this URL
Note: See TracChangeset for help on using the changeset viewer.