Changeset 1141


Ignore:
Timestamp:
Sep 4, 2020, 4:54:30 AM (4 years ago)
Author:
iritscen
Message:

Committing the changes to Val that I meant to commit over a week ago. My previous commit included everything except the updated script itself, which this commit adds. See the last Val commit message for the list of changes.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • Validate External Links/validate_external_links.sh

    r1137 r1141  
    22
    33# Validate External Links by Iritscen
    4 # Provided with a list of external links in an expected CSV format, this script validates them. The
    5 # resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF (for
    6 # reading as a local file with clickable links), and HTML (for uploading as a web page). Call script
    7 # with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
     4#
     5# Validates a list of external links in CSV format. The resulting logs are produced in three formats:
     6# - TXT (for easy diffing with an earlier log)
     7# - RTF (for reading as a local file with clickable links)
     8# - HTML (for uploading as a web page).
     9# Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
     10#
    811# Recommended rule:
    912# |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|
     13#
     14# Table of contents (sections of script in order of appearance, not execution):
     15# • Globals
     16# • Help Output
     17# • Setup
     18# • Utility Functions
     19# • Summary Output
     20# • Initialization
     21#   • Data Sourcing
     22#   • Config Output
     23#   • Legend Output
     24# • Main Loop
    1025
    1126# Set separator token to newline
     
    2540SKIP_ARCHIVE_LINKS=0 # don't check URLs under the archive.org domain
    2641TAKE_PAGE_SHOT=0     # take a screenshot of each OK page
     42TIMEOUT=10           # time to wait for a response when querying a site
    2743CHROME_PATH=""       # path to a copy of Google Chrome that has the command-line screenshot feature
    2844URL_START=1          # start at this URL in LINKS_FILE (1 by default)
     
    3652ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
    3753CHROME_SCREENSHOT="screenshot.png"
    38 CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt"
    3954EXCEPT_FILE_NAME="exceptions.txt"
    4055EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
    41 HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt"
    42 MY_WIKI_PAGE="https://wiki.oni2.net/User:Iritscen"
     56WIKI_CURL="https://wiki.oni2.net/Validate_External_Links/Curl_codes"
     57WIKI_HTTP="https://wiki.oni2.net/Validate_External_Links/HTTP_codes"
     58WIKI_MAIN="https://wiki.oni2.net/Validate_External_Links"
     59WIKI_ME="http://iritscen.oni2.net"
    4360THIS_DIR=$(cd $(dirname $0); pwd)
    4461WORKING_DIR=$(pwd)
     
    101118
    102119
    103 ### HELP ###
     120### HELP OUTPUT ###
    104121# A pseudo-man page. Here is the 80-character rule for the page text:
    105122# 234567890123456789012345678901234567890123456789012345678901234567890123456789
     
    116133          [--record-ok-links] [--show-added-slashes] [--show-https-upgrades]
    117134          [--show-yt-redirects] [--suggest-snapshots] [--skip-archive-links]
    118           [--take-screenshots FILE] [--start-url NUM] [--end-url NUM]
    119           [--upload FILE]
     135          [--take-screenshots FILE] [--timeout NUM] [--start-url NUM]
     136          [--end-url NUM] [--upload FILE]
    120137
    121138DESCRIPTION
     
    163180       --take-screenshots FILE Call the Google Chrome binary at this path to
    164181                               take screenshots of each "OK" page.
     182       --timeout NUM           Wait this many seconds for a site to respond. The
     183                               default is 10.
    165184       --start-url NUM         Start at this link in the links CSV file.
    166185       --end-url NUM           Stop at this link in the links CSV file.
     
    197216      --skip-archive-links )  SKIP_ARCHIVE_LINKS=1;               shift;;
    198217      --take-screenshots )    TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
     218      --timeout )             TIMEOUT=$2;                         shift 2;;
    199219      --start-url )           URL_START=$2;                       shift 2;;
    200220      --end-url )             URL_LIMIT=$2;                       shift 2;;
     
    267287   valPrint t "generated $NICE_TIME"
    268288   valPrint t "from data of $LINKS_DATE"
    269    valPrint t "script by Iritscen (contact: $MY_WIKI_PAGE)"
     289   valPrint t "script by Iritscen (contact: $WIKI_ME)"
    270290   valPrint t ""
    271291}
     
    284304generated $NICE_TIME\\
    285305from data of $LINKS_DATE\\
    286 script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$MY_WIKI_PAGE\"}}{\fldrslt contact}})
     306script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$WIKI_ME\"}}{\fldrslt contact}})
    287307\\
    288308\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
     
    307327<h3>generated $NICE_TIME<br />
    308328from data of $LINKS_DATE<br />
    309 script by Iritscen (<a href=\"$MY_WIKI_PAGE\" target=\"_blank\">contact</a>)</h3>"
     329script by Iritscen (<a href=\"$WIKI_ME\" target=\"_blank\">contact</a>)</h3>"
    310330}
    311331
     
    319339# The central logging function. The first parameter is a string composed of one or more characters that
    320340# indicate which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
    321 # 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 's' means "Print an extra newline at the end." 'w' means "Don't pass console output through 'fmt'" ("fmt" fits the output
     341# 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 's' means "Print an
     342# extra newline at the end." 'w' means "Don't pass console output through 'fmt'" ("fmt" fits the output
    322343# to an 80-column CLI but can break special formatting and the 'n' option).
    323344function valPrint()
     
    465486   LINKS_CHECKED=$((LINKS_PROCESSED-LINK_ERRORS))
    466487
    467    # Print summary header
     488   ## SUMMARY OUTPUT ##
    468489   valPrint ct "Summary ($ELAPSED):"
    469490   valPrint r "\b1 Summary \b0 ($ELAPSED)"
     
    526547printHTMheader
    527548
     549## DATA SOURCING ##
     550valPrint t "Startup:"
     551valPrint r "\b1 Startup \b0"
     552valPrint hn "<h3>Startup</h3>"
     553
    528554# Attempt to download file at LINKS_URL, then check that it succeeded
    529 valPrint t "Config:"
    530 valPrint r "\b1 Config \b0"
    531 valPrint hn "<h3>Config</h3>"
    532 valPrint cwtrh "Downloading list of external links from $LINKS_URL."
     555valPrint cwtrhn "Downloading list of external links from $LINKS_URL..."
    533556LINKS_FILE_NAME=$(echo "$LINKS_URL" | sed 's/.*\///')
    534557LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
    535558curl --silent -o "$LINKS_FILE" $LINKS_URL
    536559if [ ! -f "$LINKS_FILE" ]; then
    537    echo "The download of $LINKS_URL appears to have failed. Aborting."
     560   echo -e "\nThe download of $LINKS_URL appears to have failed. Aborting."
    538561   wrapupAndExit
     562else
     563   valPrint ctrh " success."
    539564fi
    540565
    541566# Attempt to download file at EXCEPT_URL, then check that it succeeded
    542567if [ ! -z $EXCEPT_URL ]; then
    543    valPrint cwtrh "Downloading list of reporting exceptions from $EXCEPT_URL."
     568   valPrint cwtrhn "Downloading list of reporting exceptions from $EXCEPT_URL..."
    544569   EXCEPT_DATA=$(curl --silent $EXCEPT_URL)
    545570   if [ -z "$EXCEPT_DATA" ]; then
    546       echo "The download of the exceptions data from '$EXCEPT_URL' appears to have failed. Aborting."
     571      echo -e "\nThe download of the exceptions data from '$EXCEPT_URL' appears to have failed. Aborting."
    547572      wrapupAndExit
     573   else
     574      valPrint ctrh " success."
    548575   fi
    549576   EXCEPT_DATA=${EXCEPT_DATA%END LIST*}
     
    564591LINK_COUNT=$(echo "${LINK_COUNT_STRING}" | tr -d '[:space:]')
    565592let LINK_COUNT-=1
    566 
    567 # Calculate number of URLs to consider
     593valPrint ctrh "Found $LINK_COUNT links to process."
     594valPrint trh ""
     595
     596## CONFIG OUTPUT ##
     597valPrint t "Config:"
     598valPrint r "\b1 Config \b0"
     599valPrint hn "<h3>Config</h3>"
     600
     601valPrint ctrhn "Links to consider: "
    568602if [ $URL_LIMIT -ne 0 ] && [ $URL_LIMIT -lt $LINK_COUNT ]; then
    569    valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((URL_LIMIT-URL_START+1)) of them, from link $URL_START to $URL_LIMIT."
     603   valPrint ctrh "$((URL_LIMIT-URL_START+1)), from link $URL_START to $URL_LIMIT"
    570604elif [ $URL_START -ne 1 ]; then
    571    valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((LINK_COUNT-URL_START+1)) of them, from link $URL_START to $LINK_COUNT."
     605   valPrint ctrh "$((LINK_COUNT-URL_START+1)), from link $URL_START to $LINK_COUNT"
    572606else
    573    valPrint ctrh "Found $LINK_COUNT links to process. I will be considering all of them."
     607   valPrint ctrh "$LINK_COUNT"
    574608fi
    575609
    576 # Print settings to console and log
    577 declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not report links that are in the exceptions list." "I will ignore URLs that simply have ending slashes added onto them." "I will ignore URLs that only upgrade from HTTP to HTTPS." "I will ignore youtu.be links that are merely being expanded." "I will not check the validity of Internet Archive snapshot URLs.")
    578 if [ $TAKE_PAGE_SHOT -eq 0 ]; then SETTINGS_MSG[10]="but will not"; fi
    579 if [ $RECORD_OK_LINKS -eq 0 ]; then SETTINGS_MSG[22]="not"; fi
    580 if [ $SUGGEST_SNAPSHOTS -eq 0 ]; then SETTINGS_MSG[26]="will not"; fi
    581 if [ -z $EXCEPT_URL ] || [ $RECORD_OK_LINKS -eq 1 ]; then SETTINGS_MSG[40]=""; fi
    582 if [ $SHOW_SLASH -eq 1 ]; then SETTINGS_MSG[41]=""; fi
    583 if [ $SHOW_HTTPS -eq 1 ]; then SETTINGS_MSG[42]=""; fi
    584 if [ $SHOW_YT_RD -eq 1 ]; then SETTINGS_MSG[43]=""; fi
    585 if [ $SKIP_ARCHIVE_LINKS -eq 0 ]; then SETTINGS_MSG[44]=""; fi
    586 SETTINGS_STR=${SETTINGS_MSG[@]}
    587 valPrint ctrh "$SETTINGS_STR"
     610valPrint ctrh "Site query timeout: $TIMEOUT seconds"
     611
     612valPrint ctrhn "Show OK links: "
     613if [ $RECORD_OK_LINKS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
     614
     615valPrint ctrhn "Take screenshots: "
     616if [ $TAKE_PAGE_SHOT -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
     617
     618valPrint ctrhn "Suggest Archive.org snapshots: "
     619if [ $SUGGEST_SNAPSHOTS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
     620
     621valPrint ctrhn "Ignore slash-adding redirects: "
     622if [ $SHOW_SLASH -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
     623
     624valPrint ctrhn "Ignore HTTPS-upgrading redirects: "
     625if [ $SHOW_HTTPS -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
     626
     627valPrint ctrhn "Ignore youtu.be redirects: "
     628if [ $SHOW_YT_RD -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
     629
     630valPrint ctrhn "Check archive.org links: "
     631if [ $SKIP_ARCHIVE_LINKS -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
     632
    588633valPrint tr "A summary of my findings will be found at the bottom of the report."
    589634valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
    590635valPrint trh ""
    591636
    592 # Print legend to logs
     637## LEGEND OUTPUT ##
    593638valPrint t "Legend:"
    594639valPrint r "\b1 Legend \b0"
    595640valPrint hn "<h3>Legend</h3>"
    596 valPrint trh "OK = URL seems to be working."
    597 valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it, because false negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to the script's author (see top of report). An NG link should be followed by a link to the Internet Archive's Wayback Machine which may help you repair the link, unless the Archive does not have any snapshots of the site. If the link cannot be repaired, you can delete it from the wiki page, or, if this would disrupt the surrounding material on the page, disable the link by wrapping the URL in nowiki tags."
    598 valPrint trh "RD = The server responding to this URL is saying that the page moved and you should instead use the supplied new URL. Some RD links represent minor adjustments in the organization of a web site, and some are soft 404s (the file/page has been removed and you are being redirected to something like the main page of the web site). You will have to look at the new URL yourself to determine if it represents an OK link and the link on the wiki should be updated to this one, or if the desired file/page is actually gone and we need to replace the wiki link with an Internet Archive snapshot link -- or disable the URL if it has not been archived."
    599 valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup."
    600 valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup."
    601 valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $HTTP_CODES)."
    602 valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$HTTP_CODES\"}}{\fldrslt here}} for code reference)."
    603 valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$HTTP_CODES\" target=\"_blank\">here</a> for code reference)."
    604 valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $CURL_CODES)."
    605 valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$CURL_CODES\"}}{\fldrslt here}} for code reference)."
    606 valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$CURL_CODES\" target=\"_blank\">here</a> for code reference)."
    607 valPrint trh "IA suggests = Last available snapshot suggested by the Internet Archive."
    608 valPrint trh "Try browsing = The Archive occasionally fails to return a snapshot URL even when one exists, so you will need to check for a snapshot manually using this link to the Wayback Machine before concluding that a site has not been archived."
     641valPrint t "(For guidance in fixing these links, see $WIKI_MAIN.)"
     642valPrint r "(For guidance in fixing these links, see {\field{\*\fldinst{HYPERLINK \"$WIKI_MAIN\"}}{\fldrslt here}}.)"
     643valPrint h "(For guidance in fixing these links, see <a href=\"$WIKI_MAIN\" target=\"_blank\">here</a>.)"
     644valPrint trh "OK = URL seems to be working"
     645valPrint trh "NG = URL no longer seems to work"
     646valPrint trh "RD = URL is redirecting to this new URL"
     647valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup"
     648valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup"
     649valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $WIKI_HTTP)"
     650valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$WIKI_HTTP\"}}{\fldrslt here}} for code reference)"
     651valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$WIKI_HTTP\" target=\"_blank\">here</a> for code reference)"
     652valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $WIKI_CURL)"
     653valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$WIKI_CURL\"}}{\fldrslt here}} for code reference)"
     654valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$WIKI_CURL\" target=\"_blank\">here</a> for code reference)"
     655valPrint trh "IA suggests = Last available snapshot returned by the Internet Archive"
     656valPrint trh "Try browsing = The Archive failed to return a snapshot URL, so check for a snapshot manually using this link"
    609657valPrint trh ""
    610658
     
    814862   # Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
    815863   # issue with sites that require HTTPS
    816    CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent '$AGENT' --max-time 10 --write-out '%{http_code}\n' $URL)
     864   CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent '$AGENT' --max-time $TIMEOUT --write-out '%{http_code}\n' $URL)
    817865   CURL_ERR=$(echo $?)
    818866   CURL_RESULT=$CURL_CODE
     
    864912         if [[ $CODE == $CURL_CODE ]]; then
    865913            # Get URL header again in order to retrieve the URL we are being redirected to
    866             NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent '$AGENT' --max-time 10 --write-out '%{redirect_url}\n' $URL)
     914            NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent '$AGENT' --max-time $TIMEOUT --write-out '%{redirect_url}\n' $URL)
    867915
    868916            # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
     
    10321080      # Query Internet Archive for latest "OK" snapshot for "NG" page
    10331081      if [ $STATUS == "NG" ] && [ $SUGGEST_SNAPSHOTS -eq 1 ]; then
    1034          ARCHIVE_QUERY=$(curl --silent --max-time 10 "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")
     1082         ARCHIVE_QUERY=$(curl --silent --max-time $TIMEOUT "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")
    10351083
    10361084         # If a "closest" snapshot was received...
Note: See TracChangeset for help on using the changeset viewer.