Changeset 1177


Ignore:
Timestamp:
Jan 13, 2023, 11:26:56 PM (23 months ago)
Author:
iritscen
Message:

ValExtLinks now skips URLs that don't use the HTTP(S) protocol. Added some error-checking to line parsing. Added my email address.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • Validate External Links/validate_external_links.sh

    r1175 r1177  
    11#!/bin/bash
    22
    3 # Validate External Links by Iritscen
     3# Validate External Links by Iritscen (iritscen@yahoo.com)
    44#
    55# Validates a list of external links in CSV format. The resulting logs are produced in three formats:
     
    3232LINKS_URL=""           # download external link CSV from this location (can use "file://" protocol)
    3333EXCEPT_URL=""          # location of wiki page with a list of exceptions for NG results
    34 OUTPUT_DIR=""          # place reports and all other output in a folder inside this existing folder
     34OUTPUT_DIR=""           # place reports and all other output in a folder inside this existing folder
    3535RECORD_OK_LINKS=0      # record response code to the log even when it's a value in OK_CODES
    3636SHOW_SLASH=0           # record issue when a slash is added to the end of a URL
     
    9898RD_LINKS=0
    9999NG_LINKS=0
     100SKIP_PARSE_FAIL=0
     101SKIP_UNK_PROT=0
    100102SKIP_UNK_NS=0
    101103SKIP_JS_PAGE=0
     
    506508   LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
    507509   TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
    508    LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
     510   LINK_ERRORS=$((SKIP_PARSE_FAIL+SKIP_UNK_PROT+SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
    509511   LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_RD+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
    510512   LINK_PROBLEMS_TOTAL=$((NG_LINKS+RD_LINKS+EI_LINKS+IW_LINKS))
     
    545547      valPrint rt "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS):"
    546548   fi
     549   if [ $SKIP_PARSE_FAIL -gt 0 ]; then valPrint ctrh "- $SKIP_PARSE_FAIL line-parsing $(pluralCheckNoun failure $SKIP_PARSE_FAIL)"; fi
     550   if [ $SKIP_UNK_PROT -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_PROT unknown $(pluralCheckNoun protocol $SKIP_UNK_PROT)"; fi
    547551   if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
    548552   if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
     
    782786      wrapupAndExit
    783787   fi
     788   
     789   # Parse line into namespace ID number, containing wiki page, and external link URL
     790   NS_ID=${LINE%%,*}
     791   PAGE_NAME=${LINE#$NS_ID,}
     792   PAGE_NAME=${PAGE_NAME%%,*} # a comma in the page name will break this
     793   URL=${LINE#$NS_ID,$PAGE_NAME,} # commas can be in this
     794   if [ -z "$NS_ID" ] || [ -z "$PAGE_NAME" ] || [ -z "$URL" ]; then
     795      valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace, wiki page or link URL could not be read."
     796      let SKIP_PARSE_FAIL+=1
     797      continue
     798   fi
     799   
     800   # Skip any link that isn't "http://" or "https://"
     801   if [[ ! $URL =~ ^http* ]]; then
     802      valPrint trs "Skipping line $LINK_NUM ('$LINE') because the protocol isn't 'http://' or 'https://'."
     803      let SKIP_UNK_PROT+=1
     804      continue
     805   fi
    784806
    785807   # Print progress to screen
     
    788810   fi
    789811   valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
    790 
    791    # The number of the namespace is the element before the first comma on the line
    792    NS_ID=${LINE%%,*}
    793812
    794813   # Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
     
    815834   fi
    816835
    817    # The name of the page is everything between the namespace ID and the next comma on the line (commas
    818    # in page names will break this)
    819    PAGE_NAME=${LINE#$NS_ID,}
    820    PAGE_NAME=${PAGE_NAME%%,*}
    821 
    822836   # Build longer wiki page URLs from namespace and page names
    823837   FULL_PAGE_PATH=https://$WIKI_PATH/$NS_NAME:$PAGE_NAME
     
    839853      continue
    840854   fi
    841 
    842    # The URL being linked to is everything after the previous two fields (this allows commas to be in
    843    # the URLs, but a comma in the previous field, the page name, will break this)
    844    URL=${LINE#$NS_ID,$PAGE_NAME,}
    845855
    846856   # Scan for illegal characters
Note: See TracChangeset for help on using the changeset viewer.