ViewVC Help
View File | Revision Log | View Changeset | Root Listing
root/Oni2/Validate External Links/validate_external_links.sh
(Generate patch)

Comparing Validate External Links/validate_external_links.sh (file contents):
Revision 1175 by iritscen, Tue Aug 23 14:15:48 2022 UTC vs.
Revision 1177 by iritscen, Fri Jan 13 22:26:56 2023 UTC

# Line 1 | Line 1
1   #!/bin/bash
2  
3 < # Validate External Links by Iritscen
3 > # Validate External Links by Iritscen (iritscen@yahoo.com)
4   #
5   # Validates a list of external links in CSV format. The resulting logs are produced in three formats:
6   # - TXT (for easy diffing with an earlier log)
# Line 31 | Line 31 | IFS="
31   # Settings -- these will be changed from their defaults by the arguments passed in to the script
32   LINKS_URL=""           # download external link CSV from this location (can use "file://" protocol)
33   EXCEPT_URL=""          # location of wiki page with a list of exceptions for NG results
34 < OUTPUT_DIR=""          # place reports and all other output in a folder inside this existing folder
34 > OUTPUT_DIR=""           # place reports and all other output in a folder inside this existing folder
35   RECORD_OK_LINKS=0      # record response code to the log even when it's a value in OK_CODES
36   SHOW_SLASH=0           # record issue when a slash is added to the end of a URL
37   SHOW_HTTPS=0           # record issue when "http" is upgraded to "https"
# Line 97 | Line 97 | IW_LINKS=0
97   OK_LINKS=0
98   RD_LINKS=0
99   NG_LINKS=0
100 + SKIP_PARSE_FAIL=0
101 + SKIP_UNK_PROT=0
102   SKIP_UNK_NS=0
103   SKIP_JS_PAGE=0
104   SKIP_BAD_URL=0
# Line 505 | Line 507 | function wrapupAndExit()
507     # Do some math on results of session
508     LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
509     TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
510 <   LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
510 >   LINK_ERRORS=$((SKIP_PARSE_FAIL+SKIP_UNK_PROT+SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
511     LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_RD+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
512     LINK_PROBLEMS_TOTAL=$((NG_LINKS+RD_LINKS+EI_LINKS+IW_LINKS))
513     LINK_PROBLEMS_NG=$((NG_LINKS-SKIP_EXPECT_NG))
# Line 544 | Line 546 | function wrapupAndExit()
546        valPrint h "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
547        valPrint rt "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS):"
548     fi
549 +   if [ $SKIP_PARSE_FAIL -gt 0 ]; then valPrint ctrh "- $SKIP_PARSE_FAIL line-parsing $(pluralCheckNoun failure $SKIP_PARSE_FAIL)"; fi
550 +   if [ $SKIP_UNK_PROT -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_PROT unknown $(pluralCheckNoun protocol $SKIP_UNK_PROT)"; fi
551     if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
552     if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
553     if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
# Line 781 | Line 785 | for LINE in `cat "$LINKS_FILE"`; do
785        FINISHED_LIST="limit"
786        wrapupAndExit
787     fi
788 +  
789 +   # Parse line into namespace ID number, containing wiki page, and external link URL
790 +   NS_ID=${LINE%%,*}
791 +   PAGE_NAME=${LINE#$NS_ID,}
792 +   PAGE_NAME=${PAGE_NAME%%,*} # a comma in the page name will break this
793 +   URL=${LINE#$NS_ID,$PAGE_NAME,} # commas can be in this
794 +   if [ -z "$NS_ID" ] || [ -z "$PAGE_NAME" ] || [ -z "$URL" ]; then
795 +      valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace, wiki page or link URL could not be read."
796 +      let SKIP_PARSE_FAIL+=1
797 +      continue
798 +   fi
799 +  
800 +   # Skip any link that isn't "http://" or "https://"
801 +   if [[ ! $URL =~ ^http* ]]; then
802 +      valPrint trs "Skipping line $LINK_NUM ('$LINE') because the protocol isn't 'http://' or 'https://'."
803 +      let SKIP_UNK_PROT+=1
804 +      continue
805 +   fi
806  
807     # Print progress to screen
808     if [ $LINK_NUM -gt 1 ]; then
# Line 788 | Line 810 | for LINE in `cat "$LINKS_FILE"`; do
810     fi
811     valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..."
812  
791   # The number of the namespace is the element before the first comma on the line
792   NS_ID=${LINE%%,*}
793
813     # Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
814     NS_NAME=""
815     a=0
# Line 814 | Line 833 | for LINE in `cat "$LINKS_FILE"`; do
833        continue
834     fi
835  
817   # The name of the page is everything between the namespace ID and the next comma on the line (commas
818   # in page names will break this)
819   PAGE_NAME=${LINE#$NS_ID,}
820   PAGE_NAME=${PAGE_NAME%%,*}
821
836     # Build longer wiki page URLs from namespace and page names
837     FULL_PAGE_PATH=https://$WIKI_PATH/$NS_NAME:$PAGE_NAME
838     LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
# Line 839 | Line 853 | for LINE in `cat "$LINKS_FILE"`; do
853        continue
854     fi
855  
842   # The URL being linked to is everything after the previous two fields (this allows commas to be in
843   # the URLs, but a comma in the previous field, the page name, will break this)
844   URL=${LINE#$NS_ID,$PAGE_NAME,}
845
856     # Scan for illegal characters
857     if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
858        valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because it contains characters illegal in a URL."

Diff Legend

- Removed lines
+ Added lines
< Changed lines (old)
> Changed lines (new)