1 |
|
#!/bin/bash |
2 |
|
|
3 |
< |
# Validate External Links by Iritscen |
3 |
> |
# Validate External Links by Iritscen (iritscen@yahoo.com) |
4 |
|
# |
5 |
|
# Validates a list of external links in CSV format. The resulting logs are produced in three formats: |
6 |
|
# - TXT (for easy diffing with an earlier log) |
31 |
|
# Settings -- these will be changed from their defaults by the arguments passed in to the script |
32 |
|
LINKS_URL="" # download external link CSV from this location (can use "file://" protocol) |
33 |
|
EXCEPT_URL="" # location of wiki page with a list of exceptions for NG results |
34 |
< |
OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder |
34 |
> |
OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder |
35 |
|
RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES |
36 |
|
SHOW_SLASH=0 # record issue when a slash is added to the end of a URL |
37 |
|
SHOW_HTTPS=0 # record issue when "http" is upgraded to "https" |
97 |
|
OK_LINKS=0 |
98 |
|
RD_LINKS=0 |
99 |
|
NG_LINKS=0 |
100 |
+ |
SKIP_PARSE_FAIL=0 |
101 |
+ |
SKIP_UNK_PROT=0 |
102 |
|
SKIP_UNK_NS=0 |
103 |
|
SKIP_JS_PAGE=0 |
104 |
|
SKIP_BAD_URL=0 |
507 |
|
# Do some math on results of session |
508 |
|
LINKS_PROCESSED=$((LINK_NUM-URL_START+1)) |
509 |
|
TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE)) |
510 |
< |
LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE)) |
510 |
> |
LINK_ERRORS=$((SKIP_PARSE_FAIL+SKIP_UNK_PROT+SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE)) |
511 |
|
LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_RD+SKIP_EXPECT_EI+SKIP_EXPECT_IW)) |
512 |
|
LINK_PROBLEMS_TOTAL=$((NG_LINKS+RD_LINKS+EI_LINKS+IW_LINKS)) |
513 |
|
LINK_PROBLEMS_NG=$((NG_LINKS-SKIP_EXPECT_NG)) |
546 |
|
valPrint h "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):" |
547 |
|
valPrint rt "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS):" |
548 |
|
fi |
549 |
+ |
if [ $SKIP_PARSE_FAIL -gt 0 ]; then valPrint ctrh "- $SKIP_PARSE_FAIL line-parsing $(pluralCheckNoun failure $SKIP_PARSE_FAIL)"; fi |
550 |
+ |
if [ $SKIP_UNK_PROT -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_PROT unknown $(pluralCheckNoun protocol $SKIP_UNK_PROT)"; fi |
551 |
|
if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi |
552 |
|
if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi |
553 |
|
if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi |
785 |
|
FINISHED_LIST="limit" |
786 |
|
wrapupAndExit |
787 |
|
fi |
788 |
+ |
|
789 |
+ |
# Parse line into namespace ID number, containing wiki page, and external link URL; a line with fewer than two commas cannot hold all three fields and is treated as a parse failure |
790 |
+ |
NS_ID=${LINE%%,*} |
791 |
+ |
PAGE_NAME=${LINE#"$NS_ID",} # prefix is quoted so it is matched literally, not as a glob pattern |
792 |
+ |
PAGE_NAME=${PAGE_NAME%%,*} # a comma in the page name will break this |
793 |
+ |
URL=${LINE#"$NS_ID,$PAGE_NAME,"} # commas can be in this; prefix is quoted so glob characters in the page name are literal |
794 |
+ |
if [[ $LINE != *,*,* ]] || [ -z "$NS_ID" ] || [ -z "$PAGE_NAME" ] || [ -z "$URL" ]; then |
795 |
+ |
valPrint trs "Skipping line $LINK_NUM ('$LINE') because the namespace, wiki page or link URL could not be read." |
796 |
+ |
let SKIP_PARSE_FAIL+=1 |
797 |
+ |
continue |
798 |
+ |
fi |
799 |
+ |
|
800 |
+ |
# Skip any link that isn't "http://" or "https://" (the regex is anchored and requires the "://", so strings such as "htt", "httpfoo" or "http-x" are rejected) |
801 |
+ |
if [[ ! $URL =~ ^https?:// ]]; then |
802 |
+ |
valPrint trs "Skipping line $LINK_NUM ('$LINE') because the protocol isn't 'http://' or 'https://'." |
803 |
+ |
let SKIP_UNK_PROT+=1 |
804 |
+ |
continue |
805 |
+ |
fi |
806 |
|
|
807 |
|
# Print progress to screen |
808 |
|
if [ $LINK_NUM -gt 1 ]; then |
810 |
|
fi |
811 |
|
valPrint cn "Evaluating URL $LINK_NUM/$LINK_COUNT..." |
812 |
|
|
791 |
– |
# The number of the namespace is the element before the first comma on the line |
792 |
– |
NS_ID=${LINE%%,*} |
793 |
– |
|
813 |
|
# Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES |
814 |
|
NS_NAME="" |
815 |
|
a=0 |
833 |
|
continue |
834 |
|
fi |
835 |
|
|
817 |
– |
# The name of the page is everything between the namespace ID and the next comma on the line (commas |
818 |
– |
# in page names will break this) |
819 |
– |
PAGE_NAME=${LINE#$NS_ID,} |
820 |
– |
PAGE_NAME=${PAGE_NAME%%,*} |
821 |
– |
|
836 |
|
# Build longer wiki page URLs from namespace and page names |
837 |
|
FULL_PAGE_PATH=https://$WIKI_PATH/$NS_NAME:$PAGE_NAME |
838 |
|
LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME |
853 |
|
continue |
854 |
|
fi |
855 |
|
|
842 |
– |
# The URL being linked to is everything after the previous two fields (this allows commas to be in |
843 |
– |
# the URLs, but a comma in the previous field, the page name, will break this) |
844 |
– |
URL=${LINE#$NS_ID,$PAGE_NAME,} |
845 |
– |
|
856 |
|
# Scan for illegal characters |
857 |
|
if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then |
858 |
|
valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because it contains characters illegal in a URL." |