--- Validate External Links/validate_external_links.sh 2021/08/15 14:20:21 1160
+++ Validate External Links/validate_external_links.sh 2022/08/23 14:15:48 1175
@@ -29,8 +29,8 @@ IFS="
 
 ### GLOBALS ###
 # Settings -- these will be changed from their defaults by the arguments passed in to the script
-LINKS_URL="" # use 'curl' to download file with links from this location (can be file://)
-EXCEPT_URL="" # 'curl' will access this wiki page with a list of exceptions for NG results
+LINKS_URL="" # download external link CSV from this location (can use "file://" protocol)
+EXCEPT_URL="" # location of wiki page with a list of exceptions for NG results
 OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
 RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES
 SHOW_SLASH=0 # record issue when a slash is added to the end of a URL
@@ -47,7 +47,7 @@ URL_LIMIT=0 # if non-zero, st
 UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report
 
 # Fixed strings -- see the occurrences of these variables to learn their purpose
-AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/88.0.4324.146 Safari/537.36"
+AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.134 Safari/537.36"
 ARCHIVE_API="http://archive.org/wayback/available"
 ARCHIVE_GENERIC="https://web.archive.org/web/*"
 ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
@@ -68,7 +68,7 @@ declare -a NS_NAMES=("Media" "Special" "
 
 # These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
 # This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
-declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png psd py rar tga TRMA txt vbs wav wmv xaf xcf xml zip)
+declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png psd py rar tga TRMA txt vbs wav wmv xaf xcf xlsx xml zip)
 declare -a HTTP_TLDS_AND_PAGES=(abstract action ars asp aspx cfm cgi com css de do full htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x)
 
 # These arrays tells us which HTTP response codes are OK (good), which are RD (redirections), and which
@@ -562,6 +562,39 @@ function wrapupAndExit()
    if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
    if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi
 
+   # Perform exceptions audit
+   EXCEPTION_ISSUES=0
+   valPrint ctrh "Exceptions list audit:"
+   for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
+      EXCEPT_LINE="${EXCEPT_ARRAY[$i]}"
+      EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/\&amp;/\&/g') # copied from exception-matching code
+
+      if [ ${EXCEPT_FOUND[$i]} -eq 0 ]; then
+         EXCEPT_URL="${EXCEPT_LINE#*,}"
+         EXCEPT_URL="${EXCEPT_URL%,*}"
+         EXCEPT_PAGE="${EXCEPT_LINE##*,}"
+         EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
+         if [ "$EXCEPT_PAGE" == "*" ]; then
+            valPrint tr "- The link '$EXCEPT_URL' did not return an error or is not on any page."
+         else
+            valPrint tr "- The link '$EXCEPT_URL' did not return an error or is not on page '$EXCEPT_PAGE'."
+         fi
+         let EXCEPTION_ISSUES+=1
+      elif [ ${EXCEPT_USED[$i]} -eq 0 ]; then
+         EXCEPT_URL="${EXCEPT_LINE#*,}"
+         EXCEPT_URL="${EXCEPT_URL%,*}"
+         EXCEPT_CODE=${EXCEPT_LINE%%,*}
+         valPrint tr "- The link '$EXCEPT_URL' did not return error code $EXCEPT_CODE."
+         let EXCEPTION_ISSUES+=1
+      fi
+   done
+   if [ $EXCEPTION_ISSUES -eq 0 ]; then
+      valPrint ctrh "- No issues found."
+   else
+      valPrint c "$EXCEPTION_ISSUES exception list $(pluralCheckNoun issue $EXCEPTION_ISSUES) detected (see RTF or TXT report for details)."
+      valPrint h "$EXCEPTION_ISSUES exception list $(pluralCheckNoun issue $EXCEPTION_ISSUES) detected (see RTF or TXT report for details)."
+   fi
+
    # Print checked link totals
    if [ $LINK_PROBLEMS_NET -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS_NET link $(pluralCheckNoun issue $LINK_PROBLEMS_NET):"; fi
    if [ $LINK_PROBLEMS_NG -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_NG NG $(pluralCheckNoun link $LINK_PROBLEMS_NG)"; fi
@@ -629,6 +662,14 @@ if [ ! -z $EXCEPT_URL ]; then
 
    # Transfer to array for easy searching later
    declare -a EXCEPT_ARRAY=($(echo "$EXCEPT_DATA"))
+
+   # Create parallel arrays for marking which exceptions get used later
+   declare -a EXCEPT_USED=()
+   declare -a EXCEPT_FOUND=()
+   for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
+      EXCEPT_USED+=(0)
+      EXCEPT_FOUND+=(0)
+   done
 fi
 
 # Pipe 'cat' to 'wc' because passing LINKS_FILE to 'wc' would print the file name after the line count
@@ -688,9 +729,9 @@ valPrint trh ""
 valPrint t "Legend:"
 valPrint r "\b1 Legend \b0"
 valPrint hn "
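
The new exceptions-audit block splits each exception-list entry into its response code, URL, and page fields with bash parameter expansion. The standalone sketch below demonstrates those same expansions on a hypothetical entry of the form "code,URL,page"; the sample values are invented for illustration and are not taken from the script or any real exceptions list.

#!/usr/bin/env bash
# Hypothetical exceptions-list entry: response code, URL, then the wiki page using the link
EXCEPT_LINE="404,http://example.com/download?id=1,Some_Wiki_Page"

EXCEPT_CODE=${EXCEPT_LINE%%,*}   # drop everything from the first comma on  -> "404"
EXCEPT_URL="${EXCEPT_LINE#*,}"   # drop through the first comma             -> "http://example.com/download?id=1,Some_Wiki_Page"
EXCEPT_URL="${EXCEPT_URL%,*}"    # drop from the last comma on              -> "http://example.com/download?id=1"
EXCEPT_PAGE="${EXCEPT_LINE##*,}" # keep only what follows the last comma    -> "Some_Wiki_Page"
EXCEPT_PAGE="${EXCEPT_PAGE%% *}" # trim anything after a space, if present

echo "code=$EXCEPT_CODE url=$EXCEPT_URL page=$EXCEPT_PAGE"

Run directly, this prints "code=404 url=http://example.com/download?id=1 page=Some_Wiki_Page", which matches how the audit messages in the diff quote the URL and page name.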