--- Validate External Links/validate_external_links.sh 2021/08/15 14:20:21 1160
+++ Validate External Links/validate_external_links.sh 2022/08/23 14:15:48 1175
@@ -29,8 +29,8 @@ IFS="
### GLOBALS ###
# Settings -- these will be changed from their defaults by the arguments passed in to the script
-LINKS_URL="" # use 'curl' to download file with links from this location (can be file://)
-EXCEPT_URL="" # 'curl' will access this wiki page with a list of exceptions for NG results
+LINKS_URL="" # download external link CSV from this location (can use "file://" protocol)
+EXCEPT_URL="" # location of wiki page with a list of exceptions for NG results
OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES
SHOW_SLASH=0 # record issue when a slash is added to the end of a URL
@@ -47,7 +47,7 @@ URL_LIMIT=0 # if non-zero, st
UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report
# Fixed strings -- see the occurrences of these variables to learn their purpose
-AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/88.0.4324.146 Safari/537.36"
+AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.134 Safari/537.36"
ARCHIVE_API="http://archive.org/wayback/available"
ARCHIVE_GENERIC="https://web.archive.org/web/*"
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
@@ -68,7 +68,7 @@ declare -a NS_NAMES=("Media" "Special" "
# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
-declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png psd py rar tga TRMA txt vbs wav wmv xaf xcf xml zip)
+declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png psd py rar tga TRMA txt vbs wav wmv xaf xcf xlsx xml zip)
declare -a HTTP_TLDS_AND_PAGES=(abstract action ars asp aspx cfm cgi com css de do full htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x)
# These arrays tells us which HTTP response codes are OK (good), which are RD (redirections), and which
@@ -562,6 +562,39 @@ function wrapupAndExit()
if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi
+ # Perform exceptions audit
+ EXCEPTION_ISSUES=0
+ valPrint ctrh "Exceptions list audit:"
+ for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
+ EXCEPT_LINE="${EXCEPT_ARRAY[$i]}"
+ EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/\&amp;/\&/g') # copied from exception-matching code
+
+ if [ ${EXCEPT_FOUND[$i]} -eq 0 ]; then
+ EXCEPT_URL="${EXCEPT_LINE#*,}"
+ EXCEPT_URL="${EXCEPT_URL%,*}"
+ EXCEPT_PAGE="${EXCEPT_LINE##*,}"
+ EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
+ if [ "$EXCEPT_PAGE" == "*" ]; then
+ valPrint tr "- The link '$EXCEPT_URL' did not return an error or is not on any page."
+ else
+ valPrint tr "- The link '$EXCEPT_URL' did not return an error or is not on page '$EXCEPT_PAGE'."
+ fi
+ let EXCEPTION_ISSUES+=1
+ elif [ ${EXCEPT_USED[$i]} -eq 0 ]; then
+ EXCEPT_URL="${EXCEPT_LINE#*,}"
+ EXCEPT_URL="${EXCEPT_URL%,*}"
+ EXCEPT_CODE=${EXCEPT_LINE%%,*}
+ valPrint tr "- The link '$EXCEPT_URL' did not return error code $EXCEPT_CODE."
+ let EXCEPTION_ISSUES+=1
+ fi
+ done
+ if [ $EXCEPTION_ISSUES -eq 0 ]; then
+ valPrint ctrh "- No issues found."
+ else
+ valPrint c "$EXCEPTION_ISSUES exception list $(pluralCheckNoun issue $EXCEPTION_ISSUES) detected (see RTF or TXT report for details)."
+ valPrint h "$EXCEPTION_ISSUES exception list $(pluralCheckNoun issue $EXCEPTION_ISSUES) detected (see RTF or TXT report for details)."
+ fi
+
# Print checked link totals
if [ $LINK_PROBLEMS_NET -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS_NET link $(pluralCheckNoun issue $LINK_PROBLEMS_NET):"; fi
if [ $LINK_PROBLEMS_NG -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_NG NG $(pluralCheckNoun link $LINK_PROBLEMS_NG)"; fi
@@ -629,6 +662,14 @@ if [ ! -z $EXCEPT_URL ]; then
# Transfer to array for easy searching later
declare -a EXCEPT_ARRAY=($(echo "$EXCEPT_DATA"))
+
+ # Create parallel arrays for marking which exceptions get used later
+ declare -a EXCEPT_USED=()
+ declare -a EXCEPT_FOUND=()
+ for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
+ EXCEPT_USED+=(0)
+ EXCEPT_FOUND+=(0)
+ done
fi
# Pipe 'cat' to 'wc' because passing LINKS_FILE to 'wc' would print the file name after the line count
@@ -688,9 +729,9 @@ valPrint trh ""
valPrint t "Legend:"
valPrint r "\b1 Legend \b0"
valPrint hn "

Legend

" -valPrint t "(For guidance in fixing these links, see $WIKI_MAIN.)" -valPrint r "(For guidance in fixing these links, see {\field{\*\fldinst{HYPERLINK \"$WIKI_MAIN\"}}{\fldrslt here}}.)" -valPrint h "(For guidance in fixing these links, see here.)" +valPrint t "(For guidance in fixing these links, see $WIKI_MAIN. The exceptions list is at $EXCEPT_URL.)" +valPrint r "(For guidance in fixing these links, see {\field{\*\fldinst{HYPERLINK \"$WIKI_MAIN\"}}{\fldrslt here}}. The exceptions list is {\field{\*\fldinst{HYPERLINK \"$EXCEPT_URL\"}}{\fldrslt here}}.)" +valPrint h "(For guidance in fixing these links, see here. The exceptions list is here.)" valPrint trh "OK = URL seems to be working" valPrint trh "NG = URL no longer seems to work" valPrint trh "RD = URL is redirecting to this new URL" @@ -828,7 +869,7 @@ for LINE in `cat "$LINKS_FILE"`; do # If the URL ends in something like "#section_15", strip everything from the '#' onward CLEAN_URL=${CLEAN_URL%%\#*} - # 'sed' cannot handle Unicode in my Bash shell, so skip non-ASCII URL and make user check it + # 'sed' cannot handle Unicode in my Bash shell, so skip non-ASCII URL and make reader check it if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I cannot handle non-ASCII characters." let SKIP_NON_ASCII+=1 @@ -901,7 +942,7 @@ for LINE in `cat "$LINKS_FILE"`; do shopt -u nocasematch fi - # If this suffix escaped identification as either a file, page or TLD, inform the user + # If this suffix escaped identification as either a file, page or TLD, inform the reader STR_TYPE="" if [ $IS_FILE -eq -1 ]; then valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I encountered the unknown URL suffix '$POST_DOT'. Please add this suffix to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES." @@ -1064,7 +1105,7 @@ for LINE in `cat "$LINKS_FILE"`; do # Check problem links against exceptions list before proceeding FOUND_EXCEPT=0 - if [ $STATUS != "OK" ] && [ ! -z $EXCEPT_URL ]; then + if [ $STATUS != "OK" ] && [ ! -z "$EXCEPT_URL" ]; then # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW" EXPECT_CODE="$CURL_RESULT" if [ $STATUS == "EI" ]; then @@ -1082,21 +1123,27 @@ for LINE in `cat "$LINKS_FILE"`; do # other HTML-encoded characters are not found in URLs EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/\&/\&/g') - # Match URL + # Check for URL match EXCEPT_URL="${EXCEPT_LINE#*,}" EXCEPT_URL="${EXCEPT_URL%,*}" if [ "$EXCEPT_URL" != "$URL" ]; then continue fi - # Match containing page's name + # Check for page name match EXCEPT_PAGE="${EXCEPT_LINE##*,}" EXCEPT_PAGE="${EXCEPT_PAGE%% *}" - if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == $LOCAL_PAGE_PATH ]; then - # Match result code + if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == "$LOCAL_PAGE_PATH" ]; then + let EXCEPT_FOUND[$i]+=1 + valPrint trs "Found exception '$URL' on page '$LOCAL_PAGE_PATH'." + + # Check for result code match EXCEPT_CODE=${EXCEPT_LINE%%,*} if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then + FOUND_EXCEPT=1 + let EXCEPT_USED[$i]+=1 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because its expected result, '$EXPECT_CODE', is in the exceptions list." 
+
if [ $STATUS == "EI" ]; then
let SKIP_EXPECT_EI+=1
elif [ $STATUS == "IW" ]; then
@@ -1106,7 +1153,7 @@ for LINE in `cat "$LINKS_FILE"`; do
else
let SKIP_EXPECT_NG+=1
fi
- FOUND_EXCEPT=1
+ break
fi
fi
@@ -1180,12 +1227,12 @@ for LINE in `cat "$LINKS_FILE"`; do
# Issue query to the API
ARCHIVE_QUERY=$(curl --silent --max-time $TIMEOUT "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")
- # Notify user if we hit the rate limit and just keep going
+ # Notify reader if we hit the rate limit and just keep going
if [[ "$ARCHIVE_QUERY" == "*Too Many Requests*" ]]; then
valPrint t " IA has rate-limited us!"
valPrint r " IA has rate-limited us!"
valPrint hs "IA suggests(hit the API rate limit!)"
- # If a "closest" snapshot was received, inform user
+ # If a "closest" snapshot was received, inform reader
elif [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
# In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')
@@ -1198,7 +1245,7 @@ for LINE in `cat "$LINKS_FILE"`; do
# Remove the port 80 part that IA often adds to the URL, as it's superfluous
SNAPSHOT_URL=$(echo $SNAPSHOT_URL | sed 's/:80//')
- # Inform the user of the snapshot URL
+ # Inform the reader of the snapshot URL
valPrint ts " IA suggests $SNAPSHOT_URL"
valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
valPrint hs "IA suggests$SNAPSHOT_URL"
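
A note on the exceptions-list format implied by the parameter expansions in the new audit and matching code: each line on the exceptions page has the shape "code,URL,page", where the code is the expected 'curl' result (or "EI"/"IW") and the page name may be "*" to match any page. The minimal Bash sketch below shows how those expansions split such a line; the sample line is hypothetical, not taken from the real exceptions page, and the splitting assumes the URL itself contains no commas.

EXCEPT_LINE="404,http://example.com/old-page.html,Some_Wiki_Page"
EXCEPT_CODE=${EXCEPT_LINE%%,*}   # text before the first ','        -> "404"
EXCEPT_URL="${EXCEPT_LINE#*,}"   # drop the code and its ','         -> "http://example.com/old-page.html,Some_Wiki_Page"
EXCEPT_URL="${EXCEPT_URL%,*}"    # drop the page after the last ','  -> "http://example.com/old-page.html"
EXCEPT_PAGE="${EXCEPT_LINE##*,}" # text after the last ','           -> "Some_Wiki_Page"
EXCEPT_PAGE="${EXCEPT_PAGE%% *}" # trim anything after the first space, if present
echo "code: $EXCEPT_CODE | URL: $EXCEPT_URL | page: $EXCEPT_PAGE"

The new EXCEPT_FOUND and EXCEPT_USED arrays are indexed by the same loop counter as EXCEPT_ARRAY, so for each exception line they record whether its URL and page were ever matched against a problem link (FOUND) and whether its expected code also matched (USED); the wrap-up audit then reports any exception for which either flag is still zero.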