--- Validate External Links/validate_external_links.sh 2020/03/18 17:08:59 1120
+++ Validate External Links/validate_external_links.sh 2020/03/20 22:13:48 1122
@@ -18,6 +18,8 @@ LINKS_URL="" # use 'curl' to down
EXCEPT_URL="" # ditto above for file with exceptions to NG results
OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES
+SHOW_SLASH=0 # record response code to the log when a slash is added to the end of a URL
+SHOW_HTTPS=0 # record response code to the log when "http" is upgraded to "https"
SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
TAKE_PAGE_SHOT=0 # take a screenshot of each OK page
CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature
@@ -34,7 +36,7 @@ CHROME_SCREENSHOT="screenshot.png"
CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt"
EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt"
-MY_WIKI_PAGE="http://wiki.oni2.net/User:Iritscen"
+MY_WIKI_PAGE="https://wiki.oni2.net/User:Iritscen"
THIS_DIR=$(cd $(dirname $0); pwd)
WORKING_DIR=$(pwd)
WIKI_PATH="wiki.oni2.net"
@@ -83,6 +85,8 @@ SKIP_UNK_CODE=0
SKIP_EXPECT_NG=0
SKIP_EXPECT_EI=0
SKIP_EXPECT_IW=0
+SKIP_HTTPS_UP=0
+SKIP_SLASH_ADD=0
FILE_LINKS=0
PAGE_LINKS=0
SKIPPED_HEADER_ROW=0
@@ -139,6 +143,10 @@ OPTIONS
you supply a file:// path.
--record-ok-links Log a link in the report even if its response
code is "OK".
+ --show-added-slashes Report on redirects that simply add a '/' to the
+ end of the URL.
+ --show-https-upgrade Report on redirects that simply upgrade an
+ "http://" URL to an "https://" URL.
--suggest-snapshots Query the Internet Archive for a possible
snapshot URL for each "NG" page.
--take-screenshots FILE Call the Google Chrome binary at this path to
@@ -167,16 +175,18 @@ fi
# Parse arguments as long as there are more arguments to process
while (( "$#" )); do
case "$1" in
- --links ) LINKS_URL="$2"; shift 2;;
- --exceptions ) EXCEPT_URL="$2"; shift 2;;
- --output ) OUTPUT_DIR="$2"; shift 2;;
- --record-ok-links ) RECORD_OK_LINKS=1; shift;;
- --suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;;
- --take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
- --start-url ) URL_START=$2; shift 2;;
- --end-url ) URL_LIMIT=$2; shift 2;;
- --upload ) UPLOAD_INFO=$2; shift 2;;
- * ) echo "Invalid argument $1 detected. Aborting."; exit 1;;
+ --links ) LINKS_URL="$2"; shift 2;;
+ --exceptions ) EXCEPT_URL="$2"; shift 2;;
+ --output ) OUTPUT_DIR="$2"; shift 2;;
+ --record-ok-links ) RECORD_OK_LINKS=1; shift;;
+ --show-added-slashes ) SHOW_SLASH=1; shift;;
+ --show-https-upgrade ) SHOW_HTTPS=1; shift;;
+ --suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;;
+ --take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
+ --start-url ) URL_START=$2; shift 2;;
+ --end-url ) URL_LIMIT=$2; shift 2;;
+ --upload ) UPLOAD_INFO=$2; shift 2;;
+ * ) echo "Invalid argument $1 detected. Aborting."; exit 1;;
esac
done
@@ -433,32 +443,51 @@ function wrapupAndExit()
END_RUN=$(date +%s)
ELAPSED=$(echo $(($END_RUN - $START_RUN)) | awk '{printf "%d min. %d sec. elapsed", int($1/60), int($1%60)}')
- # Output results of session and close the log file's markup
+ # Do some math on the results of the session
LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
- LINKS_SKIPPED=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
- LINKS_CHECKED=$((LINKS_PROCESSED-LINKS_SKIPPED))
+ LINK_PROBLEMS=$((EI_LINKS+IW_LINKS+RD_LINKS+NG_LINKS))
+ LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
+ LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
+ TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP))
+ LINKS_CHECKED=$((LINKS_PROCESSED-LINK_ERRORS))
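+ # (Hypothetical example of this accounting: if 100 links were processed and 5 of them
+ # could not be checked, LINKS_CHECKED is 95, which should equal OK_LINKS plus
+ # LINK_PROBLEMS, with trivial redirects counted among the OK links)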
+
+ # Print summary header
valPrint ct "Summary ($ELAPSED):"
valPrint r "\b1 Summary \b0 ($ELAPSED)"
valPrint hn "
Summary ($ELAPSED)
"
- valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT)."
- valPrint ctrh "I skipped $LINKS_SKIPPED $(pluralCheckNoun link $LINKS_SKIPPED), and found $FILE_LINKS $(pluralCheckNoun file $FILE_LINKS) and $PAGE_LINKS $(pluralCheckNoun page $PAGE_LINKS)."
- if [ $LINKS_SKIPPED -gt 0 ]; then valPrint ctrh "Skip breakdown: "; fi
- if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
+ valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT) (there were $FILE_LINKS file $(pluralCheckNoun link $FILE_LINKS) and $PAGE_LINKS page $(pluralCheckNoun link $PAGE_LINKS))."
+
+ # Print processed link totals
+ if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
+ if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
+ if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS processed $(pluralCheckNoun link $LINK_PROBLEMS) had issues"; fi
+ if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh " (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
+ if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) were OK"; fi
+ if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctrh " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi
+
+ # Print excepted link totals
+ if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi
+ if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
+ if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
+ if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi
+
+ # Print errored link totals
+ if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"; fi
+ if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
- valPrint ctrh "Out of the $LINKS_CHECKED links checked, $EI_LINKS could be $(pluralCheckAn $EI_LINKS)intrawiki $(pluralCheckNoun link $EI_LINKS), $IW_LINKS could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS), $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG."
- if [ $SKIP_EXPECT_NG -gt 0 ]; then
- valPrint ctrh "$SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS) went unlisted due to being found in the exceptions file."
- fi
- if [ $SKIP_EXPECT_EI -gt 0 ]; then
- valPrint ctrh "$SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS) went unlisted due to being found in the exceptions file."
- fi
- if [ $SKIP_EXPECT_IW -gt 0 ]; then
- valPrint ctrh "$SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS) went unlisted due to being found in the exceptions file."
- fi
+
+ # Print problem link totals
+ if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS link $(pluralCheckNoun issue $LINK_PROBLEMS):"; fi
+ if [ $NG_LINKS -gt 0 ]; then valPrint ctrh "- $NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
+ if [ $RD_LINKS -gt 0 ]; then valPrint ctrh "- $RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
+ if [ $EI_LINKS -gt 0 ]; then valPrint ctrh "- $EI_LINKS $(pluralCheckNoun link $EI_LINKS) that could be intrawiki"; fi
+ if [ $IW_LINKS -gt 0 ]; then valPrint ctrh "- $IW_LINKS $(pluralCheckNoun link $IW_LINKS) that could be interwiki"; fi
+
+ # Close the log files' markup
valPrint trh "ValExtLinks says goodbye."
printRTFfooter
printHTMfooter
@@ -634,12 +663,12 @@ for LINE in `cat "$LINKS_FILE"`; do
fi
# Build longer wiki page URLs from namespace and page names
- FULL_PAGE_PATH=http://$WIKI_PATH/$NS_NAME:$PAGE_NAME
+ FULL_PAGE_PATH=https://$WIKI_PATH/$NS_NAME:$PAGE_NAME
LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
# Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it
# explicitly breaks the link
if [ $NS_ID -eq 0 ]; then
- FULL_PAGE_PATH=http://$WIKI_PATH/$PAGE_NAME
+ FULL_PAGE_PATH=https://$WIKI_PATH/$PAGE_NAME
LOCAL_PAGE_PATH=$PAGE_NAME
fi
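+ # (For example, namespace "User" plus page name "Iritscen" yields
+ # https://wiki.oni2.net/User:Iritscen, whereas a page in the implicit "Main"
+ # namespace is addressed as https://wiki.oni2.net/PAGE_NAME)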
@@ -794,24 +823,35 @@ for LINE in `cat "$LINKS_FILE"`; do
# Get URL header again in order to retrieve the URL we are being redirected to
NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent '"$AGENT"' --max-time 10 --write-out '%{redirect_url}\n' $URL)
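+ # ('%{redirect_url}' is curl's write-out variable for the URL that a redirect response
+ # points to; it comes back empty on a time-out or a non-redirect response, and e.g. a
+ # hypothetical http://example.com/old might yield "https://example.com/old/")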
- # Filter out cases where the redirect URL is just the original URL with https:// instead of
- # http://, or with an added '/' at the end. These corrections happen a lot and are not
- # important to us.
- URL_NO_PROTOCOL=${URL#http://}
- URL_NO_PROTOCOL=${URL_NO_PROTOCOL%/}
- NEW_URL_NO_PROTOCOL=${NEW_URL#https://}
- NEW_URL_NO_PROTOCOL=${NEW_URL_NO_PROTOCOL%/}
+ # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
+ # those changes out if the user didn't ask for them
+ URL_HTTP=$(echo "$URL" | sed -E 's/^https:/http:/')
+ NEW_URL_HTTP=$(echo "$NEW_URL" | sed -E 's/^https:/http:/')
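+ # (e.g. a hypothetical "https://example.com/page" becomes "http://example.com/page",
+ # so a redirect that only upgraded the protocol will compare as equal below)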
# Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
- NEW_URL_LENGTH=$(echo | awk -v input=$NEW_URL_NO_PROTOCOL '{print length(input)}')
+ NEW_URL_LENGTH=$(echo | awk -v input="$NEW_URL_HTTP" '{print length(input)}')
if [ $NEW_URL_LENGTH -lt $MIN_URL_LENGTH ]; then
- NEW_URL_NO_PROTOCOL="[new URL not retrieved]"
+ NEW_URL_HTTP="[new URL not retrieved]"
fi
- # If the URLs match after the above filters were applied, then the link is OK
- if [ $URL_NO_PROTOCOL == $NEW_URL_NO_PROTOCOL ]; then
+ # Remove the slash at the end of the new URL, if present, so we can filter out redirects
+ # that merely add a trailing slash when the user hasn't asked to see them
+ NEW_URL_NO_SLASH=$(echo "$NEW_URL_HTTP" | sed -E 's:/$::')
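+ # (e.g. a hypothetical "http://example.com/page/" becomes "http://example.com/page",
+ # matching the original URL when the redirect only appended a slash)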
+
+ # If the URLs match aside from HTTP being upgraded to HTTPS, then the link is OK (unless
+ # the user wants those redirects to be reported)
+ if [ $SHOW_HTTPS -eq 0 ] && [ "$URL_HTTP" == "$NEW_URL_HTTP" ]; then
+ valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because we have not been asked to show http->https upgrades, and we were redirected to $NEW_URL."
+ STATUS="OK"
+ let OK_LINKS+=1
+ let SKIP_HTTPS_UP+=1
+ # If the URLs match aside from an added trailing slash, then the link is OK (unless the
+ # user wants those redirects to be reported)
+ elif [ $SHOW_SLASH -eq 0 ] && [ "$URL_HTTP" == "$NEW_URL_NO_SLASH" ]; then
+ valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because we have not been asked to show added trailing slashes, and we were redirected to $NEW_URL."
STATUS="OK"
let OK_LINKS+=1
+ let SKIP_SLASH_ADD+=1
else
STATUS="RD"
let RD_LINKS+=1