--- Validate External Links/validate_external_links.sh 2020/09/04 03:07:08 1142
+++ Validate External Links/validate_external_links.sh 2020/09/06 20:51:22 1144
@@ -5,7 +5,7 @@
# Validates a list of external links in CSV format. The resulting logs are produced in three formats:
# - TXT (for easy diffing with an earlier log)
# - RTF (for reading as a local file with clickable links)
-# - HTML (for uploading as a web page).
+# - HTML (for reading as a web page)
# Call script with "--help" argument for documentation. Also see Read Me.rtf for critical notes.
#
# Recommended rule:
@@ -29,21 +29,21 @@ IFS="
### GLOBALS ###
# Settings -- these will be changed from their defaults by the arguments passed in to the script
-LINKS_URL="" # use 'curl' to download file with links from this location (can be file://)
-EXCEPT_URL="" # 'curl' will access this wiki page with a list of exceptions for NG results
-OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
-RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES
-SHOW_SLASH=0 # record response code to the log when a slash is added to the end of a URL
-SHOW_HTTPS=0 # record response code to the log when "http" is upgraded to "https"
-SHOW_YT_RD=0 # record response code to the log when a youtu.be URL is expanded to the full URL
-SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
-SKIP_ARCHIVE_LINKS=0 # don't check URLs under the archive.org domain
-TAKE_PAGE_SHOT=0 # take a screenshot of each OK page
-TIMEOUT=10 # time to wait for a response when querying a site
-CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature
-URL_START=1 # start at this URL in LINKS_FILE
-URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE
-UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report
+LINKS_URL="" # use 'curl' to download file with links from this location (can be file://)
+EXCEPT_URL="" # 'curl' will access this wiki page with a list of exceptions for NG results
+OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
+RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES
+SHOW_SLASH=0 # record issue when a slash is added to the end of a URL
+SHOW_HTTPS=0 # record issue when "http" is upgraded to "https"
+SHOW_YT_RD=0 # record redirect when a youtu.be URL is expanded to the full URL
+SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
+CHECK_ARCHIVE_LINKS=0 # check URLs under the archive.org domain
+TAKE_PAGE_SHOT=0 # take a screenshot of each OK page
+TIMEOUT=10 # time to wait for a response when querying a site
+CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature
+URL_START=1 # start at this URL in LINKS_FILE
+URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE
+UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report
# Fixed strings -- see the occurrences of these variables to learn their purpose
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 OPR/70.0.3728.154"
@@ -132,7 +132,7 @@ SYNOPSIS
validate_external_links.sh --help
validate_external_links.sh --links URL --output DIR [--exceptions URL]
[--record-ok-links] [--show-added-slashes] [--show-https-upgrades]
- [--show-yt-redirects] [--suggest-snapshots] [--skip-archive-links]
+ [--show-yt-redirects] [--suggest-snapshots] [--check-archive-links]
[--take-screenshots FILE] [--timeout NUM] [--start-url NUM]
[--end-url NUM] [--upload FILE]
@@ -176,8 +176,9 @@ OPTIONS
--show-yt-redirects Report on redirects that expand a youtu.be URL.
--suggest-snapshots Query the Internet Archive for a possible
snapshot URL for each "NG" page.
- --skip-archive-links Don't check links that are already pointing to
- a page on the Internet Archive.
+ --check-archive-links Check links that are already pointing to a page
+ on the Internet Archive. In theory these links
+ should be totally stable and not need validation.
--take-screenshots FILE Call the Google Chrome binary at this path to
take screenshots of each "OK" page.
--timeout NUM Wait this many seconds for a site to respond. The
@@ -217,7 +218,7 @@ while (( "$#" )); do
--show-https-upgrades ) SHOW_HTTPS=1; shift;;
--show-yt-redirects ) SHOW_YT_RD=1; shift;;
--suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;;
- --skip-archive-links ) SKIP_ARCHIVE_LINKS=1; shift;;
+ --check-archive-links ) CHECK_ARCHIVE_LINKS=1; shift;;
--take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
--timeout ) TIMEOUT=$2; shift 2;;
--start-url ) URL_START=$2; shift 2;;
@@ -260,9 +261,13 @@ OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
SHOT_PATH="$OUTPUT_PATH/Screenshots"
LOG_NAME="ValExtLinks report"
-LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt"
-LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf"
-LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm"
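+# Store the report names both as bare file names and as full paths under the output folder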
+LOG_NAME_TXT="$LOG_NAME.txt"
+LOG_NAME_RTF="$LOG_NAME.rtf"
+LOG_NAME_HTM="$LOG_NAME.htm"
+LOG_PATH="$OUTPUT_PATH/$LOG_NAME"
+LOG_PATH_TXT="$LOG_PATH.txt"
+LOG_PATH_RTF="$LOG_PATH.rtf"
+LOG_PATH_HTM="$LOG_PATH.htm"
mkdir "$OUTPUT_PATH"
if [ $TAKE_PAGE_SHOT -eq 1 ]; then
mkdir "$SHOT_PATH"
@@ -360,29 +365,29 @@ function valPrint()
fi
if [[ "$1" == *t* ]]; then
if [[ "$1" == *n* ]]; then
- echo -n "$2" >> "$LOG_TXT"
+ echo -n "$2" >> "$LOG_PATH_TXT"
elif [[ "$1" == *s* ]]; then
- echo -e "$2\n" >> "$LOG_TXT"
+ echo -e "$2\n" >> "$LOG_PATH_TXT"
else
- echo "$2" >> "$LOG_TXT"
+ echo "$2" >> "$LOG_PATH_TXT"
fi
fi
if [[ "$1" == *r* ]]; then
if [[ "$1" == *n* ]]; then
- echo "$2" >> "$LOG_RTF"
+ echo "$2" >> "$LOG_PATH_RTF"
elif [[ "$1" == *s* ]]; then
- echo "$2\line\line" >> "$LOG_RTF"
+ echo "$2\line\line" >> "$LOG_PATH_RTF"
else
- echo "$2\line" >> "$LOG_RTF"
+ echo "$2\line" >> "$LOG_PATH_RTF"
fi
fi
if [[ "$1" == *h* ]]; then
if [[ "$1" == *s* ]]; then
- echo "$2
| |
" >> "$LOG_HTM"
+ echo "$2| |
" >> "$LOG_PATH_HTM"
elif [[ "$1" == *n* ]]; then
- echo "$2" >> "$LOG_HTM"
+ echo "$2" >> "$LOG_PATH_HTM"
else
- echo "$2
" >> "$LOG_HTM"
+ echo "$2
" >> "$LOG_PATH_HTM"
fi
fi
}
@@ -437,11 +442,11 @@ function pluralCheckAn()
fi
}
-# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
+# Upload the reports using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
# reports being saved to disk have already been closed.
function uploadReport()
{
- valPrint c "Uploading HTML report..."
+ valPrint c "Uploading reports..."
SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
SFTP_USER_NAME_MARKER="user:"
@@ -457,9 +462,15 @@ function uploadReport()
SFTP_PATH=$(grep $SFTP_PATH_MARKER $UPLOAD_INFO)
SFTP_PATH=${SFTP_PATH#$SFTP_PATH_MARKER}
- expect "$SCRIPT_PATH" "$LOG_HTM" $SFTP_USER_NAME $SFTP_PASSWORD $SFTP_PORT $SFTP_PATH "$LOG_NAME.htm"
+ for SUFFIX in htm rtf txt; do
+ expect "$SCRIPT_PATH" "$LOG_PATH.$SUFFIX" $SFTP_USER_NAME $SFTP_PASSWORD $SFTP_PORT $SFTP_PATH "$LOG_NAME.$SUFFIX"
+ UPLOAD_RESULT=$?
- valPrint c "Report was uploaded, unless an error message appears above."
+ if [ $UPLOAD_RESULT -ne 0 ]; then
+ valPrint c "Error $UPLOAD_RESULT occurred when attempting to upload $LOG_NAME.$SUFFIX!"
+ else
+ valPrint c "Report in `echo $SUFFIX | tr '[:lower:]' '[:upper:]'` format was uploaded."
+ fi
+ done
}
# Prints session summary when script is done
@@ -493,6 +504,15 @@ function wrapupAndExit()
LINK_PROBLEMS_IW=$((IW_LINKS-SKIP_EXPECT_IW))
LINK_PROBLEMS_NET=$((LINK_PROBLEMS_NG+LINK_PROBLEMS_RD+LINK_PROBLEMS_EI+LINK_PROBLEMS_IW))
+ # Print something in the Links section if no link issues were printed
+ if [ $LINK_PROBLEMS_NET -eq 0 ]; then
+ valPrint h "No link problems to report! See the RTF or TXT report for a list of links with issues that were not reported."
+ fi
+ if [ $LINK_PROBLEMS_TOTAL -eq 0 ]; then
+ valPrint t "No link problems to report!"
+ valPrint r "\i1 No link problems to report! \i0"
+ fi
+
## SUMMARY OUTPUT ##
valPrint ct "Summary ($ELAPSED):"
valPrint r "\b1 Summary \b0 ($ELAPSED)"
@@ -509,7 +529,11 @@ function wrapupAndExit()
if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi
# Print errored link totals
- if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"; fi
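+ # The RTF and TXT reports list the specific links, so only the console and HTML output need a pointer to them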
+ if [ $LINK_ERRORS -gt 0 ]; then
+ valPrint c "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"
+ valPrint h "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"
+ valPrint rt "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS):"
+ fi
if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
@@ -518,7 +542,11 @@ function wrapupAndExit()
if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
# Print excepted link totals
- if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi
+ if [ $LINKS_EXCEPTED -gt 0 ]; then
+ valPrint c "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"
+ valPrint h "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"
+ valPrint rt "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted:"
+ fi
if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
if [ $SKIP_EXPECT_RD -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_RD/$RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
@@ -637,7 +665,7 @@ valPrint ctrhn "Ignore youtu.be redirect
if [ $SHOW_YT_RD -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
valPrint ctrhn "Check archive.org links: "
-if [ $SKIP_ARCHIVE_LINKS -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
+if [ $CHECK_ARCHIVE_LINKS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
valPrint tr "A summary of my findings will be found at the bottom of the report."
valPrint h "A summary of my findings will be found at the bottom of the report."
@@ -765,9 +793,9 @@ for LINE in `cat "$LINKS_FILE"`; do
continue
fi
- # If we're skipping Archive.org links, check if this is one
- if [ $SKIP_ARCHIVE_LINKS -eq 1 ] && [[ $URL == *web.archive.org* ]]; then
- valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have been asked not to check Wayback Machine links."
+ # If we're skipping Archive.org links, see if this is one
+ if [ $CHECK_ARCHIVE_LINKS -eq 0 ] && [[ $URL == *web.archive.org* ]]; then
+ valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to check Wayback Machine links."
let SKIP_ARCHIVE_ORG+=1
continue
fi
@@ -892,8 +920,9 @@ for LINE in `cat "$LINKS_FILE"`; do
let EI_LINKS+=1
fi
- # If it's not, check if this is a link to a domain that we have an interwiki prefix for
- if [ $STATUS == "??" ]; then
+ # If it's not, check if this is a link to a domain that we have an interwiki prefix for (also make
+ # sure that it's not an archive.org link to a page from an interwiki domain)
+ if [ $STATUS == "??" ] && [[ $URL != *web.archive.org* ]]; then
for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]] && [[ $URL != *${INTERWIKI_DOMAINS[$i]}/w/* ]]; then
STATUS="IW"