--- Validate External Links/validate_external_links.sh 2020/09/04 03:07:08 1142 +++ Validate External Links/validate_external_links.sh 2020/09/06 20:51:22 1144 @@ -5,7 +5,7 @@ # Validates a list of external links in CSV format. The resulting logs are produced in three formats: # - TXT (for easy diffing with an earlier log) # - RTF (for reading as a local file with clickable links) -# - HTML (for uploading as a web page). +# - HTML (for reading as a web page) # Call script with "--help" argument for documentation. Also see Read Me.rtf for critical notes. # # Recommended rule: @@ -29,21 +29,21 @@ IFS=" ### GLOBALS ### # Settings -- these will be changed from their defaults by the arguments passed in to the script -LINKS_URL="" # use 'curl' to download file with links from this location (can be file://) -EXCEPT_URL="" # 'curl' will access this wiki page with a list of exceptions for NG results -OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder -RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES -SHOW_SLASH=0 # record response code to the log when a slash is added to the end of a URL -SHOW_HTTPS=0 # record response code to the log when "http" is upgraded to "https" -SHOW_YT_RD=0 # record response code to the log when a youtu.be URL is expanded to the full URL -SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page -SKIP_ARCHIVE_LINKS=0 # don't check URLs under the archive.org domain -TAKE_PAGE_SHOT=0 # take a screenshot of each OK page -TIMEOUT=10 # time to wait for a response when querying a site -CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature -URL_START=1 # start at this URL in LINKS_FILE -URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE -UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report +LINKS_URL="" # use 'curl' to download file with links from this location (can be file://) +EXCEPT_URL="" # 'curl' will access this wiki page with a list of exceptions for NG results +OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder +RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES +SHOW_SLASH=0 # record issue when a slash is added to the end of a URL +SHOW_HTTPS=0 # record issue when "http" is upgraded to "https" +SHOW_YT_RD=0 # record redirection for a youtu.be URL expanding to the full URL +SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page +CHECK_ARCHIVE_LINKS=0 # check URLs under the archive.org domain +TAKE_PAGE_SHOT=0 # take a screenshot of each OK page +TIMEOUT=10 # time to wait for a response when querying a site +CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature +URL_START=1 # start at this URL in LINKS_FILE +URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE +UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report # Fixed strings -- see the occurrences of these variables to learn their purpose AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 OPR/70.0.3728.154" @@ -132,7 +132,7 @@ SYNOPSIS validate_external_links.sh --help validate_external_links.sh --links URL --output DIR [--exceptions URL] [--record-ok-links] [--show-added-slashes] [--show-https-upgrades] - [--show-yt-redirects] 
[--suggest-snapshots] [--skip-archive-links] + [--show-yt-redirects] [--suggest-snapshots] [--check-archive-links] [--take-screenshots FILE] [--timeout NUM] [--start-url NUM] [--end-url NUM] [--upload FILE] @@ -176,8 +176,9 @@ OPTIONS --show-yt-redirects Report on redirects that expand a youtu.be URL. --suggest-snapshots Query the Internet Archive for a possible snapshot URL for each "NG" page. - --skip-archive-links Don't check links that are already pointing to - a page on the Internet Archive. + --check-archive-links Check links that are already pointing to a page + on the Internet Archive. In theory these links + should be totally stable and not need validation. --take-screenshots FILE Call the Google Chrome binary at this path to take screenshots of each "OK" page. --timeout NUM Wait this many seconds for a site to respond. The @@ -217,7 +218,7 @@ while (( "$#" )); do --show-https-upgrades ) SHOW_HTTPS=1; shift;; --show-yt-redirects ) SHOW_YT_RD=1; shift;; --suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;; - --skip-archive-links ) SKIP_ARCHIVE_LINKS=1; shift;; + --check-archive-links ) CHECK_ARCHIVE_LINKS=1; shift;; --take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;; --timeout ) TIMEOUT=$2; shift 2;; --start-url ) URL_START=$2; shift 2;; @@ -260,9 +261,13 @@ OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)" OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER" SHOT_PATH="$OUTPUT_PATH/Screenshots" LOG_NAME="ValExtLinks report" -LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt" -LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf" -LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm" +LOG_NAME_TXT="$LOG_NAME.txt" +LOG_NAME_RTF="$LOG_NAME.rtf" +LOG_NAME_HTM="$LOG_NAME.htm" +LOG_PATH="$OUTPUT_PATH/$LOG_NAME" +LOG_PATH_TXT="$LOG_PATH.txt" +LOG_PATH_RTF="$LOG_PATH.rtf" +LOG_PATH_HTM="$LOG_PATH.htm" mkdir "$OUTPUT_PATH" if [ $TAKE_PAGE_SHOT -eq 1 ]; then mkdir "$SHOT_PATH" @@ -360,29 +365,29 @@ function valPrint() fi if [[ "$1" == *t* ]]; then if [[ "$1" == *n* ]]; then - echo -n "$2" >> "$LOG_TXT" + echo -n "$2" >> "$LOG_PATH_TXT" elif [[ "$1" == *s* ]]; then - echo -e "$2\n" >> "$LOG_TXT" + echo -e "$2\n" >> "$LOG_PATH_TXT" else - echo "$2" >> "$LOG_TXT" + echo "$2" >> "$LOG_PATH_TXT" fi fi if [[ "$1" == *r* ]]; then if [[ "$1" == *n* ]]; then - echo "$2" >> "$LOG_RTF" + echo "$2" >> "$LOG_PATH_RTF" elif [[ "$1" == *s* ]]; then - echo "$2\line\line" >> "$LOG_RTF" + echo "$2\line\line" >> "$LOG_PATH_RTF" else - echo "$2\line" >> "$LOG_RTF" + echo "$2\line" >> "$LOG_PATH_RTF" fi fi if [[ "$1" == *h* ]]; then if [[ "$1" == *s* ]]; then - echo "$2 " >> "$LOG_HTM" + echo "$2 " >> "$LOG_PATH_HTM" elif [[ "$1" == *n* ]]; then - echo "$2" >> "$LOG_HTM" + echo "$2" >> "$LOG_PATH_HTM" else - echo "$2
" >> "$LOG_HTM" + echo "$2
" >> "$LOG_PATH_HTM" fi fi } @@ -437,11 +442,11 @@ function pluralCheckAn() fi } -# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the +# Upload the reports using info specified in the --upload argument. ONLY USE "valPrint c" here, as the # reports being saved to disk have already been closed. function uploadReport() { - valPrint c "Uploading HTML report..." + valPrint c "Uploading reports..." SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME" SFTP_USER_NAME_MARKER="user:" @@ -457,9 +462,15 @@ function uploadReport() SFTP_PATH=$(grep $SFTP_PATH_MARKER $UPLOAD_INFO) SFTP_PATH=${SFTP_PATH#$SFTP_PATH_MARKER} - expect "$SCRIPT_PATH" "$LOG_HTM" $SFTP_USER_NAME $SFTP_PASSWORD $SFTP_PORT $SFTP_PATH "$LOG_NAME.htm" + for SUFFIX in htm rtf txt; do + expect "$SCRIPT_PATH" "$LOG_PATH.$SUFFIX" $SFTP_USER_NAME $SFTP_PASSWORD $SFTP_PORT $SFTP_PATH "$LOG_NAME.$SUFFIX" - valPrint c "Report was uploaded, unless an error message appears above." + if [ "$?" -ne 0 ]; then + valPrint c "Error $? occurred when attempting to upload $LOG_NAME.$SUFFIX!" + else + valPrint c "Report in `echo $SUFFIX | tr [:lower:] [:upper:]` format was uploaded." + fi + done } # Prints session summary when script is done @@ -493,6 +504,15 @@ function wrapupAndExit() LINK_PROBLEMS_IW=$((IW_LINKS-SKIP_EXPECT_IW)) LINK_PROBLEMS_NET=$((LINK_PROBLEMS_NG+LINK_PROBLEMS_RD+LINK_PROBLEMS_EI+LINK_PROBLEMS_IW)) + # Print something in the Links section if no link issues were printed + if [ $LINK_PROBLEMS_NET -eq 0 ]; then + valPrint h "No link problems to report! See the RTF or TXT report for a list of links with issues that were not reported." + fi + if [ $LINK_PROBLEMS_TOTAL -eq 0 ]; then + valPrint t "No link problems to report!" + valPrint r "\i1 No link problems to report! 
\i0" + fi + ## SUMMARY OUTPUT ## valPrint ct "Summary ($ELAPSED):" valPrint r "\b1 Summary \b0 ($ELAPSED)" @@ -509,7 +529,11 @@ function wrapupAndExit() if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h "  (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi # Print errored link totals - if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"; fi + if [ $LINK_ERRORS -gt 0 ]; then + valPrint c "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):" + valPrint h "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):" + valPrint rt "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS):" + fi if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi @@ -518,7 +542,11 @@ function wrapupAndExit() if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi # Print excepted link totals - if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi + if [ $LINKS_EXCEPTED -gt 0 ]; then + valPrint c "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):" + valPrint h "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):" + valPrint rt "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted:" + fi if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi if [ $SKIP_EXPECT_RD -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_RD/$RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi @@ -637,7 +665,7 @@ valPrint ctrhn "Ignore youtu.be redirect if [ $SHOW_YT_RD -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi valPrint ctrhn "Check archive.org links: " -if [ $SKIP_ARCHIVE_LINKS -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi +if [ $CHECK_ARCHIVE_LINKS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi valPrint tr "A summary of my findings will be found at the bottom of the report." valPrint h "A summary of my findings will be found at the bottom of the report." @@ -765,9 +793,9 @@ for LINE in `cat "$LINKS_FILE"`; do continue fi - # If we're skipping Archive.org links, check if this is one - if [ $SKIP_ARCHIVE_LINKS -eq 1 ] && [[ $URL == *web.archive.org* ]]; then - valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have been asked not to check Wayback Machine links." 
+ # If we're skipping Archive.org links, see if this is one + if [ $CHECK_ARCHIVE_LINKS -eq 0 ] && [[ $URL == *web.archive.org* ]]; then + valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to check Wayback Machine links." let SKIP_ARCHIVE_ORG+=1 continue fi @@ -892,8 +920,9 @@ for LINE in `cat "$LINKS_FILE"`; do let EI_LINKS+=1 fi - # If it's not, check if this is a link to a domain that we have an interwiki prefix for - if [ $STATUS == "??" ]; then + # If it's not, check if this is a link to a domain that we have an interwiki prefix for (also make + # sure that it's not an archive.org link to a page from an interwiki domain) + if [ $STATUS == "??" ] && [[ $URL != *web.archive.org* ]]; then for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]] && [[ $URL != *${INTERWIKI_DOMAINS[$i]}/w/* ]]; then STATUS="IW"
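
For reference, a minimal sketch of how the revised script might be invoked now that --skip-archive-links has been replaced by --check-archive-links (archive.org links are skipped by default and only validated when the new flag is passed). The links URL and output directory below are placeholders, not values taken from this repository:

# Hypothetical run; the CSV location and the output folder are placeholders.
./validate_external_links.sh \
   --links "https://example.org/external_links.csv" \
   --output "$HOME/ValExtLinks-reports" \
   --check-archive-links \
   --suggest-snapshots \
   --timeout 15

If --upload FILE is also given, this revision uploads all three report formats (HTM, RTF, TXT) rather than only the HTML report.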