--- Validate External Links/validate_external_links.sh	2017/08/01 17:09:42	1067
+++ Validate External Links/validate_external_links.sh	2017/08/02 04:26:48	1069
@@ -103,8 +103,10 @@ DESCRIPTION
    This script parses a list of external links found in the OniGalore wiki
    (which is dumped by the Oni2.net domain periodically in a particular format),
    validates them using the Unix tool 'curl', and produces a report
-   of which links were OK (responded to an HTTP query) and which were NG (no
-   good). This report can then be automatically uploaded to the location of
+   of which links were OK (responded positively to an HTTP query), which
+   were RD (responded with a 3xx redirect code), which could be IW (inter-
+   wiki) links, and which were NG (no good; a negative response to the
+   query). This report can then be automatically uploaded to the location of
    your choice. The script can also suggest Internet Archive snapshots for
    NG links, and take screenshots of OK links for visual verification by the
    reader that the page in question is the one intended to be displayed.
@@ -120,11 +122,15 @@ OPTIONS
                          file:// protocol) (required)
    --output DIR          Place the folder which will contain the reports and
                          optional screenshots at this path (required)
-   --exceptions DIR      Don't log an NG link if it is listed in the file
-                         provided at this path as long as the response code is
-                         the same as the one associated with the link
-   --record-ok-links     Log a link in the report whether its response code is
-                         in the OK_CODES or the NG_CODES array
+   --exceptions URL      In order to remove links from the list which show as
+                         NG but which you regard as OK, prepare a plain-text
+                         file where each line contains a response code being
+                         returned and the URL returning it, separated by a
+                         comma, e.g. "403,http://www.example.com" (note that
+                         this can be a local file if you use the
+                         file:// protocol)
+   --record-ok-links     Log a link in the report even if its response code is
+                         OK
    --suggest-snapshots   Query the Internet Archive for a possible snapshot
                          URL for each NG page
    --take-screenshots    Save screenshots of each OK page (requires Google
@@ -415,10 +421,7 @@ function wrapupAndExit()
 	if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
 	if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
 	if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
-	valPrint ctrh "Out of the $LINKS_CHECKED links checked, $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG."
-	if [ $IW_LINKS -gt 0 ]; then
-		valPrint ctrh "$IW_LINKS/$OK_LINKS OK $(pluralCheckNoun link $OK_LINKS) $(pluralCheckIs $IW_LINKS) $(pluralCheckAn $IW_LINKS)external $(pluralCheckNoun link $IW_LINKS) that could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS)."
-	fi
+	valPrint ctrh "Out of the $LINKS_CHECKED links checked, $IW_LINKS could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS), $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG."
 	if [ $SKIP_EXCEPT -gt 0 ]; then
 		valPrint ctrh "$SKIP_EXCEPT/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS) went unlisted due to being found in the exceptions file."
 	fi
@@ -445,7 +448,7 @@ printRTFheader
 printHTMheader

 # Attempt to download file at LINKS_URL, then check that it succeeded
-valPrint ctrh "Downloading list of external links from $LINKS_URL."
+valPrint cwtrh "Downloading list of external links from $LINKS_URL."
 LINKS_FILE_NAME=$(echo "$LINKS_URL" | sed 's/.*\///')
 LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
 curl --silent -o "$LINKS_FILE" $LINKS_URL
@@ -456,7 +459,7 @@ fi

 # Attempt to download file at EXCEPT_URL, then check that it succeeded
 if [ ! -z $EXCEPT_URL ]; then
-	valPrint ctrh "Downloading list of NG exceptions from $EXCEPT_URL."
+	valPrint cwtrh "Downloading list of NG exceptions from $EXCEPT_URL."
 	EXCEPT_FILE_NAME=$(echo "$EXCEPT_URL" | sed 's/.*\///')
 	EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
 	curl --silent -o "$EXCEPT_FILE" $EXCEPT_URL
@@ -554,8 +557,7 @@ for LINE in `cat "$LINKS_FILE"`; do
 	# Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
 	NS_NAME=""
 	a=0
-	while [ "x${NS_IDS[$a]}" != "x" ] # once this evaluates to "x", the array is done
-	do
+	while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
 		if [ $NS_ID -eq ${NS_IDS[$a]} ]; then
 			NS_NAME="${NS_NAMES[$a]}"
 			break
@@ -690,42 +692,49 @@ for LINE in `cat "$LINKS_FILE"`; do
 		CURL_RESULT="$CURL_RESULT-$CURL_ERR"
 	fi

-	# Determine if this code is in our "OK" list
+	# Determine our status code for this URL (IW, OK, RD, or NG)
 	STATUS="??"
 	NEW_URL=""
 	INTERWIKI_INDEX=-1
-	for CODE in "${OK_CODES[@]}"; do
-		if [[ $CODE == $CURL_CODE ]]; then
-			let OK_LINKS+=1
-
-			# Determine if this is a link to a domain that we have an interwiki prefix for
-			for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
-				if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]]; then
-					STATUS="IW"
-					let IW_LINKS+=1
-					INTERWIKI_INDEX=$i
-					break
-				fi
-			done
-
-			# If this link is OK and no interwiki advisory is needed, just mark as "OK"
-			if [ $INTERWIKI_INDEX == -1 ]; then
-				STATUS="OK"
-			fi
+	# First check if this is a link to a domain that we have an interwiki prefix for
+	for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
+		if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]]; then
+			STATUS="IW"
+			let IW_LINKS+=1
+			INTERWIKI_INDEX=$i
+			break
 		fi
 	done

+	# If we didn't match an interwiki domain, see if the status code is in our "OK" codes list
+	if [ $STATUS == "??" ]; then
+		for CODE in "${OK_CODES[@]}"; do
+			if [[ $CODE == $CURL_CODE ]]; then
+				STATUS="OK"
+				let OK_LINKS+=1
+				break
+			fi
+		done
+	fi
+
 	# If we didn't get a match with the "OK" codes, check it against the "RD" codes
 	if [ $STATUS == "??" ]; then
 		for CODE in "${RD_CODES[@]}"; do
 			if [[ $CODE == $CURL_CODE ]]; then
-				STATUS="RD"
-				let RD_LINKS+=1
-
 				# Get URL header again in order to retrieve the URL we are being redirected to
 				NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent '"$AGENT"' --max-time 10 --write-out '%{redirect_url}\n' $URL)
+				# Check if the redirect URL is just the original URL with https:// instead of http://
+				# (this happens a lot and is not an important correction to us); if so, just make it "OK"
+				URL_NO_PROTOCOL=${URL#*://}
+				NEW_URL_NO_PROTOCOL=${NEW_URL#*://}
+				if [ $URL_NO_PROTOCOL == $NEW_URL_NO_PROTOCOL ]; then
+					STATUS="OK"
+					let OK_LINKS+=1
+				else
+					STATUS="RD"
+					let RD_LINKS+=1
+				fi
 				break
 			fi
 		done
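
Note on the redirect handling introduced in the last hunk: the revised code asks curl for the redirect target via the '%{redirect_url}' --write-out variable and then treats a redirect that merely swaps http:// for https:// as OK rather than RD. The lines below are a minimal standalone sketch of that protocol-stripping comparison, not part of the diff; the example URL and user-agent string are placeholders rather than values taken from validate_external_links.sh.

#!/bin/bash
# Sketch of the protocol-stripping redirect check; the URL and AGENT values
# here are placeholders, not values from the real script.
URL="http://www.example.com/"
AGENT="Mozilla/5.0"

# Ask curl only for the redirect target; an empty result means no 3xx redirect was returned
NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{redirect_url}' "$URL")

# ${VAR#*://} strips the shortest prefix ending in "://", so the two URLs compare
# equal when the only difference between them is the protocol
URL_NO_PROTOCOL=${URL#*://}
NEW_URL_NO_PROTOCOL=${NEW_URL#*://}

if [ -z "$NEW_URL" ]; then
	echo "No redirect returned for $URL"
elif [ "$URL_NO_PROTOCOL" == "$NEW_URL_NO_PROTOCOL" ]; then
	echo "OK: $URL only redirects to its https:// form"
else
	echo "RD: $URL redirects to $NEW_URL"
fi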