103 |  This script parses a list of external links found in the OniGalore wiki
104 |  (which is dumped by the Oni2.net domain periodically in a particular
105 |  format), validates them using the Unix tool 'curl', and produces a report
106 <  of which links were OK (responded to an HTTP query) and which were NG (no
107 <  good). This report can then be automatically uploaded to the location of
106 >  of which links were OK (responded positively to an HTTP query), which
107 >  were RD (responded with a 3xx redirect code), which could be IW (inter-
108 >  wiki) links, and which were NG (no good; a negative response to the
109 >  query). This report can then be automatically uploaded to the location of
110 |  your choice. The script can also suggest Internet Archive snapshots for
111 |  NG links, and take screenshots of OK links for visual verification by the
112 |  reader that the page in question is the one intended to be displayed.
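The heart of the validation step is asking curl for each URL's HTTP response code. As a rough illustration of that step only (not the script's actual code; a couple of the script's own curl calls appear further down in this diff), a single check amounts to something like:

   # Illustrative one-off check of a single URL's response code
   URL="http://www.example.com"
   CODE=$(curl --silent --head --max-time 10 --output /dev/null --write-out '%{http_code}' "$URL")
   echo "$URL returned $CODE"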
122 |                        file:// protocol) (required)
123 |  --output DIR          Place the folder which will contain the reports and
124 |                        optional screenshots at this path (required)
125 <  --exceptions DIR      Don't log an NG link if it is listed in the file
126 <                        provided at this path as long as the response code is
127 <                        the same as the one associated with the link
128 <  --record-ok-links     Log a link in the report whether its response code is
129 <                        in the OK_CODES or the NG_CODES array
125 >  --exceptions URL      In order to remove links from the list which show as
126 >                        NG but which you regard as OK, prepare a plain-text
127 >                        file where each line contains a response code being
128 >                        returned and the URL returning it, separated by a
129 >                        comma, e.g. "403,http://www.example.com" (note that
130 >                        this can be a local file if you use the
131 >                        file:// protocol)
132 >  --record-ok-links     Log a link in the report even if its response code is
133 >                        OK
134 |  --suggest-snapshots   Query the Internet Archive for a possible snapshot
135 |                        URL for each NG page
136 |  --take-screenshots    Save screenshots of each OK page (requires Google
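Putting the options together, a run might be invoked along these lines (the script's file name and the flag that supplies the links list are assumptions here, since that option's name is cut off above; the remaining flags are as documented):

   # Hypothetical invocation; adjust the script name and the links-list flag to match your copy
   ./validate_external_links.sh \
      --links http://lists.example.com/external_links.txt \
      --output ~/link-reports \
      --exceptions file:///home/me/exceptions.txt \
      --suggest-snapshots

where exceptions.txt contains one "code,URL" pair per line, e.g. "403,http://www.example.com".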
421 |  if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
422 |  if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
423 |  if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
424 <  valPrint ctrh "Out of the $LINKS_CHECKED links checked, $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG."
419 <  if [ $IW_LINKS -gt 0 ]; then
420 <     valPrint ctrh "$IW_LINKS/$OK_LINKS OK $(pluralCheckNoun link $OK_LINKS) $(pluralCheckIs $IW_LINKS) $(pluralCheckAn $IW_LINKS)external $(pluralCheckNoun link $IW_LINKS) that could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS)."
421 <  fi
424 >  valPrint ctrh "Out of the $LINKS_CHECKED links checked, $IW_LINKS could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS), $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG."
425 |  if [ $SKIP_EXCEPT -gt 0 ]; then
426 |     valPrint ctrh "$SKIP_EXCEPT/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS) went unlisted due to being found in the exceptions file."
427 |  fi
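The pluralCheckNoun, pluralCheckWas, pluralCheckA, pluralCheckAn, and pluralCheckIs helpers used in these valPrint calls are defined earlier in the script and are not part of this diff; they keep the report grammatical for counts of one versus many. A minimal stand-in for one of them (an assumption about its shape, not the author's actual code) would be:

   # Hypothetical stand-in: print a noun in singular or plural form based on a count
   pluralCheckNoun()
   {
      if [ $2 -eq 1 ]; then
         echo "$1"
      else
         echo "${1}s"
      fi
   }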
448 |  printHTMheader
449 |  
450 |  # Attempt to download file at LINKS_URL, then check that it succeeded
451 <  valPrint ctrh "Downloading list of external links from $LINKS_URL."
451 >  valPrint cwtrh "Downloading list of external links from $LINKS_URL."
452 |  LINKS_FILE_NAME=$(echo "$LINKS_URL" | sed 's/.*\///')
453 |  LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
454 |  curl --silent -o "$LINKS_FILE" $LINKS_URL
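The success check promised by the comment on line 450 falls on lines that are not part of this diff. Under the assumption that an empty or missing download means failure, a minimal version of that check could be:

   # Illustrative only; the script's real check lives in lines not shown here
   if [ ! -s "$LINKS_FILE" ]; then
      echo "Could not download the link list from $LINKS_URL." >&2
      exit 1
   fi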
459 |  
460 |  # Attempt to download file at EXCEPT_URL, then check that it succeeded
461 |  if [ ! -z $EXCEPT_URL ]; then
462 <     valPrint ctrh "Downloading list of NG exceptions from $EXCEPT_URL."
462 >     valPrint cwtrh "Downloading list of NG exceptions from $EXCEPT_URL."
463 |     EXCEPT_FILE_NAME=$(echo "$EXCEPT_URL" | sed 's/.*\///')
464 |     EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
465 |     curl --silent -o "$EXCEPT_FILE" $EXCEPT_URL
557 |  # Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
558 |  NS_NAME=""
559 |  a=0
560 <  while [ "x${NS_IDS[$a]}" != "x" ] # once this evaluates to "x", the array is done
558 <  do
560 >  while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
561 |     if [ $NS_ID -eq ${NS_IDS[$a]} ]; then
562 |        NS_NAME="${NS_NAMES[$a]}"
563 |        break
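NS_IDS and NS_NAMES are parallel arrays defined earlier in the script; the loop walks NS_IDS until it finds the page's namespace number or runs off the end of the array (the "x" comparison). Purely as an illustration of their shape (the wiki's real namespace table is not shown in this diff), they would look something like the standard MediaWiki namespaces:

   # Hypothetical contents; an actual wiki defines its own namespace list
   NS_IDS=(0 1 2 3 4 5 6)
   NS_NAMES=("Main" "Talk" "User" "User_talk" "Project" "Project_talk" "File")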
692 |     CURL_RESULT="$CURL_RESULT-$CURL_ERR"
693 |  fi
694 |  
695 <  # Determine if this code is in our "OK" list
695 >  # Determine our status code for this URL (IW, OK, RD, or NG)
696 |  STATUS="??"
697 |  NEW_URL=""
698 |  INTERWIKI_INDEX=-1
699 <  for CODE in "${OK_CODES[@]}"; do
700 <     if [[ $CODE == $CURL_CODE ]]; then
701 <        let OK_LINKS+=1
702 <  
703 <        # Determine if this is a link to a domain that we have an interwiki prefix for
704 <        for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
703 <           if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]]; then
704 <              STATUS="IW"
705 <              let IW_LINKS+=1
706 <              INTERWIKI_INDEX=$i
707 <              break
708 <           fi
709 <        done
710 <  
711 <        # If this link is OK and no interwiki advisory is needed, just mark as "OK"
712 <        if [ $INTERWIKI_INDEX == -1 ]; then
713 <           STATUS="OK"
714 <        fi
699 >  # First check if this is a link to a domain that we have an interwiki prefix for
700 >  for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
701 >     if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]]; then
702 >        STATUS="IW"
703 >        let IW_LINKS+=1
704 >        INTERWIKI_INDEX=$i
705 |        break
706 |     fi
707 |  done
708 |  
709 +  # If we didn't match an interwiki domain, see if the status code is in our "OK" codes list
710 +  if [ $STATUS == "??" ]; then
711 +     for CODE in "${OK_CODES[@]}"; do
712 +        if [[ $CODE == $CURL_CODE ]]; then
713 +           STATUS="OK"
714 +           let OK_LINKS+=1
715 +           break
716 +        fi
717 +     done
718 +  fi
719 +  
720 |  # If we didn't get a match with the "OK" codes, check it against the "RD" codes
721 |  if [ $STATUS == "??" ]; then
722 |     for CODE in "${RD_CODES[@]}"; do
723 |        if [[ $CODE == $CURL_CODE ]]; then
723 -           STATUS="RD"
724 -           let RD_LINKS+=1
725 -  
724 |           # Get URL header again in order to retrieve the URL we are being redirected to
725 |           NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent '"$AGENT"' --max-time 10 --write-out '%{redirect_url}\n' $URL)
726 |  
727 +           # Check if the redirect URL is just the original URL with https:// instead of http://
728 +           # (this happens a lot and is not an important correction to us); if so, just make it "OK"
729 +           URL_NO_PROTOCOL=${URL#*://}
730 +           NEW_URL_NO_PROTOCOL=${NEW_URL#*://}
731 +           if [ $URL_NO_PROTOCOL == $NEW_URL_NO_PROTOCOL ]; then
732 +              STATUS="OK"
733 +              let OK_LINKS+=1
734 +           else
735 +              STATUS="RD"
736 +              let RD_LINKS+=1
737 +           fi
738 |           break
739 |        fi
740 |     done
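The OK_CODES and RD_CODES arrays consulted here (and the NG_CODES array mentioned in the old --record-ok-links help text) are defined near the top of the script and are not part of this diff. They are plain bash arrays of HTTP response codes; as an illustrative guess at their contents only:

   # Hypothetical values; the script's actual lists may differ
   OK_CODES=(200 401 405 406)          # responses treated as a working link
   RD_CODES=(301 302 303 307 308)      # redirect responses
   NG_CODES=(000 403 404 410 500 503)  # no-good responses (000 = curl could not connect)

Each CURL_CODE is tested in the order shown above (interwiki domains first, then the OK codes, then the RD codes), with anything still unmatched presumably falling through to NG in the code that follows this hunk.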