29 |
|
|
30 |
|
### GLOBALS ### |
31 |
|
# Settings -- these will be changed from their defaults by the arguments passed in to the script |
32 |
< |
LINKS_URL="" # use 'curl' to download file with links from this location (can be file://) |
33 |
< |
EXCEPT_URL="" # 'curl' will access this wiki page with a list of exceptions for NG results |
32 |
> |
LINKS_URL="" # download external link CSV from this location (can use "file://" protocol) |
33 |
> |
EXCEPT_URL="" # location of wiki page with a list of exceptions for NG results |
34 |
|
OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder |
35 |
|
RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES |
36 |
|
SHOW_SLASH=0 # record issue when a slash is added to the end of a URL |
47 |
|
UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report |
48 |
|
|
49 |
|
# Fixed strings -- see the occurrences of these variables to learn their purpose |
50 |
< |
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/88.0.4324.146 Safari/537.36" |
50 |
> |
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.134 Safari/537.36" |
51 |
|
ARCHIVE_API="http://archive.org/wayback/available" |
52 |
|
ARCHIVE_GENERIC="https://web.archive.org/web/*" |
53 |
|
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206" |
68 |
|
|
69 |
|
# These arrays tell the script which suffixes at the ends of URLs represent files and which are pages. |
70 |
|
# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code. |
71 |
< |
declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png psd py rar tga TRMA txt vbs wav wmv xaf xcf xml zip) |
71 |
> |
declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png psd py rar tga TRMA txt vbs wav wmv xaf xcf xlsx xml zip) |
72 |
|
declare -a HTTP_TLDS_AND_PAGES=(abstract action ars asp aspx cfm cgi com css de do full htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x) |
73 |
|
|
74 |
|
# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which |
562 |
|
if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi |
563 |
|
if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi |
564 |
|
|
565 |
+ |
# Perform exceptions audit |
566 |
+ |
EXCEPTION_ISSUES=0 |
567 |
+ |
valPrint ctrh "Exceptions list audit:" |
568 |
+ |
for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do |
569 |
+ |
EXCEPT_LINE="${EXCEPT_ARRAY[$i]}" |
570 |
+ |
EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/\&amp;/\&/g') # copied from exception-matching code |
571 |
+ |
|
572 |
+ |
if [ ${EXCEPT_FOUND[$i]} -eq 0 ]; then |
573 |
+ |
EXCEPT_URL="${EXCEPT_LINE#*,}" |
574 |
+ |
EXCEPT_URL="${EXCEPT_URL%,*}" |
575 |
+ |
EXCEPT_PAGE="${EXCEPT_LINE##*,}" |
576 |
+ |
EXCEPT_PAGE="${EXCEPT_PAGE%% *}" |
577 |
+ |
if [ "$EXCEPT_PAGE" == "*" ]; then |
578 |
+ |
valPrint tr "- The link '$EXCEPT_URL' did not return an error or is not on any page." |
579 |
+ |
else |
580 |
+ |
valPrint tr "- The link '$EXCEPT_URL' did not return an error or is not on page '$EXCEPT_PAGE'." |
581 |
+ |
fi |
582 |
+ |
let EXCEPTION_ISSUES+=1 |
583 |
+ |
elif [ ${EXCEPT_USED[$i]} -eq 0 ]; then |
584 |
+ |
EXCEPT_URL="${EXCEPT_LINE#*,}" |
585 |
+ |
EXCEPT_URL="${EXCEPT_URL%,*}" |
586 |
+ |
EXCEPT_CODE=${EXCEPT_LINE%%,*} |
587 |
+ |
valPrint tr "- The link '$EXCEPT_URL' did not return error code $EXCEPT_CODE." |
588 |
+ |
let EXCEPTION_ISSUES+=1 |
589 |
+ |
fi |
590 |
+ |
done |
591 |
+ |
if [ $EXCEPTION_ISSUES -eq 0 ]; then |
592 |
+ |
valPrint ctrh "- No issues found." |
593 |
+ |
else |
594 |
+ |
valPrint c "$EXCEPTION_ISSUES exception list $(pluralCheckNoun issue $EXCEPTION_ISSUES) detected (see RTF or TXT report for details)." |
595 |
+ |
valPrint h "$EXCEPTION_ISSUES exception list $(pluralCheckNoun issue $EXCEPTION_ISSUES) detected (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for details)." |
596 |
+ |
fi |
597 |
+ |
|
598 |
|
# Print checked link totals |
599 |
|
if [ $LINK_PROBLEMS_NET -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS_NET link $(pluralCheckNoun issue $LINK_PROBLEMS_NET):"; fi |
600 |
|
if [ $LINK_PROBLEMS_NG -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_NG NG $(pluralCheckNoun link $LINK_PROBLEMS_NG)"; fi |
662 |
|
|
663 |
|
# Transfer to array for easy searching later |
664 |
|
declare -a EXCEPT_ARRAY=($(echo "$EXCEPT_DATA")) |
665 |
+ |
|
666 |
+ |
# Create parallel arrays for marking which exceptions get used later |
667 |
+ |
declare -a EXCEPT_USED=() |
668 |
+ |
declare -a EXCEPT_FOUND=() |
669 |
+ |
for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do |
670 |
+ |
EXCEPT_USED+=(0) |
671 |
+ |
EXCEPT_FOUND+=(0) |
672 |
+ |
done |
673 |
|
fi |
674 |
|
|
675 |
|
# Pipe 'cat' to 'wc' because passing LINKS_FILE to 'wc' would print the file name after the line count |
729 |
|
valPrint t "Legend:" |
730 |
|
valPrint r "\b1 Legend \b0" |
731 |
|
valPrint hn "<h3>Legend</h3>" |
732 |
< |
valPrint t "(For guidance in fixing these links, see $WIKI_MAIN.)" |
733 |
< |
valPrint r "(For guidance in fixing these links, see {\field{\*\fldinst{HYPERLINK \"$WIKI_MAIN\"}}{\fldrslt here}}.)" |
734 |
< |
valPrint h "(For guidance in fixing these links, see <a href=\"$WIKI_MAIN\" target=\"_blank\">here</a>.)" |
732 |
> |
valPrint t "(For guidance in fixing these links, see $WIKI_MAIN. The exceptions list is at $EXCEPT_URL.)" |
733 |
> |
valPrint r "(For guidance in fixing these links, see {\field{\*\fldinst{HYPERLINK \"$WIKI_MAIN\"}}{\fldrslt here}}. The exceptions list is {\field{\*\fldinst{HYPERLINK \"$EXCEPT_URL\"}}{\fldrslt here}}.)" |
734 |
> |
valPrint h "(For guidance in fixing these links, see <a href=\"$WIKI_MAIN\" target=\"_blank\">here</a>. The exceptions list is <a href=\"$EXCEPT_URL\" target=\"_blank\">here</a>.)" |
735 |
|
valPrint trh "OK = URL seems to be working" |
736 |
|
valPrint trh "NG = URL no longer seems to work" |
737 |
|
valPrint trh "RD = URL is redirecting to this new URL" |
869 |
|
# If the URL ends in something like "#section_15", strip everything from the '#' onward |
870 |
|
CLEAN_URL=${CLEAN_URL%%\#*} |
871 |
|
|
872 |
< |
# 'sed' cannot handle Unicode in my Bash shell, so skip non-ASCII URL and make user check it |
872 |
> |
# 'sed' cannot handle Unicode in my Bash shell, so skip non-ASCII URL and make reader check it |
873 |
|
if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then |
874 |
|
valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I cannot handle non-ASCII characters." |
875 |
|
let SKIP_NON_ASCII+=1 |
942 |
|
shopt -u nocasematch |
943 |
|
fi |
944 |
|
|
945 |
< |
# If this suffix escaped identification as either a file, page or TLD, inform the user |
945 |
> |
# If this suffix escaped identification as either a file, page or TLD, inform the reader |
946 |
|
STR_TYPE="" |
947 |
|
if [ $IS_FILE -eq -1 ]; then |
948 |
|
valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I encountered the unknown URL suffix '$POST_DOT'. Please add this suffix to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES." |
1105 |
|
|
1106 |
|
# Check problem links against exceptions list before proceeding |
1107 |
|
FOUND_EXCEPT=0 |
1108 |
< |
if [ $STATUS != "OK" ] && [ ! -z $EXCEPT_URL ]; then |
1108 |
> |
if [ $STATUS != "OK" ] && [ ! -z "$EXCEPT_URL" ]; then |
1109 |
|
# The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW" |
1110 |
|
EXPECT_CODE="$CURL_RESULT" |
1111 |
|
if [ $STATUS == "EI" ]; then |
1123 |
|
# other HTML-encoded characters are not found in URLs |
1124 |
|
EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/\&amp;/\&/g') |
1125 |
|
|
1126 |
< |
# Match URL |
1126 |
> |
# Check for URL match |
1127 |
|
EXCEPT_URL="${EXCEPT_LINE#*,}" |
1128 |
|
EXCEPT_URL="${EXCEPT_URL%,*}" |
1129 |
|
if [ "$EXCEPT_URL" != "$URL" ]; then |
1130 |
|
continue |
1131 |
|
fi |
1132 |
|
|
1133 |
< |
# Match containing page's name |
1133 |
> |
# Check for page name match |
1134 |
|
EXCEPT_PAGE="${EXCEPT_LINE##*,}" |
1135 |
|
EXCEPT_PAGE="${EXCEPT_PAGE%% *}" |
1136 |
< |
if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == $LOCAL_PAGE_PATH ]; then |
1137 |
< |
# Match result code |
1136 |
> |
if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == "$LOCAL_PAGE_PATH" ]; then |
1137 |
> |
let EXCEPT_FOUND[$i]+=1 |
1138 |
> |
valPrint trs "Found exception '$URL' on page '$LOCAL_PAGE_PATH'." |
1139 |
> |
|
1140 |
> |
# Check for result code match |
1141 |
|
EXCEPT_CODE=${EXCEPT_LINE%%,*} |
1142 |
|
if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then |
1143 |
+ |
FOUND_EXCEPT=1 |
1144 |
+ |
let EXCEPT_USED[$i]+=1 |
1145 |
|
valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because its expected result, '$EXPECT_CODE', is in the exceptions list." |
1146 |
+ |
|
1147 |
|
if [ $STATUS == "EI" ]; then |
1148 |
|
let SKIP_EXPECT_EI+=1 |
1149 |
|
elif [ $STATUS == "IW" ]; then |
1153 |
|
else |
1154 |
|
let SKIP_EXPECT_NG+=1 |
1155 |
|
fi |
1156 |
< |
FOUND_EXCEPT=1 |
1156 |
> |
|
1157 |
|
break |
1158 |
|
fi |
1159 |
|
fi |
1227 |
|
# Issue query to the API |
1228 |
|
ARCHIVE_QUERY=$(curl --silent --max-time $TIMEOUT "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES") |
1229 |
|
|
1230 |
< |
# Notify user if we hit the rate limit and just keep going |
1230 |
> |
# Notify reader if we hit the rate limit and just keep going |
1231 |
|
if [[ "$ARCHIVE_QUERY" == "*Too Many Requests*" ]]; then |
1232 |
|
valPrint t " IA has rate-limited us!" |
1233 |
|
valPrint r " IA has rate-limited us!" |
1234 |
|
valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td>(hit the API rate limit!)</td></tr>" |
1235 |
< |
# If a "closest" snapshot was received, inform user |
1235 |
> |
# If a "closest" snapshot was received, inform reader |
1236 |
|
elif [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then |
1237 |
|
# In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it |
1238 |
|
ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/') |
1245 |
|
# Remove the port 80 part that IA often adds to the URL, as it's superfluous |
1246 |
|
SNAPSHOT_URL=$(echo $SNAPSHOT_URL | sed 's/:80//') |
1247 |
|
|
1248 |
< |
# Inform the user of the snapshot URL |
1248 |
> |
# Inform the reader of the snapshot URL |
1249 |
|
valPrint ts " IA suggests $SNAPSHOT_URL" |
1250 |
|
valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}" |
1251 |
|
valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>" |