root/Oni2/Validate External Links/validate_external_links.sh

Comparing Validate External Links/validate_external_links.sh (file contents):
Revision 1137 by iritscen, Tue Jul 21 14:16:54 2020 UTC vs.
Revision 1141 by iritscen, Fri Sep 4 02:54:30 2020 UTC

# Line 1 | Line 1
1   #!/bin/bash
2  
3   # Validate External Links by Iritscen
4 < # Provided with a list of external links in an expected CSV format, this script validates them. The
5 < # resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF (for
6 < # reading as a local file with clickable links), and HTML (for uploading as a web page). Call script
7 < # with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
4 > #
5 > # Validates a list of external links in CSV format. The resulting logs are produced in three formats:
6 > # - TXT (for easy diffing with an earlier log)
7 > # - RTF (for reading as a local file with clickable links)
8 > # - HTML (for uploading as a web page)
9 > # Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
10 > #
11   # Recommended rule:
12   # |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|
13 + #
14 + # Table of contents (sections of script in order of appearance, not execution):
15 + # • Globals
16 + # • Help Output
17 + # • Setup
18 + # • Utility Functions
19 + # • Summary Output
20 + # • Initialization
21 + #   • Data Sourcing
22 + #   • Config Output
23 + #   • Legend Output
24 + # • Main Loop
25  
26   # Set separator token to newline
27   IFS="
# Line 24 | Line 39 | SHOW_YT_RD=0         # record response c
39   SUGGEST_SNAPSHOTS=0  # query the Internet Archive for a possible snapshot URL for each NG page
40   SKIP_ARCHIVE_LINKS=0 # don't check URLs under the archive.org domain
41   TAKE_PAGE_SHOT=0     # take a screenshot of each OK page
42 + TIMEOUT=10           # time to wait for a response when querying a site
43   CHROME_PATH=""       # path to a copy of Google Chrome that has the command-line screenshot feature
44   URL_START=1          # start at this URL in LINKS_FILE (1 by default)
45   URL_LIMIT=0          # if non-zero, stop at this URL in LINKS_FILE
# Line 35 | Line 51 | ARCHIVE_API="http://archive.org/wayback/
51   ARCHIVE_GENERIC="https://web.archive.org/web/*"
52   ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
53   CHROME_SCREENSHOT="screenshot.png"
38 CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt"
54   EXCEPT_FILE_NAME="exceptions.txt"
55   EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
56 < HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt"
57 < MY_WIKI_PAGE="https://wiki.oni2.net/User:Iritscen"
56 > WIKI_CURL="https://wiki.oni2.net/Validate_External_Links/Curl_codes"
57 > WIKI_HTTP="https://wiki.oni2.net/Validate_External_Links/HTTP_codes"
58 > WIKI_MAIN="https://wiki.oni2.net/Validate_External_Links"
59 > WIKI_ME="http://iritscen.oni2.net"
60   THIS_DIR=$(cd $(dirname $0); pwd)
61   WORKING_DIR=$(pwd)
62   WIKI_PATH="wiki.oni2.net"
# Line 100 | Line 117 | START_RUN=0
117   END_RUN=0
118  
119  
120 < ### HELP ###
120 > ### HELP OUTPUT ###
121   # A pseudo-man page. Here is the 80-character rule for the page text:
122   # 234567890123456789012345678901234567890123456789012345678901234567890123456789
123   function printHelp()
# Line 115 | Line 132 | SYNOPSIS
132         validate_external_links.sh --links URL --output DIR [--exceptions URL]
133            [--record-ok-links] [--show-added-slashes] [--show-https-upgrades]
134            [--show-yt-redirects] [--suggest-snapshots] [--skip-archive-links]
135 <          [--take-screenshots FILE] [--start-url NUM] [--end-url NUM]
136 <          [--upload FILE]
135 >          [--take-screenshots FILE] [--timeout NUM] [--start-url NUM]
136 >          [--end-url NUM] [--upload FILE]
137  
138   DESCRIPTION
139         This script parses a list of external links found in the OniGalore wiki
# Line 162 | Line 179 | OPTIONS
179                                 a page on the Internet Archive.
180         --take-screenshots FILE Call the Google Chrome binary at this path to
181                                 take screenshots of each "OK" page.
182 +       --timeout NUM           Wait this many seconds for a site to respond. The
183 +                               default is 10.
184         --start-url NUM         Start at this link in the links CSV file.
185         --end-url NUM           Stop at this link in the links CSV file.
186         --upload FILE           Upload report using the credentials and path
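
For instance, a run exercising the new --timeout option might be invoked like this (the URL and output path are placeholders):

      ./validate_external_links.sh --links http://example.com/external_links.csv \
         --output ~/val_reports --timeout 20 --suggest-snapshots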
# Line 196 | Line 215 | while (( "$#" )); do
215        --suggest-snapshots )   SUGGEST_SNAPSHOTS=1;                shift;;
216        --skip-archive-links )  SKIP_ARCHIVE_LINKS=1;               shift;;
217        --take-screenshots )    TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
218 +      --timeout )             TIMEOUT=$2;                         shift 2;;
219        --start-url )           URL_START=$2;                       shift 2;;
220        --end-url )             URL_LIMIT=$2;                       shift 2;;
221        --upload )              UPLOAD_INFO=$2;                     shift 2;;
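
The parser stores the --timeout value verbatim; a defensive variant (hypothetical, not part of the script) could confirm it is a positive integer before accepting it:

      --timeout )  if [[ "$2" =~ ^[1-9][0-9]*$ ]]; then
                      TIMEOUT=$2
                   else
                      echo "--timeout expects a whole number of seconds."; exit 1
                   fi
                   shift 2;;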
# Line 266 | Line 286 | function printTXTheader()
286     valPrint t "Validate External Links report"
287     valPrint t "generated $NICE_TIME"
288     valPrint t "from data of $LINKS_DATE"
289 <   valPrint t "script by Iritscen (contact: $MY_WIKI_PAGE)"
289 >   valPrint t "script by Iritscen (contact: $WIKI_ME)"
290     valPrint t ""
291   }
292  
# Line 283 | Line 303 | function printRTFheader()
303   \f0\fs28 \cf0 \b1 Validate External Links report \b0\\
304   generated $NICE_TIME\\
305   from data of $LINKS_DATE\\
306 < script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$MY_WIKI_PAGE\"}}{\fldrslt contact}})
306 > script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$WIKI_ME\"}}{\fldrslt contact}})
307   \\
308   \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
309   \cf0 "
# Line 306 | Line 326 | function printHTMheader()
326   <h2>Validate External Links report</h2>
327   <h3>generated $NICE_TIME<br />
328   from data of $LINKS_DATE<br />
329 < script by Iritscen (<a href=\"$MY_WIKI_PAGE\" target=\"_blank\">contact</a>)</h3>"
329 > script by Iritscen (<a href=\"$WIKI_ME\" target=\"_blank\">contact</a>)</h3>"
330   }
331  
332   # Closes the HTML markup of the HTML log file
# Line 318 | Line 338 | function printHTMfooter()
338  
339   # The central logging function. The first parameter is a string composed of one or more characters that
340   # indicate which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
341 < # 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 's' means "Print an extra newline at the end." 'w' means "Don't pass console output through 'fmt'" ("fmt" fits the output
341 > # 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 's' means "Print an
342 > # extra newline at the end." 'w' means "Don't pass console output through 'fmt'" ("fmt" fits the output
343   # to an 80-column CLI but can break special formatting and the 'n' option).
344   function valPrint()
345   {
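
As the comment above describes, the first argument is a string of destination and formatting flags; typical calls, taken from elsewhere in this revision, look like:

      valPrint ctrh "Found $LINK_COUNT links to process."   # console plus the TXT, RTF and HTML logs
      valPrint hn "<h3>Config</h3>"                         # HTML log only, no trailing newline
      valPrint cwtrhn "Downloading list of external links from $LINKS_URL..."  # 'w' bypasses 'fmt'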
# Line 464 | Line 485 | function wrapupAndExit()
485     TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
486     LINKS_CHECKED=$((LINKS_PROCESSED-LINK_ERRORS))
487  
488 <   # Print summary header
488 >   ## SUMMARY OUTPUT ##
489     valPrint ct "Summary ($ELAPSED):"
490     valPrint r "\b1 Summary \b0 ($ELAPSED)"
491     valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
# Line 525 | Line 546 | printTXTheader
546   printRTFheader
547   printHTMheader
548  
549 + ## DATA SOURCING ##
550 + valPrint t "Startup:"
551 + valPrint r "\b1 Startup \b0"
552 + valPrint hn "<h3>Startup</h3>"
553 +
554   # Attempt to download file at LINKS_URL, then check that it succeeded
555 < valPrint t "Config:"
530 < valPrint r "\b1 Config \b0"
531 < valPrint hn "<h3>Config</h3>"
532 < valPrint cwtrh "Downloading list of external links from $LINKS_URL."
555 > valPrint cwtrhn "Downloading list of external links from $LINKS_URL..."
556   LINKS_FILE_NAME=$(echo "$LINKS_URL" | sed 's/.*\///')
557   LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
558   curl --silent -o "$LINKS_FILE" $LINKS_URL
559   if [ ! -f "$LINKS_FILE" ]; then
560 <   echo "The download of $LINKS_URL appears to have failed. Aborting."
560 >   echo -e "\nThe download of $LINKS_URL appears to have failed. Aborting."
561     wrapupAndExit
562 + else
563 +   valPrint ctrh " success."
564   fi
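
Because 'curl --silent -o' can leave an empty file behind on some failures, a stricter test (a sketch, not what the script does) could also check curl's exit status and the file's size:

      if ! curl --silent -o "$LINKS_FILE" "$LINKS_URL" || [ ! -s "$LINKS_FILE" ]; then
         echo -e "\nThe download of $LINKS_URL appears to have failed. Aborting."
         wrapupAndExit
      fi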
565  
566   # Attempt to download file at EXCEPT_URL, then check that it succeeded
567   if [ ! -z $EXCEPT_URL ]; then
568 <   valPrint cwtrh "Downloading list of reporting exceptions from $EXCEPT_URL."
568 >   valPrint cwtrhn "Downloading list of reporting exceptions from $EXCEPT_URL..."
569     EXCEPT_DATA=$(curl --silent $EXCEPT_URL)
570     if [ -z "$EXCEPT_DATA" ]; then
571 <      echo "The download of the exceptions data from '$EXCEPT_URL' appears to have failed. Aborting."
571 >      echo -e "\nThe download of the exceptions data from '$EXCEPT_URL' appears to have failed. Aborting."
572        wrapupAndExit
573 +   else
574 +      valPrint ctrh " success."
575     fi
576     EXCEPT_DATA=${EXCEPT_DATA%END LIST*}
577     EXCEPT_DATA=${EXCEPT_DATA#*BEGIN LIST}
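
Those two expansions trim away everything outside the BEGIN LIST/END LIST markers; the same pattern in miniature:

      DATA="header BEGIN LIST one two END LIST footer"
      DATA=${DATA%END LIST*}    # shortest match cut from the end   -> "header BEGIN LIST one two "
      DATA=${DATA#*BEGIN LIST}  # shortest match cut from the start -> " one two "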
# Line 563 | Line 590 | LINK_COUNT_STRING=$(cat "$LINKS_FILE" |
590   # Number of URLs is number of lines minus one (first line is column header row for the CSV)
591   LINK_COUNT=$(echo "${LINK_COUNT_STRING}" | tr -d '[:space:]')
592   let LINK_COUNT-=1
593 + valPrint ctrh "Found $LINK_COUNT links to process."
594 + valPrint trh ""
595  
596 < # Calculate number of URLs to consider
596 > ## CONFIG OUTPUT ##
597 > valPrint t "Config:"
598 > valPrint r "\b1 Config \b0"
599 > valPrint hn "<h3>Config</h3>"
600 >
601 > valPrint ctrhn "Links to consider: "
602   if [ $URL_LIMIT -ne 0 ] && [ $URL_LIMIT -lt $LINK_COUNT ]; then
603 <   valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((URL_LIMIT-URL_START+1)) of them, from link $URL_START to $URL_LIMIT."
603 >   valPrint ctrh "$((URL_LIMIT-URL_START+1)), from link $URL_START to $URL_LIMIT"
604   elif [ $URL_START -ne 1 ]; then
605 <   valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((LINK_COUNT-URL_START+1)) of them, from link $URL_START to $LINK_COUNT."
605 >   valPrint ctrh "$((LINK_COUNT-URL_START+1)), from link $URL_START to $LINK_COUNT"
606   else
607 <   valPrint ctrh "Found $LINK_COUNT links to process. I will be considering all of them."
607 >   valPrint ctrh "$LINK_COUNT"
608   fi
609  
610 < # Print settings to console and log
611 < declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not report links that are in the exceptions list." "I will ignore URLs that simply have ending slashes added onto them." "I will ignore URLs that only upgrade from HTTP to HTTPS." "I will ignore youtu.be links that are merely being expanded." "I will not check the validity of Internet Archive snapshot URLs.")
612 < if [ $TAKE_PAGE_SHOT -eq 0 ]; then SETTINGS_MSG[10]="but will not"; fi
613 < if [ $RECORD_OK_LINKS -eq 0 ]; then SETTINGS_MSG[22]="not"; fi
614 < if [ $SUGGEST_SNAPSHOTS -eq 0 ]; then SETTINGS_MSG[26]="will not"; fi
615 < if [ -z $EXCEPT_URL ] || [ $RECORD_OK_LINKS -eq 1 ]; then SETTINGS_MSG[40]=""; fi
616 < if [ $SHOW_SLASH -eq 1 ]; then SETTINGS_MSG[41]=""; fi
617 < if [ $SHOW_HTTPS -eq 1 ]; then SETTINGS_MSG[42]=""; fi
618 < if [ $SHOW_YT_RD -eq 1 ]; then SETTINGS_MSG[43]=""; fi
619 < if [ $SKIP_ARCHIVE_LINKS -eq 0 ]; then SETTINGS_MSG[44]=""; fi
620 < SETTINGS_STR=${SETTINGS_MSG[@]}
621 < valPrint ctrh "$SETTINGS_STR"
610 > valPrint ctrh "Site query timeout: $TIMEOUT seconds"
611 >
612 > valPrint ctrhn "Show OK links: "
613 > if [ $RECORD_OK_LINKS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
614 >
615 > valPrint ctrhn "Take screenshots: "
616 > if [ $TAKE_PAGE_SHOT -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
617 >
618 > valPrint ctrhn "Suggest Archive.org snapshots: "
619 > if [ $SUGGEST_SNAPSHOTS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
620 >
621 > valPrint ctrhn "Ignore slash-adding redirects: "
622 > if [ $SHOW_SLASH -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
623 >
624 > valPrint ctrhn "Ignore HTTPS-upgrading redirects: "
625 > if [ $SHOW_HTTPS -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
626 >
627 > valPrint ctrhn "Ignore youtu.be redirects: "
628 > if [ $SHOW_YT_RD -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
629 >
630 > valPrint ctrhn "Check archive.org links: "
631 > if [ $SKIP_ARCHIVE_LINKS -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
632 >
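Each setting above follows the same prompt-then-answer pattern, so it could be factored into a small helper (a hypothetical printSetting, not present in the script):

      function printSetting()
      {
         # $1 = label; $2 = 0/1 flag; $3 = answer when flag is 1; $4 = answer when flag is 0
         valPrint ctrhn "$1: "
         if [ $2 -eq 1 ]; then valPrint ctrh "$3"; else valPrint ctrh "$4"; fi
      }
      printSetting "Show OK links" $RECORD_OK_LINKS "Yes" "No"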
633   valPrint tr "A summary of my findings will be found at the bottom of the report."
634   valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
635   valPrint trh ""
636  
637 < # Print legend to logs
637 > ## LEGEND OUTPUT ##
638   valPrint t "Legend:"
639   valPrint r "\b1 Legend \b0"
640   valPrint hn "<h3>Legend</h3>"
641 < valPrint trh "OK = URL seems to be working."
642 < valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it, because false negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to the script's author (see top of report). An NG link should be followed by a link to the Internet Archive's Wayback Machine which may help you repair the link, unless the Archive does not have any snapshots of the site. If the link cannot be repaired, you can delete it from the wiki page, or, if this would disrupt the surrounding material on the page, disable the link by wrapping the URL in nowiki tags."
643 < valPrint trh "RD = The server responding to this URL is saying that the page moved and you should instead use the supplied new URL. Some RD links represent minor adjustments in the organization of a web site, and some are soft 404s (the file/page has been removed and you are being redirected to something like the main page of the web site). You will have to look at the new URL yourself to determine if it represents an OK link and the link on the wiki should be updated to this one, or if the desired file/page is actually gone and we need to replace the wiki link with an Internet Archive snapshot link -- or disable the URL if it has not been archived."
644 < valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup."
645 < valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup."
646 < valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $HTTP_CODES)."
647 < valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$HTTP_CODES\"}}{\fldrslt here}} for code reference)."
648 < valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$HTTP_CODES\" target=\"_blank\">here</a> for code reference)."
649 < valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $CURL_CODES)."
650 < valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$CURL_CODES\"}}{\fldrslt here}} for code reference)."
651 < valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$CURL_CODES\" target=\"_blank\">here</a> for code reference)."
652 < valPrint trh "IA suggests = Last available snapshot suggested by the Internet Archive."
653 < valPrint trh "Try browsing = The Archive occasionally fails to return a snapshot URL even when one exists, so you will need to check for a snapshot manually using this link to the Wayback Machine before concluding that a site has not been archived."
641 > valPrint t "(For guidance in fixing these links, see $WIKI_MAIN.)"
642 > valPrint r "(For guidance in fixing these links, see {\field{\*\fldinst{HYPERLINK \"$WIKI_MAIN\"}}{\fldrslt here}}.)"
643 > valPrint h "(For guidance in fixing these links, see <a href=\"$WIKI_MAIN\" target=\"_blank\">here</a>.)"
644 > valPrint trh "OK = URL seems to be working"
645 > valPrint trh "NG = URL no longer seems to work"
646 > valPrint trh "RD = URL is redirecting to this new URL"
647 > valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup"
648 > valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup"
649 > valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $WIKI_HTTP)"
650 > valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$WIKI_HTTP\"}}{\fldrslt here}} for code reference)"
651 > valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$WIKI_HTTP\" target=\"_blank\">here</a> for code reference)"
652 > valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $WIKI_CURL)"
653 > valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$WIKI_CURL\"}}{\fldrslt here}} for code reference)"
654 > valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$WIKI_CURL\" target=\"_blank\">here</a> for code reference)"
655 > valPrint trh "IA suggests = Last available snapshot returned by the Internet Archive"
656 > valPrint trh "Try browsing = The Archive failed to return a snapshot URL, so check for a snapshot manually using this link"
657   valPrint trh ""
658  
659  
# Line 813 | Line 861 | for LINE in `cat "$LINKS_FILE"`; do
861  
862     # Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
863     # issue with sites that require HTTPS
864 <   CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{http_code}\n' $URL)
864 >   CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time $TIMEOUT --write-out '%{http_code}\n' $URL)
865     CURL_ERR=$(echo $?)
866     CURL_RESULT=$CURL_CODE
867  
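
When no HTTP response arrives at all, '%{http_code}' is written out as 000 and curl's exit status identifies the failure; in isolation the pattern behaves like this (the host is a placeholder):

      CODE=$(curl -o /dev/null --silent --head --max-time $TIMEOUT --write-out '%{http_code}\n' http://example.invalid)
      ERR=$?   # CODE is "000"; ERR is e.g. 6 ("could not resolve host"), the pairing the legend writes as (000-xx)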
# Line 863 | Line 911 | for LINE in `cat "$LINKS_FILE"`; do
911        for CODE in "${RD_CODES[@]}"; do
912           if [[ $CODE == $CURL_CODE ]]; then
913              # Get URL header again in order to retrieve the URL we are being redirected to
914 <            NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{redirect_url}\n' $URL)
914 >            NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time $TIMEOUT --write-out '%{redirect_url}\n' $URL)
915  
916              # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
917              # those changes out if the user didn't ask for them
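
One way to perform that normalization (a sketch; the script's own code for this step falls outside the displayed hunk):

      OLD_CMP=${URL/#https:/http:}      # rewrite a leading "https:" to "http:"
      NEW_CMP=${NEW_URL/#https:/http:}
      # equal after normalization means the redirect is only an HTTP-to-HTTPS upgrade
      if [ "$OLD_CMP" == "$NEW_CMP" ]; then let SKIP_HTTPS_UP+=1; fi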
# Line 1031 | Line 1079 | for LINE in `cat "$LINKS_FILE"`; do
1079  
1080        # Query Internet Archive for latest "OK" snapshot for "NG" page
1081        if [ $STATUS == "NG" ] && [ $SUGGEST_SNAPSHOTS -eq 1 ]; then
1082 <         ARCHIVE_QUERY=$(curl --silent --max-time 10 "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")
1082 >         ARCHIVE_QUERY=$(curl --silent --max-time $TIMEOUT "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")
1083  
1084           # If a "closest" snapshot was received...
1085           if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
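
The string match above works because a successful availability query returns JSON containing a "closest" snapshot object, roughly of this shape (abbreviated; field names as returned by the public Wayback availability API):

      # {"archived_snapshots": {"closest": {"available": true, "status": "200",
      #    "url": "http://web.archive.org/web/<timestamp>/<original URL>"}}}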

Diff Legend

  (no marker) Removed lines
+ Added lines
< Changed lines (old)
> Changed lines (new)