ViewVC Help
View File | Revision Log | View Changeset | Root Listing
root/Oni2/Validate External Links/validate_external_links.sh
(Generate patch)

Comparing Validate External Links/validate_external_links.sh (file contents):
Revision 1146 by iritscen, Sun Nov 1 18:55:05 2020 UTC vs.
Revision 1147 by iritscen, Tue Feb 2 20:10:39 2021 UTC

# Line 29 | Line 29 | IFS="
29  
30   ### GLOBALS ###
31   # Settings -- these will be changed from their defaults by the arguments passed in to the script
32 < LINKS_URL=""          # use 'curl' to download file with links from this location (can be file://)
33 < EXCEPT_URL=""         # 'curl' will access this wiki page with a list of exceptions for NG results
34 < OUTPUT_DIR=""         # place reports and all other output in a folder inside this existing folder
35 < RECORD_OK_LINKS=0     # record response code to the log even when it's a value in OK_CODES
36 < SHOW_SLASH=0          # record issue when a slash is added to the end of a URL
37 < SHOW_HTTPS=0          # record issue when "http" is upgraded to "https"
38 < SHOW_YT_RD=0          # record redirection for a youtu.be URL expanding to the full URL
39 < SUGGEST_SNAPSHOTS=0   # query the Internet Archive for a possible snapshot URL for each NG page
40 < CHECK_ARCHIVE_LINKS=0 # check URLs under the archive.org domain
41 < TAKE_PAGE_SHOT=0      # take a screenshot of each OK page
42 < TIMEOUT=10            # time to wait for a response when querying a site
43 < CHROME_PATH=""        # path to a copy of Google Chrome that has the command-line screenshot feature
44 < URL_START=1           # start at this URL in LINKS_FILE
45 < URL_LIMIT=0           # if non-zero, stop at this URL in LINKS_FILE
46 < UPLOAD_INFO=""        # path to a file on your hard drive with the login info needed to upload a report
32 > LINKS_URL=""           # use 'curl' to download file with links from this location (can be file://)
33 > EXCEPT_URL=""          # 'curl' will access this wiki page with a list of exceptions for NG results
34 > OUTPUT_DIR=""          # place reports and all other output in a folder inside this existing folder
35 > RECORD_OK_LINKS=0      # record response code to the log even when it's a value in OK_CODES
36 > SHOW_SLASH=0           # record issue when a slash is added to the end of a URL
37 > SHOW_HTTPS=0           # record issue when "http" is upgraded to "https"
38 > SHOW_YT_RD=0           # record redirection for a youtu.be URL expanding to the full URL
39 > SUGGEST_SNAPSHOTS_NG=0 # query the Internet Archive for a possible snapshot URL for each NG page
40 > SUGGEST_SNAPSHOTS_OK=0 # query the Internet Archive for an existing snapshot of each OK page
41 > CHECK_ARCHIVE_LINKS=0  # check URLs under the archive.org domain
42 > TAKE_PAGE_SHOT=0       # take a screenshot of each OK page
43 > TIMEOUT=10             # time to wait for a response when querying a site
44 > CHROME_PATH=""         # path to a copy of Google Chrome that has the command-line screenshot feature
45 > URL_START=1            # start at this URL in LINKS_FILE
46 > URL_LIMIT=0            # if non-zero, stop at this URL in LINKS_FILE
47 > UPLOAD_INFO=""         # path to a file on your hard drive with the login info needed to upload a report
48  
49   # Fixed strings -- see the occurrences of these variables to learn their purpose
50   AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 OPR/70.0.3728.154"
# Line 138 | Line 139 | SYNOPSIS
139  
140   DESCRIPTION
141         This script parses a list of external links found in the OniGalore wiki
142 <       (which is dumped by the Oni2.net domain periodically in a particular
142 >       (which is dumped by the Oni2.net server periodically in a particular
143         format), validates them using the Unix tool 'curl', and produces a report
144         of which links were "OK" (responded positively to an HTTP query), which
145         were "RD" (responded with a 3xx redirect code), which could be "IW"
# Line 174 | Line 175 | OPTIONS
175         --show-https-upgrades   Report on redirects that simply upgrade a
176                                 "http://" URL to a "https://" URL.
177         --show-yt-redirects     Report on redirects that expand a youtu.be URL.
178 <       --suggest-snapshots     Query the Internet Archive for a possible
178 >       --suggest-snapshots-ng  Query the Internet Archive for a possible
179                                 snapshot URL for each "NG" page.
180 +       --suggest-snapshots-ok  Query the Internet Archive for a snapshot of each
181 +                               "OK" page just to make sure it's available. Note
182 +                               that this will add a tremendous amount of time to
183 +                               the script execution because there is a rate
184 +                               limit on the Archive API. Note that this option
185 +                               does nothing unless you also use the
186 +                               --record-ok-links argument.
187         --check-archive-links   Check links that are already pointing to a page
188                                 on the Internet Archive. In theory these links
189                                 should be totally stable and not need validation.
# Line 210 | Line 218 | fi
218   # Parse arguments as long as there are more arguments to process
219   while (( "$#" )); do
220     case "$1" in
221 <      --links )               LINKS_URL="$2";                     shift 2;;
222 <      --exceptions )          EXCEPT_URL="$2";                    shift 2;;
223 <      --output )              OUTPUT_DIR="$2";                    shift 2;;
224 <      --record-ok-links )     RECORD_OK_LINKS=1;                  shift;;
225 <      --show-added-slashes )  SHOW_SLASH=1;                       shift;;
226 <      --show-https-upgrades ) SHOW_HTTPS=1;                       shift;;
227 <      --show-yt-redirects )   SHOW_YT_RD=1;                       shift;;
228 <      --suggest-snapshots )   SUGGEST_SNAPSHOTS=1;                shift;;
229 <      --check-archive-links ) CHECK_ARCHIVE_LINKS=1;              shift;;
230 <      --take-screenshots )    TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
231 <      --timeout )             TIMEOUT=$2;                         shift 2;;
232 <      --start-url )           URL_START=$2;                       shift 2;;
233 <      --end-url )             URL_LIMIT=$2;                       shift 2;;
234 <      --upload )              UPLOAD_INFO=$2;                     shift 2;;
235 <      * )                     echo "Invalid argument $1 detected. Aborting."; exit 1;;
221 >      --links )                LINKS_URL="$2";                     shift 2;;
222 >      --exceptions )           EXCEPT_URL="$2";                    shift 2;;
223 >      --output )               OUTPUT_DIR="$2";                    shift 2;;
224 >      --record-ok-links )      RECORD_OK_LINKS=1;                  shift;;
225 >      --show-added-slashes )   SHOW_SLASH=1;                       shift;;
226 >      --show-https-upgrades )  SHOW_HTTPS=1;                       shift;;
227 >      --show-yt-redirects )    SHOW_YT_RD=1;                       shift;;
228 >      --suggest-snapshots-ng ) SUGGEST_SNAPSHOTS_NG=1;             shift;;
229 >      --suggest-snapshots-ok ) SUGGEST_SNAPSHOTS_OK=1;             shift;;
230 >      --check-archive-links )  CHECK_ARCHIVE_LINKS=1;              shift;;
231 >      --take-screenshots )     TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
232 >      --timeout )              TIMEOUT=$2;                         shift 2;;
233 >      --start-url )            URL_START=$2;                       shift 2;;
234 >      --end-url )              URL_LIMIT=$2;                       shift 2;;
235 >      --upload )               UPLOAD_INFO=$2;                     shift 2;;
236 >      * )                      echo "Invalid argument $1 detected. Aborting."; exit 1;;
237    esac
238   done
239  
# Line 652 | Line 661 | if [ $RECORD_OK_LINKS -eq 1 ]; then valP
661   valPrint ctrhn "Take screenshots: "
662   if [ $TAKE_PAGE_SHOT -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
663  
664 < valPrint ctrhn "Suggest archive.org snapshots: "
665 < if [ $SUGGEST_SNAPSHOTS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
664 > valPrint ctrhn "Suggest archive.org snapshots for NG pages: "
665 > if [ $SUGGEST_SNAPSHOTS_NG -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
666 >
667 > valPrint ctrhn "Suggest archive.org snapshots for OK pages: "
668 > if [ $SUGGEST_SNAPSHOTS_OK -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
669  
670   valPrint ctrhn "Ignore slash-adding redirects: "
671   if [ $SHOW_SLASH -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
# Line 701 | Line 713 | valPrint hn "<h3>Links</h3>"
713   START_RUN=$(date +%s)
714   # Process each line of the .csv in LINKS_FILE
715   for LINE in `cat "$LINKS_FILE"`; do
716 +   START_LINK=$(date +%s)
717     let LINK_NUM+=1
718  
719     # First line is the column header row for the CSV, so let's verify that the format hasn't changed
# Line 1096 | Line 1109 | for LINE in `cat "$LINKS_FILE"`; do
1109        valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"
1110  
1111        # Place vertical space here since we won't be printing anything more about this link
1112 <      if [ $STATUS == "OK" ]; then valPrint tr ""; valPrint hs ""; fi
1112 >      if [ $STATUS == "OK" ] && [ $SUGGEST_SNAPSHOTS_OK -eq 0 ]; then valPrint tr ""; valPrint hs ""; fi
1113  
1114        # Record redirect URL if one was given by a 3xx response page
1115        if [ $STATUS == "RD" ]; then
# Line 1122 | Line 1135 | for LINE in `cat "$LINKS_FILE"`; do
1135        fi
1136  
1137        # Query Internet Archive for latest "OK" snapshot for "NG" page
1138 <      if [ $STATUS == "NG" ] && [ $SUGGEST_SNAPSHOTS -eq 1 ]; then
1138 >      if [[ ( $STATUS == "NG" && $SUGGEST_SNAPSHOTS_NG -eq 1 ) || ( $STATUS == "OK" && $SUGGEST_SNAPSHOTS_OK -eq 1 ) ]]; then
1139 >
1140 >         # We need to watch out for the rate limit or we'll get locked out; look at how much time has
1141 >         # elapsed and then wait the remainder between that and how long of a wait we think is needed
1142 >         # to avoid the dreaded "Too Many Requests" response. 5 seconds is just a guess.
1143 >         CUR_TIME=$(date +%s)
1144 >         WAIT_REMAINDER=$((5 - $CUR_TIME + $START_LINK))
1145 >         if [ $WAIT_REMAINDER -gt 0 ]; then
1146 >            valPrint t "Waiting $WAIT_REMAINDER second(s) to conform to Archive.org rate limit."
1147 >            sleep $WAIT_REMAINDER
1148 >         fi
1149 >
1150 >         # Issue query to the API
1151           ARCHIVE_QUERY=$(curl --silent --max-time $TIMEOUT "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")
1152  
1153 <         # If a "closest" snapshot was received...
1154 <         if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
1153 >         # Notify user if we hit the rate limit and just keep going
1154 >         if [[ "$ARCHIVE_QUERY" == *"Too Many Requests"* ]]; then
1155 >            valPrint t "  IA has rate-limited us!"
1156 >            valPrint r "                IA has rate-limited us!"
1157 >            valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td>(hit the API rate limit!)</td></tr>"
1158 >         # If a "closest" snapshot was received, inform user
1159 >         elif [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
1160              # In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
1161              ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')
1162  
# Line 1142 | Line 1172 | for LINE in `cat "$LINKS_FILE"`; do
1172              valPrint ts "  IA suggests $SNAPSHOT_URL"
1173              valPrint rs "               IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
1174              valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
1175 <         else # ...otherwise give generic Wayback Machine link for this URL
1175 >         else # Otherwise give a generic Wayback Machine link for this URL, which might work
1176              valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
1177              valPrint rs "               Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
1178              valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"

Diff Legend

Removed lines
+ Added lines
< Changed lines (old)
> Changed lines (new)