--- Validate External Links/validate_external_links.sh 2020/11/01 18:55:05 1146 +++ Validate External Links/validate_external_links.sh 2021/02/02 20:10:39 1147 @@ -29,21 +29,22 @@ IFS=" ### GLOBALS ### # Settings -- these will be changed from their defaults by the arguments passed in to the script -LINKS_URL="" # use 'curl' to download file with links from this location (can be file://) -EXCEPT_URL="" # 'curl' will access this wiki page with a list of exceptions for NG results -OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder -RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES -SHOW_SLASH=0 # record issue when a slash is added to the end of a URL -SHOW_HTTPS=0 # record issue when "http" is upgraded to "https" -SHOW_YT_RD=0 # record redirection for a youtu.be URL expanding to the full URL -SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page -CHECK_ARCHIVE_LINKS=0 # check URLs under the archive.org domain -TAKE_PAGE_SHOT=0 # take a screenshot of each OK page -TIMEOUT=10 # time to wait for a response when querying a site -CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature -URL_START=1 # start at this URL in LINKS_FILE -URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE -UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report +LINKS_URL="" # use 'curl' to download file with links from this location (can be file://) +EXCEPT_URL="" # 'curl' will access this wiki page with a list of exceptions for NG results +OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder +RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES +SHOW_SLASH=0 # record issue when a slash is added to the end of a URL +SHOW_HTTPS=0 # record issue when "http" is upgraded to "https" +SHOW_YT_RD=0 # record redirection for a youtu.be URL expanding to the full URL +SUGGEST_SNAPSHOTS_NG=0 # query the Internet Archive for a possible snapshot URL for each NG page +SUGGEST_SNAPSHOTS_OK=0 # query the Internet Archive for an existing snapshot of each OK page +CHECK_ARCHIVE_LINKS=0 # check URLs under the archive.org domain +TAKE_PAGE_SHOT=0 # take a screenshot of each OK page +TIMEOUT=10 # time to wait for a response when querying a site +CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature +URL_START=1 # start at this URL in LINKS_FILE +URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE +UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report # Fixed strings -- see the occurrences of these variables to learn their purpose AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 OPR/70.0.3728.154" @@ -138,7 +139,7 @@ SYNOPSIS DESCRIPTION This script parses a list of external links found in the OniGalore wiki - (which is dumped by the Oni2.net domain periodically in a particular + (which is dumped by the Oni2.net server periodically in a particular format), validates them using the Unix tool 'curl', and produces a report of which links were "OK" (responded positively to an HTTP query), which were "RD" (responded with a 3xx redirect code), which could be "IW" @@ -174,8 +175,15 @@ OPTIONS --show-https-upgrades Report on redirects that simply upgrade a "http://" URL to a "https://" URL. --show-yt-redirects Report on redirects that expand a youtu.be URL. - --suggest-snapshots Query the Internet Archive for a possible + --suggest-snapshots-ng Query the Internet Archive for a possible snapshot URL for each "NG" page. + --suggest-snapshots-ok Query the Internet Archive for a snapshot of each + "OK" page just to make sure it's available. Note + that this will add a tremendous amount of time to + the script execution because there is a rate + limit to the Archive API. Note that this option + does nothing unless you also use the + --record-ok-links argument. --check-archive-links Check links that are already pointing to a page on the Internet Archive. In theory these links should be totally stable and not need validation. @@ -210,21 +218,22 @@ fi # Parse arguments as long as there are more arguments to process while (( "$#" )); do case "$1" in - --links ) LINKS_URL="$2"; shift 2;; - --exceptions ) EXCEPT_URL="$2"; shift 2;; - --output ) OUTPUT_DIR="$2"; shift 2;; - --record-ok-links ) RECORD_OK_LINKS=1; shift;; - --show-added-slashes ) SHOW_SLASH=1; shift;; - --show-https-upgrades ) SHOW_HTTPS=1; shift;; - --show-yt-redirects ) SHOW_YT_RD=1; shift;; - --suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;; - --check-archive-links ) CHECK_ARCHIVE_LINKS=1; shift;; - --take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;; - --timeout ) TIMEOUT=$2; shift 2;; - --start-url ) URL_START=$2; shift 2;; - --end-url ) URL_LIMIT=$2; shift 2;; - --upload ) UPLOAD_INFO=$2; shift 2;; - * ) echo "Invalid argument $1 detected. Aborting."; exit 1;; + --links ) LINKS_URL="$2"; shift 2;; + --exceptions ) EXCEPT_URL="$2"; shift 2;; + --output ) OUTPUT_DIR="$2"; shift 2;; + --record-ok-links ) RECORD_OK_LINKS=1; shift;; + --show-added-slashes ) SHOW_SLASH=1; shift;; + --show-https-upgrades ) SHOW_HTTPS=1; shift;; + --show-yt-redirects ) SHOW_YT_RD=1; shift;; + --suggest-snapshots-ng ) SUGGEST_SNAPSHOTS_NG=1; shift;; + --suggest-snapshots-ok ) SUGGEST_SNAPSHOTS_OK=1; shift;; + --check-archive-links ) CHECK_ARCHIVE_LINKS=1; shift;; + --take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;; + --timeout ) TIMEOUT=$2; shift 2;; + --start-url ) URL_START=$2; shift 2;; + --end-url ) URL_LIMIT=$2; shift 2;; + --upload ) UPLOAD_INFO=$2; shift 2;; + * ) echo "Invalid argument $1 detected. Aborting."; exit 1;; esac done @@ -652,8 +661,11 @@ if [ $RECORD_OK_LINKS -eq 1 ]; then valP valPrint ctrhn "Take screenshots: " if [ $TAKE_PAGE_SHOT -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi -valPrint ctrhn "Suggest archive.org snapshots: " -if [ $SUGGEST_SNAPSHOTS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi +valPrint ctrhn "Suggest archive.org snapshots for NG pages: " +if [ $SUGGEST_SNAPSHOTS_NG -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi + +valPrint ctrhn "Suggest archive.org snapshots for OK pages: " +if [ $SUGGEST_SNAPSHOTS_OK -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi valPrint ctrhn "Ignore slash-adding redirects: " if [ $SHOW_SLASH -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi @@ -701,6 +713,7 @@ valPrint hn "