NOTE(review): this unified diff was reconstructed from a capture in which line
breaks, indentation, alignment whitespace and the HTML markup inside the
'valPrint h*' strings had been lost. Hunk ranges are unchanged and were
cross-checked against the line counts, but exact whitespace and those string
bodies must be confirmed against r1146/r1147 before applying.
One functional change relative to the captured patch: the "Too Many Requests"
test keeps its '*' wildcards outside the quotes. Inside [[ ]] a quoted
right-hand side of == is compared literally, so the rate-limit branch could
never be taken as originally written.

--- Validate External Links/validate_external_links.sh	2020/11/01 18:55:05	1146
+++ Validate External Links/validate_external_links.sh	2021/02/02 20:10:39	1147
@@ -29,21 +29,22 @@ IFS="
 
 ### GLOBALS ###
 # Settings -- these will be changed from their defaults by the arguments passed in to the script
-LINKS_URL=""          # use 'curl' to download file with links from this location (can be file://)
-EXCEPT_URL=""         # 'curl' will access this wiki page with a list of exceptions for NG results
-OUTPUT_DIR=""         # place reports and all other output in a folder inside this existing folder
-RECORD_OK_LINKS=0     # record response code to the log even when it's a value in OK_CODES
-SHOW_SLASH=0          # record issue when a slash is added to the end of a URL
-SHOW_HTTPS=0          # record issue when "http" is upgraded to "https"
-SHOW_YT_RD=0          # record redirection for a youtu.be URL expanding to the full URL
-SUGGEST_SNAPSHOTS=0   # query the Internet Archive for a possible snapshot URL for each NG page
-CHECK_ARCHIVE_LINKS=0 # check URLs under the archive.org domain
-TAKE_PAGE_SHOT=0      # take a screenshot of each OK page
-TIMEOUT=10            # time to wait for a response when querying a site
-CHROME_PATH=""        # path to a copy of Google Chrome that has the command-line screenshot feature
-URL_START=1           # start at this URL in LINKS_FILE
-URL_LIMIT=0           # if non-zero, stop at this URL in LINKS_FILE
-UPLOAD_INFO=""        # path to a file on your hard drive with the login info needed to upload a report
+LINKS_URL=""           # use 'curl' to download file with links from this location (can be file://)
+EXCEPT_URL=""          # 'curl' will access this wiki page with a list of exceptions for NG results
+OUTPUT_DIR=""          # place reports and all other output in a folder inside this existing folder
+RECORD_OK_LINKS=0      # record response code to the log even when it's a value in OK_CODES
+SHOW_SLASH=0           # record issue when a slash is added to the end of a URL
+SHOW_HTTPS=0           # record issue when "http" is upgraded to "https"
+SHOW_YT_RD=0           # record redirection for a youtu.be URL expanding to the full URL
+SUGGEST_SNAPSHOTS_NG=0 # query the Internet Archive for a possible snapshot URL for each NG page
+SUGGEST_SNAPSHOTS_OK=0 # query the Internet Archive for an existing snapshot of each OK page
+CHECK_ARCHIVE_LINKS=0  # check URLs under the archive.org domain
+TAKE_PAGE_SHOT=0       # take a screenshot of each OK page
+TIMEOUT=10             # time to wait for a response when querying a site
+CHROME_PATH=""         # path to a copy of Google Chrome that has the command-line screenshot feature
+URL_START=1            # start at this URL in LINKS_FILE
+URL_LIMIT=0            # if non-zero, stop at this URL in LINKS_FILE
+UPLOAD_INFO=""         # path to a file on your hard drive with the login info needed to upload a report
 
 # Fixed strings -- see the occurrences of these variables to learn their purpose
 AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 OPR/70.0.3728.154"
@@ -138,7 +139,7 @@ SYNOPSIS
 
 DESCRIPTION
        This script parses a list of external links found in the OniGalore wiki
-       (which is dumped by the Oni2.net domain periodically in a particular
+       (which is dumped by the Oni2.net server periodically in a particular
        format), validates them using the Unix tool 'curl', and produces a
        report of which links were "OK" (responded positively to an HTTP query),
        which were "RD" (responded with a 3xx redirect code), which could be "IW"
@@ -174,8 +175,15 @@ OPTIONS
        --show-https-upgrades   Report on redirects that simply upgrade a
                                "http://" URL to a "https://" URL.
        --show-yt-redirects     Report on redirects that expand a youtu.be URL.
-       --suggest-snapshots     Query the Internet Archive for a possible
+       --suggest-snapshots-ng  Query the Internet Archive for a possible
                                snapshot URL for each "NG" page.
+       --suggest-snapshots-ok  Query the Internet Archive for a snapshot of each
+                               "OK" page just to make sure it's available. Note
+                               that this will add a tremendous amount of time to
+                               the script execution because there is a rate
+                               limit to the Archive API. Note that this option
+                               does nothing unless you also use the
+                               --record-ok-links argument.
        --check-archive-links   Check links that are already pointing to a page
                                on the Internet Archive. In theory these links
                                should be totally stable and not need validation.
@@ -210,21 +218,22 @@ fi
 # Parse arguments as long as there are more arguments to process
 while (( "$#" )); do
    case "$1" in
-      --links               ) LINKS_URL="$2"; shift 2;;
-      --exceptions          ) EXCEPT_URL="$2"; shift 2;;
-      --output              ) OUTPUT_DIR="$2"; shift 2;;
-      --record-ok-links     ) RECORD_OK_LINKS=1; shift;;
-      --show-added-slashes  ) SHOW_SLASH=1; shift;;
-      --show-https-upgrades ) SHOW_HTTPS=1; shift;;
-      --show-yt-redirects   ) SHOW_YT_RD=1; shift;;
-      --suggest-snapshots   ) SUGGEST_SNAPSHOTS=1; shift;;
-      --check-archive-links ) CHECK_ARCHIVE_LINKS=1; shift;;
-      --take-screenshots    ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
-      --timeout             ) TIMEOUT=$2; shift 2;;
-      --start-url           ) URL_START=$2; shift 2;;
-      --end-url             ) URL_LIMIT=$2; shift 2;;
-      --upload              ) UPLOAD_INFO=$2; shift 2;;
-      *                     ) echo "Invalid argument $1 detected. Aborting."; exit 1;;
+      --links                ) LINKS_URL="$2"; shift 2;;
+      --exceptions           ) EXCEPT_URL="$2"; shift 2;;
+      --output               ) OUTPUT_DIR="$2"; shift 2;;
+      --record-ok-links      ) RECORD_OK_LINKS=1; shift;;
+      --show-added-slashes   ) SHOW_SLASH=1; shift;;
+      --show-https-upgrades  ) SHOW_HTTPS=1; shift;;
+      --show-yt-redirects    ) SHOW_YT_RD=1; shift;;
+      --suggest-snapshots-ng ) SUGGEST_SNAPSHOTS_NG=1; shift;;
+      --suggest-snapshots-ok ) SUGGEST_SNAPSHOTS_OK=1; shift;;
+      --check-archive-links  ) CHECK_ARCHIVE_LINKS=1; shift;;
+      --take-screenshots     ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
+      --timeout              ) TIMEOUT=$2; shift 2;;
+      --start-url            ) URL_START=$2; shift 2;;
+      --end-url              ) URL_LIMIT=$2; shift 2;;
+      --upload               ) UPLOAD_INFO=$2; shift 2;;
+      *                      ) echo "Invalid argument $1 detected. Aborting."; exit 1;;
    esac
 done
 
@@ -652,8 +661,11 @@ if [ $RECORD_OK_LINKS -eq 1 ]; then valP
 valPrint ctrhn "Take screenshots: "
 if [ $TAKE_PAGE_SHOT -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
 
-valPrint ctrhn "Suggest archive.org snapshots: "
-if [ $SUGGEST_SNAPSHOTS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
+valPrint ctrhn "Suggest archive.org snapshots for NG pages: "
+if [ $SUGGEST_SNAPSHOTS_NG -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
+
+valPrint ctrhn "Suggest archive.org snapshots for OK pages: "
+if [ $SUGGEST_SNAPSHOTS_OK -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
 
 valPrint ctrhn "Ignore slash-adding redirects: "
 if [ $SHOW_SLASH -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
@@ -701,6 +713,7 @@ valPrint hn "Links"
 START_RUN=$(date +%s)
 # Process each line of the .csv in LINKS_FILE
 for LINE in `cat "$LINKS_FILE"`; do
+   START_LINK=$(date +%s)
    let LINK_NUM+=1
 
    # First line is the column header row for the CSV, so let's verify that the format hasn't changed
@@ -1096,7 +1109,7 @@ for LINE in `cat "$LINKS_FILE"`; do
    valPrint hn "linked from$LOCAL_PAGE_PATH"
 
    # Place vertical space here since we won't be printing anything more about this link
-   if [ $STATUS == "OK" ]; then valPrint tr ""; valPrint hs ""; fi
+   if [ $STATUS == "OK" ] && [ $SUGGEST_SNAPSHOTS_OK -eq 0 ]; then valPrint tr ""; valPrint hs ""; fi
 
    # Record redirect URL if one was given by a 3xx response page
    if [ $STATUS == "RD" ]; then
@@ -1122,11 +1135,28 @@ for LINE in `cat "$LINKS_FILE"`; do
    fi
 
    # Query Internet Archive for latest "OK" snapshot for "NG" page
-   if [ $STATUS == "NG" ] && [ $SUGGEST_SNAPSHOTS -eq 1 ]; then
+   if [[ ( $STATUS == "NG" && $SUGGEST_SNAPSHOTS_NG -eq 1 ) || ( $STATUS == "OK" && $SUGGEST_SNAPSHOTS_OK -eq 1 ) ]]; then
+
+      # We need to watch out for the rate limit or we'll get locked out; look at how much time has
+      # elapsed and then wait the remainder between that and how long of a wait we think is needed
+      # to avoid the dreaded "Too Many Requests" response. 5 seconds is just a guess.
+      CUR_TIME=$(date +%s)
+      WAIT_REMAINDER=$((5 - $CUR_TIME + $START_LINK))
+      if [ $WAIT_REMAINDER -gt 0 ]; then
+         valPrint t "Waiting $WAIT_REMAINDER second(s) to conform to Archive.org rate limit."
+         sleep $WAIT_REMAINDER
+      fi
+
+      # Issue query to the API
       ARCHIVE_QUERY=$(curl --silent --max-time $TIMEOUT "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")
 
-      # If a "closest" snapshot was received...
-      if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
+      # Notify user if we hit the rate limit and just keep going
+      if [[ "$ARCHIVE_QUERY" == *"Too Many Requests"* ]]; then
+         valPrint t " IA has rate-limited us!"
+         valPrint r " IA has rate-limited us!"
+         valPrint hs "IA suggests(hit the API rate limit!)"
+      # If a "closest" snapshot was received, inform user
+      elif [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
          # In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
          ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')
 
@@ -1142,7 +1172,7 @@ for LINE in `cat "$LINKS_FILE"`; do
          valPrint ts " IA suggests $SNAPSHOT_URL"
          valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
          valPrint hs "IA suggests$SNAPSHOT_URL"
-      else # ...otherwise give generic Wayback Machine link for this URL
+      else # Otherwise give a generic Wayback Machine link for this URL, which might work
          valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
          valPrint rs " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
          valPrint hs "Try browsing$ARCHIVE_GENERIC/$URL"