29 |
|
|
30 |
|
### GLOBALS ### |
31 |
|
# Settings -- these will be changed from their defaults by the arguments passed in to the script |
32 |
< |
LINKS_URL="" # use 'curl' to download file with links from this location (can be file://) |
33 |
< |
EXCEPT_URL="" # 'curl' will access this wiki page with a list of exceptions for NG results |
34 |
< |
OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder |
35 |
< |
RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES |
36 |
< |
SHOW_SLASH=0 # record issue when a slash is added to the end of a URL |
37 |
< |
SHOW_HTTPS=0 # record issue when "http" is upgraded to "https" |
38 |
< |
SHOW_YT_RD=0 # record redirection for a youtu.be URL expanding to the full URL |
39 |
< |
SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page |
40 |
< |
CHECK_ARCHIVE_LINKS=0 # check URLs under the archive.org domain |
41 |
< |
TAKE_PAGE_SHOT=0 # take a screenshot of each OK page |
42 |
< |
TIMEOUT=10 # time to wait for a response when querying a site |
43 |
< |
CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature |
44 |
< |
URL_START=1 # start at this URL in LINKS_FILE |
45 |
< |
URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE |
46 |
< |
UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report |
32 |
> |
LINKS_URL="" # use 'curl' to download file with links from this location (can be file://) |
33 |
> |
EXCEPT_URL="" # 'curl' will access this wiki page with a list of exceptions for NG results |
34 |
> |
OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder |
35 |
> |
RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES |
36 |
> |
SHOW_SLASH=0 # record issue when a slash is added to the end of a URL |
37 |
> |
SHOW_HTTPS=0 # record issue when "http" is upgraded to "https" |
38 |
> |
SHOW_YT_RD=0 # record redirection for a youtu.be URL expanding to the full URL |
39 |
> |
SUGGEST_SNAPSHOTS_NG=0 # query the Internet Archive for a possible snapshot URL for each NG page |
40 |
> |
SUGGEST_SNAPSHOTS_OK=0 # query the Internet Archive for an existing snapshot of each OK page |
41 |
> |
CHECK_ARCHIVE_LINKS=0 # check URLs under the archive.org domain |
42 |
> |
TAKE_PAGE_SHOT=0 # take a screenshot of each OK page |
43 |
> |
TIMEOUT=10 # time to wait for a response when querying a site |
44 |
> |
CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature |
45 |
> |
URL_START=1 # start at this URL in LINKS_FILE |
46 |
> |
URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE |
47 |
> |
UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report |
48 |
|
|
49 |
|
# Fixed strings -- see the occurrences of these variables to learn their purpose |
50 |
|
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 OPR/70.0.3728.154" |
139 |
|
|
140 |
|
DESCRIPTION |
141 |
|
This script parses a list of external links found in the OniGalore wiki |
142 |
< |
(which is dumped by the Oni2.net domain periodically in a particular |
142 |
> |
(which is dumped by the Oni2.net server periodically in a particular |
143 |
|
format), validates them using the Unix tool 'curl', and produces a report |
144 |
|
of which links were "OK" (responded positively to an HTTP query), which |
145 |
|
were "RD" (responded with a 3xx redirect code), which could be "IW" |
175 |
|
--show-https-upgrades Report on redirects that simply upgrade a |
176 |
|
"http://" URL to a "https://" URL. |
177 |
|
--show-yt-redirects Report on redirects that expand a youtu.be URL. |
178 |
< |
--suggest-snapshots Query the Internet Archive for a possible |
178 |
> |
--suggest-snapshots-ng Query the Internet Archive for a possible |
179 |
|
snapshot URL for each "NG" page. |
180 |
+ |
--suggest-snapshots-ok Query the Internet Archive for a snapshot of each |
181 |
+ |
"OK" page just to make sure it's available. Note |
182 |
+ |
that this will add a tremendous amount of time to |
183 |
+ |
the script execution because there is a rate |
184 |
+ |
limit to the Archive API. Note that this option |
185 |
+ |
does nothing unless you also use the |
186 |
+ |
--record-ok-links argument. |
187 |
|
--check-archive-links Check links that are already pointing to a page |
188 |
|
on the Internet Archive. In theory these links |
189 |
|
should be totally stable and not need validation. |
218 |
|
# Parse arguments as long as there are more arguments to process |
219 |
|
while (( "$#" )); do |
220 |
|
case "$1" in |
221 |
< |
--links ) LINKS_URL="$2"; shift 2;; |
222 |
< |
--exceptions ) EXCEPT_URL="$2"; shift 2;; |
223 |
< |
--output ) OUTPUT_DIR="$2"; shift 2;; |
224 |
< |
--record-ok-links ) RECORD_OK_LINKS=1; shift;; |
225 |
< |
--show-added-slashes ) SHOW_SLASH=1; shift;; |
226 |
< |
--show-https-upgrades ) SHOW_HTTPS=1; shift;; |
227 |
< |
--show-yt-redirects ) SHOW_YT_RD=1; shift;; |
228 |
< |
--suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;; |
229 |
< |
--check-archive-links ) CHECK_ARCHIVE_LINKS=1; shift;; |
230 |
< |
--take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;; |
231 |
< |
--timeout ) TIMEOUT=$2; shift 2;; |
232 |
< |
--start-url ) URL_START=$2; shift 2;; |
233 |
< |
--end-url ) URL_LIMIT=$2; shift 2;; |
234 |
< |
--upload ) UPLOAD_INFO=$2; shift 2;; |
235 |
< |
* ) echo "Invalid argument $1 detected. Aborting."; exit 1;; |
221 |
> |
--links ) LINKS_URL="$2"; shift 2;; |
222 |
> |
--exceptions ) EXCEPT_URL="$2"; shift 2;; |
223 |
> |
--output ) OUTPUT_DIR="$2"; shift 2;; |
224 |
> |
--record-ok-links ) RECORD_OK_LINKS=1; shift;; |
225 |
> |
--show-added-slashes ) SHOW_SLASH=1; shift;; |
226 |
> |
--show-https-upgrades ) SHOW_HTTPS=1; shift;; |
227 |
> |
--show-yt-redirects ) SHOW_YT_RD=1; shift;; |
228 |
> |
--suggest-snapshots-ng ) SUGGEST_SNAPSHOTS_NG=1; shift;; |
229 |
> |
--suggest-snapshots-ok ) SUGGEST_SNAPSHOTS_OK=1; shift;; |
230 |
> |
--check-archive-links ) CHECK_ARCHIVE_LINKS=1; shift;; |
231 |
> |
--take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;; |
232 |
> |
--timeout ) TIMEOUT=$2; shift 2;; |
233 |
> |
--start-url ) URL_START=$2; shift 2;; |
234 |
> |
--end-url ) URL_LIMIT=$2; shift 2;; |
235 |
> |
--upload ) UPLOAD_INFO=$2; shift 2;; |
236 |
> |
* ) echo "Invalid argument $1 detected. Aborting."; exit 1;; |
237 |
|
esac |
238 |
|
done |
239 |
|
|
661 |
|
valPrint ctrhn "Take screenshots: " |
662 |
|
if [ $TAKE_PAGE_SHOT -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi |
663 |
|
|
664 |
< |
valPrint ctrhn "Suggest archive.org snapshots: " |
665 |
< |
if [ $SUGGEST_SNAPSHOTS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi |
664 |
> |
valPrint ctrhn "Suggest archive.org snapshots for NG pages: " |
665 |
> |
if [ $SUGGEST_SNAPSHOTS_NG -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi |
666 |
> |
|
667 |
> |
valPrint ctrhn "Suggest archive.org snapshots for OK pages: " |
668 |
> |
if [ $SUGGEST_SNAPSHOTS_OK -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi |
669 |
|
|
670 |
|
valPrint ctrhn "Ignore slash-adding redirects: " |
671 |
|
if [ $SHOW_SLASH -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi |
713 |
|
START_RUN=$(date +%s) |
714 |
|
# Process each line of the .csv in LINKS_FILE |
715 |
|
for LINE in `cat "$LINKS_FILE"`; do |
716 |
+ |
START_LINK=$(date +%s) |
717 |
|
let LINK_NUM+=1 |
718 |
|
|
719 |
|
# First line is the column header row for the CSV, so let's verify that the format hasn't changed |
1109 |
|
valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>" |
1110 |
|
|
1111 |
|
# Place vertical space here since we won't be printing anything more about this link |
1112 |
< |
if [ $STATUS == "OK" ]; then valPrint tr ""; valPrint hs ""; fi |
1112 |
> |
if [ $STATUS == "OK" ] && [ $SUGGEST_SNAPSHOTS_OK -eq 0 ]; then valPrint tr ""; valPrint hs ""; fi |
1113 |
|
|
1114 |
|
# Record redirect URL if one was given by a 3xx response page |
1115 |
|
if [ $STATUS == "RD" ]; then |
1135 |
|
fi |
1136 |
|
|
1137 |
|
# Query Internet Archive for latest "OK" snapshot of an "NG" page (or of an "OK" page, if the OK-snapshot option is enabled) |
1138 |
< |
if [ $STATUS == "NG" ] && [ $SUGGEST_SNAPSHOTS -eq 1 ]; then |
1138 |
> |
if [[ ( $STATUS == "NG" && $SUGGEST_SNAPSHOTS_NG -eq 1 ) || ( $STATUS == "OK" && $SUGGEST_SNAPSHOTS_OK -eq 1 ) ]]; then |
1139 |
> |
|
1140 |
> |
# We need to watch out for the rate limit or we'll get locked out; look at how much time has |
1141 |
> |
# elapsed and then wait the remainder between that and how long of a wait we think is needed |
1142 |
> |
# to avoid the dreaded "Too Many Requests" response. 5 seconds is just a guess. |
1143 |
> |
CUR_TIME=$(date +%s) |
1144 |
> |
WAIT_REMAINDER=$((5 - $CUR_TIME + $START_LINK)) |
1145 |
> |
if [ $WAIT_REMAINDER -gt 0 ]; then |
1146 |
> |
valPrint t "Waiting $WAIT_REMAINDER second(s) to conform to Archive.org rate limit." |
1147 |
> |
sleep $WAIT_REMAINDER |
1148 |
> |
fi |
1149 |
> |
|
1150 |
> |
# Issue query to the API |
1151 |
|
ARCHIVE_QUERY=$(curl --silent --max-time $TIMEOUT "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES") |
1152 |
|
|
1153 |
< |
# If a "closest" snapshot was received... |
1154 |
< |
if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then |
1153 |
> |
# Notify user if we hit the rate limit and just keep going |
1154 |
> |
if [[ "$ARCHIVE_QUERY" == "*Too Many Requests*" ]]; then |
1155 |
> |
valPrint t " IA has rate-limited us!" |
1156 |
> |
valPrint r " IA has rate-limited us!" |
1157 |
> |
valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td>(hit the API rate limit!)</td></tr>" |
1158 |
> |
# If a "closest" snapshot was received, inform user |
1159 |
> |
elif [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then |
1160 |
|
# In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it |
1161 |
|
ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/') |
1162 |
|
|
1172 |
|
valPrint ts " IA suggests $SNAPSHOT_URL" |
1173 |
|
valPrint rs " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}" |
1174 |
|
valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>" |
1175 |
< |
else # ...otherwise give generic Wayback Machine link for this URL |
1175 |
> |
else # Otherwise give a generic Wayback Machine link for this URL, which might work |
1176 |
|
valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL" |
1177 |
|
valPrint rs " Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}" |
1178 |
|
valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>" |