ViewVC Help
View File | Revision Log | View Changeset | Root Listing
root/Oni2/Validate External Links/validate_external_links.sh
(Generate patch)

Comparing Validate External Links/validate_external_links.sh (file contents):
Revision 1146 by iritscen, Sun Nov 1 18:55:05 2020 UTC vs.
Revision 1147 by iritscen, Tue Feb 2 20:10:39 2021 UTC

# Line 29 | Line 29 | IFS="
29  
30   ### GLOBALS ###
31   # Settings -- these will be changed from their defaults by the arguments passed in to the script
32 < LINKS_URL=""          # use 'curl' to download file with links from this location (can be file://)
33 < EXCEPT_URL=""         # 'curl' will access this wiki page with a list of exceptions for NG results
34 < OUTPUT_DIR=""         # place reports and all other output in a folder inside this existing folder
35 < RECORD_OK_LINKS=0     # record response code to the log even when it's a value in OK_CODES
36 < SHOW_SLASH=0          # record issue when a slash is added to the end of a URL
37 < SHOW_HTTPS=0          # record issue when "http" is upgraded to "https"
38 < SHOW_YT_RD=0          # record redirection for a youtu.be URL expanding to the full URL
39 < SUGGEST_SNAPSHOTS=0   # query the Internet Archive for a possible snapshot URL for each NG page
40 < CHECK_ARCHIVE_LINKS=0 # check URLs under the archive.org domain
41 < TAKE_PAGE_SHOT=0      # take a screenshot of each OK page
42 < TIMEOUT=10            # time to wait for a response when querying a site
43 < CHROME_PATH=""        # path to a copy of Google Chrome that has the command-line screenshot feature
44 < URL_START=1           # start at this URL in LINKS_FILE
45 < URL_LIMIT=0           # if non-zero, stop at this URL in LINKS_FILE
46 < UPLOAD_INFO=""        # path to a file on your hard drive with the login info needed to upload a report
32 > LINKS_URL=""           # use 'curl' to download file with links from this location (can be file://)
33 > EXCEPT_URL=""          # 'curl' will access this wiki page with a list of exceptions for NG results
34 > OUTPUT_DIR=""          # place reports and all other output in a folder inside this existing folder
35 > RECORD_OK_LINKS=0      # record response code to the log even when it's a value in OK_CODES
36 > SHOW_SLASH=0           # record issue when a slash is added to the end of a URL
37 > SHOW_HTTPS=0           # record issue when "http" is upgraded to "https"
38 > SHOW_YT_RD=0           # record redirection for a youtu.be URL expanding to the full URL
39 > SUGGEST_SNAPSHOTS_NG=0 # query the Internet Archive for a possible snapshot URL for each NG page
40 > SUGGEST_SNAPSHOTS_OK=0 # query the Internet Archive for an existing snapshot of each OK page
41 > CHECK_ARCHIVE_LINKS=0  # check URLs under the archive.org domain
42 > TAKE_PAGE_SHOT=0       # take a screenshot of each OK page
43 > TIMEOUT=10             # time to wait for a response when querying a site
44 > CHROME_PATH=""         # path to a copy of Google Chrome that has the command-line screenshot feature
45 > URL_START=1            # start at this URL in LINKS_FILE
46 > URL_LIMIT=0            # if non-zero, stop at this URL in LINKS_FILE
47 > UPLOAD_INFO=""         # path to a file on your hard drive with the login info needed to upload a report
48  
49   # Fixed strings -- see the occurrences of these variables to learn their purpose
50   AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 OPR/70.0.3728.154"
# Line 138 | Line 139 | SYNOPSIS
139  
140   DESCRIPTION
141         This script parses a list of external links found in the OniGalore wiki
142 <       (which is dumped by the Oni2.net domain periodically in a particular
142 >       (which is dumped by the Oni2.net server periodically in a particular
143         format), validates them using the Unix tool 'curl', and produces a report
144         of which links were "OK" (responded positively to an HTTP query), which
145         were "RD" (responded with a 3xx redirect code), which could be "IW"
# Line 174 | Line 175 | OPTIONS
175         --show-https-upgrades   Report on redirects that simply upgrade a
176                                 "http://" URL to a "https://" URL.
177         --show-yt-redirects     Report on redirects that expand a youtu.be URL.
178 <       --suggest-snapshots     Query the Internet Archive for a possible
178 >       --suggest-snapshots-ng  Query the Internet Archive for a possible
179                                 snapshot URL for each "NG" page.
180 +       --suggest-snapshots-ok  Query the Internet Archive for a snapshot of each
181 +                               "OK" page just to make sure it's available. Note
182 +                               that this will add a tremendous amount of time to
183 +                               the script execution because there is a rate
184 +                               limit on the Archive API. Note that this option
185 +                               does nothing unless you also use the
186 +                               --record-ok-links argument.
187         --check-archive-links   Check links that are already pointing to a page
188                                 on the Internet Archive. In theory these links
189                                 should be totally stable and not need validation.
# Line 210 | Line 218 | fi
218   # Parse arguments as long as there are more arguments to process
219   while (( "$#" )); do
220     case "$1" in
221 <      --links )               LINKS_URL="$2";                     shift 2;;
222 <      --exceptions )          EXCEPT_URL="$2";                    shift 2;;
223 <      --output )              OUTPUT_DIR="$2";                    shift 2;;
224 <      --record-ok-links )     RECORD_OK_LINKS=1;                  shift;;
225 <      --show-added-slashes )  SHOW_SLASH=1;                       shift;;
226 <      --show-https-upgrades ) SHOW_HTTPS=1;                       shift;;
227 <      --show-yt-redirects )   SHOW_YT_RD=1;                       shift;;
228 <      --suggest-snapshots )   SUGGEST_SNAPSHOTS=1;                shift;;
229 <      --check-archive-links ) CHECK_ARCHIVE_LINKS=1;              shift;;
230 <      --take-screenshots )    TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
231 <      --timeout )             TIMEOUT=$2;                         shift 2;;
232 <      --start-url )           URL_START=$2;                       shift 2;;
233 <      --end-url )             URL_LIMIT=$2;                       shift 2;;
234 <      --upload )              UPLOAD_INFO=$2;                     shift 2;;
235 <      * )                     echo "Invalid argument $1 detected. Aborting."; exit 1;;
221 >      --links )                LINKS_URL="$2";                     shift 2;;
222 >      --exceptions )           EXCEPT_URL="$2";                    shift 2;;
223 >      --output )               OUTPUT_DIR="$2";                    shift 2;;
224 >      --record-ok-links )      RECORD_OK_LINKS=1;                  shift;;
225 >      --show-added-slashes )   SHOW_SLASH=1;                       shift;;
226 >      --show-https-upgrades )  SHOW_HTTPS=1;                       shift;;
227 >      --show-yt-redirects )    SHOW_YT_RD=1;                       shift;;
228 >      --suggest-snapshots-ng ) SUGGEST_SNAPSHOTS_NG=1;             shift;;
229 >      --suggest-snapshots-ok ) SUGGEST_SNAPSHOTS_OK=1;             shift;;
230 >      --check-archive-links )  CHECK_ARCHIVE_LINKS=1;              shift;;
231 >      --take-screenshots )     TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
232 >      --timeout )              TIMEOUT=$2;                         shift 2;;
233 >      --start-url )            URL_START=$2;                       shift 2;;
234 >      --end-url )              URL_LIMIT=$2;                       shift 2;;
235 >      --upload )               UPLOAD_INFO=$2;                     shift 2;;
236 >      * )                      echo "Invalid argument $1 detected. Aborting."; exit 1;;
237    esac
238   done
239  
# Line 652 | Line 661 | if [ $RECORD_OK_LINKS -eq 1 ]; then valP
661   valPrint ctrhn "Take screenshots: "
662   if [ $TAKE_PAGE_SHOT -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
663  
664 < valPrint ctrhn "Suggest archive.org snapshots: "
665 < if [ $SUGGEST_SNAPSHOTS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
664 > valPrint ctrhn "Suggest archive.org snapshots for NG pages: "
665 > if [ $SUGGEST_SNAPSHOTS_NG -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
666 >
667 > valPrint ctrhn "Suggest archive.org snapshots for OK pages: "
668 > if [ $SUGGEST_SNAPSHOTS_OK -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
669  
670   valPrint ctrhn "Ignore slash-adding redirects: "
671   if [ $SHOW_SLASH -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
# Line 701 | Line 713 | valPrint hn "<h3>Links</h3>"
713   START_RUN=$(date +%s)
714   # Process each line of the .csv in LINKS_FILE
715   for LINE in `cat "$LINKS_FILE"`; do
716 +   START_LINK=$(date +%s)
717     let LINK_NUM+=1
718  
719     # First line is the column header row for the CSV, so let's verify that the format hasn't changed
# Line 1096 | Line 1109 | for LINE in `cat "$LINKS_FILE"`; do
1109        valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>"
1110  
1111        # Place vertical space here since we won't be printing anything more about this link
1112 <      if [ $STATUS == "OK" ]; then valPrint tr ""; valPrint hs ""; fi
1112 >      if [ $STATUS == "OK" ] && [ $SUGGEST_SNAPSHOTS_OK -eq 0 ]; then valPrint tr ""; valPrint hs ""; fi
1113  
1114        # Record redirect URL if one was given by a 3xx response page
1115        if [ $STATUS == "RD" ]; then
# Line 1122 | Line 1135 | for LINE in `cat "$LINKS_FILE"`; do
1135        fi
1136  
1137        # Query Internet Archive for latest "OK" snapshot for "NG" page
1138 <      if [ $STATUS == "NG" ] && [ $SUGGEST_SNAPSHOTS -eq 1 ]; then
1138 >      if [[ ( $STATUS == "NG" && $SUGGEST_SNAPSHOTS_NG -eq 1 ) || ( $STATUS == "OK" && $SUGGEST_SNAPSHOTS_OK -eq 1 ) ]]; then
1139 >
1140 >         # We need to watch out for the rate limit or we'll get locked out; look at how much time has
1141 >         # elapsed and then wait the remainder between that and how long of a wait we think is needed
1142 >         # to avoid the dreaded "Too Many Requests" response. 5 seconds is just a guess.
1143 >         CUR_TIME=$(date +%s)
1144 >         WAIT_REMAINDER=$((5 - $CUR_TIME + $START_LINK))
1145 >         if [ $WAIT_REMAINDER -gt 0 ]; then
1146 >            valPrint t "Waiting $WAIT_REMAINDER second(s) to conform to Archive.org rate limit."
1147 >            sleep $WAIT_REMAINDER
1148 >         fi
1149 >
1150 >         # Issue query to the API
1151           ARCHIVE_QUERY=$(curl --silent --max-time $TIMEOUT "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")
1152  
1153 <         # If a "closest" snapshot was received...
1154 <         if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
1153 >         # Notify user if we hit the rate limit and just keep going
1154 >         if [[ "$ARCHIVE_QUERY" == *"Too Many Requests"* ]]; then
1155 >            valPrint t "  IA has rate-limited us!"
1156 >            valPrint r "                IA has rate-limited us!"
1157 >            valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td>(hit the API rate limit!)</td></tr>"
1158 >         # If a "closest" snapshot was received, inform user
1159 >         elif [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
1160              # In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
1161              ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')
1162  
# Line 1142 | Line 1172 | for LINE in `cat "$LINKS_FILE"`; do
1172              valPrint ts "  IA suggests $SNAPSHOT_URL"
1173              valPrint rs "               IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
1174              valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
1175 <         else # ...otherwise give generic Wayback Machine link for this URL
1175 >         else # Otherwise give a generic Wayback Machine link for this URL, which might work
1176              valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL"
1177              valPrint rs "               Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}"
1178              valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>"

Diff Legend

Removed lines
+ Added lines
< Changed lines (old)
> Changed lines (new)