| 29 |  |  | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 30 |  | ### GLOBALS ### | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 31 |  | # Settings -- these will be changed from their defaults by the arguments passed in to the script | 
 
 
 
 
 
 
 
 
 
 
 | 32 | < | LINKS_URL=""          # use 'curl' to download file with links from this location (can be file://) | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 33 | < | EXCEPT_URL=""         # 'curl' will access this wiki page with a list of exceptions for NG results | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 34 | < | OUTPUT_DIR=""         # place reports and all other output in a folder inside this existing folder | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 35 | < | RECORD_OK_LINKS=0     # record response code to the log even when it's a value in OK_CODES | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 36 | < | SHOW_SLASH=0          # record issue when a slash is added to the end of a URL | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 37 | < | SHOW_HTTPS=0          # record issue when "http" is upgraded to "https" | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 38 | < | SHOW_YT_RD=0          # record redirection for a youtu.be URL expanding to the full URL | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 39 | < | SUGGEST_SNAPSHOTS=0   # query the Internet Archive for a possible snapshot URL for each NG page | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 40 | < | CHECK_ARCHIVE_LINKS=0 # check URLs under the archive.org domain | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 41 | < | TAKE_PAGE_SHOT=0      # take a screenshot of each OK page | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 42 | < | TIMEOUT=10            # time to wait for a response when querying a site | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 43 | < | CHROME_PATH=""        # path to a copy of Google Chrome that has the command-line screenshot feature | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 44 | < | URL_START=1           # start at this URL in LINKS_FILE | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 45 | < | URL_LIMIT=0           # if non-zero, stop at this URL in LINKS_FILE | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 46 | < | UPLOAD_INFO=""        # path to a file on your hard drive with the login info needed to upload a report | 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 | 32 | > | LINKS_URL=""           # use 'curl' to download file with links from this location (can be file://) | 
 
 
 
 
 | 33 | > | EXCEPT_URL=""          # 'curl' will access this wiki page with a list of exceptions for NG results | 
 
 
 
 
 | 34 | > | OUTPUT_DIR=""          # place reports and all other output in a folder inside this existing folder | 
 
 
 
 
 | 35 | > | RECORD_OK_LINKS=0      # record response code to the log even when it's a value in OK_CODES | 
 
 
 
 
 | 36 | > | SHOW_SLASH=0           # record issue when a slash is added to the end of a URL | 
 
 
 
 
 | 37 | > | SHOW_HTTPS=0           # record issue when "http" is upgraded to "https" | 
 
 
 
 
 | 38 | > | SHOW_YT_RD=0           # record redirection for a youtu.be URL expanding to the full URL | 
 
 
 
 
 | 39 | > | SUGGEST_SNAPSHOTS_NG=0 # query the Internet Archive for a possible snapshot URL for each NG page | 
 
 
 
 
 | 40 | > | SUGGEST_SNAPSHOTS_OK=0 # query the Internet Archive for an existing snapshot of each OK page | 
 
 
 
 
 | 41 | > | CHECK_ARCHIVE_LINKS=0  # check URLs under the archive.org domain | 
 
 
 
 
 | 42 | > | TAKE_PAGE_SHOT=0       # take a screenshot of each OK page | 
 
 
 
 
 | 43 | > | TIMEOUT=10             # time to wait for a response when querying a site | 
 
 
 
 
 | 44 | > | CHROME_PATH=""         # path to a copy of Google Chrome that has the command-line screenshot feature | 
 
 
 
 
 | 45 | > | URL_START=1            # start at this URL in LINKS_FILE | 
 
 
 
 
 | 46 | > | URL_LIMIT=0            # if non-zero, stop at this URL in LINKS_FILE | 
 
 
 
 
 | 47 | > | UPLOAD_INFO=""         # path to a file on your hard drive with the login info needed to upload a report | 
 
 
 
 
 
 
 
 
 
 
 | 48 |  |  | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 49 |  | # Fixed strings -- see the occurrences of these variables to learn their purpose | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 50 |  | AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 OPR/70.0.3728.154" | 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 | 139 |  |  | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 140 |  | DESCRIPTION | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 141 |  | This script parses a list of external links found in the OniGalore wiki | 
 
 
 
 
 
 
 
 
 
 
 | 142 | < | (which is dumped by the Oni2.net domain periodically in a particular | 
 
 
 
 
 
 
 
 
 | 142 | > | (which is dumped by the Oni2.net server periodically in a particular | 
 
 
 
 
 
 
 
 
 
 
 | 143 |  | format), validates them using the Unix tool 'curl', and produces a report | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 144 |  | of which links were "OK" (responded positively to an HTTP query), which | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 145 |  | were "RD" (responded with a 3xx redirect code), which could be "IW" | 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 | 175 |  | --show-https-upgrades   Report on redirects that simply upgrade a | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 176 |  | "http://" URL to a "https://" URL. | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 177 |  | --show-yt-redirects     Report on redirects that expand a youtu.be URL. | 
 
 
 
 
 
 
 
 
 
 
 | 178 | < | --suggest-snapshots     Query the Internet Archive for a possible | 
 
 
 
 
 
 
 
 
 | 178 | > | --suggest-snapshots-ng  Query the Internet Archive for a possible | 
 
 
 
 
 
 
 
 
 
 
 | 179 |  | snapshot URL for each "NG" page. | 
 
 
 
 
 
 
 
 | 180 | + | --suggest-snapshots-ok  Query the Internet Archive for a snapshot of each | 
 
 
 
 
 
 
 
 | 181 | + | "OK" page just to make sure it's available. Note | 
 
 
 
 
 
 
 
 | 182 | + | that this will add a tremendous amount of time to | 
 
 
 
 
 
 
 
 | 183 | + | the script execution because there is a rate | 
 
 
 
 
 
 
 
 | 184 | + | limit to the Archive API. Note that this option | 
 
 
 
 
 
 
 
 | 185 | + | does nothing unless you also use the | 
 
 
 
 
 
 
 
 | 186 | + | --record-ok-links argument. | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 187 |  | --check-archive-links   Check links that are already pointing to a page | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 188 |  | on the Internet Archive. In theory these links | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 189 |  | should be totally stable and not need validation. | 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 | 218 |  | # Parse arguments as long as there are more arguments to process | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 219 |  | while (( "$#" )); do | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 220 |  | case "$1" in | 
 
 
 
 
 
 
 
 
 
 
 | 221 | < | --links )               LINKS_URL="$2";                     shift 2;; | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 222 | < | --exceptions )          EXCEPT_URL="$2";                    shift 2;; | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 223 | < | --output )              OUTPUT_DIR="$2";                    shift 2;; | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 224 | < | --record-ok-links )     RECORD_OK_LINKS=1;                  shift;; | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 225 | < | --show-added-slashes )  SHOW_SLASH=1;                       shift;; | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 226 | < | --show-https-upgrades ) SHOW_HTTPS=1;                       shift;; | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 227 | < | --show-yt-redirects )   SHOW_YT_RD=1;                       shift;; | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 228 | < | --suggest-snapshots )   SUGGEST_SNAPSHOTS=1;                shift;; | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 229 | < | --check-archive-links ) CHECK_ARCHIVE_LINKS=1;              shift;; | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 230 | < | --take-screenshots )    TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;; | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 231 | < | --timeout )             TIMEOUT=$2;                         shift 2;; | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 232 | < | --start-url )           URL_START=$2;                       shift 2;; | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 233 | < | --end-url )             URL_LIMIT=$2;                       shift 2;; | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 234 | < | --upload )              UPLOAD_INFO=$2;                     shift 2;; | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 235 | < | * )                     echo "Invalid argument $1 detected. Aborting."; exit 1;; | 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 | 221 | > | --links )                LINKS_URL="$2";                     shift 2;; | 
 
 
 
 
 | 222 | > | --exceptions )           EXCEPT_URL="$2";                    shift 2;; | 
 
 
 
 
 | 223 | > | --output )               OUTPUT_DIR="$2";                    shift 2;; | 
 
 
 
 
 | 224 | > | --record-ok-links )      RECORD_OK_LINKS=1;                  shift;; | 
 
 
 
 
 | 225 | > | --show-added-slashes )   SHOW_SLASH=1;                       shift;; | 
 
 
 
 
 | 226 | > | --show-https-upgrades )  SHOW_HTTPS=1;                       shift;; | 
 
 
 
 
 | 227 | > | --show-yt-redirects )    SHOW_YT_RD=1;                       shift;; | 
 
 
 
 
 | 228 | > | --suggest-snapshots-ng ) SUGGEST_SNAPSHOTS_NG=1;             shift;; | 
 
 
 
 
 | 229 | > | --suggest-snapshots-ok ) SUGGEST_SNAPSHOTS_OK=1;             shift;; | 
 
 
 
 
 | 230 | > | --check-archive-links )  CHECK_ARCHIVE_LINKS=1;              shift;; | 
 
 
 
 
 | 231 | > | --take-screenshots )     TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;; | 
 
 
 
 
 | 232 | > | --timeout )              TIMEOUT=$2;                         shift 2;; | 
 
 
 
 
 | 233 | > | --start-url )            URL_START=$2;                       shift 2;; | 
 
 
 
 
 | 234 | > | --end-url )              URL_LIMIT=$2;                       shift 2;; | 
 
 
 
 
 | 235 | > | --upload )               UPLOAD_INFO=$2;                     shift 2;; | 
 
 
 
 
 | 236 | > | * )                      echo "Invalid argument $1 detected. Aborting."; exit 1;; | 
 
 
 
 
 
 
 
 
 
 
 | 237 |  | esac | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 238 |  | done | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 239 |  |  | 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 | 661 |  | valPrint ctrhn "Take screenshots: " | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 662 |  | if [ $TAKE_PAGE_SHOT -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 663 |  |  | 
 
 
 
 
 
 
 
 
 
 
 | 664 | < | valPrint ctrhn "Suggest archive.org snapshots: " | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 665 | < | if [ $SUGGEST_SNAPSHOTS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi | 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 | 664 | > | valPrint ctrhn "Suggest archive.org snapshots for NG pages: " | 
 
 
 
 
 | 665 | > | if [ $SUGGEST_SNAPSHOTS_NG -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi | 
 
 
 
 
 | 666 | > |  | 
 
 
 
 
 | 667 | > | valPrint ctrhn "Suggest archive.org snapshots for OK pages: " | 
 
 
 
 
 | 668 | > | if [ $SUGGEST_SNAPSHOTS_OK -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi | 
 
 
 
 
 
 
 
 
 
 
 | 669 |  |  | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 670 |  | valPrint ctrhn "Ignore slash-adding redirects: " | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 671 |  | if [ $SHOW_SLASH -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi | 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 | 713 |  | START_RUN=$(date +%s) | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 714 |  | # Process each line of the .csv in LINKS_FILE | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 715 |  | for LINE in `cat "$LINKS_FILE"`; do | 
 
 
 
 
 
 
 
 | 716 | + | START_LINK=$(date +%s) | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 717 |  | let LINK_NUM+=1 | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 718 |  |  | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 719 |  | # First line is the column header row for the CSV, so let's verify that the format hasn't changed | 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 | 1109 |  | valPrint hn "<tr><td colspan=\"2\" align=\"right\">linked from</td><td><a href=\"$FULL_PAGE_PATH\" target=\"_blank\">$LOCAL_PAGE_PATH</a></td></tr>" | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 1110 |  |  | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 1111 |  | # Place vertical space here since we won't be printing anything more about this link | 
 
 
 
 
 
 
 
 
 
 
 | 1112 | < | if [ $STATUS == "OK" ]; then valPrint tr ""; valPrint hs ""; fi | 
 
 
 
 
 
 
 
 
 | 1112 | > | if [ $STATUS == "OK" ] && [ $SUGGEST_SNAPSHOTS_OK -eq 0 ]; then valPrint tr ""; valPrint hs ""; fi | 
 
 
 
 
 
 
 
 
 
 
 | 1113 |  |  | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 1114 |  | # Record redirect URL if one was given by a 3xx response page | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 1115 |  | if [ $STATUS == "RD" ]; then | 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 | 1135 |  | fi | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 1136 |  |  | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 1137 |  | # Query Internet Archive for latest "OK" snapshot for "NG" page (or for "OK" page when SUGGEST_SNAPSHOTS_OK is set) | 
 
 
 
 
 
 
 
 
 
 
 | 1138 | < | if [ $STATUS == "NG" ] && [ $SUGGEST_SNAPSHOTS -eq 1 ]; then | 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 | 1138 | > | if [[ ( $STATUS == "NG" && $SUGGEST_SNAPSHOTS_NG -eq 1 ) || ( $STATUS == "OK" && $SUGGEST_SNAPSHOTS_OK -eq 1 ) ]]; then | 
 
 
 
 
 | 1139 | > |  | 
 
 
 
 
 | 1140 | > | # We need to watch out for the rate limit or we'll get locked out; look at how much time has | 
 
 
 
 
 | 1141 | > | # elapsed and then wait the remainder between that and how long of a wait we think is needed | 
 
 
 
 
 | 1142 | > | # to avoid the dreaded "Too Many Requests" response. 5 seconds is just a guess. | 
 
 
 
 
 | 1143 | > | CUR_TIME=$(date +%s) | 
 
 
 
 
 | 1144 | > | WAIT_REMAINDER=$((5 - $CUR_TIME + $START_LINK)) | 
 
 
 
 
 | 1145 | > | if [ $WAIT_REMAINDER -gt 0 ]; then | 
 
 
 
 
 | 1146 | > | valPrint t "Waiting $WAIT_REMAINDER second(s) to conform to Archive.org rate limit." | 
 
 
 
 
 | 1147 | > | sleep $WAIT_REMAINDER | 
 
 
 
 
 | 1148 | > | fi | 
 
 
 
 
 | 1149 | > |  | 
 
 
 
 
 | 1150 | > | # Issue query to the API | 
 
 
 
 
 
 
 
 
 
 
 | 1151 |  | ARCHIVE_QUERY=$(curl --silent --max-time $TIMEOUT "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES") | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 1152 |  |  | 
 
 
 
 
 
 
 
 
 
 
 | 1153 | < | # If a "closest" snapshot was received... | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 1154 | < | if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then | 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 | 1153 | > | # Notify user if we hit the rate limit and just keep going | 
 
 
 
 
 | 1154 | > | if [[ "$ARCHIVE_QUERY" == *"Too Many Requests"* ]]; then # wildcards must stay outside the quotes or [[ ]] compares them literally | 
 
 
 
 
 | 1155 | > | valPrint t "  IA has rate-limited us!" | 
 
 
 
 
 | 1156 | > | valPrint r "                IA has rate-limited us!" | 
 
 
 
 
 | 1157 | > | valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td>(hit the API rate limit!)</td></tr>" | 
 
 
 
 
 | 1158 | > | # If a "closest" snapshot was received, inform user | 
 
 
 
 
 | 1159 | > | elif [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then | 
 
 
 
 
 
 
 
 
 
 
 | 1160 |  | # In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 1161 |  | ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/') | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 1162 |  |  | 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 | 1172 |  | valPrint ts "  IA suggests $SNAPSHOT_URL" | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 1173 |  | valPrint rs "               IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}" | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 1174 |  | valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>" | 
 
 
 
 
 
 
 
 
 
 
 | 1175 | < | else # ...otherwise give generic Wayback Machine link for this URL | 
 
 
 
 
 
 
 
 
 | 1175 | > | else # Otherwise give a generic Wayback Machine link for this URL, which might work | 
 
 
 
 
 
 
 
 
 
 
 | 1176 |  | valPrint ts " Try browsing $ARCHIVE_GENERIC/$URL" | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 1177 |  | valPrint rs "               Try browsing {\field{\*\fldinst{HYPERLINK \"$ARCHIVE_GENERIC/$URL\"}}{\fldrslt $ARCHIVE_GENERIC/$URL}}" | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 1178 |  | valPrint hs "<tr><td colspan=\"2\" align=\"right\">Try browsing</td><td><a href=\"$ARCHIVE_GENERIC/$URL\" target=\"_blank\">$ARCHIVE_GENERIC/$URL</a></td></tr>" |