--- Validate External Links/validate_external_links.sh	2021/05/09 21:53:48	1157
+++ Validate External Links/validate_external_links.sh	2021/06/13 20:50:43	1158
@@ -38,7 +38,7 @@ SHOW_HTTPS=0 # record issue wh
 SHOW_YT_RD=0 # record redirection for a youtu.be URL expanding to the full URL
 SUGGEST_SNAPSHOTS_NG=0 # query the Internet Archive for a possible snapshot URL for each NG page
 SUGGEST_SNAPSHOTS_OK=0 # query the Internet Archive for an existing snapshot of each OK page
-CHECK_ARCHIVE_LINKS=0 # check URLs under the archive.org domain
+CHECK_ARCHIVE_LINKS=0 # check URLs on archive.org and archive.is
 TAKE_PAGE_SHOT=0 # take a screenshot of each OK page
 TIMEOUT=10 # time to wait for a response when querying a site
 CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature
@@ -110,7 +110,7 @@ SKIP_EXPECT_IW=0
 SKIP_HTTPS_UP=0
 SKIP_SLASH_ADD=0
 SKIP_YOUTU_BE=0
-SKIP_ARCHIVE_ORG=0
+SKIP_ARCHIVES=0
 FILE_LINKS=0
 PAGE_LINKS=0
 SKIPPED_HEADER_ROW=0
@@ -185,8 +185,9 @@ OPTIONS
                        does nothing unless you also use the --record-ok-links argument.
 --check-archive-links  Check links that are already pointing to a page
-                       on the Internet Archive. In theory these links
-                       should be totally stable and not need validation.
+                       on the Internet Archive or archive.is (AKA
+                       archive.today). In theory these links should be
+                       totally stable and not need validation.
 --take-screenshots FILE Call the Google Chrome binary at this path to take
                        screenshots of each "OK" page.
 --timeout NUM          Wait this many seconds for a site to respond. The
@@ -531,7 +532,7 @@ function wrapupAndExit()
 	# Print processed link totals
 	if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
 	if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
-	if [ $SKIP_ARCHIVE_ORG -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVE_ORG Archive.org $(pluralCheckNoun link $SKIP_ARCHIVE_ORG) were not checked"; fi
+	if [ $SKIP_ARCHIVES -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVES archive.org/archive.is $(pluralCheckNoun link $SKIP_ARCHIVES) were not checked"; fi
 	if [ $LINK_PROBLEMS_TOTAL -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_TOTAL processed $(pluralCheckNoun link $LINK_PROBLEMS_TOTAL) had $(pluralCheckAn $LINK_PROBLEMS_TOTAL)$(pluralCheckNoun issue $LINK_PROBLEMS_TOTAL)"; fi
 	if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED link $(pluralCheckNoun issue $LINKS_EXCEPTED) from report)"; valPrint h "  (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
 	if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi
@@ -676,7 +677,7 @@ if [ $SHOW_HTTPS -eq 1 ]; then valPrint
 valPrint ctrhn "Ignore youtu.be redirects: "
 if [ $SHOW_YT_RD -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
 
-valPrint ctrhn "Check archive.org links: "
+valPrint ctrhn "Check archive.org and archive.is links: "
 if [ $CHECK_ARCHIVE_LINKS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
 
 valPrint tr "A summary of my findings will be found at the bottom of the report."
@@ -809,10 +810,10 @@ for LINE in `cat "$LINKS_FILE"`; do
 		continue
 	fi
 
-	# If we're skipping Archive.org links, see if this is one
-	if [ $CHECK_ARCHIVE_LINKS -eq 0 ] && [[ $URL == *web.archive.org* ]]; then
-		valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to check Wayback Machine links."
-		let SKIP_ARCHIVE_ORG+=1
+	# If we're skipping archive links, see if this is one
+	if [ $CHECK_ARCHIVE_LINKS -eq 0 ] && [[ ( $URL == *web.archive.org* || $URL == *archive.is* ) ]]; then
+		valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to check archive links."
+		let SKIP_ARCHIVES+=1
 		let PAGE_LINKS+=1
 		continue
 	fi
@@ -916,7 +917,7 @@ for LINE in `cat "$LINKS_FILE"`; do
 	# Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
 	# issue with sites that require HTTPS
-	CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent '$AGENT' --max-time $TIMEOUT --retry 2 --write-out '%{http_code}\n' $URL)
+	CURL_CODE=$(curl -o /dev/null --silent --insecure --compressed --head --user-agent '$AGENT' --max-time $TIMEOUT --retry 2 --write-out '%{http_code}\n' $URL)
 	CURL_ERR=$(echo $?)
 	CURL_RESULT=$CURL_CODE
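The broadened skip test in the -809 hunk can be exercised on its own. Below is a minimal standalone sketch (not part of the patch) that feeds a few hypothetical URLs through the same glob match; the sample URLs and echo messages are illustrative only, while CHECK_ARCHIVE_LINKS and the bracket expression come straight from the patched script:

#!/bin/bash
# Standalone sketch of the archive-link test introduced above; the URLs are made-up examples.
CHECK_ARCHIVE_LINKS=0   # same default as in the script: archive links are skipped unless --check-archive-links is passed
for URL in 'https://web.archive.org/web/2021/http://example.com/' \
           'https://archive.is/AbCdE' \
           'http://example.com/page.html'; do
	if [ $CHECK_ARCHIVE_LINKS -eq 0 ] && [[ ( $URL == *web.archive.org* || $URL == *archive.is* ) ]]; then
		echo "Would skip (archive link): $URL"
	else
		echo "Would validate: $URL"
	fi
done

Run as-is, the first two URLs print "Would skip" (they match the web.archive.org and archive.is patterns that feed the SKIP_ARCHIVES counter) and the third prints "Would validate".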