--- Validate External Links/validate_external_links.sh	2020/03/25 21:50:30	1125
+++ Validate External Links/validate_external_links.sh	2020/03/28 02:08:29	1127
@@ -20,6 +20,7 @@ OUTPUT_DIR="" # place reports and al
 RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES
 SHOW_SLASH=0 # record response code to the log when a slash is added to the end of a URL
 SHOW_HTTPS=0 # record response code to the log when "http" is upgraded to "https"
+SHOW_YT_RD=0 # record response code to the log when a youtu.be URL is expanded to the full URL
 SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
 TAKE_PAGE_SHOT=0 # take a screenshot of each OK page
 CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature
@@ -47,15 +48,15 @@ declare -a NS_NAMES=("Media" "Special" "
 
 # These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
 # This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
-declare -a HTTP_FILES=(txt zip wmv jpg png m4a bsl rar oni mp3 mov ONWC vbs TRMA mp4 doc avi log gif pdf dmg exe cpp tga 7z wav east BINA xml dll dae xaf fbx 3ds blend flv csv)
-declare -a HTTP_TLDS_AND_PAGES=(com net org uk ru de it htm html php pl asp aspx shtml pgi cgi php3 x jsp phtml cfm css action stm js)
+declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png py rar tga TRMA txt vbs wav wmv xaf xml zip)
+declare -a HTTP_TLDS_AND_PAGES=(action ars asp aspx cfm cgi com css de htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x)
 
 # These arrays tells us which HTTP response codes are OK (good), which are RD (redirections), and which
 # are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
 # if you add a new code.
-declare -a OK_CODES=(200 401 405 406 501)
+declare -a OK_CODES=(200 401 405 406 418 501)
 declare -a RD_CODES=(301 302 303 307 308)
-declare -a NG_CODES=(000 403 404 410 500 503)
+declare -a NG_CODES=(000 400 403 404 410 500 502 503 530)
 
 # Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
 # transcluded text, and if the transclusion fails, then the braces show up in the URL
@@ -87,6 +88,7 @@ SKIP_EXPECT_EI=0
 SKIP_EXPECT_IW=0
 SKIP_HTTPS_UP=0
 SKIP_SLASH_ADD=0
+SKIP_YOUTU_BE=0
 FILE_LINKS=0
 PAGE_LINKS=0
 SKIPPED_HEADER_ROW=0
@@ -145,8 +147,9 @@ OPTIONS
                                code is "OK".
        --show-added-slashes    Report on redirects that simply add a
                                '/' to the end of the URL.
-       --show-https-upgrade    Report on redirects that simply upgrade a
+       --show-https-upgrades   Report on redirects that simply upgrade a
                                "http://" URL to a "https://" URL.
+       --show-yt-redirects     Report on redirects that expand a youtu.be URL.
        --suggest-snapshots     Query the Internet Archive for a
                                possible snapshot URL for each "NG" page.
        --take-screenshots FILE Call the Google Chrome binary at this path to
@@ -175,18 +178,19 @@ fi
 # Parse arguments as long as there are more arguments to process
 while (( "$#" )); do
    case "$1" in
-      --links              ) LINKS_URL="$2"; shift 2;;
-      --exceptions         ) EXCEPT_URL="$2"; shift 2;;
-      --output             ) OUTPUT_DIR="$2"; shift 2;;
-      --record-ok-links    ) RECORD_OK_LINKS=1; shift;;
-      --show-added-slashes ) SHOW_SLASH=1; shift;;
-      --show-https-upgrade ) SHOW_HTTPS=1; shift;;
-      --suggest-snapshots  ) SUGGEST_SNAPSHOTS=1; shift;;
-      --take-screenshots   ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
-      --start-url          ) URL_START=$2; shift 2;;
-      --end-url            ) URL_LIMIT=$2; shift 2;;
-      --upload             ) UPLOAD_INFO=$2; shift 2;;
-      *                    ) echo "Invalid argument $1 detected. Aborting."; exit 1;;
+      --links               ) LINKS_URL="$2"; shift 2;;
+      --exceptions          ) EXCEPT_URL="$2"; shift 2;;
+      --output              ) OUTPUT_DIR="$2"; shift 2;;
+      --record-ok-links     ) RECORD_OK_LINKS=1; shift;;
+      --show-added-slashes  ) SHOW_SLASH=1; shift;;
+      --show-https-upgrades ) SHOW_HTTPS=1; shift;;
+      --show-yt-redirects   ) SHOW_YT_RD=1; shift;;
+      --suggest-snapshots   ) SUGGEST_SNAPSHOTS=1; shift;;
+      --take-screenshots    ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
+      --start-url           ) URL_START=$2; shift 2;;
+      --end-url             ) URL_LIMIT=$2; shift 2;;
+      --upload              ) UPLOAD_INFO=$2; shift 2;;
+      *                     ) echo "Invalid argument $1 detected. Aborting."; exit 1;;
    esac
 done
 
@@ -448,7 +452,7 @@ function wrapupAndExit()
    LINK_PROBLEMS=$((EI_LINKS+IW_LINKS+RD_LINKS+NG_LINKS))
    LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
    LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
-   TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP))
+   TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
    LINKS_CHECKED=$((LINKS_PROCESSED-LINK_ERRORS))
 
    # Print summary header
@@ -461,7 +465,7 @@ function wrapupAndExit()
    if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
    if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
    if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS processed $(pluralCheckNoun link $LINK_PROBLEMS) had $(pluralCheckAn $LINK_PROBLEMS)$(pluralCheckNoun issue $LINK_PROBLEMS)"; fi
-   if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; valPrint h "nbsp;nbsp;(excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
+   if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; valPrint h "&nbsp;&nbsp;(excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
    if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi
    if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h "&nbsp;&nbsp;(counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi
 
@@ -722,11 +726,21 @@ for LINE in `cat "$LINKS_FILE"`; do
    # Turn off case sensitivity while we compare suffixes
    shopt -s nocasematch
 
-   # Special case: URLs ending in something like "/productID.304297400" are pages, not files, so if
+   # Special case: URLs ending in something like "productID.304297400" are pages, not files, so if
    # the URL's suffix is all numbers, we are looking at the end of a web page URL
    if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
       IS_FILE=0
    fi
+
+   # Special case: URLs ending in a parens, e.g. "ms537113(v=vs.85)", are pages
+   if [[ $POST_DOT =~ ^.*[\(\)]$ ]]; then
+      IS_FILE=0
+   fi
+
+   # Special case: URLs containing a '%', e.g. "10.1007%2FBF00329055", are pages
+   if [[ $POST_DOT == *%* ]]; then
+      IS_FILE=0
+   fi
 
    # If we did not identify this URL as a web page above, we need to compare the suffix against known
    # file extensions
@@ -838,6 +852,13 @@ for LINE in `cat "$LINKS_FILE"`; do
          # merely add an ending slash if the user didn't ask for them
          NEW_URL_NO_SLASH=$(echo $NEW_URL_HTTP | sed -E 's:/$::')
 
+         # Detect if this is a youtu.be link simply being expanded by YouTube to the full
+         # youtube.com address
+         YOUTU_BE=0
+         if [[ $URL_HTTP == http://youtu.be* ]] && [[ $NEW_URL_HTTP == http://www.youtube.com* ]]; then
+            YOUTU_BE=1
+         fi
+
          # If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
          # wants those to be reported)
          if [ $SHOW_HTTPS -eq 0 ] && [ $URL_HTTP == $NEW_URL_HTTP ]; then
@@ -852,6 +873,11 @@ for LINE in `cat "$LINKS_FILE"`; do
             STATUS="OK"
             let OK_LINKS+=1
             let SKIP_SLASH_ADD+=1
+         elif [ $SHOW_YT_RD -eq 0 ] && [ $YOUTU_BE -eq 1 ]; then
+            valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'."
+            STATUS="OK"
+            let OK_LINKS+=1
+            let SKIP_YOUTU_BE+=1
          else
             STATUS="RD"
             let RD_LINKS+=1
@@ -874,7 +900,7 @@ for LINE in `cat "$LINKS_FILE"`; do
 
    # If we didn't match a known status code, advise the reader
    if [ $STATUS == "??" ]; then
-      valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown return code $CURL_CODE."
+      valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown response code $CURL_CODE."
       let SKIP_UNK_CODE+=1
       continue
    fi