# Patch summary: unified diff against "Validate External Links/validate_external_links.sh"
# (header stamps 2023/01/23 ... 1178 -> 2023/05/07 ... 1182; the trailing numbers are presumably
# repository revision numbers -- TODO confirm).
#
# NOTE(review): the line structure of this diff has been collapsed (hunks run together and the
# leading ' ', '+', '-' columns are separated by single spaces), so it will not apply with
# patch(1) as-is. The text below is kept exactly as received.
#
# Hunk @@ -72,8 +72,8: comment-only change; the OK/RD/NG code-array comment now says OK pages are
#   screenshotted "when screenshots are asked for".
# Hunk @@ -1013,14 +1013,46:
#   * YouTube links: the page is no longer piped through grep. The full page goes into PAGE_TEXT
#     and curl's exit status into CURL_ERR. A non-zero status marks the link NG with
#     CURL_RESULT="000-$CURL_ERR"; otherwise a match on the "Video unavailable" marker marks it NG
#     with CURL_CODE/CURL_RESULT 404. Both branches move one count from OK_LINKS to NG_LINKS.
#   * New OneDrive (skydrive.live.com) check with the same curl-failure handling, plus two text
#     matches: "...Sorry, something went wrong" -> NG/404, and "...Object moved to" ->
#     STATUS="??" with CURL_CODE 301 so the redirect block that follows handles it.
# Hunk @@ -1030,8 +1062,14: for skydrive.live.com URLs, NEW_URL is cut out of PAGE_TEXT (text after
#   the last href=" and before ">here) instead of issuing a second curl --head request.
# Hunk @@ -1128,7 +1166,7: replaces one blank line with another (presumably whitespace-only).
# Hunk @@ -1137,7 +1175,7: for exception URLs containing '*', the test before `continue` changes
#   from [[ "$URL" =~ "$EXCEPT_URL" ]] (quoted right-hand side, i.e. literal match) to
#   [[ ! "$URL" == $EXCEPT_URL ]] (unquoted right-hand side, i.e. glob match, negated).
#
# NOTE(review): --user-agent '$AGENT' is single-quoted on both removed and added lines, so the
#   shell does not expand it and curl is handed the literal text $AGENT -- confirm intent.
# NOTE(review): CURL_ERR=$(echo $?) yields the same value as CURL_ERR=$? without a subshell.
# NOTE(review): $URL is passed to curl unquoted on the added lines.
# NOTE(review): the curl-failure branches set CURL_RESULT but leave CURL_CODE untouched -- verify
#   that later code does not read a stale CURL_CODE.
# NOTE(review): the two =~ "..." strings that span an empty line, and sed 's/\&/\&/g', look like
#   extraction damage (presumably stripped HTML tags / a decoded &amp; entity) -- compare against
#   the repository copy before relying on this text.
--- Validate External Links/validate_external_links.sh 2023/01/23 01:51:32 1178 +++ Validate External Links/validate_external_links.sh 2023/05/07 19:53:19 1182 @@ -72,8 +72,8 @@ declare -a HTTP_FILES=(3ds 7z avi BINA b declare -a HTTP_TLDS_AND_PAGES=(abstract action ars asp aspx cfm cgi com css de do full htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x) # These arrays tells us which HTTP response codes are OK (good), which are RD (redirections), and which -# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt -# if you add a new code. +# are NG (no good). Pages that return OK codes will be screenshotted when screenshots are asked for. +# Remember to update http_codes.txt if you add a new code. declare -a OK_CODES=(200 401 405 406 418 501) declare -a RD_CODES=(301 302 303 307 308) declare -a NG_CODES=(000 400 403 404 410 429 500 502 503 504 520 530) @@ -1013,14 +1013,46 @@ for LINE in `cat "$LINKS_FILE"`; do # If this is a YouTube link, we have to look at the actual page source to know if the video # is good or not; override the link's info if it's actually NG if [[ $URL == *www.youtube.com* ]]; then - PAGE_TEXT=$(curl --silent --insecure --user-agent '$AGENT' --max-time $TIMEOUT $URL | grep "\"simpleText\":\"Video unavailable\"") - if [ ! -z "$PAGE_TEXT" ]; then + PAGE_TEXT=$(curl --silent --insecure --user-agent '$AGENT' --max-time $TIMEOUT $URL) + CURL_ERR=$(echo $?) 
+ if [ "$CURL_ERR" != "0" ]; then + STATUS="NG" + CURL_RESULT="000-$CURL_ERR" + let OK_LINKS-=1 + let NG_LINKS+=1 + elif [[ "$PAGE_TEXT" =~ "simpleText\":\"Video unavailable" ]]; then + STATUS="NG" + CURL_CODE="404" + CURL_RESULT=$CURL_CODE + let OK_LINKS-=1 + let NG_LINKS+=1 + fi + fi + + # If this is a OneDrive link, we have to look at the actual page source to know if the file + # is really still at this URL; override the link's info if it's actually NG or RD + if [[ $URL == *skydrive.live.com* ]]; then + PAGE_TEXT=$(curl --silent --insecure --user-agent '$AGENT' --max-time $TIMEOUT $URL) + CURL_ERR=$(echo $?) + if [ "$CURL_ERR" != "0" ]; then + STATUS="NG" + CURL_RESULT="000-$CURL_ERR" + let OK_LINKS-=1 + let NG_LINKS+=1 + elif [[ "$PAGE_TEXT" =~ "

Sorry, something went wrong" ]]; then STATUS="NG" - CURL_RESULT=404 + CURL_CODE="404" + CURL_RESULT=$CURL_CODE let OK_LINKS-=1 let NG_LINKS+=1 + elif [[ "$PAGE_TEXT" =~ "

Object moved to" ]]; then + STATUS="??" # have to send the code through the next block to treat the redirect properly + CURL_CODE="301" + CURL_RESULT=$CURL_CODE + let OK_LINKS-=1 fi fi + break fi done @@ -1030,8 +1062,14 @@ for LINE in `cat "$LINKS_FILE"`; do if [ $STATUS == "??" ]; then for CODE in "${RD_CODES[@]}"; do if [[ $CODE == $CURL_CODE ]]; then - # Get URL header again in order to retrieve the URL we are being redirected to - NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent '$AGENT' --max-time $TIMEOUT --write-out '%{redirect_url}\n' $URL) + # Get URL header again in order to retrieve the URL we are being redirected to, but if this + # is a OneDrive link, we already have the new URL in $PAGE_TEXT + if [[ $URL == *skydrive.live.com* ]]; then + NEW_URL=${PAGE_TEXT##*href=\"} + NEW_URL=${NEW_URL%\">here*} + else + NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent '$AGENT' --max-time $TIMEOUT --write-out '%{redirect_url}\n' $URL) + fi # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter # those changes out if the user didn't ask for them @@ -1128,7 +1166,7 @@ for LINE in `cat "$LINKS_FILE"`; do for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do { EXCEPT_LINE="${EXCEPT_ARRAY[$i]}" - + # Undo any HTML-encoding from the wiki page; for now we just worry about the ampersand, as most # other HTML-encoded characters are not found in URLs EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/\&/\&/g') @@ -1137,7 +1175,7 @@ for LINE in `cat "$LINKS_FILE"`; do EXCEPT_URL="${EXCEPT_LINE#*,}" EXCEPT_URL="${EXCEPT_URL%,*}" if [[ "$EXCEPT_URL" =~ \* ]]; then # if this exception URL contains the '*' wildcard, use pattern-matching with it - if [[ "$URL" =~ "$EXCEPT_URL" ]]; then + if [[ ! "$URL" == $EXCEPT_URL ]]; then continue fi else