72 |
|
declare -a HTTP_TLDS_AND_PAGES=(abstract action ars asp aspx cfm cgi com css de do full htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x) |
73 |
|
|
74 |
|
# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
75 |
< |
# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt |
76 |
< |
# if you add a new code. |
75 |
> |
# are NG (no good). Pages that return OK codes will be screenshotted when screenshots are asked for. |
76 |
> |
# Remember to update http_codes.txt if you add a new code. |
77 |
|
declare -a OK_CODES=(200 401 405 406 418 501) |
78 |
|
declare -a RD_CODES=(301 302 303 307 308) |
79 |
|
declare -a NG_CODES=(000 400 403 404 410 429 500 502 503 504 520 530) |
1013 |
|
# If this is a YouTube link, we have to look at the actual page source to know if the video |
1014 |
|
# is good or not; override the link's info if it's actually NG |
1015 |
|
if [[ $URL == *www.youtube.com* ]]; then |
1016 |
< |
PAGE_TEXT=$(curl --silent --insecure --user-agent '$AGENT' --max-time $TIMEOUT $URL | grep "\"simpleText\":\"Video unavailable\"") |
1017 |
< |
if [ ! -z "$PAGE_TEXT" ]; then |
1016 |
> |
PAGE_TEXT=$(curl --silent --insecure --user-agent '$AGENT' --max-time $TIMEOUT $URL) |
1017 |
> |
CURL_ERR=$(echo $?) |
1018 |
> |
if [ "$CURL_ERR" != "0" ]; then |
1019 |
> |
STATUS="NG" |
1020 |
> |
CURL_RESULT="000-$CURL_ERR" |
1021 |
> |
let OK_LINKS-=1 |
1022 |
> |
let NG_LINKS+=1 |
1023 |
> |
elif [[ "$PAGE_TEXT" =~ "simpleText\":\"Video unavailable" ]]; then |
1024 |
> |
STATUS="NG" |
1025 |
> |
CURL_CODE="404" |
1026 |
> |
CURL_RESULT=$CURL_CODE |
1027 |
> |
let OK_LINKS-=1 |
1028 |
> |
let NG_LINKS+=1 |
1029 |
> |
fi |
1030 |
> |
fi |
1031 |
> |
|
1032 |
> |
# If this is a OneDrive link, we have to look at the actual page source to know if the file |
1033 |
> |
# is really still at this URL; override the link's info if it's actually NG or RD |
1034 |
> |
if [[ $URL == *skydrive.live.com* ]]; then |
1035 |
> |
PAGE_TEXT=$(curl --silent --insecure --user-agent '$AGENT' --max-time $TIMEOUT $URL) |
1036 |
> |
CURL_ERR=$(echo $?) |
1037 |
> |
if [ "$CURL_ERR" != "0" ]; then |
1038 |
> |
STATUS="NG" |
1039 |
> |
CURL_RESULT="000-$CURL_ERR" |
1040 |
> |
let OK_LINKS-=1 |
1041 |
> |
let NG_LINKS+=1 |
1042 |
> |
elif [[ "$PAGE_TEXT" =~ "<h1>Sorry, something went wrong" ]]; then |
1043 |
|
STATUS="NG" |
1044 |
< |
CURL_RESULT=404 |
1044 |
> |
CURL_CODE="404" |
1045 |
> |
CURL_RESULT=$CURL_CODE |
1046 |
|
let OK_LINKS-=1 |
1047 |
|
let NG_LINKS+=1 |
1048 |
+ |
elif [[ "$PAGE_TEXT" =~ "<h2>Object moved to" ]]; then |
1049 |
+ |
STATUS="??" # have to send the code through the next block to treat the redirect properly |
1050 |
+ |
CURL_CODE="301" |
1051 |
+ |
CURL_RESULT=$CURL_CODE |
1052 |
+ |
let OK_LINKS-=1 |
1053 |
|
fi |
1054 |
|
fi |
1055 |
+ |
|
1056 |
|
break |
1057 |
|
fi |
1058 |
|
done |
1062 |
|
if [ $STATUS == "??" ]; then |
1063 |
|
for CODE in "${RD_CODES[@]}"; do |
1064 |
|
if [[ $CODE == $CURL_CODE ]]; then |
1065 |
< |
# Get URL header again in order to retrieve the URL we are being redirected to |
1066 |
< |
NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent '$AGENT' --max-time $TIMEOUT --write-out '%{redirect_url}\n' $URL) |
1065 |
> |
# Get URL header again in order to retrieve the URL we are being redirected to, but if this |
1066 |
> |
# is a OneDrive link, we already have the new URL in $PAGE_TEXT |
1067 |
> |
if [[ $URL == *skydrive.live.com* ]]; then |
1068 |
> |
NEW_URL=${PAGE_TEXT##*href=\"} |
1069 |
> |
NEW_URL=${NEW_URL%\">here*} |
1070 |
> |
else |
1071 |
> |
NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent '$AGENT' --max-time $TIMEOUT --write-out '%{redirect_url}\n' $URL) |
1072 |
> |
fi |
1073 |
|
|
1074 |
|
# Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter |
1075 |
|
# those changes out if the user didn't ask for them |
1166 |
|
for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do |
1167 |
|
{ |
1168 |
|
EXCEPT_LINE="${EXCEPT_ARRAY[$i]}" |
1169 |
< |
|
1169 |
> |
|
1170 |
|
# Undo any HTML-encoding from the wiki page; for now we just worry about the ampersand, as most |
1171 |
|
# other HTML-encoded characters are not found in URLs |
1172 |
|
EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/\&/\&/g') |
1175 |
|
EXCEPT_URL="${EXCEPT_LINE#*,}" |
1176 |
|
EXCEPT_URL="${EXCEPT_URL%,*}" |
1177 |
|
if [[ "$EXCEPT_URL" =~ \* ]]; then # if this exception URL contains the '*' wildcard, use pattern-matching with it |
1178 |
< |
if [[ "$URL" =~ "$EXCEPT_URL" ]]; then |
1178 |
> |
if [[ ! "$URL" == $EXCEPT_URL ]]; then |
1179 |
|
continue |
1180 |
|
fi |
1181 |
|
else |