ViewVC Help
View File | Revision Log | View Changeset | Root Listing
root/Oni2/Validate External Links/validate_external_links.sh
(Generate patch)

Comparing Validate External Links/validate_external_links.sh (file contents):
Revision 1178 by iritscen, Mon Jan 23 01:51:32 2023 UTC vs.
Revision 1182 by iritscen, Sun May 7 19:53:19 2023 UTC

# Line 72 | Line 72 | declare -a HTTP_FILES=(3ds 7z avi BINA b
72   declare -a HTTP_TLDS_AND_PAGES=(abstract action ars asp aspx cfm cgi com css de do full htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x)
73  
74   # These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
75 < # are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
76 < # if you add a new code.
75 > # are NG (no good). Pages that return OK codes will be screenshotted when screenshots are asked for.
76 > # Remember to update http_codes.txt if you add a new code.
77   declare -a OK_CODES=(200 401 405 406 418 501)
78   declare -a RD_CODES=(301 302 303 307 308)
79   declare -a NG_CODES=(000 400 403 404 410 429 500 502 503 504 520 530)
# Line 1013 | Line 1013 | for LINE in `cat "$LINKS_FILE"`; do
1013              # If this is a YouTube link, we have to look at the actual page source to know if the video
1014              # is good or not; override the link's info if it's actually NG
1015              if [[ $URL == *www.youtube.com* ]]; then
1016 <               PAGE_TEXT=$(curl --silent --insecure --user-agent '$AGENT' --max-time $TIMEOUT $URL | grep "\"simpleText\":\"Video unavailable\"")
1017 <               if [ ! -z "$PAGE_TEXT" ]; then
1016 >               PAGE_TEXT=$(curl --silent --insecure --user-agent '$AGENT' --max-time $TIMEOUT $URL)
1017 >               CURL_ERR=$(echo $?)
1018 >               if [ "$CURL_ERR" != "0" ]; then
1019 >                  STATUS="NG"
1020 >                  CURL_RESULT="000-$CURL_ERR"
1021 >                  let OK_LINKS-=1
1022 >                  let NG_LINKS+=1
1023 >               elif [[ "$PAGE_TEXT" =~ "simpleText\":\"Video unavailable" ]]; then
1024 >                  STATUS="NG"
1025 >                  CURL_CODE="404"
1026 >                  CURL_RESULT=$CURL_CODE
1027 >                  let OK_LINKS-=1
1028 >                  let NG_LINKS+=1
1029 >               fi
1030 >            fi
1031 >            
1032 >            # If this is a OneDrive link, we have to look at the actual page source to know if the file
1033 >            # is really still at this URL; override the link's info if it's actually NG or RD
1034 >            if [[ $URL == *skydrive.live.com* ]]; then
1035 >               PAGE_TEXT=$(curl --silent --insecure --user-agent '$AGENT' --max-time $TIMEOUT $URL)
1036 >               CURL_ERR=$(echo $?)
1037 >               if [ "$CURL_ERR" != "0" ]; then
1038 >                  STATUS="NG"
1039 >                  CURL_RESULT="000-$CURL_ERR"
1040 >                  let OK_LINKS-=1
1041 >                  let NG_LINKS+=1
1042 >               elif [[ "$PAGE_TEXT" =~ "<h1>Sorry, something went wrong" ]]; then
1043                    STATUS="NG"
1044 <                  CURL_RESULT=404
1044 >                  CURL_CODE="404"
1045 >                  CURL_RESULT=$CURL_CODE
1046                    let OK_LINKS-=1
1047                    let NG_LINKS+=1
1048 +               elif [[ "$PAGE_TEXT" =~ "<h2>Object moved to" ]]; then
1049 +                  STATUS="??" # have to send the code through the next block to treat the redirect properly
1050 +                  CURL_CODE="301"
1051 +                  CURL_RESULT=$CURL_CODE
1052 +                  let OK_LINKS-=1
1053                 fi
1054              fi
1055 +            
1056              break
1057           fi
1058        done
# Line 1030 | Line 1062 | for LINE in `cat "$LINKS_FILE"`; do
1062     if [ $STATUS == "??" ]; then
1063        for CODE in "${RD_CODES[@]}"; do
1064           if [[ $CODE == $CURL_CODE ]]; then
1065 <            # Get URL header again in order to retrieve the URL we are being redirected to
1066 <            NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent '$AGENT' --max-time $TIMEOUT --write-out '%{redirect_url}\n' $URL)
1065 >            # Get URL header again in order to retrieve the URL we are being redirected to, but if this
1066 >            # is a OneDrive link, we already have the new URL in $PAGE_TEXT
1067 >            if [[ $URL == *skydrive.live.com* ]]; then
1068 >               NEW_URL=${PAGE_TEXT##*href=\"}
1069 >               NEW_URL=${NEW_URL%\">here*}
1070 >            else
1071 >               NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent '$AGENT' --max-time $TIMEOUT --write-out '%{redirect_url}\n' $URL)
1072 >            fi
1073  
1074              # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
1075              # those changes out if the user didn't ask for them
# Line 1128 | Line 1166 | for LINE in `cat "$LINKS_FILE"`; do
1166        for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
1167        {
1168           EXCEPT_LINE="${EXCEPT_ARRAY[$i]}"
1169 <
1169 >        
1170           # Undo any HTML-encoding from the wiki page; for now we just worry about the ampersand, as most
1171           # other HTML-encoded characters are not found in URLs
1172           EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/\&amp;/\&/g')
# Line 1137 | Line 1175 | for LINE in `cat "$LINKS_FILE"`; do
1175           EXCEPT_URL="${EXCEPT_LINE#*,}"
1176           EXCEPT_URL="${EXCEPT_URL%,*}"
1177           if [[ "$EXCEPT_URL" =~ \* ]]; then # if this exception URL contains the '*' wildcard, use pattern-matching with it
1178 <            if [[ "$URL" =~ "$EXCEPT_URL" ]]; then
1178 >            if [[ ! "$URL" == $EXCEPT_URL ]]; then
1179                 continue
1180              fi
1181           else

Diff Legend

Removed lines
+ Added lines
< Changed lines (old)
> Changed lines (new)