72 |
|
declare -a HTTP_TLDS_AND_PAGES=(abstract action ars asp aspx cfm cgi com css de do full htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x) |
73 |
|
|
74 |
|
# These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
75 |
< |
# are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt |
76 |
< |
# if you add a new code. |
75 |
> |
# are NG (no good). Pages that return OK codes will be screenshotted when screenshots are asked for. |
76 |
> |
# Remember to update http_codes.txt if you add a new code. |
77 |
|
declare -a OK_CODES=(200 401 405 406 418 501) |
78 |
|
declare -a RD_CODES=(301 302 303 307 308) |
79 |
|
declare -a NG_CODES=(000 400 403 404 410 429 500 502 503 504 520 530) |
1013 |
|
# If this is a YouTube link, we have to look at the actual page source to know if the video |
1014 |
|
# is good or not; override the link's info if it's actually NG |
1015 |
|
if [[ $URL == *www.youtube.com* ]]; then |
1016 |
< |
PAGE_TEXT=$(curl --silent --insecure --user-agent '$AGENT' --max-time $TIMEOUT $URL | grep "\"simpleText\":\"Video unavailable\"") |
1017 |
< |
if [ ! -z "$PAGE_TEXT" ]; then |
1016 |
> |
PAGE_TEXT=$(curl --silent --insecure --user-agent '$AGENT' --max-time $TIMEOUT $URL) |
1017 |
> |
CURL_ERR=$(echo $?) |
1018 |
> |
if [ "$CURL_ERR" != "0" ]; then |
1019 |
> |
STATUS="NG" |
1020 |
> |
CURL_RESULT="000-$CURL_ERR" |
1021 |
> |
let OK_LINKS-=1 |
1022 |
> |
let NG_LINKS+=1 |
1023 |
> |
elif [[ "$PAGE_TEXT" =~ "simpleText\":\"Video unavailable" ]]; then |
1024 |
> |
STATUS="NG" |
1025 |
> |
CURL_CODE="404" |
1026 |
> |
CURL_RESULT=$CURL_CODE |
1027 |
> |
let OK_LINKS-=1 |
1028 |
> |
let NG_LINKS+=1 |
1029 |
> |
fi |
1030 |
> |
fi |
1031 |
> |
|
1032 |
> |
# If this is a OneDrive link, we have to look at the actual page source to know if the file |
1033 |
> |
# is really still at this URL; override the link's info if it's actually NG or RD |
1034 |
> |
if [[ $URL == *skydrive.live.com* ]]; then |
1035 |
> |
PAGE_TEXT=$(curl --silent --insecure --user-agent '$AGENT' --max-time $TIMEOUT $URL) |
1036 |
> |
CURL_ERR=$(echo $?) |
1037 |
> |
if [ "$CURL_ERR" != "0" ]; then |
1038 |
> |
STATUS="NG" |
1039 |
> |
CURL_RESULT="000-$CURL_ERR" |
1040 |
> |
let OK_LINKS-=1 |
1041 |
> |
let NG_LINKS+=1 |
1042 |
> |
elif [[ "$PAGE_TEXT" =~ "<h1>Sorry, something went wrong" ]]; then |
1043 |
|
STATUS="NG" |
1044 |
< |
CURL_RESULT=404 |
1044 |
> |
CURL_CODE="404" |
1045 |
> |
CURL_RESULT=$CURL_CODE |
1046 |
|
let OK_LINKS-=1 |
1047 |
|
let NG_LINKS+=1 |
1048 |
+ |
elif [[ "$PAGE_TEXT" =~ "<h2>Object moved to" ]]; then |
1049 |
+ |
STATUS="??" # have to send the code through the next block to treat the redirect properly |
1050 |
+ |
CURL_CODE="301" |
1051 |
+ |
CURL_RESULT=$CURL_CODE |
1052 |
+ |
let OK_LINKS-=1 |
1053 |
|
fi |
1054 |
|
fi |
1055 |
+ |
|
1056 |
|
break |
1057 |
|
fi |
1058 |
|
done |
1062 |
|
if [ $STATUS == "??" ]; then |
1063 |
|
for CODE in "${RD_CODES[@]}"; do |
1064 |
|
if [[ $CODE == $CURL_CODE ]]; then |
1065 |
< |
# Get URL header again in order to retrieve the URL we are being redirected to |
1066 |
< |
NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent '$AGENT' --max-time $TIMEOUT --write-out '%{redirect_url}\n' $URL) |
1065 |
> |
# Get URL header again in order to retrieve the URL we are being redirected to, but if this |
1066 |
> |
# is a OneDrive link, we already have the new URL in $PAGE_TEXT |
1067 |
> |
if [[ $URL == *skydrive.live.com* ]]; then |
1068 |
> |
NEW_URL=${PAGE_TEXT##*href=\"} |
1069 |
> |
NEW_URL=${NEW_URL%\">here*} |
1070 |
> |
else |
1071 |
> |
NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent '$AGENT' --max-time $TIMEOUT --write-out '%{redirect_url}\n' $URL) |
1072 |
> |
fi |
1073 |
|
|
1074 |
|
# Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter |
1075 |
|
# those changes out if the user didn't ask for them |
1166 |
|
for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do |
1167 |
|
{ |
1168 |
|
EXCEPT_LINE="${EXCEPT_ARRAY[$i]}" |
1169 |
< |
|
1169 |
> |
|
1170 |
|
# Undo any HTML-encoding from the wiki page; for now we just worry about the ampersand, as most |
1171 |
|
# other HTML-encoded characters are not found in URLs |
1172 |
|
EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/\&/\&/g') |
1175 |
|
EXCEPT_URL="${EXCEPT_LINE#*,}" |
1176 |
|
EXCEPT_URL="${EXCEPT_URL%,*}" |
1177 |
|
if [[ "$EXCEPT_URL" =~ \* ]]; then # if this exception URL contains the '*' wildcard, use pattern-matching with it |
1178 |
< |
if [[ "$URL" =~ "$EXCEPT_URL" ]]; then |
1178 |
> |
if [[ ! "$URL" == $EXCEPT_URL ]]; then |
1179 |
|
continue |
1180 |
|
fi |
1181 |
|
else |