47 |
|
UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report |
48 |
|
|
49 |
|
# Fixed strings -- see the occurrences of these variables to learn their purpose |
50 |
< |
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 OPR/70.0.3728.154" |
50 |
> |
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/88.0.4324.146 Safari/537.36" |
51 |
|
ARCHIVE_API="http://archive.org/wayback/available" |
52 |
|
ARCHIVE_GENERIC="https://web.archive.org/web/*" |
53 |
|
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206" |
76 |
|
# if you add a new code. |
77 |
|
declare -a OK_CODES=(200 401 405 406 418 501) |
78 |
|
declare -a RD_CODES=(301 302 303 307 308) |
79 |
< |
declare -a NG_CODES=(000 400 403 404 410 500 502 503 530) |
79 |
> |
declare -a NG_CODES=(000 400 403 404 410 429 500 502 503 530) |
80 |
|
|
81 |
|
# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using |
82 |
|
# transcluded text, and if the transclusion fails, then the braces show up in the URL |
720 |
|
if [ $SKIPPED_HEADER_ROW -eq 0 ]; then |
721 |
|
if [ $LINE == "namespace,title,target" ]; then |
722 |
|
SKIPPED_HEADER_ROW=1 |
723 |
< |
LINK_NUM=0 # this line is it's not a link, so reset the link counter |
723 |
> |
LINK_NUM=0 # this line is not a link, so reset the link counter |
724 |
|
valPrint hn "<table>" |
725 |
|
continue |
726 |
|
else |
768 |
|
valPrint trs "Skipping line $LINK_NUM ('$LINE') because I could not find a name for namespace ID $NS_ID." |
769 |
|
fi |
770 |
|
let SKIP_UNK_NS+=1 |
771 |
+ |
let PAGE_LINKS+=1 |
772 |
|
continue |
773 |
|
fi |
774 |
|
|
783 |
|
if [ $PAGE_NAME_SUFFIX == "js" ]; then |
784 |
|
valPrint trs "Skipping URL '${LINE#$NS_ID,$PAGE_NAME,}' on line $LINK_NUM because it was found on JavaScript page '$PAGE_NAME'." |
785 |
|
let SKIP_JS_PAGE+=1 |
786 |
+ |
let PAGE_LINKS+=1 |
787 |
|
continue |
788 |
|
fi |
789 |
|
|
805 |
|
if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then |
806 |
|
valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because it contains characters illegal in a URL." |
807 |
|
let SKIP_BAD_URL+=1 |
808 |
+ |
let PAGE_LINKS+=1 |
809 |
|
continue |
810 |
|
fi |
811 |
|
|
813 |
|
if [ $CHECK_ARCHIVE_LINKS -eq 0 ] && [[ $URL == *web.archive.org* ]]; then |
814 |
|
valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to check Wayback Machine links." |
815 |
|
let SKIP_ARCHIVE_ORG+=1 |
816 |
+ |
let PAGE_LINKS+=1 |
817 |
|
continue |
818 |
|
fi |
819 |
|
|
831 |
|
if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then |
832 |
|
valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I cannot handle non-ASCII characters." |
833 |
|
let SKIP_NON_ASCII+=1 |
834 |
+ |
let PAGE_LINKS+=1 |
835 |
|
continue |
836 |
|
fi |
837 |
|
|
909 |
|
elif [ $IS_FILE -eq 1 ]; then |
910 |
|
STR_TYPE="file" |
911 |
|
let FILE_LINKS+=1 |
912 |
< |
elif [ $IS_FILE -eq 0 ]; then |
912 |
> |
else |
913 |
|
STR_TYPE="page" |
914 |
|
let PAGE_LINKS+=1 |
915 |
|
fi |
957 |
|
if [[ $CODE == $CURL_CODE ]]; then |
958 |
|
STATUS="OK" |
959 |
|
let OK_LINKS+=1 |
960 |
+ |
|
961 |
+ |
# If this is a YouTube link, we have to look at the actual page source to know if the video |
962 |
+ |
# is good or not |
963 |
+ |
if [[ $URL == *www.youtube.com* ]]; then |
964 |
+ |
PAGE_TEXT=$(curl --silent --insecure --user-agent "$AGENT" --max-time $TIMEOUT "$URL" | grep "\"simpleText\":\"Video unavailable\"") # AGENT is double-quoted so the shell expands it (single quotes would send the literal text $AGENT); URL is quoted to prevent word splitting and globbing |
965 |
+ |
if [ ! -z "$PAGE_TEXT" ]; then |
966 |
+ |
STATUS="NG" |
967 |
+ |
let OK_LINKS-=1 |
968 |
+ |
let NG_LINKS+=1 |
969 |
+ |
fi |
970 |
+ |
fi |
971 |
|
break |
972 |
|
fi |
973 |
|
done |
1016 |
|
STATUS="OK" |
1017 |
|
let OK_LINKS+=1 |
1018 |
|
let SKIP_SLASH_ADD+=1 |
1019 |
< |
elif [ $SHOW_YT_RD -eq 0 ] && [ $YOUTU_BE -eq 1 ]; then |
1020 |
< |
valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'." |
1021 |
< |
STATUS="OK" |
1022 |
< |
let OK_LINKS+=1 |
1023 |
< |
let SKIP_YOUTU_BE+=1 |
1019 |
> |
elif [ $YOUTU_BE -eq 1 ]; then |
1020 |
> |
# We have to look at the actual page source to know if a YouTube video is good or not |
1021 |
> |
PAGE_TEXT=$(curl --silent --insecure --user-agent "$AGENT" --max-time $TIMEOUT "$NEW_URL" | grep "\"simpleText\":\"Video unavailable\"") # AGENT is double-quoted so the shell expands it (single quotes would send the literal text $AGENT); NEW_URL is quoted to prevent word splitting and globbing |
1022 |
> |
if [ ! -z "$PAGE_TEXT" ]; then |
1023 |
> |
STATUS="NG" |
1024 |
> |
let NG_LINKS+=1 |
1025 |
> |
else |
1026 |
> |
if [ $SHOW_YT_RD -eq 0 ]; then |
1027 |
> |
valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'." |
1028 |
> |
STATUS="OK" |
1029 |
> |
let OK_LINKS+=1 |
1030 |
> |
let SKIP_YOUTU_BE+=1 |
1031 |
> |
else |
1032 |
> |
STATUS="RD" |
1033 |
> |
let RD_LINKS+=1 |
1034 |
> |
fi |
1035 |
> |
fi |
1036 |
|
else |
1037 |
|
STATUS="RD" |
1038 |
|
let RD_LINKS+=1 |