| 47 |
|
UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report |
| 48 |
|
|
| 49 |
|
# Fixed strings -- see the occurrences of these variables to learn their purpose |
| 50 |
< |
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 OPR/70.0.3728.154" |
| 50 |
> |
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/88.0.4324.146 Safari/537.36" |
| 51 |
|
ARCHIVE_API="http://archive.org/wayback/available" |
| 52 |
|
ARCHIVE_GENERIC="https://web.archive.org/web/*" |
| 53 |
|
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206" |
| 76 |
|
# if you add a new code. |
| 77 |
|
declare -a OK_CODES=(200 401 405 406 418 501) |
| 78 |
|
declare -a RD_CODES=(301 302 303 307 308) |
| 79 |
< |
declare -a NG_CODES=(000 400 403 404 410 500 502 503 530) |
| 79 |
> |
declare -a NG_CODES=(000 400 403 404 410 429 500 502 503 530) |
| 80 |
|
|
| 81 |
|
# Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using |
| 82 |
|
# transcluded text, and if the transclusion fails, then the braces show up in the URL |
| 720 |
|
if [ $SKIPPED_HEADER_ROW -eq 0 ]; then |
| 721 |
|
if [ $LINE == "namespace,title,target" ]; then |
| 722 |
|
SKIPPED_HEADER_ROW=1 |
| 723 |
< |
LINK_NUM=0 # this line is it's not a link, so reset the link counter |
| 723 |
> |
LINK_NUM=0 # this line is not a link, so reset the link counter |
| 724 |
|
valPrint hn "<table>" |
| 725 |
|
continue |
| 726 |
|
else |
| 768 |
|
valPrint trs "Skipping line $LINK_NUM ('$LINE') because I could not find a name for namespace ID $NS_ID." |
| 769 |
|
fi |
| 770 |
|
let SKIP_UNK_NS+=1 |
| 771 |
+ |
let PAGE_LINKS+=1 |
| 772 |
|
continue |
| 773 |
|
fi |
| 774 |
|
|
| 783 |
|
if [ $PAGE_NAME_SUFFIX == "js" ]; then |
| 784 |
|
valPrint trs "Skipping URL '${LINE#$NS_ID,$PAGE_NAME,}' on line $LINK_NUM because it was found on JavaScript page '$PAGE_NAME'." |
| 785 |
|
let SKIP_JS_PAGE+=1 |
| 786 |
+ |
let PAGE_LINKS+=1 |
| 787 |
|
continue |
| 788 |
|
fi |
| 789 |
|
|
| 805 |
|
if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then |
| 806 |
|
valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because it contains characters illegal in a URL." |
| 807 |
|
let SKIP_BAD_URL+=1 |
| 808 |
+ |
let PAGE_LINKS+=1 |
| 809 |
|
continue |
| 810 |
|
fi |
| 811 |
|
|
| 813 |
|
if [ $CHECK_ARCHIVE_LINKS -eq 0 ] && [[ $URL == *web.archive.org* ]]; then |
| 814 |
|
valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to check Wayback Machine links." |
| 815 |
|
let SKIP_ARCHIVE_ORG+=1 |
| 816 |
+ |
let PAGE_LINKS+=1 |
| 817 |
|
continue |
| 818 |
|
fi |
| 819 |
|
|
| 831 |
|
if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then |
| 832 |
|
valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I cannot handle non-ASCII characters." |
| 833 |
|
let SKIP_NON_ASCII+=1 |
| 834 |
+ |
let PAGE_LINKS+=1 |
| 835 |
|
continue |
| 836 |
|
fi |
| 837 |
|
|
| 909 |
|
elif [ $IS_FILE -eq 1 ]; then |
| 910 |
|
STR_TYPE="file" |
| 911 |
|
let FILE_LINKS+=1 |
| 912 |
< |
elif [ $IS_FILE -eq 0 ]; then |
| 912 |
> |
else |
| 913 |
|
STR_TYPE="page" |
| 914 |
|
let PAGE_LINKS+=1 |
| 915 |
|
fi |
| 957 |
|
if [[ $CODE == $CURL_CODE ]]; then |
| 958 |
|
STATUS="OK" |
| 959 |
|
let OK_LINKS+=1 |
| 960 |
+ |
|
| 961 |
+ |
# If this is a YouTube link, we have to look at the actual page source to know if the video |
| 962 |
+ |
# is good or not |
| 963 |
+ |
if [[ $URL == *www.youtube.com* ]]; then |
| 964 |
+ |
PAGE_TEXT=$(curl --silent --insecure --user-agent "$AGENT" --max-time $TIMEOUT "$URL" | grep "\"simpleText\":\"Video unavailable\"") # AGENT and URL are double-quoted so the variable expands and the URL is passed as a single argument |
| 965 |
+ |
if [ ! -z "$PAGE_TEXT" ]; then |
| 966 |
+ |
STATUS="NG" |
| 967 |
+ |
let OK_LINKS-=1 |
| 968 |
+ |
let NG_LINKS+=1 |
| 969 |
+ |
fi |
| 970 |
+ |
fi |
| 971 |
|
break |
| 972 |
|
fi |
| 973 |
|
done |
| 1016 |
|
STATUS="OK" |
| 1017 |
|
let OK_LINKS+=1 |
| 1018 |
|
let SKIP_SLASH_ADD+=1 |
| 1019 |
< |
elif [ $SHOW_YT_RD -eq 0 ] && [ $YOUTU_BE -eq 1 ]; then |
| 1020 |
< |
valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'." |
| 1021 |
< |
STATUS="OK" |
| 1022 |
< |
let OK_LINKS+=1 |
| 1023 |
< |
let SKIP_YOUTU_BE+=1 |
| 1019 |
> |
elif [ $YOUTU_BE -eq 1 ]; then |
| 1020 |
> |
# We have to look at the actual page source to know if a YouTube video is good or not |
| 1021 |
> |
PAGE_TEXT=$(curl --silent --insecure --user-agent "$AGENT" --max-time $TIMEOUT "$NEW_URL" | grep "\"simpleText\":\"Video unavailable\"") # AGENT and NEW_URL are double-quoted so the variable expands and the URL is passed as a single argument |
| 1022 |
> |
if [ ! -z "$PAGE_TEXT" ]; then |
| 1023 |
> |
STATUS="NG" |
| 1024 |
> |
let NG_LINKS+=1 |
| 1025 |
> |
else |
| 1026 |
> |
if [ $SHOW_YT_RD -eq 0 ]; then |
| 1027 |
> |
valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'." |
| 1028 |
> |
STATUS="OK" |
| 1029 |
> |
let OK_LINKS+=1 |
| 1030 |
> |
let SKIP_YOUTU_BE+=1 |
| 1031 |
> |
else |
| 1032 |
> |
STATUS="RD" |
| 1033 |
> |
let RD_LINKS+=1 |
| 1034 |
> |
fi |
| 1035 |
> |
fi |
| 1036 |
|
else |
| 1037 |
|
STATUS="RD" |
| 1038 |
|
let RD_LINKS+=1 |