--- Validate External Links/validate_external_links.sh	2023/01/13 22:26:56	1177
+++ Validate External Links/validate_external_links.sh	2023/01/23 01:51:32	1178
@@ -47,7 +47,7 @@ URL_LIMIT=0 # if non-zero, st
 UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report
 
 # Fixed strings -- see the occurrences of these variables to learn their purpose
-AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.134 Safari/537.36"
+AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
 ARCHIVE_API="http://archive.org/wayback/available"
 ARCHIVE_GENERIC="https://web.archive.org/web/*"
 ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
@@ -76,7 +76,7 @@ declare -a HTTP_TLDS_AND_PAGES=(abstract
 # if you add a new code.
 declare -a OK_CODES=(200 401 405 406 418 501)
 declare -a RD_CODES=(301 302 303 307 308)
-declare -a NG_CODES=(000 400 403 404 410 429 500 502 503 504 530)
+declare -a NG_CODES=(000 400 403 404 410 429 500 502 503 504 520 530)
 
 # Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
 # transcluded text, and if the transclusion fails, then the braces show up in the URL
@@ -123,7 +123,7 @@ END_RUN=0
 
 ### HELP OUTPUT ###
 # A pseudo-man page. Here is the 80-character rule for the page text:
-# 234567890123456789012345678901234567890123456789012345678901234567890123456789
+# 345678901234567890123456789012345678901234567890123456789012345678901234567890
 function printHelp()
 {
    cat << EOF
@@ -534,7 +534,7 @@ function wrapupAndExit()
    # Print processed link totals
    if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
    if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
-   if [ $SKIP_ARCHIVES -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVES archive.org/archive.is $(pluralCheckNoun link $SKIP_ARCHIVES) were not checked"; fi
+   if [ $SKIP_ARCHIVES -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVES archive.org/archive.is $(pluralCheckNoun link $SKIP_ARCHIVES) $(pluralCheckWas $SKIP_ARCHIVES) not checked"; fi
    if [ $LINK_PROBLEMS_TOTAL -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_TOTAL processed $(pluralCheckNoun link $LINK_PROBLEMS_TOTAL) had $(pluralCheckAn $LINK_PROBLEMS_TOTAL)$(pluralCheckNoun issue $LINK_PROBLEMS_TOTAL)"; fi
    if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED link $(pluralCheckNoun issue $LINKS_EXCEPTED) from report)"; valPrint h "  (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
    if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi
@@ -1136,8 +1136,14 @@ for LINE in `cat "$LINKS_FILE"`; do
          # Check for URL match
          EXCEPT_URL="${EXCEPT_LINE#*,}"
          EXCEPT_URL="${EXCEPT_URL%,*}"
-         if [ "$EXCEPT_URL" != "$URL" ]; then
-            continue
+         if [[ "$EXCEPT_URL" =~ \* ]]; then # if this exception URL contains the '*' wildcard, use pattern-matching with it
+            if [[ "$URL" != $EXCEPT_URL ]]; then # right-hand side is deliberately unquoted so '*' acts as a glob; skip this exception when the URL does not match it
+               continue
+            fi
+         else
+            if [ "$EXCEPT_URL" != "$URL" ]; then # otherwise just use a straight string comparison
+               continue
+            fi
          fi
 
          # Check for page name match