--- Validate External Links/validate_external_links.sh	2021/02/04 23:15:20	1148
+++ Validate External Links/validate_external_links.sh	2021/02/07 22:36:56	1149
@@ -76,7 +76,7 @@ declare -a HTTP_TLDS_AND_PAGES=(action a
 # if you add a new code.
 declare -a OK_CODES=(200 401 405 406 418 501)
 declare -a RD_CODES=(301 302 303 307 308)
-declare -a NG_CODES=(000 400 403 404 410 429 500 502 503 530)
+declare -a NG_CODES=(000 400 403 404 410 429 500 502 503 504 530)
 
 # Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
 # transcluded text, and if the transclusion fails, then the braces show up in the URL
@@ -777,16 +777,6 @@ for LINE in `cat "$LINKS_FILE"`; do
 	PAGE_NAME=${LINE#$NS_ID,}
 	PAGE_NAME=${PAGE_NAME%%,*}
 
-	# We don't want to consider wiki pages ending in .js, as the MW parser cannot reliably isolate URLS
-	# in JavaScript code, so it returns erroneous links
-	PAGE_NAME_SUFFIX=$(echo $PAGE_NAME | sed 's/.*\.//')
-	if [ $PAGE_NAME_SUFFIX == "js" ]; then
-		valPrint trs "Skipping URL '${LINE#$NS_ID,$PAGE_NAME,}' on line $LINK_NUM because it was found on JavaScript page '$PAGE_NAME'."
-		let SKIP_JS_PAGE+=1
-		let PAGE_LINKS+=1
-		continue
-	fi
-
 	# Build longer wiki page URLs from namespace and page names
 	FULL_PAGE_PATH=https://$WIKI_PATH/$NS_NAME:$PAGE_NAME
 	LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
@@ -797,13 +787,23 @@ for LINE in `cat "$LINKS_FILE"`; do
 		LOCAL_PAGE_PATH=$PAGE_NAME
 	fi
 
+	# We don't want to consider wiki pages ending in .js, as the MW parser cannot reliably isolate URLS
+	# in JavaScript code, so it returns erroneous links
+	PAGE_NAME_SUFFIX=$(echo $PAGE_NAME | sed 's/.*\.//')
+	if [ $PAGE_NAME_SUFFIX == "js" ]; then
+		valPrint trs "Skipping URL '${LINE#$NS_ID,$PAGE_NAME,}' on line $LINK_NUM because it was found on JavaScript page '$LOCAL_PAGE_PATH'."
+		let SKIP_JS_PAGE+=1
+		let PAGE_LINKS+=1
+		continue
+	fi
+
 	# The URL being linked to is everything after the previous two fields (this allows commas to be in
 	# the URLs, but a comma in the previous field, the page name, will break this)
 	URL=${LINE#$NS_ID,$PAGE_NAME,}
 
 	# Scan for illegal characters
 	if [[ $URL == *[$ILLEGAL_CHARS]* ]]; then
-		valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because it contains characters illegal in a URL."
+		valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because it contains characters illegal in a URL."
 		let SKIP_BAD_URL+=1
 		let PAGE_LINKS+=1
 		continue
@@ -811,7 +811,7 @@ for LINE in `cat "$LINKS_FILE"`; do
 
 	# If we're skipping Archive.org links, see if this is one
 	if [ $CHECK_ARCHIVE_LINKS -eq 0 ] && [[ $URL == *web.archive.org* ]]; then
-		valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to check Wayback Machine links."
+		valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to check Wayback Machine links."
 		let SKIP_ARCHIVE_ORG+=1
 		let PAGE_LINKS+=1
 		continue
@@ -829,7 +829,7 @@ for LINE in `cat "$LINKS_FILE"`; do
 
 	# 'sed' cannot handle Unicode in my Bash shell, so skip non-ASCII URL and make user check it
 	if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
-		valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I cannot handle non-ASCII characters."
+		valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I cannot handle non-ASCII characters."
 		let SKIP_NON_ASCII+=1
 		let PAGE_LINKS+=1
 		continue
@@ -903,7 +903,7 @@ for LINE in `cat "$LINKS_FILE"`; do
 	# If this suffix escaped identification as either a file, page or TLD, inform the user
 	STR_TYPE=""
 	if [ $IS_FILE -eq -1 ]; then
-		valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown URL ending '$POST_DOT'. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
+		valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I encountered the unknown URL ending '$POST_DOT'. Please add this ending to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
 		let SKIP_UNK_SUFFIX+=1
 		continue
 	elif [ $IS_FILE -eq 1 ]; then
@@ -1005,14 +1005,14 @@ for LINE in `cat "$LINKS_FILE"`; do
 		# If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
 		# wants those to be reported)
 		if [ $SHOW_HTTPS -eq 0 ] && [ $URL_HTTP == $NEW_URL_HTTP ]; then
-			valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show http->https upgrades, and I was redirected to '$NEW_URL'."
+			valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show http->https upgrades, and I was redirected to '$NEW_URL'."
 			STATUS="OK"
 			let OK_LINKS+=1
 			let SKIP_HTTPS_UP+=1
 		# If the URLs match besides an added ending slash, then the link is OK (unless user wants
 		# those to be reported)
 		elif [ $SHOW_SLASH -eq 0 ] && [ $URL_HTTP == $NEW_URL_NO_SLASH ]; then
-			valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show added trailing slashes, and I was redirected to '$NEW_URL'."
+			valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show added trailing slashes, and I was redirected to '$NEW_URL'."
 			STATUS="OK"
 			let OK_LINKS+=1
 			let SKIP_SLASH_ADD+=1
@@ -1024,7 +1024,7 @@ for LINE in `cat "$LINKS_FILE"`; do
 				let NG_LINKS+=1
 			else
 				if [ $SHOW_YT_RD -eq 0 ]; then
-					valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'."
+					valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'."
 					STATUS="OK"
 					let OK_LINKS+=1
 					let SKIP_YOUTU_BE+=1
@@ -1055,7 +1055,7 @@ for LINE in `cat "$LINKS_FILE"`; do
 
 	# If we didn't match a known status code, advise the reader
 	if [ $STATUS == "??" ]; then
-		valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown response code $CURL_CODE."
+		valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I encountered the unknown response code $CURL_CODE."
 		let SKIP_UNK_CODE+=1
 		continue
 	fi
@@ -1094,7 +1094,7 @@ for LINE in `cat "$LINKS_FILE"`; do
 			# Match result code
 			EXCEPT_CODE=${EXCEPT_LINE%%,*}
 			if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
-				valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because its expected result, '$EXPECT_CODE', is in the exceptions list."
+				valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because its expected result, '$EXPECT_CODE', is in the exceptions list."
 				if [ $STATUS == "EI" ]; then
 					let SKIP_EXPECT_EI+=1
 				elif [ $STATUS == "IW" ]; then
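
On the NG_CODES change: adding 504 (Gateway Timeout) means that a link returning 504 is now classified as dead ("NG") instead of falling through to the "unknown response code" skip shown in the @@ -1055 hunk. The script's actual code-matching logic lies outside the hunks above; the following is a minimal sketch of one plausible array scan, with CURL_CODE as a hypothetical sample value:

    #!/bin/bash
    # Sketch only: the real script's matching code is not part of this diff.
    declare -a NG_CODES=(000 400 403 404 410 429 500 502 503 504 530)

    CURL_CODE="504"  # hypothetical sample; the script captures curl's %{http_code}

    STATUS="??"  # codes matching no array get reported as unknown and skipped
    for CODE in "${NG_CODES[@]}"; do
    	if [ "$CODE" == "$CURL_CODE" ]; then
    		STATUS="NG"  # as of revision 1149, 504 lands here rather than "??"
    		break
    	fi
    done
    echo "Code $CURL_CODE -> status $STATUS"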
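On the relocated .js check: moving it below the block that builds FULL_PAGE_PATH and LOCAL_PAGE_PATH lets its skip message name the page the same way every other message in this revision now does. A minimal standalone sketch of the same test, with hypothetical sample values standing in for the fields the script parses out of $LINKS_FILE:

    #!/bin/bash
    # Sketch only: sample values are hypothetical, not from the real link dump.
    PAGE_NAME="Example/common.js"
    LOCAL_PAGE_PATH="User:Example/common.js"

    # sed's greedy '.*\.' consumes through the LAST dot, leaving only the final
    # suffix; a page name with no dot at all passes through unchanged.
    PAGE_NAME_SUFFIX=$(echo "$PAGE_NAME" | sed 's/.*\.//')

    if [ "$PAGE_NAME_SUFFIX" == "js" ]; then
    	# After revision 1149 the message cites $LOCAL_PAGE_PATH, which has
    	# just been assembled from the namespace and page names.
    	echo "Skipping links found on JavaScript page '$LOCAL_PAGE_PATH'."
    fi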