--- Validate External Links/validate_external_links.sh 2020/09/04 02:54:30 1141 +++ Validate External Links/validate_external_links.sh 2020/09/04 03:07:08 1142 @@ -6,7 +6,7 @@ # - TXT (for easy diffing with an earlier log) # - RTF (for reading as a local file with clickable links) # - HTML (for uploading as a web page). -# Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes. +# Call script with "--help" argument for documentation. Also see Read Me.rtf for critical notes. # # Recommended rule: # |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----| @@ -41,12 +41,12 @@ SKIP_ARCHIVE_LINKS=0 # don't check URLs TAKE_PAGE_SHOT=0 # take a screenshot of each OK page TIMEOUT=10 # time to wait for a response when querying a site CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature -URL_START=1 # start at this URL in LINKS_FILE (1 by default) +URL_START=1 # start at this URL in LINKS_FILE URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report # Fixed strings -- see the occurrences of these variables to learn their purpose -AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36 OPR/69.0.3686.77" +AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 OPR/70.0.3728.154" ARCHIVE_API="http://archive.org/wayback/available" ARCHIVE_GENERIC="https://web.archive.org/web/*" ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206" @@ -103,6 +103,7 @@ SKIP_NON_ASCII=0 SKIP_UNK_SUFFIX=0 SKIP_UNK_CODE=0 SKIP_EXPECT_NG=0 +SKIP_EXPECT_RD=0 SKIP_EXPECT_EI=0 SKIP_EXPECT_IW=0 SKIP_HTTPS_UP=0 @@ -180,7 +181,10 @@ OPTIONS --take-screenshots FILE Call the Google Chrome binary at this path to take screenshots 
of each "OK" page. --timeout NUM Wait this many seconds for a site to respond. The - default is 10. + default is 10. Important note: Val will attempt + to reach each URL three times, so the time taken + to ping an unresponsive site will be three times + this setting. --start-url NUM Start at this link in the links CSV file. --end-url NUM Stop at this link in the links CSV file. --upload FILE Upload report using the credentials and path @@ -479,11 +483,15 @@ function wrapupAndExit() # Do some math on results of session LINKS_PROCESSED=$((LINK_NUM-URL_START+1)) - LINK_PROBLEMS=$((EI_LINKS+IW_LINKS+RD_LINKS+NG_LINKS)) - LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE)) - LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_EI+SKIP_EXPECT_IW)) TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE)) - LINKS_CHECKED=$((LINKS_PROCESSED-LINK_ERRORS)) + LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE)) + LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_RD+SKIP_EXPECT_EI+SKIP_EXPECT_IW)) + LINK_PROBLEMS_TOTAL=$((NG_LINKS+RD_LINKS+EI_LINKS+IW_LINKS)) + LINK_PROBLEMS_NG=$((NG_LINKS-SKIP_EXPECT_NG)) + LINK_PROBLEMS_RD=$((RD_LINKS-SKIP_EXPECT_RD)) + LINK_PROBLEMS_EI=$((EI_LINKS-SKIP_EXPECT_EI)) + LINK_PROBLEMS_IW=$((IW_LINKS-SKIP_EXPECT_IW)) + LINK_PROBLEMS_NET=$((LINK_PROBLEMS_NG+LINK_PROBLEMS_RD+LINK_PROBLEMS_EI+LINK_PROBLEMS_IW)) ## SUMMARY OUTPUT ## valPrint ct "Summary ($ELAPSED):" @@ -495,17 +503,11 @@ function wrapupAndExit() if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi if [ $SKIP_ARCHIVE_ORG -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVE_ORG Archive.org $(pluralCheckNoun link $SKIP_ARCHIVE_ORG) were not checked"; fi - if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "- 
$LINK_PROBLEMS processed $(pluralCheckNoun link $LINK_PROBLEMS) had $(pluralCheckAn $LINK_PROBLEMS)$(pluralCheckNoun issue $LINK_PROBLEMS)"; fi - if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; valPrint h "&nbsp;&nbsp;(excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi + if [ $LINK_PROBLEMS_TOTAL -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_TOTAL processed $(pluralCheckNoun link $LINK_PROBLEMS_TOTAL) had $(pluralCheckAn $LINK_PROBLEMS_TOTAL)$(pluralCheckNoun issue $LINK_PROBLEMS_TOTAL)"; fi + if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED link $(pluralCheckNoun issue $LINKS_EXCEPTED) from report)"; valPrint h "&nbsp;&nbsp;(excepted $LINKS_EXCEPTED link $(pluralCheckNoun issue $LINKS_EXCEPTED) from report)"; fi if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h "&nbsp;&nbsp;(counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi - # Print excepted link totals - if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi - if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi - if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi - if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi - # Print errored link totals if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"; fi if [ $SKIP_UNK_NS
-gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi @@ -515,12 +517,19 @@ function wrapupAndExit() if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi + # Print excepted link totals + if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi + if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi + if [ $SKIP_EXPECT_RD -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_RD/$RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi + if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi + if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi + # Print checked link totals - if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS link $(pluralCheckNoun issue $LINK_PROBLEMS):"; fi - if [ $NG_LINKS -gt 0 ]; then valPrint ctrh "- $NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi - if [ $RD_LINKS -gt 0 ]; then valPrint ctrh "- $RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi - if [ $EI_LINKS -gt 0 ]; then valPrint ctrh "- $EI_LINKS $(pluralCheckNoun link $EI_LINKS) that could be intrawiki"; fi - if [ $IW_LINKS -gt 0 ]; then valPrint ctrh "- $IW_LINKS $(pluralCheckNoun link $IW_LINKS) that could be interwiki"; fi + if [ $LINK_PROBLEMS_NET -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS_NET link $(pluralCheckNoun issue $LINK_PROBLEMS_NET):"; fi + if [ $LINK_PROBLEMS_NG -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_NG NG $(pluralCheckNoun link $LINK_PROBLEMS_NG)"; fi + if [ 
$LINK_PROBLEMS_RD -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_RD $(pluralCheckNoun redirection $LINK_PROBLEMS_RD)"; fi + if [ $LINK_PROBLEMS_EI -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_EI $(pluralCheckNoun link $LINK_PROBLEMS_EI) that could be intrawiki"; fi + if [ $LINK_PROBLEMS_IW -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_IW $(pluralCheckNoun link $LINK_PROBLEMS_IW) that could be interwiki"; fi # Close the log files' markup valPrint trh "ValExtLinks says goodbye." @@ -615,7 +624,7 @@ if [ $RECORD_OK_LINKS -eq 1 ]; then valP valPrint ctrhn "Take screenshots: " if [ $TAKE_PAGE_SHOT -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi -valPrint ctrhn "Suggest Archive.org snapshots: " +valPrint ctrhn "Suggest archive.org snapshots: " if [ $SUGGEST_SNAPSHOTS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi valPrint ctrhn "Ignore slash-adding redirects: " @@ -861,7 +870,7 @@ for LINE in `cat "$LINKS_FILE"`; do # Get response code using 'curl' to see if this link is valid; the --insecure option avoids an # issue with sites that require HTTPS - CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent '$AGENT' --max-time $TIMEOUT --write-out '%{http_code}\n' $URL) + CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time $TIMEOUT --retry 2 --write-out '%{http_code}\n' $URL) CURL_ERR=$(echo $?)
CURL_RESULT=$CURL_CODE @@ -997,6 +1006,10 @@ for LINE in `cat "$LINKS_FILE"`; do { EXCEPT_LINE="${EXCEPT_ARRAY[$i]}" + # Undo any HTML-encoding from the wiki page; for now we just worry about the ampersand, as most + # other HTML-encoded characters are not found in URLs + EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/\&amp;/\&/') + # Match URL EXCEPT_URL="${EXCEPT_LINE#*,}" EXCEPT_URL="${EXCEPT_URL%,*}" @@ -1016,6 +1029,8 @@ for LINE in `cat "$LINKS_FILE"`; do let SKIP_EXPECT_EI+=1 elif [ $STATUS == "IW" ]; then let SKIP_EXPECT_IW+=1 + elif [ $STATUS == "RD" ]; then + let SKIP_EXPECT_RD+=1 else let SKIP_EXPECT_NG+=1 fi