| 6 |
|
# - TXT (for easy diffing with an earlier log) |
| 7 |
|
# - RTF (for reading as a local file with clickable links) |
| 8 |
|
# - HTML (for uploading as a web page). |
| 9 |
< |
# Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes. |
| 9 |
> |
# Call script with "--help" argument for documentation. Also see Read Me.rtf for critical notes. |
| 10 |
|
# |
| 11 |
|
# Recommended rule: |
| 12 |
|
# |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----| |
| 41 |
|
TAKE_PAGE_SHOT=0 # take a screenshot of each OK page |
| 42 |
|
TIMEOUT=10 # time to wait for a response when querying a site |
| 43 |
|
CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature |
| 44 |
< |
URL_START=1 # start at this URL in LINKS_FILE (1 by default) |
| 44 |
> |
URL_START=1 # start at this URL in LINKS_FILE |
| 45 |
|
URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE |
| 46 |
|
UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report |
| 47 |
|
|
| 48 |
|
# Fixed strings -- see the occurrences of these variables to learn their purpose |
| 49 |
< |
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36 OPR/69.0.3686.77" |
| 49 |
> |
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 OPR/70.0.3728.154" |
| 50 |
|
ARCHIVE_API="http://archive.org/wayback/available" |
| 51 |
|
ARCHIVE_GENERIC="https://web.archive.org/web/*" |
| 52 |
|
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206" |
| 103 |
|
SKIP_UNK_SUFFIX=0 |
| 104 |
|
SKIP_UNK_CODE=0 |
| 105 |
|
SKIP_EXPECT_NG=0 |
| 106 |
+ |
SKIP_EXPECT_RD=0 |
| 107 |
|
SKIP_EXPECT_EI=0 |
| 108 |
|
SKIP_EXPECT_IW=0 |
| 109 |
|
SKIP_HTTPS_UP=0 |
| 181 |
|
--take-screenshots FILE Call the Google Chrome binary at this path to |
| 182 |
|
take screenshots of each "OK" page. |
| 183 |
|
--timeout NUM Wait this many seconds for a site to respond. The |
| 184 |
< |
default is 10. |
| 184 |
> |
default is 10. Important note: Val will attempt |
| 185 |
> |
to reach each URL three times, so the time taken |
| 186 |
> |
to ping an unresponsive site will be three times |
| 187 |
> |
this setting. |
| 188 |
|
--start-url NUM Start at this link in the links CSV file. |
| 189 |
|
--end-url NUM Stop at this link in the links CSV file. |
| 190 |
|
--upload FILE Upload report using the credentials and path |
| 483 |
|
|
| 484 |
|
# Do some math on results of session |
| 485 |
|
LINKS_PROCESSED=$((LINK_NUM-URL_START+1)) |
| 482 |
– |
LINK_PROBLEMS=$((EI_LINKS+IW_LINKS+RD_LINKS+NG_LINKS)) |
| 483 |
– |
LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE)) |
| 484 |
– |
LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_EI+SKIP_EXPECT_IW)) |
| 486 |
|
TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE)) |
| 487 |
< |
LINKS_CHECKED=$((LINKS_PROCESSED-LINK_ERRORS)) |
| 487 |
> |
LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE)) |
| 488 |
> |
LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_RD+SKIP_EXPECT_EI+SKIP_EXPECT_IW)) |
| 489 |
> |
LINK_PROBLEMS_TOTAL=$((NG_LINKS+RD_LINKS+EI_LINKS+IW_LINKS)) |
| 490 |
> |
LINK_PROBLEMS_NG=$((NG_LINKS-SKIP_EXPECT_NG)) |
| 491 |
> |
LINK_PROBLEMS_RD=$((RD_LINKS-SKIP_EXPECT_RD)) |
| 492 |
> |
LINK_PROBLEMS_EI=$((EI_LINKS-SKIP_EXPECT_EI)) |
| 493 |
> |
LINK_PROBLEMS_IW=$((IW_LINKS-SKIP_EXPECT_IW)) |
| 494 |
> |
LINK_PROBLEMS_NET=$((LINK_PROBLEMS_NG+LINK_PROBLEMS_RD+LINK_PROBLEMS_EI+LINK_PROBLEMS_IW)) |
| 495 |
|
|
| 496 |
|
## SUMMARY OUTPUT ## |
| 497 |
|
valPrint ct "Summary ($ELAPSED):" |
| 503 |
|
if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi |
| 504 |
|
if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi |
| 505 |
|
if [ $SKIP_ARCHIVE_ORG -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVE_ORG Archive.org $(pluralCheckNoun link $SKIP_ARCHIVE_ORG) were not checked"; fi |
| 506 |
< |
if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS processed $(pluralCheckNoun link $LINK_PROBLEMS) had $(pluralCheckAn $LINK_PROBLEMS)$(pluralCheckNoun issue $LINK_PROBLEMS)"; fi |
| 507 |
< |
if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; valPrint h " (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi |
| 506 |
> |
if [ $LINK_PROBLEMS_TOTAL -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_TOTAL processed $(pluralCheckNoun link $LINK_PROBLEMS_TOTAL) had $(pluralCheckAn $LINK_PROBLEMS_TOTAL)$(pluralCheckNoun issue $LINK_PROBLEMS_TOTAL)"; fi |
| 507 |
> |
if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED link $(pluralCheckNoun issue $LINKS_EXCEPTED) from report)"; valPrint h " (excepted $LINKS_EXCEPTED link $(pluralCheckNoun issue $LINKS_EXCEPTED) from report)"; fi # HTML wording matches the console/TXT/RTF wording: the count is of excepted link issues |
| 508 |
|
if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi |
| 509 |
|
if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi |
| 510 |
|
|
| 503 |
– |
# Print excepted link totals |
| 504 |
– |
if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi |
| 505 |
– |
if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi |
| 506 |
– |
if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi |
| 507 |
– |
if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi |
| 508 |
– |
|
| 511 |
|
# Print errored link totals |
| 512 |
|
if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"; fi |
| 513 |
|
if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi |
| 517 |
|
if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi |
| 518 |
|
if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi |
| 519 |
|
|
| 520 |
+ |
# Print excepted link totals |
| 521 |
+ |
if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi |
| 522 |
+ |
if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi |
| 523 |
+ |
if [ $SKIP_EXPECT_RD -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_RD/$RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi |
| 524 |
+ |
if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi |
| 525 |
+ |
if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential interwiki $(pluralCheckNoun link $IW_LINKS)"; fi # IW counts links that could be interwiki; the intrawiki category is EI |
| 526 |
+ |
|
| 527 |
|
# Print checked link totals |
| 528 |
< |
if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS link $(pluralCheckNoun issue $LINK_PROBLEMS):"; fi |
| 529 |
< |
if [ $NG_LINKS -gt 0 ]; then valPrint ctrh "- $NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi |
| 530 |
< |
if [ $RD_LINKS -gt 0 ]; then valPrint ctrh "- $RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi |
| 531 |
< |
if [ $EI_LINKS -gt 0 ]; then valPrint ctrh "- $EI_LINKS $(pluralCheckNoun link $EI_LINKS) that could be intrawiki"; fi |
| 532 |
< |
if [ $IW_LINKS -gt 0 ]; then valPrint ctrh "- $IW_LINKS $(pluralCheckNoun link $IW_LINKS) that could be interwiki"; fi |
| 528 |
> |
if [ $LINK_PROBLEMS_NET -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS_NET link $(pluralCheckNoun issue $LINK_PROBLEMS_NET):"; fi |
| 529 |
> |
if [ $LINK_PROBLEMS_NG -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_NG NG $(pluralCheckNoun link $LINK_PROBLEMS_NG)"; fi |
| 530 |
> |
if [ $LINK_PROBLEMS_RD -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_RD $(pluralCheckNoun redirection $LINK_PROBLEMS_RD)"; fi |
| 531 |
> |
if [ $LINK_PROBLEMS_EI -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_EI $(pluralCheckNoun link $LINK_PROBLEMS_EI) that could be intrawiki"; fi |
| 532 |
> |
if [ $LINK_PROBLEMS_IW -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_IW $(pluralCheckNoun link $LINK_PROBLEMS_IW) that could be interwiki"; fi |
| 533 |
|
|
| 534 |
|
# Close the log files' markup |
| 535 |
|
valPrint trh "ValExtLinks says goodbye." |
| 624 |
|
valPrint ctrhn "Take screenshots: " |
| 625 |
|
if [ $TAKE_PAGE_SHOT -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi |
| 626 |
|
|
| 627 |
< |
valPrint ctrhn "Suggest Archive.org snapshots: " |
| 627 |
> |
valPrint ctrhn "Suggest archive.org snapshots: " |
| 628 |
|
if [ $SUGGEST_SNAPSHOTS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi |
| 629 |
|
|
| 630 |
|
valPrint ctrhn "Ignore slash-adding redirects: " |
| 870 |
|
|
| 871 |
|
# Get response code using 'curl' to see if this link is valid; the --insecure option avoids an |
| 872 |
|
# issue with sites that require HTTPS |
| 873 |
< |
CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent '$AGENT' --max-time $TIMEOUT --write-out '%{http_code}\n' $URL) |
| 873 |
> |
CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time "$TIMEOUT" --retry 2 --write-out '%{http_code}\n' "$URL") # double quotes so the agent string itself is sent (single quotes sent the literal text $AGENT) and the URL stays one argument |
| 874 |
|
CURL_ERR=$(echo $?) |
| 875 |
|
CURL_RESULT=$CURL_CODE |
| 876 |
|
|
| 1006 |
|
{ |
| 1007 |
|
EXCEPT_LINE="${EXCEPT_ARRAY[$i]}" |
| 1008 |
|
|
| 1009 |
+ |
# Undo any HTML-encoding from the wiki page; for now we just worry about the ampersand, as most |
| 1010 |
+ |
# other HTML-encoded characters are not found in URLs |
| 1011 |
+ |
EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/&amp;/\&/g') # turn every HTML-encoded "&amp;" back into a bare "&" (the backslash keeps sed from reading & as "matched text") |
| 1012 |
+ |
|
| 1013 |
|
# Match URL |
| 1014 |
|
EXCEPT_URL="${EXCEPT_LINE#*,}" |
| 1015 |
|
EXCEPT_URL="${EXCEPT_URL%,*}" |
| 1029 |
|
let SKIP_EXPECT_EI+=1 |
| 1030 |
|
elif [ $STATUS == "IW" ]; then |
| 1031 |
|
let SKIP_EXPECT_IW+=1 |
| 1032 |
+ |
elif [ $STATUS == "RD" ]; then |
| 1033 |
+ |
let SKIP_EXPECT_RD+=1 |
| 1034 |
|
else |
| 1035 |
|
let SKIP_EXPECT_NG+=1 |
| 1036 |
|
fi |