6 |
|
# - TXT (for easy diffing with an earlier log) |
7 |
|
# - RTF (for reading as a local file with clickable links) |
8 |
|
# - HTML (for uploading as a web page). |
9 |
< |
# Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes. |
9 |
> |
# Call script with "--help" argument for documentation. Also see Read Me.rtf for critical notes. |
10 |
|
# |
11 |
|
# Recommended rule: |
12 |
|
# |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----| |
41 |
|
TAKE_PAGE_SHOT=0 # take a screenshot of each OK page |
42 |
|
TIMEOUT=10 # time to wait for a response when querying a site |
43 |
|
CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature |
44 |
< |
URL_START=1 # start at this URL in LINKS_FILE (1 by default) |
44 |
> |
URL_START=1 # start at this URL in LINKS_FILE |
45 |
|
URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE |
46 |
|
UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report |
47 |
|
|
48 |
|
# Fixed strings -- see the occurrences of these variables to learn their purpose |
49 |
< |
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36 OPR/69.0.3686.77" |
49 |
> |
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 OPR/70.0.3728.154" |
50 |
|
ARCHIVE_API="http://archive.org/wayback/available" |
51 |
|
ARCHIVE_GENERIC="https://web.archive.org/web/*" |
52 |
|
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206" |
103 |
|
SKIP_UNK_SUFFIX=0 |
104 |
|
SKIP_UNK_CODE=0 |
105 |
|
SKIP_EXPECT_NG=0 |
106 |
+ |
SKIP_EXPECT_RD=0 |
107 |
|
SKIP_EXPECT_EI=0 |
108 |
|
SKIP_EXPECT_IW=0 |
109 |
|
SKIP_HTTPS_UP=0 |
181 |
|
--take-screenshots FILE Call the Google Chrome binary at this path to |
182 |
|
take screenshots of each "OK" page. |
183 |
|
--timeout NUM Wait this many seconds for a site to respond. The |
184 |
< |
default is 10. |
184 |
> |
default is 10. Important note: Val will attempt |
185 |
> |
to reach each URL three times, so the time taken |
186 |
> |
to ping an unresponsive site will be three times |
187 |
> |
this setting. |
188 |
|
--start-url NUM Start at this link in the links CSV file. |
189 |
|
--end-url NUM Stop at this link in the links CSV file. |
190 |
|
--upload FILE Upload report using the credentials and path |
483 |
|
|
484 |
|
# Do some math on results of session |
485 |
|
LINKS_PROCESSED=$((LINK_NUM-URL_START+1)) |
482 |
– |
LINK_PROBLEMS=$((EI_LINKS+IW_LINKS+RD_LINKS+NG_LINKS)) |
483 |
– |
LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE)) |
484 |
– |
LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_EI+SKIP_EXPECT_IW)) |
486 |
|
TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE)) |
487 |
< |
LINKS_CHECKED=$((LINKS_PROCESSED-LINK_ERRORS)) |
487 |
> |
LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE)) |
488 |
> |
LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_RD+SKIP_EXPECT_EI+SKIP_EXPECT_IW)) |
489 |
> |
LINK_PROBLEMS_TOTAL=$((NG_LINKS+RD_LINKS+EI_LINKS+IW_LINKS)) |
490 |
> |
LINK_PROBLEMS_NG=$((NG_LINKS-SKIP_EXPECT_NG)) |
491 |
> |
LINK_PROBLEMS_RD=$((RD_LINKS-SKIP_EXPECT_RD)) |
492 |
> |
LINK_PROBLEMS_EI=$((EI_LINKS-SKIP_EXPECT_EI)) |
493 |
> |
LINK_PROBLEMS_IW=$((IW_LINKS-SKIP_EXPECT_IW)) |
494 |
> |
LINK_PROBLEMS_NET=$((LINK_PROBLEMS_NG+LINK_PROBLEMS_RD+LINK_PROBLEMS_EI+LINK_PROBLEMS_IW)) |
495 |
|
|
496 |
|
## SUMMARY OUTPUT ## |
497 |
|
valPrint ct "Summary ($ELAPSED):" |
503 |
|
if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi |
504 |
|
if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi |
505 |
|
if [ $SKIP_ARCHIVE_ORG -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVE_ORG Archive.org $(pluralCheckNoun link $SKIP_ARCHIVE_ORG) were not checked"; fi |
506 |
< |
if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS processed $(pluralCheckNoun link $LINK_PROBLEMS) had $(pluralCheckAn $LINK_PROBLEMS)$(pluralCheckNoun issue $LINK_PROBLEMS)"; fi |
507 |
< |
if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; valPrint h " (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi |
506 |
> |
if [ $LINK_PROBLEMS_TOTAL -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_TOTAL processed $(pluralCheckNoun link $LINK_PROBLEMS_TOTAL) had $(pluralCheckAn $LINK_PROBLEMS_TOTAL)$(pluralCheckNoun issue $LINK_PROBLEMS_TOTAL)"; fi |
507 |
> |
if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED link $(pluralCheckNoun issue $LINKS_EXCEPTED) from report)"; valPrint h " (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi |
508 |
|
if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi |
509 |
|
if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi |
510 |
|
|
503 |
– |
# Print excepted link totals |
504 |
– |
if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi |
505 |
– |
if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi |
506 |
– |
if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi |
507 |
– |
if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi |
508 |
– |
|
511 |
|
# Print errored link totals |
512 |
|
if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"; fi |
513 |
|
if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi |
517 |
|
if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi |
518 |
|
if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi |
519 |
|
|
520 |
+ |
# Print excepted link totals |
521 |
+ |
if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi |
522 |
+ |
if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi |
523 |
+ |
if [ $SKIP_EXPECT_RD -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_RD/$RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi |
524 |
+ |
if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi |
525 |
+ |
if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi |
526 |
+ |
|
527 |
|
# Print checked link totals |
528 |
< |
if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS link $(pluralCheckNoun issue $LINK_PROBLEMS):"; fi |
529 |
< |
if [ $NG_LINKS -gt 0 ]; then valPrint ctrh "- $NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi |
530 |
< |
if [ $RD_LINKS -gt 0 ]; then valPrint ctrh "- $RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi |
531 |
< |
if [ $EI_LINKS -gt 0 ]; then valPrint ctrh "- $EI_LINKS $(pluralCheckNoun link $EI_LINKS) that could be intrawiki"; fi |
532 |
< |
if [ $IW_LINKS -gt 0 ]; then valPrint ctrh "- $IW_LINKS $(pluralCheckNoun link $IW_LINKS) that could be interwiki"; fi |
528 |
> |
if [ $LINK_PROBLEMS_NET -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS_NET link $(pluralCheckNoun issue $LINK_PROBLEMS_NET):"; fi |
529 |
> |
if [ $LINK_PROBLEMS_NG -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_NG NG $(pluralCheckNoun link $LINK_PROBLEMS_NG)"; fi |
530 |
> |
if [ $LINK_PROBLEMS_RD -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_RD $(pluralCheckNoun redirection $LINK_PROBLEMS_RD)"; fi |
531 |
> |
if [ $LINK_PROBLEMS_EI -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_EI $(pluralCheckNoun link $LINK_PROBLEMS_EI) that could be intrawiki"; fi |
532 |
> |
if [ $LINK_PROBLEMS_IW -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_IW $(pluralCheckNoun link $LINK_PROBLEMS_IW) that could be interwiki"; fi |
533 |
|
|
534 |
|
# Close the log files' markup |
535 |
|
valPrint trh "ValExtLinks says goodbye." |
624 |
|
valPrint ctrhn "Take screenshots: " |
625 |
|
if [ $TAKE_PAGE_SHOT -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi |
626 |
|
|
627 |
< |
valPrint ctrhn "Suggest Archive.org snapshots: " |
627 |
> |
valPrint ctrhn "Suggest archive.org snapshots: " |
628 |
|
if [ $SUGGEST_SNAPSHOTS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi |
629 |
|
|
630 |
|
valPrint ctrhn "Ignore slash-adding redirects: " |
870 |
|
|
871 |
|
# Get response code using 'curl' to see if this link is valid; the --insecure option avoids an |
872 |
|
# issue with sites that require HTTPS |
873 |
< |
CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent '$AGENT' --max-time $TIMEOUT --write-out '%{http_code}\n' $URL) |
873 |
> |
CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent '$AGENT' --max-time $TIMEOUT --retry 2 --write-out '%{http_code}\n' $URL) |
874 |
|
CURL_ERR=$(echo $?) |
875 |
|
CURL_RESULT=$CURL_CODE |
876 |
|
|
1006 |
|
{ |
1007 |
|
EXCEPT_LINE="${EXCEPT_ARRAY[$i]}" |
1008 |
|
|
1009 |
+ |
# Undo any HTML-encoding from the wiki page; for now we just worry about the ampersand, as most |
1010 |
+ |
# other HTML-encoded characters are not found in URLs |
1011 |
+ |
EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/\&amp;/\&/') |
1012 |
+ |
|
1013 |
|
# Match URL |
1014 |
|
EXCEPT_URL="${EXCEPT_LINE#*,}" |
1015 |
|
EXCEPT_URL="${EXCEPT_URL%,*}" |
1029 |
|
let SKIP_EXPECT_EI+=1 |
1030 |
|
elif [ $STATUS == "IW" ]; then |
1031 |
|
let SKIP_EXPECT_IW+=1 |
1032 |
+ |
elif [ $STATUS == "RD" ]; then |
1033 |
+ |
let SKIP_EXPECT_RD+=1 |
1034 |
|
else |
1035 |
|
let SKIP_EXPECT_NG+=1 |
1036 |
|
fi |