| 5 |   # Validates a list of external links in CSV format. The resulting logs are produced in three formats:
| 6 |   # - TXT (for easy diffing with an earlier log)
| 7 |   # - RTF (for reading as a local file with clickable links)
| 8 | < # - HTML (for uploading as a web page).
| 8 | > # - HTML (for reading as a web page)
| 9 |   # Call script with "--help" argument for documentation. Also see Read Me.rtf for critical notes.
| 10 |  #
| 11 |  # Recommended rule:
| 29 |
| 30 |  ### GLOBALS ###
| 31 |  # Settings -- these will be changed from their defaults by the arguments passed in to the script
| 32 | < LINKS_URL=""           # use 'curl' to download file with links from this location (can be file://)
| 33 | < EXCEPT_URL=""          # 'curl' will access this wiki page with a list of exceptions for NG results
| 34 | < OUTPUT_DIR=""          # place reports and all other output in a folder inside this existing folder
| 35 | < RECORD_OK_LINKS=0      # record response code to the log even when it's a value in OK_CODES
| 36 | < SHOW_SLASH=0           # record response code to the log when a slash is added to the end of a URL
| 37 | < SHOW_HTTPS=0           # record response code to the log when "http" is upgraded to "https"
| 38 | < SHOW_YT_RD=0           # record response code to the log when a youtu.be URL is expanded to the full URL
| 39 | < SUGGEST_SNAPSHOTS=0    # query the Internet Archive for a possible snapshot URL for each NG page
| 40 | < SKIP_ARCHIVE_LINKS=0   # don't check URLs under the archive.org domain
| 41 | < TAKE_PAGE_SHOT=0       # take a screenshot of each OK page
| 42 | < TIMEOUT=10             # time to wait for a response when querying a site
| 43 | < CHROME_PATH=""         # path to a copy of Google Chrome that has the command-line screenshot feature
| 44 | < URL_START=1            # start at this URL in LINKS_FILE
| 45 | < URL_LIMIT=0            # if non-zero, stop at this URL in LINKS_FILE
| 46 | < UPLOAD_INFO=""         # path to a file on your hard drive with the login info needed to upload a report
| 32 | > LINKS_URL=""           # use 'curl' to download file with links from this location (can be file://)
| 33 | > EXCEPT_URL=""          # 'curl' will access this wiki page with a list of exceptions for NG results
| 34 | > OUTPUT_DIR=""          # place reports and all other output in a folder inside this existing folder
| 35 | > RECORD_OK_LINKS=0      # record response code to the log even when it's a value in OK_CODES
| 36 | > SHOW_SLASH=0           # record issue when a slash is added to the end of a URL
| 37 | > SHOW_HTTPS=0           # record issue when "http" is upgraded to "https"
| 38 | > SHOW_YT_RD=0           # record redirection for a youtu.be URL expanding to the full URL
| 39 | > SUGGEST_SNAPSHOTS=0    # query the Internet Archive for a possible snapshot URL for each NG page
| 40 | > CHECK_ARCHIVE_LINKS=0  # check URLs under the archive.org domain
| 41 | > TAKE_PAGE_SHOT=0       # take a screenshot of each OK page
| 42 | > TIMEOUT=10             # time to wait for a response when querying a site
| 43 | > CHROME_PATH=""         # path to a copy of Google Chrome that has the command-line screenshot feature
| 44 | > URL_START=1            # start at this URL in LINKS_FILE
| 45 | > URL_LIMIT=0            # if non-zero, stop at this URL in LINKS_FILE
| 46 | > UPLOAD_INFO=""         # path to a file on your hard drive with the login info needed to upload a report
| 47 |
| 48 |  # Fixed strings -- see the occurrences of these variables to learn their purpose
| 49 |  AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 OPR/70.0.3728.154"
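The hunks that actually issue the HTTP requests are not part of this diff. Purely for orientation, here is a sketch of how AGENT and TIMEOUT would typically feed into a curl status check; the script's real invocation and flags are not shown here, so treat this as an assumption:

    # Hypothetical request sketch -- the real curl line lives outside this diff
    CURL_CODE=$(curl --head --silent --output /dev/null --user-agent "$AGENT" \
       --max-time $TIMEOUT --write-out '%{http_code}' "$URL")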
| 132 |  validate_external_links.sh --help
| 133 |  validate_external_links.sh --links URL --output DIR [--exceptions URL]
| 134 |     [--record-ok-links] [--show-added-slashes] [--show-https-upgrades]
| 135 | <  [--show-yt-redirects] [--suggest-snapshots] [--skip-archive-links]
| 135 | >  [--show-yt-redirects] [--suggest-snapshots] [--check-archive-links]
| 136 |     [--take-screenshots FILE] [--timeout NUM] [--start-url NUM]
| 137 |     [--end-url NUM] [--upload FILE]
| 138 |
| 176 |  --show-yt-redirects     Report on redirects that expand a youtu.be URL.
| 177 |  --suggest-snapshots     Query the Internet Archive for a possible
| 178 |                          snapshot URL for each "NG" page.
| 179 | < --skip-archive-links    Don't check links that are already pointing to
| 180 | <                         a page on the Internet Archive.
| 179 | > --check-archive-links   Check links that are already pointing to a page
| 180 | >                         on the Internet Archive. In theory these links
| 181 | >                         should be totally stable and not need validation.
| 182 |  --take-screenshots FILE Call the Google Chrome binary at this path to
| 183 |                          take screenshots of each "OK" page.
| 184 |  --timeout NUM           Wait this many seconds for a site to respond. The
| 218 |  --show-https-upgrades ) SHOW_HTTPS=1; shift;;
| 219 |  --show-yt-redirects ) SHOW_YT_RD=1; shift;;
| 220 |  --suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;;
| 221 | < --skip-archive-links ) SKIP_ARCHIVE_LINKS=1; shift;;
| 221 | > --check-archive-links ) CHECK_ARCHIVE_LINKS=1; shift;;
| 222 |  --take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
| 223 |  --timeout ) TIMEOUT=$2; shift 2;;
| 224 |  --start-url ) URL_START=$2; shift 2;;
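For reference, a hypothetical invocation that exercises the renamed flag (the URL and output path here are made up):

    ./validate_external_links.sh --links "file:///Users/me/links.csv" \
       --output "/Users/me/val_reports" --check-archive-links --timeout 20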
| 261 |  OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
| 262 |  SHOT_PATH="$OUTPUT_PATH/Screenshots"
| 263 |  LOG_NAME="ValExtLinks report"
| 264 | < LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt"
| 265 | < LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf"
| 266 | < LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm"
| 264 | > LOG_NAME_TXT="$LOG_NAME.txt"
| 265 | > LOG_NAME_RTF="$LOG_NAME.rtf"
| 266 | > LOG_NAME_HTM="$LOG_NAME.htm"
| 267 | > LOG_PATH="$OUTPUT_PATH/$LOG_NAME"
| 268 | > LOG_PATH_TXT="$LOG_PATH.txt"
| 269 | > LOG_PATH_RTF="$LOG_PATH.rtf"
| 270 | > LOG_PATH_HTM="$LOG_PATH.htm"
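The split into LOG_NAME_* and LOG_PATH_* variables gives the script both a bare filename (usable as a relative href in the HTML report, as the summary hunks below do) and an absolute path for writing. Assuming a hypothetical OUTPUT_PATH of /Users/me/val_reports/2020-09-01, they resolve to:

    LOG_NAME_TXT="ValExtLinks report.txt"
    LOG_PATH_TXT="/Users/me/val_reports/2020-09-01/ValExtLinks report.txt"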
| 271 |  mkdir "$OUTPUT_PATH"
| 272 |  if [ $TAKE_PAGE_SHOT -eq 1 ]; then
| 273 |    mkdir "$SHOT_PATH"
| 365 |  fi
| 366 |  if [[ "$1" == *t* ]]; then
| 367 |    if [[ "$1" == *n* ]]; then
| 368 | <    echo -n "$2" >> "$LOG_TXT"
| 368 | >    echo -n "$2" >> "$LOG_PATH_TXT"
| 369 |    elif [[ "$1" == *s* ]]; then
| 370 | <    echo -e "$2\n" >> "$LOG_TXT"
| 370 | >    echo -e "$2\n" >> "$LOG_PATH_TXT"
| 371 |    else
| 372 | <    echo "$2" >> "$LOG_TXT"
| 372 | >    echo "$2" >> "$LOG_PATH_TXT"
| 373 |    fi
| 374 |  fi
| 375 |  if [[ "$1" == *r* ]]; then
| 376 |    if [[ "$1" == *n* ]]; then
| 377 | <    echo "$2" >> "$LOG_RTF"
| 377 | >    echo "$2" >> "$LOG_PATH_RTF"
| 378 |    elif [[ "$1" == *s* ]]; then
| 379 | <    echo "$2\line\line" >> "$LOG_RTF"
| 379 | >    echo "$2\line\line" >> "$LOG_PATH_RTF"
| 380 |    else
| 381 | <    echo "$2\line" >> "$LOG_RTF"
| 381 | >    echo "$2\line" >> "$LOG_PATH_RTF"
| 382 |    fi
| 383 |  fi
| 384 |  if [[ "$1" == *h* ]]; then
| 385 |    if [[ "$1" == *s* ]]; then
| 386 | <    echo "$2<tr><td> </td></tr>" >> "$LOG_HTM"
| 386 | >    echo "$2<tr><td> </td></tr>" >> "$LOG_PATH_HTM"
| 387 |    elif [[ "$1" == *n* ]]; then
| 388 | <    echo "$2" >> "$LOG_HTM"
| 388 | >    echo "$2" >> "$LOG_PATH_HTM"
| 389 |    else
| 390 | <    echo "$2<br />" >> "$LOG_HTM"
| 390 | >    echo "$2<br />" >> "$LOG_PATH_HTM"
| 391 |    fi
| 392 |  fi
| 393 |  }
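valPrint's first argument is a string of single-letter destinations -- 't' (TXT log), 'r' (RTF log), 'h' (HTML log), and evidently 'c' for the console, judging by its use in uploadReport below -- plus modifiers 'n' (suppress the trailing newline) and 's' (add a blank line after the message). Some hypothetical calls, not taken from the diff:

    valPrint ctrh "Checking links..."   # console plus all three logs
    valPrint trs "Skipping URL..."      # TXT and RTF logs, with a blank line after
    valPrint hn "<td>"                  # HTML log only, no trailing newline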
| 442 |    fi
| 443 |  }
| 444 |
| 445 | < # Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
| 445 | > # Upload the reports using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
| 446 |  # reports being saved to disk have already been closed.
| 447 |  function uploadReport()
| 448 |  {
| 449 | <   valPrint c "Uploading HTML report..."
| 449 | >   valPrint c "Uploading reports..."
| 450 |
| 451 |    SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
| 452 |    SFTP_USER_NAME_MARKER="user:"
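Only the "user:" marker is visible in these hunks; assuming the password, port, and path markers follow the same "label:" pattern, an UPLOAD_INFO file would look something like the sketch below. The field names other than "user:" are guesses, and the values are placeholders:

    user: myname
    password: hunter2
    port: 22
    path: /var/www/reports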
| 462 |    SFTP_PATH=$(grep $SFTP_PATH_MARKER $UPLOAD_INFO)
| 463 |    SFTP_PATH=${SFTP_PATH#$SFTP_PATH_MARKER}
| 464 |
| 465 | <   expect "$SCRIPT_PATH" "$LOG_HTM" $SFTP_USER_NAME $SFTP_PASSWORD $SFTP_PORT $SFTP_PATH "$LOG_NAME.htm"
| 465 | >   for SUFFIX in htm rtf txt; do
| 466 | >     expect "$SCRIPT_PATH" "$LOG_PATH.$SUFFIX" $SFTP_USER_NAME $SFTP_PASSWORD $SFTP_PORT $SFTP_PATH "$LOG_NAME.$SUFFIX"
| 467 |
| 468 | <   valPrint c "Report was uploaded, unless an error message appears above."
| 468 | >     UPLOAD_RESULT=$?; if [ $UPLOAD_RESULT -ne 0 ]; then
| 469 | >       valPrint c "Error $UPLOAD_RESULT occurred when attempting to upload $LOG_NAME.$SUFFIX!"
| 470 | >     else
| 471 | >       valPrint c "Report in $(echo $SUFFIX | tr '[:lower:]' '[:upper:]') format was uploaded."
| 472 | >     fi
| 473 | >   done
| 474 |  }
| 475 |
| 476 |  # Prints session summary when script is done
| 504 |  LINK_PROBLEMS_IW=$((IW_LINKS-SKIP_EXPECT_IW))
| 505 |  LINK_PROBLEMS_NET=$((LINK_PROBLEMS_NG+LINK_PROBLEMS_RD+LINK_PROBLEMS_EI+LINK_PROBLEMS_IW))
| 506 |
| 507 | + # Print something in the Links section if no link issues were printed
| 508 | + if [ $LINK_PROBLEMS_NET -eq 0 ]; then
| 509 | +   valPrint h "<i>No link problems to report! See the <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for a list of links with issues that were not reported.</i>"
| 510 | + fi
| 511 | + if [ $LINK_PROBLEMS_TOTAL -eq 0 ]; then
| 512 | +   valPrint t "No link problems to report!"
| 513 | +   valPrint r "\i1 No link problems to report! \i0"
| 514 | + fi
| 515 | +
| 516 |  ## SUMMARY OUTPUT ##
| 517 |  valPrint ct "Summary ($ELAPSED):"
| 518 |  valPrint r "\b1 Summary \b0 ($ELAPSED)"
| 529 |  if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi
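pluralCheckNoun is defined in a part of the script this diff doesn't touch. A minimal sketch of the behavior the summary lines rely on; the real helper may handle irregular nouns differently:

    # Hypothetical stand-in: echo the noun in $1, pluralized when the count in $2 is not 1,
    # so "$(pluralCheckNoun error 3)" yields "errors" and "$(pluralCheckNoun error 1)" yields "error"
    function pluralCheckNoun()
    {
      if [ $2 -eq 1 ]; then echo "$1"; else echo "${1}s"; fi
    }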
| 530 |
| 531 |  # Print errored link totals
| 532 | < if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"; fi
| 532 | > if [ $LINK_ERRORS -gt 0 ]; then
| 533 | >   valPrint c "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"
| 534 | >   valPrint h "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
| 535 | >   valPrint rt "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS):"
| 536 | > fi
| 537 |  if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
| 538 |  if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
| 539 |  if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
| 542 |  if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
| 543 |
| 544 |  # Print excepted link totals
| 545 | < if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi
| 545 | > if [ $LINKS_EXCEPTED -gt 0 ]; then
| 546 | >   valPrint c "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"
| 547 | >   valPrint h "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
| 548 | >   valPrint rt "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted:"
| 549 | > fi
| 550 |  if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
| 551 |  if [ $SKIP_EXPECT_RD -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_RD/$RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
| 552 |  if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
| 665 |  if [ $SHOW_YT_RD -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
| 666 |
| 667 |  valPrint ctrhn "Check archive.org links: "
| 668 | < if [ $SKIP_ARCHIVE_LINKS -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
| 668 | > if [ $CHECK_ARCHIVE_LINKS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
| 669 |
| 670 |  valPrint tr "A summary of my findings will be found at the bottom of the report."
| 671 |  valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
| 793 |    continue
| 794 |  fi
| 795 |
| 796 | < # If we're skipping Archive.org links, check if this is one
| 797 | < if [ $SKIP_ARCHIVE_LINKS -eq 1 ] && [[ $URL == *web.archive.org* ]]; then
| 798 | <   valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have been asked not to check Wayback Machine links."
| 796 | > # If we're skipping Archive.org links, see if this is one
| 797 | > if [ $CHECK_ARCHIVE_LINKS -eq 0 ] && [[ $URL == *web.archive.org* ]]; then
| 798 | >   valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to check Wayback Machine links."
| 799 |    let SKIP_ARCHIVE_ORG+=1
| 800 |    continue
| 801 |  fi
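The test is a plain bash glob, so any Wayback Machine URL matches regardless of scheme or snapshot date. A hypothetical example:

    URL="https://web.archive.org/web/20190101000000/http://example.com/page"
    [[ $URL == *web.archive.org* ]] && echo "skipped unless --check-archive-links was passed"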
| 920 |    let EI_LINKS+=1
| 921 |  fi
| 922 |
| 923 | < # If it's not, check if this is a link to a domain that we have an interwiki prefix for
| 924 | < if [ $STATUS == "??" ]; then
| 923 | > # If it's not, check if this is a link to a domain that we have an interwiki prefix for (also make
| 924 | > # sure that it's not an archive.org link to a page from an interwiki domain)
| 925 | > if [ $STATUS == "??" ] && [[ $URL != *web.archive.org* ]]; then
| 926 |    for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
| 927 |      if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]] && [[ $URL != *${INTERWIKI_DOMAINS[$i]}/w/* ]]; then
| 928 |        STATUS="IW"