5 |
|
# Validates a list of external links in CSV format. The resulting logs are produced in three formats: |
6 |
|
# - TXT (for easy diffing with an earlier log) |
7 |
|
# - RTF (for reading as a local file with clickable links) |
8 |
< |
# - HTML (for uploading as a web page). |
8 |
> |
# - HTML (for reading as a web page) |
9 |
|
# Call script with "--help" argument for documentation. Also see Read Me.rtf for critical notes. |
10 |
|
# |
11 |
|
# Recommended rule: |
29 |
|
|
30 |
|
### GLOBALS ### |
31 |
|
# Settings -- these will be changed from their defaults by the arguments passed in to the script |
32 |
< |
LINKS_URL="" # use 'curl' to download file with links from this location (can be file://) |
33 |
< |
EXCEPT_URL="" # 'curl' will access this wiki page with a list of exceptions for NG results |
34 |
< |
OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder |
35 |
< |
RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES |
36 |
< |
SHOW_SLASH=0 # record response code to the log when a slash is added to the end of a URL |
37 |
< |
SHOW_HTTPS=0 # record response code to the log when "http" is upgraded to "https" |
38 |
< |
SHOW_YT_RD=0 # record response code to the log when a youtu.be URL is expanded to the full URL |
39 |
< |
SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page |
40 |
< |
SKIP_ARCHIVE_LINKS=0 # don't check URLs under the archive.org domain |
41 |
< |
TAKE_PAGE_SHOT=0 # take a screenshot of each OK page |
42 |
< |
TIMEOUT=10 # time to wait for a response when querying a site |
43 |
< |
CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature |
44 |
< |
URL_START=1 # start at this URL in LINKS_FILE |
45 |
< |
URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE |
46 |
< |
UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report |
32 |
> |
LINKS_URL="" # use 'curl' to download file with links from this location (can be file://) |
33 |
> |
EXCEPT_URL="" # 'curl' will access this wiki page with a list of exceptions for NG results |
34 |
> |
OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder |
35 |
> |
RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES |
36 |
> |
SHOW_SLASH=0 # record issue when a slash is added to the end of a URL |
37 |
> |
SHOW_HTTPS=0 # record issue when "http" is upgraded to "https" |
38 |
> |
SHOW_YT_RD=0 # record redirection for a youtu.be URL expanding to the full URL |
39 |
> |
SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page |
40 |
> |
CHECK_ARCHIVE_LINKS=0 # check URLs under the archive.org domain |
41 |
> |
TAKE_PAGE_SHOT=0 # take a screenshot of each OK page |
42 |
> |
TIMEOUT=10 # time to wait for a response when querying a site |
43 |
> |
CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature |
44 |
> |
URL_START=1 # start at this URL in LINKS_FILE |
45 |
> |
URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE |
46 |
> |
UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report |
47 |
|
|
48 |
|
# Fixed strings -- see the occurrences of these variables to learn their purpose |
49 |
|
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 OPR/70.0.3728.154" |
132 |
|
validate_external_links.sh --help |
133 |
|
validate_external_links.sh --links URL --output DIR [--exceptions URL] |
134 |
|
[--record-ok-links] [--show-added-slashes] [--show-https-upgrades] |
135 |
< |
[--show-yt-redirects] [--suggest-snapshots] [--skip-archive-links] |
135 |
> |
[--show-yt-redirects] [--suggest-snapshots] [--check-archive-links] |
136 |
|
[--take-screenshots FILE] [--timeout NUM] [--start-url NUM] |
137 |
|
[--end-url NUM] [--upload FILE] |
138 |
|
|
176 |
|
--show-yt-redirects Report on redirects that expand a youtu.be URL. |
177 |
|
--suggest-snapshots Query the Internet Archive for a possible |
178 |
|
snapshot URL for each "NG" page. |
179 |
< |
--skip-archive-links Don't check links that are already pointing to |
180 |
< |
a page on the Internet Archive. |
179 |
> |
--check-archive-links Check links that are already pointing to a page |
180 |
> |
on the Internet Archive. In theory these links |
181 |
> |
should be totally stable and not need validation. |
182 |
|
--take-screenshots FILE Call the Google Chrome binary at this path to |
183 |
|
take screenshots of each "OK" page. |
184 |
|
--timeout NUM Wait this many seconds for a site to respond. The |
218 |
|
--show-https-upgrades ) SHOW_HTTPS=1; shift;; |
219 |
|
--show-yt-redirects ) SHOW_YT_RD=1; shift;; |
220 |
|
--suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;; |
221 |
< |
--skip-archive-links ) SKIP_ARCHIVE_LINKS=1; shift;; |
221 |
> |
--check-archive-links ) CHECK_ARCHIVE_LINKS=1; shift;; |
222 |
|
--take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;; |
223 |
|
--timeout ) TIMEOUT=$2; shift 2;; |
224 |
|
--start-url ) URL_START=$2; shift 2;; |
261 |
|
OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER" |
262 |
|
SHOT_PATH="$OUTPUT_PATH/Screenshots" |
263 |
|
LOG_NAME="ValExtLinks report" |
264 |
< |
LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt" |
265 |
< |
LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf" |
266 |
< |
LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm" |
264 |
> |
LOG_NAME_TXT="$LOG_NAME.txt" |
265 |
> |
LOG_NAME_RTF="$LOG_NAME.rtf" |
266 |
> |
LOG_NAME_HTM="$LOG_NAME.htm" |
267 |
> |
LOG_PATH="$OUTPUT_PATH/$LOG_NAME" |
268 |
> |
LOG_PATH_TXT="$LOG_PATH.txt" |
269 |
> |
LOG_PATH_RTF="$LOG_PATH.rtf" |
270 |
> |
LOG_PATH_HTM="$LOG_PATH.htm" |
271 |
|
mkdir "$OUTPUT_PATH" |
272 |
|
if [ $TAKE_PAGE_SHOT -eq 1 ]; then |
273 |
|
mkdir "$SHOT_PATH" |
365 |
|
fi |
366 |
|
if [[ "$1" == *t* ]]; then |
367 |
|
if [[ "$1" == *n* ]]; then |
368 |
< |
echo -n "$2" >> "$LOG_TXT" |
368 |
> |
echo -n "$2" >> "$LOG_PATH_TXT" |
369 |
|
elif [[ "$1" == *s* ]]; then |
370 |
< |
echo -e "$2\n" >> "$LOG_TXT" |
370 |
> |
echo -e "$2\n" >> "$LOG_PATH_TXT" |
371 |
|
else |
372 |
< |
echo "$2" >> "$LOG_TXT" |
372 |
> |
echo "$2" >> "$LOG_PATH_TXT" |
373 |
|
fi |
374 |
|
fi |
375 |
|
if [[ "$1" == *r* ]]; then |
376 |
|
if [[ "$1" == *n* ]]; then |
377 |
< |
echo "$2" >> "$LOG_RTF" |
377 |
> |
echo "$2" >> "$LOG_PATH_RTF" |
378 |
|
elif [[ "$1" == *s* ]]; then |
379 |
< |
echo "$2\line\line" >> "$LOG_RTF" |
379 |
> |
echo "$2\line\line" >> "$LOG_PATH_RTF" |
380 |
|
else |
381 |
< |
echo "$2\line" >> "$LOG_RTF" |
381 |
> |
echo "$2\line" >> "$LOG_PATH_RTF" |
382 |
|
fi |
383 |
|
fi |
384 |
|
if [[ "$1" == *h* ]]; then |
385 |
|
if [[ "$1" == *s* ]]; then |
386 |
< |
echo "$2<tr><td> </td></tr>" >> "$LOG_HTM" |
386 |
> |
echo "$2<tr><td> </td></tr>" >> "$LOG_PATH_HTM" |
387 |
|
elif [[ "$1" == *n* ]]; then |
388 |
< |
echo "$2" >> "$LOG_HTM" |
388 |
> |
echo "$2" >> "$LOG_PATH_HTM" |
389 |
|
else |
390 |
< |
echo "$2<br />" >> "$LOG_HTM" |
390 |
> |
echo "$2<br />" >> "$LOG_PATH_HTM" |
391 |
|
fi |
392 |
|
fi |
393 |
|
} |
442 |
|
fi |
443 |
|
} |
444 |
|
|
445 |
< |
# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the |
445 |
> |
# Upload the reports using info specified in the --upload argument. ONLY USE "valPrint c" here, as the |
446 |
|
# reports being saved to disk have already been closed. |
447 |
|
function uploadReport() |
448 |
|
{ |
449 |
< |
valPrint c "Uploading HTML report..." |
449 |
> |
valPrint c "Uploading reports..." |
450 |
|
|
451 |
|
SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME" |
452 |
|
SFTP_USER_NAME_MARKER="user:" |
462 |
|
SFTP_PATH=$(grep $SFTP_PATH_MARKER $UPLOAD_INFO) |
463 |
|
SFTP_PATH=${SFTP_PATH#$SFTP_PATH_MARKER} |
464 |
|
|
465 |
< |
expect "$SCRIPT_PATH" "$LOG_HTM" $SFTP_USER_NAME $SFTP_PASSWORD $SFTP_PORT $SFTP_PATH "$LOG_NAME.htm" |
465 |
> |
for SUFFIX in htm rtf txt; do |
466 |
> |
expect "$SCRIPT_PATH" "$LOG_PATH.$SUFFIX" $SFTP_USER_NAME $SFTP_PASSWORD $SFTP_PORT $SFTP_PATH "$LOG_NAME.$SUFFIX"; UPLOAD_STATUS=$? # save expect's exit status now; any later command (including the '[' test below) overwrites $? |
467 |
|
|
468 |
< |
valPrint c "Report was uploaded, unless an error message appears above." |
468 |
> |
if [ "$UPLOAD_STATUS" -ne 0 ]; then |
469 |
> |
valPrint c "Error $UPLOAD_STATUS occurred when attempting to upload $LOG_NAME.$SUFFIX!" |
470 |
> |
else |
471 |
> |
valPrint c "Report in $(echo $SUFFIX | tr '[:lower:]' '[:upper:]') format was uploaded." |
472 |
> |
fi |
473 |
> |
done |
474 |
|
} |
475 |
|
|
476 |
|
# Prints session summary when script is done |
504 |
|
LINK_PROBLEMS_IW=$((IW_LINKS-SKIP_EXPECT_IW)) |
505 |
|
LINK_PROBLEMS_NET=$((LINK_PROBLEMS_NG+LINK_PROBLEMS_RD+LINK_PROBLEMS_EI+LINK_PROBLEMS_IW)) |
506 |
|
|
507 |
+ |
# Print something in the Links section if no link issues were printed |
508 |
+ |
if [ $LINK_PROBLEMS_NET -eq 0 ]; then |
509 |
+ |
valPrint h "<i>No link problems to report! See the <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for a list of links with issues that were not reported.</i>" |
510 |
+ |
fi |
511 |
+ |
if [ $LINK_PROBLEMS_TOTAL -eq 0 ]; then |
512 |
+ |
valPrint t "No link problems to report!" |
513 |
+ |
valPrint r "\i1 No link problems to report! \i0" |
514 |
+ |
fi |
515 |
+ |
|
516 |
|
## SUMMARY OUTPUT ## |
517 |
|
valPrint ct "Summary ($ELAPSED):" |
518 |
|
valPrint r "\b1 Summary \b0 ($ELAPSED)" |
529 |
|
if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi |
530 |
|
|
531 |
|
# Print errored link totals |
532 |
< |
if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"; fi |
532 |
> |
if [ $LINK_ERRORS -gt 0 ]; then |
533 |
> |
valPrint c "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):" |
534 |
> |
valPrint h "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):" |
535 |
> |
valPrint rt "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS):" |
536 |
> |
fi |
537 |
|
if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi |
538 |
|
if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi |
539 |
|
if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi |
542 |
|
if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi |
543 |
|
|
544 |
|
# Print excepted link totals |
545 |
< |
if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi |
545 |
> |
if [ $LINKS_EXCEPTED -gt 0 ]; then |
546 |
> |
valPrint c "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):" |
547 |
> |
valPrint h "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):" |
548 |
> |
valPrint rt "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted:" |
549 |
> |
fi |
550 |
|
if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi |
551 |
|
if [ $SKIP_EXPECT_RD -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_RD/$RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi |
552 |
|
if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi |
665 |
|
if [ $SHOW_YT_RD -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi |
666 |
|
|
667 |
|
valPrint ctrhn "Check archive.org links: " |
668 |
< |
if [ $SKIP_ARCHIVE_LINKS -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi |
668 |
> |
if [ $CHECK_ARCHIVE_LINKS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi |
669 |
|
|
670 |
|
valPrint tr "A summary of my findings will be found at the bottom of the report." |
671 |
|
valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report." |
793 |
|
continue |
794 |
|
fi |
795 |
|
|
796 |
< |
# If we're skipping Archive.org links, check if this is one |
797 |
< |
if [ $SKIP_ARCHIVE_LINKS -eq 1 ] && [[ $URL == *web.archive.org* ]]; then |
798 |
< |
valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have been asked not to check Wayback Machine links." |
796 |
> |
# If we're skipping Archive.org links, see if this is one |
797 |
> |
if [ $CHECK_ARCHIVE_LINKS -eq 0 ] && [[ $URL == *web.archive.org* ]]; then |
798 |
> |
valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to check Wayback Machine links." |
799 |
|
let SKIP_ARCHIVE_ORG+=1 |
800 |
|
continue |
801 |
|
fi |
920 |
|
let EI_LINKS+=1 |
921 |
|
fi |
922 |
|
|
923 |
< |
# If it's not, check if this is a link to a domain that we have an interwiki prefix for |
924 |
< |
if [ $STATUS == "??" ]; then |
923 |
> |
# If it's not, check if this is a link to a domain that we have an interwiki prefix for (also make |
924 |
> |
# sure that it's not an archive.org link to a page from an interwiki domain) |
925 |
> |
if [ $STATUS == "??" ] && [[ $URL != *web.archive.org* ]]; then |
926 |
|
for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do |
927 |
|
if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]] && [[ $URL != *${INTERWIKI_DOMAINS[$i]}/w/* ]]; then |
928 |
|
STATUS="IW" |