--- Validate External Links/validate_external_links.sh 2020/09/04 03:07:08 1142 +++ Validate External Links/validate_external_links.sh 2020/09/06 20:51:22 1144 @@ -5,7 +5,7 @@ # Validates a list of external links in CSV format. The resulting logs are produced in three formats: # - TXT (for easy diffing with an earlier log) # - RTF (for reading as a local file with clickable links) -# - HTML (for uploading as a web page). +# - HTML (for reading as a web page) # Call script with "--help" argument for documentation. Also see Read Me.rtf for critical notes. # # Recommended rule: @@ -29,21 +29,21 @@ IFS=" ### GLOBALS ### # Settings -- these will be changed from their defaults by the arguments passed in to the script -LINKS_URL="" # use 'curl' to download file with links from this location (can be file://) -EXCEPT_URL="" # 'curl' will access this wiki page with a list of exceptions for NG results -OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder -RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES -SHOW_SLASH=0 # record response code to the log when a slash is added to the end of a URL -SHOW_HTTPS=0 # record response code to the log when "http" is upgraded to "https" -SHOW_YT_RD=0 # record response code to the log when a youtu.be URL is expanded to the full URL -SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page -SKIP_ARCHIVE_LINKS=0 # don't check URLs under the archive.org domain -TAKE_PAGE_SHOT=0 # take a screenshot of each OK page -TIMEOUT=10 # time to wait for a response when querying a site -CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature -URL_START=1 # start at this URL in LINKS_FILE -URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE -UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report +LINKS_URL="" # use 'curl' to download file with links from this location (can be file://) +EXCEPT_URL="" # 'curl' will access this wiki page with a list of exceptions for NG results +OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder +RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES +SHOW_SLASH=0 # record issue when a slash is added to the end of a URL +SHOW_HTTPS=0 # record issue when "http" is upgraded to "https" +SHOW_YT_RD=0 # record redirection for a youtu.be URL expanding to the full URL +SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page +CHECK_ARCHIVE_LINKS=0 # check URLs under the archive.org domain +TAKE_PAGE_SHOT=0 # take a screenshot of each OK page +TIMEOUT=10 # time to wait for a response when querying a site +CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature +URL_START=1 # start at this URL in LINKS_FILE +URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE +UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report # Fixed strings -- see the occurrences of these variables to learn their purpose AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 OPR/70.0.3728.154" @@ -132,7 +132,7 @@ SYNOPSIS validate_external_links.sh --help validate_external_links.sh --links URL --output DIR [--exceptions URL] [--record-ok-links] [--show-added-slashes] [--show-https-upgrades] - [--show-yt-redirects] 
[--suggest-snapshots] [--skip-archive-links] + [--show-yt-redirects] [--suggest-snapshots] [--check-archive-links] [--take-screenshots FILE] [--timeout NUM] [--start-url NUM] [--end-url NUM] [--upload FILE] @@ -176,8 +176,9 @@ OPTIONS --show-yt-redirects Report on redirects that expand a youtu.be URL. --suggest-snapshots Query the Internet Archive for a possible snapshot URL for each "NG" page. - --skip-archive-links Don't check links that are already pointing to - a page on the Internet Archive. + --check-archive-links Check links that are already pointing to a page + on the Internet Archive. In theory these links + should be totally stable and not need validation. --take-screenshots FILE Call the Google Chrome binary at this path to take screenshots of each "OK" page. --timeout NUM Wait this many seconds for a site to respond. The @@ -217,7 +218,7 @@ while (( "$#" )); do --show-https-upgrades ) SHOW_HTTPS=1; shift;; --show-yt-redirects ) SHOW_YT_RD=1; shift;; --suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;; - --skip-archive-links ) SKIP_ARCHIVE_LINKS=1; shift;; + --check-archive-links ) CHECK_ARCHIVE_LINKS=1; shift;; --take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;; --timeout ) TIMEOUT=$2; shift 2;; --start-url ) URL_START=$2; shift 2;; @@ -260,9 +261,13 @@ OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)" OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER" SHOT_PATH="$OUTPUT_PATH/Screenshots" LOG_NAME="ValExtLinks report" -LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt" -LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf" -LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm" +LOG_NAME_TXT="$LOG_NAME.txt" +LOG_NAME_RTF="$LOG_NAME.rtf" +LOG_NAME_HTM="$LOG_NAME.htm" +LOG_PATH="$OUTPUT_PATH/$LOG_NAME" +LOG_PATH_TXT="$LOG_PATH.txt" +LOG_PATH_RTF="$LOG_PATH.rtf" +LOG_PATH_HTM="$LOG_PATH.htm" mkdir "$OUTPUT_PATH" if [ $TAKE_PAGE_SHOT -eq 1 ]; then mkdir "$SHOT_PATH" @@ -360,29 +365,29 @@ function valPrint() fi if [[ "$1" == *t* ]]; then if [[ "$1" == *n* ]]; then - echo -n "$2" >> "$LOG_TXT" + echo -n "$2" >> "$LOG_PATH_TXT" elif [[ "$1" == *s* ]]; then - echo -e "$2\n" >> "$LOG_TXT" + echo -e "$2\n" >> "$LOG_PATH_TXT" else - echo "$2" >> "$LOG_TXT" + echo "$2" >> "$LOG_PATH_TXT" fi fi if [[ "$1" == *r* ]]; then if [[ "$1" == *n* ]]; then - echo "$2" >> "$LOG_RTF" + echo "$2" >> "$LOG_PATH_RTF" elif [[ "$1" == *s* ]]; then - echo "$2\line\line" >> "$LOG_RTF" + echo "$2\line\line" >> "$LOG_PATH_RTF" else - echo "$2\line" >> "$LOG_RTF" + echo "$2\line" >> "$LOG_PATH_RTF" fi fi if [[ "$1" == *h* ]]; then if [[ "$1" == *s* ]]; then - echo "$2 " >> "$LOG_HTM" + echo "$2 " >> "$LOG_PATH_HTM" elif [[ "$1" == *n* ]]; then - echo "$2" >> "$LOG_HTM" + echo "$2" >> "$LOG_PATH_HTM" else - echo "$2
" >> "$LOG_HTM" + echo "$2
" >> "$LOG_PATH_HTM" fi fi } @@ -437,11 +442,11 @@ function pluralCheckAn() fi } -# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the +# Upload the reports using info specified in the --upload argument. ONLY USE "valPrint c" here, as the # reports being saved to disk have already been closed. function uploadReport() { - valPrint c "Uploading HTML report..." + valPrint c "Uploading reports..." SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME" SFTP_USER_NAME_MARKER="user:" @@ -457,9 +462,15 @@ function uploadReport() SFTP_PATH=$(grep $SFTP_PATH_MARKER $UPLOAD_INFO) SFTP_PATH=${SFTP_PATH#$SFTP_PATH_MARKER} - expect "$SCRIPT_PATH" "$LOG_HTM" $SFTP_USER_NAME $SFTP_PASSWORD $SFTP_PORT $SFTP_PATH "$LOG_NAME.htm" + for SUFFIX in htm rtf txt; do + expect "$SCRIPT_PATH" "$LOG_PATH.$SUFFIX" $SFTP_USER_NAME $SFTP_PASSWORD $SFTP_PORT $SFTP_PATH "$LOG_NAME.$SUFFIX" - valPrint c "Report was uploaded, unless an error message appears above." + if [ "$?" -ne 0 ]; then + valPrint c "Error $? occurred when attempting to upload $LOG_NAME.$SUFFIX!" + else + valPrint c "Report in `echo $SUFFIX | tr [:lower:] [:upper:]` format was uploaded." + fi + done } # Prints session summary when script is done @@ -493,6 +504,15 @@ function wrapupAndExit() LINK_PROBLEMS_IW=$((IW_LINKS-SKIP_EXPECT_IW)) LINK_PROBLEMS_NET=$((LINK_PROBLEMS_NG+LINK_PROBLEMS_RD+LINK_PROBLEMS_EI+LINK_PROBLEMS_IW)) + # Print something in the Links section if no link issues were printed + if [ $LINK_PROBLEMS_NET -eq 0 ]; then + valPrint h "No link problems to report! See the RTF or TXT report for a list of links with issues that were not reported." + fi + if [ $LINK_PROBLEMS_TOTAL -eq 0 ]; then + valPrint t "No link problems to report!" + valPrint r "\i1 No link problems to report! 
\i0" + fi + ## SUMMARY OUTPUT ## valPrint ct "Summary ($ELAPSED):" valPrint r "\b1 Summary \b0 ($ELAPSED)" @@ -509,7 +529,11 @@ function wrapupAndExit() if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h "  (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi # Print errored link totals - if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"; fi + if [ $LINK_ERRORS -gt 0 ]; then + valPrint c "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):" + valPrint h "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):" + valPrint rt "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS):" + fi if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi @@ -518,7 +542,11 @@ function wrapupAndExit() if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi # Print excepted link totals - if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi + if [ $LINKS_EXCEPTED -gt 0 ]; then + valPrint c "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):" + valPrint h "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):" + valPrint rt "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted:" + fi if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi if [ $SKIP_EXPECT_RD -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_RD/$RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi @@ -637,7 +665,7 @@ valPrint ctrhn "Ignore youtu.be redirect if [ $SHOW_YT_RD -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi valPrint ctrhn "Check archive.org links: " -if [ $SKIP_ARCHIVE_LINKS -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi +if [ $CHECK_ARCHIVE_LINKS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi valPrint tr "A summary of my findings will be found at the bottom of the report." valPrint h "A summary of my findings will be found at the bottom of the report." @@ -765,9 +793,9 @@ for LINE in `cat "$LINKS_FILE"`; do continue fi - # If we're skipping Archive.org links, check if this is one - if [ $SKIP_ARCHIVE_LINKS -eq 1 ] && [[ $URL == *web.archive.org* ]]; then - valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have been asked not to check Wayback Machine links." 
+ # If we're skipping Archive.org links, see if this is one + if [ $CHECK_ARCHIVE_LINKS -eq 0 ] && [[ $URL == *web.archive.org* ]]; then + valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to check Wayback Machine links." let SKIP_ARCHIVE_ORG+=1 continue fi @@ -892,8 +920,9 @@ for LINE in `cat "$LINKS_FILE"`; do let EI_LINKS+=1 fi - # If it's not, check if this is a link to a domain that we have an interwiki prefix for - if [ $STATUS == "??" ]; then + # If it's not, check if this is a link to a domain that we have an interwiki prefix for (also make + # sure that it's not an archive.org link to a page from an interwiki domain) + if [ $STATUS == "??" ] && [[ $URL != *web.archive.org* ]]; then for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]] && [[ $URL != *${INTERWIKI_DOMAINS[$i]}/w/* ]]; then STATUS="IW"
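
For reference, a minimal sketch of how the revised script might be invoked now that --skip-archive-links has been replaced by --check-archive-links (archive.org links are skipped by default and only validated when the new flag is passed). The links URL and output directory below are placeholders, not values taken from this repository:

# Hypothetical run; the CSV location and the output folder are placeholders.
./validate_external_links.sh \
   --links "https://example.org/external_links.csv" \
   --output "$HOME/ValExtLinks-reports" \
   --check-archive-links \
   --suggest-snapshots \
   --timeout 15

If --upload FILE is also given, this revision uploads all three report formats (HTM, RTF, TXT) rather than only the HTML report.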