--- Validate External Links/validate_external_links.sh 2020/03/28 02:08:29 1127 +++ Validate External Links/validate_external_links.sh 2020/07/12 23:57:00 1135 @@ -1,10 +1,10 @@ #!/bin/bash # Validate External Links by Iritscen -# Provided with a list of external links found in the OniGalore wiki, this script validates them. -# The resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF -# (for reading as a local file with clickable links), and HTML (for uploading as a web page). -# Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes. +# Provided with a list of external links in an expected CSV format, this script validates them. The +# resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF (for +# reading as a local file with clickable links), and HTML (for uploading as a web page). Call script +# with "--help" argument for documentation. Also see Read Me First.rtf for critical notes. # Recommended rule: # |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----| @@ -14,19 +14,20 @@ IFS=" ### GLOBALS ### # Settings -- these will be changed from their defaults by the arguments passed in to the script -LINKS_URL="" # use 'curl' to download file with links from this location (can be file://) -EXCEPT_URL="" # ditto above for file with exceptions to NG results -OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder -RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES -SHOW_SLASH=0 # record response code to the log when a slash is added to the end of a URL -SHOW_HTTPS=0 # record response code to the log when "http" is upgraded to "https" -SHOW_YT_RD=0 # record response code to the log when a youtu.be URL is expanded to the full URL -SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page -TAKE_PAGE_SHOT=0 # take a screenshot of each OK page -CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature -URL_START=1 # start at this URL in LINKS_FILE (1 by default) -URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE -UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report +LINKS_URL="" # use 'curl' to download file with links from this location (can be file://) +EXCEPT_URL="" # ditto above for file with exceptions to NG results +OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder +RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES +SHOW_SLASH=0 # record response code to the log when a slash is added to the end of a URL +SHOW_HTTPS=0 # record response code to the log when "http" is upgraded to "https" +SHOW_YT_RD=0 # record response code to the log when a youtu.be URL is expanded to the full URL +SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page +SKIP_ARCHIVE_LINKS=0 # don't check URLs under the archive.org domain +TAKE_PAGE_SHOT=0 # take a screenshot of each OK page +CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature +URL_START=1 # start at this URL in LINKS_FILE (1 by default) +URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE +UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report # Fixed strings -- see the occurrences of these variables to learn their purpose AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36 OPR/67.0.3575.53" @@ -89,6 +90,7 @@ SKIP_EXPECT_IW=0 SKIP_HTTPS_UP=0 SKIP_SLASH_ADD=0 SKIP_YOUTU_BE=0 +SKIP_ARCHIVE_ORG=0 FILE_LINKS=0 PAGE_LINKS=0 SKIPPED_HEADER_ROW=0 @@ -152,6 +154,8 @@ OPTIONS --show-yt-redirects Report on redirects that expand a youtu.be URL. --suggest-snapshots Query the Internet Archive for a possible snapshot URL for each "NG" page. + --skip-archive-links Don't check links that are already pointing to + a page on the Internet Archive. --take-screenshots FILE Call the Google Chrome binary at this path to take screenshots of each "OK" page. --start-url NUM Start at this link in the links CSV file. @@ -186,6 +190,7 @@ while (( "$#" )); do --show-https-upgrades ) SHOW_HTTPS=1; shift;; --show-yt-redirects ) SHOW_YT_RD=1; shift;; --suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;; + --skip-archive-links ) SKIP_ARCHIVE_LINKS=1; shift;; --take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;; --start-url ) URL_START=$2; shift 2;; --end-url ) URL_LIMIT=$2; shift 2;; @@ -464,6 +469,7 @@ function wrapupAndExit() # Print processed link totals if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi + if [ $SKIP_ARCHIVE_ORG -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVE_ORG Archive.org $(pluralCheckNoun link $SKIP_ARCHIVE_ORG) were not checked"; fi if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS processed $(pluralCheckNoun link $LINK_PROBLEMS) had $(pluralCheckAn $LINK_PROBLEMS)$(pluralCheckNoun issue $LINK_PROBLEMS)"; fi if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; valPrint h " (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi @@ -557,11 +563,15 @@ else fi # Print settings to console and log -declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not report links that are listed in the exceptions file.") +declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not report links that are listed in the exceptions file." "I will ignore URLs that simply have ending slashes added onto them." "I will ignore URLs that only upgrade from HTTP to HTTPS." "I will ignore youtu.be links that are merely being expanded." "I will not check the validity of Internet Archive snapshot URLs.") if [ $TAKE_PAGE_SHOT -eq 0 ]; then SETTINGS_MSG[10]="but will not"; fi if [ $RECORD_OK_LINKS -eq 0 ]; then SETTINGS_MSG[22]="not"; fi if [ $SUGGEST_SNAPSHOTS -eq 0 ]; then SETTINGS_MSG[26]="will not"; fi if [ -z $EXCEPT_URL ] || [ $RECORD_OK_LINKS -eq 1 ]; then SETTINGS_MSG[40]=""; fi +if [ $SHOW_SLASH -eq 1 ]; then SETTINGS_MSG[41]=""; fi +if [ $SHOW_HTTPS -eq 1 ]; then SETTINGS_MSG[42]=""; fi +if [ $SHOW_YT_RD -eq 1 ]; then SETTINGS_MSG[43]=""; fi +if [ $SKIP_ARCHIVE_LINKS -eq 0 ]; then SETTINGS_MSG[44]=""; fi SETTINGS_STR=${SETTINGS_MSG[@]} valPrint ctrh "$SETTINGS_STR" valPrint tr "A summary of my findings will be found at the bottom of the report." @@ -573,7 +583,7 @@ valPrint t "Legend:" valPrint r "\b1 Legend \b0" valPrint hn "