--- Validate External Links/validate_external_links.sh 2017/08/02 04:26:48 1069 +++ Validate External Links/validate_external_links.sh 2017/10/03 03:01:32 1070 @@ -17,19 +17,19 @@ IFS=" LINKS_URL="" # use 'curl' to download file with links from this location (can be file://) EXCEPT_URL="" # ditto above for file with exceptions to NG results OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder -RECORD_OK_LINKS=0 # record response code to the log whether it's a value in OK_CODES or NG_CODES +RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page TAKE_PAGE_SHOT=0 # take a screenshot of each OK page +CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature URL_START=1 # start at this URL in LINKS_FILE (1 by default) URL_LIMIT=0 # if non-zero, stop at this URL in LINKS_FILE UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report # Fixed strings -- see the occurrences of these variables to learn their purpose -AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:54.0) Gecko/20100101 Firefox/54.0" +AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0" ARCHIVE_API="http://archive.org/wayback/available" ARCHIVE_GENERIC="https://web.archive.org/web/*" ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206" -CHROME="/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary" CHROME_SCREENSHOT="screenshot.png" CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt" EXPECT_SCRIPT_NAME="val_expect_sftp.txt" @@ -44,7 +44,7 @@ declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7 declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk") # These arrays tell the script which suffixes at the ends of URLs represent files and which are pages. -# This determines whether the script tries to take a screenshot of the page or just gets its HTTP code. +# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code. declare -a HTTP_FILES=(txt zip wmv jpg png m4a bsl rar oni mp3 mov ONWC vbs TRMA mp4 doc avi log gif pdf dmg exe cpp tga 7z wav east BINA xml dll dae xaf fbx 3ds blend flv csv) declare -a HTTP_TLDS_AND_PAGES=(com net org uk ru de it htm html php pl asp aspx shtml pgi cgi php3 x jsp phtml cfm css action stm js) @@ -59,16 +59,20 @@ declare -a NG_CODES=(000 403 404 410 500 # transcluded text, and if the transclusion fails, then the braces show up in the URL ILLEGAL_CHARS="{ }" +# The shortest URL possible, used for sanity-checking some URLs: http://a.co +MIN_URL_LENGTH=11 + # These are parallel arrays giving the prefixes that can be used in place of normal external links to # some wikis and other sites -declare -a INTERWIKI_PREFIXES=(metawikipedia wikipedia wikiquote wiktionary) -declare -a INTERWIKI_DOMAINS=(meta.wikipedia.org wikipedia.org wikiquote.org wiktionary.org) +declare -a INTERWIKI_PREFIXES=(commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies wikiversity wikivoyage wikt wp) +declare -a INTERWIKI_DOMAINS=(commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org wikiversity.org wikivoyage.org wiktionary.org wikipedia.org) # Variables for keeping track of main loop progress and findings LINK_NUM=0 +EI_LINKS=0 +IW_LINKS=0 OK_LINKS=0 RD_LINKS=0 -IW_LINKS=0 NG_LINKS=0 SKIP_UNK_NS=0 SKIP_JS_PAGE=0 @@ -76,7 +80,9 @@ SKIP_BAD_URL=0 SKIP_NON_ASCII=0 SKIP_UNK_SUFFIX=0 SKIP_UNK_CODE=0 -SKIP_EXCEPT=0 +SKIP_EXPECT_NG=0 +SKIP_EXPECT_EI=0 +SKIP_EXPECT_IW=0 FILE_LINKS=0 PAGE_LINKS=0 SKIPPED_HEADER_ROW=0 @@ -95,49 +101,51 @@ NAME SYNOPSIS validate_external_links.sh --help - validate_external_links.sh --links URL --output PATH [--exceptions FILE] - [--record-ok-links] [--suggest-snapshots] [--take-screenshots] - [--start-url NUM] [--end-url NUM] [--upload PATH] + validate_external_links.sh --links URL --output DIR [--exceptions URL] + [--record-ok-links] [--suggest-snapshots] [--take-screenshots DIR] + [--start-url NUM] [--end-url NUM] [--upload FILE] DESCRIPTION This script parses a list of external links found in the OniGalore wiki (which is dumped by the Oni2.net domain periodically in a particular format), validates them using the Unix tool 'curl', and produces a report - of which links were OK (responded positively to an HTTP query), which - were RD (responded with a 3xx redirect code), which could be IW (inter- - wiki) links, and which were NG (no good; a negative response to the + of which links were "OK" (responded positively to an HTTP query), which + were "RD" (responded with a 3xx redirect code), which could be "IW" + (interwiki) links, which are "EI" (external internal) links and could be + intrawiki links, and which were "NG" (no good; a negative response to the query). This report can then be automatically uploaded to the location of your choice. The script can also suggest Internet Archive snapshots for - NG links, and take screenshots of OK links for visual verification by the - reader that the page in question is the one intended to be displayed. + "NG" links, and take screenshots of "OK" links for visual verification by + the reader that the page in question is the one intended to be displayed. You must pass this script the URL at which the list of links is found - (--links) and the path where logs should be outputted (--output). All - other arguments are optional. + (--links) and the path where the directory of logs should be outputted + (--output). All other arguments are optional. OPTIONS - --help Show this page - --links URL URL from which to download file with external links - (note that this can be a local file if you use the - file:// protocol) (required) - --output DIR Place the folder which will contain the reports and - optional screenshots at this path (required) - --exceptions URL In order to remove links from the list which show as - NG but which you regard as OK, prepare a plain-text - file where each line contains a response code being - returned and the URL returning it, separated by a - comma, e.g. "403,http://www.example.com" (note that - this can be a local file if you use the - file:// protocol) - --record-ok-links Log a link in the report even if its response code is - OK - --suggest-snapshots Query the Internet Archive for a possible snapshot - URL for each NG page - --take-screenshots Save screenshots of each OK page (requires Google - Chrome to be found at the path in CHROME) - --start-url NUM Start at this link in the links file - --end-url NUM Stop at this link in the links file - --upload FILE Upload report using info in this local file + --help Show this page. + --links URL (required) URL from which to download the CSV file + with external links. Note that this URL can be a + local file if you supply a file:// path. + --output DIR (required) Place the folder which will contain the + reports and optional screenshots at this (Unix- + format) path. + --exceptions URL In order to remove links from the report which Val + finds an issue with, but which you regard as OK, + list those desired exceptions in this file. See + the sample file exceptions.txt for details. Note + that this text file can be a local file if you + supply a file:// path. + --record-ok-links Log a link in the report even if its response code + is "OK". + --suggest-snapshots Query the Internet Archive for a possible snapshot + URL for each "NG" page. + --take-screenshots DIR Use the copy of Google Chrome at this path to take + screenshots of each "OK" page. + --start-url NUM Start at this link in the link dump CSV file. + --end-url NUM Stop at this link in the link dump CSV file. + --upload FILE Upload report using the credentials in this local + text file. See sftp_login.txt for example. BUGS The script cannot properly parse any line in the external links file @@ -157,36 +165,43 @@ fi # Parse arguments as long as there are more arguments to process while (( "$#" )); do case "$1" in - --links ) LINKS_URL="$2"; shift 2;; - --exceptions ) EXCEPT_URL="$2"; shift 2;; - --output ) OUTPUT_DIR="$2"; shift 2;; - --record-ok-links ) RECORD_OK_LINKS=1; shift;; - --suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;; - --take-screenshots ) TAKE_PAGE_SHOT=1; shift;; - --start-url ) URL_START=$2; shift 2;; - --end-url ) URL_LIMIT=$2; shift 2;; - --upload ) UPLOAD_INFO=$2; shift 2;; + --links ) LINKS_URL="$2"; shift 2;; + --exceptions ) EXCEPT_URL="$2"; shift 2;; + --output ) OUTPUT_DIR="$2"; shift 2;; + --record-ok-links ) RECORD_OK_LINKS=1; shift;; + --suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;; + --take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;; + --start-url ) URL_START=$2; shift 2;; + --end-url ) URL_LIMIT=$2; shift 2;; + --upload ) UPLOAD_INFO=$2; shift 2;; * ) echo "Invalid argument $1 detected. Aborting."; exit 1;; esac done # If the required arguments were not supplied, print help page and quit if [ -z $LINKS_URL ] || [ -z $OUTPUT_DIR ]; then - printHelp - echo "Error: I did not receive one or both required arguments." + echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation." exit 2 fi +# If user wants screenshots, make sure path to Chrome was passed in and is valid +if [ $TAKE_PAGE_SHOT -eq 1 ]; then + if [ ! -f "$CHROME_PATH" ]; then + echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots." + exit 3 + fi +fi + # Check that UPLOAD_INFO exists, if this argument was supplied if [ ! -z $UPLOAD_INFO ] && [ ! -f "$UPLOAD_INFO" ]; then echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting." - exit 3 + exit 4 fi # Check that OUTPUT_DIR is a directory if [ ! -d "$OUTPUT_DIR" ]; then echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting." - exit 4 + exit 5 fi # Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots @@ -207,14 +222,14 @@ fi # Check that 'mkdir' succeeded if [ ! -d "$OUTPUT_PATH" ]; then echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_PATH. Aborting." - exit 5 + exit 6 fi # Get date on the file at LINKS_URL and print to log LINKS_DATE=$(curl --silent --head $LINKS_URL | grep "Last-Modified") if [ -z "$LINKS_DATE" ]; then echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting." - exit 6 + exit 7 fi LINKS_DATE=${LINKS_DATE#Last-Modified: } @@ -277,7 +292,7 @@ function printHTMfooter() } # The central logging function. The first parameter is a string composed of one or more characters that -# indicates which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and +# indicate which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and # 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 'w' means "Don't # pass console output through 'fmt'" ("fmt" fits the output to an 80-column CLI but can break special # formatting and the 'n' option). @@ -416,15 +431,22 @@ function wrapupAndExit() valPrint ctrh "I skipped $LINKS_SKIPPED $(pluralCheckNoun link $LINKS_SKIPPED), and found $FILE_LINKS $(pluralCheckNoun file $FILE_LINKS) and $PAGE_LINKS $(pluralCheckNoun page $PAGE_LINKS)." if [ $LINKS_SKIPPED -gt 0 ]; then valPrint ctrh "Skip breakdown: "; fi if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi - if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE links on JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi + if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi - valPrint ctrh "Out of the $LINKS_CHECKED links checked, $IW_LINKS could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS), $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG." - if [ $SKIP_EXCEPT -gt 0 ]; then - valPrint ctrh "$SKIP_EXCEPT/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS) went unlisted due to being found in the exceptions file." + valPrint ctrh "Out of the $LINKS_CHECKED links checked, $EI_LINKS could be $(pluralCheckAn $EI_LINKS)intrawiki $(pluralCheckNoun link $EI_LINKS), $IW_LINKS could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS), $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG." + if [ $SKIP_EXPECT_NG -gt 0 ]; then + valPrint ctrh "$SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS) went unlisted due to being found in the exceptions file." + fi + if [ $SKIP_EXPECT_EI -gt 0 ]; then + valPrint ctrh "$SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS) went unlisted due to being found in the exceptions file." fi + if [ $SKIP_EXPECT_IW -gt 0 ]; then + valPrint ctrh "$SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS) went unlisted due to being found in the exceptions file." + fi + valPrint trh "ValExtLinks says goodbye." printRTFfooter printHTMfooter @@ -459,7 +481,7 @@ fi # Attempt to download file at EXCEPT_URL, then check that it succeeded if [ ! -z $EXCEPT_URL ]; then - valPrint cwtrh "Downloading list of NG exceptions from $EXCEPT_URL." + valPrint cwtrh "Downloading list of reporting exceptions from $EXCEPT_URL." EXCEPT_FILE_NAME=$(echo "$EXCEPT_URL" | sed 's/.*\///') EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME" curl --silent -o "$EXCEPT_FILE" $EXCEPT_URL @@ -486,7 +508,7 @@ else fi # Print settings to console and log -declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not print NG links that are listed in the exceptions file.") +declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not report links that are listed in the exceptions file.") if [ $TAKE_PAGE_SHOT -eq 0 ]; then SETTINGS_MSG[10]="but will not"; fi if [ $RECORD_OK_LINKS -eq 0 ]; then SETTINGS_MSG[22]="not"; fi if [ $SUGGEST_SNAPSHOTS -eq 0 ]; then SETTINGS_MSG[26]="will not"; fi @@ -504,7 +526,8 @@ valPrint hn "