 15 |   | ### GLOBALS ###
 16 |   | # Settings -- these will be changed from their defaults by the arguments passed in to the script
 17 |   | LINKS_URL=""        # use 'curl' to download file with links from this location (can be file://)
 18 | < | EXCEPT_URL=""       # ditto above for file with exceptions to NG results
 18 | > | EXCEPT_URL=""       # 'curl' will access this wiki page with a list of exceptions for NG results
 19 |   | OUTPUT_DIR=""       # place reports and all other output in a folder inside this existing folder
 20 |   | RECORD_OK_LINKS=0   # record response code to the log even when it's a value in OK_CODES
 21 |   | SHOW_SLASH=0        # record response code to the log when a slash is added to the end of a URL

 30 |   | UPLOAD_INFO=""      # path to a file on your hard drive with the login info needed to upload a report
 31 |   |
 32 |   | # Fixed strings -- see the occurrences of these variables to learn their purpose
 33 | < | AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36 OPR/67.0.3575.53"
 33 | > | AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36 OPR/69.0.3686.77"
 34 |   | ARCHIVE_API="http://archive.org/wayback/available"
 35 |   | ARCHIVE_GENERIC="https://web.archive.org/web/*"
 36 |   | ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
 37 |   | CHROME_SCREENSHOT="screenshot.png"
 38 |   | CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt"
 39 | + | EXCEPT_FILE_NAME="exceptions.txt"
 40 |   | EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
 41 |   | HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt"
 42 |   | MY_WIKI_PAGE="https://wiki.oni2.net/User:Iritscen"
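For orientation, ARCHIVE_API and ARCHIVE_OK_CODES presumably end up combined into a single Wayback Machine availability query along these lines. This is only a hypothetical sketch; the actual request is assembled elsewhere in the script, and the example URL is a placeholder:

    # Hypothetical sketch only -- the real request is built later in the
    # script, and http://example.com is a placeholder
    curl --silent --user-agent "$AGENT" "$ARCHIVE_API?url=http://example.com&$ARCHIVE_OK_CODES"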
113 |   | SYNOPSIS
114 |   |        validate_external_links.sh --help
115 |   |        validate_external_links.sh --links URL --output DIR [--exceptions URL]
116 | < |           [--record-ok-links] [--suggest-snapshots] [--take-screenshots FILE]
117 | < |           [--start-url NUM] [--end-url NUM] [--upload FILE]
116 | > |           [--record-ok-links] [--show-added-slashes] [--show-https-upgrades]
117 | > |           [--show-yt-redirects] [--suggest-snapshots] [--skip-archive-links]
118 | > |           [--take-screenshots FILE] [--start-url NUM] [--end-url NUM]
119 | > |           [--upload FILE]
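A hypothetical invocation using the expanded option set might look like this (every URL and path below is a placeholder):

    ./validate_external_links.sh --links "http://example.com/links.txt" \
       --output ~/val-reports \
       --exceptions "https://wiki.oni2.net/User:SomeUser/Exceptions" \
       --show-added-slashes --skip-archive-links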
120 |   |
121 |   | DESCRIPTION
122 |   |        This script parses a list of external links found in the OniGalore wiki

143 |   |        --output DIR            (required) Unix path to directory in which Val
144 |   |                                should place its reports.
145 |   |        --exceptions URL        In order to remove links from the report which
146 | < |                                Val finds an issue with, but which you regard as
147 | < |                                OK, list those desired exceptions in this file.
148 | < |                                See the sample file exceptions.txt for details.
149 | < |                                Note that this URL can point to a local file if
150 | < |                                you supply a file:// path.
146 | > |                                Val finds an issue with but which you regard as
147 | > |                                OK, list those desired exceptions on a wiki page.
148 | > |                                See the sample file "exceptions.pdf" for the
149 | > |                                required format of the page. Note that this URL
150 | > |                                can point to a local file if you supply a path
151 | > |                                beginning with "file://".
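Judging from the parsing code later in this diff (the "BEGIN LIST"/"END LIST" trimming and the comma-separated field extraction), the exceptions page presumably holds entries of the form "code,URL,page name" between two marker lines, with "*" acting as a wildcard for the page name. This is an inferred sketch with made-up entries; "exceptions.pdf" remains the authoritative reference:

    BEGIN LIST
    404,http://example.com/dead-page,Some_Wiki_Page
    EI,http://example.com/pic.jpg,*
    END LIST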
152 |   |        --record-ok-links       Log a link in the report even if its response
153 |   |                                code is "OK".
154 |   |        --show-added-slashes    Report on redirects that simply add a '/' to the
541 |   | # Attempt to download file at EXCEPT_URL, then check that it succeeded
542 |   | if [ ! -z $EXCEPT_URL ]; then
543 |   |    valPrint cwtrh "Downloading list of reporting exceptions from $EXCEPT_URL."
544 | < |    EXCEPT_FILE_NAME=$(echo "$EXCEPT_URL" | sed 's/.*\///')
545 | < |    EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
546 | < |    curl --silent -o "$EXCEPT_FILE" $EXCEPT_URL
547 | < |    if [ ! -f "$EXCEPT_FILE" ]; then
548 | < |       echo "The download of $EXCEPT_URL appears to have failed. Aborting."
544 | > |    EXCEPT_DATA=$(curl --silent $EXCEPT_URL)
545 | > |    if [ -z "$EXCEPT_DATA" ]; then
546 | > |       echo "The download of the exceptions data from '$EXCEPT_URL' appears to have failed. Aborting."
547 |   |       wrapupAndExit
548 |   |    fi
549 | + |    EXCEPT_DATA=${EXCEPT_DATA%END LIST*}
550 | + |    EXCEPT_DATA=${EXCEPT_DATA#*BEGIN LIST}
551 | + |    EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
552 | + |
553 | + |    # Store on disk for debugging purposes
554 | + |    echo "$EXCEPT_DATA" > "$EXCEPT_FILE"
555 | + |
556 | + |    # Transfer to array for easy searching later
557 | + |    declare -a EXCEPT_ARRAY=($(echo "$EXCEPT_DATA"))
558 |   | fi
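The two parameter expansions at new lines 549-550 clip the downloaded page down to just the text between the "BEGIN LIST" and "END LIST" markers. A minimal standalone sketch of the same trimming, using made-up page text:

    #!/bin/bash
    # Hypothetical page text; only the two expansions below come from the diff
    PAGE_TEXT='page header BEGIN LIST
    404,http://example.com/a,Page_One
    EI,http://example.com/b.jpg,*
    END LIST page footer'
    DATA=${PAGE_TEXT%END LIST*}   # drop "END LIST" and everything after it
    DATA=${DATA#*BEGIN LIST}      # drop everything up through "BEGIN LIST"
    echo "$DATA"                  # prints the two entry lines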
559 |   |
560 |   | # Pipe 'cat' to 'wc' because passing LINKS_FILE to 'wc' would print the file name after the line count

574 |   | fi
575 |   |
576 |   | # Print settings to console and log
577 | < | declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not report links that are listed in the exceptions file." "I will ignore URLs that simply have ending slashes added onto them." "I will ignore URLs that only upgrade from HTTP to HTTPS." "I will ignore youtu.be links that are merely being expanded." "I will not check the validity of Internet Archive snapshot URLs.")
577 | > | declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not report links that are in the exceptions list." "I will ignore URLs that simply have ending slashes added onto them." "I will ignore URLs that only upgrade from HTTP to HTTPS." "I will ignore youtu.be links that are merely being expanded." "I will not check the validity of Internet Archive snapshot URLs.")
578 |   | if [ $TAKE_PAGE_SHOT -eq 0 ]; then SETTINGS_MSG[10]="but will not"; fi
579 |   | if [ $RECORD_OK_LINKS -eq 0 ]; then SETTINGS_MSG[22]="not"; fi
580 |   | if [ $SUGGEST_SNAPSHOTS -eq 0 ]; then SETTINGS_MSG[26]="will not"; fi
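The indices in lines 578-580 address individual words of SETTINGS_MSG: bash splits the unquoted initializer into one array element per word, and each flag then overwrites a single word before the message is assembled. A minimal sketch of the same trick, with a made-up message and flag:

    # MSG and DO_SHOT are made up; only the word-swap technique is from the diff
    declare -a MSG=(I "will" take screenshots.)
    DO_SHOT=0
    if [ $DO_SHOT -eq 0 ]; then MSG[1]="will not"; fi
    echo "${MSG[@]}"   # prints: I will not take screenshots.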
933 |   |       continue
934 |   |    fi
935 |   |
936 | < |    # Check problem links against exceptions file before proceeding
936 | > |    # Check problem links against exceptions list before proceeding
937 | > |    FOUND_EXCEPT=0
938 |   |    if [ $STATUS != "OK" ] && [ ! -z $EXCEPT_URL ]; then
939 |   |       # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
940 |   |       EXPECT_CODE="$CURL_RESULT"

944 |   |          EXPECT_CODE="IW"
945 |   |       fi
946 |   |
947 | < |       # Look for link in exceptions file and make sure its listed result code and wiki page also match
948 | < |       GREP_RESULT=$(grep --max-count=1 "$URL" "$EXCEPT_FILE")
949 | < |       EXCEPT_PAGE=${GREP_RESULT##*,}
950 | < |       if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == $LOCAL_PAGE_PATH ]; then
951 | < |          EXCEPT_CODE=${GREP_RESULT%%,*}
952 | < |          if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
953 | < |             valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because its expected result, $EXPECT_CODE, is listed in the exceptions file."
954 | < |             if [ $STATUS == "EI" ]; then
955 | < |                let SKIP_EXPECT_EI+=1
956 | < |             elif [ $STATUS == "IW" ]; then
957 | < |                let SKIP_EXPECT_IW+=1
958 | < |             else
959 | < |                let SKIP_EXPECT_NG+=1
960 | < |             fi
947 | > |       # Look for link in exceptions list and make sure the listed result code and wiki page also match
948 | > |       for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
949 | > |       {
950 | > |          EXCEPT_LINE="${EXCEPT_ARRAY[$i]}"
951 | > |
952 | > |          # Match URL
953 | > |          EXCEPT_URL="${EXCEPT_LINE#*,}"
954 | > |          EXCEPT_URL="${EXCEPT_URL%,*}"
955 | > |          if [ "$EXCEPT_URL" != "$URL" ]; then
956 |   |             continue
957 |   |          fi
958 | < |    fi
958 | > |
959 | > |          # Match containing page's name
960 | > |          EXCEPT_PAGE="${EXCEPT_LINE##*,}"
961 | > |          EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
962 | > |          if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == $LOCAL_PAGE_PATH ]; then
963 | > |             # Match result code
964 | > |             EXCEPT_CODE=${EXCEPT_LINE%%,*}
965 | > |             if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
966 | > |                valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because its expected result, '$EXPECT_CODE', is in the exceptions list."
967 | > |                if [ $STATUS == "EI" ]; then
968 | > |                   let SKIP_EXPECT_EI+=1
969 | > |                elif [ $STATUS == "IW" ]; then
970 | > |                   let SKIP_EXPECT_IW+=1
971 | > |                else
972 | > |                   let SKIP_EXPECT_NG+=1
973 | > |                fi
974 | > |                FOUND_EXCEPT=1
975 | > |                break
976 | > |             fi
977 | > |          fi
978 | > |       } done
979 | > |    fi
980 | > |    if [ $FOUND_EXCEPT -eq 1 ]; then
981 | > |       continue
982 |   |    fi
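Each entry is pulled apart with bash parameter expansion rather than an external tool like 'cut' or 'awk'. A standalone sketch of the field extraction on one hypothetical "code,URL,page name" entry:

    # Hypothetical entry; the expansions mirror the ones in the loop above
    EXCEPT_LINE="404,http://example.com/dead,Some_Page"
    EXCEPT_CODE=${EXCEPT_LINE%%,*}   # "404": cut at the first comma
    EXCEPT_URL=${EXCEPT_LINE#*,}     # drop through the first comma...
    EXCEPT_URL=${EXCEPT_URL%,*}      # ...then cut the last field: "http://example.com/dead"
    EXCEPT_PAGE=${EXCEPT_LINE##*,}   # "Some_Page": everything after the last comma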
983 |   |
984 |   | # If appropriate, record this link to the log, with clickable URLs when possible