|  18 |   EXCEPT_URL="" # ditto above for file with exceptions to NG results
|  19 |   OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
|  20 |   RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES
|  21 | + SHOW_SLASH=0 # record response code to the log when a slash is added to the end of a URL
|  22 | + SHOW_HTTPS=0 # record response code to the log when "http" is upgraded to "https"
|  23 |   SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
|  24 |   TAKE_PAGE_SHOT=0 # take a screenshot of each OK page
|  25 |   CHROME_PATH="" # path to a copy of Google Chrome that has the command-line screenshot feature
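The screenshot capability referenced by CHROME_PATH is Chrome's headless screenshot mode. The call site is not part of this diff, so the following one-liner is only a sketch of such a capture, with an invented output filename:

    "$CHROME_PATH" --headless --disable-gpu --screenshot="$OUTPUT_DIR/page.png" "https://wiki.oni2.net/User:Iritscen"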
|  36 |   CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt"
|  37 |   EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
|  38 |   HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt"
|  39 | < MY_WIKI_PAGE="http://wiki.oni2.net/User:Iritscen"
|  39 | > MY_WIKI_PAGE="https://wiki.oni2.net/User:Iritscen"
|  40 |   THIS_DIR=$(cd "$(dirname "$0")"; pwd)
|  41 |   WORKING_DIR=$(pwd)
|  42 |   WIKI_PATH="wiki.oni2.net"
|  85 |   SKIP_EXPECT_NG=0
|  86 |   SKIP_EXPECT_EI=0
|  87 |   SKIP_EXPECT_IW=0
|  88 | + SKIP_HTTPS_UP=0
|  89 | + SKIP_SLASH_ADD=0
|  90 |   FILE_LINKS=0
|  91 |   PAGE_LINKS=0
|  92 |   SKIPPED_HEADER_ROW=0
| 143 |                             you supply a file:// path.
| 144 |   --record-ok-links         Log a link in the report even if its response
| 145 |                             code is "OK".
| 146 | + --show-added-slashes      Report on redirects that simply add a '/' to the
| 147 | +                           end of the URL.
| 148 | + --show-https-upgrade      Report on redirects that simply upgrade an
| 149 | +                           "http://" URL to an "https://" URL.
| 150 |   --suggest-snapshots       Query the Internet Archive for a possible
| 151 |                             snapshot URL for each "NG" page.
| 152 |   --take-screenshots FILE   Call the Google Chrome binary at this path to
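Putting the new options together, a run that surfaces both kinds of trivial redirect would look something like this (the script filename and file arguments are placeholders, as the diff does not show them):

    ./validate_external_links.sh --links links.txt --output ~/val_reports --show-added-slashes --show-https-upgrade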
| 175 |   # Parse arguments as long as there are more arguments to process
| 176 |   while (( "$#" )); do
| 177 |      case "$1" in
| 178 | <       --links ) LINKS_URL="$2"; shift 2;;
| 179 | <       --exceptions ) EXCEPT_URL="$2"; shift 2;;
| 180 | <       --output ) OUTPUT_DIR="$2"; shift 2;;
| 181 | <       --record-ok-links ) RECORD_OK_LINKS=1; shift;;
| 182 | <       --suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;;
| 183 | <       --take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
| 184 | <       --start-url ) URL_START=$2; shift 2;;
| 185 | <       --end-url ) URL_LIMIT=$2; shift 2;;
| 186 | <       --upload ) UPLOAD_INFO=$2; shift 2;;
| 187 | <       * ) echo "Invalid argument $1 detected. Aborting."; exit 1;;
| 178 | >       --links ) LINKS_URL="$2"; shift 2;;
| 179 | >       --exceptions ) EXCEPT_URL="$2"; shift 2;;
| 180 | >       --output ) OUTPUT_DIR="$2"; shift 2;;
| 181 | >       --record-ok-links ) RECORD_OK_LINKS=1; shift;;
| 182 | >       --show-added-slashes ) SHOW_SLASH=1; shift;;
| 183 | >       --show-https-upgrade ) SHOW_HTTPS=1; shift;;
| 184 | >       --suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;;
| 185 | >       --take-screenshots ) TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
| 186 | >       --start-url ) URL_START="$2"; shift 2;;
| 187 | >       --end-url ) URL_LIMIT="$2"; shift 2;;
| 188 | >       --upload ) UPLOAD_INFO="$2"; shift 2;;
| 189 | >       * ) echo "Invalid argument $1 detected. Aborting."; exit 1;;
| 190 |      esac
| 191 |   done
| 192 |
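One fragility worth noting in both the old and new arms: if an option that takes a value is passed as the last argument, "shift 2" fails without consuming "$1", so the loop never advances. A hypothetical guard, not part of the patch:

    --links ) [ -n "$2" ] || { echo "--links requires a value. Aborting."; exit 1; }
              LINKS_URL="$2"; shift 2;;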
| 443 |   END_RUN=$(date +%s)
| 444 |   ELAPSED=$(echo $(($END_RUN - $START_RUN)) | awk '{printf "%d min. %d sec. elapsed", int($1/60), int($1%60)}')
| 445 |
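The awk formatter is easy to spot-check on its own; for example, 125 seconds comes out as "2 min. 5 sec. elapsed":

    echo 125 | awk '{printf "%d min. %d sec. elapsed", int($1/60), int($1%60)}'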
| 446 | < # Output results of session and close the log file's markup
| 446 | > # Do some math on results of session
| 447 |   LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
| 448 | < LINKS_SKIPPED=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
| 449 | < LINKS_CHECKED=$((LINKS_PROCESSED-LINKS_SKIPPED))
| 448 | > LINK_PROBLEMS=$((EI_LINKS+IW_LINKS+RD_LINKS+NG_LINKS))
| 449 | > LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
| 450 | > LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
| 451 | > TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP))
| 452 | > LINKS_CHECKED=$((LINKS_PROCESSED-LINK_ERRORS))
| 453 | >
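To make the new bookkeeping concrete, with invented counts: if 100 links were processed and 5 of them errored, then LINKS_CHECKED is 95; if 4 were EI, 3 IW, 2 RD, and 1 NG, LINK_PROBLEMS is 10, and the remaining 85 checked links were OK (including any trivial redirections tallied in TRIVIAL_RDS).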
| 454 | > # Print summary header
| 455 |   valPrint ct "Summary ($ELAPSED):"
| 456 |   valPrint r "\b1 Summary \b0 ($ELAPSED)"
| 457 |   valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
| 458 | < valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT)."
| 459 | < valPrint ctrh "I skipped $LINKS_SKIPPED $(pluralCheckNoun link $LINKS_SKIPPED), and found $FILE_LINKS $(pluralCheckNoun file $FILE_LINKS) and $PAGE_LINKS $(pluralCheckNoun page $PAGE_LINKS)."
| 460 | < if [ $LINKS_SKIPPED -gt 0 ]; then valPrint ctrh "Skip breakdown: "; fi
| 461 | < if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
| 458 | > valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT) (there were $FILE_LINKS file $(pluralCheckNoun link $FILE_LINKS) and $PAGE_LINKS page $(pluralCheckNoun link $PAGE_LINKS))."
| 459 | >
| 460 | > # Print processed link totals
| 461 | > if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
| 462 | > if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
| 463 | > if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS processed $(pluralCheckNoun link $LINK_PROBLEMS) had issues"; fi
| 464 | > if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh " (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
| 465 | > if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) were OK"; fi
| 466 | > if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctrh " (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi
| 467 | >
| 468 | > # Print excepted link totals
| 469 | > if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi
| 470 | > if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
| 471 | > if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
| 472 | > if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi
| 473 | >
| 474 | > # Print errored link totals
| 475 | > if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"; fi
| 476 | > if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
| 477 |   if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
| 478 |   if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
| 479 |   if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
| 480 |   if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
| 481 |   if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
| 482 | < valPrint ctrh "Out of the $LINKS_CHECKED links checked, $EI_LINKS could be $(pluralCheckAn $EI_LINKS)intrawiki $(pluralCheckNoun link $EI_LINKS), $IW_LINKS could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS), $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG."
| 483 | < if [ $SKIP_EXPECT_NG -gt 0 ]; then
| 484 | <    valPrint ctrh "$SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS) went unlisted due to being found in the exceptions file."
| 485 | < fi
| 486 | < if [ $SKIP_EXPECT_EI -gt 0 ]; then
| 487 | <    valPrint ctrh "$SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS) went unlisted due to being found in the exceptions file."
| 488 | < fi
| 489 | < if [ $SKIP_EXPECT_IW -gt 0 ]; then
| 490 | <    valPrint ctrh "$SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS) went unlisted due to being found in the exceptions file."
| 491 | < fi
| 482 | >
| 483 | > # Print checked link totals
| 484 | > if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS link $(pluralCheckNoun issue $LINK_PROBLEMS):"; fi
| 485 | > if [ $NG_LINKS -gt 0 ]; then valPrint ctrh "- $NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
| 486 | > if [ $RD_LINKS -gt 0 ]; then valPrint ctrh "- $RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
| 487 | > if [ $EI_LINKS -gt 0 ]; then valPrint ctrh "- $EI_LINKS $(pluralCheckNoun link $EI_LINKS) that could be intrawiki"; fi
| 488 | > if [ $IW_LINKS -gt 0 ]; then valPrint ctrh "- $IW_LINKS $(pluralCheckNoun link $IW_LINKS) that could be interwiki"; fi
| 489 | >
| 490 | > # Close the log files' markup
| 491 |   valPrint trh "ValExtLinks says goodbye."
| 492 |   printRTFfooter
| 493 |   printHTMfooter
| 663 |   fi
| 664 |
| 665 |   # Build longer wiki page URLs from namespace and page names
| 666 | < FULL_PAGE_PATH=http://$WIKI_PATH/$NS_NAME:$PAGE_NAME
| 666 | > FULL_PAGE_PATH=https://$WIKI_PATH/$NS_NAME:$PAGE_NAME
| 667 |   LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
| 668 |   # Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it
| 669 |   # explicitly breaks the link
| 670 |   if [ $NS_ID -eq 0 ]; then
| 671 | <    FULL_PAGE_PATH=http://$WIKI_PATH/$PAGE_NAME
| 671 | >    FULL_PAGE_PATH=https://$WIKI_PATH/$PAGE_NAME
| 672 |      LOCAL_PAGE_PATH=$PAGE_NAME
| 673 |   fi
| 674 |
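For illustration of the Main-namespace rule (page names invented): NS_NAME="User" with PAGE_NAME="Iritscen" yields https://wiki.oni2.net/User:Iritscen, whereas a Main-namespace page (NS_ID 0) named "Quotes" must yield https://wiki.oni2.net/Quotes, not https://wiki.oni2.net/Main:Quotes.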
| 823 |   # Get URL header again in order to retrieve the URL we are being redirected to
| 824 |   NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{redirect_url}\n' "$URL")
| 825 |
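curl's '%{redirect_url}' write-out variable prints the location a request would be redirected to when redirects are not being followed, and it is easy to test in isolation; given the MY_WIKI_PAGE change earlier in this patch, the wiki itself presumably demonstrates it:

    curl -o /dev/null --silent --head --write-out '%{redirect_url}\n' http://wiki.oni2.net/User:Iritscen

If the server upgrades HTTP to HTTPS, this prints the https:// address; for a non-redirecting URL it prints an empty line.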
| 826 | < # Filter out cases where the redirect URL is just the original URL with https:// instead of
| 827 | < # http://, or with an added '/' at the end. These corrections happen a lot and are not
| 828 | < # important to us.
| 829 | < URL_NO_PROTOCOL=${URL#http://}
| 830 | < URL_NO_PROTOCOL=${URL_NO_PROTOCOL%/}
| 831 | < NEW_URL_NO_PROTOCOL=${NEW_URL#https://}
| 832 | < NEW_URL_NO_PROTOCOL=${NEW_URL_NO_PROTOCOL%/}
| 826 | > # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
| 827 | > # those changes out if the user didn't ask for them
| 828 | > URL_HTTP=$(echo "$URL" | sed -E 's/^https:/http:/')
| 829 | > NEW_URL_HTTP=$(echo "$NEW_URL" | sed -E 's/^https:/http:/')
| 830 |
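The same normalization could be done without spawning sed, using bash parameter expansion; a sketch, equivalent for a leading protocol:

    URL_HTTP=${URL/#https:/http:}          # replace a leading "https:" with "http:"
    NEW_URL_HTTP=${NEW_URL/#https:/http:}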
| 831 |   # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
| 832 | < NEW_URL_LENGTH=$(echo | awk -v input=$NEW_URL_NO_PROTOCOL '{print length(input)}')
| 832 | > NEW_URL_LENGTH=$(echo | awk -v input="$NEW_URL_HTTP" '{print length(input)}')
| 833 |   if [ $NEW_URL_LENGTH -lt $MIN_URL_LENGTH ]; then
| 834 | <    NEW_URL_NO_PROTOCOL="[new URL not retrieved]"
| 834 | >    NEW_URL_HTTP="[new URL not retrieved]"
| 835 |   fi
| 836 |
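As an aside, the awk pipeline measures string length, which bash can also do natively; a sketch:

    NEW_URL_LENGTH=${#NEW_URL_HTTP}   # bash built-in string length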
| 837 | < # If the URLs match after the above filters were applied, then the link is OK
| 838 | < if [ $URL_NO_PROTOCOL == $NEW_URL_NO_PROTOCOL ]; then
| 837 | > # Remove slash at end of new URL, if present, so we can filter out the redirects that
| 838 | > # merely add an ending slash if the user didn't ask for them
| 839 | > NEW_URL_NO_SLASH=$(echo "$NEW_URL_HTTP" | sed -E 's:/$::')
| 840 | >
| 841 | > # If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
| 842 | > # wants those to be reported)
| 843 | > if [ $SHOW_HTTPS -eq 0 ] && [ "$URL_HTTP" == "$NEW_URL_HTTP" ]; then
| 844 | >    valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because we have not been asked to show http->https upgrades, and we were redirected to $NEW_URL."
| 845 | >    STATUS="OK"
| 846 | >    let OK_LINKS+=1
| 847 | >    let SKIP_HTTPS_UP+=1
| 848 | > # If the URLs match besides an added ending slash, then the link is OK (unless user wants
| 849 | > # those to be reported)
| 850 | > elif [ $SHOW_SLASH -eq 0 ] && [ "$URL_HTTP" == "$NEW_URL_NO_SLASH" ]; then
| 851 | >    valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because we have not been asked to show added trailing slashes, and we were redirected to $NEW_URL."
| 852 |      STATUS="OK"
| 853 |      let OK_LINKS+=1
| 854 | +    let SKIP_SLASH_ADD+=1
| 855 |   else
| 856 |      STATUS="RD"
| 857 |      let RD_LINKS+=1
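Condensing the decision above into one place: a hypothetical helper (names invented; the real script keeps the counters and logging inline) that classifies a redirect as trivial ("OK") or real ("RD"):

    # Classify a redirect: trivial (https upgrade or added slash) -> OK, else RD
    classify_redirect() {
       local old=${1/#https:/http:}   # compare both URLs over http://
       local new=${2/#https:/http:}
       if [ "$old" == "$new" ] || [ "$old" == "${new%/}" ]; then
          echo "OK"
       else
          echo "RD"
       fi
    }

    # e.g. classify_redirect "$URL" "$NEW_URL"

Note that this sketch ignores SHOW_HTTPS and SHOW_SLASH; in the patch, those flags decide whether a trivial redirect is reported as an RD or silently counted as OK.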