103    This script parses a list of external links found in the OniGalore wiki
104    (which is dumped by the Oni2.net domain periodically in a particular
105    format), validates them using the Unix tool 'curl', and produces a report
106 <  of which links were OK (responded to an HTTP query) and which were NG (no
107 <  good). This report can then be automatically uploaded to the location of
106 >  of which links were OK (responded positively to an HTTP query), which
107 >  were RD (responded with a 3xx redirect code), which could be IW (inter-
108 >  wiki) links, and which were NG (no good; a negative response to the
109 >  query). This report can then be automatically uploaded to the location of
110    your choice. The script can also suggest Internet Archive snapshots for
111    NG links, and take screenshots of OK links for visual verification by the
112    reader that the page in question is the one intended to be displayed.
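
Each verdict above comes from probing the link with curl and reading back the HTTP response code. A minimal sketch of such a probe, assuming a plain HEAD request (the standalone helper below is illustrative, not a function from the script):

   # Hedged sketch: print the HTTP response code for a URL ("000" means no response).
   probe_url() {
      curl -o /dev/null --silent --insecure --head --max-time 10 \
           --write-out '%{http_code}' "$1"
   }
   probe_url "http://www.example.com"   # prints e.g. "200"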
122                          file:// protocol) (required)
123    --output DIR          Place the folder which will contain the reports and
124                          optional screenshots at this path (required)
125 <  --exceptions DIR      Don't log an NG link if it is listed in the file
126 <                        provided at this path as long as the response code is
127 <                        the same as the one associated with the link
128 <  --record-ok-links     Log a link in the report whether its response code is
129 <                        in the OK_CODES or the NG_CODES array
125 >  --exceptions URL      In order to remove links from the list which show as
126 >                        NG but which you regard as OK, prepare a plain-text
127 >                        file where each line contains a response code being
128 >                        returned and the URL returning it, separated by a
129 >                        comma, e.g. "403,http://www.example.com" (note that
130 >                        this can be a local file if you use the
131 >                        file:// protocol)
132 >  --record-ok-links     Log a link in the report even if its response code is
133 >                        OK
134    --suggest-snapshots   Query the Internet Archive for a possible snapshot
135                          URL for each NG page
136    --take-screenshots    Save screenshots of each OK page (requires Google
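
To illustrate the new --exceptions format, a hypothetical exceptions file (invented URLs) holds one "code,URL" pair per line:

   403,http://www.example.com/members-only
   404,http://www.example.com/retired-page

The response code is recorded alongside the URL so that, presumably as in the old --exceptions behavior, a link is only skipped while it still returns that exact code; a page whose failure mode changes will show up in the report again.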
421    if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
422    if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
423    if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
424 <  valPrint ctrh "Out of the $LINKS_CHECKED links checked, $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG."
425 <  if [ $IW_LINKS -gt 0 ]; then
426 <     valPrint ctrh "$IW_LINKS/$OK_LINKS OK $(pluralCheckNoun link $OK_LINKS) $(pluralCheckIs $IW_LINKS) $(pluralCheckAn $IW_LINKS)external $(pluralCheckNoun link $IW_LINKS) that could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS)."
427 <  fi
424 >  valPrint ctrh "Out of the $LINKS_CHECKED links checked, $IW_LINKS could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS), $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG."
425    if [ $SKIP_EXCEPT -gt 0 ]; then
426       valPrint ctrh "$SKIP_EXCEPT/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS) went unlisted due to being found in the exceptions file."
427    fi
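
These summary lines lean on small pluralization helpers (pluralCheckNoun, pluralCheckWas, and friends) defined outside this hunk. Their exact bodies aren't shown; a rough guess at their behavior, for readers tracing the strings:

   pluralCheckNoun() {   # "pluralCheckNoun URL 2" -> "URLs" (guessed implementation)
      if [ $2 -eq 1 ]; then echo "$1"
      elif [[ $1 == *x ]]; then echo "${1}es"   # "suffix" -> "suffixes"
      else echo "${1}s"; fi
   }
   pluralCheckWas() {   # "pluralCheckWas 1" -> "was" (guessed implementation)
      if [ $1 -eq 1 ]; then echo "was"; else echo "were"; fi
   }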
448    printHTMheader
449
450    # Attempt to download file at LINKS_URL, then check that it succeeded
451 <  valPrint ctrh "Downloading list of external links from $LINKS_URL."
451 >  valPrint cwtrh "Downloading list of external links from $LINKS_URL."
452    LINKS_FILE_NAME=$(echo "$LINKS_URL" | sed 's/.*\///')
453    LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
454    curl --silent -o "$LINKS_FILE" $LINKS_URL
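
The sed expression keeps everything after the last slash in the URL, i.e. the remote file's base name (basename "$LINKS_URL" would do the same):

   $ echo "http://www.example.com/dumps/links.txt" | sed 's/.*\///'
   links.txt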
459
460    # Attempt to download file at EXCEPT_URL, then check that it succeeded
461    if [ ! -z $EXCEPT_URL ]; then
462 <     valPrint ctrh "Downloading list of NG exceptions from $EXCEPT_URL."
462 >     valPrint cwtrh "Downloading list of NG exceptions from $EXCEPT_URL."
463       EXCEPT_FILE_NAME=$(echo "$EXCEPT_URL" | sed 's/.*\///')
464       EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
465       curl --silent -o "$EXCEPT_FILE" $EXCEPT_URL
557    # Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
558    NS_NAME=""
559    a=0
560 <  while [ "x${NS_IDS[$a]}" != "x" ] # once this evaluates to "x", the array is done
561 <  do
560 >  while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
561       if [ $NS_ID -eq ${NS_IDS[$a]} ]; then
562          NS_NAME="${NS_NAMES[$a]}"
563          break
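
The loop walks NS_IDS until indexing past the array's end expands to an empty string; prefixing both sides with "x" is the classic portable way to test that. A self-contained version with invented array contents:

   NS_IDS=(0 1 2)
   NS_NAMES=("Main" "Talk" "User")
   NS_ID=2
   NS_NAME=""
   a=0
   while [ "x${NS_IDS[$a]}" != "x" ]; do   # empty expansion ends the walk
      if [ $NS_ID -eq ${NS_IDS[$a]} ]; then
         NS_NAME="${NS_NAMES[$a]}"
         break
      fi
      let a+=1
   done
   echo "$NS_NAME"   # prints "User"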
692       CURL_RESULT="$CURL_RESULT-$CURL_ERR"
693    fi
694
695 <  # Determine if this code is in our "OK" list
695 >  # Determine our status code for this URL (IW, OK, RD, or NG)
696    STATUS="??"
697    NEW_URL=""
698    INTERWIKI_INDEX=-1
699 <  for CODE in "${OK_CODES[@]}"; do
700 <     if [[ $CODE == $CURL_CODE ]]; then
701 <        let OK_LINKS+=1
702 <
703 <        # Determine if this is a link to a domain that we have an interwiki prefix for
704 <        for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
705 <           if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]]; then
706 <              STATUS="IW"
707 <              let IW_LINKS+=1
708 <              INTERWIKI_INDEX=$i
709 <              break
710 <           fi
711 <        done
712 <
713 <        # If this link is OK and no interwiki advisory is needed, just mark as "OK"
714 <        if [ $INTERWIKI_INDEX == -1 ]; then
715 <           STATUS="OK"
716 <        fi
699 >  # First check if this is a link to a domain that we have an interwiki prefix for
700 >  for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
701 >     if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]]; then
702 >        STATUS="IW"
703 >        let IW_LINKS+=1
704 >        INTERWIKI_INDEX=$i
705 >        break
706 >     fi
707 >  done
708
709 +  # If we didn't match an interwiki domain, see if the status code is in our "OK" codes list
710 +  if [ $STATUS == "??" ]; then
711 +     for CODE in "${OK_CODES[@]}"; do
712 +        if [[ $CODE == $CURL_CODE ]]; then
713 +           STATUS="OK"
714 +           let OK_LINKS+=1
715 +           break
716 +        fi
717 +     done
718 +  fi
719 +
720    # If we didn't get a match with the "OK" codes, check it against the "RD" codes
721    if [ $STATUS == "??" ]; then
722       for CODE in "${RD_CODES[@]}"; do
723          if [[ $CODE == $CURL_CODE ]]; then
723 -           STATUS="RD"
724 -           let RD_LINKS+=1
725 -
724             # Get URL header again in order to retrieve the URL we are being redirected to
725             NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent '"$AGENT"' --max-time 10 --write-out '%{redirect_url}\n' $URL)
726
727 +           # Check if the redirect URL is just the original URL with https:// instead of http://
728 +           # (this happens a lot and is not an important correction to us); if so, just make it "OK"
729 +           URL_NO_PROTOCOL=${URL#*://}
730 +           NEW_URL_NO_PROTOCOL=${NEW_URL#*://}
731 +           if [ $URL_NO_PROTOCOL == $NEW_URL_NO_PROTOCOL ]; then
732 +              STATUS="OK"
733 +              let OK_LINKS+=1
734 +           else
735 +              STATUS="RD"
736 +              let RD_LINKS+=1
737 +           fi
738             break
739          fi
740       done
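
The new https-upgrade test works because ${VAR#*://} strips the shortest prefix ending in "://", so only scheme-less URLs are compared. A quick illustration with made-up URLs:

   URL="http://www.example.com/page"
   NEW_URL="https://www.example.com/page"
   URL_NO_PROTOCOL=${URL#*://}           # "www.example.com/page"
   NEW_URL_NO_PROTOCOL=${NEW_URL#*://}   # "www.example.com/page"
   [ "$URL_NO_PROTOCOL" == "$NEW_URL_NO_PROTOCOL" ] && echo "scheme-only redirect; counts as OK"

One aside on the unchanged curl call at line 725: --user-agent '"$AGENT"' is single-quoted, so curl sends the literal string "$AGENT", quotes included, rather than the variable's value; if expansion is intended, --user-agent "$AGENT" would do it.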