103 |  This script parses a list of external links found in the OniGalore wiki
104 |  (which is dumped by the Oni2.net domain periodically in a particular
105 |  format), validates them using the Unix tool 'curl', and produces a report
106 <  of which links were OK (responded to an HTTP query) and which were NG (no
107 <  good). This report can then be automatically uploaded to the location of
106 >  of which links were OK (responded positively to an HTTP query), which
107 >  were RD (responded with a 3xx redirect code), which could be IW (inter-
108 >  wiki) links, and which were NG (no good; a negative response to the
109 >  query). This report can then be automatically uploaded to the location of
110 |  your choice. The script can also suggest Internet Archive snapshots for
111 |  NG links, and take screenshots of OK links for visual verification by the
112 |  reader that the page in question is the one intended to be displayed.
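The heart of the validation step is asking curl for each URL's HTTP response code. As a rough illustration of that step only (not the script's actual code; a couple of the script's own curl calls appear further down in this diff), a single check amounts to something like:

   # Illustrative one-off check of a single URL's response code
   URL="http://www.example.com"
   CODE=$(curl --silent --head --max-time 10 --output /dev/null --write-out '%{http_code}' "$URL")
   echo "$URL returned $CODE"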
122 |                        file:// protocol) (required)
123 |  --output DIR          Place the folder which will contain the reports and
124 |                        optional screenshots at this path (required)
125 <  --exceptions DIR      Don't log an NG link if it is listed in the file
126 <                        provided at this path as long as the response code is
127 <                        the same as the one associated with the link
128 <  --record-ok-links     Log a link in the report whether its response code is
129 <                        in the OK_CODES or the NG_CODES array
125 >  --exceptions URL      In order to remove links from the list which show as
126 >                        NG but which you regard as OK, prepare a plain-text
127 >                        file where each line contains a response code being
128 >                        returned and the URL returning it, separated by a
129 >                        comma, e.g. "403,http://www.example.com" (note that
130 >                        this can be a local file if you use the
131 >                        file:// protocol)
132 >  --record-ok-links     Log a link in the report even if its response code is
133 >                        OK
134 |  --suggest-snapshots   Query the Internet Archive for a possible snapshot
135 |                        URL for each NG page
136 |  --take-screenshots    Save screenshots of each OK page (requires Google
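Putting the options together, a run might be invoked along these lines (the script's file name and the flag that supplies the links list are assumptions here, since that option's name is cut off above; the remaining flags are as documented):

   # Hypothetical invocation; adjust the script name and the links-list flag to match your copy
   ./validate_external_links.sh \
      --links http://lists.example.com/external_links.txt \
      --output ~/link-reports \
      --exceptions file:///home/me/exceptions.txt \
      --suggest-snapshots

where exceptions.txt contains one "code,URL" pair per line, e.g. "403,http://www.example.com".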
421 |  if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
422 |  if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
423 |  if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
424 <  valPrint ctrh "Out of the $LINKS_CHECKED links checked, $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG."
419 <  if [ $IW_LINKS -gt 0 ]; then
420 <     valPrint ctrh "$IW_LINKS/$OK_LINKS OK $(pluralCheckNoun link $OK_LINKS) $(pluralCheckIs $IW_LINKS) $(pluralCheckAn $IW_LINKS)external $(pluralCheckNoun link $IW_LINKS) that could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS)."
421 <  fi
424 >  valPrint ctrh "Out of the $LINKS_CHECKED links checked, $IW_LINKS could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS), $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG."
425 |  if [ $SKIP_EXCEPT -gt 0 ]; then
426 |     valPrint ctrh "$SKIP_EXCEPT/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS) went unlisted due to being found in the exceptions file."
427 |  fi
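The pluralCheckNoun, pluralCheckWas, pluralCheckA, pluralCheckAn, and pluralCheckIs helpers used in these valPrint calls are defined earlier in the script and are not part of this diff; they keep the report grammatical for counts of one versus many. A minimal stand-in for one of them (an assumption about its shape, not the author's actual code) would be:

   # Hypothetical stand-in: print a noun in singular or plural form based on a count
   pluralCheckNoun()
   {
      if [ $2 -eq 1 ]; then
         echo "$1"
      else
         echo "${1}s"
      fi
   }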
448 |  printHTMheader
449 |  
450 |  # Attempt to download file at LINKS_URL, then check that it succeeded
451 <  valPrint ctrh "Downloading list of external links from $LINKS_URL."
451 >  valPrint cwtrh "Downloading list of external links from $LINKS_URL."
452 |  LINKS_FILE_NAME=$(echo "$LINKS_URL" | sed 's/.*\///')
453 |  LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
454 |  curl --silent -o "$LINKS_FILE" $LINKS_URL
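The success check promised by the comment on line 450 falls on lines that are not part of this diff. Under the assumption that an empty or missing download means failure, a minimal version of that check could be:

   # Illustrative only; the script's real check lives in lines not shown here
   if [ ! -s "$LINKS_FILE" ]; then
      echo "Could not download the link list from $LINKS_URL." >&2
      exit 1
   fi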
459 |  
460 |  # Attempt to download file at EXCEPT_URL, then check that it succeeded
461 |  if [ ! -z $EXCEPT_URL ]; then
462 <     valPrint ctrh "Downloading list of NG exceptions from $EXCEPT_URL."
462 >     valPrint cwtrh "Downloading list of NG exceptions from $EXCEPT_URL."
463 |     EXCEPT_FILE_NAME=$(echo "$EXCEPT_URL" | sed 's/.*\///')
464 |     EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
465 |     curl --silent -o "$EXCEPT_FILE" $EXCEPT_URL
557 |  # Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
558 |  NS_NAME=""
559 |  a=0
560 <  while [ "x${NS_IDS[$a]}" != "x" ] # once this evaluates to "x", the array is done
558 <  do
560 >  while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
561 |     if [ $NS_ID -eq ${NS_IDS[$a]} ]; then
562 |        NS_NAME="${NS_NAMES[$a]}"
563 |        break
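NS_IDS and NS_NAMES are parallel arrays defined earlier in the script; the loop walks NS_IDS until it finds the page's namespace number or runs off the end of the array (the "x" comparison). Purely as an illustration of their shape (the wiki's real namespace table is not shown in this diff), they would look something like the standard MediaWiki namespaces:

   # Hypothetical contents; an actual wiki defines its own namespace list
   NS_IDS=(0 1 2 3 4 5 6)
   NS_NAMES=("Main" "Talk" "User" "User_talk" "Project" "Project_talk" "File")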
692 |     CURL_RESULT="$CURL_RESULT-$CURL_ERR"
693 |  fi
694 |  
695 <  # Determine if this code is in our "OK" list
695 >  # Determine our status code for this URL (IW, OK, RD, or NG)
696 |  STATUS="??"
697 |  NEW_URL=""
698 |  INTERWIKI_INDEX=-1
699 <  for CODE in "${OK_CODES[@]}"; do
700 <     if [[ $CODE == $CURL_CODE ]]; then
701 <        let OK_LINKS+=1
702 <  
703 <        # Determine if this is a link to a domain that we have an interwiki prefix for
704 <        for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
703 <           if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]]; then
704 <              STATUS="IW"
705 <              let IW_LINKS+=1
706 <              INTERWIKI_INDEX=$i
707 <              break
708 <           fi
709 <        done
710 <  
711 <        # If this link is OK and no interwiki advisory is needed, just mark as "OK"
712 <        if [ $INTERWIKI_INDEX == -1 ]; then
713 <           STATUS="OK"
714 <        fi
699 >  # First check if this is a link to a domain that we have an interwiki prefix for
700 >  for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
701 >     if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]]; then
702 >        STATUS="IW"
703 >        let IW_LINKS+=1
704 >        INTERWIKI_INDEX=$i
705 |        break
706 |     fi
707 |  done
708 |  
709 +  # If we didn't match an interwiki domain, see if the status code is in our "OK" codes list
710 +  if [ $STATUS == "??" ]; then
711 +     for CODE in "${OK_CODES[@]}"; do
712 +        if [[ $CODE == $CURL_CODE ]]; then
713 +           STATUS="OK"
714 +           let OK_LINKS+=1
715 +           break
716 +        fi
717 +     done
718 +  fi
719 +  
720 |  # If we didn't get a match with the "OK" codes, check it against the "RD" codes
721 |  if [ $STATUS == "??" ]; then
722 |     for CODE in "${RD_CODES[@]}"; do
723 |        if [[ $CODE == $CURL_CODE ]]; then
723 -           STATUS="RD"
724 -           let RD_LINKS+=1
725 -  
724 |           # Get URL header again in order to retrieve the URL we are being redirected to
725 |           NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent '"$AGENT"' --max-time 10 --write-out '%{redirect_url}\n' $URL)
726 |  
727 +           # Check if the redirect URL is just the original URL with https:// instead of http://
728 +           # (this happens a lot and is not an important correction to us); if so, just make it "OK"
729 +           URL_NO_PROTOCOL=${URL#*://}
730 +           NEW_URL_NO_PROTOCOL=${NEW_URL#*://}
731 +           if [ $URL_NO_PROTOCOL == $NEW_URL_NO_PROTOCOL ]; then
732 +              STATUS="OK"
733 +              let OK_LINKS+=1
734 +           else
735 +              STATUS="RD"
736 +              let RD_LINKS+=1
737 +           fi
738 |           break
739 |        fi
740 |     done
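The OK_CODES and RD_CODES arrays consulted here (and the NG_CODES array mentioned in the old --record-ok-links help text) are defined near the top of the script and are not part of this diff. They are plain bash arrays of HTTP response codes; as an illustrative guess at their contents only:

   # Hypothetical values; the script's actual lists may differ
   OK_CODES=(200 401 405 406)          # responses treated as a working link
   RD_CODES=(301 302 303 307 308)      # redirect responses
   NG_CODES=(000 403 404 410 500 503)  # no-good responses (000 = curl could not connect)

Each CURL_CODE is tested in the order shown above (interwiki domains first, then the OK codes, then the RD codes), with anything still unmatched presumably falling through to NG in the code that follows this hunk.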