ViewVC Help
View File | Revision Log | View Changeset | Root Listing
root/Oni2/Validate External Links/validate_external_links.sh
(Generate patch)

Comparing Validate External Links/validate_external_links.sh (file contents):
Revision 1067 by iritscen, Tue Aug 1 17:09:42 2017 UTC vs.
Revision 1069 by iritscen, Wed Aug 2 04:26:48 2017 UTC

# Line 103 | Line 103 | DESCRIPTION
103         This script parses a list of external links found in the OniGalore wiki
104         (which is dumped by the Oni2.net domain periodically in a particular
105         format), validates them using the Unix tool 'curl', and produces a report
106 <       of which links were OK (responded to an HTTP query) and which were NG (no
107 <       good). This report can then be automatically uploaded to the location of
106 >       of which links were OK (responded positively to an HTTP query), which
107 >       were RD (responded with a 3xx redirect code), which could be IW (inter-
108 >       wiki) links, and which were NG (no good; a negative response to the
109 >       query). This report can then be automatically uploaded to the location of
110         your choice. The script can also suggest Internet Archive snapshots for
111         NG links, and take screenshots of OK links for visual verification by the
112         reader that the page in question is the one intended to be displayed.
# Line 120 | Line 122 | OPTIONS
122                             file:// protocol) (required)
123         --output DIR        Place the folder which will contain the reports and
124                             optional screenshots at this path (required)
125 <       --exceptions DIR    Don't log an NG link if it is listed in the file
126 <                           provided at this path as long as the response code is
127 <                           the same as the one associated with the link
128 <       --record-ok-links   Log a link in the report whether its response code is
129 <                           in the OK_CODES or the NG_CODES array
125 >       --exceptions URL    In order to remove links from the list which show as
126 >                           NG but which you regard as OK, prepare a plain-text
127 >                           file where each line contains a response code being
128 >                           returned and the URL returning it, separated by a
129 >                           comma, e.g. "403,http://www.example.com" (note that
130 >                           this can be a local file if you use the
131 >                           file:// protocol)
132 >       --record-ok-links   Log a link in the report even if its response code is
133 >                           OK
134         --suggest-snapshots Query the Internet Archive for a possible snapshot
135                             URL for each NG page
136         --take-screenshots  Save screenshots of each OK page (requires Google
# Line 415 | Line 421 | function wrapupAndExit()
421     if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
422     if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
423     if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
424 <   valPrint ctrh "Out of the $LINKS_CHECKED links checked, $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG."
419 <   if [ $IW_LINKS -gt 0 ]; then
420 <      valPrint ctrh "$IW_LINKS/$OK_LINKS OK $(pluralCheckNoun link $OK_LINKS) $(pluralCheckIs $IW_LINKS) $(pluralCheckAn $IW_LINKS)external $(pluralCheckNoun link $IW_LINKS) that could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS)."
421 <   fi
424 >   valPrint ctrh "Out of the $LINKS_CHECKED links checked, $IW_LINKS could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS), $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG."
425     if [ $SKIP_EXCEPT -gt 0 ]; then
426        valPrint ctrh "$SKIP_EXCEPT/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS) went unlisted due to being found in the exceptions file."
427     fi
# Line 445 | Line 448 | printRTFheader
448   printHTMheader
449  
450   # Attempt to download file at LINKS_URL, then check that it succeeded
451 < valPrint ctrh "Downloading list of external links from $LINKS_URL."
451 > valPrint cwtrh "Downloading list of external links from $LINKS_URL."
452   LINKS_FILE_NAME=$(echo "$LINKS_URL" | sed 's/.*\///')
453   LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
454   curl --silent -o "$LINKS_FILE" $LINKS_URL
# Line 456 | Line 459 | fi
459  
460   # Attempt to download file at EXCEPT_URL, then check that it succeeded
461   if [ ! -z $EXCEPT_URL ]; then
462 <   valPrint ctrh "Downloading list of NG exceptions from $EXCEPT_URL."
462 >   valPrint cwtrh "Downloading list of NG exceptions from $EXCEPT_URL."
463     EXCEPT_FILE_NAME=$(echo "$EXCEPT_URL" | sed 's/.*\///')
464     EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
465     curl --silent -o "$EXCEPT_FILE" $EXCEPT_URL
# Line 554 | Line 557 | for LINE in `cat "$LINKS_FILE"`; do
557     # Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
558     NS_NAME=""
559     a=0
560 <   while [ "x${NS_IDS[$a]}" != "x" ] # once this evaluates to "x", the array is done
558 <   do
560 >   while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
561        if [ $NS_ID -eq ${NS_IDS[$a]} ]; then
562           NS_NAME="${NS_NAMES[$a]}"
563           break
# Line 690 | Line 692 | for LINE in `cat "$LINKS_FILE"`; do
692        CURL_RESULT="$CURL_RESULT-$CURL_ERR"
693     fi
694  
695 <   # Determine if this code is in our "OK" list
695 >   # Determine our status code for this URL (IW, OK, RD, or NG)
696     STATUS="??"
697     NEW_URL=""
698     INTERWIKI_INDEX=-1
699 <   for CODE in "${OK_CODES[@]}"; do
700 <      if [[ $CODE == $CURL_CODE ]]; then
701 <         let OK_LINKS+=1
702 <
703 <         # Determine if this is a link to a domain that we have an interwiki prefix for
704 <         for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
703 <            if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]]; then
704 <               STATUS="IW"
705 <               let IW_LINKS+=1
706 <               INTERWIKI_INDEX=$i
707 <               break
708 <            fi
709 <         done
710 <
711 <         # If this link is OK and no interwiki advisory is needed, just mark as "OK"
712 <         if [ $INTERWIKI_INDEX == -1 ]; then
713 <            STATUS="OK"
714 <         fi
699 >   # First check if this is a link to a domain that we have an interwiki prefix for
700 >   for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
701 >      if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]]; then
702 >         STATUS="IW"
703 >         let IW_LINKS+=1
704 >         INTERWIKI_INDEX=$i
705           break
706        fi
707     done
708  
709 +   # If we didn't match an interwiki domain, see if the status code is in our "OK" codes list
710 +   if [ $STATUS == "??" ]; then
711 +      for CODE in "${OK_CODES[@]}"; do
712 +         if [[ $CODE == $CURL_CODE ]]; then
713 +            STATUS="OK"
714 +            let OK_LINKS+=1
715 +            break
716 +         fi
717 +      done
718 +   fi
719 +
720     # If we didn't get a match with the "OK" codes, check it against the "RD" codes
721     if [ $STATUS == "??" ]; then
722        for CODE in "${RD_CODES[@]}"; do
723           if [[ $CODE == $CURL_CODE ]]; then
723 -            STATUS="RD"
724 -            let RD_LINKS+=1
725 -
724              # Get URL header again in order to retrieve the URL we are being redirected to
725              NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent '"$AGENT"' --max-time 10 --write-out '%{redirect_url}\n' $URL)
726  
727 +            # Check if the redirect URL is just the original URL with https:// instead of http://
728 +            # (this happens a lot and is not an important correction to us); if so, just make it "OK"
729 +            URL_NO_PROTOCOL=${URL#*://}
730 +            NEW_URL_NO_PROTOCOL=${NEW_URL#*://}
731 +            if [ $URL_NO_PROTOCOL == $NEW_URL_NO_PROTOCOL ]; then
732 +               STATUS="OK"
733 +               let OK_LINKS+=1
734 +            else
735 +               STATUS="RD"
736 +               let RD_LINKS+=1
737 +            fi
738              break
739           fi
740        done

Diff Legend

- Removed lines
+ Added lines
< Changed lines (old)
> Changed lines (new)