ViewVC Help
View File | Revision Log | View Changeset | Root Listing
root/Oni2/Validate External Links/validate_external_links.sh
(Generate patch)

Comparing Validate External Links/validate_external_links.sh (file contents):
Revision 1125 by iritscen, Wed Mar 25 21:50:30 2020 UTC vs.
Revision 1127 by iritscen, Sat Mar 28 02:08:29 2020 UTC

# Line 20 | Line 20 | OUTPUT_DIR=""      # place reports and al
20   RECORD_OK_LINKS=0   # record response code to the log even when it's a value in OK_CODES
21   SHOW_SLASH=0        # record response code to the log when a slash is added to the end of a URL
22   SHOW_HTTPS=0        # record response code to the log when "http" is upgraded to "https"
23 + SHOW_YT_RD=0        # record response code to the log when a youtu.be URL is expanded to the full URL
24   SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
25   TAKE_PAGE_SHOT=0    # take a screenshot of each OK page
26   CHROME_PATH=""      # path to a copy of Google Chrome that has the command-line screenshot feature
# Line 47 | Line 48 | declare -a NS_NAMES=("Media" "Special" "
48  
49   # These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
50   # This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
51 < declare -a HTTP_FILES=(txt zip wmv jpg png m4a bsl rar oni mp3 mov ONWC vbs TRMA mp4 doc avi log gif pdf dmg exe cpp tga 7z wav east BINA xml dll dae xaf fbx 3ds blend flv csv)
52 < declare -a HTTP_TLDS_AND_PAGES=(com net org uk ru de it htm html php pl asp aspx shtml pgi cgi php3 x jsp phtml cfm css action stm js)
51 > declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png py rar tga TRMA txt vbs wav wmv xaf xml zip)
52 > declare -a HTTP_TLDS_AND_PAGES=(action ars asp aspx cfm cgi com css de htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x)
53  
54   # These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
55   # are NG (no good). Pages that return OK codes will be screenshotted. Remember to update http_codes.txt
56   # if you add a new code.
57 < declare -a OK_CODES=(200 401 405 406 501)
57 > declare -a OK_CODES=(200 401 405 406 418 501)
58   declare -a RD_CODES=(301 302 303 307 308)
59 < declare -a NG_CODES=(000 403 404 410 500 503)
59 > declare -a NG_CODES=(000 400 403 404 410 500 502 503 530)
60  
61   # Characters not allowed in a URL. Curly braces are sometimes used on the wiki to build a link using
62   # transcluded text, and if the transclusion fails, then the braces show up in the URL.
# Line 87 | Line 88 | SKIP_EXPECT_EI=0
88   SKIP_EXPECT_IW=0
89   SKIP_HTTPS_UP=0
90   SKIP_SLASH_ADD=0
91 + SKIP_YOUTU_BE=0
92   FILE_LINKS=0
93   PAGE_LINKS=0
94   SKIPPED_HEADER_ROW=0
# Line 145 | Line 147 | OPTIONS
147                                 code is "OK".
148         --show-added-slashes    Report on redirects that simply add a '/' to the
149                                 end of the URL.
150 <       --show-https-upgrade    Report on redirects that simply upgrade a
150 >       --show-https-upgrades   Report on redirects that simply upgrade a
151                                 "http://" URL to a "https://" URL.
152 +       --show-yt-redirects     Report on redirects that expand a youtu.be URL.
153         --suggest-snapshots     Query the Internet Archive for a possible
154                                 snapshot URL for each "NG" page.
155         --take-screenshots FILE Call the Google Chrome binary at this path to
# Line 175 | Line 178 | fi
178   # Parse arguments as long as there are more arguments to process
179   while (( "$#" )); do
180     case "$1" in
181 <      --links )              LINKS_URL="$2";                     shift 2;;
182 <      --exceptions )         EXCEPT_URL="$2";                    shift 2;;
183 <      --output )             OUTPUT_DIR="$2";                    shift 2;;
184 <      --record-ok-links )    RECORD_OK_LINKS=1;                  shift;;
185 <      --show-added-slashes ) SHOW_SLASH=1;                       shift;;
186 <      --show-https-upgrade ) SHOW_HTTPS=1;                       shift;;
187 <      --suggest-snapshots )  SUGGEST_SNAPSHOTS=1;                shift;;
188 <      --take-screenshots )   TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
189 <      --start-url )          URL_START=$2;                       shift 2;;
190 <      --end-url )            URL_LIMIT=$2;                       shift 2;;
191 <      --upload )             UPLOAD_INFO=$2;                     shift 2;;
192 <      * )                    echo "Invalid argument $1 detected. Aborting."; exit 1;;
181 >      --links )               LINKS_URL="$2";                     shift 2;;
182 >      --exceptions )          EXCEPT_URL="$2";                    shift 2;;
183 >      --output )              OUTPUT_DIR="$2";                    shift 2;;
184 >      --record-ok-links )     RECORD_OK_LINKS=1;                  shift;;
185 >      --show-added-slashes )  SHOW_SLASH=1;                       shift;;
186 >      --show-https-upgrades ) SHOW_HTTPS=1;                       shift;;
187 >      --show-yt-redirects )   SHOW_YT_RD=1;                       shift;;
188 >      --suggest-snapshots )   SUGGEST_SNAPSHOTS=1;                shift;;
189 >      --take-screenshots )    TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
190 >      --start-url )           URL_START=$2;                       shift 2;;
191 >      --end-url )             URL_LIMIT=$2;                       shift 2;;
192 >      --upload )              UPLOAD_INFO=$2;                     shift 2;;
193 >      * )                     echo "Invalid argument $1 detected. Aborting."; exit 1;;
194    esac
195   done
196  
# Line 448 | Line 452 | function wrapupAndExit()
452     LINK_PROBLEMS=$((EI_LINKS+IW_LINKS+RD_LINKS+NG_LINKS))
453     LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
454     LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
455 <   TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP))
455 >   TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
456     LINKS_CHECKED=$((LINKS_PROCESSED-LINK_ERRORS))
457  
458     # Print summary header
# Line 461 | Line 465 | function wrapupAndExit()
465     if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
466     if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
467     if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS processed $(pluralCheckNoun link $LINK_PROBLEMS) had $(pluralCheckAn $LINK_PROBLEMS)$(pluralCheckNoun issue $LINK_PROBLEMS)"; fi
468 <   if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr "  (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; valPrint h "nbsp;nbsp;(excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
468 >   if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr "  (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; valPrint h "&nbsp;&nbsp;(excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
469     if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi
470     if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr "  (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h "&nbsp;&nbsp;(counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi
471  
# Line 722 | Line 726 | for LINE in `cat "$LINKS_FILE"`; do
726        # Turn off case sensitivity while we compare suffixes
727        shopt -s nocasematch
728  
729 <      # Special case: URLs ending in something like "/productID.304297400" are pages, not files, so if
729 >      # Special case: URLs ending in something like "productID.304297400" are pages, not files, so if
730        # the URL's suffix is all numbers, we are looking at the end of a web page URL
731        if [[ $POST_DOT =~ ^-?[0-9]+$ ]]; then
732           IS_FILE=0
733        fi
734 +
735 +      # Special case: URLs ending in a parenthesis, e.g. "ms537113(v=vs.85)", are pages
736 +      if [[ $POST_DOT =~ ^.*[\(\)]$ ]]; then
737 +         IS_FILE=0
738 +      fi
739 +
740 +      # Special case: URLs containing a '%', e.g. "10.1007%2FBF00329055", are pages
741 +      if [[ $POST_DOT == *%* ]]; then
742 +         IS_FILE=0
743 +      fi
744        
745        # If we did not identify this URL as a web page above, we need to compare the suffix against known
746        # file extensions
# Line 838 | Line 852 | for LINE in `cat "$LINKS_FILE"`; do
852              # merely add an ending slash if the user didn't ask for them
853              NEW_URL_NO_SLASH=$(echo $NEW_URL_HTTP | sed -E 's:/$::')
854  
855 +            # Detect if this is a youtu.be link simply being expanded by YouTube to the full
856 +            # youtube.com address
857 +            YOUTU_BE=0
858 +            if [[ $URL_HTTP == http://youtu.be* ]] && [[ $NEW_URL_HTTP == http://www.youtube.com* ]]; then
859 +               YOUTU_BE=1
860 +            fi
861 +
862              # If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
863              # wants those to be reported)
864              if [ $SHOW_HTTPS -eq 0 ] && [ $URL_HTTP == $NEW_URL_HTTP ]; then
# Line 852 | Line 873 | for LINE in `cat "$LINKS_FILE"`; do
873                 STATUS="OK"
874                 let OK_LINKS+=1
875                 let SKIP_SLASH_ADD+=1
876 +            elif [ $SHOW_YT_RD -eq 0 ] && [ $YOUTU_BE -eq 1 ]; then
877 +               valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to show redirects for youtu.be links, and I was redirected to '$NEW_URL'."
878 +               STATUS="OK"
879 +               let OK_LINKS+=1
880 +               let SKIP_YOUTU_BE+=1
881              else
882                 STATUS="RD"
883                 let RD_LINKS+=1
# Line 874 | Line 900 | for LINE in `cat "$LINKS_FILE"`; do
900  
901     # If we didn't match a known status code, advise the reader
902     if [ $STATUS == "??" ]; then
903 <      valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown return code $CURL_CODE."
903 >      valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I encountered the unknown response code $CURL_CODE."
904        let SKIP_UNK_CODE+=1
905        continue
906     fi

Diff Legend

- Removed lines
+ Added lines
< Changed lines (old)
> Changed lines (new)