Comparing Validate External Links/validate_external_links.sh (file contents):
Revision 1135 by iritscen, Sun Jul 12 23:57:00 2020 UTC vs.
Revision 1136 by iritscen, Mon Jul 20 15:58:39 2020 UTC

# Line 15 | Line 15 | IFS="
15   ### GLOBALS ###
16   # Settings -- these will be changed from their defaults by the arguments passed in to the script
17   LINKS_URL=""         # use 'curl' to download file with links from this location (can be file://)
18 < EXCEPT_URL=""        # ditto above for file with exceptions to NG results
18 > EXCEPT_URL=""        # 'curl' will access this wiki page with a list of exceptions for NG results
19   OUTPUT_DIR=""        # place reports and all other output in a folder inside this existing folder
20   RECORD_OK_LINKS=0    # record response code to the log even when it's a value in OK_CODES
21   SHOW_SLASH=0         # record response code to the log when a slash is added to the end of a URL
# Line 30 | Line 30 | URL_LIMIT=0          # if non-zero, stop
30   UPLOAD_INFO=""       # path to a file on your hard drive with the login info needed to upload a report
31  
32   # Fixed strings -- see the occurrences of these variables to learn their purpose
33 < AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36 OPR/67.0.3575.53"
33 > AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36 OPR/69.0.3686.77"
34   ARCHIVE_API="http://archive.org/wayback/available"
35   ARCHIVE_GENERIC="https://web.archive.org/web/*"
36   ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
37   CHROME_SCREENSHOT="screenshot.png"
38   CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt"
39 + EXCEPT_FILE_NAME="exceptions.txt"
40   EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
41   HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt"
42   MY_WIKI_PAGE="https://wiki.oni2.net/User:Iritscen"
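(The updated AGENT value above is a browser user-agent string, presumably sent with each link check so that sites respond as they would to a normal browser; the actual occurrences are outside this hunk, per the comment at line 32. A minimal sketch of such a check, using standard 'curl' options and the script's variable names, is an assumption rather than the script's exact call:

   # Hypothetical usage: fetch only the HTTP response code for a URL,
   # identifying ourselves with the browser user-agent defined in AGENT
   CURL_RESULT=$(curl --silent --output /dev/null --write-out '%{http_code}' \
      --user-agent "$AGENT" "$URL")
)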
# Line 112 | Line 113 | NAME
113   SYNOPSIS
114         validate_external_links.sh --help
115         validate_external_links.sh --links URL --output DIR [--exceptions URL]
116 <          [--record-ok-links] [--suggest-snapshots] [--take-screenshots FILE]
117 <          [--start-url NUM] [--end-url NUM] [--upload FILE]
116 >          [--record-ok-links] [--show-added-slashes] [--show-https-upgrades]
117 >          [--show-yt-redirects] [--suggest-snapshots] [--skip-archive-links]
118 >          [--take-screenshots FILE] [--start-url NUM] [--end-url NUM]
119 >          [--upload FILE]
120  
121   DESCRIPTION
122         This script parses a list of external links found in the OniGalore wiki
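(To make the revised synopsis concrete, a minimal invocation with the two required arguments plus the new exceptions page might look like the following; the URLs and output directory here are hypothetical:

   ./validate_external_links.sh --links http://example.com/val/links.txt \
      --output ~/val-reports \
      --exceptions https://wiki.oni2.net/User:Example/Exceptions
)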
# Line 140 | Line 143 | OPTIONS
143         --output DIR            (required) Unix path to directory in which Val
144                                 should place its reports.
145         --exceptions URL        In order to remove links from the report which
146 <                               Val finds an issue with, but which you regard as
147 <                               OK, list those desired exceptions in this file.
148 <                               See the sample file exceptions.txt for details.
149 <                               Note that this URL can point to a local file if
150 <                               you supply a file:// path.
146 >                               Val finds an issue with but which you regard as
147 >                               OK, list those desired exceptions on a wiki page.
148 >                               See the sample file "exceptions.pdf" for the
149 >                               required format of the page. Note that this URL
150 >                               can point to a local file if you supply a path
151 >                               beginning with "file://".
152         --record-ok-links       Log a link in the report even if its response
153                                 code is "OK".
154         --show-added-slashes    Report on redirects that simply add a '/' to the
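(Judging from the parsing added later in this diff, each entry on the exceptions page is a comma-separated line of the form CODE,URL,PAGE, where '*' is accepted as a wildcard for the page field, and the whole list sits between BEGIN LIST and END LIST markers. A page body might therefore look like the sample below; this is inferred from the code, and the authoritative format is the one documented in "exceptions.pdf":

   BEGIN LIST
   404,http://example.com/dead-page,Some_Article
   IW,http://example.com/interwiki-target,*
   END LIST
)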
# Line 537 | Line 541 | fi
541   # Attempt to download file at EXCEPT_URL, then check that it succeeded
542   if [ ! -z $EXCEPT_URL ]; then
543     valPrint cwtrh "Downloading list of reporting exceptions from $EXCEPT_URL."
544 <   EXCEPT_FILE_NAME=$(echo "$EXCEPT_URL" | sed 's/.*\///')
545 <   EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
546 <   curl --silent -o "$EXCEPT_FILE" $EXCEPT_URL
547 <   if [ ! -f "$EXCEPT_FILE" ]; then
548 <      echo "The download of $EXCEPT_URL appears to have failed. Aborting."
544 >   EXCEPT_DATA=$(curl --silent $EXCEPT_URL)
545 >   if [ -z "$EXCEPT_DATA" ]; then
546 >      echo "The download of the exceptions data from '$EXCEPT_URL' appears to have failed. Aborting."
547        wrapupAndExit
548     fi
549 +   EXCEPT_DATA=${EXCEPT_DATA%END LIST*}
550 +   EXCEPT_DATA=${EXCEPT_DATA#*BEGIN LIST}
551 +   EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
552 +
553 +   # Store on disk for debugging purposes
554 +   echo "$EXCEPT_DATA" > "$EXCEPT_FILE"
555 +
556 +   # Transfer to array for easy searching later
557 +   declare -a EXCEPT_ARRAY=($(echo "$EXCEPT_DATA"))
558   fi
559  
560   # Pipe 'cat' to 'wc' because passing LINKS_FILE to 'wc' would print the file name after the line count
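(The two parameter expansions added at new lines 549-550 trim away everything outside the BEGIN LIST/END LIST markers, stripping the wiki page's surrounding markup from the downloaded data. A standalone sketch of that trimming, with assumed sample data:

   DATA="page header BEGIN LIST
   404,http://example.com/dead,Some_Page
   END LIST page footer"
   DATA=${DATA%END LIST*}    # drop "END LIST" and everything after it
   DATA=${DATA#*BEGIN LIST}  # drop everything up to and including "BEGIN LIST"
   echo "$DATA"              # only the exception lines remain

Because the script appears to set IFS to a newline near the top (see the first hunk header), the unquoted expansion in 'declare -a EXCEPT_ARRAY=(...)' then splits the data into one array element per line rather than one per word.)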
# Line 563 | Line 574 | else
574   fi
575  
576   # Print settings to console and log
577 < declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not report links that are listed in the exceptions file." "I will ignore URLs that simply have ending slashes added onto them." "I will ignore URLs that only upgrade from HTTP to HTTPS." "I will ignore youtu.be links that are merely being expanded." "I will not check the validity of Internet Archive snapshot URLs.")
577 > declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not report links that are in the exceptions list." "I will ignore URLs that simply have ending slashes added onto them." "I will ignore URLs that only upgrade from HTTP to HTTPS." "I will ignore youtu.be links that are merely being expanded." "I will not check the validity of Internet Archive snapshot URLs.")
578   if [ $TAKE_PAGE_SHOT -eq 0 ]; then SETTINGS_MSG[10]="but will not"; fi
579   if [ $RECORD_OK_LINKS -eq 0 ]; then SETTINGS_MSG[22]="not"; fi
580   if [ $SUGGEST_SNAPSHOTS -eq 0 ]; then SETTINGS_MSG[26]="will not"; fi
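(The SETTINGS_MSG declaration relies on bash treating each unquoted word as its own array element, so the lines that follow can flip the meaning of the message by replacing single elements by index, while the quoted phrases such as "and will" and the full quoted sentences stay intact as single elements. A reduced sketch of the same trick, with a simplified message assumed here:

   declare -a MSG=(I "will" take a screenshot of each page.)
   TAKE_PAGE_SHOT=0
   if [ $TAKE_PAGE_SHOT -eq 0 ]; then MSG[1]="will not"; fi
   echo "${MSG[@]}"   # prints: I will not take a screenshot of each page.
)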
# Line 922 | Line 933 | for LINE in `cat "$LINKS_FILE"`; do
933        continue
934     fi
935  
936 <   # Check problem links against exceptions file before proceeding
936 >   # Check problem links against exceptions list before proceeding
937 >   FOUND_EXCEPT=0
938     if [ $STATUS != "OK" ] && [ ! -z $EXCEPT_URL ]; then
939        # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
940        EXPECT_CODE="$CURL_RESULT"
# Line 932 | Line 944 | for LINE in `cat "$LINKS_FILE"`; do
944           EXPECT_CODE="IW"
945        fi
946  
947 <      # Look for link in exceptions file and make sure its listed result code and wiki page also match
948 <      GREP_RESULT=$(grep --max-count=1 "$URL" "$EXCEPT_FILE")
949 <      EXCEPT_PAGE=${GREP_RESULT##*,}
950 <      if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == $LOCAL_PAGE_PATH ]; then
951 <         EXCEPT_CODE=${GREP_RESULT%%,*}
952 <         if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
953 <            valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because its expected result, $EXPECT_CODE, is listed in the exceptions file."
954 <            if [ $STATUS == "EI" ]; then
955 <               let SKIP_EXPECT_EI+=1
956 <            elif [ $STATUS == "IW" ]; then
957 <               let SKIP_EXPECT_IW+=1
958 <            else
959 <               let SKIP_EXPECT_NG+=1
960 <            fi
947 >      # Look for link in exceptions list and make sure the listed result code and wiki page also match
948 >      for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
949 >      {
950 >         EXCEPT_LINE="${EXCEPT_ARRAY[$i]}"
951 >
952 >         # Match URL
953 >         EXCEPT_URL="${EXCEPT_LINE#*,}"
954 >         EXCEPT_URL="${EXCEPT_URL%,*}"
955 >         if [ "$EXCEPT_URL" != "$URL" ]; then
956              continue
957           fi
958 <      fi
958 >
959 >         # Match containing page's name
960 >         EXCEPT_PAGE="${EXCEPT_LINE##*,}"
961 >         EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
962 >         if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == $LOCAL_PAGE_PATH ]; then
963 >            # Match result code
964 >            EXCEPT_CODE=${EXCEPT_LINE%%,*}
965 >            if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
966 >               valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because its expected result, '$EXPECT_CODE', is in the exceptions list."
967 >               if [ $STATUS == "EI" ]; then
968 >                  let SKIP_EXPECT_EI+=1
969 >               elif [ $STATUS == "IW" ]; then
970 >                  let SKIP_EXPECT_IW+=1
971 >               else
972 >                  let SKIP_EXPECT_NG+=1
973 >               fi
974 >               FOUND_EXCEPT=1
975 >               break
976 >            fi
977 >         fi
978 >      } done
979 >   fi
980 >   if [ $FOUND_EXCEPT -eq 1 ]; then
981 >      continue
982     fi
983  
984     # If appropriate, record this link to the log, with clickable URLs when possible
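(The parameter expansions in the new matching loop pull the three comma-separated fields out of each exception entry without spawning 'grep' or 'cut' per field. Because the URL is isolated by stripping one field from each end rather than by counting commas, a URL that itself contains commas still parses correctly. An illustration with an assumed sample line:

   LINE="404,http://example.com/page,with,commas,Some_Article"
   CODE=${LINE%%,*}   # "404" -- everything before the first comma
   REST=${LINE#*,}    # strip the leading code field
   URL=${REST%,*}     # "http://example.com/page,with,commas" -- drop only the last field
   PAGE=${LINE##*,}   # "Some_Article" -- everything after the last comma
)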

Diff Legend

- Removed lines
+ Added lines
< Changed lines (old)
> Changed lines (new)