root/Oni2/Validate External Links/validate_external_links.sh

Comparing Validate External Links/validate_external_links.sh (file contents):
Revision 1160 by iritscen, Sun Aug 15 14:20:21 2021 UTC vs.
Revision 1175 by iritscen, Tue Aug 23 14:15:48 2022 UTC

# Line 29 | Line 29 | IFS="
29  
30   ### GLOBALS ###
31   # Settings -- these will be changed from their defaults by the arguments passed in to the script
32 < LINKS_URL=""           # use 'curl' to download file with links from this location (can be file://)
33 < EXCEPT_URL=""          # 'curl' will access this wiki page with a list of exceptions for NG results
32 > LINKS_URL=""           # download external link CSV from this location (can use "file://" protocol)
33 > EXCEPT_URL=""          # location of wiki page with a list of exceptions for NG results
34   OUTPUT_DIR=""          # place reports and all other output in a folder inside this existing folder
35   RECORD_OK_LINKS=0      # record response code to the log even when it's a value in OK_CODES
36   SHOW_SLASH=0           # record issue when a slash is added to the end of a URL
# Line 47 | Line 47 | URL_LIMIT=0            # if non-zero, st
47   UPLOAD_INFO=""         # path to a file on your hard drive with the login info needed to upload a report
48  
49   # Fixed strings -- see the occurrences of these variables to learn their purpose
50 < AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/88.0.4324.146 Safari/537.36"
50 > AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.134 Safari/537.36"
51   ARCHIVE_API="http://archive.org/wayback/available"
52   ARCHIVE_GENERIC="https://web.archive.org/web/*"
53   ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
# Line 68 | Line 68 | declare -a NS_NAMES=("Media" "Special" "
68  
69   # These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
70   # This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
71 < declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png psd py rar tga TRMA txt vbs wav wmv xaf xcf xml zip)
71 > declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png psd py rar tga TRMA txt vbs wav wmv xaf xcf xlsx xml zip)
72   declare -a HTTP_TLDS_AND_PAGES=(abstract action ars asp aspx cfm cgi com css de do full htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x)
73  
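
A minimal sketch of how a URL suffix might be tested against these two arrays, using the case-insensitive matching that the script's own code toggles via 'shopt nocasematch'; the helper name 'classifySuffix' is invented for illustration and is not part of the script:

function classifySuffix()
{
   local POST_DOT="$1" SUFFIX
   shopt -s nocasematch # compare case-insensitively, so "TXT" matches "txt"
   for SUFFIX in "${HTTP_FILES[@]}"; do
      if [[ $POST_DOT == $SUFFIX ]]; then shopt -u nocasematch; echo "file"; return; fi
   done
   for SUFFIX in "${HTTP_TLDS_AND_PAGES[@]}"; do
      if [[ $POST_DOT == $SUFFIX ]]; then shopt -u nocasematch; echo "page"; return; fi
   done
   shopt -u nocasematch
   echo "unknown" # caller should report the unrecognized suffix
}
classifySuffix xlsx # prints "file", given the HTTP_FILES array above
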
74   # These arrays tell us which HTTP response codes are OK (good), which are RD (redirections), and which
# Line 562 | Line 562 | function wrapupAndExit()
562     if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
563     if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi
564  
565 +   # Perform exceptions audit
566 +   EXCEPTION_ISSUES=0
567 +   valPrint ctrh "Exceptions list audit:"
568 +   for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
569 +      EXCEPT_LINE="${EXCEPT_ARRAY[$i]}"
570 +      EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/\&amp;/\&/g') # copied from exception-matching code
571 +
572 +      if [ ${EXCEPT_FOUND[$i]} -eq 0 ]; then
573 +         EXCEPT_URL="${EXCEPT_LINE#*,}"
574 +         EXCEPT_URL="${EXCEPT_URL%,*}"
575 +         EXCEPT_PAGE="${EXCEPT_LINE##*,}"
576 +         EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
577 +         if [ "$EXCEPT_PAGE" == "*" ]; then
578 +            valPrint tr "- The link '$EXCEPT_URL' did not return an error or is not on any page."
579 +         else
580 +            valPrint tr "- The link '$EXCEPT_URL' did not return an error or is not on page '$EXCEPT_PAGE'."
581 +         fi
582 +         let EXCEPTION_ISSUES+=1
583 +      elif [ ${EXCEPT_USED[$i]} -eq 0 ]; then
584 +         EXCEPT_URL="${EXCEPT_LINE#*,}"
585 +         EXCEPT_URL="${EXCEPT_URL%,*}"
586 +         EXCEPT_CODE=${EXCEPT_LINE%%,*}
587 +         valPrint tr "- The link '$EXCEPT_URL' did not return error code $EXCEPT_CODE."
588 +         let EXCEPTION_ISSUES+=1
589 +      fi
590 +   done
591 +   if [ $EXCEPTION_ISSUES -eq 0 ]; then
592 +      valPrint ctrh "- No issues found."
593 +   else
594 +      valPrint c "$EXCEPTION_ISSUES exception list $(pluralCheckNoun issue $EXCEPTION_ISSUES) detected (see RTF or TXT report for details)."
595 +      valPrint h "$EXCEPTION_ISSUES exception list $(pluralCheckNoun issue $EXCEPTION_ISSUES) detected (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for details)."
596 +   fi
597 +
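
The parameter expansions in the audit above pull the three fields out of an exception line of the form 'code,URL,page'. A minimal sketch of how they behave, with an invented sample line (the code, URL, and page are illustrative only):

EXCEPT_LINE="404,http://example.com/old,Main_Page"
EXCEPT_CODE=${EXCEPT_LINE%%,*}   # drop everything from the first ','  -> "404"
EXCEPT_URL="${EXCEPT_LINE#*,}"   # drop through the first ','          -> "http://example.com/old,Main_Page"
EXCEPT_URL="${EXCEPT_URL%,*}"    # drop from the last ','              -> "http://example.com/old"
EXCEPT_PAGE="${EXCEPT_LINE##*,}" # drop through the last ','           -> "Main_Page"
EXCEPT_PAGE="${EXCEPT_PAGE%% *}" # drop anything after the first space

Because the URL is delimited by the first and the last comma, a URL that itself contains commas still parses correctly, as long as the code and page fields contain none.
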
598     # Print checked link totals
599     if [ $LINK_PROBLEMS_NET -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS_NET link $(pluralCheckNoun issue $LINK_PROBLEMS_NET):"; fi
600     if [ $LINK_PROBLEMS_NG -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_NG NG $(pluralCheckNoun link $LINK_PROBLEMS_NG)"; fi
# Line 629 | Line 662 | if [ ! -z $EXCEPT_URL ]; then
662  
663     # Transfer to array for easy searching later
664     declare -a EXCEPT_ARRAY=($(echo "$EXCEPT_DATA"))
665 +
666 +   # Create parallel arrays for marking which exceptions get used later
667 +   declare -a EXCEPT_USED=()
668 +   declare -a EXCEPT_FOUND=()
669 +   for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do
670 +      EXCEPT_USED+=(0)
671 +      EXCEPT_FOUND+=(0)
672 +   done
673   fi
674  
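
The two arrays added here mirror EXCEPT_ARRAY index-for-index: EXCEPT_FOUND[i] counts how often exception i matched a checked URL and page, and EXCEPT_USED[i] counts how often it actually suppressed a result. A minimal sketch of the bookkeeping, with invented sample data:

declare -a EXCEPT_ARRAY=("404,http://example.com/a,*" "301,http://example.com/b,Main_Page")
declare -a EXCEPT_USED=() EXCEPT_FOUND=()
for ((i = 0; i < ${#EXCEPT_ARRAY[@]}; ++i)); do EXCEPT_USED+=(0); EXCEPT_FOUND+=(0); done
let EXCEPT_FOUND[0]+=1 # exception 0's URL and page matched a checked link...
let EXCEPT_USED[0]+=1  # ...and its expected code matched too, so it was used

The audit in wrapupAndExit() then reports any index whose EXCEPT_FOUND or EXCEPT_USED counter is still 0.
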
675   # Pipe 'cat' to 'wc' because passing LINKS_FILE to 'wc' would print the file name after the line count
# Line 688 | Line 729 | valPrint trh ""
729   valPrint t "Legend:"
730   valPrint r "\b1 Legend \b0"
731   valPrint hn "<h3>Legend</h3>"
732 < valPrint t "(For guidance in fixing these links, see $WIKI_MAIN.)"
733 < valPrint r "(For guidance in fixing these links, see {\field{\*\fldinst{HYPERLINK \"$WIKI_MAIN\"}}{\fldrslt here}}.)"
734 < valPrint h "(For guidance in fixing these links, see <a href=\"$WIKI_MAIN\" target=\"_blank\">here</a>.)"
732 > valPrint t "(For guidance in fixing these links, see $WIKI_MAIN. The exceptions list is at $EXCEPT_URL.)"
733 > valPrint r "(For guidance in fixing these links, see {\field{\*\fldinst{HYPERLINK \"$WIKI_MAIN\"}}{\fldrslt here}}. The exceptions list is {\field{\*\fldinst{HYPERLINK \"$EXCEPT_URL\"}}{\fldrslt here}}.)"
734 > valPrint h "(For guidance in fixing these links, see <a href=\"$WIKI_MAIN\" target=\"_blank\">here</a>. The exceptions list is <a href=\"$EXCEPT_URL\" target=\"_blank\">here</a>.)"
735   valPrint trh "OK = URL seems to be working"
736   valPrint trh "NG = URL no longer seems to work"
737   valPrint trh "RD = URL is redirecting to this new URL"
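
The destination letters passed to valPrint in these calls ('c', 't', 'r', 'h', with modifiers such as 's' and 'n') appear to route each message to the console and to the TXT, RTF, and HTML reports. A rough sketch of that dispatch pattern, assuming report paths in LOG_NAME_TXT, LOG_NAME_RTF, and a hypothetical LOG_NAME_HTML; this is an inference, not the script's actual function:

function valPrintSketch()
{
   local DESTS="$1"; shift
   if [[ $DESTS == *c* ]]; then echo "$@"; fi                     # console
   if [[ $DESTS == *t* ]]; then echo "$@" >> "$LOG_NAME_TXT"; fi  # plain-text report
   if [[ $DESTS == *r* ]]; then echo "$@" >> "$LOG_NAME_RTF"; fi  # RTF report
   if [[ $DESTS == *h* ]]; then echo "$@" >> "$LOG_NAME_HTML"; fi # HTML report (name assumed)
}
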
# Line 828 | Line 869 | for LINE in `cat "$LINKS_FILE"`; do
869     # If the URL ends in something like "#section_15", strip everything from the '#' onward
870     CLEAN_URL=${CLEAN_URL%%\#*}
871  
872 <   # 'sed' cannot handle Unicode in my Bash shell, so skip non-ASCII URL and make user check it
872 >   # 'sed' cannot handle Unicode in my Bash shell, so skip non-ASCII URL and make reader check it
873     if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
874        valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I cannot handle non-ASCII characters."
875        let SKIP_NON_ASCII+=1
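
Two bash idioms do the work in this pass: '${CLEAN_URL%%\#*}' cuts the fragment off the URL, and the negated character class '[![:ascii:]]' matches any character outside the ASCII range. A quick illustration with invented URLs:

CLEAN_URL="http://example.com/page#section_15"
CLEAN_URL=${CLEAN_URL%%\#*}           # -> "http://example.com/page"
URL="http://example.com/café"
if [[ $URL == *[![:ascii:]]* ]]; then # 'é' is outside the ASCII range
   echo "skipping non-ASCII URL"
fi
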
# Line 901 | Line 942 | for LINE in `cat "$LINKS_FILE"`; do
942        shopt -u nocasematch
943     fi
944  
945 <   # If this suffix escaped identification as either a file, page or TLD, inform the user
945 >   # If this suffix escaped identification as a file, page, or TLD, inform the reader
946     STR_TYPE=""
947     if [ $IS_FILE -eq -1 ]; then
948        valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because I encountered the unknown URL suffix '$POST_DOT'. Please add this suffix to the appropriate array in this script, HTTP_FILES or HTTP_TLDS_AND_PAGES."
# Line 1064 | Line 1105 | for LINE in `cat "$LINKS_FILE"`; do
1105  
1106     # Check problem links against exceptions list before proceeding
1107     FOUND_EXCEPT=0
1108 <   if [ $STATUS != "OK" ] && [ ! -z $EXCEPT_URL ]; then
1108 >   if [ $STATUS != "OK" ] && [ ! -z "$EXCEPT_URL" ]; then
1109        # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
1110        EXPECT_CODE="$CURL_RESULT"
1111        if [ $STATUS == "EI" ]; then
# Line 1082 | Line 1123 | for LINE in `cat "$LINKS_FILE"`; do
1123           # other HTML-encoded characters are not found in URLs
1124           EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/\&amp;/\&/g')
1125  
1126 <         # Match URL
1126 >         # Check for URL match
1127           EXCEPT_URL="${EXCEPT_LINE#*,}"
1128           EXCEPT_URL="${EXCEPT_URL%,*}"
1129           if [ "$EXCEPT_URL" != "$URL" ]; then
1130              continue
1131           fi
1132  
1133 <         # Match containing page's name
1133 >         # Check for page name match
1134           EXCEPT_PAGE="${EXCEPT_LINE##*,}"
1135           EXCEPT_PAGE="${EXCEPT_PAGE%% *}"
1136 <         if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == $LOCAL_PAGE_PATH ]; then
1137 <            # Match result code
1136 >         if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == "$LOCAL_PAGE_PATH" ]; then
1137 >            let EXCEPT_FOUND[$i]+=1
1138 >            valPrint trs "Found exception '$URL' on page '$LOCAL_PAGE_PATH'."
1139 >
1140 >            # Check for result code match
1141              EXCEPT_CODE=${EXCEPT_LINE%%,*}
1142              if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
1143 +               FOUND_EXCEPT=1
1144 +               let EXCEPT_USED[$i]+=1
1145                 valPrint trs "Skipping URL '$URL' (found on page '$LOCAL_PAGE_PATH') because its expected result, '$EXPECT_CODE', is in the exceptions list."
1146 +
1147                 if [ $STATUS == "EI" ]; then
1148                    let SKIP_EXPECT_EI+=1
1149                 elif [ $STATUS == "IW" ]; then
# Line 1106 | Line 1153 | for LINE in `cat "$LINKS_FILE"`; do
1153                 else
1154                    let SKIP_EXPECT_NG+=1
1155                 fi
1156 <               FOUND_EXCEPT=1
1156 >
1157                 break
1158              fi
1159           fi
# Line 1180 | Line 1227 | for LINE in `cat "$LINKS_FILE"`; do
1227           # Issue query to the API
1228           ARCHIVE_QUERY=$(curl --silent --max-time $TIMEOUT "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")
1229  
1230 <         # Notify user if we hit the rate limit and just keep going
1230 >         # Notify reader if we hit the rate limit and just keep going
1231           if [[ "$ARCHIVE_QUERY" == *"Too Many Requests"* ]]; then
1232              valPrint t "  IA has rate-limited us!"
1233              valPrint r "                IA has rate-limited us!"
1234              valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td>(hit the API rate limit!)</td></tr>"
1235 <         # If a "closest" snapshot was received, inform user
1235 >         # If a "closest" snapshot was received, inform reader
1236           elif [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
1237              # In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
1238              ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')
# Line 1198 | Line 1245 | for LINE in `cat "$LINKS_FILE"`; do
1245              # Remove the port 80 part that IA often adds to the URL, as it's superfluous
1246              SNAPSHOT_URL=$(echo $SNAPSHOT_URL | sed 's/:80//')
1247  
1248 <            # Inform the user of the snapshot URL
1248 >            # Inform the reader of the snapshot URL
1249              valPrint ts "  IA suggests $SNAPSHOT_URL"
1250              valPrint rs "               IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
1251              valPrint hs "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"
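
For context, the availability endpoint returns JSON; when a snapshot passing the requested status codes exists, it appears under an "archived_snapshots"/"closest" key, which is what the '*\"closest\":*' test above looks for. A sample query, with invented URL, timeout, and response values:

curl --silent --max-time 10 "http://archive.org/wayback/available?url=http://example.com&statuscodes=200&statuscodes=203&statuscodes=206"
# Illustrative response:
# {"url": "http://example.com", "archived_snapshots": {"closest": {
#    "available": true, "status": "200", "timestamp": "20200101000000",
#    "url": "http://web.archive.org/web/20200101000000/http://example.com"}}}
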

Diff Legend

- Removed lines
+ Added lines
< Changed lines (old)
> Changed lines (new)