[ViewVC] Diff of: Oni2/Validate External Links/validate_external_links.sh

Comparing Validate External Links/validate_external_links.sh (file contents):
Revision 1142 by iritscen, Fri Sep 4 03:07:08 2020 UTC vs.
Revision 1144 by iritscen, Sun Sep 6 20:51:22 2020 UTC

+# Validates a list of external links in CSV format. The resulting logs are produced in three formats:
+# - TXT (for easy diffing with an earlier log)
+# - RTF (for reading as a local file with clickable links)
-<
+# - HTML (for uploading as a web page).
->
+# - HTML (for reading as a web page)
+# Call script with "--help" argument for documentation. Also see Read Me.rtf for critical notes.
+# Recommended rule:
+### GLOBALS ###
+# Settings -- these will be changed from their defaults by the arguments passed in to the script
-<
+LINKS_URL=""         # use 'curl' to download file with links from this location (can be file://)
-<
+EXCEPT_URL=""        # 'curl' will access this wiki page with a list of exceptions for NG results
-<
+OUTPUT_DIR=""        # place reports and all other output in a folder inside this existing folder
-<
+RECORD_OK_LINKS=0    # record response code to the log even when it's a value in OK_CODES
-<
+SHOW_SLASH=0         # record response code to the log when a slash is added to the end of a URL
-<
+SHOW_HTTPS=0         # record response code to the log when "http" is upgraded to "https"
-<
+SHOW_YT_RD=0         # record response code to the log when a youtu.be URL is expanded to the full URL
-<
+SUGGEST_SNAPSHOTS=0  # query the Internet Archive for a possible snapshot URL for each NG page
-<
+SKIP_ARCHIVE_LINKS=0 # don't check URLs under the archive.org domain
-<
+TAKE_PAGE_SHOT=0     # take a screenshot of each OK page
-<
+TIMEOUT=10           # time to wait for a response when querying a site
-<
+CHROME_PATH=""       # path to a copy of Google Chrome that has the command-line screenshot feature
-<
+URL_START=1          # start at this URL in LINKS_FILE
-<
+URL_LIMIT=0          # if non-zero, stop at this URL in LINKS_FILE
-<
+UPLOAD_INFO=""       # path to a file on your hard drive with the login info needed to upload a report
->
+LINKS_URL=""          # use 'curl' to download file with links from this location (can be file://)
->
+EXCEPT_URL=""         # 'curl' will access this wiki page with a list of exceptions for NG results
->
+OUTPUT_DIR=""         # place reports and all other output in a folder inside this existing folder
->
+RECORD_OK_LINKS=0     # record response code to the log even when it's a value in OK_CODES
->
+SHOW_SLASH=0          # record issue when a slash is added to the end of a URL
->
+SHOW_HTTPS=0          # record issue when "http" is upgraded to "https"
->
+SHOW_YT_RD=0          # record redirection for a youtu.be URL expanding to the full URL
->
+SUGGEST_SNAPSHOTS=0   # query the Internet Archive for a possible snapshot URL for each NG page
->
+CHECK_ARCHIVE_LINKS=0 # check URLs under the archive.org domain
->
+TAKE_PAGE_SHOT=0      # take a screenshot of each OK page
->
+TIMEOUT=10            # time to wait for a response when querying a site
->
+CHROME_PATH=""        # path to a copy of Google Chrome that has the command-line screenshot feature
->
+URL_START=1           # start at this URL in LINKS_FILE
->
+URL_LIMIT=0           # if non-zero, stop at this URL in LINKS_FILE
->
+UPLOAD_INFO=""        # path to a file on your hard drive with the login info needed to upload a report
+# Fixed strings -- see the occurrences of these variables to learn their purpose
+AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 OPR/70.0.3728.154"
+       validate_external_links.sh --help
+       validate_external_links.sh --links URL --output DIR [--exceptions URL]
+          [--record-ok-links] [--show-added-slashes] [--show-https-upgrades]
-<
+          [--show-yt-redirects] [--suggest-snapshots] [--skip-archive-links]
->
+          [--show-yt-redirects] [--suggest-snapshots] [--check-archive-links]
+          [--take-screenshots FILE] [--timeout NUM] [--start-url NUM]
+          [--end-url NUM] [--upload FILE]
+       --show-yt-redirects     Report on redirects that expand a youtu.be URL.
+       --suggest-snapshots     Query the Internet Archive for a possible
+                               snapshot URL for each "NG" page.
-<
+       --skip-archive-links    Don't check links that are already pointing to
-<
+                               a page on the Internet Archive.
->
+       --check-archive-links   Check links that are already pointing to a page
->
+                               on the Internet Archive. In theory these links
->
+                               should be totally stable and not need validation.
+       --take-screenshots FILE Call the Google Chrome binary at this path to
+                               take screenshots of each "OK" page.
+       --timeout NUM           Wait this many seconds for a site to respond. The
+      --show-https-upgrades ) SHOW_HTTPS=1;                       shift;;
+      --show-yt-redirects )   SHOW_YT_RD=1;                       shift;;
+      --suggest-snapshots )   SUGGEST_SNAPSHOTS=1;                shift;;
-<
+      --skip-archive-links )  SKIP_ARCHIVE_LINKS=1;               shift;;
->
+      --check-archive-links ) CHECK_ARCHIVE_LINKS=1;              shift;;
+      --take-screenshots )    TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
+      --timeout )             TIMEOUT=$2;                         shift 2;;
+      --start-url )           URL_START=$2;                       shift 2;;
+OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
+SHOT_PATH="$OUTPUT_PATH/Screenshots"
+LOG_NAME="ValExtLinks report"
-<
+LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt"
-<
+LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf"
-<
+LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm"
->
+LOG_NAME_TXT="$LOG_NAME.txt"
->
+LOG_NAME_RTF="$LOG_NAME.rtf"
->
+LOG_NAME_HTM="$LOG_NAME.htm"
->
+LOG_PATH="$OUTPUT_PATH/$LOG_NAME"
->
+LOG_PATH_TXT="$LOG_PATH.txt"
->
+LOG_PATH_RTF="$LOG_PATH.rtf"
->
+LOG_PATH_HTM="$LOG_PATH.htm"
+mkdir "$OUTPUT_PATH"
+if [ $TAKE_PAGE_SHOT -eq 1 ]; then
+   mkdir "$SHOT_PATH"
+   fi
+   if [[ "$1" == *t* ]]; then
+      if [[ "$1" == *n* ]]; then
-<
+         echo -n "$2" >> "$LOG_TXT"
->
+         echo -n "$2" >> "$LOG_PATH_TXT"
+      elif [[ "$1" == *s* ]]; then
-<
+         echo -e "$2\n" >> "$LOG_TXT"
->
+         echo -e "$2\n" >> "$LOG_PATH_TXT"
+      else
-<
+         echo "$2" >> "$LOG_TXT"
->
+         echo "$2" >> "$LOG_PATH_TXT"
+      fi
+   fi
+   if [[ "$1" == *r* ]]; then
+      if [[ "$1" == *n* ]]; then
-<
+         echo "$2" >> "$LOG_RTF"
->
+         echo "$2" >> "$LOG_PATH_RTF"
+      elif [[ "$1" == *s* ]]; then
-<
+         echo "$2\line\line" >> "$LOG_RTF"
->
+         echo "$2\line\line" >> "$LOG_PATH_RTF"
+      else
-<
+         echo "$2\line" >> "$LOG_RTF"
->
+         echo "$2\line" >> "$LOG_PATH_RTF"
+      fi
+   fi
+   if [[ "$1" == *h* ]]; then
+      if [[ "$1" == *s* ]]; then
-<
+         echo "$2<tr><td>&nbsp;</td></tr>" >> "$LOG_HTM"
->
+         echo "$2<tr><td>&nbsp;</td></tr>" >> "$LOG_PATH_HTM"
+      elif [[ "$1" == *n* ]]; then
-<
+         echo "$2" >> "$LOG_HTM"
->
+         echo "$2" >> "$LOG_PATH_HTM"
+      else
-<
+         echo "$2<br />" >> "$LOG_HTM"
->
+         echo "$2<br />" >> "$LOG_PATH_HTM"
+      fi
+   fi
+   fi
-<
+# Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
->
+# Upload the reports using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
+# reports being saved to disk have already been closed.
+function uploadReport()
-<
+   valPrint c "Uploading HTML report..."
->
+   valPrint c "Uploading reports..."
+   SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
+   SFTP_USER_NAME_MARKER="user:"
+   SFTP_PATH=$(grep $SFTP_PATH_MARKER $UPLOAD_INFO)
+   SFTP_PATH=${SFTP_PATH#$SFTP_PATH_MARKER}
-<
+   expect "$SCRIPT_PATH" "$LOG_HTM" $SFTP_USER_NAME $SFTP_PASSWORD $SFTP_PORT $SFTP_PATH "$LOG_NAME.htm"
->
+   for SUFFIX in htm rtf txt; do
->
+      expect "$SCRIPT_PATH" "$LOG_PATH.$SUFFIX" $SFTP_USER_NAME $SFTP_PASSWORD $SFTP_PORT $SFTP_PATH "$LOG_NAME.$SUFFIX"
-<
+   valPrint c "Report was uploaded, unless an error message appears above."
->
+      if [ "$?" -ne 0 ]; then
->
+         valPrint c "Error $? occurred when attempting to upload $LOG_NAME.$SUFFIX!"
->
+      else
->
+         valPrint c "Report in `echo $SUFFIX | tr [:lower:] [:upper:]` format was uploaded."
->
+      fi
->
+   done
+# Prints session summary when script is done
+   LINK_PROBLEMS_IW=$((IW_LINKS-SKIP_EXPECT_IW))
+   LINK_PROBLEMS_NET=$((LINK_PROBLEMS_NG+LINK_PROBLEMS_RD+LINK_PROBLEMS_EI+LINK_PROBLEMS_IW))
-+
+   # Print something in the Links section if no link issues were printed
-+
+   if [ $LINK_PROBLEMS_NET -eq 0 ]; then
-+
+      valPrint h "<i>No link problems to report! See the <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for a list of links with issues that were not reported.</i>"
-+
+   fi
-+
+   if [ $LINK_PROBLEMS_TOTAL -eq 0 ]; then
-+
+      valPrint t "No link problems to report!"
-+
+      valPrint r "\i1 No link problems to report! \i0"
-+
+   fi
-+
+   ## SUMMARY OUTPUT ##
+   valPrint ct "Summary ($ELAPSED):"
+   valPrint r "\b1 Summary \b0 ($ELAPSED)"
+   if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr "  (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h "&nbsp;&nbsp;(counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi
+   # Print errored link totals
-<
+   if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"; fi
->
+   if [ $LINK_ERRORS -gt 0 ]; then
->
+      valPrint c "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"
->
+      valPrint h "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
->
+      valPrint rt "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS):"
->
+   fi
+   if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
+   if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
+   if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
+   if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
+   # Print excepted link totals
-<
+   if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi
->
+   if [ $LINKS_EXCEPTED -gt 0 ]; then
->
+      valPrint c "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"
->
+      valPrint h "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
->
+      valPrint rt "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted:"
->
+   fi
+   if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
+   if [ $SKIP_EXPECT_RD -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_RD/$RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
+   if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
+if [ $SHOW_YT_RD -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
+valPrint ctrhn "Check archive.org links: "
-<
+if [ $SKIP_ARCHIVE_LINKS -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
->
+if [ $CHECK_ARCHIVE_LINKS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
+valPrint tr "A summary of my findings will be found at the bottom of the report."
+valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
+      continue
+   fi
-<
+   # If we're skipping Archive.org links, check if this is one
-<
+   if [ $SKIP_ARCHIVE_LINKS -eq 1 ] && [[ $URL == *web.archive.org* ]]; then
-<
+      valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have been asked not to check Wayback Machine links."
->
+   # If we're skipping Archive.org links, see if this is one
->
+   if [ $CHECK_ARCHIVE_LINKS -eq 0 ] && [[ $URL == *web.archive.org* ]]; then
->
+      valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to check Wayback Machine links."
+      let SKIP_ARCHIVE_ORG+=1
+      continue
+   fi
+      let EI_LINKS+=1
+   fi
-<
+   # If it's not, check if this is a link to a domain that we have an interwiki prefix for
-<
+   if [ $STATUS == "??" ]; then
->
+   # If it's not, check if this is a link to a domain that we have an interwiki prefix for (also make
->
+   # sure that it's not an archive.org link to a page from an interwiki domain)
->
+   if [ $STATUS == "??" ] && [[ $URL != *web.archive.org* ]]; then
+      for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
+         if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]] && [[ $URL != *${INTERWIKI_DOMAINS[$i]}/w/* ]]; then
+            STATUS="IW"

Diff Legend

-–
+Removed lines
-+
+Added lines
-<
+Changed lines (old)
->
+Changed lines (new)

Comparing Validate External Links/validate_external_links.sh (file contents): Revision 1142 by iritscen, Fri Sep 4 03:07:08 2020 UTC vs. Revision 1144 by iritscen, Sun Sep 6 20:51:22 2020 UTC

Diff Legend

Comparing Validate External Links/validate_external_links.sh (file contents):
Revision 1142 by iritscen, Fri Sep 4 03:07:08 2020 UTC vs.
Revision 1144 by iritscen, Sun Sep 6 20:51:22 2020 UTC