root/Oni2/Validate External Links/validate_external_links.sh

Comparing Validate External Links/validate_external_links.sh (file contents):
Revision 1137 by iritscen, Tue Jul 21 14:16:54 2020 UTC vs.
Revision 1141 by iritscen, Fri Sep 4 02:54:30 2020 UTC

# Line 1 | Line 1
1   #!/bin/bash
2  
3   # Validate External Links by Iritscen
4 < # Provided with a list of external links in an expected CSV format, this script validates them. The
5 < # resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF (for
6 < # reading as a local file with clickable links), and HTML (for uploading as a web page). Call script
7 < # with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
4 > #
5 > # Validates a list of external links in CSV format. The resulting logs are produced in three formats:
6 > # - TXT (for easy diffing with an earlier log)
7 > # - RTF (for reading as a local file with clickable links)
8 > # - HTML (for uploading as a web page)
9 > # Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
10 > #
11   # Recommended rule:
12   # |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|
13 + #
14 + # Table of contents (sections of script in order of appearance, not execution):
15 + # • Globals
16 + # • Help Output
17 + # • Setup
18 + # • Utility Functions
19 + # • Summary Output
20 + # • Initialization
21 + #   • Data Sourcing
22 + #   • Config Output
23 + #   • Legend Output
24 + # • Main Loop
25  
26   # Set separator token to newline
27   IFS="
# Line 24 | Line 39 | SHOW_YT_RD=0         # record response c
39   SUGGEST_SNAPSHOTS=0  # query the Internet Archive for a possible snapshot URL for each NG page
40   SKIP_ARCHIVE_LINKS=0 # don't check URLs under the archive.org domain
41   TAKE_PAGE_SHOT=0     # take a screenshot of each OK page
42 + TIMEOUT=10           # time to wait for a response when querying a site
43   CHROME_PATH=""       # path to a copy of Google Chrome that has the command-line screenshot feature
44   URL_START=1          # start at this URL in LINKS_FILE (1 by default)
45   URL_LIMIT=0          # if non-zero, stop at this URL in LINKS_FILE
# Line 35 | Line 51 | ARCHIVE_API="http://archive.org/wayback/
51   ARCHIVE_GENERIC="https://web.archive.org/web/*"
52   ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
53   CHROME_SCREENSHOT="screenshot.png"
38 CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt"
54   EXCEPT_FILE_NAME="exceptions.txt"
55   EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
56 < HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt"
57 < MY_WIKI_PAGE="https://wiki.oni2.net/User:Iritscen"
56 > WIKI_CURL="https://wiki.oni2.net/Validate_External_Links/Curl_codes"
57 > WIKI_HTTP="https://wiki.oni2.net/Validate_External_Links/HTTP_codes"
58 > WIKI_MAIN="https://wiki.oni2.net/Validate_External_Links"
59 > WIKI_ME="http://iritscen.oni2.net"
60   THIS_DIR=$(cd $(dirname $0); pwd)
61   WORKING_DIR=$(pwd)
62   WIKI_PATH="wiki.oni2.net"
# Line 100 | Line 117 | START_RUN=0
117   END_RUN=0
118  
119  
120 < ### HELP ###
120 > ### HELP OUTPUT ###
121   # A pseudo-man page. Here is the 80-character rule for the page text:
122   # 234567890123456789012345678901234567890123456789012345678901234567890123456789
123   function printHelp()
# Line 115 | Line 132 | SYNOPSIS
132         validate_external_links.sh --links URL --output DIR [--exceptions URL]
133            [--record-ok-links] [--show-added-slashes] [--show-https-upgrades]
134            [--show-yt-redirects] [--suggest-snapshots] [--skip-archive-links]
135 <          [--take-screenshots FILE] [--start-url NUM] [--end-url NUM]
136 <          [--upload FILE]
135 >          [--take-screenshots FILE] [--timeout NUM] [--start-url NUM]
136 >          [--end-url NUM] [--upload FILE]
137  
138   DESCRIPTION
139         This script parses a list of external links found in the OniGalore wiki
# Line 162 | Line 179 | OPTIONS
179                                 a page on the Internet Archive.
180         --take-screenshots FILE Call the Google Chrome binary at this path to
181                                 take screenshots of each "OK" page.
182 +       --timeout NUM           Wait this many seconds for a site to respond. The
183 +                               default is 10.
184         --start-url NUM         Start at this link in the links CSV file.
185         --end-url NUM           Stop at this link in the links CSV file.
186         --upload FILE           Upload report using the credentials and path
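
For instance, a run exercising the new --timeout option might be invoked like this (the URL and output path are placeholders):

      ./validate_external_links.sh --links http://example.com/external_links.csv \
         --output ~/val_reports --timeout 20 --suggest-snapshots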
# Line 196 | Line 215 | while (( "$#" )); do
215        --suggest-snapshots )   SUGGEST_SNAPSHOTS=1;                shift;;
216        --skip-archive-links )  SKIP_ARCHIVE_LINKS=1;               shift;;
217        --take-screenshots )    TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
218 +      --timeout )             TIMEOUT=$2;                         shift 2;;
219        --start-url )           URL_START=$2;                       shift 2;;
220        --end-url )             URL_LIMIT=$2;                       shift 2;;
221        --upload )              UPLOAD_INFO=$2;                     shift 2;;
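
The parser stores the --timeout value verbatim; a defensive variant (hypothetical, not part of the script) could confirm it is a positive integer before accepting it:

      --timeout )  if [[ "$2" =~ ^[1-9][0-9]*$ ]]; then
                      TIMEOUT=$2
                   else
                      echo "--timeout expects a whole number of seconds."; exit 1
                   fi
                   shift 2;;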
# Line 266 | Line 286 | function printTXTheader()
286     valPrint t "Validate External Links report"
287     valPrint t "generated $NICE_TIME"
288     valPrint t "from data of $LINKS_DATE"
289 <   valPrint t "script by Iritscen (contact: $MY_WIKI_PAGE)"
289 >   valPrint t "script by Iritscen (contact: $WIKI_ME)"
290     valPrint t ""
291   }
292  
# Line 283 | Line 303 | function printRTFheader()
303   \f0\fs28 \cf0 \b1 Validate External Links report \b0\\
304   generated $NICE_TIME\\
305   from data of $LINKS_DATE\\
306 < script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$MY_WIKI_PAGE\"}}{\fldrslt contact}})
306 > script by Iritscen ({\field{\*\fldinst{HYPERLINK \"$WIKI_ME\"}}{\fldrslt contact}})
307   \\
308   \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\li360\pardirnatural\partightenfactor0
309   \cf0 "
# Line 306 | Line 326 | function printHTMheader()
326   <h2>Validate External Links report</h2>
327   <h3>generated $NICE_TIME<br />
328   from data of $LINKS_DATE<br />
329 < script by Iritscen (<a href=\"$MY_WIKI_PAGE\" target=\"_blank\">contact</a>)</h3>"
329 > script by Iritscen (<a href=\"$WIKI_ME\" target=\"_blank\">contact</a>)</h3>"
330   }
331  
332   # Closes the HTML markup of the HTML log file
# Line 318 | Line 338 | function printHTMfooter()
338  
339   # The central logging function. The first parameter is a string composed of one or more characters that
340   # indicate which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
341 < # 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 's' means "Print an extra newline at the end." 'w' means "Don't pass console output through 'fmt'" ("fmt" fits the output
341 > # 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 's' means "Print an
342 > # extra newline at the end." 'w' means "Don't pass console output through 'fmt'" ("fmt" fits the output
343   # to an 80-column CLI but can break special formatting and the 'n' option).
344   function valPrint()
345   {
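
As the comment above describes, the first argument is a string of destination and formatting flags; typical calls, taken from elsewhere in this revision, look like:

      valPrint ctrh "Found $LINK_COUNT links to process."   # console plus the TXT, RTF and HTML logs
      valPrint hn "<h3>Config</h3>"                         # HTML log only, no trailing newline
      valPrint cwtrhn "Downloading list of external links from $LINKS_URL..."  # 'w' bypasses 'fmt'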
# Line 464 | Line 485 | function wrapupAndExit()
485     TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
486     LINKS_CHECKED=$((LINKS_PROCESSED-LINK_ERRORS))
487  
488 <   # Print summary header
488 >   ## SUMMARY OUTPUT ##
489     valPrint ct "Summary ($ELAPSED):"
490     valPrint r "\b1 Summary \b0 ($ELAPSED)"
491     valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
# Line 525 | Line 546 | printTXTheader
546   printRTFheader
547   printHTMheader
548  
549 + ## DATA SOURCING ##
550 + valPrint t "Startup:"
551 + valPrint r "\b1 Startup \b0"
552 + valPrint hn "<h3>Startup</h3>"
553 +
554   # Attempt to download file at LINKS_URL, then check that it succeeded
555 < valPrint t "Config:"
530 < valPrint r "\b1 Config \b0"
531 < valPrint hn "<h3>Config</h3>"
532 < valPrint cwtrh "Downloading list of external links from $LINKS_URL."
555 > valPrint cwtrhn "Downloading list of external links from $LINKS_URL..."
556   LINKS_FILE_NAME=$(echo "$LINKS_URL" | sed 's/.*\///')
557   LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
558   curl --silent -o "$LINKS_FILE" $LINKS_URL
559   if [ ! -f "$LINKS_FILE" ]; then
560 <   echo "The download of $LINKS_URL appears to have failed. Aborting."
560 >   echo -e "\nThe download of $LINKS_URL appears to have failed. Aborting."
561     wrapupAndExit
562 + else
563 +   valPrint ctrh " success."
564   fi
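
Because 'curl --silent -o' can leave an empty file behind on some failures, a stricter test (a sketch, not what the script does) could also check curl's exit status and the file's size:

      if ! curl --silent -o "$LINKS_FILE" "$LINKS_URL" || [ ! -s "$LINKS_FILE" ]; then
         echo -e "\nThe download of $LINKS_URL appears to have failed. Aborting."
         wrapupAndExit
      fi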
565  
566   # Attempt to download file at EXCEPT_URL, then check that it succeeded
567   if [ ! -z $EXCEPT_URL ]; then
568 <   valPrint cwtrh "Downloading list of reporting exceptions from $EXCEPT_URL."
568 >   valPrint cwtrhn "Downloading list of reporting exceptions from $EXCEPT_URL..."
569     EXCEPT_DATA=$(curl --silent $EXCEPT_URL)
570     if [ -z "$EXCEPT_DATA" ]; then
571 <      echo "The download of the exceptions data from '$EXCEPT_URL' appears to have failed. Aborting."
571 >      echo -e "\nThe download of the exceptions data from '$EXCEPT_URL' appears to have failed. Aborting."
572        wrapupAndExit
573 +   else
574 +      valPrint ctrh " success."
575     fi
576     EXCEPT_DATA=${EXCEPT_DATA%END LIST*}
577     EXCEPT_DATA=${EXCEPT_DATA#*BEGIN LIST}
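
Those two expansions trim away everything outside the BEGIN LIST/END LIST markers; the same pattern in miniature:

      DATA="header BEGIN LIST one two END LIST footer"
      DATA=${DATA%END LIST*}    # shortest match cut from the end   -> "header BEGIN LIST one two "
      DATA=${DATA#*BEGIN LIST}  # shortest match cut from the start -> " one two "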
# Line 563 | Line 590 | LINK_COUNT_STRING=$(cat "$LINKS_FILE" |
590   # Number of URLs is number of lines minus one (first line is column header row for the CSV)
591   LINK_COUNT=$(echo "${LINK_COUNT_STRING}" | tr -d '[:space:]')
592   let LINK_COUNT-=1
593 + valPrint ctrh "Found $LINK_COUNT links to process."
594 + valPrint trh ""
595  
596 < # Calculate number of URLs to consider
596 > ## CONFIG OUTPUT ##
597 > valPrint t "Config:"
598 > valPrint r "\b1 Config \b0"
599 > valPrint hn "<h3>Config</h3>"
600 >
601 > valPrint ctrhn "Links to consider: "
602   if [ $URL_LIMIT -ne 0 ] && [ $URL_LIMIT -lt $LINK_COUNT ]; then
603 <   valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((URL_LIMIT-URL_START+1)) of them, from link $URL_START to $URL_LIMIT."
603 >   valPrint ctrh "$((URL_LIMIT-URL_START+1)), from link $URL_START to $URL_LIMIT"
604   elif [ $URL_START -ne 1 ]; then
605 <   valPrint ctrh "Found $LINK_COUNT links to process. I will be considering $((LINK_COUNT-URL_START+1)) of them, from link $URL_START to $LINK_COUNT."
605 >   valPrint ctrh "$((LINK_COUNT-URL_START+1)), from link $URL_START to $LINK_COUNT"
606   else
607 <   valPrint ctrh "Found $LINK_COUNT links to process. I will be considering all of them."
607 >   valPrint ctrh "$LINK_COUNT"
608   fi
609  
610 < # Print settings to console and log
611 < declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not report links that are in the exceptions list." "I will ignore URLs that simply have ending slashes added onto them." "I will ignore URLs that only upgrade from HTTP to HTTPS." "I will ignore youtu.be links that are merely being expanded." "I will not check the validity of Internet Archive snapshot URLs.")
612 < if [ $TAKE_PAGE_SHOT -eq 0 ]; then SETTINGS_MSG[10]="but will not"; fi
613 < if [ $RECORD_OK_LINKS -eq 0 ]; then SETTINGS_MSG[22]="not"; fi
614 < if [ $SUGGEST_SNAPSHOTS -eq 0 ]; then SETTINGS_MSG[26]="will not"; fi
615 < if [ -z $EXCEPT_URL ] || [ $RECORD_OK_LINKS -eq 1 ]; then SETTINGS_MSG[40]=""; fi
616 < if [ $SHOW_SLASH -eq 1 ]; then SETTINGS_MSG[41]=""; fi
617 < if [ $SHOW_HTTPS -eq 1 ]; then SETTINGS_MSG[42]=""; fi
618 < if [ $SHOW_YT_RD -eq 1 ]; then SETTINGS_MSG[43]=""; fi
619 < if [ $SKIP_ARCHIVE_LINKS -eq 0 ]; then SETTINGS_MSG[44]=""; fi
620 < SETTINGS_STR=${SETTINGS_MSG[@]}
621 < valPrint ctrh "$SETTINGS_STR"
610 > valPrint ctrh "Site query timeout: $TIMEOUT seconds"
611 >
612 > valPrint ctrhn "Show OK links: "
613 > if [ $RECORD_OK_LINKS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
614 >
615 > valPrint ctrhn "Take screenshots: "
616 > if [ $TAKE_PAGE_SHOT -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
617 >
618 > valPrint ctrhn "Suggest Archive.org snapshots: "
619 > if [ $SUGGEST_SNAPSHOTS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
620 >
621 > valPrint ctrhn "Ignore slash-adding redirects: "
622 > if [ $SHOW_SLASH -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
623 >
624 > valPrint ctrhn "Ignore HTTPS-upgrading redirects: "
625 > if [ $SHOW_HTTPS -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
626 >
627 > valPrint ctrhn "Ignore youtu.be redirects: "
628 > if [ $SHOW_YT_RD -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
629 >
630 > valPrint ctrhn "Check archive.org links: "
631 > if [ $SKIP_ARCHIVE_LINKS -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
632 >
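Each setting above follows the same prompt-then-answer pattern, so it could be factored into a small helper (a hypothetical printSetting, not present in the script):

      function printSetting()
      {
         # $1 = label; $2 = 0/1 flag; $3 = answer when flag is 1; $4 = answer when flag is 0
         valPrint ctrhn "$1: "
         if [ $2 -eq 1 ]; then valPrint ctrh "$3"; else valPrint ctrh "$4"; fi
      }
      printSetting "Show OK links" $RECORD_OK_LINKS "Yes" "No"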
633   valPrint tr "A summary of my findings will be found at the bottom of the report."
634   valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
635   valPrint trh ""
636  
637 < # Print legend to logs
637 > ## LEGEND OUTPUT ##
638   valPrint t "Legend:"
639   valPrint r "\b1 Legend \b0"
640   valPrint hn "<h3>Legend</h3>"
641 < valPrint trh "OK = URL seems to be working."
642 < valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it, because false negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to the script's author (see top of report). An NG link should be followed by a link to the Internet Archive's Wayback Machine which may help you repair the link, unless the Archive does not have any snapshots of the site. If the link cannot be repaired, you can delete it from the wiki page, or, if this would disrupt the surrounding material on the page, disable the link by wrapping the URL in nowiki tags."
643 < valPrint trh "RD = The server responding to this URL is saying that the page moved and you should instead use the supplied new URL. Some RD links represent minor adjustments in the organization of a web site, and some are soft 404s (the file/page has been removed and you are being redirected to something like the main page of the web site). You will have to look at the new URL yourself to determine if it represents an OK link and the link on the wiki should be updated to this one, or if the desired file/page is actually gone and we need to replace the wiki link with an Internet Archive snapshot link -- or disable the URL if it has not been archived."
644 < valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup."
645 < valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup."
646 < valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $HTTP_CODES)."
647 < valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$HTTP_CODES\"}}{\fldrslt here}} for code reference)."
648 < valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$HTTP_CODES\" target=\"_blank\">here</a> for code reference)."
649 < valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $CURL_CODES)."
650 < valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$CURL_CODES\"}}{\fldrslt here}} for code reference)."
651 < valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$CURL_CODES\" target=\"_blank\">here</a> for code reference)."
652 < valPrint trh "IA suggests = Last available snapshot suggested by the Internet Archive."
653 < valPrint trh "Try browsing = The Archive occasionally fails to return a snapshot URL even when one exists, so you will need to check for a snapshot manually using this link to the Wayback Machine before concluding that a site has not been archived."
641 > valPrint t "(For guidance in fixing these links, see $WIKI_MAIN.)"
642 > valPrint r "(For guidance in fixing these links, see {\field{\*\fldinst{HYPERLINK \"$WIKI_MAIN\"}}{\fldrslt here}}.)"
643 > valPrint h "(For guidance in fixing these links, see <a href=\"$WIKI_MAIN\" target=\"_blank\">here</a>.)"
644 > valPrint trh "OK = URL seems to be working"
645 > valPrint trh "NG = URL no longer seems to work"
646 > valPrint trh "RD = URL is redirecting to this new URL"
647 > valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup"
648 > valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup"
649 > valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $WIKI_HTTP)"
650 > valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$WIKI_HTTP\"}}{\fldrslt here}} for code reference)"
651 > valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$WIKI_HTTP\" target=\"_blank\">here</a> for code reference)"
652 > valPrint t "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see here for code reference: $WIKI_CURL)"
653 > valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$WIKI_CURL\"}}{\fldrslt here}} for code reference)"
654 > valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$WIKI_CURL\" target=\"_blank\">here</a> for code reference)"
655 > valPrint trh "IA suggests = Last available snapshot returned by the Internet Archive"
656 > valPrint trh "Try browsing = The Archive failed to return a snapshot URL, so check for a snapshot manually using this link"
657   valPrint trh ""
658  
659  
# Line 813 | Line 861 | for LINE in `cat "$LINKS_FILE"`; do
861  
862     # Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
863     # issue with sites that require HTTPS
864 <   CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{http_code}\n' $URL)
864 >   CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time $TIMEOUT --write-out '%{http_code}\n' $URL)
865     CURL_ERR=$(echo $?)
866     CURL_RESULT=$CURL_CODE
867  
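
When no HTTP response arrives at all, '%{http_code}' is written out as 000 and curl's exit status identifies the failure; in isolation the pattern behaves like this (the host is a placeholder):

      CODE=$(curl -o /dev/null --silent --head --max-time $TIMEOUT --write-out '%{http_code}\n' http://example.invalid)
      ERR=$?   # CODE is "000"; ERR is e.g. 6 ("could not resolve host"), the pairing the legend writes as (000-xx)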
# Line 863 | Line 911 | for LINE in `cat "$LINKS_FILE"`; do
911        for CODE in "${RD_CODES[@]}"; do
912           if [[ $CODE == $CURL_CODE ]]; then
913              # Get URL header again in order to retrieve the URL we are being redirected to
914 <            NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{redirect_url}\n' $URL)
914 >            NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time $TIMEOUT --write-out '%{redirect_url}\n' $URL)
915  
916              # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
917              # those changes out if the user didn't ask for them
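
One way to perform that normalization (a sketch; the script's own code for this step falls outside the displayed hunk):

      OLD_CMP=${URL/#https:/http:}      # rewrite a leading "https:" to "http:"
      NEW_CMP=${NEW_URL/#https:/http:}
      # equal after normalization means the redirect is only an HTTP-to-HTTPS upgrade
      if [ "$OLD_CMP" == "$NEW_CMP" ]; then let SKIP_HTTPS_UP+=1; fi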
# Line 1031 | Line 1079 | for LINE in `cat "$LINKS_FILE"`; do
1079  
1080        # Query Internet Archive for latest "OK" snapshot for "NG" page
1081        if [ $STATUS == "NG" ] && [ $SUGGEST_SNAPSHOTS -eq 1 ]; then
1082 <         ARCHIVE_QUERY=$(curl --silent --max-time 10 "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")
1082 >         ARCHIVE_QUERY=$(curl --silent --max-time $TIMEOUT "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")
1083  
1084           # If a "closest" snapshot was received...
1085           if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
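
The string match above works because a successful availability query returns JSON containing a "closest" snapshot object, roughly of this shape (abbreviated; field names as returned by the public Wayback availability API):

      # {"archived_snapshots": {"closest": {"available": true, "status": "200",
      #    "url": "http://web.archive.org/web/<timestamp>/<original URL>"}}}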

Diff Legend

  (no marker) Removed lines
+ Added lines
< Changed lines (old)
> Changed lines (new)