root/Oni2/Validate External Links/validate_external_links.sh

Comparing Validate External Links/validate_external_links.sh (file contents):
Revision 1069 by iritscen, Wed Aug 2 04:26:48 2017 UTC vs.
Revision 1070 by iritscen, Tue Oct 3 03:01:32 2017 UTC

# Line 17 | Line 17 | IFS="
17   LINKS_URL=""        # use 'curl' to download file with links from this location (can be file://)
18   EXCEPT_URL=""       # ditto above for file with exceptions to NG results
19   OUTPUT_DIR=""       # place reports and all other output in a folder inside this existing folder
20 < RECORD_OK_LINKS=0   # record response code to the log whether it's a value in OK_CODES or NG_CODES
20 > RECORD_OK_LINKS=0   # record response code to the log even when it's a value in OK_CODES
21   SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
22   TAKE_PAGE_SHOT=0    # take a screenshot of each OK page
23 + CHROME_PATH=""      # path to a copy of Google Chrome that has the command-line screenshot feature
24   URL_START=1         # start at this URL in LINKS_FILE (1 by default)
25   URL_LIMIT=0         # if non-zero, stop at this URL in LINKS_FILE
26   UPLOAD_INFO=""      # path to a file on your hard drive with the login info needed to upload a report
27  
28   # Fixed strings -- see the occurrences of these variables to learn their purpose
29 < AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:54.0) Gecko/20100101 Firefox/54.0"
29 > AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0"
30   ARCHIVE_API="http://archive.org/wayback/available"
31   ARCHIVE_GENERIC="https://web.archive.org/web/*"
32   ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
32 CHROME="/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary"
33   CHROME_SCREENSHOT="screenshot.png"
34   CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt"
35   EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
# Line 44 | Line 44 | declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7
44   declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")
45  
46   # These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
47 < # This determines whether the script tries to take a screenshot of the page or just gets its HTTP code.
47 > # This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
48   declare -a HTTP_FILES=(txt zip wmv jpg png m4a bsl rar oni mp3 mov ONWC vbs TRMA mp4 doc avi log gif pdf dmg exe cpp tga 7z wav east BINA xml dll dae xaf fbx 3ds blend flv csv)
49   declare -a HTTP_TLDS_AND_PAGES=(com net org uk ru de it htm html php pl asp aspx shtml pgi cgi php3 x jsp phtml cfm css action stm js)
50  
# Line 59 | Line 59 | declare -a NG_CODES=(000 403 404 410 500
59   # transcluded text, and if the transclusion fails, then the braces show up in the URL
60   ILLEGAL_CHARS="{ }"
61  
62 + # The shortest URL possible, used for sanity-checking some URLs: http://a.co
63 + MIN_URL_LENGTH=11
64 +
65   # These are parallel arrays giving the prefixes that can be used in place of normal external links to
66   # some wikis and other sites
67 < declare -a INTERWIKI_PREFIXES=(metawikipedia wikipedia wikiquote wiktionary)
68 < declare -a INTERWIKI_DOMAINS=(meta.wikipedia.org wikipedia.org wikiquote.org wiktionary.org)
67 > declare -a INTERWIKI_PREFIXES=(commons metawikimedia mw wikibooks wikidata wikimedia wikinews wikiquote wikisource wikispecies wikiversity wikivoyage wikt wp)
68 > declare -a INTERWIKI_DOMAINS=(commons.wikimedia.org meta.wikimedia.org mediawiki.org wikibooks.org wikidata.org wikimediafoundation.org wikinews.org wikiquote.org wikisource.org species.wikimedia.org wikiversity.org wikivoyage.org wiktionary.org wikipedia.org)
69  
70   # Variables for keeping track of main loop progress and findings
71   LINK_NUM=0
72 + EI_LINKS=0
73 + IW_LINKS=0
74   OK_LINKS=0
75   RD_LINKS=0
71 IW_LINKS=0
76   NG_LINKS=0
77   SKIP_UNK_NS=0
78   SKIP_JS_PAGE=0
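
The INTERWIKI_PREFIXES and INTERWIKI_DOMAINS declarations above are parallel arrays: the prefix at index i belongs to the domain at index i, which is what lets the matching loop further down in this diff turn a matched domain straight into wiki markup. A minimal standalone sketch of that correspondence, with shortened arrays and a made-up URL:

   #!/bin/bash
   # Parallel arrays: PREFIXES[i] is the interwiki prefix for DOMAINS[i]
   declare -a PREFIXES=(commons wikiquote wp)
   declare -a DOMAINS=(commons.wikimedia.org wikiquote.org wikipedia.org)

   URL="https://en.wikipedia.org/wiki/Oni_(video_game)"   # hypothetical external link

   for ((i = 0; i < ${#DOMAINS[@]}; ++i)); do
      if [[ $URL == *${DOMAINS[$i]}* ]]; then
         echo "Matched ${DOMAINS[$i]}; suggest prefix \"${PREFIXES[$i]}:\""
         break
      fi
   done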
# Line 76 | Line 80 | SKIP_BAD_URL=0
80   SKIP_NON_ASCII=0
81   SKIP_UNK_SUFFIX=0
82   SKIP_UNK_CODE=0
83 < SKIP_EXCEPT=0
83 > SKIP_EXPECT_NG=0
84 > SKIP_EXPECT_EI=0
85 > SKIP_EXPECT_IW=0
86   FILE_LINKS=0
87   PAGE_LINKS=0
88   SKIPPED_HEADER_ROW=0
# Line 95 | Line 101 | NAME
101  
102   SYNOPSIS
103         validate_external_links.sh --help
104 <       validate_external_links.sh --links URL --output PATH [--exceptions FILE]
105 <          [--record-ok-links] [--suggest-snapshots] [--take-screenshots]
106 <          [--start-url NUM] [--end-url NUM] [--upload PATH]
104 >       validate_external_links.sh --links URL --output DIR [--exceptions URL]
105 >          [--record-ok-links] [--suggest-snapshots] [--take-screenshots DIR]
106 >          [--start-url NUM] [--end-url NUM] [--upload FILE]
107  
108   DESCRIPTION
109         This script parses a list of external links found in the OniGalore wiki
110         (which is dumped by the Oni2.net domain periodically in a particular
111         format), validates them using the Unix tool 'curl', and produces a report
112 <       of which links were OK (responded positively to an HTTP query), which
113 <       were RD (responded with a 3xx redirect code), which could be IW (inter-
114 <       wiki) links, and which were NG (no good; a negative response to the
112 >       of which links were "OK" (responded positively to an HTTP query), which
113 >       were "RD" (responded with a 3xx redirect code), which could be "IW"
114 >       (interwiki) links, which are "EI" (external internal) links and could be
115 >       intrawiki links, and which were "NG" (no good; a negative response to the
116         query). This report can then be automatically uploaded to the location of
117         your choice. The script can also suggest Internet Archive snapshots for
118 <       NG links, and take screenshots of OK links for visual verification by the
119 <       reader that the page in question is the one intended to be displayed.
118 >       "NG" links, and take screenshots of "OK" links for visual verification by
119 >       the reader that the page in question is the one intended to be displayed.
120  
121         You must pass this script the URL at which the list of links is found
122 <       (--links) and the path where logs should be outputted (--output). All
123 <       other arguments are optional.
122 >       (--links) and the path where the directory of logs should be outputted
123 >       (--output). All other arguments are optional.
124  
125   OPTIONS
126 <       --help              Show this page
127 <       --links URL         URL from which to download file with external links
128 <                           (note that this can be a local file if you use the
129 <                           file:// protocol) (required)
130 <       --output DIR        Place the folder which will contain the reports and
131 <                           optional screenshots at this path (required)
132 <       --exceptions URL    In order to remove links from the list which show as
133 <                           NG but which you regard as OK, prepare a plain-text
134 <                           file where each line contains a response code being
135 <                           returned and the URL returning it, separated by a
136 <                           comma, e.g. "403,http://www.example.com" (note that
137 <                           this can be a local file if you use the
138 <                           file:// protocol)
139 <       --record-ok-links   Log a link in the report even if its response code is
140 <                           OK
141 <       --suggest-snapshots Query the Internet Archive for a possible snapshot
142 <                           URL for each NG page
143 <       --take-screenshots  Save screenshots of each OK page (requires Google
144 <                           Chrome to be found at the path in CHROME)
145 <       --start-url NUM     Start at this link in the links file
146 <       --end-url NUM       Stop at this link in the links file
147 <       --upload FILE       Upload report using info in this local file
126 >       --help                 Show this page.
127 >       --links URL            (required) URL from which to download the CSV file
128 >                              with external links. Note that this URL can be a
129 >                              local file if you supply a file:// path.
130 >       --output DIR           (required) Place the folder which will contain the
131 >                              reports and optional screenshots at this (Unix-
132 >                              format) path.
133 >       --exceptions URL       In order to remove links from the report which Val
134 >                              finds an issue with, but which you regard as OK,
135 >                              list those desired exceptions in this file. See
136 >                              the sample file exceptions.txt for details. Note
137 >                              that this text file can be a local file if you
138 >                              supply a file:// path.
139 >       --record-ok-links      Log a link in the report even if its response code
140 >                              is "OK".
141 >       --suggest-snapshots    Query the Internet Archive for a possible snapshot
142 >                              URL for each "NG" page.
143 >       --take-screenshots DIR Use the copy of Google Chrome at this path to take
144 >                              screenshots of each "OK" page.
145 >       --start-url NUM        Start at this link in the link dump CSV file.
146 >       --end-url NUM          Stop at this link in the link dump CSV file.
147 >       --upload FILE          Upload report using the credentials in this local
148 >                              text file. See sftp_login.txt for example.
149  
150   BUGS
151         The script cannot properly parse any line in the external links file
# Line 157 | Line 165 | fi
165   # Parse arguments as long as there are more arguments to process
166   while (( "$#" )); do
167     case "$1" in
168 <      --links )             LINKS_URL="$2";      shift 2;;
169 <      --exceptions )        EXCEPT_URL="$2";     shift 2;;
170 <      --output )            OUTPUT_DIR="$2";     shift 2;;
171 <      --record-ok-links )   RECORD_OK_LINKS=1;   shift;;
172 <      --suggest-snapshots ) SUGGEST_SNAPSHOTS=1; shift;;
173 <      --take-screenshots )  TAKE_PAGE_SHOT=1;    shift;;
174 <      --start-url )         URL_START=$2;        shift 2;;
175 <      --end-url )           URL_LIMIT=$2;        shift 2;;
176 <      --upload )            UPLOAD_INFO=$2;      shift 2;;
168 >      --links )             LINKS_URL="$2";                     shift 2;;
169 >      --exceptions )        EXCEPT_URL="$2";                    shift 2;;
170 >      --output )            OUTPUT_DIR="$2";                    shift 2;;
171 >      --record-ok-links )   RECORD_OK_LINKS=1;                  shift;;
172 >      --suggest-snapshots ) SUGGEST_SNAPSHOTS=1;                shift;;
173 >      --take-screenshots )  TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
174 >      --start-url )         URL_START=$2;                       shift 2;;
175 >      --end-url )           URL_LIMIT=$2;                       shift 2;;
176 >      --upload )            UPLOAD_INFO=$2;                     shift 2;;
177        * )                   echo "Invalid argument $1 detected. Aborting."; exit 1;;
178    esac
179   done
180  
181   # If the required arguments were not supplied, print help page and quit
182   if [ -z $LINKS_URL ] || [ -z $OUTPUT_DIR ]; then
183 <   printHelp
176 <   echo "Error: I did not receive one or both required arguments."
183 >   echo "Error: I did not receive one or both required arguments. Run me with the \"--help\" argument for documentation."
184     exit 2
185   fi
186  
187 + # If user wants screenshots, make sure path to Chrome was passed in and is valid
188 + if [ $TAKE_PAGE_SHOT -eq 1 ]; then
189 +   if [ ! -f "$CHROME_PATH" ]; then
190 +      echo "Error: You need to supply a path to the Google Chrome application in order to take screenshots."
191 +      exit 3
192 +   fi
193 + fi
194 +
195   # Check that UPLOAD_INFO exists, if this argument was supplied
196   if [ ! -z $UPLOAD_INFO ] && [ ! -f "$UPLOAD_INFO" ]; then
197     echo "Error: The file $UPLOAD_INFO supplied by the --upload argument does not appear to exist. Aborting."
198 <   exit 3
198 >   exit 4
199   fi
200  
201   # Check that OUTPUT_DIR is a directory
202   if [ ! -d "$OUTPUT_DIR" ]; then
203     echo "Error: The path $OUTPUT_DIR supplied by the --output argument does not appear to be a directory. Aborting."
204 <   exit 4
204 >   exit 5
205   fi
206  
207   # Make timestamped folder inside OUTPUT_DIR for this session's log and screenshots
# Line 207 | Line 222 | fi
222   # Check that 'mkdir' succeeded
223   if [ ! -d "$OUTPUT_PATH" ]; then
224     echo "Error: I could not create the folder \"$OUTPUT_FOLDER\" inside the directory $OUTPUT_PATH. Aborting."
225 <   exit 5
225 >   exit 6
226   fi
227  
228   # Get date on the file at LINKS_URL and print to log
229   LINKS_DATE=$(curl --silent --head $LINKS_URL | grep "Last-Modified")
230   if [ -z "$LINKS_DATE" ]; then
231     echo "Error: I could not find the external links file at the path \"$LINKS_URL\" supplied by the --links argument. Aborting."
232 <   exit 6
232 >   exit 7
233   fi
234   LINKS_DATE=${LINKS_DATE#Last-Modified: }
235  
# Line 277 | Line 292 | function printHTMfooter()
292   }
293  
294   # The central logging function. The first parameter is a string composed of one or more characters that
295 < # indicates which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
295 > # indicate which output to use: 'c' means console, 't' means the TXT log, 'r' means the RTF log, and
296   # 'h' means the HTML log. 'n' means "Don't print a newline at the end of the line." 'w' means "Don't
297   # pass console output through 'fmt'" ("fmt" fits the output to an 80-column CLI but can break special
298   # formatting and the 'n' option).
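
The valPrint function itself is outside this diff, but the flag convention the corrected comment describes can be illustrated with a hypothetical stand-in; the real function also handles RTF/HTML formatting, 'fmt' wrapping, and the 'n' and 'w' modifiers:

   #!/bin/bash
   # Hypothetical stand-in for valPrint, only to illustrate the c/t/r/h flags
   TXT_LOG=/tmp/val.txt; RTF_LOG=/tmp/val.rtf; HTM_LOG=/tmp/val.htm
   function demoPrint()
   {
      local FLAGS="$1"; shift
      [[ $FLAGS == *c* ]] && echo "$@"                # 'c' = console
      [[ $FLAGS == *t* ]] && echo "$@" >> "$TXT_LOG"  # 't' = TXT log
      [[ $FLAGS == *r* ]] && echo "$@" >> "$RTF_LOG"  # 'r' = RTF log
      [[ $FLAGS == *h* ]] && echo "$@" >> "$HTM_LOG"  # 'h' = HTML log
   }
   demoPrint ctrh "Goes to the console and to all three logs."
   demoPrint trh "Goes only to the three log files."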
# Line 416 | Line 431 | function wrapupAndExit()
431     valPrint ctrh "I skipped $LINKS_SKIPPED $(pluralCheckNoun link $LINKS_SKIPPED), and found $FILE_LINKS $(pluralCheckNoun file $FILE_LINKS) and $PAGE_LINKS $(pluralCheckNoun page $PAGE_LINKS)."
432     if [ $LINKS_SKIPPED -gt 0 ]; then valPrint ctrh "Skip breakdown: "; fi
433     if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
434 <   if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE links on JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
434 >   if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
435     if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
436     if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
437     if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
438     if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
439 <   valPrint ctrh "Out of the $LINKS_CHECKED links checked, $IW_LINKS could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS), $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG."
440 <   if [ $SKIP_EXCEPT -gt 0 ]; then
441 <      valPrint ctrh "$SKIP_EXCEPT/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS) went unlisted due to being found in the exceptions file."
439 >   valPrint ctrh "Out of the $LINKS_CHECKED links checked, $EI_LINKS could be $(pluralCheckAn $EI_LINKS)intrawiki $(pluralCheckNoun link $EI_LINKS), $IW_LINKS could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS), $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG."
440 >   if [ $SKIP_EXPECT_NG -gt 0 ]; then
441 >      valPrint ctrh "$SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS) went unlisted due to being found in the exceptions file."
442 >   fi
443 >   if [ $SKIP_EXPECT_EI -gt 0 ]; then
444 >      valPrint ctrh "$SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS) went unlisted due to being found in the exceptions file."
445     fi
446 +   if [ $SKIP_EXPECT_IW -gt 0 ]; then
447 +      valPrint ctrh "$SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS) went unlisted due to being found in the exceptions file."
448 +   fi
449 +   valPrint trh "ValExtLinks says goodbye."
450     printRTFfooter
451     printHTMfooter
452  
# Line 459 | Line 481 | fi
481  
482   # Attempt to download file at EXCEPT_URL, then check that it succeeded
483   if [ ! -z $EXCEPT_URL ]; then
484 <   valPrint cwtrh "Downloading list of NG exceptions from $EXCEPT_URL."
484 >   valPrint cwtrh "Downloading list of reporting exceptions from $EXCEPT_URL."
485     EXCEPT_FILE_NAME=$(echo "$EXCEPT_URL" | sed 's/.*\///')
486     EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
487     curl --silent -o "$EXCEPT_FILE" $EXCEPT_URL
# Line 486 | Line 508 | else
508   fi
509  
510   # Print settings to console and log
511 < declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not print NG links that are listed in the exceptions file.")
511 > declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not report links that are listed in the exceptions file.")
512   if [ $TAKE_PAGE_SHOT -eq 0 ]; then SETTINGS_MSG[10]="but will not"; fi
513   if [ $RECORD_OK_LINKS -eq 0 ]; then SETTINGS_MSG[22]="not"; fi
514   if [ $SUGGEST_SNAPSHOTS -eq 0 ]; then SETTINGS_MSG[26]="will not"; fi
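
The SETTINGS_MSG trick above works because an unquoted 'declare -a' splits the sentence into one array element per word, while each quoted phrase ("and will", "also", "will", and the final sentence about the exceptions file) stays a single element, so a flag check can flip the wording by overwriting one index, as the three 'if' lines above do. A stripped-down illustration of the same technique with made-up wording:

   #!/bin/bash
   # Element 1 ("will indeed") is quoted so its two words stay one replaceable element.
   declare -a MSG=(I "will indeed" take screenshots.)
   TAKE_PAGE_SHOT=0
   if [ $TAKE_PAGE_SHOT -eq 0 ]; then MSG[1]="will not"; fi
   echo "${MSG[@]}"   # prints: I will not take screenshots.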
# Line 504 | Line 526 | valPrint hn "<h3>Legend</h3>"
526   valPrint trh "OK = URL seems to be working."
527   valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it, because false negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to Iritscen. An NG link should be followed by a link to the Internet Archive's Wayback Machine which may help you repair the link. If the link cannot be repaired, you can disable it on the wiki (which prevents it from showing up in future ValExtLinks reports) by wrapping it in nowiki tags."
528   valPrint trh "RD = The server responding to this URL is saying that the page moved and you should instead use the supplied new URL. Some RD links represent minor adjustments in the organization of a web site, and some are soft 404s (the file/page has been removed and you are being redirected to something like the main page of the web site). You will have to look at the new URL yourself to determine if it represents an OK link and the link on the wiki should be updated to this one, or if the desired file/page is actually gone and we need to replace the wiki link with an Internet Archive snapshot link -- or disable the URL if it has not been archived."
529 < valPrint trh "IW = URL is working but should be converted to interwiki link using the suggested markup."
529 > valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup."
530 > valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup."
531   valPrint t "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see here for code reference: $HTTP_CODES)."
532   valPrint r "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see {\field{\*\fldinst{HYPERLINK \"$HTTP_CODES\"}}{\fldrslt here}} for code reference)."
533   valPrint h "(xxx) = Unix tool 'curl' obtained this HTTP response status code (see <a href=\"$HTTP_CODES\" target=\"_blank\">here</a> for code reference)."
# Line 512 | Line 535 | valPrint t "(000-xx) = 'curl' did not ge
535   valPrint r "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see {\field{\*\fldinst{HYPERLINK \"$CURL_CODES\"}}{\fldrslt here}} for code reference)."
536   valPrint h "(000-xx) = 'curl' did not get an HTTP response code, but returned this exit code (see <a href=\"$CURL_CODES\" target=\"_blank\">here</a> for code reference)."
537   valPrint trh "IA suggests = Last available snapshot suggested by the Internet Archive."
538 < valPrint trh "Try browsing = The Archive occasionally fails to return a snapshot URL even when one exists, so you will need to check for a snapshot manually using the Wayback Machine before concluding that a site has not been archived."
538 > valPrint trh "Try browsing = The Archive occasionally fails to return a snapshot URL even when one exists, so you will need to check for a snapshot manually using this link to the Wayback Machine before concluding that a site has not been archived."
539   valPrint trh ""
540  
541  
# Line 584 | Line 607 | for LINE in `cat "$LINKS_FILE"`; do
607        continue
608     fi
609  
610 +   # Build longer wiki page URLs from namespace and page names
611 +   FULL_PAGE_PATH=http://$WIKI_PATH/$NS_NAME:$PAGE_NAME
612 +   LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
613 +   # Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it
614 +   # explicitly breaks the link
615 +   if [ $NS_ID -eq 0 ]; then
616 +      FULL_PAGE_PATH=http://$WIKI_PATH/$PAGE_NAME
617 +      LOCAL_PAGE_PATH=$PAGE_NAME
618 +   fi
619 +
620     # The URL being linked to is everything after the previous two fields (this allows commas to be in
621     # the URLs, but a comma in the previous field, the page name, will break this)
622     URL=${LINE#$NS_ID,$PAGE_NAME,}
# Line 600 | Line 633 | for LINE in `cat "$LINKS_FILE"`; do
633     HAS_SUFFIX=0
634  
635     # If the URL ends in something like ".php?foo=bar", strip everything from the '?' onward
636 <   SAN_URL=${URL%%\?*}
636 >   CLEAN_URL=${URL%%\?*}
637  
638     # If the URL ends in something like "#section_15", strip everything from the '#' onward
639 <   SAN_URL=${SAN_URL%%\#*}
639 >   CLEAN_URL=${CLEAN_URL%%\#*}
640  
641     # 'sed' cannot handle Unicode in my Bash shell, so skip this URL and make user check it
642 <   if [[ $SAN_URL == *[![:ascii:]]* ]]; then
642 >   if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
643        valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because I cannot handle non-ASCII characters."
644        let SKIP_NON_ASCII+=1
645        continue
646     fi
647  
648     # Isolate the characters after the last period and after the last slash
649 <   POST_DOT=$(echo "$SAN_URL" | sed 's/.*\.//')
650 <   POST_SLASH=$(echo "$SAN_URL" | sed 's/.*\///')
649 >   POST_DOT=$(echo "$CLEAN_URL" | sed 's/.*\.//')
650 >   POST_SLASH=$(echo "$CLEAN_URL" | sed 's/.*\///')
651  
652     # If the last period comes after the last slash, then the URL ends in a suffix
653     POST_DOT_LENGTH=$(echo | awk -v input=$POST_DOT '{print length(input)}')
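
The renamed CLEAN_URL steps are plain parameter expansions plus two 'sed' one-liners; run standalone against a sample URL (made up for illustration), they behave like this:

   #!/bin/bash
   URL="http://www.example.com/downloads/archive.zip?session=42#mirror_2"

   CLEAN_URL=${URL%%\?*}          # strip from the first '?' onward
   CLEAN_URL=${CLEAN_URL%%\#*}    # strip from the first '#' onward
   echo "$CLEAN_URL"              # http://www.example.com/downloads/archive.zip

   POST_DOT=$(echo "$CLEAN_URL" | sed 's/.*\.//')    # zip (text after the last period)
   POST_SLASH=$(echo "$CLEAN_URL" | sed 's/.*\///')  # archive.zip (text after the last slash)
   echo "$POST_DOT $POST_SLASH"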
# Line 692 | Line 725 | for LINE in `cat "$LINKS_FILE"`; do
725        CURL_RESULT="$CURL_RESULT-$CURL_ERR"
726     fi
727  
728 <   # Determine our status code for this URL (IW, OK, RD, or NG)
728 >   # Begin to determine our status code for this URL (EI, IW, OK, RD, or NG)
729     STATUS="??"
730     NEW_URL=""
731     INTERWIKI_INDEX=-1
732 <   # First check if this is a link to a domain that we have an interwiki prefix for
733 <   for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
734 <      if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]]; then
735 <         STATUS="IW"
736 <         let IW_LINKS+=1
737 <         INTERWIKI_INDEX=$i
738 <         break
739 <      fi
740 <   done
732 >
733 >   # First make sure that this isn't an "external internal" link to our own wiki that can be replaced
734 >   # by "[[page_name]]". If it uses a special access URL beginning with "/w/", let it pass, as it
735 >   # probably cannot be replaced by "[[ ]]" markup
736 >   if [[ $URL == *$WIKI_PATH* ]] && [[ $URL != *$WIKI_PATH/w/* ]]; then
737 >      STATUS="EI"
738 >      let EI_LINKS+=1
739 >   fi
740 >
741 >   # If it's not, check if this is a link to a domain that we have an interwiki prefix for
742 >   if [ $STATUS == "??" ]; then
743 >      for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
744 >         if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]] && [[ $URL != *${INTERWIKI_DOMAINS[$i]}/w/* ]]; then
745 >            STATUS="IW"
746 >            let IW_LINKS+=1
747 >            INTERWIKI_INDEX=$i
748 >            break
749 >         fi
750 >      done
751 >   fi
752  
753     # If we didn't match an interwiki domain, see if the status code is in our "OK" codes list
754     if [ $STATUS == "??" ]; then
# Line 724 | Line 768 | for LINE in `cat "$LINKS_FILE"`; do
768              # Get URL header again in order to retrieve the URL we are being redirected to
769              NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent '"$AGENT"' --max-time 10 --write-out '%{redirect_url}\n' $URL)
770  
771 <            # Check if the redirect URL is just the original URL with https:// instead of http://
772 <            # (this happens a lot and is not an important correction to us); if so, just make it "OK"
773 <            URL_NO_PROTOCOL=${URL#*://}
774 <            NEW_URL_NO_PROTOCOL=${NEW_URL#*://}
771 >            # Filter out cases where the redirect URL is just the original URL with https:// instead of
772 >            # http://, or with an added '/' at the end. These corrections happen a lot and are not
773 >            # important to us.
774 >            URL_NO_PROTOCOL=${URL#http://}
775 >            URL_NO_PROTOCOL=${URL_NO_PROTOCOL%/}
776 >            NEW_URL_NO_PROTOCOL=${NEW_URL#https://}
777 >            NEW_URL_NO_PROTOCOL=${NEW_URL_NO_PROTOCOL%/}
778 >
779 >            # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
780 >            NEW_URL_LENGTH=$(echo | awk -v input=$NEW_URL_NO_PROTOCOL '{print length(input)}')
781 >            if [ $NEW_URL_LENGTH -lt $MIN_URL_LENGTH ]; then
782 >               NEW_URL_NO_PROTOCOL="[new URL not retrieved]"
783 >            fi
784 >
785 >            # If the URLs match after the above filters were applied, then the link is OK
786              if [ $URL_NO_PROTOCOL == $NEW_URL_NO_PROTOCOL ]; then
787                 STATUS="OK"
788                 let OK_LINKS+=1
# Line 758 | Line 813 | for LINE in `cat "$LINKS_FILE"`; do
813        continue
814     fi
815  
816 <   # If link is "NG" and there is an exceptions file, compare URL against the list before logging it
817 <   if [ $STATUS == "NG" ] && [ ! -z $EXCEPT_URL ]; then
816 >   # Check problem links against exceptions file before proceeding
817 >   if [ $STATUS != "OK" ] && [ ! -z $EXCEPT_URL ]; then
818 >      # The code we expect to find in the exceptions file is either the 'curl' result or "EI"/"IW"
819 >      EXPECT_CODE="$CURL_RESULT"
820 >      if [ $STATUS == "EI" ]; then
821 >         EXPECT_CODE="EI"
822 >      elif [ $STATUS == "IW" ]; then
823 >         EXPECT_CODE="IW"
824 >      fi
825 >
826 >      # Look for link in exceptions file and make sure its listed result code and wiki page also match
827        GREP_RESULT=$(grep --max-count=1 "$URL" "$EXCEPT_FILE")
828 <      EXCEPT_CODE=${GREP_RESULT%%,*}
829 <      if [ "$EXCEPT_CODE" == $CURL_RESULT ]; then
830 <         valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because its status code, $CURL_RESULT, is listed in the exceptions file."
831 <         let SKIP_EXCEPT+=1
832 <         continue
828 >      EXCEPT_PAGE=${GREP_RESULT##*,}
829 >      if [ "$EXCEPT_PAGE" == "*" ] || [ "$EXCEPT_PAGE" == $LOCAL_PAGE_PATH ]; then
830 >         EXCEPT_CODE=${GREP_RESULT%%,*}
831 >         if [ "$EXCEPT_CODE" == "$EXPECT_CODE" ]; then
832 >            valPrint tr "Skipping URL $URL (found on page $PAGE_NAME) because its expected result, $EXPECT_CODE, is listed in the exceptions file."
833 >            if [ $STATUS == "EI" ]; then
834 >               let SKIP_EXPECT_EI+=1
835 >            elif [ $STATUS == "IW" ]; then
836 >               let SKIP_EXPECT_IW+=1
837 >            else
838 >               let SKIP_EXPECT_NG+=1
839 >            fi
840 >            continue
841 >         fi
842        fi
843     fi
844  
845     # If appropriate, record this link to the log, with clickable URLs when possible
846     if [ $STATUS != "OK" ] || [ $RECORD_OK_LINKS -eq 1 ]; then
847 <      FULL_PAGE_PATH=http://$WIKI_PATH/$NS_NAME:$PAGE_NAME
848 <      LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
776 <      # Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it explicitly breaks the link
777 <      if [ $NS_ID -eq 0 ]; then
778 <         FULL_PAGE_PATH=http://$WIKI_PATH/$PAGE_NAME
779 <         LOCAL_PAGE_PATH=$PAGE_NAME
780 <      fi
781 <
782 <      # Stupid hack since the text "IW" is narrower than "OK", "RD", or "NG" and it takes an extra tab
783 <      # to get to the desired level of indentation in the RTF log
847 >      # Stupid hack since the strings "IW" and "EI" are narrower than "OK", "RD", or "NG" and it takes
848 >      # an extra tab to get to the desired level of indentation in the RTF log
849        RTF_TABS="        "
850 <      if [ $STATUS == "IW" ]; then
850 >      if [ $STATUS == "IW" ] || [ $STATUS == "EI" ]; then
851           RTF_TABS="             "
852        fi
853        
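
The extraction above (EXCEPT_CODE from before the first comma, EXCEPT_PAGE from after the last comma) implies each exceptions line carries three comma-separated fields: the expected result ("EI", "IW", or a 'curl' result such as 404), the URL, and the wiki page it appears on, with "*" apparently standing for any page. The sample exceptions.txt referenced in the help text is not part of this diff, so these lines are illustrative guesses at the format rather than copies of it:

   404,http://www.example.com/old_page.html,Main_Page
   EI,http://wiki.example.net/Some_Page,*
   IW,https://en.wikipedia.org/wiki/Oni,OBD:Some_Article

And the corresponding field extraction, run standalone:

   #!/bin/bash
   GREP_RESULT="404,http://www.example.com/old_page.html,Main_Page"
   EXCEPT_CODE=${GREP_RESULT%%,*}   # 404       (everything before the first comma)
   EXCEPT_PAGE=${GREP_RESULT##*,}   # Main_Page (everything after the last comma)
   echo "$EXCEPT_CODE $EXCEPT_PAGE"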
# Line 801 | Line 866 | for LINE in `cat "$LINKS_FILE"`; do
866           valPrint hn "<tr><td colspan=\"2\" align=\"right\">Server suggests</td><td><a href=\"$NEW_URL\" target=\"_blank\">$NEW_URL</a></td></tr>"
867        fi
868  
869 +      # Get everything after domain name in URL for use in EI and IW listings
870 +      POST_DOMAIN=${URL#*://*/}
871 +
872 +      # Notify reader if we can use an intrawiki link for this URL
873 +      if [ $STATUS == "EI" ]; then
874 +         valPrint t "  Just use [[$POST_DOMAIN]]"
875 +         valPrint r "           Just use [[$POST_DOMAIN]]"
876 +         valPrint hn "<tr><td colspan=\"2\" align=\"right\">Just use</td><td>[[$POST_DOMAIN]]</td></tr>"
877 +      fi
878 +
879        # Notify reader if we can use an interwiki prefix for this URL
880        if [ $STATUS == "IW" ]; then
881 <         valPrint t "  You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_SLASH]]"
882 <         valPrint r "           You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_SLASH]]"
883 <         valPrint hn "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_SLASH]]</td></tr>"
881 >         valPrint t "  You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_DOMAIN]]"
882 >         valPrint r "           You can use [[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_DOMAIN]]"
883 >         valPrint hn "<tr><td colspan=\"2\" align=\"right\">You can use</td><td>[[${INTERWIKI_PREFIXES[$INTERWIKI_INDEX]}:$POST_DOMAIN]]</td></tr>"
884        fi
885  
886        # Query Internet Archive for latest "OK" snapshot for "NG" page
# Line 835 | Line 910 | for LINE in `cat "$LINKS_FILE"`; do
910  
911        # Don't take screenshot if we already encountered this page and screenshotted it
912        if [ ! -f "$SHOT_FILE" ]; then
913 <         "$CHROME" --headless --disable-gpu --screenshot --window-size=1500,900 $URL > /dev/null 2>&1
913 >         "$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 $URL > /dev/null 2>&1
914           if [ -f "$WORKING_DIR/$CHROME_SCREENSHOT" ]; then
915              mv -n "$WORKING_DIR/$CHROME_SCREENSHOT" "$SHOT_FILE"
916           else
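
Because --take-screenshots now accepts any Chrome binary, it can be worth confirming by hand that the chosen copy supports headless capture before a long run; a quick manual test using the same flags the script uses (the Chrome path and URL are placeholders):

   CHROME_PATH="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
   cd /tmp
   "$CHROME_PATH" --headless --disable-gpu --screenshot --window-size=1500,900 "http://example.com" > /dev/null 2>&1
   ls -l screenshot.png    # headless Chrome writes screenshot.png into the working directory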

Diff Legend

Removed lines
+ Added lines
< Changed lines (old)
> Changed lines (new)