ViewVC Help
View File | Revision Log | View Changeset | Root Listing
root/Oni2/Validate External Links/validate_external_links.sh
(Generate patch)

Comparing Validate External Links/validate_external_links.sh (file contents):
Revision 1141 by iritscen, Fri Sep 4 02:54:30 2020 UTC vs.
Revision 1142 by iritscen, Fri Sep 4 03:07:08 2020 UTC

# Line 6 | Line 6
6   # - TXT (for easy diffing with an earlier log)
7   # - RTF (for reading as a local file with clickable links)
8   # - HTML (for uploading as a web page).
9 < # Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
9 > # Call script with "--help" argument for documentation. Also see Read Me.rtf for critical notes.
10   #
11   # Recommended rule:
12   # |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|
# Line 41 | Line 41 | SKIP_ARCHIVE_LINKS=0 # don't check URLs
41   TAKE_PAGE_SHOT=0     # take a screenshot of each OK page
42   TIMEOUT=10           # time to wait for a response when querying a site
43   CHROME_PATH=""       # path to a copy of Google Chrome that has the command-line screenshot feature
44 < URL_START=1          # start at this URL in LINKS_FILE (1 by default)
44 > URL_START=1          # start at this URL in LINKS_FILE
45   URL_LIMIT=0          # if non-zero, stop at this URL in LINKS_FILE
46   UPLOAD_INFO=""       # path to a file on your hard drive with the login info needed to upload a report
47  
48   # Fixed strings -- see the occurrences of these variables to learn their purpose
49 < AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36 OPR/69.0.3686.77"
49 > AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 OPR/70.0.3728.154"
50   ARCHIVE_API="http://archive.org/wayback/available"
51   ARCHIVE_GENERIC="https://web.archive.org/web/*"
52   ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
# Line 103 | Line 103 | SKIP_NON_ASCII=0
103   SKIP_UNK_SUFFIX=0
104   SKIP_UNK_CODE=0
105   SKIP_EXPECT_NG=0
106 + SKIP_EXPECT_RD=0
107   SKIP_EXPECT_EI=0
108   SKIP_EXPECT_IW=0
109   SKIP_HTTPS_UP=0
# Line 180 | Line 181 | OPTIONS
181         --take-screenshots FILE Call the Google Chrome binary at this path to
182                                 take screenshots of each "OK" page.
183         --timeout NUM           Wait this many seconds for a site to respond. The
184 <                               default is 10.
184 >                               default is 10. Important note: Val will attempt
185 >                               to reach each URL three times, so the time taken
186 >                               to ping an unresponsive site will be three times
187 >                               this setting.
188         --start-url NUM         Start at this link in the links CSV file.
189         --end-url NUM           Stop at this link in the links CSV file.
190         --upload FILE           Upload report using the credentials and path
# Line 479 | Line 483 | function wrapupAndExit()
483  
484     # Do some math on results of session
485     LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
482   LINK_PROBLEMS=$((EI_LINKS+IW_LINKS+RD_LINKS+NG_LINKS))
483   LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
484   LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
486     TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP+SKIP_YOUTU_BE))
487 <   LINKS_CHECKED=$((LINKS_PROCESSED-LINK_ERRORS))
487 >   LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
488 >   LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_RD+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
489 >   LINK_PROBLEMS_TOTAL=$((NG_LINKS+RD_LINKS+EI_LINKS+IW_LINKS))
490 >   LINK_PROBLEMS_NG=$((NG_LINKS-SKIP_EXPECT_NG))
491 >   LINK_PROBLEMS_RD=$((RD_LINKS-SKIP_EXPECT_RD))
492 >   LINK_PROBLEMS_EI=$((EI_LINKS-SKIP_EXPECT_EI))
493 >   LINK_PROBLEMS_IW=$((IW_LINKS-SKIP_EXPECT_IW))
494 >   LINK_PROBLEMS_NET=$((LINK_PROBLEMS_NG+LINK_PROBLEMS_RD+LINK_PROBLEMS_EI+LINK_PROBLEMS_IW))
495  
496     ## SUMMARY OUTPUT ##
497     valPrint ct "Summary ($ELAPSED):"
# Line 495 | Line 503 | function wrapupAndExit()
503     if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
504     if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
505     if [ $SKIP_ARCHIVE_ORG -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVE_ORG Archive.org $(pluralCheckNoun link $SKIP_ARCHIVE_ORG) were not checked"; fi
506 <   if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS processed $(pluralCheckNoun link $LINK_PROBLEMS) had $(pluralCheckAn $LINK_PROBLEMS)$(pluralCheckNoun issue $LINK_PROBLEMS)"; fi
507 <   if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr "  (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; valPrint h "&nbsp;&nbsp;(excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
506 >   if [ $LINK_PROBLEMS_TOTAL -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_TOTAL processed $(pluralCheckNoun link $LINK_PROBLEMS_TOTAL) had $(pluralCheckAn $LINK_PROBLEMS_TOTAL)$(pluralCheckNoun issue $LINK_PROBLEMS_TOTAL)"; fi
507 >   if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr "  (excepted $LINKS_EXCEPTED link $(pluralCheckNoun issue $LINKS_EXCEPTED) from report)"; valPrint h "&nbsp;&nbsp;(excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
508     if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi
509     if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr "  (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h "&nbsp;&nbsp;(counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi
510  
503   # Print excepted link totals
504   if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi
505   if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
506   if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
507   if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi
508
511     # Print errored link totals
512     if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"; fi
513     if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
# Line 515 | Line 517 | function wrapupAndExit()
517     if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
518     if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
519  
520 +   # Print excepted link totals
521 +   if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi
522 +   if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
523 +   if [ $SKIP_EXPECT_RD -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_RD/$RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
524 +   if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
525 +   if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi
526 +
527     # Print checked link totals
528 <   if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS link $(pluralCheckNoun issue $LINK_PROBLEMS):"; fi
529 <   if [ $NG_LINKS -gt 0 ]; then valPrint ctrh "- $NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
530 <   if [ $RD_LINKS -gt 0 ]; then valPrint ctrh "- $RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
531 <   if [ $EI_LINKS -gt 0 ]; then valPrint ctrh "- $EI_LINKS $(pluralCheckNoun link $EI_LINKS) that could be intrawiki"; fi
532 <   if [ $IW_LINKS -gt 0 ]; then valPrint ctrh "- $IW_LINKS $(pluralCheckNoun link $IW_LINKS) that could be interwiki"; fi
528 >   if [ $LINK_PROBLEMS_NET -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS_NET link $(pluralCheckNoun issue $LINK_PROBLEMS_NET):"; fi
529 >   if [ $LINK_PROBLEMS_NG -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_NG NG $(pluralCheckNoun link $LINK_PROBLEMS_NG)"; fi
530 >   if [ $LINK_PROBLEMS_RD -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_RD $(pluralCheckNoun redirection $LINK_PROBLEMS_RD)"; fi
531 >   if [ $LINK_PROBLEMS_EI -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_EI $(pluralCheckNoun link $LINK_PROBLEMS_EI) that could be intrawiki"; fi
532 >   if [ $LINK_PROBLEMS_IW -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS_IW $(pluralCheckNoun link $LINK_PROBLEMS_IW) that could be interwiki"; fi
533  
534     # Close the log files' markup
535     valPrint trh "ValExtLinks says goodbye."
# Line 615 | Line 624 | if [ $RECORD_OK_LINKS -eq 1 ]; then valP
624   valPrint ctrhn "Take screenshots: "
625   if [ $TAKE_PAGE_SHOT -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
626  
627 < valPrint ctrhn "Suggest Archive.org snapshots: "
627 > valPrint ctrhn "Suggest archive.org snapshots: "
628   if [ $SUGGEST_SNAPSHOTS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
629  
630   valPrint ctrhn "Ignore slash-adding redirects: "
# Line 861 | Line 870 | for LINE in `cat "$LINKS_FILE"`; do
870  
871     # Get response code using 'curl' to see if this link is valid; the --insecure option avoids an
872     # issue with sites that require HTTPS
873 <   CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent '$AGENT' --max-time $TIMEOUT --write-out '%{http_code}\n' $URL)
873 >   CURL_CODE=$(curl -o /dev/null --silent --insecure --head --user-agent '$AGENT' --max-time $TIMEOUT --retry 2 --write-out '%{http_code}\n' $URL)
874     CURL_ERR=$(echo $?)
875     CURL_RESULT=$CURL_CODE
876  
# Line 997 | Line 1006 | for LINE in `cat "$LINKS_FILE"`; do
1006        {
1007           EXCEPT_LINE="${EXCEPT_ARRAY[$i]}"
1008  
1009 +         # Undo any HTML-encoding from the wiki page; for now we just worry about the ampersand, as most
1010 +         # other HTML-encoded characters are not found in URLs
1011 +         EXCEPT_LINE=$(echo "$EXCEPT_LINE" | sed 's/\&amp;/\&/')
1012 +
1013           # Match URL
1014           EXCEPT_URL="${EXCEPT_LINE#*,}"
1015           EXCEPT_URL="${EXCEPT_URL%,*}"
# Line 1016 | Line 1029 | for LINE in `cat "$LINKS_FILE"`; do
1029                    let SKIP_EXPECT_EI+=1
1030                 elif [ $STATUS == "IW" ]; then
1031                    let SKIP_EXPECT_IW+=1
1032 +               elif [ $STATUS == "RD" ]; then
1033 +                  let SKIP_EXPECT_RD+=1
1034                 else
1035                    let SKIP_EXPECT_NG+=1
1036                 fi

Diff Legend

(no marker, old line number only) Removed lines
+ Added lines
< Changed lines (old)
> Changed lines (new)