root/Oni2/Validate External Links/validate_external_links.sh

Comparing Validate External Links/validate_external_links.sh (file contents):
Revision 1142 by iritscen, Fri Sep 4 03:07:08 2020 UTC vs.
Revision 1144 by iritscen, Sun Sep 6 20:51:22 2020 UTC

# Line 5 | Line 5
5   # Validates a list of external links in CSV format. The resulting logs are produced in three formats:
6   # - TXT (for easy diffing with an earlier log)
7   # - RTF (for reading as a local file with clickable links)
8 < # - HTML (for uploading as a web page).
8 > # - HTML (for reading as a web page)
9   # Call script with "--help" argument for documentation. Also see Read Me.rtf for critical notes.
10   #
11   # Recommended rule:
# Line 29 | Line 29 | IFS="
29  
30   ### GLOBALS ###
31   # Settings -- these will be changed from their defaults by the arguments passed in to the script
32 < LINKS_URL=""         # use 'curl' to download file with links from this location (can be file://)
33 < EXCEPT_URL=""        # 'curl' will access this wiki page with a list of exceptions for NG results
34 < OUTPUT_DIR=""        # place reports and all other output in a folder inside this existing folder
35 < RECORD_OK_LINKS=0    # record response code to the log even when it's a value in OK_CODES
36 < SHOW_SLASH=0         # record response code to the log when a slash is added to the end of a URL
37 < SHOW_HTTPS=0         # record response code to the log when "http" is upgraded to "https"
38 < SHOW_YT_RD=0         # record response code to the log when a youtu.be URL is expanded to the full URL
39 < SUGGEST_SNAPSHOTS=0  # query the Internet Archive for a possible snapshot URL for each NG page
40 < SKIP_ARCHIVE_LINKS=0 # don't check URLs under the archive.org domain
41 < TAKE_PAGE_SHOT=0     # take a screenshot of each OK page
42 < TIMEOUT=10           # time to wait for a response when querying a site
43 < CHROME_PATH=""       # path to a copy of Google Chrome that has the command-line screenshot feature
44 < URL_START=1          # start at this URL in LINKS_FILE
45 < URL_LIMIT=0          # if non-zero, stop at this URL in LINKS_FILE
46 < UPLOAD_INFO=""       # path to a file on your hard drive with the login info needed to upload a report
32 > LINKS_URL=""          # use 'curl' to download file with links from this location (can be file://)
33 > EXCEPT_URL=""         # 'curl' will access this wiki page with a list of exceptions for NG results
34 > OUTPUT_DIR=""         # place reports and all other output in a folder inside this existing folder
35 > RECORD_OK_LINKS=0     # record response code to the log even when it's a value in OK_CODES
36 > SHOW_SLASH=0          # record issue when a slash is added to the end of a URL
37 > SHOW_HTTPS=0          # record issue when "http" is upgraded to "https"
38 > SHOW_YT_RD=0          # record redirection for a youtu.be URL expanding to the full URL
39 > SUGGEST_SNAPSHOTS=0   # query the Internet Archive for a possible snapshot URL for each NG page
40 > CHECK_ARCHIVE_LINKS=0 # check URLs under the archive.org domain
41 > TAKE_PAGE_SHOT=0      # take a screenshot of each OK page
42 > TIMEOUT=10            # time to wait for a response when querying a site
43 > CHROME_PATH=""        # path to a copy of Google Chrome that has the command-line screenshot feature
44 > URL_START=1           # start at this URL in LINKS_FILE
45 > URL_LIMIT=0           # if non-zero, stop at this URL in LINKS_FILE
46 > UPLOAD_INFO=""        # path to a file on your hard drive with the login info needed to upload a report
47  
48   # Fixed strings -- see the occurrences of these variables to learn their purpose
49   AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 OPR/70.0.3728.154"
# Line 132 | Line 132 | SYNOPSIS
132         validate_external_links.sh --help
133         validate_external_links.sh --links URL --output DIR [--exceptions URL]
134            [--record-ok-links] [--show-added-slashes] [--show-https-upgrades]
135 <          [--show-yt-redirects] [--suggest-snapshots] [--skip-archive-links]
135 >          [--show-yt-redirects] [--suggest-snapshots] [--check-archive-links]
136            [--take-screenshots FILE] [--timeout NUM] [--start-url NUM]
137            [--end-url NUM] [--upload FILE]
138  
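A usage illustration of the synopsis above, exercising the renamed archive-checking flag; the URL and paths are placeholders of my own, not values from the script:

   ./validate_external_links.sh --links "file:///Users/me/links.csv" \
      --output "/Users/me/reports" --suggest-snapshots --check-archive-links
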
# Line 176 | Line 176 | OPTIONS
176         --show-yt-redirects     Report on redirects that expand a youtu.be URL.
177         --suggest-snapshots     Query the Internet Archive for a possible
178                                 snapshot URL for each "NG" page.
179 <       --skip-archive-links    Don't check links that are already pointing to
180 <                               a page on the Internet Archive.
179 >       --check-archive-links   Check links that are already pointing to a page
180 >                               on the Internet Archive. In theory these links
181 >                               should be totally stable and not need validation.
182         --take-screenshots FILE Call the Google Chrome binary at this path to
183                                 take screenshots of each "OK" page.
184         --timeout NUM           Wait this many seconds for a site to respond. The
# Line 217 | Line 218 | while (( "$#" )); do
218        --show-https-upgrades ) SHOW_HTTPS=1;                       shift;;
219        --show-yt-redirects )   SHOW_YT_RD=1;                       shift;;
220        --suggest-snapshots )   SUGGEST_SNAPSHOTS=1;                shift;;
221 <      --skip-archive-links )  SKIP_ARCHIVE_LINKS=1;               shift;;
221 >      --check-archive-links ) CHECK_ARCHIVE_LINKS=1;              shift;;
222        --take-screenshots )    TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
223        --timeout )             TIMEOUT=$2;                         shift 2;;
224        --start-url )           URL_START=$2;                       shift 2;;
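The hunk above sits inside the script's argument parser, which follows the usual while/case/shift idiom: a boolean flag consumes one positional parameter, a flag that takes a value consumes two. A minimal self-contained sketch of the idiom (the unknown-argument branch is my own illustration, not the script's):

   while (( "$#" )); do
      case "$1" in
         --check-archive-links ) CHECK_ARCHIVE_LINKS=1; shift;;   # boolean flag: consume the flag only
         --timeout )             TIMEOUT=$2;            shift 2;; # valued flag: consume flag and value
         * )                     echo "Unknown argument '$1'." >&2; exit 1;;
      esac
   done
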
# Line 260 | Line 261 | OUTPUT_FOLDER="ValExtLinks ($SAFE_TIME)"
261   OUTPUT_PATH="$OUTPUT_DIR/$OUTPUT_FOLDER"
262   SHOT_PATH="$OUTPUT_PATH/Screenshots"
263   LOG_NAME="ValExtLinks report"
264 < LOG_TXT="$OUTPUT_PATH/$LOG_NAME.txt"
265 < LOG_RTF="$OUTPUT_PATH/$LOG_NAME.rtf"
266 < LOG_HTM="$OUTPUT_PATH/$LOG_NAME.htm"
264 > LOG_NAME_TXT="$LOG_NAME.txt"
265 > LOG_NAME_RTF="$LOG_NAME.rtf"
266 > LOG_NAME_HTM="$LOG_NAME.htm"
267 > LOG_PATH="$OUTPUT_PATH/$LOG_NAME"
268 > LOG_PATH_TXT="$LOG_PATH.txt"
269 > LOG_PATH_RTF="$LOG_PATH.rtf"
270 > LOG_PATH_HTM="$LOG_PATH.htm"
271   mkdir "$OUTPUT_PATH"
272   if [ $TAKE_PAGE_SHOT -eq 1 ]; then
273     mkdir "$SHOT_PATH"
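The apparent point of splitting the old LOG_TXT/LOG_RTF/LOG_HTM variables into LOG_NAME_* (bare file names) and LOG_PATH_* (full paths) pairs: the script writes to the full paths, while the HTML report can link to its sibling reports by bare name, so the links remain valid wherever the three files are uploaded together. A sketch of the two roles, assuming the variables defined above:

   echo "log line" >> "$LOG_PATH_TXT"                                  # writing: full path on disk
   echo "<a href=\"$LOG_NAME_TXT\">TXT report</a>" >> "$LOG_PATH_HTM"  # linking: relative file name
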
# Line 360 | Line 365 | function valPrint()
365     fi
366     if [[ "$1" == *t* ]]; then
367        if [[ "$1" == *n* ]]; then
368 <         echo -n "$2" >> "$LOG_TXT"
368 >         echo -n "$2" >> "$LOG_PATH_TXT"
369        elif [[ "$1" == *s* ]]; then
370 <         echo -e "$2\n" >> "$LOG_TXT"
370 >         echo -e "$2\n" >> "$LOG_PATH_TXT"
371        else
372 <         echo "$2" >> "$LOG_TXT"
372 >         echo "$2" >> "$LOG_PATH_TXT"
373        fi
374     fi
375     if [[ "$1" == *r* ]]; then
376        if [[ "$1" == *n* ]]; then
377 <         echo "$2" >> "$LOG_RTF"
377 >         echo "$2" >> "$LOG_PATH_RTF"
378        elif [[ "$1" == *s* ]]; then
379 <         echo "$2\line\line" >> "$LOG_RTF"
379 >         echo "$2\line\line" >> "$LOG_PATH_RTF"
380        else
381 <         echo "$2\line" >> "$LOG_RTF"
381 >         echo "$2\line" >> "$LOG_PATH_RTF"
382        fi
383     fi
384     if [[ "$1" == *h* ]]; then
385        if [[ "$1" == *s* ]]; then
386 <         echo "$2<tr><td>&nbsp;</td></tr>" >> "$LOG_HTM"
386 >         echo "$2<tr><td>&nbsp;</td></tr>" >> "$LOG_PATH_HTM"
387        elif [[ "$1" == *n* ]]; then
388 <         echo "$2" >> "$LOG_HTM"
388 >         echo "$2" >> "$LOG_PATH_HTM"
389        else
390 <         echo "$2<br />" >> "$LOG_HTM"
390 >         echo "$2<br />" >> "$LOG_PATH_HTM"
391        fi
392     fi
393   }
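For orientation while reading this hunk: valPrint's first argument is a string of destination flags — 't' (TXT log), 'r' (RTF log), 'h' (HTML log), and, outside this excerpt, 'c' (console) — optionally combined with the modifiers 'n' (suppress the newline) and 's' (append spacing). Typical calls from elsewhere in the script:

   valPrint ctrh "Summary:"       # console plus all three report files
   valPrint trs "Skipping URL..." # TXT and RTF only, followed by a blank line
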
# Line 437 | Line 442 | function pluralCheckAn()
442     fi
443   }
444  
445 < # Upload HTML report using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
445 > # Upload the reports using info specified in the --upload argument. ONLY USE "valPrint c" here, as the
446   # reports being saved to disk have already been closed.
447   function uploadReport()
448   {
449 <   valPrint c "Uploading HTML report..."
449 >   valPrint c "Uploading reports..."
450  
451     SCRIPT_PATH="$THIS_DIR/$EXPECT_SCRIPT_NAME"
452     SFTP_USER_NAME_MARKER="user:"
# Line 457 | Line 462 | function uploadReport()
462     SFTP_PATH=$(grep $SFTP_PATH_MARKER $UPLOAD_INFO)
463     SFTP_PATH=${SFTP_PATH#$SFTP_PATH_MARKER}
464  
465 <   expect "$SCRIPT_PATH" "$LOG_HTM" $SFTP_USER_NAME $SFTP_PASSWORD $SFTP_PORT $SFTP_PATH "$LOG_NAME.htm"
465 >   for SUFFIX in htm rtf txt; do
466 >      expect "$SCRIPT_PATH" "$LOG_PATH.$SUFFIX" $SFTP_USER_NAME $SFTP_PASSWORD $SFTP_PORT $SFTP_PATH "$LOG_NAME.$SUFFIX"
467  
468 <   valPrint c "Report was uploaded, unless an error message appears above."
468 >      if [ "$?" -ne 0 ]; then
 469 >         valPrint c "An error occurred when attempting to upload $LOG_NAME.$SUFFIX!"
470 >      else
 471 >         valPrint c "Report in `echo $SUFFIX | tr '[:lower:]' '[:upper:]'` format was uploaded."
472 >      fi
473 >   done
474   }
475  
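A note on the status check in the loop above: $? is overwritten by every command, including the '[' test itself, so by the time the then-branch runs it no longer holds expect's exit status. If the numeric code is wanted in the error message, it has to be captured immediately; a sketch of that pattern, with UPLOAD_RESULT as a variable name of my own:

   expect "$SCRIPT_PATH" "$LOG_PATH.$SUFFIX" "$SFTP_USER_NAME" "$SFTP_PASSWORD" "$SFTP_PORT" "$SFTP_PATH" "$LOG_NAME.$SUFFIX"
   UPLOAD_RESULT=$?                     # capture before any other command clobbers it
   if [ $UPLOAD_RESULT -ne 0 ]; then
      valPrint c "Error $UPLOAD_RESULT occurred when attempting to upload $LOG_NAME.$SUFFIX!"
   fi
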
476   # Prints session summary when script is done
# Line 493 | Line 504 | function wrapupAndExit()
504     LINK_PROBLEMS_IW=$((IW_LINKS-SKIP_EXPECT_IW))
505     LINK_PROBLEMS_NET=$((LINK_PROBLEMS_NG+LINK_PROBLEMS_RD+LINK_PROBLEMS_EI+LINK_PROBLEMS_IW))
506  
507 +   # Print something in the Links section if no link issues were printed
508 +   if [ $LINK_PROBLEMS_NET -eq 0 ]; then
509 +      valPrint h "<i>No link problems to report! See the <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for a list of links with issues that were not reported.</i>"
510 +   fi
511 +   if [ $LINK_PROBLEMS_TOTAL -eq 0 ]; then
512 +      valPrint t "No link problems to report!"
513 +      valPrint r "\i1 No link problems to report! \i0"
514 +   fi
515 +
516     ## SUMMARY OUTPUT ##
517     valPrint ct "Summary ($ELAPSED):"
518     valPrint r "\b1 Summary \b0 ($ELAPSED)"
# Line 509 | Line 529 | function wrapupAndExit()
529     if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctr "  (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; valPrint h "&nbsp;&nbsp;(counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi
530  
531     # Print errored link totals
532 <   if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"; fi
532 >   if [ $LINK_ERRORS -gt 0 ]; then
533 >      valPrint c "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"
534 >      valPrint h "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
535 >      valPrint rt "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS):"
536 >   fi
537     if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
538     if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
539     if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
# Line 518 | Line 542 | function wrapupAndExit()
542     if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
543  
544     # Print excepted link totals
545 <   if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi
545 >   if [ $LINKS_EXCEPTED -gt 0 ]; then
546 >      valPrint c "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"
547 >      valPrint h "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted (see <a href=\"$LOG_NAME_RTF\" target=\"_blank\">RTF</a> or <a href=\"$LOG_NAME_TXT\" target=\"_blank\">TXT</a> report for specific links):"
548 >      valPrint rt "$LINKS_EXCEPTED link $(pluralCheckNoun problem $LINKS_EXCEPTED) excepted:"
549 >   fi
550     if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
551     if [ $SKIP_EXPECT_RD -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_RD/$RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
552     if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
# Line 637 | Line 665 | valPrint ctrhn "Ignore youtu.be redirect
665   if [ $SHOW_YT_RD -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
666  
667   valPrint ctrhn "Check archive.org links: "
668 < if [ $SKIP_ARCHIVE_LINKS -eq 1 ]; then valPrint ctrh "No"; else valPrint ctrh "Yes"; fi
668 > if [ $CHECK_ARCHIVE_LINKS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
669  
670   valPrint tr "A summary of my findings will be found at the bottom of the report."
671   valPrint h "A summary of my findings will be found at the <a href=\"#summary\">bottom</a> of the report."
# Line 765 | Line 793 | for LINE in `cat "$LINKS_FILE"`; do
793        continue
794     fi
795  
796 <   # If we're skipping Archive.org links, check if this is one
797 <   if [ $SKIP_ARCHIVE_LINKS -eq 1 ] && [[ $URL == *web.archive.org* ]]; then
798 <      valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have been asked not to check Wayback Machine links."
 796 >   # If we're not checking Archive.org links, see if this is one
797 >   if [ $CHECK_ARCHIVE_LINKS -eq 0 ] && [[ $URL == *web.archive.org* ]]; then
798 >      valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have not been asked to check Wayback Machine links."
799        let SKIP_ARCHIVE_ORG+=1
800        continue
801     fi
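The URL test in this hunk relies on bash's [[ ... == pattern ]] glob matching rather than a regex; the unquoted asterisks make it a substring check. A self-contained rendering of the same gate, with a sample Wayback Machine URL of my own:

   CHECK_ARCHIVE_LINKS=0
   URL="https://web.archive.org/web/2020/http://example.com/"
   if [ $CHECK_ARCHIVE_LINKS -eq 0 ] && [[ $URL == *web.archive.org* ]]; then
      echo "Skipping Wayback Machine URL."
   fi
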
# Line 892 | Line 920 | for LINE in `cat "$LINKS_FILE"`; do
920        let EI_LINKS+=1
921     fi
922  
923 <   # If it's not, check if this is a link to a domain that we have an interwiki prefix for
924 <   if [ $STATUS == "??" ]; then
923 >   # If it's not, check if this is a link to a domain that we have an interwiki prefix for (also make
924 >   # sure that it's not an archive.org link to a page from an interwiki domain)
925 >   if [ $STATUS == "??" ] && [[ $URL != *web.archive.org* ]]; then
926        for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
927           if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]] && [[ $URL != *${INTERWIKI_DOMAINS[$i]}/w/* ]]; then
928              STATUS="IW"
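The new web.archive.org guard matters because an archived snapshot of an interwiki page still contains the interwiki domain inside its Wayback URL, so without the guard the domain loop would tag such a link as "IW". A reduced sketch with hypothetical sample data:

   INTERWIKI_DOMAINS=(wikipedia.org)
   URL="https://web.archive.org/web/2020/https://en.wikipedia.org/wiki/Oni"
   STATUS="??"
   if [[ $URL != *web.archive.org* ]]; then
      for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
         if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]] && [[ $URL != *${INTERWIKI_DOMAINS[$i]}/w/* ]]; then
            STATUS="IW"
         fi
      done
   fi
   echo "$STATUS"   # remains "??": the archived copy is not misread as an interwiki link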

Diff Legend

- Removed lines
+ Added lines
< Changed lines (old)
> Changed lines (new)