ViewVC Help
View File | Revision Log | View Changeset | Root Listing
root/Oni2/Validate External Links/validate_external_links.sh
(Generate patch)

Comparing Validate External Links/validate_external_links.sh (file contents):
Revision 1127 by iritscen, Sat Mar 28 02:08:29 2020 UTC vs.
Revision 1135 by iritscen, Sun Jul 12 23:57:00 2020 UTC

# Line 1 | Line 1
1   #!/bin/bash
2  
3   # Validate External Links by Iritscen
4 < # Provided with a list of external links found in the OniGalore wiki, this script validates them.
5 < # The resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF
6 < # (for reading as a local file with clickable links), and HTML (for uploading as a web page).
7 < # Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
4 > # Provided with a list of external links in an expected CSV format, this script validates them. The
5 > # resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF (for
6 > # reading as a local file with clickable links), and HTML (for uploading as a web page). Call script
7 > # with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
8   # Recommended rule:
9   # |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|
10  
# Line 14 | Line 14 | IFS="
14  
15   ### GLOBALS ###
16   # Settings -- these will be changed from their defaults by the arguments passed in to the script
17 < LINKS_URL=""        # use 'curl' to download file with links from this location (can be file://)
18 < EXCEPT_URL=""       # ditto above for file with exceptions to NG results
19 < OUTPUT_DIR=""       # place reports and all other output in a folder inside this existing folder
20 < RECORD_OK_LINKS=0   # record response code to the log even when it's a value in OK_CODES
21 < SHOW_SLASH=0        # record response code to the log when a slash is added to the end of a URL
22 < SHOW_HTTPS=0        # record response code to the log when "http" is upgraded to "https"
23 < SHOW_YT_RD=0        # record response code to the log when a youtu.be URL is expanded to the full URL
24 < SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
25 < TAKE_PAGE_SHOT=0    # take a screenshot of each OK page
26 < CHROME_PATH=""      # path to a copy of Google Chrome that has the command-line screenshot feature
27 < URL_START=1         # start at this URL in LINKS_FILE (1 by default)
28 < URL_LIMIT=0         # if non-zero, stop at this URL in LINKS_FILE
29 < UPLOAD_INFO=""      # path to a file on your hard drive with the login info needed to upload a report
17 > LINKS_URL=""         # use 'curl' to download file with links from this location (can be file://)
18 > EXCEPT_URL=""        # ditto above for file with exceptions to NG results
19 > OUTPUT_DIR=""        # place reports and all other output in a folder inside this existing folder
20 > RECORD_OK_LINKS=0    # record response code to the log even when it's a value in OK_CODES
21 > SHOW_SLASH=0         # record response code to the log when a slash is added to the end of a URL
22 > SHOW_HTTPS=0         # record response code to the log when "http" is upgraded to "https"
23 > SHOW_YT_RD=0         # record response code to the log when a youtu.be URL is expanded to the full URL
24 > SUGGEST_SNAPSHOTS=0  # query the Internet Archive for a possible snapshot URL for each NG page
25 > SKIP_ARCHIVE_LINKS=0 # don't check URLs under the archive.org domain
26 > TAKE_PAGE_SHOT=0     # take a screenshot of each OK page
27 > CHROME_PATH=""       # path to a copy of Google Chrome that has the command-line screenshot feature
28 > URL_START=1          # start at this URL in LINKS_FILE (1 by default)
29 > URL_LIMIT=0          # if non-zero, stop at this URL in LINKS_FILE
30 > UPLOAD_INFO=""       # path to a file on your hard drive with the login info needed to upload a report
31  
32   # Fixed strings -- see the occurrences of these variables to learn their purpose
33   AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36 OPR/67.0.3575.53"
# Line 89 | Line 90 | SKIP_EXPECT_IW=0
90   SKIP_HTTPS_UP=0
91   SKIP_SLASH_ADD=0
92   SKIP_YOUTU_BE=0
93 + SKIP_ARCHIVE_ORG=0
94   FILE_LINKS=0
95   PAGE_LINKS=0
96   SKIPPED_HEADER_ROW=0
# Line 152 | Line 154 | OPTIONS
154         --show-yt-redirects     Report on redirects that expand a youtu.be URL.
155         --suggest-snapshots     Query the Internet Archive for a possible
156                                 snapshot URL for each "NG" page.
157 +       --skip-archive-links    Don't check links that are already pointing to
158 +                               a page on the Internet Archive.
159         --take-screenshots FILE Call the Google Chrome binary at this path to
160                                 take screenshots of each "OK" page.
161         --start-url NUM         Start at this link in the links CSV file.
# Line 186 | Line 190 | while (( "$#" )); do
190        --show-https-upgrades ) SHOW_HTTPS=1;                       shift;;
191        --show-yt-redirects )   SHOW_YT_RD=1;                       shift;;
192        --suggest-snapshots )   SUGGEST_SNAPSHOTS=1;                shift;;
193 +      --skip-archive-links )  SKIP_ARCHIVE_LINKS=1;               shift;;
194        --take-screenshots )    TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
195        --start-url )           URL_START=$2;                       shift 2;;
196        --end-url )             URL_LIMIT=$2;                       shift 2;;
# Line 464 | Line 469 | function wrapupAndExit()
469     # Print processed link totals
470     if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
471     if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
472 +   if [ $SKIP_ARCHIVE_ORG -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVE_ORG Archive.org $(pluralCheckNoun link $SKIP_ARCHIVE_ORG) were not checked"; fi
473     if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS processed $(pluralCheckNoun link $LINK_PROBLEMS) had $(pluralCheckAn $LINK_PROBLEMS)$(pluralCheckNoun issue $LINK_PROBLEMS)"; fi
474     if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr "  (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; valPrint h "&nbsp;&nbsp;(excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
475     if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi
# Line 557 | Line 563 | else
563   fi
564  
565   # Print settings to console and log
566 < declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not report links that are listed in the exceptions file.")
566 > declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not report links that are listed in the exceptions file." "I will ignore URLs that simply have ending slashes added onto them." "I will ignore URLs that only upgrade from HTTP to HTTPS." "I will ignore youtu.be links that are merely being expanded." "I will not check the validity of Internet Archive snapshot URLs.")
567   if [ $TAKE_PAGE_SHOT -eq 0 ]; then SETTINGS_MSG[10]="but will not"; fi
568   if [ $RECORD_OK_LINKS -eq 0 ]; then SETTINGS_MSG[22]="not"; fi
569   if [ $SUGGEST_SNAPSHOTS -eq 0 ]; then SETTINGS_MSG[26]="will not"; fi
570   if [ -z $EXCEPT_URL ] || [ $RECORD_OK_LINKS -eq 1 ]; then SETTINGS_MSG[40]=""; fi
571 + if [ $SHOW_SLASH -eq 1 ]; then SETTINGS_MSG[41]=""; fi
572 + if [ $SHOW_HTTPS -eq 1 ]; then SETTINGS_MSG[42]=""; fi
573 + if [ $SHOW_YT_RD -eq 1 ]; then SETTINGS_MSG[43]=""; fi
574 + if [ $SKIP_ARCHIVE_LINKS -eq 0 ]; then SETTINGS_MSG[44]=""; fi
575   SETTINGS_STR=${SETTINGS_MSG[@]}
576   valPrint ctrh "$SETTINGS_STR"
577   valPrint tr "A summary of my findings will be found at the bottom of the report."
# Line 573 | Line 583 | valPrint t "Legend:"
583   valPrint r "\b1 Legend \b0"
584   valPrint hn "<h3>Legend</h3>"
585   valPrint trh "OK = URL seems to be working."
586 < valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it, because false negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to Iritscen. An NG link should be followed by a link to the Internet Archive's Wayback Machine which may help you repair the link. If the link cannot be repaired, you can disable it on the wiki (which prevents it from showing up in future ValExtLinks reports) by wrapping it in nowiki tags."
586 > valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it, because false negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to the script's author (see top of report). An NG link should be followed by a link to the Internet Archive's Wayback Machine which may help you repair the link, unless the Archive does not have any snapshots of the site. If the link cannot be repaired, you can delete it from the wiki page, or, if this would disrupt the surrounding material on the page, disable the link by wrapping the URL in nowiki tags."
587   valPrint trh "RD = The server responding to this URL is saying that the page moved and you should instead use the supplied new URL. Some RD links represent minor adjustments in the organization of a web site, and some are soft 404s (the file/page has been removed and you are being redirected to something like the main page of the web site). You will have to look at the new URL yourself to determine if it represents an OK link and the link on the wiki should be updated to this one, or if the desired file/page is actually gone and we need to replace the wiki link with an Internet Archive snapshot link -- or disable the URL if it has not been archived."
588   valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup."
589   valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup."
# Line 657 | Line 667 | for LINE in `cat "$LINKS_FILE"`; do
667     PAGE_NAME=${LINE#$NS_ID,}
668     PAGE_NAME=${PAGE_NAME%%,*}
669  
670 <   # We don't want to consider wiki pages ending in .js, as the parser cannot reliably isolate URLS in
671 <   # JavaScript code, so it will return erroneous links
670 >   # We don't want to consider wiki pages ending in .js, as the MW parser cannot reliably isolate URLS
671 >   # in JavaScript code, so it returns erroneous links
672     PAGE_NAME_SUFFIX=$(echo $PAGE_NAME | sed 's/.*\.//')
673     if [ $PAGE_NAME_SUFFIX == "js" ]; then
674        valPrint trs "Skipping URL '${LINE#$NS_ID,$PAGE_NAME,}' on line $LINK_NUM because it was found on JavaScript page '$PAGE_NAME'."
# Line 687 | Line 697 | for LINE in `cat "$LINKS_FILE"`; do
697        continue
698     fi
699  
700 +   # If we're skipping Archive.org links, check if this is one
701 +   if [ $SKIP_ARCHIVE_LINKS -eq 1 ] && [[ $URL == *web.archive.org* ]]; then
702 +      valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have been asked not to check Wayback Machine links."
703 +      let SKIP_ARCHIVE_ORG+=1
704 +      continue
705 +   fi
706 +
707     # Now we need to know if the URL is for a file or a web page. First step is to determine if the
708     # URL ends in a suffix
709     HAS_SUFFIX=0
# Line 697 | Line 714 | for LINE in `cat "$LINKS_FILE"`; do
714     # If the URL ends in something like "#section_15", strip everything from the '#' onward
715     CLEAN_URL=${CLEAN_URL%%\#*}
716  
717 <   # 'sed' cannot handle Unicode in my Bash shell, so skip this URL and make user check it
717 >   # 'sed' cannot handle Unicode in my Bash shell, so skip non-ASCII URL and make user check it
718     if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
719        valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I cannot handle non-ASCII characters."
720        let SKIP_NON_ASCII+=1

Diff Legend

- Removed lines
+ Added lines
< Changed lines (old)
> Changed lines (new)