  1    #!/bin/bash
  2    
  3    # Validate External Links by Iritscen
  4 <  # Provided with a list of external links found in the OniGalore wiki, this script validates them.
  5 <  # The resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF
  6 <  # (for reading as a local file with clickable links), and HTML (for uploading as a web page).
  7 <  # Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
  4 >  # Provided with a list of external links in an expected CSV format, this script validates them. The
  5 >  # resulting logs are produced in three formats: TXT (for easy diffing with an earlier log), RTF (for
  6 >  # reading as a local file with clickable links), and HTML (for uploading as a web page). Call script
  7 >  # with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
  8    # Recommended rule:
  9    # |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|
 10    
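The rewritten header refers to "an expected CSV format". Judging from the parsing code later in this diff (the ${LINE#$NS_ID,} and ${PAGE_NAME%%,*} expansions around line 667), each data row appears to carry a namespace ID, a page name, and the URL itself; a hypothetical row might look like this:

    # Hypothetical CSV row, inferred from the parsing code near line 667 -- not taken from a real links file
    # namespace ID,page name,URL
    0,Main_Page,http://example.com/some/page.html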
 14    
 15    ### GLOBALS ###
 16    # Settings -- these will be changed from their defaults by the arguments passed in to the script
 17 <  LINKS_URL=""          # use 'curl' to download file with links from this location (can be file://)
 18 <  EXCEPT_URL=""         # ditto above for file with exceptions to NG results
 19 <  OUTPUT_DIR=""         # place reports and all other output in a folder inside this existing folder
 20 <  RECORD_OK_LINKS=0     # record response code to the log even when it's a value in OK_CODES
 21 <  SHOW_SLASH=0          # record response code to the log when a slash is added to the end of a URL
 22 <  SHOW_HTTPS=0          # record response code to the log when "http" is upgraded to "https"
 23 <  SHOW_YT_RD=0          # record response code to the log when a youtu.be URL is expanded to the full URL
 24 <  SUGGEST_SNAPSHOTS=0   # query the Internet Archive for a possible snapshot URL for each NG page
 25 <  TAKE_PAGE_SHOT=0      # take a screenshot of each OK page
 26 <  CHROME_PATH=""        # path to a copy of Google Chrome that has the command-line screenshot feature
 27 <  URL_START=1           # start at this URL in LINKS_FILE (1 by default)
 28 <  URL_LIMIT=0           # if non-zero, stop at this URL in LINKS_FILE
 29 <  UPLOAD_INFO=""        # path to a file on your hard drive with the login info needed to upload a report
 17 >  LINKS_URL=""          # use 'curl' to download file with links from this location (can be file://)
 18 >  EXCEPT_URL=""         # ditto above for file with exceptions to NG results
 19 >  OUTPUT_DIR=""         # place reports and all other output in a folder inside this existing folder
 20 >  RECORD_OK_LINKS=0     # record response code to the log even when it's a value in OK_CODES
 21 >  SHOW_SLASH=0          # record response code to the log when a slash is added to the end of a URL
 22 >  SHOW_HTTPS=0          # record response code to the log when "http" is upgraded to "https"
 23 >  SHOW_YT_RD=0          # record response code to the log when a youtu.be URL is expanded to the full URL
 24 >  SUGGEST_SNAPSHOTS=0   # query the Internet Archive for a possible snapshot URL for each NG page
 25 >  SKIP_ARCHIVE_LINKS=0  # don't check URLs under the archive.org domain
 26 >  TAKE_PAGE_SHOT=0      # take a screenshot of each OK page
 27 >  CHROME_PATH=""        # path to a copy of Google Chrome that has the command-line screenshot feature
 28 >  URL_START=1           # start at this URL in LINKS_FILE (1 by default)
 29 >  URL_LIMIT=0           # if non-zero, stop at this URL in LINKS_FILE
 30 >  UPLOAD_INFO=""        # path to a file on your hard drive with the login info needed to upload a report
 31    
 32    # Fixed strings -- see the occurrences of these variables to learn their purpose
 33    AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36 OPR/67.0.3575.53"
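AGENT is presumably handed to 'curl' whenever a link is checked, so that web servers answer the request as they would for an ordinary browser rather than for a script. The actual invocation is not part of this excerpt, but it likely resembles something along these lines:

    # Sketch only -- the real curl call and its other options live elsewhere in the script
    curl --user-agent "$AGENT" --head --location "$URL"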
 90    SKIP_HTTPS_UP=0
 91    SKIP_SLASH_ADD=0
 92    SKIP_YOUTU_BE=0
 93 +  SKIP_ARCHIVE_ORG=0
 94    FILE_LINKS=0
 95    PAGE_LINKS=0
 96    SKIPPED_HEADER_ROW=0

154    --show-yt-redirects       Report on redirects that expand a youtu.be URL.
155    --suggest-snapshots       Query the Internet Archive for a possible
156                              snapshot URL for each "NG" page.
157 +  --skip-archive-links      Don't check links that are already pointing to
158 +                            a page on the Internet Archive.
159    --take-screenshots FILE   Call the Google Chrome binary at this path to
160                              take screenshots of each "OK" page.
161    --start-url NUM           Start at this link in the links CSV file.

190    --show-https-upgrades )   SHOW_HTTPS=1; shift;;
191    --show-yt-redirects )     SHOW_YT_RD=1; shift;;
192    --suggest-snapshots )     SUGGEST_SNAPSHOTS=1; shift;;
193 +  --skip-archive-links )    SKIP_ARCHIVE_LINKS=1; shift;;
194    --take-screenshots )      TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
195    --start-url )             URL_START=$2; shift 2;;
196    --end-url )               URL_LIMIT=$2; shift 2;;
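These branches presumably sit inside the usual loop-over-"$1" argument parser; the surrounding structure is not shown in this excerpt, but a minimal sketch of it would look roughly like this:

    # Sketch of the loop these case branches presumably live in (the catch-all branch is assumed)
    while [[ "$1" == --* ]]; do
       case "$1" in
          --skip-archive-links )   SKIP_ARCHIVE_LINKS=1; shift;;
          # ...the other branches shown above...
          * )                      echo "Unrecognized argument $1."; exit;;
       esac
    done

A run that exercises the new option might then be started with something like "--suggest-snapshots --skip-archive-links --start-url 100 --end-url 200", plus whatever input and output arguments the full option list requires.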
469    # Print processed link totals
470    if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
471    if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
472 +  if [ $SKIP_ARCHIVE_ORG -gt 0 ]; then valPrint ctrh "- $SKIP_ARCHIVE_ORG Archive.org $(pluralCheckNoun link $SKIP_ARCHIVE_ORG) were not checked"; fi
473    if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS processed $(pluralCheckNoun link $LINK_PROBLEMS) had $(pluralCheckAn $LINK_PROBLEMS)$(pluralCheckNoun issue $LINK_PROBLEMS)"; fi
474    if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctr " (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; valPrint h " (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
475    if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) $(pluralCheckWas $OK_LINKS) OK"; fi
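pluralCheckNoun, pluralCheckAn and pluralCheckWas are helpers defined elsewhere in the script. Judging only from how they are called above, they presumably behave roughly like these sketches (not the actual definitions):

    # Rough sketches inferred from the calls above -- the real helpers likely handle more cases
    pluralCheckNoun() { if [ $2 -eq 1 ]; then echo $1; else echo ${1}s; fi; }      # "link" vs. "links"
    pluralCheckAn()   { if [ $1 -eq 1 ]; then echo "an "; fi; }                    # "an issue" vs. "issues"
    pluralCheckWas()  { if [ $1 -eq 1 ]; then echo "was"; else echo "were"; fi; }  # "was OK" vs. "were OK"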
563    fi
564    
565    # Print settings to console and log
566 <  declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not report links that are listed in the exceptions file.")
566 >  declare -a SETTINGS_MSG=(I will be checking the response code of each link "and will" take a screenshot of each page. Pages that are OK will "also" be logged. I "will" ask the Internet Archive for a suggested snapshot URL for each NG page. "I will not report links that are listed in the exceptions file." "I will ignore URLs that simply have ending slashes added onto them." "I will ignore URLs that only upgrade from HTTP to HTTPS." "I will ignore youtu.be links that are merely being expanded." "I will not check the validity of Internet Archive snapshot URLs.")
567    if [ $TAKE_PAGE_SHOT -eq 0 ]; then SETTINGS_MSG[10]="but will not"; fi
568    if [ $RECORD_OK_LINKS -eq 0 ]; then SETTINGS_MSG[22]="not"; fi
569    if [ $SUGGEST_SNAPSHOTS -eq 0 ]; then SETTINGS_MSG[26]="will not"; fi
570    if [ -z $EXCEPT_URL ] || [ $RECORD_OK_LINKS -eq 1 ]; then SETTINGS_MSG[40]=""; fi
571 +  if [ $SHOW_SLASH -eq 1 ]; then SETTINGS_MSG[41]=""; fi
572 +  if [ $SHOW_HTTPS -eq 1 ]; then SETTINGS_MSG[42]=""; fi
573 +  if [ $SHOW_YT_RD -eq 1 ]; then SETTINGS_MSG[43]=""; fi
574 +  if [ $SKIP_ARCHIVE_LINKS -eq 0 ]; then SETTINGS_MSG[44]=""; fi
575    SETTINGS_STR=${SETTINGS_MSG[@]}
576    valPrint ctrh "$SETTINGS_STR"
577    valPrint tr "A summary of my findings will be found at the bottom of the report."
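The index arithmetic above works because the unquoted part of the declare statement is split into one array element per word, while each quoted sentence stays a single element: overwriting an element swaps out one word, and blanking it drops a whole sentence (SETTINGS_STR is assigned unquoted, so empty elements simply disappear). A self-contained toy version of the same trick:

    # Toy illustration of the word-indexing trick used by SETTINGS_MSG above (flag values are made up)
    TAKE_PAGE_SHOT=0
    SUGGEST_SNAPSHOTS=0
    declare -a MSG=(I "will" take screenshots of each page. "I will ask the Archive for snapshot URLs.")
    # MSG[0]="I"  MSG[1]="will"  MSG[2]="take" ... MSG[7]="I will ask the Archive for snapshot URLs."
    if [ $TAKE_PAGE_SHOT -eq 0 ]; then MSG[1]="will not"; fi    # swap out a single word
    if [ $SUGGEST_SNAPSHOTS -eq 0 ]; then MSG[7]=""; fi         # drop a whole sentence
    MSG_STR=${MSG[@]}
    echo "$MSG_STR"    # -> "I will not take screenshots of each page."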
583    valPrint r "\b1 Legend \b0"
584    valPrint hn "<h3>Legend</h3>"
585    valPrint trh "OK = URL seems to be working."
586 <  valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it, because false negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to Iritscen. An NG link should be followed by a link to the Internet Archive's Wayback Machine which may help you repair the link. If the link cannot be repaired, you can disable it on the wiki (which prevents it from showing up in future ValExtLinks reports) by wrapping it in nowiki tags."
586 >  valPrint trh "NG = URL no longer seems to work. You should click each URL marked as NG before attempting to fix it, because false negatives will occur from time to time due to hiccups in the Internet. Please report any persistent false negatives or other issues to the script's author (see top of report). An NG link should be followed by a link to the Internet Archive's Wayback Machine which may help you repair the link, unless the Archive does not have any snapshots of the site. If the link cannot be repaired, you can delete it from the wiki page, or, if this would disrupt the surrounding material on the page, disable the link by wrapping the URL in nowiki tags."
587    valPrint trh "RD = The server responding to this URL is saying that the page moved and you should instead use the supplied new URL. Some RD links represent minor adjustments in the organization of a web site, and some are soft 404s (the file/page has been removed and you are being redirected to something like the main page of the web site). You will have to look at the new URL yourself to determine if it represents an OK link and the link on the wiki should be updated to this one, or if the desired file/page is actually gone and we need to replace the wiki link with an Internet Archive snapshot link -- or disable the URL if it has not been archived."
588    valPrint trh "EI = URL is an external link to an internal page and should be converted to an intrawiki link using the suggested markup."
589    valPrint trh "IW = URL is an external link to a fellow wiki and should be converted to an interwiki link using the suggested markup."
667    PAGE_NAME=${LINE#$NS_ID,}
668    PAGE_NAME=${PAGE_NAME%%,*}
669    
670 <  # We don't want to consider wiki pages ending in .js, as the parser cannot reliably isolate URLS in
671 <  # JavaScript code, so it will return erroneous links
670 >  # We don't want to consider wiki pages ending in .js, as the MW parser cannot reliably isolate URLs
671 >  # in JavaScript code, so it returns erroneous links
672    PAGE_NAME_SUFFIX=$(echo $PAGE_NAME | sed 's/.*\.//')
673    if [ $PAGE_NAME_SUFFIX == "js" ]; then
674       valPrint trs "Skipping URL '${LINE#$NS_ID,$PAGE_NAME,}' on line $LINK_NUM because it was found on JavaScript page '$PAGE_NAME'."
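For readers less used to Bash parameter expansion, here is a walkthrough of the two expansions above using a made-up CSV line (the column layout is only inferred from this code):

    # Hypothetical input line: namespace ID, page name, URL
    NS_ID="0"
    LINE="0,Main_Page,http://example.com/some/page.html"
    PAGE_NAME=${LINE#$NS_ID,}         # strip the shortest leading match of "0,"  -> "Main_Page,http://example.com/some/page.html"
    PAGE_NAME=${PAGE_NAME%%,*}        # strip the longest trailing match of ",*"  -> "Main_Page"
    URL=${LINE#$NS_ID,$PAGE_NAME,}    # the expansion used in the log message     -> "http://example.com/some/page.html"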
697       continue
698    fi
699    
700 +  # If we're skipping Archive.org links, check if this is one
701 +  if [ $SKIP_ARCHIVE_LINKS -eq 1 ] && [[ $URL == *web.archive.org* ]]; then
702 +     valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I have been asked not to check Wayback Machine links."
703 +     let SKIP_ARCHIVE_ORG+=1
704 +     continue
705 +  fi
706 +  
707    # Now we need to know if the URL is for a file or a web page. First step is to determine if the
708    # URL ends in a suffix
709    HAS_SUFFIX=0
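The suffix test itself falls outside this excerpt; presumably it resembles the PAGE_NAME_SUFFIX extraction shown earlier, along these hypothetical lines (a guess, not the script's actual code):

    # Hypothetical sketch only -- the real test lives in the lines that follow 709 in the full script
    POSSIBLE_SUFFIX=$(echo $CLEAN_URL | sed 's/.*\.//')
    if [[ $POSSIBLE_SUFFIX != */* ]] && [ ${#POSSIBLE_SUFFIX} -le 4 ]; then HAS_SUFFIX=1; fi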
714    # If the URL ends in something like "#section_15", strip everything from the '#' onward
715    CLEAN_URL=${CLEAN_URL%%\#*}
716    
717 <  # 'sed' cannot handle Unicode in my Bash shell, so skip this URL and make user check it
717 >  # 'sed' cannot handle Unicode in my Bash shell, so skip non-ASCII URL and make user check it
718    if [[ $CLEAN_URL == *[![:ascii:]]* ]]; then
719       valPrint trs "Skipping URL '$URL' (found on page '$PAGE_NAME') because I cannot handle non-ASCII characters."
720       let SKIP_NON_ASCII+=1
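Two quick illustrations of the expansion and the glob test above, using made-up URLs:

    # Illustrations only -- example URLs are not from a real links file
    CLEAN_URL="http://example.com/manual.html#section_15"
    CLEAN_URL=${CLEAN_URL%%\#*}      # -> "http://example.com/manual.html"
    if [[ "http://example.com/café" == *[![:ascii:]]* ]]; then
       echo "non-ASCII character found"      # the accented "é" matches the [![:ascii:]] class
    fi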