ViewVC Help
View File | Revision Log | View Changeset | Root Listing
root/Oni2/Validate External Links/validate_external_links.sh
(Generate patch)

Comparing Validate External Links/validate_external_links.sh (file contents):
Revision 1120 by iritscen, Wed Mar 18 17:08:59 2020 UTC vs.
Revision 1122 by iritscen, Fri Mar 20 22:13:48 2020 UTC

# Line 18 | Line 18 | LINKS_URL=""        # use 'curl' to down
18   EXCEPT_URL=""       # ditto above for file with exceptions to NG results
19   OUTPUT_DIR=""       # place reports and all other output in a folder inside this existing folder
20   RECORD_OK_LINKS=0   # record response code to the log even when it's a value in OK_CODES
21 + SHOW_SLASH=0        # record response code to the log when a slash is added to the end of a URL
22 + SHOW_HTTPS=0        # record response code to the log when "http" is upgraded to "https"
23   SUGGEST_SNAPSHOTS=0 # query the Internet Archive for a possible snapshot URL for each NG page
24   TAKE_PAGE_SHOT=0    # take a screenshot of each OK page
25   CHROME_PATH=""      # path to a copy of Google Chrome that has the command-line screenshot feature
# Line 34 | Line 36 | CHROME_SCREENSHOT="screenshot.png"
36   CURL_CODES="http://iritscen.oni2.net/val/curl_codes.txt"
37   EXPECT_SCRIPT_NAME="val_expect_sftp.txt"
38   HTTP_CODES="http://iritscen.oni2.net/val/http_codes.txt"
39 < MY_WIKI_PAGE="http://wiki.oni2.net/User:Iritscen"
39 > MY_WIKI_PAGE="https://wiki.oni2.net/User:Iritscen"
40   THIS_DIR=$(cd $(dirname $0); pwd)
41   WORKING_DIR=$(pwd)
42   WIKI_PATH="wiki.oni2.net"
# Line 83 | Line 85 | SKIP_UNK_CODE=0
85   SKIP_EXPECT_NG=0
86   SKIP_EXPECT_EI=0
87   SKIP_EXPECT_IW=0
88 + SKIP_HTTPS_UP=0
89 + SKIP_SLASH_ADD=0
90   FILE_LINKS=0
91   PAGE_LINKS=0
92   SKIPPED_HEADER_ROW=0
# Line 139 | Line 143 | OPTIONS
143                                 you supply a file:// path.
144         --record-ok-links       Log a link in the report even if its response
145                                 code is "OK".
146 +       --show-added-slashes    Report on redirects that simply add a '/' to the
147 +                               end of the URL.
148 +       --show-https-upgrade    Report on redirects that simply upgrade a
149 +                               "http://" URL to a "https://" URL.
150         --suggest-snapshots     Query the Internet Archive for a possible
151                                 snapshot URL for each "NG" page.
152         --take-screenshots FILE Call the Google Chrome binary at this path to
# Line 167 | Line 175 | fi
175   # Parse arguments as long as there are more arguments to process
176   while (( "$#" )); do
177     case "$1" in
178 <      --links )             LINKS_URL="$2";                     shift 2;;
179 <      --exceptions )        EXCEPT_URL="$2";                    shift 2;;
180 <      --output )            OUTPUT_DIR="$2";                    shift 2;;
181 <      --record-ok-links )   RECORD_OK_LINKS=1;                  shift;;
182 <      --suggest-snapshots ) SUGGEST_SNAPSHOTS=1;                shift;;
183 <      --take-screenshots )  TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
184 <      --start-url )         URL_START=$2;                       shift 2;;
185 <      --end-url )           URL_LIMIT=$2;                       shift 2;;
186 <      --upload )            UPLOAD_INFO=$2;                     shift 2;;
187 <      * )                   echo "Invalid argument $1 detected. Aborting."; exit 1;;
178 >      --links )              LINKS_URL="$2";                     shift 2;;
179 >      --exceptions )         EXCEPT_URL="$2";                    shift 2;;
180 >      --output )             OUTPUT_DIR="$2";                    shift 2;;
181 >      --record-ok-links )    RECORD_OK_LINKS=1;                  shift;;
182 >      --show-added-slashes ) SHOW_SLASH=1;                       shift;;
183 >      --show-https-upgrade ) SHOW_HTTPS=1;                       shift;;
184 >      --suggest-snapshots )  SUGGEST_SNAPSHOTS=1;                shift;;
185 >      --take-screenshots )   TAKE_PAGE_SHOT=1; CHROME_PATH="$2"; shift 2;;
186 >      --start-url )          URL_START=$2;                       shift 2;;
187 >      --end-url )            URL_LIMIT=$2;                       shift 2;;
188 >      --upload )             UPLOAD_INFO=$2;                     shift 2;;
189 >      * )                    echo "Invalid argument $1 detected. Aborting."; exit 1;;
190    esac
191   done
192  
# Line 433 | Line 443 | function wrapupAndExit()
443     END_RUN=$(date +%s)
444     ELAPSED=$(echo $(($END_RUN - $START_RUN)) | awk '{printf "%d min. %d sec. elapsed", int($1/60), int($1%60)}')
445  
446 <   # Output results of session and close the log file's markup
446 >   # Compute the summary totals from this session's counters
447     LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
448 <   LINKS_SKIPPED=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
449 <   LINKS_CHECKED=$((LINKS_PROCESSED-LINKS_SKIPPED))
448 >   LINK_PROBLEMS=$((EI_LINKS+IW_LINKS+RD_LINKS+NG_LINKS))
449 >   LINK_ERRORS=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
450 >   LINKS_EXCEPTED=$((SKIP_EXPECT_NG+SKIP_EXPECT_EI+SKIP_EXPECT_IW))
451 >   TRIVIAL_RDS=$((SKIP_SLASH_ADD+SKIP_HTTPS_UP))
452 >   LINKS_CHECKED=$((LINKS_PROCESSED-LINK_ERRORS))
453 >
454 >   # Print summary header
455     valPrint ct "Summary ($ELAPSED):"
456     valPrint r "\b1 Summary \b0 ($ELAPSED)"
457     valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
458 <   valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT)."
459 <   valPrint ctrh "I skipped $LINKS_SKIPPED $(pluralCheckNoun link $LINKS_SKIPPED), and found $FILE_LINKS $(pluralCheckNoun file $FILE_LINKS) and $PAGE_LINKS $(pluralCheckNoun page $PAGE_LINKS)."
460 <   if [ $LINKS_SKIPPED -gt 0 ]; then valPrint ctrh "Skip breakdown: "; fi
461 <   if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
458 >   valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT) (there were $FILE_LINKS file $(pluralCheckNoun link $FILE_LINKS) and $PAGE_LINKS page $(pluralCheckNoun link $PAGE_LINKS))."
459 >
460 >   # Print processed link totals
461 >   if [ $LINKS_PROCESSED -gt 0 ]; then valPrint ctrh "$LINKS_PROCESSED processed $(pluralCheckNoun link $LINKS_PROCESSED):"; fi
462 >   if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "- $LINK_ERRORS $(pluralCheckNoun link $LINK_ERRORS) could not be processed"; fi
463 >   if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "- $LINK_PROBLEMS processed $(pluralCheckNoun link $LINK_PROBLEMS) had issues"; fi
464 >   if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "  (excepted $LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) from report)"; fi
465 >   if [ $OK_LINKS -gt 0 ]; then valPrint ctrh "- $OK_LINKS processed $(pluralCheckNoun link $OK_LINKS) were OK"; fi
466 >   if [ $TRIVIAL_RDS -gt 0 ]; then valPrint ctrh "  (counted $TRIVIAL_RDS trivial $(pluralCheckNoun redirection $TRIVIAL_RDS) as OK)"; fi
467 >
468 >   # Print excepted link totals
469 >   if [ $LINKS_EXCEPTED -gt 0 ]; then valPrint ctrh "$LINKS_EXCEPTED $(pluralCheckNoun link $LINKS_EXCEPTED) excepted (see RTF or TXT report for specific links):"; fi
470 >   if [ $SKIP_EXPECT_NG -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
471 >   if [ $SKIP_EXPECT_EI -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS)"; fi
472 >   if [ $SKIP_EXPECT_IW -gt 0 ]; then valPrint ctrh "- $SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS)"; fi
473 >
474 >   # Print errored link totals
475 >   if [ $LINK_ERRORS -gt 0 ]; then valPrint ctrh "$LINK_ERRORS link $(pluralCheckNoun error $LINK_ERRORS) (see RTF or TXT report for specific links):"; fi
476 >   if [ $SKIP_UNK_NS -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_NS missing/unknown $(pluralCheckNoun namespace $SKIP_UNK_NS)"; fi
477     if [ $SKIP_JS_PAGE -gt 0 ]; then valPrint ctrh "- $SKIP_JS_PAGE $(pluralCheckNoun link $SKIP_JS_PAGE) on $(pluralCheckA $SKIP_JS_PAGE)JavaScript $(pluralCheckNoun page $SKIP_JS_PAGE)"; fi
478     if [ $SKIP_BAD_URL -gt 0 ]; then valPrint ctrh "- $SKIP_BAD_URL illegal $(pluralCheckNoun URL $SKIP_BAD_URL)"; fi
479     if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
480     if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
481     if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
482 <   valPrint ctrh "Out of the $LINKS_CHECKED links checked, $EI_LINKS could be $(pluralCheckAn $EI_LINKS)intrawiki $(pluralCheckNoun link $EI_LINKS), $IW_LINKS could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS), $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG."
483 <   if [ $SKIP_EXPECT_NG -gt 0 ]; then
484 <      valPrint ctrh "$SKIP_EXPECT_NG/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS) went unlisted due to being found in the exceptions file."
485 <   fi
486 <   if [ $SKIP_EXPECT_EI -gt 0 ]; then
487 <      valPrint ctrh "$SKIP_EXPECT_EI/$EI_LINKS external internal $(pluralCheckNoun link $EI_LINKS) went unlisted due to being found in the exceptions file."
488 <   fi
489 <   if [ $SKIP_EXPECT_IW -gt 0 ]; then
490 <      valPrint ctrh "$SKIP_EXPECT_IW/$IW_LINKS potential intrawiki $(pluralCheckNoun link $IW_LINKS) went unlisted due to being found in the exceptions file."
461 <   fi
482 >
483 >   # Print link issue totals
484 >   if [ $LINK_PROBLEMS -gt 0 ]; then valPrint ctrh "$LINK_PROBLEMS link $(pluralCheckNoun issues $LINKS_CHECKED):"; fi
485 >   if [ $NG_LINKS -gt 0 ]; then valPrint ctrh "- $NG_LINKS NG $(pluralCheckNoun link $NG_LINKS)"; fi
486 >   if [ $RD_LINKS -gt 0 ]; then valPrint ctrh "- $RD_LINKS $(pluralCheckNoun redirection $RD_LINKS)"; fi
487 >   if [ $EI_LINKS -gt 0 ]; then valPrint ctrh "- $EI_LINKS $(pluralCheckNoun link $EI_LINKS) that could be intrawiki"; fi
488 >   if [ $IW_LINKS -gt 0 ]; then valPrint ctrh "- $IW_LINKS $(pluralCheckNoun link $IW_LINKS) that could be interwiki"; fi
489 >
490 >   # Close the log files' markup
491     valPrint trh "ValExtLinks says goodbye."
492     printRTFfooter
493     printHTMfooter
# Line 634 | Line 663 | for LINE in `cat "$LINKS_FILE"`; do
663     fi
664  
665     # Build longer wiki page URLs from namespace and page names
666 <   FULL_PAGE_PATH=http://$WIKI_PATH/$NS_NAME:$PAGE_NAME
666 >   FULL_PAGE_PATH=https://$WIKI_PATH/$NS_NAME:$PAGE_NAME
667     LOCAL_PAGE_PATH=$NS_NAME:$PAGE_NAME
668     # Namespace "Main:" cannot be a part of the path; it's an implicit namespace, and naming it
669     # explicitly breaks the link
670     if [ $NS_ID -eq 0 ]; then
671 <      FULL_PAGE_PATH=http://$WIKI_PATH/$PAGE_NAME
671 >      FULL_PAGE_PATH=https://$WIKI_PATH/$PAGE_NAME
672        LOCAL_PAGE_PATH=$PAGE_NAME
673     fi
674  
# Line 794 | Line 823 | for LINE in `cat "$LINKS_FILE"`; do
823              # Get URL header again in order to retrieve the URL we are being redirected to
824              NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent '"$AGENT"' --max-time 10 --write-out '%{redirect_url}\n' $URL)
825  
826 <            # Filter out cases where the redirect URL is just the original URL with https:// instead of
827 <            # http://, or with an added '/' at the end. These corrections happen a lot and are not
828 <            # important to us.
829 <            URL_NO_PROTOCOL=${URL#http://}
801 <            URL_NO_PROTOCOL=${URL_NO_PROTOCOL%/}
802 <            NEW_URL_NO_PROTOCOL=${NEW_URL#https://}
803 <            NEW_URL_NO_PROTOCOL=${NEW_URL_NO_PROTOCOL%/}
826 >            # Adjust the old and new URLs to both use HTTP for comparison purposes, so we can filter
827 >            # those changes out if the user didn't ask for them
828 >            URL_HTTP=$(echo $URL | sed -E 's/^https:/http:/')
829 >            NEW_URL_HTTP=$(echo $NEW_URL | sed -E 's/^https:/http:/')
830  
831              # Sometimes 'curl' fails to get the redirect_url due to time-out or bad web site config
832 <            NEW_URL_LENGTH=$(echo | awk -v input=$NEW_URL_NO_PROTOCOL '{print length(input)}')
832 >            NEW_URL_LENGTH=$(echo | awk -v input=$NEW_URL_HTTP '{print length(input)}')
833              if [ $NEW_URL_LENGTH -lt $MIN_URL_LENGTH ]; then
834 <               NEW_URL_NO_PROTOCOL="[new URL not retrieved]"
834 >               NEW_URL_HTTP="[new URL not retrieved]"
835              fi
836  
837 <            # If the URLs match after the above filters were applied, then the link is OK
838 <            if [ $URL_NO_PROTOCOL == $NEW_URL_NO_PROTOCOL ]; then
837 >            # Remove slash at end of new URL, if present, so we can filter out the redirects that
838 >            # merely add an ending slash if the user didn't ask for them
839 >            NEW_URL_NO_SLASH=$(echo $NEW_URL_HTTP | sed -E 's:/$::')
840 >
841 >            # If the URLs match besides HTTP being upgraded to HTTPS, then the link is OK (unless user
842 >            # wants those to be reported)
843 >            if [ $SHOW_HTTPS -eq 0 ] && [ $URL_HTTP == $NEW_URL_HTTP ]; then
844 >               valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because we have not been asked to show http->https upgrades, and we were redirected to $NEW_URL."
845 >               STATUS="OK"
846 >               let OK_LINKS+=1
847 >               let SKIP_HTTPS_UP+=1
848 >            # If the URLs match besides an added ending slash, then the link is OK (unless user wants
849 >            # those to be reported)
850 >            elif [ $SHOW_SLASH -eq 0 ] && [ $URL_HTTP == $NEW_URL_NO_SLASH ]; then
851 >               valPrint trs "Skipping URL $URL (found on page $PAGE_NAME) because we have not been asked to show added trailing slashes, and we were redirected to $NEW_URL."
852                 STATUS="OK"
853                 let OK_LINKS+=1
854 +               let SKIP_SLASH_ADD+=1
855              else
856                 STATUS="RD"
857                 let RD_LINKS+=1

Diff Legend

Removed lines
+ Added lines
< Changed lines (old)
> Changed lines (new)