ViewVC Help
View File | Revision Log | View Changeset | Root Listing
root/Oni2/Validate External Links/validate_external_links.sh
(Generate patch)

Comparing Validate External Links/validate_external_links.sh (file contents):
Revision 1075 by iritscen, Fri Oct 6 02:02:16 2017 UTC vs.
Revision 1118 by iritscen, Tue Mar 17 16:07:35 2020 UTC

# Line 6 | Line 6
6   # (for reading as a local file with clickable links), and HTML (for uploading as a web page).
7   # Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes.
8   # Recommended rule:
9 < # ------------------------------------------------------------------------------------------------------
9 > # |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----|
10  
11   # Set separator token to newline
12   IFS="
# Line 26 | Line 26 | URL_LIMIT=0         # if non-zero, stop
26   UPLOAD_INFO=""      # path to a file on your hard drive with the login info needed to upload a report
27  
28   # Fixed strings -- see the occurrences of these variables to learn their purpose
29 < AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0"
29 > AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36 OPR/67.0.3575.53"
30   ARCHIVE_API="http://archive.org/wayback/available"
31   ARCHIVE_GENERIC="https://web.archive.org/web/*"
32   ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206"
# Line 87 | Line 87 | FILE_LINKS=0
87   PAGE_LINKS=0
88   SKIPPED_HEADER_ROW=0
89   FINISHED_LIST="no"
90 + START_RUN=0
91 + END_RUN=0
92  
93  
94   ### HELP ###
# Line 420 | Line 422 | function wrapupAndExit()
422        fi
423     fi
424  
425 +   # Generate string with elapsed time
426 +   END_RUN=$(date +%s)
427 +   ELAPSED=$(echo $(($END_RUN - $START_RUN)) | awk '{printf "%d min. %d sec. elapsed", int($1/60), int($1%60)}')
428 +
429     # Output results of session and close the log file's markup
430     LINKS_PROCESSED=$((LINK_NUM-URL_START+1))
431     LINKS_SKIPPED=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE))
432     LINKS_CHECKED=$((LINKS_PROCESSED-LINKS_SKIPPED))
433 <   valPrint ct "Summary:"
434 <   valPrint r "\b1 Summary \b0"
435 <   valPrint hn "<h3><span id=\"summary\">Summary</span></h3>"
433 >   valPrint ct "Summary ($ELAPSED):"
434 >   valPrint r "\b1 Summary \b0 ($ELAPSED)"
435 >   valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>"
436     valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT)."
437     valPrint ctrh "I skipped $LINKS_SKIPPED $(pluralCheckNoun link $LINKS_SKIPPED), and found $FILE_LINKS $(pluralCheckNoun file $FILE_LINKS) and $PAGE_LINKS $(pluralCheckNoun page $PAGE_LINKS)."
438     if [ $LINKS_SKIPPED -gt 0 ]; then valPrint ctrh "Skip breakdown: "; fi
# Line 540 | Line 546 | valPrint trh ""
546  
547  
548   ### MAIN LOOP ###
549 + START_RUN=$(date +%s)
550   # Process each line of the .csv in LINKS_FILE
551   for LINE in `cat "$LINKS_FILE"`; do
552     let LINK_NUM+=1
# Line 581 | Line 588 | for LINE in `cat "$LINKS_FILE"`; do
588     NS_NAME=""
589     a=0
590     while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
591 <      if [ $NS_ID -eq ${NS_IDS[$a]} ]; then
591 >      if [ $NS_ID == "NULL" ]; then
592 >         break
593 >      elif [ $NS_ID -eq ${NS_IDS[$a]} ]; then
594           NS_NAME="${NS_NAMES[$a]}"
595           break
596        fi
597        let a+=1
598     done
599 <   if [ -z "$NS_NAME" ]; then
600 <      valPrint tr "Skipping URL found on page $PAGE_NAME because I could not find a name for namespace ID $NS_ID."
599 >   if [ "$NS_NAME" == "" ]; then
600 >      if [ $NS_ID == "NULL" ]; then
601 >         valPrint tr "Skipping URL on line $LINK_NUM because the namespace (and probably the page too) is \"NULL\". Probably the link is no longer in existence on the wiki."
602 >      else
603 >         valPrint tr "Skipping URL on line $LINK_NUM found on page $PAGE_NAME because I could not find a name for namespace ID $NS_ID."
604 >      fi
605        let SKIP_UNK_NS+=1
606        continue
607     fi
# Line 602 | Line 615 | for LINE in `cat "$LINKS_FILE"`; do
615     # JavaScript code, so it will return erroneous links
616     PAGE_NAME_SUFFIX=$(echo $PAGE_NAME | sed 's/.*\.//')
617     if [ $PAGE_NAME_SUFFIX == "js" ]; then
618 <      valPrint tr "Skipping URL found on JavaScript page $PAGE_NAME."
618 >      valPrint tr "Skipping URL on line $LINK_NUM because it was found on JavaScript page $PAGE_NAME."
619        let SKIP_JS_PAGE+=1
620        continue
621     fi
# Line 886 | Line 899 | for LINE in `cat "$LINKS_FILE"`; do
899        if [ $STATUS == "NG" ] && [ $SUGGEST_SNAPSHOTS -eq 1 ]; then
900           ARCHIVE_QUERY=$(curl --silent --max-time 10 "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES")
901  
902 <         # Isolate "url" property in response and log it if a "closest" snapshot was received...
902 >         # If a "closest" snapshot was received...
903           if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then
904 <            SNAPSHOT_URL=${ARCHIVE_QUERY##*\"url\": \"}
905 <            SNAPSHOT_URL=${SNAPSHOT_URL%\", \"timestamp*}
904 >            # In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it
905 >            ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/')
906 >
907 >            # ...isolate "url" property in the response that follows the "closest" tag
908 >            SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":'
909 >            SNAPSHOT_URL=${SNAPSHOT_URL##*\"url\": \"} # everything after '"url": "'
910 >            SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"'
911 >
912 >            # Inform the user of the snapshot URL
913              valPrint t "  IA suggests $SNAPSHOT_URL"
914              valPrint r "                IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}"
915              valPrint hn "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>"

Diff Legend

- Removed lines
+ Added lines
< Changed lines (old)
> Changed lines (new)