6 |
|
# (for reading as a local file with clickable links), and HTML (for uploading as a web page). |
7 |
|
# Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes. |
8 |
|
# Recommended rule: |
9 |
< |
# ------------------------------------------------------------------------------------------------------ |
9 |
> |
# |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----| |
10 |
|
|
11 |
|
# Set separator token to newline |
12 |
|
IFS=" |
26 |
|
UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report |
27 |
|
|
28 |
|
# Fixed strings -- see the occurrences of these variables to learn their purpose |
29 |
< |
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0" |
29 |
> |
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36 OPR/67.0.3575.53" |
30 |
|
ARCHIVE_API="http://archive.org/wayback/available" |
31 |
|
ARCHIVE_GENERIC="https://web.archive.org/web/*" |
32 |
|
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206" |
87 |
|
PAGE_LINKS=0 |
88 |
|
SKIPPED_HEADER_ROW=0 |
89 |
|
FINISHED_LIST="no" |
90 |
+ |
START_RUN=0 |
91 |
+ |
END_RUN=0 |
92 |
|
|
93 |
|
|
94 |
|
### HELP ### |
422 |
|
fi |
423 |
|
fi |
424 |
|
|
425 |
+ |
# Generate string with elapsed time |
426 |
+ |
END_RUN=$(date +%s) |
427 |
+ |
ELAPSED=$(echo $(($END_RUN - $START_RUN)) | awk '{printf "%d min. %d sec. elapsed", int($1/60), int($1%60)}') |
428 |
+ |
|
429 |
|
# Output results of session and close the log file's markup |
430 |
|
LINKS_PROCESSED=$((LINK_NUM-URL_START+1)) |
431 |
|
LINKS_SKIPPED=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE)) |
432 |
|
LINKS_CHECKED=$((LINKS_PROCESSED-LINKS_SKIPPED)) |
433 |
< |
valPrint ct "Summary:" |
434 |
< |
valPrint r "\b1 Summary \b0" |
435 |
< |
valPrint hn "<h3><span id=\"summary\">Summary</span></h3>" |
433 |
> |
valPrint ct "Summary ($ELAPSED):" |
434 |
> |
valPrint r "\b1 Summary \b0 ($ELAPSED)" |
435 |
> |
valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>" |
436 |
|
valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT)." |
437 |
|
valPrint ctrh "I skipped $LINKS_SKIPPED $(pluralCheckNoun link $LINKS_SKIPPED), and found $FILE_LINKS $(pluralCheckNoun file $FILE_LINKS) and $PAGE_LINKS $(pluralCheckNoun page $PAGE_LINKS)." |
438 |
|
if [ $LINKS_SKIPPED -gt 0 ]; then valPrint ctrh "Skip breakdown: "; fi |
546 |
|
|
547 |
|
|
548 |
|
### MAIN LOOP ### |
549 |
+ |
START_RUN=$(date +%s) |
550 |
|
# Process each line of the .csv in LINKS_FILE |
551 |
|
for LINE in `cat "$LINKS_FILE"`; do |
552 |
|
let LINK_NUM+=1 |
588 |
|
NS_NAME="" |
589 |
|
a=0 |
590 |
|
while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done |
591 |
< |
if [ $NS_ID -eq ${NS_IDS[$a]} ]; then |
591 |
> |
if [ "$NS_ID" == "NULL" ]; then
592 |
> |
break |
593 |
> |
elif [ $NS_ID -eq ${NS_IDS[$a]} ]; then |
594 |
|
NS_NAME="${NS_NAMES[$a]}" |
595 |
|
break |
596 |
|
fi |
597 |
|
let a+=1 |
598 |
|
done |
599 |
< |
if [ -z "$NS_NAME" ]; then |
600 |
< |
valPrint tr "Skipping URL found on page $PAGE_NAME because I could not find a name for namespace ID $NS_ID." |
599 |
> |
if [ "$NS_NAME" == "" ]; then |
600 |
> |
if [ "$NS_ID" == "NULL" ]; then
601 |
> |
valPrint tr "Skipping URL on line $LINK_NUM because the namespace (and probably the page too) is \"NULL\". Probably the link is no longer in existence on the wiki." |
602 |
> |
else |
603 |
> |
valPrint tr "Skipping URL on line $LINK_NUM found on page $PAGE_NAME because I could not find a name for namespace ID $NS_ID." |
604 |
> |
fi |
605 |
|
let SKIP_UNK_NS+=1 |
606 |
|
continue |
607 |
|
fi |
615 |
|
# JavaScript code, so it will return erroneous links |
616 |
|
PAGE_NAME_SUFFIX=$(echo $PAGE_NAME | sed 's/.*\.//') |
617 |
|
if [ $PAGE_NAME_SUFFIX == "js" ]; then |
618 |
< |
valPrint tr "Skipping URL found on JavaScript page $PAGE_NAME." |
618 |
> |
valPrint tr "Skipping URL on line $LINK_NUM because it was found on JavaScript page $PAGE_NAME." |
619 |
|
let SKIP_JS_PAGE+=1 |
620 |
|
continue |
621 |
|
fi |
899 |
|
if [ $STATUS == "NG" ] && [ $SUGGEST_SNAPSHOTS -eq 1 ]; then |
900 |
|
ARCHIVE_QUERY=$(curl --silent --max-time 10 "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES") |
901 |
|
|
902 |
< |
# Isolate "url" property in response and log it if a "closest" snapshot was received... |
902 |
> |
# If a "closest" snapshot was received... |
903 |
|
if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then |
904 |
< |
SNAPSHOT_URL=${ARCHIVE_QUERY##*\"url\": \"} |
905 |
< |
SNAPSHOT_URL=${SNAPSHOT_URL%\", \"timestamp*} |
904 |
> |
# In case the URL has a shebang in it (like mega.nz links do), escape the '!' to break it |
905 |
> |
ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/') |
906 |
> |
|
907 |
> |
# ...isolate "url" property in the response that follows the "closest" tag |
908 |
> |
SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":' |
909 |
> |
SNAPSHOT_URL=${SNAPSHOT_URL##*\"url\": \"} # everything after '"url": "' |
910 |
> |
SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"' |
911 |
> |
|
912 |
> |
# Inform the user of the snapshot URL |
913 |
|
valPrint t " IA suggests $SNAPSHOT_URL" |
914 |
|
valPrint r " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}" |
915 |
|
valPrint hn "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>" |