| 6 |
|
# (for reading as a local file with clickable links), and HTML (for uploading as a web page). |
| 7 |
|
# Call script with "--help" argument for documentation. Also see Read Me First.rtf for critical notes. |
| 8 |
|
# Recommended rule: |
| 9 |
< |
# ------------------------------------------------------------------------------------------------------ |
| 9 |
> |
# |----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----| |
| 10 |
|
|
| 11 |
|
# Set separator token to newline |
| 12 |
|
IFS=" |
| 26 |
|
UPLOAD_INFO="" # path to a file on your hard drive with the login info needed to upload a report |
| 27 |
|
|
| 28 |
|
# Fixed strings -- see the occurrences of these variables to learn their purpose |
| 29 |
< |
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0" |
| 29 |
> |
AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36 OPR/67.0.3575.53" |
| 30 |
|
ARCHIVE_API="http://archive.org/wayback/available" |
| 31 |
|
ARCHIVE_GENERIC="https://web.archive.org/web/*" |
| 32 |
|
ARCHIVE_OK_CODES="statuscodes=200&statuscodes=203&statuscodes=206" |
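# For reference, a sketch of the availability query these two constants are combined into
# later in the script (the example URL is illustrative):
#   curl --silent --max-time 10 "$ARCHIVE_API?url=http://example.com&$ARCHIVE_OK_CODES"
# The "statuscodes" arguments ask the Wayback Machine to suggest only snapshots that were
# captured with a success code (200, 203 or 206).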
| 87 |
|
PAGE_LINKS=0 |
| 88 |
|
SKIPPED_HEADER_ROW=0 |
| 89 |
|
FINISHED_LIST="no" |
| 90 |
+ |
START_RUN=0 |
| 91 |
+ |
END_RUN=0 |
| 92 |
|
|
| 93 |
|
|
| 94 |
|
### HELP ### |
| 422 |
|
fi |
| 423 |
|
fi |
| 424 |
|
|
| 425 |
+ |
# Generate string with elapsed time |
| 426 |
+ |
END_RUN=$(date +%s) |
| 427 |
+ |
ELAPSED=$(echo $(($END_RUN - $START_RUN)) | awk '{printf "%d min. %d sec. elapsed", int($1/60), int($1%60)}') |
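# (For example, a run of 330 seconds formats as "5 min. 30 sec. elapsed".)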
| 428 |
+ |
|
| 429 |
|
# Output results of session and close the log file's markup |
| 430 |
|
LINKS_PROCESSED=$((LINK_NUM-URL_START+1)) |
| 431 |
|
LINKS_SKIPPED=$((SKIP_UNK_NS+SKIP_JS_PAGE+SKIP_BAD_URL+SKIP_NON_ASCII+SKIP_UNK_SUFFIX+SKIP_UNK_CODE)) |
| 432 |
|
LINKS_CHECKED=$((LINKS_PROCESSED-LINKS_SKIPPED)) |
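# (URL_START and LINK_NUM are inclusive bounds, hence the "+1"; the checked count is
# everything processed minus the links skipped for any of the six reasons summed above.)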
| 433 |
< |
valPrint ct "Summary:" |
| 434 |
< |
valPrint r "\b1 Summary \b0" |
| 435 |
< |
valPrint hn "<h3><span id=\"summary\">Summary</span></h3>" |
| 433 |
> |
valPrint ct "Summary ($ELAPSED):" |
| 434 |
> |
valPrint r "\b1 Summary \b0 ($ELAPSED)" |
| 435 |
> |
valPrint hn "<h3><span id=\"summary\">Summary ($ELAPSED)</span></h3>" |
| 436 |
|
valPrint ctrh "I finished processing $LINKS_PROCESSED of $LINK_COUNT $(pluralCheckNoun link $LINK_COUNT)." |
| 437 |
|
valPrint ctrh "I skipped $LINKS_SKIPPED $(pluralCheckNoun link $LINKS_SKIPPED), and found $FILE_LINKS $(pluralCheckNoun file $FILE_LINKS) and $PAGE_LINKS $(pluralCheckNoun page $PAGE_LINKS)." |
| 438 |
|
if [ $LINKS_SKIPPED -gt 0 ]; then valPrint ctrh "Skip breakdown: "; fi |
| 546 |
|
|
| 547 |
|
|
| 548 |
|
### MAIN LOOP ### |
| 549 |
+ |
START_RUN=$(date +%s) |
| 550 |
|
# Process each line of the .csv in LINKS_FILE |
| 551 |
|
for LINE in `cat "$LINKS_FILE"`; do |
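# (Word-splitting here relies on the IFS assignment at the top of the script: with the
# separator token set to a bare newline, each pass of the loop receives one full .csv row.)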
| 552 |
|
let LINK_NUM+=1 |
| 588 |
|
NS_NAME="" |
| 589 |
|
a=0 |
| 590 |
|
while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done |
| 591 |
< |
if [ $NS_ID -eq ${NS_IDS[$a]} ]; then |
| 591 |
> |
if [ "$NS_ID" == "NULL" ]; then
| 592 |
> |
break |
| 593 |
> |
elif [ $NS_ID -eq ${NS_IDS[$a]} ]; then |
| 594 |
|
NS_NAME="${NS_NAMES[$a]}" |
| 595 |
|
break |
| 596 |
|
fi |
| 597 |
|
let a+=1 |
| 598 |
|
done |
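# (NS_IDS and NS_NAMES are parallel arrays, presumably loaded earlier with the wiki's
# namespace IDs and their matching names; the scan stops at the first unset slot in NS_IDS.)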
| 599 |
< |
if [ -z "$NS_NAME" ]; then |
| 600 |
< |
valPrint tr "Skipping URL found on page $PAGE_NAME because I could not find a name for namespace ID $NS_ID." |
| 599 |
> |
if [ "$NS_NAME" == "" ]; then |
| 600 |
> |
if [ "$NS_ID" == "NULL" ]; then
| 601 |
> |
valPrint tr "Skipping URL on line $LINK_NUM because the namespace (and probably the page too) is \"NULL\". Probably the link is no longer in existence on the wiki." |
| 602 |
> |
else |
| 603 |
> |
valPrint tr "Skipping URL on line $LINK_NUM found on page $PAGE_NAME because I could not find a name for namespace ID $NS_ID." |
| 604 |
> |
fi |
| 605 |
|
let SKIP_UNK_NS+=1 |
| 606 |
|
continue |
| 607 |
|
fi |
| 615 |
|
# JavaScript code, so it will return erroneous links |
| 616 |
|
PAGE_NAME_SUFFIX=$(echo $PAGE_NAME | sed 's/.*\.//') |
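# (The greedy ".*\." deletes everything through the last period, so e.g. "Common.js" yields "js".)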
| 617 |
|
if [ $PAGE_NAME_SUFFIX == "js" ]; then |
| 618 |
< |
valPrint tr "Skipping URL found on JavaScript page $PAGE_NAME." |
| 618 |
> |
valPrint tr "Skipping URL on line $LINK_NUM because it was found on JavaScript page $PAGE_NAME." |
| 619 |
|
let SKIP_JS_PAGE+=1 |
| 620 |
|
continue |
| 621 |
|
fi |
| 899 |
|
if [ $STATUS == "NG" ] && [ $SUGGEST_SNAPSHOTS -eq 1 ]; then |
| 900 |
|
ARCHIVE_QUERY=$(curl --silent --max-time 10 "$ARCHIVE_API?url=$URL&$ARCHIVE_OK_CODES") |
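# (The reply is JSON. An illustrative hit looks like {"archived_snapshots": {"closest":
# {"available": true, "url": "http://web.archive.org/web/20150101000000/http://example.com",
# "timestamp": "20150101000000", "status": "200"}}}; a miss returns an empty
# "archived_snapshots" object, so the "closest" test below fails and no suggestion is logged.)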
| 901 |
|
|
| 902 |
< |
# Isolate "url" property in response and log it if a "closest" snapshot was received... |
| 902 |
> |
# If a "closest" snapshot was received... |
| 903 |
|
if [[ "$ARCHIVE_QUERY" == *\"closest\":* ]]; then |
| 904 |
< |
SNAPSHOT_URL=${ARCHIVE_QUERY##*\"url\": \"} |
| 905 |
< |
SNAPSHOT_URL=${SNAPSHOT_URL%\", \"timestamp*} |
| 904 |
> |
# In case the URL has a shebang in it (like mega.nz links do), escape the '!' so the shell won't interpret it
| 905 |
> |
ARCHIVE_QUERY=$(echo "$ARCHIVE_QUERY" | sed 's/#!/#\\!/') |
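# (e.g. "https://mega.nz/#!abc123" becomes "https://mega.nz/#\!abc123")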
| 906 |
> |
|
| 907 |
> |
# ...isolate "url" property in the response that follows the "closest" tag |
| 908 |
> |
SNAPSHOT_URL=${ARCHIVE_QUERY##*\"closest\":} # everything after '"closest":' |
| 909 |
> |
SNAPSHOT_URL=${SNAPSHOT_URL##*\"url\": \"} # everything after '"url": "' |
| 910 |
> |
SNAPSHOT_URL=${SNAPSHOT_URL%%\"*} # everything before '"' |
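# (Worked example: given the illustrative reply above, these three expansions leave
# SNAPSHOT_URL="http://web.archive.org/web/20150101000000/http://example.com".)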
| 911 |
> |
|
| 912 |
> |
# Inform the user of the snapshot URL |
| 913 |
|
valPrint t " IA suggests $SNAPSHOT_URL" |
| 914 |
|
valPrint r " IA suggests {\field{\*\fldinst{HYPERLINK \"$SNAPSHOT_URL\"}}{\fldrslt $SNAPSHOT_URL}}" |
| 915 |
|
valPrint hn "<tr><td colspan=\"2\" align=\"right\">IA suggests</td><td><a href=\"$SNAPSHOT_URL\" target=\"_blank\">$SNAPSHOT_URL</a></td></tr>" |