| 103 |  | This script parses a list of external links found in the OniGalore wiki | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 104 |  | (which is dumped by the Oni2.net domain periodically in a particular | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 105 |  | format), validates them using the Unix tool 'curl', and produces a report | 
 
 
 
 
 
 
 
 
 
 
 | 106 | < | of which links were OK (responded to an HTTP query) and which were NG (no | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 107 | < | good). This report can then be automatically uploaded to the location of | 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 | 106 | > | of which links were OK (responded positively to an HTTP query), which | 
 
 
 
 
 | 107 | > | were RD (responded with a 3xx redirect code), which could be IW (inter- | 
 
 
 
 
 | 108 | > | wiki) links, and which were NG (no good; a negative response to the | 
 
 
 
 
 | 109 | > | query). This report can then be automatically uploaded to the location of | 
 
 
 
 
 
 
 
 
 
 
 | 110 |  | your choice. The script can also suggest Internet Archive snapshots for | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 111 |  | NG links, and take screenshots of OK links for visual verification by the | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 112 |  | reader that the page in question is the one intended to be displayed. | 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 | 122 |  | file:// protocol) (required) | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 123 |  | --output DIR        Place the folder which will contain the reports and | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 124 |  | optional screenshots at this path (required) | 
 
 
 
 
 
 
 
 
 
 
 | 125 | < | --exceptions DIR    Don't log an NG link if it is listed in the file | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 126 | < | provided at this path as long as the response code is | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 127 | < | the same as the one associated with the link | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 128 | < | --record-ok-links   Log a link in the report whether its response code is | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 129 | < | in the OK_CODES or the NG_CODES array | 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 | 125 | > | --exceptions URL    In order to remove links from the list which show as | 
 
 
 
 
 | 126 | > | NG but which you regard as OK, prepare a plain-text | 
 
 
 
 
 | 127 | > | file where each line contains a response code being | 
 
 
 
 
 | 128 | > | returned and the URL returning it, separated by a | 
 
 
 
 
 | 129 | > | comma, e.g. "403,http://www.example.com" (note that | 
 
 
 
 
 | 130 | > | this can be a local file if you use the | 
 
 
 
 
 | 131 | > | file:// protocol) | 
 
 
 
 
 | 132 | > | --record-ok-links   Log a link in the report even if its response code is | 
 
 
 
 
 | 133 | > | OK | 
 
 
 
 
 
 
 
 
 
 
 | 134 |  | --suggest-snapshots Query the Internet Archive for a possible snapshot | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 135 |  | URL for each NG page | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 136 |  | --take-screenshots  Save screenshots of each OK page (requires Google | 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 | 421 |  | if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 422 |  | if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 423 |  | if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi | 
 
 
 
 
 
 
 
 
 
 
 | 424 | < | valPrint ctrh "Out of the $LINKS_CHECKED links checked, $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG." | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 419 | < | if [ $IW_LINKS -gt 0 ]; then | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 420 | < | valPrint ctrh "$IW_LINKS/$OK_LINKS OK $(pluralCheckNoun link $OK_LINKS) $(pluralCheckIs $IW_LINKS) $(pluralCheckAn $IW_LINKS)external $(pluralCheckNoun link $IW_LINKS) that could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS)." | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 421 | < | fi | 
 
 
 
 
 
 
 
 
 | 424 | > | valPrint ctrh "Out of the $LINKS_CHECKED links checked, $IW_LINKS could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS), $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG." | 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 | 425 |  | if [ $SKIP_EXCEPT -gt 0 ]; then | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 426 |  | valPrint ctrh "$SKIP_EXCEPT/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS) went unlisted due to being found in the exceptions file." | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 427 |  | fi | 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 | 448 |  | printHTMheader | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 449 |  |  | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 450 |  | # Attempt to download file at LINKS_URL, then check that it succeeded | 
 
 
 
 
 
 
 
 
 
 
 | 451 | < | valPrint ctrh "Downloading list of external links from $LINKS_URL." | 
 
 
 
 
 
 
 
 
 | 451 | > | valPrint cwtrh "Downloading list of external links from $LINKS_URL." | 
 
 
 
 
 
 
 
 
 
 
 | 452 |  | LINKS_FILE_NAME=$(echo "$LINKS_URL" | sed 's/.*\///') | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 453 |  | LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME" | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 454 |  | curl --silent -o "$LINKS_FILE" $LINKS_URL | 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 | 459 |  |  | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 460 |  | # Attempt to download file at EXCEPT_URL, then check that it succeeded | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 461 |  | if [ ! -z $EXCEPT_URL ]; then | 
 
 
 
 
 
 
 
 
 
 
 | 462 | < | valPrint ctrh "Downloading list of NG exceptions from $EXCEPT_URL." | 
 
 
 
 
 
 
 
 
 | 462 | > | valPrint cwtrh "Downloading list of NG exceptions from $EXCEPT_URL." | 
 
 
 
 
 
 
 
 
 
 
 | 463 |  | EXCEPT_FILE_NAME=$(echo "$EXCEPT_URL" | sed 's/.*\///') | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 464 |  | EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME" | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 465 |  | curl --silent -o "$EXCEPT_FILE" $EXCEPT_URL | 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 | 557 |  | # Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 558 |  | NS_NAME="" | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 559 |  | a=0 | 
 
 
 
 
 
 
 
 
 
 
 | 560 | < | while [ "x${NS_IDS[$a]}" != "x" ] # once this evaluates to "x", the array is done | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 558 | < | do | 
 
 
 
 
 
 
 
 
 | 560 | > | while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 561 |  | if [ $NS_ID -eq ${NS_IDS[$a]} ]; then | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 562 |  | NS_NAME="${NS_NAMES[$a]}" | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 563 |  | break | 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 | 692 |  | CURL_RESULT="$CURL_RESULT-$CURL_ERR" | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 693 |  | fi | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 694 |  |  | 
 
 
 
 
 
 
 
 
 
 
 | 695 | < | # Determine if this code is in our "OK" list | 
 
 
 
 
 
 
 
 
 | 695 | > | # Determine our status code for this URL (IW, OK, RD, or NG) | 
 
 
 
 
 
 
 
 
 
 
 | 696 |  | STATUS="??" | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 697 |  | NEW_URL="" | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 698 |  | INTERWIKI_INDEX=-1 | 
 
 
 
 
 
 
 
 
 
 
 | 699 | < | for CODE in "${OK_CODES[@]}"; do | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 700 | < | if [[ $CODE == $CURL_CODE ]]; then | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 701 | < | let OK_LINKS+=1 | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 702 | < |  | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 703 | < | # Determine if this is a link to a domain that we have an interwiki prefix for | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 704 | < | for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 703 | < | if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]]; then | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 704 | < | STATUS="IW" | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 705 | < | let IW_LINKS+=1 | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 706 | < | INTERWIKI_INDEX=$i | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 707 | < | break | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 708 | < | fi | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 709 | < | done | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 710 | < |  | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 711 | < | # If this link is OK and no interwiki advisory is needed, just mark as "OK" | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 712 | < | if [ $INTERWIKI_INDEX == -1 ]; then | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 713 | < | STATUS="OK" | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 714 | < | fi | 
 
 
 
 
 
 
 
 
 | 699 | > | # First check if this is a link to a domain that we have an interwiki prefix for | 
 
 
 
 
 | 700 | > | for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do | 
 
 
 
 
 | 701 | > | if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]]; then | 
 
 
 
 
 | 702 | > | STATUS="IW" | 
 
 
 
 
 | 703 | > | let IW_LINKS+=1 | 
 
 
 
 
 | 704 | > | INTERWIKI_INDEX=$i | 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 | 705 |  | break | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 706 |  | fi | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 707 |  | done | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 708 |  |  | 
 
 
 
 
 
 
 
 | 709 | + | # If we didn't match an interwiki domain, see if the status code is in our "OK" codes list | 
 
 
 
 
 
 
 
 | 710 | + | if [ $STATUS == "??" ]; then | 
 
 
 
 
 
 
 
 | 711 | + | for CODE in "${OK_CODES[@]}"; do | 
 
 
 
 
 
 
 
 | 712 | + | if [[ $CODE == $CURL_CODE ]]; then | 
 
 
 
 
 
 
 
 | 713 | + | STATUS="OK" | 
 
 
 
 
 
 
 
 | 714 | + | let OK_LINKS+=1 | 
 
 
 
 
 
 
 
 | 715 | + | break | 
 
 
 
 
 
 
 
 | 716 | + | fi | 
 
 
 
 
 
 
 
 | 717 | + | done | 
 
 
 
 
 
 
 
 | 718 | + | fi | 
 
 
 
 
 
 
 
 | 719 | + |  | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 720 |  | # If we didn't get a match with the "OK" codes, check it against the "RD" codes | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 721 |  | if [ $STATUS == "??" ]; then | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 722 |  | for CODE in "${RD_CODES[@]}"; do | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 723 |  | if [[ $CODE == $CURL_CODE ]]; then | 
 
 
 
 
 
 
 
 
 | 723 | – | STATUS="RD" | 
 
 
 
 
 
 
 
 
 | 724 | – | let RD_LINKS+=1 | 
 
 
 
 
 
 
 
 
 | 725 | – |  | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 724 |  | # Get URL header again in order to retrieve the URL we are being redirected to ($AGENT must be double-quoted so that it is expanded) | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 725 |  | NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{redirect_url}\n' "$URL") | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 726 |  |  | 
 
 
 
 
 
 
 
 | 727 | + | # Check if the redirect URL is just the original URL with https:// instead of http:// | 
 
 
 
 
 
 
 
 | 728 | + | # (this happens a lot and is not an important correction to us); if so, just make it "OK" | 
 
 
 
 
 
 
 
 | 729 | + | URL_NO_PROTOCOL=${URL#*://} | 
 
 
 
 
 
 
 
 | 730 | + | NEW_URL_NO_PROTOCOL=${NEW_URL#*://} | 
 
 
 
 
 
 
 
 | 731 | + | if [ "$URL_NO_PROTOCOL" == "$NEW_URL_NO_PROTOCOL" ]; then # quoted: NEW_URL may be empty if curl returned no redirect URL | 
 
 
 
 
 
 
 
 | 732 | + | STATUS="OK" | 
 
 
 
 
 
 
 
 | 733 | + | let OK_LINKS+=1 | 
 
 
 
 
 
 
 
 | 734 | + | else | 
 
 
 
 
 
 
 
 | 735 | + | STATUS="RD" | 
 
 
 
 
 
 
 
 | 736 | + | let RD_LINKS+=1 | 
 
 
 
 
 
 
 
 | 737 | + | fi | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 738 |  | break | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 739 |  | fi | 
 
 
 
 
 
 
 
 
 
 
 
 
 | 740 |  | done |