--- Validate External Links/validate_external_links.sh	2017/08/01 17:09:42	1067
+++ Validate External Links/validate_external_links.sh	2017/08/02 04:26:48	1069
@@ -103,8 +103,10 @@ DESCRIPTION
    This script parses a list of external links found in the OniGalore wiki
    (which is dumped by the Oni2.net domain periodically in a particular format),
    validates them using the Unix tool 'curl', and produces a report
-   of which links were OK (responded to an HTTP query) and which were NG (no
-   good). This report can then be automatically uploaded to the location of
+   of which links were OK (responded positively to an HTTP query), which
+   were RD (responded with a 3xx redirect code), which could be IW (inter-
+   wiki) links, and which were NG (no good; a negative response to the
+   query). This report can then be automatically uploaded to the location of
    your choice. The script can also suggest Internet Archive snapshots for
    NG links, and take screenshots of OK links for visual verification by the
    reader that the page in question is the one intended to be displayed.
@@ -120,11 +122,15 @@ OPTIONS
                          file:// protocol) (required)
    --output DIR          Place the folder which will contain the reports and
                          optional screenshots at this path (required)
-   --exceptions DIR      Don't log an NG link if it is listed in the file
-                         provided at this path as long as the response code is
-                         the same as the one associated with the link
-   --record-ok-links     Log a link in the report whether its response code is
-                         in the OK_CODES or the NG_CODES array
+   --exceptions URL      In order to remove links from the list which show as
+                         NG but which you regard as OK, prepare a plain-text
+                         file where each line contains a response code being
+                         returned and the URL returning it, separated by a
+                         comma, e.g. "403,http://www.example.com" (note that
+                         this can be a local file if you use the
+                         file:// protocol)
+   --record-ok-links     Log a link in the report even if its response code is
+                         OK
    --suggest-snapshots   Query the Internet Archive for a possible snapshot
                          URL for each NG page
    --take-screenshots    Save screenshots of each OK page (requires Google
@@ -415,10 +421,7 @@ function wrapupAndExit()
 	if [ $SKIP_NON_ASCII -gt 0 ]; then valPrint ctrh "- $SKIP_NON_ASCII non-ASCII $(pluralCheckNoun URL $SKIP_NON_ASCII)"; fi
 	if [ $SKIP_UNK_SUFFIX -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_SUFFIX unknown URL $(pluralCheckNoun suffix $SKIP_UNK_SUFFIX)"; fi
 	if [ $SKIP_UNK_CODE -gt 0 ]; then valPrint ctrh "- $SKIP_UNK_CODE unknown response $(pluralCheckNoun code $SKIP_UNK_CODE)"; fi
-	valPrint ctrh "Out of the $LINKS_CHECKED links checked, $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG."
-	if [ $IW_LINKS -gt 0 ]; then
-		valPrint ctrh "$IW_LINKS/$OK_LINKS OK $(pluralCheckNoun link $OK_LINKS) $(pluralCheckIs $IW_LINKS) $(pluralCheckAn $IW_LINKS)external $(pluralCheckNoun link $IW_LINKS) that could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS)."
-	fi
+	valPrint ctrh "Out of the $LINKS_CHECKED links checked, $IW_LINKS could be $(pluralCheckAn $IW_LINKS)interwiki $(pluralCheckNoun link $IW_LINKS), $OK_LINKS $(pluralCheckWas $OK_LINKS) OK, $RD_LINKS $(pluralCheckWas $RD_LINKS) $(pluralCheckA $RD_LINKS)redirection $(pluralCheckNoun notice $RD_LINKS), and $NG_LINKS $(pluralCheckWas $NG_LINKS) NG."
 	if [ $SKIP_EXCEPT -gt 0 ]; then
 		valPrint ctrh "$SKIP_EXCEPT/$NG_LINKS NG $(pluralCheckNoun link $NG_LINKS) went unlisted due to being found in the exceptions file."
 	fi
@@ -445,7 +448,7 @@ printRTFheader
 printHTMheader

 # Attempt to download file at LINKS_URL, then check that it succeeded
-valPrint ctrh "Downloading list of external links from $LINKS_URL."
+valPrint cwtrh "Downloading list of external links from $LINKS_URL."
 LINKS_FILE_NAME=$(echo "$LINKS_URL" | sed 's/.*\///')
 LINKS_FILE="$OUTPUT_PATH/$LINKS_FILE_NAME"
 curl --silent -o "$LINKS_FILE" $LINKS_URL
@@ -456,7 +459,7 @@ fi

 # Attempt to download file at EXCEPT_URL, then check that it succeeded
 if [ ! -z $EXCEPT_URL ]; then
-	valPrint ctrh "Downloading list of NG exceptions from $EXCEPT_URL."
+	valPrint cwtrh "Downloading list of NG exceptions from $EXCEPT_URL."
 	EXCEPT_FILE_NAME=$(echo "$EXCEPT_URL" | sed 's/.*\///')
 	EXCEPT_FILE="$OUTPUT_PATH/$EXCEPT_FILE_NAME"
 	curl --silent -o "$EXCEPT_FILE" $EXCEPT_URL
@@ -554,8 +557,7 @@ for LINE in `cat "$LINKS_FILE"`; do
 	# Find namespace number in NS_IDS and use it to look up namespace's name in NS_NAMES
 	NS_NAME=""
 	a=0
-	while [ "x${NS_IDS[$a]}" != "x" ] # once this evaluates to "x", the array is done
-	do
+	while [ "x${NS_IDS[$a]}" != "x" ]; do # once this evaluates to "x", the array is done
 		if [ $NS_ID -eq ${NS_IDS[$a]} ]; then
 			NS_NAME="${NS_NAMES[$a]}"
 			break
@@ -690,42 +692,49 @@ for LINE in `cat "$LINKS_FILE"`; do
 		CURL_RESULT="$CURL_RESULT-$CURL_ERR"
 	fi

-	# Determine if this code is in our "OK" list
+	# Determine our status code for this URL (IW, OK, RD, or NG)
 	STATUS="??"
 	NEW_URL=""
 	INTERWIKI_INDEX=-1
-	for CODE in "${OK_CODES[@]}"; do
-		if [[ $CODE == $CURL_CODE ]]; then
-			let OK_LINKS+=1
-
-			# Determine if this is a link to a domain that we have an interwiki prefix for
-			for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
-				if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]]; then
-					STATUS="IW"
-					let IW_LINKS+=1
-					INTERWIKI_INDEX=$i
-					break
-				fi
-			done
-
-			# If this link is OK and no interwiki advisory is needed, just mark as "OK"
-			if [ $INTERWIKI_INDEX == -1 ]; then
-				STATUS="OK"
-			fi
+	# First check if this is a link to a domain that we have an interwiki prefix for
+	for ((i = 0; i < ${#INTERWIKI_DOMAINS[@]}; ++i)); do
+		if [[ $URL == *${INTERWIKI_DOMAINS[$i]}* ]]; then
+			STATUS="IW"
+			let IW_LINKS+=1
+			INTERWIKI_INDEX=$i
+			break
 		fi
 	done

+	# If we didn't match an interwiki domain, see if the status code is in our "OK" codes list
+	if [ $STATUS == "??" ]; then
+		for CODE in "${OK_CODES[@]}"; do
+			if [[ $CODE == $CURL_CODE ]]; then
+				STATUS="OK"
+				let OK_LINKS+=1
+				break
+			fi
+		done
+	fi
+
 	# If we didn't get a match with the "OK" codes, check it against the "RD" codes
 	if [ $STATUS == "??" ]; then
 		for CODE in "${RD_CODES[@]}"; do
 			if [[ $CODE == $CURL_CODE ]]; then
-				STATUS="RD"
-				let RD_LINKS+=1
-
 				# Get URL header again in order to retrieve the URL we are being redirected to
 				NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent '"$AGENT"' --max-time 10 --write-out '%{redirect_url}\n' $URL)
+				# Check if the redirect URL is just the original URL with https:// instead of http://
+				# (this happens a lot and is not an important correction to us); if so, just make it "OK"
+				URL_NO_PROTOCOL=${URL#*://}
+				NEW_URL_NO_PROTOCOL=${NEW_URL#*://}
+				if [ $URL_NO_PROTOCOL == $NEW_URL_NO_PROTOCOL ]; then
+					STATUS="OK"
+					let OK_LINKS+=1
+				else
+					STATUS="RD"
+					let RD_LINKS+=1
+				fi
 				break
 			fi
 		done
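
Note on the redirect handling introduced in the last hunk: the revised code asks curl for the redirect target via the '%{redirect_url}' --write-out variable and then treats a redirect that merely swaps http:// for https:// as OK rather than RD. The lines below are a minimal standalone sketch of that protocol-stripping comparison, not part of the diff; the example URL and user-agent string are placeholders rather than values taken from validate_external_links.sh.

#!/bin/bash
# Sketch of the protocol-stripping redirect check; the URL and AGENT values
# here are placeholders, not values from the real script.
URL="http://www.example.com/"
AGENT="Mozilla/5.0"

# Ask curl only for the redirect target; an empty result means no 3xx redirect was returned
NEW_URL=$(curl -o /dev/null --silent --insecure --head --user-agent "$AGENT" --max-time 10 --write-out '%{redirect_url}' "$URL")

# ${VAR#*://} strips the shortest prefix ending in "://", so the two URLs compare
# equal when the only difference between them is the protocol
URL_NO_PROTOCOL=${URL#*://}
NEW_URL_NO_PROTOCOL=${NEW_URL#*://}

if [ -z "$NEW_URL" ]; then
	echo "No redirect returned for $URL"
elif [ "$URL_NO_PROTOCOL" == "$NEW_URL_NO_PROTOCOL" ]; then
	echo "OK: $URL only redirects to its https:// form"
else
	echo "RD: $URL redirects to $NEW_URL"
fi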