--- Validate External Links/validate_external_links.sh 2023/05/16 01:10:09 1183
+++ Validate External Links/validate_external_links.sh 2023/05/21 22:22:55 1184
@@ -33,6 +33,7 @@ LINKS_URL="" # download extern
 EXCEPT_URL="" # location of wiki page with a list of exceptions for NG results
 OUTPUT_DIR="" # place reports and all other output in a folder inside this existing folder
 RECORD_OK_LINKS=0 # record response code to the log even when it's a value in OK_CODES
+ONLY_200_OK=0 # only treat code 200 as "OK" and not any other code in OK_CODES
 SHOW_SLASH=0 # record issue when a slash is added to the end of a URL
 SHOW_HTTPS=0 # record issue when "http" is upgraded to "https"
 SHOW_YT_RD=0 # record redirection for a youtu.be URL expanding to the full URL
@@ -67,7 +68,8 @@ declare -a NS_IDS=(-2 -1 0 1 2 3 4 5 6 7
 declare -a NS_NAMES=("Media" "Special" "Main" "Talk" "User" "User_talk" "OniGalore" "OniGalore_talk" "File" "File_talk" "MediaWiki" "MediaWiki_talk" "Template" "Template_talk" "Help" "Help_talk" "Category" "Category_talk" "BSL" "BSL_talk" "OBD" "OBD_talk" "AE" "AE_talk" "Oni2" "Oni2_talk" "XML" "XML_talk")
 
 # These arrays tell the script which suffixes at the ends of URLs represent files and which are pages.
-# This determines whether the script tries to take a screenshot of the URL or just gets its HTTP code.
+# This determines whether the script tries to take a screenshot of the URL (when screenshots are
+# requested).
 declare -a HTTP_FILES=(3ds 7z avi BINA blend bsl cpp csv dae dll dmg doc east exe fbx first63 flv gamedata gif jpg last32 log m4a mhm mov mp3 mp4 oni ONWC pdf png psd py rar tga TRMA txt vbs wav wmv xaf xcf xlsx xml zip)
 declare -a HTTP_TLDS_AND_PAGES=(abstract action ars asp aspx cfm cgi com css de do full htm html it js jsp net org pgi php php3 phtml pl ru shtml stm uk x)
 
@@ -134,10 +136,10 @@ NAME
 SYNOPSIS
    validate_external_links.sh --help
    validate_external_links.sh --links URL --output DIR [--exceptions URL]
-      [--record-ok-links] [--show-added-slashes] [--show-https-upgrades]
-      [--show-yt-redirects] [--suggest-snapshots] [--check-archive-links]
-      [--take-screenshots FILE] [--timeout NUM] [--start-url NUM]
-      [--end-url NUM] [--upload FILE]
+      [--record-ok-links] [--only-200-ok] [--show-added-slashes]
+      [--show-https-upgrades] [--show-yt-redirects] [--suggest-snapshots]
+      [--check-archive-links] [--take-screenshots FILE] [--timeout NUM]
+      [--start-url NUM] [--end-url NUM] [--upload FILE]
 
 DESCRIPTION
    This script parses a list of external links found in the OniGalore wiki
@@ -172,6 +174,10 @@ OPTIONS
                             beginning with "file://".
   --record-ok-links         Log a link in the report even if its response
                             code is "OK".
+  --only-200-ok             Only treat response code 200 as "OK". Normally
+                            several additional codes are treated as "OK" (see
+                            the array OK_CODES in script) because they are
+                            typically not an indicator of a bad link.
   --show-added-slashes      Report on redirects that simply add a '/' to the
                             end of the URL.
   --show-https-upgrades     Report on redirects that simply upgrade a
@@ -225,6 +231,7 @@ while (( "$#" )); do
       --exceptions ) EXCEPT_URL="$2"; shift 2;;
       --output ) OUTPUT_DIR="$2"; shift 2;;
       --record-ok-links ) RECORD_OK_LINKS=1; shift;;
+      --only-200-ok ) ONLY_200_OK=1; shift;;
       --show-added-slashes ) SHOW_SLASH=1; shift;;
       --show-https-upgrades ) SHOW_HTTPS=1; shift;;
       --show-yt-redirects ) SHOW_YT_RD=1; shift;;
@@ -704,6 +711,9 @@ valPrint ctrh "Site query timeout: $TIME
 valPrint ctrhn "Show OK links: "
 if [ $RECORD_OK_LINKS -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
 
+valPrint ctrhn "Treat these response codes as OK: "
+if [ $ONLY_200_OK -eq 1 ]; then valPrint ctrh "200"; else valPrint ctrh "${OK_CODES[*]}"; fi
+
 valPrint ctrhn "Take screenshots: "
 if [ $TAKE_PAGE_SHOT -eq 1 ]; then valPrint ctrh "Yes"; else valPrint ctrh "No"; fi
 
@@ -1006,6 +1016,10 @@ for LINE in `cat "$LINKS_FILE"`; do
       # If we didn't match an interwiki domain, see if the status code is in our "OK" codes list
      if [ $STATUS == "??" ]; then
         for CODE in "${OK_CODES[@]}"; do
+           if [ $ONLY_200_OK -eq 1 ] && [ $CODE -ne 200 ]; then
+              continue
+           fi
+
            if [[ $CODE == $CURL_CODE ]]; then
               STATUS="OK"
               let OK_LINKS+=1
@@ -1142,6 +1156,19 @@ for LINE in `cat "$LINKS_FILE"`; do
               break
            fi
         done
+        # Also check it against the "OK" codes besides 200 if the --only-200-ok argument was received
+        if [ $ONLY_200_OK -eq 1 ]; then
+           for CODE in "${OK_CODES[@]}"; do
+              if [ $CODE -eq 200 ]; then
+                 continue
+              fi
+              if [[ $CODE == $CURL_CODE ]]; then
+                 STATUS="NG"
+                 let NG_LINKS+=1
+                 break
+              fi
+           done
+        fi
      fi
 
      # If we didn't match a known status code, advise the reader
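
For reference, a minimal sketch of running the script with the new flag is shown below. The links URL and output directory are placeholder values for illustration only; the flags themselves are the ones documented in the SYNOPSIS and OPTIONS hunks above.

   # Placeholder URL and output path; only the flags come from the patch above.
   ./validate_external_links.sh --links "http://wiki.example.com/external_links.txt" \
      --output "/tmp/link_reports" --record-ok-links --only-200-ok

With --only-200-ok set, the status-checking loop skips every entry in OK_CODES other than 200 when deciding whether a link is "OK", and the later pass re-checks those remaining codes so such links are counted as "NG" instead.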