🏥 Healthchecks and Probes Recipes

Reliable health checking helps maintain system availability by detecting failures early and enabling proactive monitoring. This recipe provides patterns for implementing comprehensive health checks, including readiness and liveness probes, for various system components.

🎯 Core Principles

Multi-Layer Health Checking

Implement health checks at different levels: process, functional, resource, and dependency.

#!/bin/bash
# healthcheck-framework.sh - Comprehensive health checking system

# Health check result codes.
# Every check function returns one of these values as its exit status.
HEALTH_OK=0
HEALTH_WARNING=1
HEALTH_CRITICAL=2
HEALTH_UNKNOWN=3

# Global configuration.
# Both settings keep a value already present in the environment, so the
# timeout and log location can be changed without editing this file
# (e.g. HEALTHCHECK_LOG=/tmp/hc.log when not running as root).
HEALTHCHECK_TIMEOUT="${HEALTHCHECK_TIMEOUT:-30}"                  # seconds
HEALTHCHECK_LOG="${HEALTHCHECK_LOG:-/var/log/healthcheck.log}"   # append-only log file

# Record one health check result.
# Arguments: $1 - component name (e.g. "disk:/")
#            $2 - status label (e.g. OK, WARNING, CRITICAL)
#            $3 - human-readable message
# Globals:   HEALTHCHECK_LOG (read) - file the timestamped line is appended to
# Outputs:   "component|status|message" on stdout for monitoring systems;
#            a one-line warning on stderr if the log file cannot be written
# Returns:   0 (logging is best effort and never fails the calling check)
log_health_result() {
    local component="$1"
    local status="$2"
    local message="$3"
    local timestamp
    timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

    # Best effort: an unwritable log (e.g. running unprivileged) must not
    # abort the health check itself, which matters under `set -e`.
    if ! { printf '[%s] %s: %s - %s\n' "$timestamp" "$component" "$status" "$message" \
            >> "$HEALTHCHECK_LOG"; } 2>/dev/null; then
        printf 'healthcheck: cannot write to log file %s\n' "$HEALTHCHECK_LOG" >&2
    fi

    # Also output to stdout for monitoring systems.
    # printf (not echo) so messages starting with -n/-e are printed verbatim.
    printf '%s|%s|%s\n' "$component" "$status" "$message"
}

# Health check wrapper with timeout.
# Arguments: $1 - check name used in log output
#            $2 - check command: a function/command name, optionally followed
#                 by whitespace-separated arguments ("check_disk_space / 80 90")
#            $3 - timeout in whole seconds (default: HEALTHCHECK_TIMEOUT, else 30)
# Returns:   the check's own exit status, or HEALTH_UNKNOWN when the check
#            timed out or no command was given
run_healthcheck_with_timeout() {
    local check_name="$1"
    local check_function="$2"
    local timeout="${3:-${HEALTHCHECK_TIMEOUT:-30}}"

    # Split the command string into words so checks carrying arguments can be
    # executed; quoting the whole string would look for a command whose name
    # contains spaces.
    local -a check_cmd=()
    IFS=$' \t\n' read -r -a check_cmd <<< "$check_function"
    if [ "${#check_cmd[@]}" -eq 0 ]; then
        log_health_result "$check_name" "UNKNOWN" "No check command given"
        return "$HEALTH_UNKNOWN"
    fi

    # Run check in background
    "${check_cmd[@]}" &
    local check_pid=$!

    # Wait with timeout, polling every 0.1s so a fast check does not cost a
    # full second of latency.
    local ticks_left=$(( timeout * 10 ))
    while [ "$ticks_left" -gt 0 ] && kill -0 "$check_pid" 2>/dev/null; do
        sleep 0.1
        ticks_left=$(( ticks_left - 1 ))
    done

    # Check if still running
    if kill -0 "$check_pid" 2>/dev/null; then
        # Timeout occurred: terminate the check and reap it
        kill "$check_pid" 2>/dev/null
        wait "$check_pid" 2>/dev/null
        log_health_result "$check_name" "TIMEOUT" "Check timed out after ${timeout}s"
        return "$HEALTH_UNKNOWN"
    fi

    # Check completed: propagate its exit status
    wait "$check_pid"
    return $?
}

🔧 Process Health Checks

Basic Process Monitoring

# process-healthchecks.sh - Process-level health checks

# Check if process is running.
# Arguments: $1 - pattern matched against full command lines (pgrep -f)
#            $2 - expected number of matching processes (default: 1)
# Returns:   HEALTH_CRITICAL when nothing matches, HEALTH_WARNING when fewer
#            than expected match, HEALTH_OK otherwise
check_process_running() {
    local process_name="$1"
    local expected_count="${2:-1}"

    # pgrep exits 1 when nothing matches; absorb that so the "not running"
    # case is reported as CRITICAL instead of aborting a `set -e` script.
    local pids
    pids=$(pgrep -f -- "$process_name") || pids=""

    # Count PIDs in the shell: one whitespace-separated word per process.
    local actual_count=0
    local pid
    for pid in $pids; do
        actual_count=$(( actual_count + 1 ))
    done

    if [ "$actual_count" -eq 0 ]; then
        log_health_result "process:$process_name" "CRITICAL" "Process not running"
        return "$HEALTH_CRITICAL"
    elif [ "$actual_count" -lt "$expected_count" ]; then
        log_health_result "process:$process_name" "WARNING" "Only $actual_count of $expected_count processes running"
        return "$HEALTH_WARNING"
    else
        log_health_result "process:$process_name" "OK" "$actual_count processes running"
        return "$HEALTH_OK"
    fi
}

# Helper: succeed when decimal value $1 is strictly greater than decimal $2.
# awk is used so fractional percentages compare correctly without bc.
_process_usage_exceeds() {
    awk -v usage="$1" -v limit="$2" 'BEGIN { exit !(usage + 0 > limit + 0) }'
}

# Check process resource usage.
# Arguments: $1 - pattern matched against full command lines (pgrep -f)
#            $2 - per-process CPU threshold in percent (default: 80)
#            $3 - per-process memory threshold in percent (default: 80)
# Returns:   HEALTH_CRITICAL when no process matches, HEALTH_WARNING when any
#            matching PID exceeds a threshold, HEALTH_OK otherwise
check_process_resources() {
    local process_name="$1"
    local cpu_threshold="${2:-80}"
    local memory_threshold="${3:-80}"

    # pgrep exits 1 on "no match"; treat that as an empty PID list.
    local pids
    pids=$(pgrep -f -- "$process_name") || pids=""

    if [ -z "$pids" ]; then
        log_health_result "resources:$process_name" "CRITICAL" "Process not running"
        return "$HEALTH_CRITICAL"
    fi

    local issues=()
    local pid cpu_usage memory_usage

    for pid in $pids; do
        # Check CPU usage (empty when the PID vanished in the meantime)
        cpu_usage=$(ps -p "$pid" -o %cpu= 2>/dev/null | tr -d ' ') || true
        if [ -n "$cpu_usage" ] && _process_usage_exceeds "$cpu_usage" "$cpu_threshold"; then
            issues+=("PID $pid CPU usage ${cpu_usage}% exceeds threshold $cpu_threshold%")
        fi

        # Check memory usage
        memory_usage=$(ps -p "$pid" -o %mem= 2>/dev/null | tr -d ' ') || true
        if [ -n "$memory_usage" ] && _process_usage_exceeds "$memory_usage" "$memory_threshold"; then
            issues+=("PID $pid memory usage ${memory_usage}% exceeds threshold $memory_threshold%")
        fi
    done

    if [ "${#issues[@]}" -eq 0 ]; then
        log_health_result "resources:$process_name" "OK" "Resource usage within limits"
        return "$HEALTH_OK"
    fi

    # Join with "; ". "${issues[*]}" only uses the first character of IFS,
    # so build the separator explicitly.
    local message
    printf -v message '%s; ' "${issues[@]}"
    message="${message%; }"
    log_health_result "resources:$process_name" "WARNING" "$message"
    return "$HEALTH_WARNING"
}

# Check process responsiveness.
# Arguments: $1 - process name (used for log labels only)
#            $2 - TCP/UDP port number the process should listen on
#            $3 - connection timeout in seconds (default: 5)
# Returns:   HEALTH_UNKNOWN for a non-numeric port, HEALTH_CRITICAL when the
#            port is not listed by `ss -tuln` or a local connection fails,
#            HEALTH_OK otherwise
check_process_responsive() {
    local process_name="$1"
    local port="$2"
    local timeout="${3:-5}"

    # Reject anything but digits: the port ends up in a /dev/tcp path and must
    # never carry shell syntax.
    case "$port" in
        ''|*[!0-9]*)
            log_health_result "responsive:$process_name" "UNKNOWN" "Invalid port '$port'"
            return "$HEALTH_UNKNOWN"
            ;;
    esac

    # Check if process is listening on port. The socket list is captured first
    # and matched in the shell; `ss | grep -q` can fail spuriously under
    # `pipefail` when grep exits before ss has finished writing.
    local listeners
    listeners=$(ss -tuln) || listeners=""
    if [[ "$listeners" != *":$port "* ]]; then
        log_health_result "responsive:$process_name" "CRITICAL" "Process not listening on port $port"
        return "$HEALTH_CRITICAL"
    fi

    # Check if port responds to connection. The port is passed as an argument
    # rather than being interpolated into the bash -c code.
    if timeout "$timeout" bash -c 'echo > "/dev/tcp/localhost/$1"' _ "$port" 2>/dev/null; then
        log_health_result "responsive:$process_name" "OK" "Process responsive on port $port"
        return "$HEALTH_OK"
    else
        log_health_result "responsive:$process_name" "CRITICAL" "Process not responding on port $port"
        return "$HEALTH_CRITICAL"
    fi
}

🌐 Functional Health Checks

Service-Specific Health Verification

# functional-healthchecks.sh - Application-level health checks

# HTTP service health check.
# Arguments: $1 - URL to request
#            $2 - expected HTTP status code (default: 200)
#            $3 - total request timeout in seconds (default: 10)
# Returns:   HEALTH_OK when the status matches, HEALTH_CRITICAL when it
#            differs or no HTTP response was obtained
check_http_service() {
    local url="$1"
    local expected_status="${2:-200}"
    local timeout="${3:-10}"

    # curl exits non-zero on connection errors/timeouts; keep going so the
    # failure is reported as a result rather than aborting under `set -e`.
    local http_status
    http_status=$(curl -s -o /dev/null -w "%{http_code}" --max-time "$timeout" "$url" 2>/dev/null) || true

    # With -w "%{http_code}" curl prints 000 when no response was received,
    # so an empty-string test alone never detects a failed request.
    if [ -z "$http_status" ] || [ "$http_status" = "000" ]; then
        log_health_result "http:$url" "CRITICAL" "HTTP request failed or timed out"
        return "$HEALTH_CRITICAL"
    elif [ "$http_status" = "$expected_status" ]; then
        log_health_result "http:$url" "OK" "HTTP $http_status received"
        return "$HEALTH_OK"
    else
        log_health_result "http:$url" "CRITICAL" "Expected HTTP $expected_status, got $http_status"
        return "$HEALTH_CRITICAL"
    fi
}

# Database connectivity check (dispatcher).
# Arguments: $1 - database type: "mysql" or "postgresql"
#            $2 - connection string handed to the type-specific check
#            $3 - timeout in seconds (default: 10)
# Returns:   the status of the type-specific check, or HEALTH_UNKNOWN for any
#            other database type
check_database_connectivity() {
    local db_type="$1"
    local connection_string="$2"
    local timeout="${3:-10}"

    if [ "$db_type" = "mysql" ]; then
        check_mysql_connectivity "$connection_string" "$timeout"
    elif [ "$db_type" = "postgresql" ]; then
        check_postgres_connectivity "$connection_string" "$timeout"
    else
        # Anything else has no dedicated check
        log_health_result "database:$db_type" "UNKNOWN" "Unsupported database type"
        return $HEALTH_UNKNOWN
    fi
}

# MySQL connectivity check.
# Arguments: $1 - connection URL: [scheme://][user[:password]@]host[:port][/database]
#                 (a password containing "/" must be URL-safe; it is not decoded)
#            $2 - timeout in seconds
# Returns:   HEALTH_OK when `SELECT 1` succeeds, HEALTH_CRITICAL otherwise
check_mysql_connectivity() {
    local connection_string="$1"
    local timeout="$2"

    # Parse connection string with parameter expansion so optional parts
    # (credentials, port, database) can be absent.
    local host port="3306" user="" password="" database=""
    local remainder="${connection_string#*://}"     # drop the scheme

    if [[ "$remainder" == */* ]]; then
        database="${remainder#*/}"
        database="${database%%\?*}"                 # drop any ?query suffix
        remainder="${remainder%%/*}"
    fi

    if [[ "$remainder" == *@* ]]; then
        local credentials="${remainder%@*}"
        remainder="${remainder##*@}"
        user="${credentials%%:*}"
        if [[ "$credentials" == *:* ]]; then
            password="${credentials#*:}"
        fi
    fi

    host="${remainder%%:*}"
    if [[ "$remainder" == *:* ]]; then
        port="${remainder##*:}"
    fi

    local -a mysql_args=(-h "$host" -P "$port")
    if [ -n "$user" ]; then
        mysql_args+=(-u "$user")
    fi
    mysql_args+=(-e "SELECT 1")
    if [ -n "$database" ]; then
        mysql_args+=("$database")
    fi

    # Test connection. The password travels via MYSQL_PWD inside a subshell so
    # it neither appears in the process argument list nor leaks into the
    # caller's environment.
    if (
        if [ -n "$password" ]; then
            export MYSQL_PWD="$password"
        fi
        timeout "$timeout" mysql "${mysql_args[@]}" >/dev/null 2>&1
    ); then
        log_health_result "database:mysql" "OK" "MySQL connection successful"
        return "$HEALTH_OK"
    else
        log_health_result "database:mysql" "CRITICAL" "MySQL connection failed"
        return "$HEALTH_CRITICAL"
    fi
}

# PostgreSQL connectivity check.
# Arguments: $1 - connection string passed to `pg_isready -d`
#            $2 - timeout in seconds applied to the pg_isready call
# Returns:   HEALTH_OK when pg_isready succeeds within the timeout,
#            HEALTH_CRITICAL otherwise
check_postgres_connectivity() {
    local conn_uri="$1"
    local max_wait="$2"

    # Guard clause: any failure (including the timeout firing) is critical
    if ! timeout "$max_wait" pg_isready -d "$conn_uri" >/dev/null 2>&1; then
        log_health_result "database:postgresql" "CRITICAL" "PostgreSQL connection failed"
        return $HEALTH_CRITICAL
    fi

    log_health_result "database:postgresql" "OK" "PostgreSQL connection successful"
    return $HEALTH_OK
}

# Cache service check (dispatcher).
# Arguments: $1 - cache type: "redis" or "memcached"
#            $2 - host
#            $3 - port
#            $4 - timeout in seconds (default: 5)
# Returns:   the status of the type-specific check, or HEALTH_UNKNOWN for any
#            other cache type
check_cache_service() {
    local cache_type="$1"
    local host="$2"
    local port="$3"
    local timeout="${4:-5}"

    if [ "$cache_type" = "redis" ]; then
        check_redis_connectivity "$host" "$port" "$timeout"
    elif [ "$cache_type" = "memcached" ]; then
        check_memcached_connectivity "$host" "$port" "$timeout"
    else
        # Anything else has no dedicated check
        log_health_result "cache:$cache_type" "UNKNOWN" "Unsupported cache type"
        return $HEALTH_UNKNOWN
    fi
}

# Redis connectivity check.
# Arguments: $1 - host, $2 - port, $3 - timeout in seconds
# Returns:   HEALTH_OK when the server answers PING with +PONG,
#            HEALTH_CRITICAL otherwise
check_redis_connectivity() {
    local host="$1"
    local port="$2"
    local timeout="$3"

    # Test Redis connectivity. QUIT asks the server to close the connection so
    # nc can exit promptly instead of idling until the timeout fires. The reply
    # is captured first and inspected afterwards, so the verdict depends only
    # on what the server said, not on how nc/timeout happened to exit (which
    # would matter under `set -o pipefail`).
    local response
    response=$(printf 'PING\r\nQUIT\r\n' | timeout "$timeout" nc "$host" "$port" 2>/dev/null) || true

    if [[ "$response" == *+PONG* ]]; then
        log_health_result "cache:redis" "OK" "Redis connection successful"
        return "$HEALTH_OK"
    else
        log_health_result "cache:redis" "CRITICAL" "Redis connection failed"
        return "$HEALTH_CRITICAL"
    fi
}

# Memcached connectivity check.
# Arguments: $1 - host, $2 - port, $3 - timeout in seconds
# Returns:   HEALTH_OK when the reply to "stats" contains STAT lines,
#            HEALTH_CRITICAL otherwise
check_memcached_connectivity() {
    local host="$1"
    local port="$2"
    local timeout="$3"

    # Test Memcached connectivity. "quit" makes the server close the
    # connection so nc can exit promptly instead of idling until the timeout.
    # The reply is captured first and inspected afterwards, so the verdict
    # depends only on the server's answer, not on the exit status of
    # nc/timeout (which would matter under `set -o pipefail`).
    local response
    response=$(printf 'stats\r\nquit\r\n' | timeout "$timeout" nc "$host" "$port" 2>/dev/null) || true

    if [[ "$response" == *STAT* ]]; then
        log_health_result "cache:memcached" "OK" "Memcached connection successful"
        return "$HEALTH_OK"
    else
        log_health_result "cache:memcached" "CRITICAL" "Memcached connection failed"
        return "$HEALTH_CRITICAL"
    fi
}

📊 Resource Health Checks

System Resource Monitoring

# resource-healthchecks.sh - System resource health checks

# Disk space check.
# Arguments: $1 - path whose filesystem is inspected (default: /)
#            $2 - warning threshold in percent used (default: 80)
#            $3 - critical threshold in percent used (default: 90)
# Returns:   HEALTH_CRITICAL / HEALTH_WARNING / HEALTH_OK by threshold, or
#            HEALTH_UNKNOWN when the usage could not be determined
check_disk_space() {
    local path="${1:-/}"
    local warning_threshold="${2:-80}"
    local critical_threshold="${3:-90}"

    # -P forces the one-line-per-filesystem POSIX layout, so long device names
    # cannot wrap the data onto a second line and shift the columns.
    local usage
    usage=$(df -P -- "$path" | awk 'NR==2 { sub(/%/, "", $5); print $5 }') || usage=""

    # A failed df (missing path, unmounted filesystem) yields no number; that
    # must not fall through the comparisons and be reported as healthy.
    case "$usage" in
        ''|*[!0-9]*)
            log_health_result "disk:$path" "UNKNOWN" "Unable to determine disk usage"
            return "$HEALTH_UNKNOWN"
            ;;
    esac

    if [ "$usage" -ge "$critical_threshold" ]; then
        log_health_result "disk:$path" "CRITICAL" "Disk usage ${usage}% exceeds critical threshold $critical_threshold%"
        return "$HEALTH_CRITICAL"
    elif [ "$usage" -ge "$warning_threshold" ]; then
        log_health_result "disk:$path" "WARNING" "Disk usage ${usage}% exceeds warning threshold $warning_threshold%"
        return "$HEALTH_WARNING"
    else
        log_health_result "disk:$path" "OK" "Disk usage ${usage}% within limits"
        return "$HEALTH_OK"
    fi
}

# Memory usage check.
# Arguments: $1 - warning threshold in percent used (default: 80)
#            $2 - critical threshold in percent used (default: 90)
# Returns:   HEALTH_CRITICAL / HEALTH_WARNING / HEALTH_OK by threshold, or
#            HEALTH_UNKNOWN when `free` produced no usable figures
check_memory_usage() {
    local warning_threshold="${1:-80}"
    local critical_threshold="${2:-90}"

    # Second line of `free` output: column 2 is read as total and column 3 as
    # used. A zero total is skipped to avoid a division by zero in awk.
    local memory_usage
    memory_usage=$(free | awk 'NR==2 && $2 > 0 { printf "%.0f", $3 * 100 / $2 }') || memory_usage=""

    # Without a number the comparisons below would error out and fall through
    # to the OK branch, so report UNKNOWN explicitly instead.
    case "$memory_usage" in
        ''|*[!0-9]*)
            log_health_result "memory" "UNKNOWN" "Unable to determine memory usage"
            return "$HEALTH_UNKNOWN"
            ;;
    esac

    if [ "$memory_usage" -ge "$critical_threshold" ]; then
        log_health_result "memory" "CRITICAL" "Memory usage ${memory_usage}% exceeds critical threshold $critical_threshold%"
        return "$HEALTH_CRITICAL"
    elif [ "$memory_usage" -ge "$warning_threshold" ]; then
        log_health_result "memory" "WARNING" "Memory usage ${memory_usage}% exceeds warning threshold $warning_threshold%"
        return "$HEALTH_WARNING"
    else
        log_health_result "memory" "OK" "Memory usage ${memory_usage}% within limits"
        return "$HEALTH_OK"
    fi
}

# CPU usage check.
# Arguments: $1 - warning threshold in percent (default: 80)
#            $2 - critical threshold in percent (default: 90)
#            $3 - number of one-second vmstat samples to take (default: 5)
# Returns:   HEALTH_CRITICAL / HEALTH_WARNING / HEALTH_OK by threshold, or
#            HEALTH_UNKNOWN when vmstat produced no usable samples
check_cpu_usage() {
    local warning_threshold="${1:-80}"
    local critical_threshold="${2:-90}"
    local sample_duration="${3:-5}"  # seconds

    # Average the idle column over the sample period and convert to usage.
    #  - Header lines are recognised by a non-numeric first field; the idle
    #    column is located by its "id" label (falling back to column 15).
    #  - The average divides by the number of samples used, not by the total
    #    line count, which would include the header lines.
    #  - vmstat's first report is an average since boot, so it is skipped
    #    whenever more than one sample is available.
    #  - The result is truncated to a whole percent.
    local cpu_usage
    cpu_usage=$(vmstat 1 "$sample_duration" | awk '
        $1 ~ /^[0-9]+$/ {
            if (!idle_col) idle_col = 15
            idle[++n] = $idle_col
            next
        }
        { for (i = 1; i <= NF; i++) if ($i == "id") idle_col = i }
        END {
            if (n == 0) exit 1
            first = (n > 1) ? 2 : 1
            for (i = first; i <= n; i++) sum += idle[i]
            printf "%d", 100 - sum / (n - first + 1)
        }') || cpu_usage=""

    # No samples (vmstat missing or failed) must not be reported as healthy.
    case "$cpu_usage" in
        ''|*[!0-9]*)
            log_health_result "cpu" "UNKNOWN" "Unable to determine CPU usage"
            return "$HEALTH_UNKNOWN"
            ;;
    esac

    if [ "$cpu_usage" -ge "$critical_threshold" ]; then
        log_health_result "cpu" "CRITICAL" "CPU usage ${cpu_usage}% exceeds critical threshold $critical_threshold%"
        return "$HEALTH_CRITICAL"
    elif [ "$cpu_usage" -ge "$warning_threshold" ]; then
        log_health_result "cpu" "WARNING" "CPU usage ${cpu_usage}% exceeds warning threshold $warning_threshold%"
        return "$HEALTH_WARNING"
    else
        log_health_result "cpu" "OK" "CPU usage ${cpu_usage}% within limits"
        return "$HEALTH_OK"
    fi
}

# Load average check (1-minute load average).
# Arguments: $1 - warning threshold (default: number of CPUs from nproc)
#            $2 - critical threshold (default: twice the number of CPUs)
#            Thresholds may be fractional, e.g. 1.5.
# Returns:   HEALTH_CRITICAL / HEALTH_WARNING / HEALTH_OK by threshold, or
#            HEALTH_UNKNOWN when the load average could not be read
check_load_average() {
    local warning_threshold="${1:-$(nproc)}"
    local critical_threshold="${2:-$(( $(nproc) * 2 ))}"

    # Prefer /proc/loadavg (first field is the 1-minute average); fall back to
    # parsing uptime where /proc is unavailable.
    local load_avg="" rest
    if [ -r /proc/loadavg ]; then
        read -r load_avg rest < /proc/loadavg || true
    else
        load_avg=$(uptime | awk -F'load averages?: *' '{ split($2, parts, /[ ,]+/); print parts[1] }') || load_avg=""
    fi

    # An unreadable value must not slip through the comparisons as "OK".
    local number_re='^[0-9]+([.][0-9]+)?$'
    if [[ ! "$load_avg" =~ $number_re ]]; then
        log_health_result "load" "UNKNOWN" "Unable to determine load average"
        return "$HEALTH_UNKNOWN"
    fi

    # Compare as decimals in awk; shell arithmetic is integer-only and would
    # reject fractional thresholds.
    if awk -v load="$load_avg" -v limit="$critical_threshold" 'BEGIN { exit !(load + 0 >= limit + 0) }'; then
        log_health_result "load" "CRITICAL" "Load average $load_avg exceeds critical threshold $critical_threshold"
        return "$HEALTH_CRITICAL"
    elif awk -v load="$load_avg" -v limit="$warning_threshold" 'BEGIN { exit !(load + 0 >= limit + 0) }'; then
        log_health_result "load" "WARNING" "Load average $load_avg exceeds warning threshold $warning_threshold"
        return "$HEALTH_WARNING"
    else
        log_health_result "load" "OK" "Load average $load_avg within limits"
        return "$HEALTH_OK"
    fi
}

🔗 Dependency Health Checks

External Service Dependencies

# dependency-healthchecks.sh - Dependency health checks

# DNS resolution check.
# Arguments: $1 - hostname to resolve
#            $2 - expected IP address (optional; empty disables the comparison)
#            $3 - timeout in seconds for the nslookup call (default: 10)
# Returns:   HEALTH_CRITICAL when no address is returned, HEALTH_WARNING when
#            an expected IP is given but is not among the returned addresses,
#            HEALTH_OK otherwise
check_dns_resolution() {
    local hostname="$1"
    local expected_ip="${2:-}"
    local timeout="${3:-10}"

    # Collect every answer line ("Address: <ip>"), one address per line.
    local resolved_ips
    resolved_ips=$(timeout "$timeout" nslookup "$hostname" 2>/dev/null | awk '/^Address: / { print $2 }') || true

    if [ -z "$resolved_ips" ]; then
        log_health_result "dns:$hostname" "CRITICAL" "DNS resolution failed"
        return "$HEALTH_CRITICAL"
    fi

    # Last returned address, reported when no particular IP is expected.
    local resolved_ip="${resolved_ips##*$'\n'}"

    if [ -n "$expected_ip" ]; then
        # A name may resolve to several addresses in varying order; accept the
        # expected IP anywhere in the answer instead of only in last position.
        local addr matched=""
        while IFS= read -r addr; do
            if [ "$addr" = "$expected_ip" ]; then
                matched="yes"
                break
            fi
        done <<< "$resolved_ips"

        if [ -z "$matched" ]; then
            log_health_result "dns:$hostname" "WARNING" "DNS resolved to ${resolved_ips//$'\n'/, }, expected $expected_ip"
            return "$HEALTH_WARNING"
        fi
        resolved_ip="$expected_ip"
    fi

    log_health_result "dns:$hostname" "OK" "DNS resolved to $resolved_ip"
    return "$HEALTH_OK"
}

# Network connectivity check.
# Arguments: $1 - target host or address
#            $2 - port (default: 80)
#            $3 - timeout in seconds applied to the nc probe (default: 10)
# Returns:   HEALTH_OK when `nc -z` succeeds within the timeout,
#            HEALTH_CRITICAL otherwise
check_network_connectivity() {
    local target="$1"
    local port="${2:-80}"
    local timeout="${3:-10}"
    local component="network:$target:$port"

    # Guard clause: a failed or timed-out probe is critical
    if ! timeout "$timeout" nc -z "$target" "$port" 2>/dev/null; then
        log_health_result "$component" "CRITICAL" "Network connectivity failed"
        return $HEALTH_CRITICAL
    fi

    log_health_result "$component" "OK" "Network connectivity successful"
    return $HEALTH_OK
}

# TLS certificate check.
# Arguments: $1 - hostname (or IP address) to connect to
#            $2 - port (default: 443)
#            $3 - warn when fewer than this many days remain (default: 30)
# Returns:   HEALTH_CRITICAL when the certificate cannot be retrieved or has
#            expired, HEALTH_WARNING when it expires within the warning window,
#            HEALTH_UNKNOWN when the expiry date cannot be parsed,
#            HEALTH_OK otherwise
check_tls_certificate() {
    local hostname="$1"
    local port="${2:-443}"
    local warning_days="${3:-30}"

    # Send SNI for real host names so name-based virtual hosts present the
    # certificate for this name; skipped for IPv4/IPv6 literals.
    local -a sni_args=()
    local ipv4_re='^[0-9]+([.][0-9]+){3}$'
    if [[ "$hostname" != *:* && ! "$hostname" =~ $ipv4_re ]]; then
        sni_args=(-servername "$hostname")
    fi

    # stdin from /dev/null makes s_client finish once the handshake is done.
    local cert_expiry
    cert_expiry=$(timeout 10 openssl s_client -connect "$hostname:$port" \
                      ${sni_args[@]+"${sni_args[@]}"} </dev/null 2>/dev/null |
                  openssl x509 -noout -enddate 2>/dev/null) || true
    cert_expiry="${cert_expiry#*=}"   # strip the "notAfter=" label

    if [ -z "$cert_expiry" ]; then
        log_health_result "tls:$hostname" "CRITICAL" "Unable to retrieve certificate information"
        return "$HEALTH_CRITICAL"
    fi

    # If the date cannot be parsed, say so; an empty timestamp would otherwise
    # turn into a large negative day count and a bogus "expired" report.
    local expiry_date_seconds
    expiry_date_seconds=$(date -d "$cert_expiry" +%s 2>/dev/null) || expiry_date_seconds=""
    if [ -z "$expiry_date_seconds" ]; then
        log_health_result "tls:$hostname" "UNKNOWN" "Unable to parse certificate expiry date '$cert_expiry'"
        return "$HEALTH_UNKNOWN"
    fi

    local current_date_seconds
    current_date_seconds=$(date +%s)
    local days_until_expiry
    days_until_expiry=$(( (expiry_date_seconds - current_date_seconds) / 86400 ))

    if [ "$days_until_expiry" -lt 0 ]; then
        log_health_result "tls:$hostname" "CRITICAL" "Certificate expired $((days_until_expiry * -1)) days ago"
        return "$HEALTH_CRITICAL"
    elif [ "$days_until_expiry" -lt "$warning_days" ]; then
        log_health_result "tls:$hostname" "WARNING" "Certificate expires in $days_until_expiry days"
        return "$HEALTH_WARNING"
    else
        log_health_result "tls:$hostname" "OK" "Certificate valid for $days_until_expiry days"
        return "$HEALTH_OK"
    fi
}

🎨 Advanced Health Check Features

Composite Health Checks

# composite-healthchecks.sh - Combined health assessments

# Overall system health.
# Runs every given check through run_healthcheck_with_timeout (30s each) and
# aggregates the results.
# Arguments: each argument is one check command string; its first word is
#            used as the check's display name
# Outputs:   one line per check plus an overall summary on stdout
# Returns:   HEALTH_CRITICAL if any check was critical, otherwise
#            HEALTH_UNKNOWN if any check returned another non-standard status
#            (including a timeout), otherwise HEALTH_WARNING if any check
#            warned, otherwise HEALTH_OK
check_system_health() {
    local checks=("$@")
    local overall_status=$HEALTH_OK
    local failed_checks=()
    local warning_checks=()
    local unknown_checks=()
    local check check_name result

    echo "Running system health checks..."

    for check in "${checks[@]}"; do
        check_name="${check%% *}"

        # Run health check. The status is captured with `|| result=$?`: inside
        # the then-branch of an `if`, $? is always 0, which would hide every
        # WARNING/CRITICAL result.
        result=0
        run_healthcheck_with_timeout "$check_name" "$check" 30 || result=$?

        case "$result" in
            "$HEALTH_OK")
                echo "✓ $check_name: OK"
                ;;
            "$HEALTH_WARNING")
                echo "⚠ $check_name: WARNING"
                warning_checks+=("$check_name")
                if [ "$overall_status" -eq "$HEALTH_OK" ]; then
                    overall_status=$HEALTH_WARNING
                fi
                ;;
            "$HEALTH_CRITICAL")
                echo "✗ $check_name: CRITICAL"
                failed_checks+=("$check_name")
                overall_status=$HEALTH_CRITICAL
                ;;
            *)
                echo "? $check_name: UNKNOWN"
                unknown_checks+=("$check_name")
                # UNKNOWN outranks WARNING but must never mask a CRITICAL
                if [ "$overall_status" -ne "$HEALTH_CRITICAL" ]; then
                    overall_status=$HEALTH_UNKNOWN
                fi
                ;;
        esac
    done

    # Report overall status
    case "$overall_status" in
        "$HEALTH_OK")
            echo "Overall system health: OK"
            return "$HEALTH_OK"
            ;;
        "$HEALTH_WARNING")
            echo "Overall system health: WARNING"
            echo "Warnings: ${warning_checks[*]}"
            return "$HEALTH_WARNING"
            ;;
        "$HEALTH_CRITICAL")
            echo "Overall system health: CRITICAL"
            echo "Failed checks: ${failed_checks[*]}"
            return "$HEALTH_CRITICAL"
            ;;
        *)
            echo "Overall system health: UNKNOWN"
            echo "Unknown checks: ${unknown_checks[*]}"
            return "$HEALTH_UNKNOWN"
            ;;
    esac
}

# Kubernetes-style readiness probe.
# Arguments: each argument is a check command string, evaluated with eval
# Outputs:   PASS/FAIL verdict on stdout; on failure also the failing checks
# Returns:   0 when every check succeeded, 1 otherwise
k8s_readiness_probe() {
    local unmet=()

    # Evaluate every check; remember the ones that report failure
    for check in "$@"; do
        eval "$check" || unmet+=("$check")
    done

    if [ ${#unmet[@]} -gt 0 ]; then
        echo "Readiness probe: FAIL"
        echo "Failed checks: ${unmet[*]}"
        return 1
    fi

    echo "Readiness probe: PASS"
    return 0
}

# Kubernetes-style liveness probe.
# Arguments: each argument is a check command string, evaluated with eval
# Outputs:   PASS/FAIL verdict on stdout; on failure also the failing checks
# Returns:   0 when every check succeeded, 1 otherwise
k8s_liveness_probe() {
    local broken=()

    # Evaluate every check; remember the ones that report failure
    for check in "$@"; do
        eval "$check" || broken+=("$check")
    done

    if [ ${#broken[@]} -gt 0 ]; then
        echo "Liveness probe: FAIL"
        echo "Critical failures: ${broken[*]}"
        return 1
    fi

    echo "Liveness probe: PASS"
    return 0
}

🧾 Summary Best Practices

Health Check Implementation Guidelines

Layered Approach: Check process, functional, resource, and dependency health
Timeout Protection: Always implement timeouts to prevent hanging checks
Clear Status Codes: Use standardized return codes for monitoring systems
Detailed Logging: Provide actionable information in health check results
Performance Impact: Minimize resource usage of health checks themselves
Security Considerations: Don't expose sensitive information (credentials, connection strings, internal hostnames) in health check output or logs
Regular Testing: Test health checks regularly to ensure they work correctly

Sample Health Check Configuration

# healthcheck-config.sh - Example health check configuration

# Process checks: names of processes that must be running.
# These are process names, not commands; they are turned into
# "check_process_running <name>" entries when ALL_CHECKS is built below.
PROCESS_CHECKS=(
    "nginx"
    "postgresql"
    "redis-server"
)

# Functional checks (each entry is a complete check command).
# The connection string is a placeholder; do not commit real credentials.
FUNCTIONAL_CHECKS=(
    "check_http_service http://localhost:80 200"
    "check_database_connectivity postgresql postgres://user:pass@localhost/db"
    "check_cache_service redis localhost 6379"
)

# Resource checks
RESOURCE_CHECKS=(
    "check_disk_space / 80 90"
    "check_memory_usage 80 90"
    "check_cpu_usage 80 90"
    "check_load_average $(nproc) $(( $(nproc) * 2 ))"
)

# Dependency checks
DEPENDENCY_CHECKS=(
    "check_dns_resolution google.com"
    "check_network_connectivity 8.8.8.8 53"
    "check_tls_certificate example.com 443 30"
)

# Composite check: every entry must be a runnable check command, so bare
# process names are wrapped rather than copied verbatim (a bare "nginx" would
# be executed as the nginx binary itself).
ALL_CHECKS=()
for _hc_process_name in "${PROCESS_CHECKS[@]}"; do
    ALL_CHECKS+=("check_process_running $_hc_process_name")
done
unset _hc_process_name
ALL_CHECKS+=("${FUNCTIONAL_CHECKS[@]}"
             "${RESOURCE_CHECKS[@]}" "${DEPENDENCY_CHECKS[@]}")

🧠 Complete Health Check Script

#!/bin/bash
# comprehensive-healthcheck.sh - Production-ready health checking

set -euo pipefail

# Resolve the libraries relative to this script's own location so the script
# works no matter which directory it is started from (cron, systemd, probes).
HEALTHCHECK_LIB_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"

# Source health check libraries
source "$HEALTHCHECK_LIB_DIR/healthcheck-framework.sh"
source "$HEALTHCHECK_LIB_DIR/process-healthchecks.sh"
source "$HEALTHCHECK_LIB_DIR/functional-healthchecks.sh"
source "$HEALTHCHECK_LIB_DIR/resource-healthchecks.sh"
source "$HEALTHCHECK_LIB_DIR/dependency-healthchecks.sh"
source "$HEALTHCHECK_LIB_DIR/composite-healthchecks.sh"

# Source the check lists (PROCESS_CHECKS, ..., ALL_CHECKS) used by main;
# without them the selected mode has nothing to run.
source "$HEALTHCHECK_LIB_DIR/healthcheck-config.sh"

# Main health check function.
# Arguments: $1 - mode: process | functional | resource | dependency |
#                 k8s-readiness | k8s-liveness | full (default; also used for
#                 any unrecognised mode)
# Globals:   PROCESS_CHECKS, FUNCTIONAL_CHECKS, RESOURCE_CHECKS,
#            DEPENDENCY_CHECKS, ALL_CHECKS, READINESS_CHECKS, LIVENESS_CHECKS
#            (all read)
# Returns:   the status of the selected composite check or probe; 1 when a
#            k8s probe mode is requested but its check list is not configured
main() {
    local mode="${1:-full}"
    local -a selected=()
    local entry

    # The ${arr[@]+"${arr[@]}"} form expands to nothing for an unset or empty
    # array instead of tripping `set -u`.
    case "$mode" in
        process)
            # PROCESS_CHECKS may hold bare process names; turn those into
            # check commands and pass ready-made check commands through.
            for entry in ${PROCESS_CHECKS[@]+"${PROCESS_CHECKS[@]}"}; do
                case "$entry" in
                    check_*) selected+=("$entry") ;;
                    *) selected+=("check_process_running $entry") ;;
                esac
            done
            check_system_health ${selected[@]+"${selected[@]}"}
            ;;
        functional)
            check_system_health ${FUNCTIONAL_CHECKS[@]+"${FUNCTIONAL_CHECKS[@]}"}
            ;;
        resource)
            check_system_health ${RESOURCE_CHECKS[@]+"${RESOURCE_CHECKS[@]}"}
            ;;
        dependency)
            check_system_health ${DEPENDENCY_CHECKS[@]+"${DEPENDENCY_CHECKS[@]}"}
            ;;
        k8s-readiness)
            # An unconfigured probe must fail loudly, not pass by default
            if [ -z "${READINESS_CHECKS+set}" ]; then
                echo "READINESS_CHECKS is not configured" >&2
                return 1
            fi
            k8s_readiness_probe "${READINESS_CHECKS[@]}"
            ;;
        k8s-liveness)
            if [ -z "${LIVENESS_CHECKS+set}" ]; then
                echo "LIVENESS_CHECKS is not configured" >&2
                return 1
            fi
            k8s_liveness_probe "${LIVENESS_CHECKS[@]}"
            ;;
        full|*)
            check_system_health ${ALL_CHECKS[@]+"${ALL_CHECKS[@]}"}
            ;;
    esac
}

# Run main function, forwarding the script's command-line arguments.
# As the last command in the script, its return status becomes the script's
# exit status.
main "$@"

🧾 See Also

Recipes: Log Processing
Recipes: Backup and Rotation
Patterns: Logging and Telemetry
Anti-Patterns: Silent Failures
Linux Debugging Tools
Monitoring and Observability Best Practices