🏥 Healthchecks and Probes Recipes
Reliable health checking ensures system availability and enables proactive monitoring. This recipe provides patterns for implementing comprehensive health checks, readiness probes, and liveness checks for various system components.
🎯 Core Principles
Multi-Layer Health Checking
Implement health checks at different levels: process, functional, resource, and dependency.
#!/bin/bash
# healthcheck-framework.sh - Comprehensive health checking system
#
# Shared status codes, settings and helpers for the health check functions.

# Health check result codes. Check functions return these values; the
# numbering is the same as the Nagios plugin exit-code convention.
HEALTH_OK=0
HEALTH_WARNING=1
HEALTH_CRITICAL=2
HEALTH_UNKNOWN=3

# Global configuration. A value already present in the environment wins, so a
# deployment can change the default timeout (seconds) or the log location
# without editing this file.
HEALTHCHECK_TIMEOUT="${HEALTHCHECK_TIMEOUT:-30}"
HEALTHCHECK_LOG="${HEALTHCHECK_LOG:-/var/log/healthcheck.log}"
# Logging function
#
# Record the outcome of one health check.
#
# Globals:   HEALTHCHECK_LOG (read) - file that receives the timestamped line;
#            /var/log/healthcheck.log is used when it is unset.
# Arguments: $1 - component label (e.g. "disk:/")
#            $2 - status word (e.g. OK, WARNING, CRITICAL)
#            $3 - human-readable message
# Outputs:   "component|status|message" on stdout for monitoring systems;
#            a warning on stderr when the log file cannot be written.
# Returns:   0
log_health_result() {
  local component="$1"
  local status="$2"
  local message="${3:-}"
  local log_file="${HEALTHCHECK_LOG:-/var/log/healthcheck.log}"
  local timestamp
  timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

  # Logging is best effort: an unwritable log file (e.g. running as a
  # non-root user) must neither abort the check under `set -e` nor suppress
  # the stdout line that monitoring systems consume.
  if ! { printf '[%s] %s: %s - %s\n' \
      "$timestamp" "$component" "$status" "$message" >> "$log_file"; } 2>/dev/null; then
    printf 'healthcheck: cannot write to %s\n' "$log_file" >&2
  fi

  # Also output to stdout for monitoring systems
  printf '%s|%s|%s\n' "$component" "$status" "$message"
  return 0
}
# Health check wrapper with timeout
#
# Run a check in the background and give up if it does not finish in time.
#
# Globals:   HEALTHCHECK_TIMEOUT (read) - default timeout in seconds
#            HEALTH_UNKNOWN (read)      - status returned on timeout
# Arguments: $1    - display name used in the timeout log entry
#            $2    - check to run: a function/command name, optionally followed
#                    by whitespace-separated arguments ("check_disk_space / 80 90")
#            $3    - timeout in whole seconds (default: HEALTHCHECK_TIMEOUT)
#            $4... - extra arguments appended verbatim to the check command
#                    (use these for arguments that contain whitespace)
# Returns:   the check's own exit status, or HEALTH_UNKNOWN when the check
#            timed out or no command was given.
run_healthcheck_with_timeout() {
  local check_name="$1"
  local check_function="$2"
  local timeout="${3:-${HEALTHCHECK_TIMEOUT:-30}}"

  # A non-numeric timeout would silently evaluate to 0 in arithmetic context
  # and kill every check immediately, so fall back to the default instead.
  if ! [[ "$timeout" =~ ^[0-9]+$ ]]; then
    timeout="${HEALTHCHECK_TIMEOUT:-30}"
  fi

  # Callers such as check_system_health pass the check together with its
  # arguments in one string; split it into words so it can be executed.
  local -a check_cmd=()
  read -r -a check_cmd <<< "$check_function"
  if (( $# > 3 )); then
    check_cmd+=("${@:4}")
  fi
  if (( ${#check_cmd[@]} == 0 )); then
    log_health_result "$check_name" "UNKNOWN" "No check command given"
    return "${HEALTH_UNKNOWN:-3}"
  fi

  # Run check in background
  "${check_cmd[@]}" &
  local check_pid=$!

  # Wait with timeout. Polling every 0.1s (fractional sleep, as provided by
  # GNU/BSD/busybox sleep) keeps a fast check from costing a full second.
  local ticks_left=$(( timeout * 10 ))
  while (( ticks_left > 0 )) && kill -0 "$check_pid" 2>/dev/null; do
    sleep 0.1
    ticks_left=$(( ticks_left - 1 ))
  done

  # Check if still running
  if kill -0 "$check_pid" 2>/dev/null; then
    # Timeout occurred: ask politely first, then force after a ~2s grace
    # period so a check that ignores TERM cannot hang the wait below.
    kill "$check_pid" 2>/dev/null || true
    local grace_ticks=20
    while (( grace_ticks > 0 )) && kill -0 "$check_pid" 2>/dev/null; do
      sleep 0.1
      grace_ticks=$(( grace_ticks - 1 ))
    done
    kill -9 "$check_pid" 2>/dev/null || true
    wait "$check_pid" 2>/dev/null || true
    log_health_result "$check_name" "TIMEOUT" "Check timed out after ${timeout}s"
    return "${HEALTH_UNKNOWN:-3}"
  fi

  # Check completed: collect its status without tripping `set -e`.
  local rc=0
  wait "$check_pid" || rc=$?
  return "$rc"
}
|
🔧 Process Health Checks
Basic Process Monitoring
# process-healthchecks.sh - Process-level health checks
# Check if process is running
#
# Count processes whose full command line matches a pattern (pgrep -f).
#
# Arguments: $1 - pattern to match against process command lines
#            $2 - number of processes expected (default: 1)
# Outputs:   one result line via log_health_result
# Returns:   HEALTH_CRITICAL when nothing matches, HEALTH_WARNING when fewer
#            than expected match, HEALTH_OK otherwise.
check_process_running() {
  local process_name="$1"
  local expected_count="${2:-1}"
  if ! [[ "$expected_count" =~ ^[0-9]+$ ]]; then
    expected_count=1
  fi

  # pgrep exits 1 when nothing matches. Without `|| true` that status would
  # abort the script under `set -eo pipefail` before CRITICAL is reported.
  local actual_count
  actual_count=$(pgrep -f -- "$process_name" | wc -l) || true
  actual_count=${actual_count//[[:space:]]/} # some wc builds pad with spaces
  actual_count=${actual_count:-0}

  if [ "$actual_count" -eq 0 ]; then
    log_health_result "process:$process_name" "CRITICAL" "Process not running"
    return "$HEALTH_CRITICAL"
  elif [ "$actual_count" -lt "$expected_count" ]; then
    log_health_result "process:$process_name" "WARNING" "Only $actual_count of $expected_count processes running"
    return "$HEALTH_WARNING"
  else
    log_health_result "process:$process_name" "OK" "$actual_count processes running"
    return "$HEALTH_OK"
  fi
}
# Check process resource usage
#
# Compare the %cpu and %mem that ps reports for every matching process with
# the given thresholds.
#
# Arguments: $1 - pattern matched against process command lines (pgrep -f)
#            $2 - CPU threshold in percent (default: 80)
#            $3 - memory threshold in percent (default: 80)
# Outputs:   one result line via log_health_result
# Returns:   HEALTH_CRITICAL when no process matches, HEALTH_WARNING when any
#            process exceeds a threshold, HEALTH_OK otherwise.
check_process_resources() {
  local process_name="$1"
  local cpu_threshold="${2:-80}"
  local memory_threshold="${3:-80}"

  # pgrep exits 1 on no match; keep that from aborting under `set -e`.
  local pids
  pids=$(pgrep -f -- "$process_name") || pids=""
  if [ -z "$pids" ]; then
    log_health_result "resources:$process_name" "CRITICAL" "Process not running"
    return "$HEALTH_CRITICAL"
  fi

  local issues=()
  local pid cpu_usage memory_usage
  for pid in $pids; do
    # Check CPU usage. LC_ALL=C keeps the decimal separator a dot; the float
    # comparison is done in awk so a missing `bc` cannot silently disable it.
    cpu_usage=$(LC_ALL=C ps -p "$pid" -o %cpu= 2>/dev/null | tr -d ' ') || cpu_usage=""
    if [ -n "$cpu_usage" ] \
      && awk -v v="$cpu_usage" -v t="$cpu_threshold" 'BEGIN { exit !(v + 0 > t + 0) }'; then
      issues+=("PID $pid CPU usage ${cpu_usage}% exceeds threshold $cpu_threshold%")
    fi

    # Check memory usage
    memory_usage=$(LC_ALL=C ps -p "$pid" -o %mem= 2>/dev/null | tr -d ' ') || memory_usage=""
    if [ -n "$memory_usage" ] \
      && awk -v v="$memory_usage" -v t="$memory_threshold" 'BEGIN { exit !(v + 0 > t + 0) }'; then
      issues+=("PID $pid memory usage ${memory_usage}% exceeds threshold $memory_threshold%")
    fi
  done

  if [ "${#issues[@]}" -eq 0 ]; then
    log_health_result "resources:$process_name" "OK" "Resource usage within limits"
    return "$HEALTH_OK"
  fi

  # Join with "; " by hand: "${issues[*]}" only honours the first IFS
  # character, so IFS='; ' would join with a bare ";".
  local message="${issues[0]}"
  local issue
  for issue in "${issues[@]:1}"; do
    message+="; $issue"
  done
  log_health_result "resources:$process_name" "WARNING" "$message"
  return "$HEALTH_WARNING"
}
# Check process responsiveness
#
# Verify that something listens on a local port and accepts a TCP connection.
#
# Arguments: $1 - label used in the result line
#            $2 - TCP/UDP port number (1-65535)
#            $3 - connection timeout in seconds (default: 5)
# Outputs:   one result line via log_health_result
# Returns:   HEALTH_UNKNOWN for an invalid port, HEALTH_CRITICAL when nothing
#            listens or the connection fails, HEALTH_OK otherwise.
check_process_responsive() {
  local process_name="$1"
  local port="$2"
  local timeout="${3:-5}"

  # The port ends up in a grep pattern and a /dev/tcp path, so only accept a
  # plain number (10# avoids octal parsing of values such as "08").
  if ! [[ "$port" =~ ^[0-9]+$ ]] || (( 10#$port < 1 || 10#$port > 65535 )); then
    log_health_result "responsive:$process_name" "UNKNOWN" "Invalid port: $port"
    return "$HEALTH_UNKNOWN"
  fi

  # Check if process is listening on port. The ss output is captured first:
  # piping straight into `grep -q` lets grep exit early, which can kill ss
  # with SIGPIPE and turn a match into a failure under `set -o pipefail`.
  local listeners
  listeners=$(ss -tuln 2>/dev/null) || listeners=""
  if ! grep -q -- ":${port}[[:space:]]" <<< "$listeners"; then
    log_health_result "responsive:$process_name" "CRITICAL" "Process not listening on port $port"
    return "$HEALTH_CRITICAL"
  fi

  # Check if port responds to connection. The port is handed over as a
  # positional argument instead of being spliced into the command string.
  if timeout "$timeout" bash -c 'echo > "/dev/tcp/localhost/$1"' _ "$port" 2>/dev/null; then
    log_health_result "responsive:$process_name" "OK" "Process responsive on port $port"
    return "$HEALTH_OK"
  else
    log_health_result "responsive:$process_name" "CRITICAL" "Process not responding on port $port"
    return "$HEALTH_CRITICAL"
  fi
}
|
🌐 Functional Health Checks
Service-Specific Health Verification
# functional-healthchecks.sh - Application-level health checks
# HTTP service health check
#
# Request a URL with curl and compare the response status code.
#
# Arguments: $1 - URL to request
#            $2 - expected HTTP status code (default: 200)
#            $3 - overall request timeout in seconds (default: 10)
# Outputs:   one result line via log_health_result
# Returns:   HEALTH_OK when the expected status is received,
#            HEALTH_CRITICAL otherwise.
check_http_service() {
  local url="$1"
  local expected_status="${2:-200}"
  local timeout="${3:-10}"

  # curl's exit status is ignored on purpose: the status code decides the
  # result, and a failing curl must not abort the script under `set -e`.
  local http_status
  http_status=$(curl -s -o /dev/null -w "%{http_code}" --max-time "$timeout" "$url" 2>/dev/null) || true

  if [ "$http_status" = "$expected_status" ]; then
    log_health_result "http:$url" "OK" "HTTP $http_status received"
    return "$HEALTH_OK"
  elif [ -z "$http_status" ] || [ "$http_status" = "000" ]; then
    # curl prints 000 when no HTTP response was received at all
    # (connection refused, DNS failure, timeout).
    log_health_result "http:$url" "CRITICAL" "HTTP request failed or timed out"
    return "$HEALTH_CRITICAL"
  else
    log_health_result "http:$url" "CRITICAL" "Expected HTTP $expected_status, got $http_status"
    return "$HEALTH_CRITICAL"
  fi
}
# Database connectivity check
#
# Dispatch to the connectivity probe that matches the database type.
#
# Arguments: $1 - database type: "mysql" or "postgresql"
#            $2 - connection string handed to the probe
#            $3 - timeout in seconds (default: 10)
# Returns:   the probe's status, or HEALTH_UNKNOWN (after logging) for an
#            unsupported type.
check_database_connectivity() {
  local kind="$1"
  local dsn="$2"
  local limit="${3:-10}"

  # Pick the probe function for this database type, if any.
  local probe=""
  if [ "$kind" = "mysql" ]; then
    probe=check_mysql_connectivity
  elif [ "$kind" = "postgresql" ]; then
    probe=check_postgres_connectivity
  fi

  if [ -z "$probe" ]; then
    log_health_result "database:$kind" "UNKNOWN" "Unsupported database type"
    return $HEALTH_UNKNOWN
  fi

  "$probe" "$dsn" "$limit"
}
# Probe a MySQL server by running "SELECT 1" through the mysql client.
#
# Arguments: $1 - connection string: [scheme://][user[:password]@]host[:port][/database]
#                 (no URL-decoding is performed; a password containing "/"
#                 is not supported)
#            $2 - timeout in seconds (default: 10)
# Outputs:   one result line via log_health_result
# Returns:   HEALTH_OK on success, HEALTH_CRITICAL when the query fails,
#            HEALTH_UNKNOWN when no host can be extracted.
check_mysql_connectivity() {
  local connection_string="$1"
  local timeout="${2:-10}"

  # Parse connection string with parameter expansion so that missing parts
  # (credentials, port, database) do not shift the remaining fields.
  local host port user password database
  local remainder="${connection_string#*://}"
  local authority="${remainder%%/*}"
  database=""
  if [[ "$remainder" == */* ]]; then
    database="${remainder#*/}"
    database="${database%%\?*}" # drop any ?option=value suffix
  fi

  local credentials=""
  local endpoint="$authority"
  if [[ "$authority" == *@* ]]; then
    credentials="${authority%@*}"
    endpoint="${authority##*@}"
  fi
  user="${credentials%%:*}"
  password=""
  if [[ "$credentials" == *:* ]]; then
    password="${credentials#*:}"
  fi

  host="${endpoint%%:*}"
  port=""
  if [[ "$endpoint" == *:* ]]; then
    port="${endpoint##*:}"
  fi
  port="${port:-3306}"

  if [ -z "$host" ]; then
    log_health_result "database:mysql" "UNKNOWN" "Invalid MySQL connection string"
    return "$HEALTH_UNKNOWN"
  fi

  local -a mysql_args=(-h "$host" -P "$port")
  if [ -n "$user" ]; then
    mysql_args+=(-u "$user")
  fi
  mysql_args+=(-e "SELECT 1")
  if [ -n "$database" ]; then
    mysql_args+=("$database")
  fi

  # Test connection. The password travels in MYSQL_PWD (set only inside this
  # subshell) rather than on the command line, where `ps` would expose it.
  if (
    if [ -n "$password" ]; then
      export MYSQL_PWD="$password"
    fi
    timeout "$timeout" mysql "${mysql_args[@]}" >/dev/null 2>&1
  ); then
    log_health_result "database:mysql" "OK" "MySQL connection successful"
    return "$HEALTH_OK"
  else
    log_health_result "database:mysql" "CRITICAL" "MySQL connection failed"
    return "$HEALTH_CRITICAL"
  fi
}
# Probe a PostgreSQL server with pg_isready.
#
# Arguments: $1 - connection string passed to pg_isready -d
#            $2 - timeout in seconds for the whole probe
# Outputs:   one result line via log_health_result
# Returns:   HEALTH_OK when pg_isready succeeds within the timeout,
#            HEALTH_CRITICAL otherwise.
check_postgres_connectivity() {
  local dsn="$1"
  local limit="$2"

  # Failure first: anything other than a clean pg_isready exit is critical.
  if ! timeout "$limit" pg_isready -d "$dsn" >/dev/null 2>&1; then
    log_health_result "database:postgresql" "CRITICAL" "PostgreSQL connection failed"
    return $HEALTH_CRITICAL
  fi

  log_health_result "database:postgresql" "OK" "PostgreSQL connection successful"
  return $HEALTH_OK
}
# Cache service check
#
# Dispatch to the connectivity probe that matches the cache type.
#
# Arguments: $1 - cache type: "redis" or "memcached"
#            $2 - host
#            $3 - port
#            $4 - timeout in seconds (default: 5)
# Returns:   the probe's status, or HEALTH_UNKNOWN (after logging) for an
#            unsupported type.
check_cache_service() {
  local kind="$1"
  local cache_host="$2"
  local cache_port="$3"
  local limit="${4:-5}"

  # Pick the probe function for this cache type, if any.
  local probe=""
  if [ "$kind" = "redis" ]; then
    probe=check_redis_connectivity
  elif [ "$kind" = "memcached" ]; then
    probe=check_memcached_connectivity
  fi

  if [ -z "$probe" ]; then
    log_health_result "cache:$kind" "UNKNOWN" "Unsupported cache type"
    return $HEALTH_UNKNOWN
  fi

  "$probe" "$cache_host" "$cache_port" "$limit"
}
# Probe a Redis server by sending PING over a raw TCP connection (nc).
#
# A server that requires authentication answers PING with an error and is
# therefore reported as failed.
#
# Arguments: $1 - host
#            $2 - port
#            $3 - timeout in seconds (default: 5)
# Outputs:   one result line via log_health_result
# Returns:   HEALTH_OK when +PONG is received, HEALTH_CRITICAL otherwise.
check_redis_connectivity() {
  local host="$1"
  local port="$2"
  local timeout="${3:-5}"

  # Test Redis connectivity. QUIT makes the server close the connection so nc
  # returns promptly instead of idling until the timeout. The reply is
  # captured and nc/timeout's exit status ignored: under `set -o pipefail` a
  # non-zero status would otherwise fail the check even though +PONG arrived.
  local response
  response=$(printf 'PING\r\nQUIT\r\n' | timeout "$timeout" nc "$host" "$port" 2>/dev/null) || true

  if [[ "$response" == *"+PONG"* ]]; then
    log_health_result "cache:redis" "OK" "Redis connection successful"
    return "$HEALTH_OK"
  else
    log_health_result "cache:redis" "CRITICAL" "Redis connection failed"
    return "$HEALTH_CRITICAL"
  fi
}
# Probe a Memcached server by sending "stats" over a raw TCP connection (nc).
#
# Arguments: $1 - host
#            $2 - port
#            $3 - timeout in seconds (default: 5)
# Outputs:   one result line via log_health_result
# Returns:   HEALTH_OK when a STAT line is received, HEALTH_CRITICAL otherwise.
check_memcached_connectivity() {
  local host="$1"
  local port="$2"
  local timeout="${3:-5}"

  # Test Memcached connectivity. "quit" makes the server close the connection
  # so nc returns promptly. The reply is captured and nc/timeout's exit status
  # ignored: under `set -o pipefail` a non-zero status would otherwise fail
  # the check even though the statistics arrived.
  local response
  response=$(printf 'stats\r\nquit\r\n' | timeout "$timeout" nc "$host" "$port" 2>/dev/null) || true

  if [[ "$response" == *"STAT"* ]]; then
    log_health_result "cache:memcached" "OK" "Memcached connection successful"
    return "$HEALTH_OK"
  else
    log_health_result "cache:memcached" "CRITICAL" "Memcached connection failed"
    return "$HEALTH_CRITICAL"
  fi
}
|
📊 Resource Health Checks
System Resource Monitoring
# resource-healthchecks.sh - System resource health checks
# Disk space check
#
# Compare the used-space percentage that df reports for a path with the
# warning and critical thresholds.
#
# Arguments: $1 - path on the filesystem to inspect (default: /)
#            $2 - warning threshold in percent (default: 80)
#            $3 - critical threshold in percent (default: 90)
# Outputs:   one result line via log_health_result
# Returns:   HEALTH_CRITICAL / HEALTH_WARNING / HEALTH_OK by threshold, or
#            HEALTH_UNKNOWN when the usage cannot be determined.
check_disk_space() {
  local path="${1:-/}"
  local warning_threshold="${2:-80}"
  local critical_threshold="${3:-90}"

  # -P forces the POSIX one-line-per-filesystem layout; without it a long
  # device name wraps onto its own line and column 5 is no longer the usage.
  local usage
  usage=$(LC_ALL=C df -P -- "$path" 2>/dev/null \
    | awk 'NR==2 { sub(/%/, "", $5); print $5 }') || usage=""

  # A failed df (e.g. missing path) must not fall through to "OK".
  if ! [[ "$usage" =~ ^[0-9]+$ ]]; then
    log_health_result "disk:$path" "UNKNOWN" "Unable to determine disk usage"
    return "$HEALTH_UNKNOWN"
  fi

  if [ "$usage" -ge "$critical_threshold" ]; then
    log_health_result "disk:$path" "CRITICAL" "Disk usage ${usage}% exceeds critical threshold $critical_threshold%"
    return "$HEALTH_CRITICAL"
  elif [ "$usage" -ge "$warning_threshold" ]; then
    log_health_result "disk:$path" "WARNING" "Disk usage ${usage}% exceeds warning threshold $warning_threshold%"
    return "$HEALTH_WARNING"
  else
    log_health_result "disk:$path" "OK" "Disk usage ${usage}% within limits"
    return "$HEALTH_OK"
  fi
}
# Memory usage check
#
# Compute used/total from the second line of `free` (the "Mem:" row) and
# compare it with the warning and critical thresholds.
#
# Arguments: $1 - warning threshold in percent (default: 80)
#            $2 - critical threshold in percent (default: 90)
# Outputs:   one result line via log_health_result
# Returns:   HEALTH_CRITICAL / HEALTH_WARNING / HEALTH_OK by threshold, or
#            HEALTH_UNKNOWN when the usage cannot be determined.
check_memory_usage() {
  local warning_threshold="${1:-80}"
  local critical_threshold="${2:-90}"

  # `$2 > 0` guards the division; nothing is printed for unusable output.
  local memory_usage
  memory_usage=$(LC_ALL=C free 2>/dev/null \
    | awk 'NR==2 && $2 > 0 { printf "%.0f", $3 * 100 / $2 }') || memory_usage=""

  # A missing or failing `free` must not fall through to "OK".
  if ! [[ "$memory_usage" =~ ^[0-9]+$ ]]; then
    log_health_result "memory" "UNKNOWN" "Unable to determine memory usage"
    return "$HEALTH_UNKNOWN"
  fi

  if [ "$memory_usage" -ge "$critical_threshold" ]; then
    log_health_result "memory" "CRITICAL" "Memory usage ${memory_usage}% exceeds critical threshold $critical_threshold%"
    return "$HEALTH_CRITICAL"
  elif [ "$memory_usage" -ge "$warning_threshold" ]; then
    log_health_result "memory" "WARNING" "Memory usage ${memory_usage}% exceeds warning threshold $warning_threshold%"
    return "$HEALTH_WARNING"
  else
    log_health_result "memory" "OK" "Memory usage ${memory_usage}% within limits"
    return "$HEALTH_OK"
  fi
}
# CPU usage check
#
# Sample CPU idle time with vmstat and compare the resulting usage with the
# warning and critical thresholds.
#
# Arguments: $1 - warning threshold in percent (default: 80)
#            $2 - critical threshold in percent (default: 90)
#            $3 - number of one-second vmstat reports to take (default: 5)
# Outputs:   one result line via log_health_result
# Returns:   HEALTH_CRITICAL / HEALTH_WARNING / HEALTH_OK by threshold, or
#            HEALTH_UNKNOWN when no sample could be read.
check_cpu_usage() {
  local warning_threshold="${1:-80}"
  local critical_threshold="${2:-90}"
  local sample_duration="${3:-5}" # seconds

  # Get CPU usage over sample period.
  #  - The idle column is located through the "id" header (column 15 is the
  #    fallback) instead of being assumed.
  #  - The average divides by the number of samples, not by NR, which also
  #    counts the two header lines.
  #  - vmstat's first report is the average since boot, so it is dropped
  #    whenever a later sample is available.
  #  - awk truncates to an integer; no bc is needed.
  local cpu_usage
  cpu_usage=$(LC_ALL=C vmstat 1 "$sample_duration" 2>/dev/null | awk '
    BEGIN { idle_col = 15 }
    NR == 2 { for (i = 1; i <= NF; i++) if ($i == "id") idle_col = i; next }
    NR > 2 && NF >= idle_col {
      if (n == 0) first = $idle_col
      sum += $idle_col
      n++
    }
    END {
      if (n > 1) { sum -= first; n-- }
      if (n == 0) exit 1
      printf "%d", 100 - sum / n
    }') || cpu_usage=""

  # A missing or failing vmstat must not fall through to "OK".
  if ! [[ "$cpu_usage" =~ ^[0-9]+$ ]]; then
    log_health_result "cpu" "UNKNOWN" "Unable to determine CPU usage"
    return "$HEALTH_UNKNOWN"
  fi

  if [ "$cpu_usage" -ge "$critical_threshold" ]; then
    log_health_result "cpu" "CRITICAL" "CPU usage ${cpu_usage}% exceeds critical threshold $critical_threshold%"
    return "$HEALTH_CRITICAL"
  elif [ "$cpu_usage" -ge "$warning_threshold" ]; then
    log_health_result "cpu" "WARNING" "CPU usage ${cpu_usage}% exceeds warning threshold $warning_threshold%"
    return "$HEALTH_WARNING"
  else
    log_health_result "cpu" "OK" "CPU usage ${cpu_usage}% within limits"
    return "$HEALTH_OK"
  fi
}
# Load average check
#
# Compare the 1-minute load average reported by uptime with the warning and
# critical thresholds. Thresholds may be fractional (e.g. 1.5).
#
# Arguments: $1 - warning threshold (default: number of CPUs from nproc)
#            $2 - critical threshold (default: twice the number of CPUs)
# Outputs:   one result line via log_health_result
# Returns:   HEALTH_CRITICAL / HEALTH_WARNING / HEALTH_OK by threshold, or
#            HEALTH_UNKNOWN when the load or a threshold is not a number.
check_load_average() {
  local warning_threshold="${1:-}"
  local critical_threshold="${2:-}"

  # Only ask for the CPU count when a default is actually needed.
  if [ -z "$warning_threshold" ] || [ -z "$critical_threshold" ]; then
    local cores
    cores=$(nproc 2>/dev/null) || cores=""
    if ! [[ "$cores" =~ ^[0-9]+$ ]]; then
      cores=1
    fi
    warning_threshold="${warning_threshold:-$cores}"
    critical_threshold="${critical_threshold:-$(( cores * 2 ))}"
  fi

  # LC_ALL=C keeps the decimal separator a dot; the field separator also
  # accepts the "load averages:" spelling some systems print.
  local load_avg
  load_avg=$(LC_ALL=C uptime 2>/dev/null \
    | awk -F'load averages?: *' 'NF > 1 { split($2, parts, /[ ,]+/); print parts[1] }') || load_avg=""

  local number_re='^[0-9]+([.][0-9]+)?$'
  if ! [[ "$load_avg" =~ $number_re ]]; then
    log_health_result "load" "UNKNOWN" "Unable to determine load average"
    return "$HEALTH_UNKNOWN"
  fi
  if ! [[ "$warning_threshold" =~ $number_re && "$critical_threshold" =~ $number_re ]]; then
    log_health_result "load" "UNKNOWN" "Invalid load thresholds: $warning_threshold / $critical_threshold"
    return "$HEALTH_UNKNOWN"
  fi

  # Floating-point comparison in awk: no bc, and fractional thresholds work.
  if awk -v l="$load_avg" -v t="$critical_threshold" 'BEGIN { exit !(l + 0 >= t + 0) }'; then
    log_health_result "load" "CRITICAL" "Load average $load_avg exceeds critical threshold $critical_threshold"
    return "$HEALTH_CRITICAL"
  elif awk -v l="$load_avg" -v t="$warning_threshold" 'BEGIN { exit !(l + 0 >= t + 0) }'; then
    log_health_result "load" "WARNING" "Load average $load_avg exceeds warning threshold $warning_threshold"
    return "$HEALTH_WARNING"
  else
    log_health_result "load" "OK" "Load average $load_avg within limits"
    return "$HEALTH_OK"
  fi
}
|
🔗 Dependency Health Checks
External Service Dependencies
# dependency-healthchecks.sh - Dependency health checks
# DNS resolution check
#
# Resolve a hostname with nslookup and optionally verify an expected address.
#
# Arguments: $1 - hostname to resolve
#            $2 - expected IP address (optional); the check passes when ANY
#                 of the returned addresses equals it
#            $3 - lookup timeout in seconds (default: 10)
# Outputs:   one result line via log_health_result
# Returns:   HEALTH_CRITICAL when nothing resolves, HEALTH_WARNING when the
#            expected address is not among the answers, HEALTH_OK otherwise.
check_dns_resolution() {
  local hostname="$1"
  local expected_ip="${2:-}"
  local timeout="${3:-10}"

  # nslookup exits non-zero for NXDOMAIN; the output decides the result.
  local lookup_output
  lookup_output=$(timeout "$timeout" nslookup "$hostname" 2>/dev/null) || true

  # Collect every answer address. Lines carrying "#port" describe the
  # resolver itself and are skipped.
  local -a addresses=()
  local address
  while IFS= read -r address; do
    if [ -n "$address" ]; then
      addresses+=("$address")
    fi
  done < <(awk '/^Address: / && $2 !~ /#/ { print $2 }' <<< "$lookup_output")

  if [ "${#addresses[@]}" -eq 0 ]; then
    log_health_result "dns:$hostname" "CRITICAL" "DNS resolution failed"
    return "$HEALTH_CRITICAL"
  fi

  if [ -z "$expected_ip" ]; then
    # No expectation: report the last address, as before.
    log_health_result "dns:$hostname" "OK" "DNS resolved to ${addresses[${#addresses[@]}-1]}"
    return "$HEALTH_OK"
  fi

  # Hosts with several records (round robin, IPv4 + IPv6) return more than
  # one address; comparing only the last one produced false warnings.
  for address in "${addresses[@]}"; do
    if [ "$address" = "$expected_ip" ]; then
      log_health_result "dns:$hostname" "OK" "DNS resolved to $address"
      return "$HEALTH_OK"
    fi
  done

  local resolved_list="${addresses[0]}"
  for address in "${addresses[@]:1}"; do
    resolved_list+=", $address"
  done
  log_health_result "dns:$hostname" "WARNING" "DNS resolved to $resolved_list, expected $expected_ip"
  return "$HEALTH_WARNING"
}
# Network connectivity check
#
# Attempt a TCP connection with `nc -z` inside a time limit.
#
# Arguments: $1 - target host or address
#            $2 - port (default: 80)
#            $3 - timeout in seconds (default: 10)
# Outputs:   one result line via log_health_result
# Returns:   HEALTH_OK when the connection succeeds, HEALTH_CRITICAL otherwise.
check_network_connectivity() {
  local target="$1"
  local port="${2:-80}"
  local limit="${3:-10}"
  local label="network:$target:$port"

  # Failure first: an unsuccessful or timed-out probe is critical.
  if ! timeout "$limit" nc -z "$target" "$port" 2>/dev/null; then
    log_health_result "$label" "CRITICAL" "Network connectivity failed"
    return $HEALTH_CRITICAL
  fi

  log_health_result "$label" "OK" "Network connectivity successful"
  return $HEALTH_OK
}
# TLS certificate check
#
# Fetch a server certificate with openssl and report how long it remains
# valid. Date parsing relies on `date -d` (GNU date).
#
# Arguments: $1 - hostname (or IP address) to connect to
#            $2 - port (default: 443)
#            $3 - warn when fewer than this many days remain (default: 30)
#            $4 - connection timeout in seconds (default: 10)
# Outputs:   one result line via log_health_result
# Returns:   HEALTH_CRITICAL when the certificate cannot be retrieved or has
#            expired, HEALTH_WARNING when it expires within the warning window,
#            HEALTH_UNKNOWN when the expiry date cannot be parsed,
#            HEALTH_OK otherwise.
check_tls_certificate() {
  local hostname="$1"
  local port="${2:-443}"
  local warning_days="${3:-30}"
  local connect_timeout="${4:-10}"

  # Send SNI for names so that virtual hosts return the right certificate;
  # IP literals are left alone.
  local -a sni_args=()
  if ! [[ "$hostname" =~ ^[0-9.]+$ ]] && [[ "$hostname" != *:* ]]; then
    sni_args=(-servername "$hostname")
  fi

  # `|| true` keeps whatever date was captured even if a pipeline stage
  # exits non-zero under `set -o pipefail`.
  local cert_expiry
  cert_expiry=$(
    timeout "$connect_timeout" openssl s_client -connect "$hostname:$port" \
      ${sni_args[@]+"${sni_args[@]}"} </dev/null 2>/dev/null \
      | openssl x509 -noout -enddate 2>/dev/null \
      | cut -d'=' -f2
  ) || true

  if [ -z "$cert_expiry" ]; then
    log_health_result "tls:$hostname" "CRITICAL" "Unable to retrieve certificate information"
    return "$HEALTH_CRITICAL"
  fi

  # An unparseable date used to become an empty operand below and produced a
  # bogus "expired" result; report it as UNKNOWN instead.
  local expiry_date_seconds
  expiry_date_seconds=$(date -d "$cert_expiry" +%s 2>/dev/null) || expiry_date_seconds=""
  if ! [[ "$expiry_date_seconds" =~ ^-?[0-9]+$ ]]; then
    log_health_result "tls:$hostname" "UNKNOWN" "Unable to parse certificate expiry date: $cert_expiry"
    return "$HEALTH_UNKNOWN"
  fi

  local current_date_seconds
  current_date_seconds=$(date +%s)
  local seconds_until_expiry=$(( expiry_date_seconds - current_date_seconds ))
  local days_until_expiry=$(( seconds_until_expiry / 86400 ))

  # Compare seconds, not truncated days: a certificate that expired less than
  # 24 hours ago has days_until_expiry == 0 and must still be CRITICAL.
  if [ "$seconds_until_expiry" -le 0 ]; then
    log_health_result "tls:$hostname" "CRITICAL" "Certificate expired $(( -days_until_expiry )) days ago"
    return "$HEALTH_CRITICAL"
  elif [ "$days_until_expiry" -lt "$warning_days" ]; then
    log_health_result "tls:$hostname" "WARNING" "Certificate expires in $days_until_expiry days"
    return "$HEALTH_WARNING"
  else
    log_health_result "tls:$hostname" "OK" "Certificate valid for $days_until_expiry days"
    return "$HEALTH_OK"
  fi
}
|
🎨 Advanced Health Check Features
Composite Health Checks
# composite-healthchecks.sh - Combined health assessments
# Helper for check_system_health: executes the command words currently stored
# in _HEALTHCHECK_PENDING_CMD. It gives run_healthcheck_with_timeout a single
# function name to call while the real check keeps its arguments.
_healthcheck_run_pending() {
  "${_HEALTHCHECK_PENDING_CMD[@]}"
}

# Overall system health
#
# Run every check through run_healthcheck_with_timeout (30s each) and fold the
# individual results into one status. Severity order for the overall result
# is CRITICAL > UNKNOWN > WARNING > OK.
#
# Arguments: $@ - check command strings, each "function [arg ...]" with
#                 whitespace-separated arguments
# Outputs:   one line per check plus an overall summary on stdout
# Returns:   HEALTH_OK, HEALTH_WARNING, HEALTH_CRITICAL or HEALTH_UNKNOWN
check_system_health() {
  local checks=("$@")
  local overall_status=$HEALTH_OK
  local failed_checks=()
  local warning_checks=()
  local unknown_checks=()
  local check check_name result

  echo "Running system health checks..."
  for check in ${checks[@]+"${checks[@]}"}; do
    _HEALTHCHECK_PENDING_CMD=()
    read -r -a _HEALTHCHECK_PENDING_CMD <<< "$check"
    if [ "${#_HEALTHCHECK_PENDING_CMD[@]}" -eq 0 ]; then
      continue
    fi
    check_name="${_HEALTHCHECK_PENDING_CMD[0]}"

    # Run health check. The status is captured with `|| result=$?`: reading
    # $? inside an `if cmd; then` branch always yields 0, which made every
    # WARNING/CRITICAL result look like a timeout.
    result=0
    run_healthcheck_with_timeout "$check_name" _healthcheck_run_pending 30 || result=$?

    case "$result" in
      "$HEALTH_OK")
        echo "✓ $check_name: OK"
        ;;
      "$HEALTH_WARNING")
        echo "⚠ $check_name: WARNING"
        warning_checks+=("$check_name")
        if [ "$overall_status" -eq "$HEALTH_OK" ]; then
          overall_status=$HEALTH_WARNING
        fi
        ;;
      "$HEALTH_CRITICAL")
        echo "✗ $check_name: CRITICAL"
        failed_checks+=("$check_name")
        overall_status=$HEALTH_CRITICAL
        ;;
      *)
        # HEALTH_UNKNOWN (which is also what a timed-out check returns) and
        # any unexpected exit status. Never downgrades a CRITICAL result.
        echo "? $check_name: UNKNOWN"
        unknown_checks+=("$check_name")
        if [ "$overall_status" -ne "$HEALTH_CRITICAL" ]; then
          overall_status=$HEALTH_UNKNOWN
        fi
        ;;
    esac
  done
  unset _HEALTHCHECK_PENDING_CMD

  # Report overall status
  case "$overall_status" in
    "$HEALTH_OK")
      echo "Overall system health: OK"
      return "$HEALTH_OK"
      ;;
    "$HEALTH_WARNING")
      echo "Overall system health: WARNING"
      echo "Warnings: ${warning_checks[*]}"
      return "$HEALTH_WARNING"
      ;;
    "$HEALTH_CRITICAL")
      echo "Overall system health: CRITICAL"
      echo "Failed checks: ${failed_checks[*]}"
      return "$HEALTH_CRITICAL"
      ;;
    *)
      echo "Overall system health: UNKNOWN"
      echo "Unknown checks: ${unknown_checks[*]}"
      return "$HEALTH_UNKNOWN"
      ;;
  esac
}
# Kubernetes-style readiness probe
#
# Run every required check; the probe passes only when all of them succeed.
#
# Arguments: $@ - check command strings, each evaluated as shell code
# Outputs:   "Readiness probe: PASS", or "Readiness probe: FAIL" followed by
#            the list of failed checks (plus whatever the checks print)
# Returns:   0 when every check succeeds, 1 otherwise
k8s_readiness_probe() {
  local required_checks=("$@")
  local failed_checks=()
  # Keep the loop variable local so the caller's own `check` is not clobbered.
  local check

  for check in ${required_checks[@]+"${required_checks[@]}"}; do
    # NOTE(review): eval executes each entry as shell code. That is fine for
    # check strings from a trusted configuration file; never pass
    # user-supplied text here.
    if ! eval "$check"; then
      failed_checks+=("$check")
    fi
  done

  if [ "${#failed_checks[@]}" -eq 0 ]; then
    echo "Readiness probe: PASS"
    return 0
  else
    echo "Readiness probe: FAIL"
    echo "Failed checks: ${failed_checks[*]}"
    return 1
  fi
}
# Kubernetes-style liveness probe
#
# Run every critical check; the probe passes only when all of them succeed.
#
# Arguments: $@ - check command strings, each evaluated as shell code
# Outputs:   "Liveness probe: PASS", or "Liveness probe: FAIL" followed by
#            the list of failed checks (plus whatever the checks print)
# Returns:   0 when every check succeeds, 1 otherwise
k8s_liveness_probe() {
  local critical_checks=("$@")
  local failed_checks=()
  # Keep the loop variable local so the caller's own `check` is not clobbered.
  local check

  for check in ${critical_checks[@]+"${critical_checks[@]}"}; do
    # NOTE(review): eval executes each entry as shell code. That is fine for
    # check strings from a trusted configuration file; never pass
    # user-supplied text here.
    if ! eval "$check"; then
      failed_checks+=("$check")
    fi
  done

  if [ "${#failed_checks[@]}" -eq 0 ]; then
    echo "Liveness probe: PASS"
    return 0
  else
    echo "Liveness probe: FAIL"
    echo "Critical failures: ${failed_checks[*]}"
    return 1
  fi
}
|
🧾 Summary Best Practices
Health Check Implementation Guidelines
- Layered Approach: Check process, functional, resource, and dependency health
- Timeout Protection: Always implement timeouts to prevent hanging checks
- Clear Status Codes: Use standardized return codes for monitoring systems
- Detailed Logging: Provide actionable information in health check results
- Performance Impact: Minimize resource usage of health checks themselves
- Security Considerations: Don't expose sensitive information in health checks
- Regular Testing: Test health checks regularly to ensure they work correctly
Sample Health Check Configuration
# healthcheck-config.sh - Example health check configuration
# Process checks: patterns matched against running command lines. These are
# process names, not commands.
PROCESS_CHECKS=(
  "nginx"
  "postgresql"
  "redis-server"
)

# Functional checks
# NOTE(review): the credentials in the connection string are placeholders;
# load real ones from the environment or a secrets store instead of
# committing them here.
FUNCTIONAL_CHECKS=(
  "check_http_service http://localhost:80 200"
  "check_database_connectivity postgresql postgres://user:pass@localhost/db"
  "check_cache_service redis localhost 6379"
)

# Resource checks. The CPU count is looked up once; 1 is used when nproc is
# unavailable.
HEALTHCHECK_CPU_COUNT=$(nproc 2>/dev/null) || HEALTHCHECK_CPU_COUNT=1
RESOURCE_CHECKS=(
  "check_disk_space / 80 90"
  "check_memory_usage 80 90"
  "check_cpu_usage 80 90"
  "check_load_average ${HEALTHCHECK_CPU_COUNT} $(( HEALTHCHECK_CPU_COUNT * 2 ))"
)

# Dependency checks
DEPENDENCY_CHECKS=(
  "check_dns_resolution google.com"
  "check_network_connectivity 8.8.8.8 53"
  "check_tls_certificate example.com 443 30"
)

# Every entry of a composite list is executed as a command, so the bare
# process names are wrapped in check_process_running first. Listing "nginx"
# directly would try to launch nginx instead of checking it.
PROCESS_CHECK_COMMANDS=()
for healthcheck_process in "${PROCESS_CHECKS[@]}"; do
  PROCESS_CHECK_COMMANDS+=("check_process_running ${healthcheck_process}")
done
unset healthcheck_process

# Composite check
ALL_CHECKS=("${PROCESS_CHECK_COMMANDS[@]}" "${FUNCTIONAL_CHECKS[@]}"
  "${RESOURCE_CHECKS[@]}" "${DEPENDENCY_CHECKS[@]}")

# Check lists for the k8s-readiness / k8s-liveness modes of the main script.
# Example defaults - adjust to what "ready" and "alive" mean for the service.
READINESS_CHECKS=("${FUNCTIONAL_CHECKS[@]}")
LIVENESS_CHECKS=("${PROCESS_CHECK_COMMANDS[@]}")
|
🧠 Complete Health Check Script
#!/bin/bash
# comprehensive-healthcheck.sh - Production-ready health checking
#
# Usage: comprehensive-healthcheck.sh [MODE]
#   MODE is one of: process, functional, resource, dependency,
#   k8s-readiness, k8s-liveness, full (default; also used for any other value).
#
# Environment:
#   HEALTHCHECK_CONFIG - path of the configuration file defining the *_CHECKS
#                        arrays (default: healthcheck-config.sh beside this
#                        script). The file is optional.
set -euo pipefail

# Directory containing this script, so the libraries load no matter which
# directory the script is started from.
HEALTHCHECK_SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Source health check libraries. A copy beside this script is preferred;
# otherwise the bare name is sourced, which is the original lookup.
for healthcheck_lib in \
  healthcheck-framework.sh \
  process-healthchecks.sh \
  functional-healthchecks.sh \
  resource-healthchecks.sh \
  dependency-healthchecks.sh \
  composite-healthchecks.sh; do
  if [ -r "$HEALTHCHECK_SCRIPT_DIR/$healthcheck_lib" ]; then
    # shellcheck source=/dev/null
    source "$HEALTHCHECK_SCRIPT_DIR/$healthcheck_lib"
  else
    # shellcheck source=/dev/null
    source "$healthcheck_lib"
  fi
done
unset healthcheck_lib

# Load the check lists (PROCESS_CHECKS, FUNCTIONAL_CHECKS, ALL_CHECKS, ...).
# Without this the arrays used below are unbound under `set -u`.
HEALTHCHECK_CONFIG="${HEALTHCHECK_CONFIG:-$HEALTHCHECK_SCRIPT_DIR/healthcheck-config.sh}"
if [ -r "$HEALTHCHECK_CONFIG" ]; then
  # shellcheck source=/dev/null
  source "$HEALTHCHECK_CONFIG"
fi

# Run one group of checks. A dedicated check_<group>_health function is used
# when one has been sourced; otherwise the given check command strings are
# passed to check_system_health.
run_check_group() {
  local group="$1"
  shift
  if declare -F "check_${group}_health" >/dev/null; then
    "check_${group}_health"
  else
    check_system_health "$@"
  fi
}

# Main health check function
#
# Arguments: $1 - mode (see usage above; default: full)
# Returns:   the status of the selected check group or probe
main() {
  local mode="${1:-full}"

  # The ${ARR[@]+"${ARR[@]}"} form expands to nothing for an unset or empty
  # array instead of failing under `set -u`.
  case "$mode" in
    process)
      # PROCESS_CHECKS holds process names; turn them into check commands.
      local -a process_commands=()
      local process_name
      for process_name in ${PROCESS_CHECKS[@]+"${PROCESS_CHECKS[@]}"}; do
        process_commands+=("check_process_running $process_name")
      done
      run_check_group process ${process_commands[@]+"${process_commands[@]}"}
      ;;
    functional)
      run_check_group functional ${FUNCTIONAL_CHECKS[@]+"${FUNCTIONAL_CHECKS[@]}"}
      ;;
    resource)
      run_check_group resource ${RESOURCE_CHECKS[@]+"${RESOURCE_CHECKS[@]}"}
      ;;
    dependency)
      run_check_group dependency ${DEPENDENCY_CHECKS[@]+"${DEPENDENCY_CHECKS[@]}"}
      ;;
    k8s-readiness)
      k8s_readiness_probe ${READINESS_CHECKS[@]+"${READINESS_CHECKS[@]}"}
      ;;
    k8s-liveness)
      k8s_liveness_probe ${LIVENESS_CHECKS[@]+"${LIVENESS_CHECKS[@]}"}
      ;;
    full | *)
      check_system_health ${ALL_CHECKS[@]+"${ALL_CHECKS[@]}"}
      ;;
  esac
}

# Run main function
main "$@"
|
🧾 See Also