Skip to content

🤖 Shell as Supervisor Anti-Patterns

Using shell scripts as process supervisors or service managers is problematic because shells lack the robust process management, monitoring, and recovery capabilities required for reliable service supervision. This page explains why dedicated supervisor tools are preferable.


🎯 Core Problems

Inadequate Process Management

Shells provide limited process control compared to dedicated supervisors.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# ❌ Anti-pattern: Shell-based process supervision
# Runs a command in the background and keeps relaunching it for as long as it
# exits with a non-zero status. Deliberately minimal: this is the anti-pattern
# being illustrated.
#
# Arguments:
#   $1 - command line to run (expanded unquoted, so it is word-split)
#   $2 - service name used in the progress messages
# Outputs:
#   Progress messages on stdout.
# Returns:
#   Only once the command has exited with status 0.
supervise_service() {
    local service_cmd="$1"
    local service_name="$2"
    local child_pid
    local status

    while :; do
        echo "Starting $service_name..."

        # Launch as a background job, then block until that one job ends.
        $service_cmd &
        child_pid=$!
        wait $child_pid
        status=$?

        echo "$service_name exited with code $status"

        # A clean exit ends supervision.
        if [ $status -eq 0 ]; then
            echo "Normal exit, not restarting"
            break
        fi

        # Anything else: fixed delay, then go around again.
        echo "Restarting in 5 seconds..."
        sleep 5
    done
}

# Problems:
# - No resource limiting
# - No health checking
# - No graceful shutdown handling
# - No logging aggregation
# - No dependency management
# - No process isolation
# - No restart policies
# - No metrics collection

# ✅ Better approach: Use proper supervisor
# Starts a service through whichever real supervisor is installed, preferring
# systemctl and falling back to supervisorctl.
#
# Arguments:
#   $1 - service/unit name (required)
#   $2 - configuration file path; accepted for compatibility, not used
# Outputs:
#   Error messages on stderr.
# Returns:
#   The exit status of the supervisor's start command, 1 when neither
#   supervisor is found, 2 when the service name is missing.
supervise_service_proper() {
    local service_name="${1:-}"

    # Refuse to hand an empty name to the supervisor.
    if [ -z "$service_name" ]; then
        echo "Error: service name is required" >&2
        return 2
    fi

    # Use systemd, supervisor, or similar
    if command -v systemctl >/dev/null 2>&1; then
        systemctl start "$service_name"
    elif command -v supervisorctl >/dev/null 2>&1; then
        supervisorctl start "$service_name"
    else
        echo "Error: No supervisor available" >&2
        return 1
    fi
}

Poor Resource Management

Shells lack sophisticated resource control mechanisms.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# ❌ Anti-pattern: Manual resource management
# Best-effort attempt to cap resources with ulimit before running a command.
# Deliberately naive: this is the anti-pattern being illustrated.
#
# Arguments:
#   $1 - command line to run (expanded unquoted, so it is word-split)
# Returns:
#   The exit status of the command.
run_with_limits() {
    local service_cmd="$1"
    local max_memory="100M"
    local max_cpu="50%"

    # Attempt to limit resources (platform-specific); every failure is
    # swallowed, so the caller never learns whether a limit was applied.
    if command -v ulimit >/dev/null 2>&1; then
        ulimit -m "$max_memory" 2>/dev/null || :
        ulimit -u 100 2>/dev/null || :
    fi

    # Run the service in the foreground; nothing watches or enforces
    # the limits after this point.
    $service_cmd
}

# Problems:
# - ulimit limitations vary by system
# - No ongoing resource monitoring
# - No automatic killing on limit breach
# - No resource accounting
# - No fair sharing mechanisms

# ✅ Better approach: Use containerization or proper supervisors
# Applies memory and CPU limits to a systemd unit through systemctl.
#
# Arguments:
#   $1 - unit name (required)
#   $2 - value for MemoryMax (default: 100M)
#   $3 - value for CPUQuota (default: 50%)
# Outputs:
#   Error messages on stderr.
# Returns:
#   0 when the limits were applied, 1 when the unit does not report a
#   MemoryMax property, 2 when the unit name is missing; otherwise the exit
#   status of `systemctl set-property`.
run_with_limits_proper() {
    local service_name="${1:-}"
    local memory_max="${2:-100M}"
    local cpu_quota="${3:-50%}"

    if [ -z "$service_name" ]; then
        echo "Error: service name is required" >&2
        return 2
    fi

    # Use systemd resource limits. The match is anchored so that only the
    # property itself counts, not another property whose name contains it.
    if ! systemctl show "$service_name" | grep -q '^MemoryMax='; then
        echo "Error: MemoryMax is not supported for $service_name" >&2
        return 1
    fi

    # A single call sets both properties, and its status reaches the caller
    # instead of the first failure being overwritten by the second call.
    systemctl set-property "$service_name" \
        "MemoryMax=$memory_max" "CPUQuota=$cpu_quota"

    # Or use Docker with resource constraints
    # docker run --memory=100m --cpus=0.5 "$service_name"
}

🔧 Common Supervision Abuses

Flawed Restart Logic

Simple restart mechanisms lack sophistication.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# ❌ Anti-pattern: Naive restart strategy
# Runs a command in the foreground and retries it after each failure, up to a
# fixed number of attempts with a fixed pause in between. Deliberately naive:
# this is the anti-pattern being illustrated.
#
# Arguments:
#   $1 - command line to run (expanded unquoted, so it is word-split)
# Outputs:
#   Progress on stdout; a final error on stderr when attempts run out.
# Returns:
#   0 as soon as the command exits with status 0, 1 after five failures.
naive_restart_loop() {
    local service_cmd="$1"
    local limit=5
    local failures=0
    local status

    until [ $failures -ge $limit ]; do
        echo "Attempt $((failures + 1))/$limit"

        $service_cmd
        status=$?

        # Success ends the loop immediately.
        if [ $status -eq 0 ]; then
            echo "Service exited normally"
            return 0
        fi

        failures=$((failures + 1))
        echo "Service failed, restart #$failures"
        sleep 10
    done

    # Only reachable once every attempt has failed.
    echo "Maximum restart attempts reached" >&2
    return 1
}

# Problems:
# - Fixed restart limits
# - No exponential backoff
# - No failure pattern detection
# - No circuit breaker logic
# - No health-based decisions
# - No graceful degradation

# ✅ Better approach: Sophisticated restart policies
# Writes a supervisord program definition for a service and asks supervisord
# to load it.
#
# Arguments:
#   $1 - program name (required)
#   $2 - command supervisord should run; when omitted, a `service_cmd`
#        variable already set by the caller is used instead
#   $3 - directory for the generated file (default: /etc/supervisor/conf.d)
# Outputs:
#   Error messages on stderr.
# Returns:
#   0 on success, 1 when the file cannot be written, 2 when the name or the
#   command is missing; otherwise the status of the failing supervisorctl call.
smart_restart_strategy() {
    local service_name="${1:-}"
    # The right-hand side is expanded before the local is created, so a
    # caller-provided service_cmd is still honoured as a fallback.
    local service_cmd="${2:-${service_cmd:-}}"
    local conf_dir="${3:-/etc/supervisor/conf.d}"

    # An empty command would produce a "command=" line with no value.
    if [ -z "$service_name" ] || [ -z "$service_cmd" ]; then
        echo "Error: service name and command are required" >&2
        return 2
    fi

    # Use supervisor with configurable policies
    cat > "$conf_dir/$service_name.conf" << EOF || return 1
[program:$service_name]
command=$service_cmd
autostart=true
autorestart=true
startretries=3
startsecs=10
exitcodes=0,2
stopsignal=TERM
stopwaitsecs=10
stdout_logfile=/var/log/$service_name.log
stderr_logfile=/var/log/$service_name.err.log

; Restart policy
; auto_restart = unexpected
; restart_limit = 5
; restart_interval = 60
EOF

    # Only reload supervisord once the file is known to be in place.
    supervisorctl reread && supervisorctl update
}

Inadequate Health Monitoring

Shells lack built-in health checking capabilities.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# ❌ Anti-pattern: No real health monitoring
# Reports whether the process recorded in a PID file can be signalled.
# Deliberately shallow: this is the anti-pattern being illustrated.
#
# Arguments:
#   $1 - path to the PID file
# Outputs:
#   A one-line status message on stdout.
# Returns:
#   0 when `kill -0` succeeds for the recorded PID, 1 otherwise (including a
#   missing PID file).
basic_health_check() {
    local pid_file="$1"
    local pid

    if [ ! -f "$pid_file" ]; then
        echo "PID file not found"
        return 1
    fi

    pid=$(cat "$pid_file")

    # Basic process existence check: signal 0 delivers nothing and only
    # reports whether the PID could be signalled.
    if ! kill -0 "$pid" 2>/dev/null; then
        echo "Process not running"
        return 1
    fi

    echo "Process running (PID: $pid)"
    return 0
}

# Problems:
# - Only checks process existence
# - No functional health verification
# - No response time monitoring
# - No resource usage checking
# - No dependency validation
# - No synthetic transaction testing

# ✅ Better approach: Comprehensive health checking
# Checks that a service answers on its health endpoint, then warns when
# host-wide CPU or memory usage is above 80%.
#
# Arguments:
#   $1 - base URL of the service
#   $2 - health endpoint path, appended to $1 as-is
# Outputs:
#   "Service responding" on stdout; all warnings and errors on stderr.
# Returns:
#   1 when the endpoint does not respond successfully, 0 otherwise (resource
#   warnings never fail the check).
comprehensive_health_check() {
    local service_url="$1"
    local health_endpoint="$2"

    # Functional health check
    if curl -sf "$service_url$health_endpoint" >/dev/null 2>&1; then
        echo "Service responding"
    else
        echo "Service not responding" >&2
        return 1
    fi

    # Resource usage check. Splitting on ':' and ',' and stripping everything
    # but digits and dots copes with both "%Cpu(s):  1.3 us" and
    # "%Cpu(s):100.0 us" (no space once the value is three digits wide).
    # LC_ALL=C keeps the decimal separator a dot.
    local cpu_usage
    cpu_usage=$(LC_ALL=C top -bn1 2>/dev/null |
        awk -F'[:,]' '/Cpu\(s\)/ { gsub(/[^0-9.]/, "", $2); print $2; exit }')

    # Compare only when the integer part really is an integer; an empty or
    # garbled value would otherwise make the numeric test itself error out.
    if [[ "${cpu_usage%.*}" =~ ^[0-9]+$ ]]; then
        if [ "${cpu_usage%.*}" -gt 80 ]; then
            echo "Warning: High CPU usage: $cpu_usage%" >&2
        fi
    else
        echo "Warning: could not determine CPU usage" >&2
    fi

    # Memory usage check. The total must be positive to avoid dividing by
    # zero on unexpected output.
    local mem_usage
    mem_usage=$(LC_ALL=C free 2>/dev/null |
        awk '/^Mem:/ && $2 > 0 { printf "%.0f", $3 / $2 * 100; exit }')

    if [[ "$mem_usage" =~ ^[0-9]+$ ]]; then
        if [ "$mem_usage" -gt 80 ]; then
            echo "Warning: High memory usage: $mem_usage%" >&2
        fi
    else
        echo "Warning: could not determine memory usage" >&2
    fi

    return 0
}

🎨 Advanced Supervision Issues

Graceful Shutdown Handling

Shells provide inadequate signal handling for complex shutdowns.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# ❌ Anti-pattern: Poor shutdown handling
# Starts ./my-service in the background and waits for it, reacting to INT and
# TERM with a one-line trap. Kept deliberately bare: this is the anti-pattern
# being illustrated.
#
# Arguments: none
# Returns:   the status of the final `wait` on the service PID.
simple_shutdown_handler() {
    local service_pid

    # Start the service as a background job and remember its PID
    ./my-service &
    service_pid=$!

    # Basic signal handler: on INT or TERM, run `kill` (no signal named, so
    # the default one) against the service and exit the shell immediately,
    # without waiting for the service to finish. Note the trap is installed
    # only after the service has already been started.
    trap 'kill $service_pid; exit' INT TERM

    # Block until the service exits
    wait $service_pid
}

# Problems:
# - No graceful shutdown sequence
# - No timeout handling
# - No cleanup coordination
# - No dependent service management
# - No state preservation
# - No checkpointing support

# ✅ Better approach: Proper signal handling
# Runs ./my-service in the background and shuts it down gracefully when the
# shell receives INT or TERM: TERM first, then KILL once the timeout expires.
#
# Arguments: none
# Outputs:
#   Shutdown progress messages on stdout (only when a signal is handled).
# Returns:
#   The exit status of the service when it ends on its own. When a signal is
#   handled instead, the shell exits with status 0 from inside the handler.
robust_shutdown_handler() {
    local service_pid=""
    local shutdown_timeout=30
    local status

    # Signal handler. It reads service_pid and shutdown_timeout from this
    # function's scope, so it is only meaningful while this function is
    # still waiting on the service.
    cleanup() {
        echo "Received shutdown signal"

        # Send graceful shutdown signal
        if [ -n "$service_pid" ] && kill -0 "$service_pid" 2>/dev/null; then
            kill -TERM "$service_pid"

            # Wait for graceful shutdown, polling once per second
            local count=0
            while [ "$count" -lt "$shutdown_timeout" ] && kill -0 "$service_pid" 2>/dev/null; do
                sleep 1
                count=$((count + 1))
            done

            # Force kill if needed
            if kill -0 "$service_pid" 2>/dev/null; then
                echo "Force killing service"
                kill -KILL "$service_pid"
            fi
        fi

        # Cleanup resources
        rm -f /tmp/service.pid
        exit 0
    }

    # Set up signal handlers before the service exists, so no signal can
    # arrive unhandled in between.
    trap cleanup INT TERM

    # Start service
    ./my-service &
    service_pid=$!

    # Wait for service
    wait "$service_pid"
    status=$?

    # The service ended by itself. Remove the handlers so that a later signal
    # does not run cleanup with no service left to manage and exit the
    # caller's shell with status 0.
    trap - INT TERM

    return "$status"
}

Log Management Deficiencies

Shells lack sophisticated log handling capabilities.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# ❌ Anti-pattern: Basic log redirection
# Starts ./my-service in the background with stdout and stderr sent to a
# single fixed log file. Deliberately bare: this is the anti-pattern being
# illustrated.
#
# Arguments: none
poor_log_management() {
    # Simple redirection: the file is truncated on every start and both
    # streams end up interleaved in it.
    ./my-service > "/var/log/service.log" 2>&1 &

    # Missing here:
    #   - log rotation
    #   - log aggregation
    #   - structured logging
    #   - log level filtering
    #   - log shipping
}

# Problems:
# - Unbounded log growth
# - No log rotation
# - Mixed stdout/stderr
# - No log search capabilities
# - No centralized logging
# - No log retention policies

# ✅ Better approach: Proper log management
# Runs ./my-service with its output handed to real logging infrastructure:
# syslog via `logger` when available, otherwise a log file plus a matching
# logrotate policy.
#
# Arguments:
#   $1 - service name, used as syslog tag and as log/policy file name (required)
#   $2 - directory for the log file (default: /var/log)
#   $3 - directory for the logrotate policy (default: /etc/logrotate.d)
# Outputs:
#   Error messages on stderr.
# Returns:
#   2 when the service name is missing, 1 when the logrotate policy cannot be
#   written; otherwise the status of the logging pipeline (logger branch) or
#   0 (file branch, where the service keeps running in the background).
proper_log_management() {
    local service_name="${1:-}"
    local log_dir="${2:-/var/log}"
    local logrotate_dir="${3:-/etc/logrotate.d}"
    local log_file="$log_dir/$service_name.log"

    if [ -z "$service_name" ]; then
        echo "Error: service name is required" >&2
        return 2
    fi

    # Use logging infrastructure
    if command -v logger >/dev/null 2>&1; then
        # Send to syslog
        ./my-service 2>&1 | logger -t "$service_name"
    else
        # Use logrotate-compatible setup. The service keeps this file open
        # for its whole lifetime, so it is opened in append mode: after
        # logrotate truncates it, new output lands at the start of the file
        # instead of at the old offset.
        ./my-service >> "$log_file" 2>&1 &

        # Configure logrotate. copytruncate is used instead of "create"
        # because nothing tells the service to reopen its log; with "create"
        # it would keep writing to the renamed file.
        cat > "$logrotate_dir/$service_name" << EOF || return 1
$log_file {
    daily
    rotate 7
    compress
    delaycompress
    missingok
    notifempty
    copytruncate
}
EOF
    fi
}

🧾 Summary of Issues

Why Shells Make Poor Supervisors

| Limitation | Impact | Professional Solution |
| --- | --- | --- |
| Basic process control | Limited management | systemd, supervisor, k8s |
| No resource limits | Resource exhaustion | cgroups, containers |
| Poor restart logic | Service instability | Configurable policies |
| Weak health checks | Undetected failures | Comprehensive monitoring |
| Inadequate signaling | Unclean shutdowns | Proper signal handling |
| Basic logging | Operational blindness | Centralized logging |

Red Flags to Avoid

- 🚩 `while true` loops for service restarts
- 🚩 Manual process management with `&` and `wait`
- 🚩 Basic signal trapping without cleanup
- 🚩 Direct log file redirection without rotation
- 🚩 No resource limiting or monitoring
- 🚩 Simple existence checks instead of health checks


🧠 Prevention Strategies

Supervisor Selection Guide

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# Choose appropriate supervisor based on environment
# Prints a supervisor recommendation for a deployment environment.
#
# Arguments:
#   $1 - environment: dev|development, prod|production, container, legacy;
#        anything else (including nothing) gets the generic recommendation
# Outputs:
#   One recommendation line on stdout.
select_supervisor() {
    local environment="${1:-}"  # dev, prod, container, etc.

    # The short forms are accepted alongside the long ones so that the
    # documented "dev" and "prod" values no longer fall through to the default.
    case "$environment" in
        dev | development)
            echo "Use: pm2, nodemon, or simple systemd user services"
            ;;
        prod | production)
            echo "Use: systemd, supervisor, or process-compose"
            ;;
        container)
            echo "Use: Kubernetes, Docker Compose, or container-native supervisors"
            ;;
        legacy)
            echo "Use: traditional init scripts or supervisord"
            ;;
        *)
            echo "Use: systemd (most common choice)"
            ;;
    esac
}

# Supervisor configuration template
# Generates a systemd service unit for a command, then reloads systemd and
# enables the unit.
#
# Arguments:
#   $1 - service name; also used as the unit's User= and SyslogIdentifier=
#        (required)
#   $2 - command for ExecStart= (required)
#   $3 - working directory for the service
#   $4 - directory the unit file is written to (default: /etc/systemd/system)
# Outputs:
#   Error messages on stderr.
# Returns:
#   0 on success, 1 when the unit file cannot be written, 2 when the name or
#   the command is missing; otherwise the status of the failing systemctl call.
create_supervisor_config() {
    local service_name="${1:-}"
    local service_cmd="${2:-}"
    local work_dir="${3:-}"
    local unit_dir="${4:-/etc/systemd/system}"

    if [ -z "$service_name" ] || [ -z "$service_cmd" ]; then
        echo "Error: service name and command are required" >&2
        return 2
    fi

    # Stop here when the unit cannot be written; reloading and enabling
    # would otherwise act on a missing or stale file.
    cat > "$unit_dir/$service_name.service" << EOF || return 1
[Unit]
Description=$service_name Service
After=network.target

[Service]
Type=simple
User=$service_name
WorkingDirectory=$work_dir
ExecStart=$service_cmd
Restart=always
RestartSec=10
StandardOutput=journal
StandardError=journal
SyslogIdentifier=$service_name

# Resource limits
LimitNOFILE=65536
LimitNPROC=4096

# Security
NoNewPrivileges=true
PrivateTmp=true

[Install]
WantedBy=multi-user.target
EOF

    systemctl daemon-reload && systemctl enable "$service_name"
}

Health Check Framework

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# Comprehensive health checking system
# Runs a layered health check for a systemd-managed service: unit state,
# HTTP health endpoint, resource usage (placeholder) and dependency units.
#
# Arguments:
#   $1  - unit name of the service
#   $2  - health endpoint URL (default: http://localhost:8080/health)
#   $3… - unit names this service depends on; each must be active
# Outputs:
#   "Health check PASSED" on stdout, or "Health check FAILED" on stderr.
# Returns:
#   0 when every check passes, 1 otherwise.
health_check_framework() {
    local service_name="$1"
    local health_url="${2:-http://localhost:8080/health}"

    # Define health check types
    check_process() {
        systemctl is-active "$service_name" >/dev/null 2>&1
    }

    check_functional() {
        local health_url="$1"
        curl -sf "$health_url" >/dev/null 2>&1
    }

    check_resources() {
        # Check CPU, memory, disk usage.
        # The thresholds are placeholders for the real implementation and
        # are not evaluated yet; this check currently always passes.
        local cpu_threshold="${1:-80}"
        local mem_threshold="${2:-80}"
        local disk_threshold="${3:-90}"

        # Implementation here...
        return 0
    }

    check_dependencies() {
        # Passes trivially when no dependencies are given.
        local dep
        for dep in "$@"; do
            if ! systemctl is-active "$dep" >/dev/null 2>&1; then
                return 1
            fi
        done
        return 0
    }

    # Run comprehensive health check; "${@:3}" is every argument after the
    # health URL, i.e. the dependency units.
    if check_process &&
        check_functional "$health_url" &&
        check_resources &&
        check_dependencies "${@:3}"; then
        echo "Health check PASSED"
        return 0
    else
        echo "Health check FAILED" >&2
        return 1
    fi
}

🧾 See Also