Skip to content

🤐 Silent Failures Anti-Patterns

Silent failures occur when scripts don't properly report errors, leading to undetected problems, data corruption, and system instability. This guide shows the common ways scripts fail silently and how to avoid them through proper error handling and reporting.


🎯 Core Problems

Ignored Exit Codes

Failing to check command exit codes masks failures.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# ❌ Anti-pattern: Ignoring exit codes
# Anti-pattern example: dump the `mydb` database, compress the dump, and copy
# it to a remote host without checking a single exit status.
# Unless the caller has enabled `set -e`, every command runs even if the one
# before it failed, and the final message is printed unconditionally, so the
# function's return status is simply that of the last `echo`.
backup_database() {
    mysqldump mydb > backup.sql    # What if this fails?
    gzip backup.sql                # What if compression fails?
    scp backup.sql.gz server:/backups/  # What if transfer fails?

    echo "Backup completed"        # Always says success
}

# Problems:
# - Failures go unnoticed
# - Corrupt/incomplete backups
# - False sense of security
# - Difficult troubleshooting
# - Cascading failures

# ✅ Better approach: Check all exit codes
# Dump `mydb`, compress the dump, and copy it to the backup host, stopping at
# the first step that fails.
#
# Outputs: success message on stdout; one error message on stderr on failure.
# Returns: 0 when all three steps succeed, 1 as soon as any step fails.
backup_database_safe() {
    # Step 1: dump the database into backup.sql in the current directory.
    mysqldump mydb > backup.sql || {
        echo "Error: Database dump failed" >&2
        return 1
    }

    # Step 2: compress the dump (gzip replaces backup.sql with backup.sql.gz).
    gzip backup.sql || {
        echo "Error: Compression failed" >&2
        return 1
    }

    # Step 3: ship the compressed dump to the remote backup location.
    scp backup.sql.gz server:/backups/ || {
        echo "Error: Transfer failed" >&2
        return 1
    }

    echo "Backup completed successfully"
    return 0
}

Suppressed Error Output

Redirecting errors to /dev/null hides critical information.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# ❌ Anti-pattern: Suppressing errors
# Anti-pattern example: run `apt-get update` and `apt-get upgrade -y` with
# both stdout and stderr sent to /dev/null.
# NOTE: the `$?` test below sees only the status of `apt-get upgrade`; the
# status of `apt-get update` is never examined. Whatever either command
# printed is already discarded by the time the failure message is shown.
update_system() {
    apt-get update >/dev/null 2>&1      # Hide all output
    apt-get upgrade -y >/dev/null 2>&1   # Hide errors

    if [ $? -eq 0 ]; then
        echo "System updated"
    else
        echo "Update failed"  # Too late - lost error details
    fi
}

# Problems:
# - Lose diagnostic information
# - Cannot troubleshoot issues
# - Mask permission problems
# - Hide configuration errors
# - Make debugging impossible

# ✅ Better approach: Proper error handling
# Refresh the package index and upgrade installed packages, keeping the
# combined output of both steps in a log file for diagnosis.
#
# The log is created with mktemp rather than at a fixed, predictable path in
# /tmp, so another local user cannot pre-create or symlink it before this
# (typically root-run) function writes to it.
#
# Outputs: success message on stdout. On failure, an error naming the log
#          file plus the log contents (update) or its last 20 lines (upgrade)
#          on stderr.
# Returns: 0 on success, 1 if the log cannot be created or either step fails.
#          The log file is kept on failure and removed on success.
update_system_safe() {
    local log_file

    log_file=$(mktemp "${TMPDIR:-/tmp}/update.log.XXXXXX") || {
        echo "Error: Cannot create log file" >&2
        return 1
    }

    # Capture output for analysis
    if ! apt-get update >"$log_file" 2>&1; then
        echo "Error: Update failed. Check log: $log_file" >&2
        cat "$log_file" >&2
        return 1
    fi

    # Append so the update output stays available alongside the upgrade output.
    if ! apt-get upgrade -y >>"$log_file" 2>&1; then
        echo "Error: Upgrade failed. Check log: $log_file" >&2
        tail -n 20 "$log_file" >&2
        return 1
    fi

    # Nothing to diagnose: do not let per-run log files pile up.
    rm -f -- "$log_file"

    echo "System updated successfully"
    return 0
}

🔧 Common Failure Points

Pipeline Error Silencing

Pipelines can hide failures from intermediate commands.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# ❌ Anti-pattern: Pipeline error hiding
# Anti-pattern example: count and rank the ERROR lines of
# /var/log/application.log into error_summary.txt with one long pipeline.
# Nothing inspects the pipeline's status, and the completion message is
# printed unconditionally. Unless the caller has enabled `pipefail`, the
# pipeline's own status is only that of the final `sort -nr`.
process_logs() {
    cat /var/log/application.log | \
    grep "ERROR" | \
    sort | \
    uniq -c | \
    sort -nr > error_summary.txt

    echo "Log processing completed"  # Always says success
}

# Problems:
# - If cat fails, pipeline still "succeeds"
# - No way to know if intermediate steps failed
# - Incomplete or corrupt output
# - set -e alone doesn't help: without pipefail, a pipeline's status is only that of its last command

# ✅ Better approach: Explicit error checking
# Summarize the ERROR lines of a log file into a report sorted by frequency
# (most frequent first), replacing the report only when processing succeeded.
#
# Arguments:
#   $1 - log file to read      (default: /var/log/application.log)
#   $2 - report file to write  (default: error_summary.txt)
# Outputs:
#   Success message on stdout; error messages on stderr.
# Returns:
#   0 on success - including when no line matches, which yields an empty
#   report - and 1 on any failure.
#
# The function leaves the caller's shell options and traps untouched:
# pipefail is enabled only inside a subshell, and the temporary file is
# removed explicitly instead of through an EXIT trap (which would replace the
# caller's trap and fire after the local variable has gone out of scope).
process_logs_safe() {
    local input_file="${1:-/var/log/application.log}"
    local output_file="${2:-error_summary.txt}"
    local temp_file

    # Verify input file exists
    if [ ! -f "$input_file" ]; then
        echo "Error: Log file not found: $input_file" >&2
        return 1
    fi

    # Create the temporary file next to the destination so the final mv is a
    # rename on the same filesystem rather than a copy across filesystems.
    temp_file=$(mktemp "${output_file}.XXXXXX") || {
        echo "Error: Cannot create temporary file" >&2
        return 1
    }

    # grep exits 1 when nothing matched; that is a valid (empty) result, so
    # only statuses above 1 (unreadable file, bad pattern) count as failures.
    if ! (
        set -o pipefail
        { grep "ERROR" -- "$input_file" || [ "$?" -eq 1 ]; } |
            sort |
            uniq -c |
            sort -nr
    ) > "$temp_file"; then
        echo "Error: Log processing pipeline failed" >&2
        rm -f -- "$temp_file"
        return 1
    fi

    # Move result to final location
    if ! mv -- "$temp_file" "$output_file"; then
        echo "Error: Cannot move result to $output_file" >&2
        rm -f -- "$temp_file"
        return 1
    fi

    echo "Log processing completed successfully"
    return 0
}

Partial Operation Failures

Incomplete operations that appear successful.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# ❌ Anti-pattern: Partial success masking failures
# Anti-pattern example: copy a new configuration into /etc/app/ and restart
# the `app` service without checking either result.
# No backup of the previous configuration is taken and nothing verifies that
# the service is running afterwards; the completion message is printed
# unconditionally.
deploy_application() {
    # Some steps succeed, some fail
    cp config/production.conf /etc/app/  # Might fail due to permissions
    systemctl restart app                 # Might fail due to config error
    echo "Deployment completed"           # Always claims success

    # But what if the service is broken?
}

# Problems:
# - System left in inconsistent state
# - Service may be down
# - Configuration errors not detected
# - Rollback not triggered
# - Monitoring systems unaware of issues

# ✅ Better approach: Atomic operations with rollback
# Restore a configuration backup and, optionally, restart the service.
#
# Arguments:
#   $1 - backup file to restore from
#   $2 - live configuration path to restore to
#   $3 - service to restart afterwards (omit to skip the restart)
# Returns:
#   0 if every rollback step succeeded, 1 otherwise. Each failed step is
#   reported on stderr so that a broken rollback is never silent.
_deploy_application_rollback() {
    local backup_config="$1"
    local live_config="$2"
    local service_name="${3:-}"
    local status=0

    if ! cp "$backup_config" "$live_config"; then
        echo "Error: Rollback failed: cannot restore $live_config from $backup_config" >&2
        status=1
    fi

    # The restart is attempted even if the restore failed, as a best effort.
    if [ -n "$service_name" ] && ! systemctl restart "$service_name"; then
        echo "Error: Rollback failed: cannot restart $service_name" >&2
        status=1
    fi

    return "$status"
}

# Deploy config/production.conf to /etc/app/production.conf and restart the
# `app` service, rolling back to the previous configuration if any step fails.
#
# Steps: back up the live config, copy the new one, validate it with
# `app --config-test`, restart the service, confirm it is active, and finally
# delete the backup.
#
# Outputs: success message on stdout; error messages on stderr, including any
#          failure of the rollback itself.
# Returns: 0 on success, 1 on any failure (whether or not rollback worked).
#          After a rollback the backup file is left in place.
deploy_application_safe() {
    local live_config="/etc/app/production.conf"
    local backup_config="/etc/app/production.conf.backup"
    local new_config="config/production.conf"
    local service_name="app"

    # Backup current configuration
    if ! cp "$live_config" "$backup_config"; then
        echo "Error: Cannot backup current configuration" >&2
        return 1
    fi

    # Deploy new configuration
    if ! cp "$new_config" "$live_config"; then
        echo "Error: Cannot deploy new configuration" >&2
        # The copy may have been partial; the helper reports its own failures.
        _deploy_application_rollback "$backup_config" "$live_config" || true
        return 1
    fi

    # Test configuration validity before touching the running service
    if ! app --config-test; then
        echo "Error: Configuration validation failed" >&2
        _deploy_application_rollback "$backup_config" "$live_config" || true
        return 1
    fi

    # Restart service
    if ! systemctl restart "$service_name"; then
        echo "Error: Service restart failed" >&2
        # Restore the old config and try to bring the service back with it.
        _deploy_application_rollback "$backup_config" "$live_config" "$service_name" || true
        return 1
    fi

    # Verify service is running
    if ! systemctl is-active --quiet "$service_name"; then
        echo "Error: Service failed to start after deployment" >&2
        _deploy_application_rollback "$backup_config" "$live_config" "$service_name" || true
        return 1
    fi

    # Clean up backup on success
    rm -f "$backup_config"

    echo "Deployment completed successfully"
    return 0
}

🎨 Advanced Silent Failure Issues

Asynchronous Operation Failures

Background processes that fail silently.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# ❌ Anti-pattern: Background process error hiding
# Anti-pattern example: launch three worker scripts in the background and
# report success immediately.
# The PIDs (`$!`) are never recorded and nothing waits on the jobs, so this
# function has no way to learn whether a worker failed to launch or exited
# later.
start_background_workers() {
    # Start multiple background processes
    worker1.sh &  # Errors lost
    worker2.sh &  # Errors lost
    worker3.sh &  # Errors lost

    echo "Workers started"  # Always says success
}

# Problems:
# - No way to track individual worker status
# - Errors completely invisible
# - Cannot implement retry logic
# - Resource leaks possible
# - No health monitoring

# ✅ Better approach: Managed background processes
# Start worker1.sh, worker2.sh and worker3.sh in the background and monitor
# them until one exits.
#
# A worker name without a slash is resolved relative to the current directory
# for both the executable check and the launch, so the file that is checked
# is the file that is run (a bare name would otherwise be looked up in PATH).
#
# Outputs: one "Started ..." line per launched worker on stdout; errors on
#          stderr.
# Returns: 1 if any worker could not be started - workers that were already
#          launched are terminated and waited for first, so none is left
#          behind - or 1 once a running worker exits, reporting its exit
#          status. The function does not return while all workers are alive.
#          Workers still alive when another one exits are left running.
start_background_workers_safe() {
    local workers=("worker1.sh" "worker2.sh" "worker3.sh")
    local started_workers=()
    local worker_pids=()
    local failed_workers=()
    local worker worker_path pid i status all_good

    # Start workers and track PIDs
    for worker in "${workers[@]}"; do
        case "$worker" in
            */*) worker_path=$worker ;;
            *)   worker_path=./$worker ;;
        esac

        if [ -x "$worker_path" ]; then
            "$worker_path" &
            worker_pids+=("$!")
            started_workers+=("$worker")
            echo "Started $worker with PID $!"
        else
            echo "Error: Worker script not executable: $worker" >&2
            failed_workers+=("$worker")
        fi
    done

    # Check if any workers failed to start
    if [ "${#failed_workers[@]}" -gt 0 ]; then
        echo "Error: Failed to start ${#failed_workers[@]} workers" >&2
        # Do not leave a partial set of workers running unsupervised.
        if [ "${#worker_pids[@]}" -gt 0 ]; then
            for pid in "${worker_pids[@]}"; do
                kill "$pid" 2>/dev/null || true
                wait "$pid" 2>/dev/null || true
            done
        fi
        return 1
    fi

    # Poll every 10 seconds; report every worker found dead in a pass.
    while true; do
        all_good=true

        for i in "${!worker_pids[@]}"; do
            pid=${worker_pids[$i]}

            if ! kill -0 "$pid" 2>/dev/null; then
                # Reap the child to recover the status it exited with.
                status=0
                wait "$pid" 2>/dev/null || status=$?
                echo "Error: Worker ${started_workers[$i]} (PID $pid) has died (exit status $status)" >&2
                all_good=false
            fi
        done

        if [ "$all_good" = false ]; then
            echo "Error: Worker monitoring detected failures" >&2
            return 1
        fi

        sleep 10
    done
}

Conditional Logic Errors

Complex conditions that fail silently.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# ❌ Anti-pattern: Complex condition failures
# Anti-pattern example: three checks chained with `&&` inside one `if`.
# The checks are: /etc/system.conf is a regular file, `cat | grep active` on
# it produces non-empty output, and `systemctl is-active network` prints
# exactly "active". Whichever check fails, the same single message is printed
# and nothing records which one it was. Both outcomes are reported with
# `echo` on stdout and no explicit return status is set.
validate_system_state() {
    # Complex validation with silent failures
    if [ -f /etc/system.conf ] && \
       [ "$(cat /etc/system.conf | grep active)" ] && \
       [ "$(systemctl is-active network)" = "active" ]; then
        echo "System OK"
    else
        echo "System check failed"  # But why?
    fi
}

# Problems:
# - No indication which check failed
# - cat errors suppressed
# - grep errors suppressed
# - No detailed error reporting
# - Difficult to troubleshoot

# ✅ Better approach: Explicit validation with detailed reporting
# Validate the system configuration file and the network service, collecting
# every problem found instead of stopping at the first one.
#
# Outputs: "System validation passed" on stdout, or a header followed by one
#          "  - <problem>" line per failed check on stderr.
# Returns: 0 if all checks pass, 1 otherwise.
validate_system_state_safe() {
    local config_file="/etc/system.conf"
    local errors=()
    local error   # keep the reporting loop variable out of the caller's scope

    # Check configuration file: existence, then readability, then content.
    if [ ! -f "$config_file" ]; then
        errors+=("Configuration file not found: $config_file")
    elif [ ! -r "$config_file" ]; then
        errors+=("Configuration file not readable: $config_file")
    # NOTE(review): this substring match also accepts "inactive"; confirm the
    # file format and tighten the pattern if needed.
    elif ! grep -q "active" "$config_file"; then
        errors+=("Configuration not active in $config_file")
    fi

    # Check network service
    if ! systemctl is-active --quiet network; then
        errors+=("Network service not active")
    fi

    # Report results
    if [ "${#errors[@]}" -eq 0 ]; then
        echo "System validation passed"
        return 0
    fi

    echo "System validation failed:" >&2
    for error in "${errors[@]}"; do
        echo "  - $error" >&2
    done
    return 1
}

🧾 Summary of Issues

Common Silent Failure Patterns

| Pattern | Impact | Solution |
| --- | --- | --- |
| Ignored exit codes | Hidden failures | Check `$?` or use `&&`/`\|\|` |
| Suppressed errors | Lost diagnostics | Log errors appropriately |
| Pipeline failures | Incomplete results | Use `set -o pipefail` |
| Partial operations | Inconsistent state | Atomic operations with rollback |
| Background failures | Unmonitored processes | Track PIDs and status |
| Complex conditions | Unclear failures | Validate step by step |

Red Flags to Avoid

- 🚩 Commands without exit code checking
- 🚩 `>/dev/null 2>&1` without good reason
- 🚩 Long pipelines without error handling
- 🚩 Background processes without monitoring
- 🚩 Complex conditions without step validation
- 🚩 No rollback mechanisms for critical operations


🧠 Prevention Strategies

Comprehensive Error Handling Framework

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# Error handling utilities
# Install a small error-handling toolkit into the current shell.
#
# Calling this function defines log_error, log_warning, safe_execute and
# cleanup, and installs the exit/signal traps.
#
# Globals:
#   ERROR_LOG - file that errors and warnings are appended to. An existing
#               non-empty value is respected; otherwise it defaults to
#               /tmp/script_errors.log.
# Traps:
#   EXIT runs cleanup once. INT and TERM are turned into `exit 130` and
#   `exit 143`, so an interrupted script runs cleanup exactly once and ends
#   with a non-zero status instead of the status of whatever command happened
#   to run last.
error_handling_framework() {
    # Global error tracking (overridable by the caller)
    ERROR_LOG="${ERROR_LOG:-/tmp/script_errors.log}"

    # Append a timestamped ERROR line to $ERROR_LOG and echo it on stderr.
    log_error() {
        local message="$1"
        local timestamp
        timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

        echo "[$timestamp] ERROR: $message" >> "$ERROR_LOG"
        echo "ERROR: $message" >&2
    }

    # Append a timestamped WARNING line to $ERROR_LOG and echo it on stderr.
    log_warning() {
        local message="$1"
        local timestamp
        timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

        echo "[$timestamp] WARNING: $message" >> "$ERROR_LOG"
        echo "WARNING: $message" >&2
    }

    # Run a command given as a single string and log a failure.
    #   $1 - command line; it is word-split (and glob-expanded) by the shell,
    #        so arguments containing spaces cannot be passed this way
    #   $2 - human-readable description (defaults to the command itself)
    # Returns 0 if the command succeeded, 1 otherwise.
    safe_execute() {
        local command="$1"
        shift
        local description="${1:-$command}"

        echo "Executing: $description"

        # Deliberately unquoted: the string is split into command + arguments.
        if ! $command; then
            log_error "Failed: $description"
            return 1
        fi

        echo "Success: $description"
        return 0
    }

    # EXIT handler: report a failing exit status, then exit with it.
    cleanup() {
        local exit_code=$?

        if [ "$exit_code" -ne 0 ]; then
            echo "Script failed with exit code: $exit_code" >&2
            echo "Check error log: $ERROR_LOG" >&2
        fi

        # Perform cleanup actions
        # ...

        exit "$exit_code"
    }

    # Set up cleanup trap. Signals only call `exit`; the EXIT trap then does
    # the actual cleanup, so it never runs twice for one termination.
    trap cleanup EXIT
    trap 'exit 130' INT
    trap 'exit 143' TERM
}

Robust Script Template

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/bin/bash
# robust-script.sh - Template with comprehensive error handling
#
# Environment:
#   LOG_FILE - optional path of the log file. When unset or empty it defaults
#              to /var/log/<script name>.log, which normally requires root.

# Set strict mode: exit on unhandled errors (-e), treat unset variables as
# errors (-u), and make a pipeline fail if any of its stages fails (pipefail).
set -euo pipefail

# Global variables
SCRIPT_NAME=$(basename "$0")
LOG_FILE="${LOG_FILE:-/var/log/${SCRIPT_NAME}.log}"
ERROR_COUNT=0   # incremented by log_error

# Logging functions
# Print a timestamped INFO line on stdout and append it to $LOG_FILE.
#
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
# Returns:   0, even when the log file cannot be written. Under
#            `set -euo pipefail` a failing `tee` would otherwise abort the
#            whole script on the first log call whenever $LOG_FILE is not
#            writable. The failed append is not silent: bash reports the
#            failed redirection on stderr.
log_info() {
    local message="$1"
    local timestamp line
    timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
    line="[$timestamp] INFO: $message"

    printf '%s\n' "$line"
    printf '%s\n' "$line" >>"$LOG_FILE" || true
}

# Print a timestamped ERROR line on stderr, append it to $LOG_FILE, and count
# the error.
#
# Globals:   LOG_FILE (read), ERROR_COUNT (incremented)
# Arguments: $1 - message text
# Returns:   0, even when the log file cannot be written. Under
#            `set -euo pipefail` a failing `tee` would otherwise abort the
#            script before ERROR_COUNT is updated. The failed append is not
#            silent: bash reports the failed redirection on stderr.
log_error() {
    local message="$1"
    local timestamp line
    timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
    line="[$timestamp] ERROR: $message"

    printf '%s\n' "$line" >&2
    printf '%s\n' "$line" >>"$LOG_FILE" || true
    ERROR_COUNT=$((ERROR_COUNT + 1))
}

# Safe command execution
# Run a command given as a string, logging its start and its outcome.
#
# Arguments:
#   $1 - command line, evaluated with `eval`
#   $2 - human-readable description (defaults to the command itself)
# Returns:
#   0 if the command succeeded, 1 after logging the failure via log_error.
#
# The function returns instead of calling `exit`. Under `set -e` a bare call
# that fails still stops the script, while a caller that writes
# `execute_or_fail ... || true` can continue and let the log_error counter
# accumulate - which an unconditional `exit` made impossible.
#
# SECURITY: `eval` executes $1 as shell code. Pass only trusted, literal
# command strings; never interpolate external input into it.
execute_or_fail() {
    local command="$1"
    local description="${2:-$command}"

    log_info "Executing: $description"

    if ! eval "$command"; then
        log_error "Failed: $description"
        return 1
    fi

    log_info "Completed: $description"
}

# Main function with error handling
# Script entry point: log the start, run the example steps, then report the
# overall outcome based on the number of errors logged so far.
#
# Globals: SCRIPT_NAME (read), ERROR_COUNT (read)
# Exits with status 1 when ERROR_COUNT is greater than zero.
main() {
    log_info "Starting script: $SCRIPT_NAME"

    # Example steps - replace with the real work of the script.
    # The second step tolerates failure through `|| true`.
    execute_or_fail "echo 'Hello World'" "Print greeting"
    execute_or_fail "ls /nonexistent" "List nonexistent directory" || true

    # Any error logged so far turns the run into a failure.
    if (( ERROR_COUNT > 0 )); then
        log_error "Script completed with $ERROR_COUNT errors"
        exit 1
    fi

    log_info "Script completed successfully"
}

# Entry point: invoke main, forwarding all command-line arguments unchanged.
main "$@"

🧾 See Also