🐞 Advanced Debugging and Tracing

Master sophisticated debugging techniques, tracing mechanisms, and diagnostic tools for complex shell script development and production troubleshooting.

🧭 Debugging Philosophy

Effective debugging follows a systematic, four-step cycle:
1. Observation - Gather comprehensive information
2. Hypothesis - Formulate theories about root causes
3. Experimentation - Test hypotheses with controlled changes
4. Verification - Confirm fixes and prevent regressions

Shell scripts present unique challenges due to their interpreted nature and system integration.

🧪 Built-in Debugging Features

Trace Execution Modes

# Basic tracing - print each command after expansion, prefixed with PS4 ("+ " by default)
set -x

# Verbose mode - echo each input line exactly as the shell reads it (before expansion)
set -v

# Combined tracing - raw input lines (-v) plus the expanded commands (-x)
set -xv

# Selective tracing - switch -x on only around the code under investigation.
# NOTE: whether the first echo is traced depends on whether tracing was already
# enabled earlier (it is if the examples above ran in the same shell). The
# "set +x" command itself is the last line that appears in the trace.
{
    echo "This will be traced"
    set -x
    problematic_function
    set +x
    echo "This won't be traced"
}

# Redirect trace output separately
# The descriptor must be open BEFORE BASH_XTRACEFD is assigned: bash validates
# the fd at assignment time and, if it is not open, reports "invalid value for
# trace file descriptor" and keeps writing the trace to stderr.
# Requires bash 4.1 or newer.
exec 7>debug.log
BASH_XTRACEFD=7
set -x

Conditional Debugging

# Debug flag pattern: DEBUG keeps any value it already has, otherwise "false"
: "${DEBUG:=false}"

# Write the arguments (joined into one string) to stderr with a "[DEBUG] "
# prefix, but only while DEBUG is exactly "true". Always succeeds otherwise.
debug_echo() {
    [ "$DEBUG" = true ] || return 0
    printf '[DEBUG] %s\n' "$*" >&2
}

# Usage
debug_echo "Processing file: $filename"
complex_operation
debug_echo "Operation completed"

Function-Level Tracing

# Enhanced function tracing

#######################################
# Wrap an existing shell function so that every call logs its arguments, exit
# code and captured stdout to stderr.
# Arguments:
#   $1 - name of an already-defined function (plain identifier)
# Outputs:
#   Defines original_<name> (a copy of the untraced body) and redefines <name>.
#   The wrapper prints "[TRACE] ..." lines on stderr and re-emits the wrapped
#   function's stdout.
# Returns:
#   0 on success or if the function is already traced, 1 if the function does
#   not exist, 2 if the name is not a plain identifier.
# Caveats:
#   The wrapped body runs inside a command substitution, so variable changes
#   it makes are not visible to the caller and trailing newlines of its output
#   are normalised to exactly one.
#######################################
trace_functions() {
    local func_name="$1"

    # The name is interpolated into eval'd code below, so accept identifiers only.
    if [[ ! $func_name =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
        echo "trace_functions: invalid function name: '$func_name'" >&2
        return 2
    fi

    if ! declare -F "$func_name" >/dev/null; then
        echo "trace_functions: no such function: $func_name" >&2
        return 1
    fi

    # Tracing twice would copy the wrapper into original_<name> and make the
    # wrapper call itself forever, so a second request is a no-op.
    if declare -F "original_${func_name}" >/dev/null; then
        return 0
    fi

    # Snapshot the current BODY under a new name. (A forwarding stub such as
    # "original_x() { x \"\$@\"; }" resolves x at call time, i.e. to the tracing
    # wrapper defined below, which recurses without end.)
    eval "original_$(declare -f "$func_name")"

    # Override with tracing version
    eval "
    $func_name() {
        echo \"[TRACE] Entering $func_name with args: \$*\" >&2
        local result exit_code
        result=\$(original_${func_name} \"\$@\")
        exit_code=\$?
        echo \"[TRACE] Exiting $func_name with code: \$exit_code, result: \$result\" >&2
        printf '%s\\n' \"\$result\"
        return \$exit_code
    }
    "
}

# Usage
my_function() {
    echo "Hello $1"
}

trace_functions my_function
my_function "World"

🧠 Advanced Error Diagnosis

Comprehensive Error Handler

#!/usr/bin/env bash

# Advanced error handling with full context

#######################################
# Print a detailed error report to stderr and terminate the script.
# Globals:
#   SEND_ERROR_ALERTS (read) - when "true", send_error_alert is invoked
#     (send_error_alert is expected to be defined elsewhere).
# Arguments:
#   $1 - line number where the failure happened (default: "unknown")
#   $2 - exit status of the failed command (default: 1)
#   $3 - text of the failed command
# Outputs:
#   The report (timestamp, script, call stack, selected environment variables,
#   host/user/cwd) on stderr.
# Returns:
#   Does not return; exits with the status given in $2.
#######################################
error_handler() {
    local line_number=${1:-unknown}
    local error_code=${2:-1}
    local command="${3:-}"

    # Gather context information
    local timestamp
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')

    local script_name
    script_name=$(basename -- "$0")

    # Walk the call stack; `caller N` fails once N is past the outermost frame.
    # Frames are joined with real newlines so that backslashes inside file
    # names are printed verbatim (echo -e would interpret them).
    local function_stack="" caller_info
    local frame=0
    while caller_info=$(caller "$frame" 2>/dev/null); do
        function_stack+="${caller_info}"$'\n'
        frame=$((frame + 1))
    done
    function_stack=${function_stack%$'\n'}

    # Log detailed error information
    cat >&2 <<EOF
======================================================================
ERROR REPORT - $timestamp
----------------------------------------------------------------------
Script: $script_name
Line: $line_number
Exit Code: $error_code
Failed Command: $command
----------------------------------------------------------------------
Function Call Stack:
$function_stack
----------------------------------------------------------------------
Environment Variables:
$(env | grep -E '^(DEBUG|LOG_|ERROR_)' | sort)
----------------------------------------------------------------------
System Information:
Hostname: $(hostname)
User: $(whoami)
Working Directory: $(pwd)
----------------------------------------------------------------------
EOF

    # Optional: Send alert
    if [ "${SEND_ERROR_ALERTS:-false}" = true ]; then
        send_error_alert "$script_name" "$line_number" "$error_code" "$command"
    fi

    # Exit with original error code
    exit "$error_code"
}

# Set up error trapping
trap 'error_handler "$LINENO" "$?" "$BASH_COMMAND"' ERR
set -E  # Inherit error traps in functions

Interactive Debugging Mode

# Interactive debugging interface

#######################################
# Minimal interactive prompt intended to be run from an ERR trap.
# Arguments:
#   $1 - line number of the failing command
#   $2 - text of the failing command
# Inputs:
#   Reads one-letter commands from stdin (v, s, c, q, h).
# Outputs:
#   All diagnostics go to stderr.
# Returns:
#   0 when the user chooses (c)ontinue; 1 when the user chooses (q)uit or when
#   stdin reaches end-of-file (e.g. the script runs non-interactively).
#######################################
interactive_debug() {
    local error_line=$1
    local error_cmd="$2"
    local cmd
    local frame

    echo "=== DEBUG MODE ===" >&2
    echo "Error at line $error_line: $error_cmd" >&2
    echo "Available commands:" >&2
    echo "  v - show variables" >&2
    echo "  s - show stack trace" >&2
    echo "  c - continue execution" >&2
    echo "  q - quit with error" >&2
    echo "  h - show this help" >&2

    while true; do
        # Stop on EOF: without this check a closed stdin leaves cmd empty on
        # every iteration and the loop prints the help text forever.
        if ! IFS= read -r -p "debug> " cmd; then
            echo "End of input - quitting with error..." >&2
            return 1
        fi

        case $cmd in
            v)
                echo "=== VARIABLES ===" >&2
                declare -p | grep -E '^[a-zA-Z_]' >&2
                ;;
            s)
                echo "=== STACK TRACE ===" >&2
                frame=0
                while caller "$frame" >/dev/null 2>&1; do
                    caller "$frame" >&2
                    frame=$((frame + 1))
                done
                ;;
            c)
                echo "Continuing execution..." >&2
                return 0
                ;;
            q)
                echo "Quitting with error..." >&2
                return 1
                ;;
            h|*)
                echo "Available commands: (v)ariables, (s)tack, (c)ontinue, (q)uit, (h)elp" >&2
                ;;
        esac
    done
}

# Use in development environment
# The "|| exit" is what makes the (q)uit choice actually stop the script; the
# return status of a trap handler is otherwise ignored and execution resumes.
if [ "${ENVIRONMENT:-development}" = "development" ]; then
    trap 'interactive_debug "$LINENO" "$BASH_COMMAND" || exit $?' ERR
fi

🧪 System-Level Tracing

Process Tracing with strace

# Advanced strace usage for debugging

# Trace specific system calls
# (openat is listed as well: current C libraries open files through openat(2),
# so filtering on "open" alone usually shows no file opens at all)
strace -e trace=open,openat,read,write,close ./myscript.sh

# Trace with timing information (time spent inside each call)
strace -T ./myscript.sh

# Count system calls (summary table instead of a per-call log)
strace -c ./myscript.sh

# Follow child processes - essential for shell scripts, which fork for almost
# every external command
strace -f ./myscript.sh

# Trace specific process (attach to an already-running PID)
strace -p "$PID"

# Output to file with detailed info (wall-clock timestamps plus call durations)
strace -tt -T -o trace.log ./myscript.sh

Network Tracing

# Network activity monitoring

#######################################
# Run a command while a background tcpdump records traffic on all interfaces.
# Arguments:
#   $@ - command (and its arguments) to execute during the capture
# Outputs:
#   Progress messages on stderr; the capture is written to
#   /tmp/network_trace_<shell pid>.pcap
# Returns:
#   The exit status of the executed command.
# Requires: sudo and tcpdump.
#######################################
network_tracer() {
    local pid=$$
    local capture_file="/tmp/network_trace_$pid.pcap"

    printf 'Starting network trace for PID: %s\n' "$pid" >&2

    # Capture runs in the background for the lifetime of the command
    sudo tcpdump -i any -w "$capture_file" &
    local sniffer_pid=$!

    # Run the caller's command and remember how it ended
    "$@"
    local status=$?

    # Stop the capture; failures here must not mask the command's status
    sudo kill "$sniffer_pid" 2>/dev/null || :
    wait "$sniffer_pid" 2>/dev/null || :

    printf 'Network trace saved to: %s\n' "$capture_file" >&2
    printf 'Analyze with: wireshark %s\n' "$capture_file" >&2

    return "$status"
}

# Usage
# network_tracer ./network_intensive_script.sh

🧠 Memory and Resource Debugging

Memory Usage Monitoring

# Real-time memory monitoring

#######################################
# Periodically sample /proc/<pid>/statm and print the samples as CSV on stderr
# until the process can no longer be signalled.
# Arguments:
#   $1 - PID to monitor (default: this shell)
#   $2 - sampling interval passed to sleep (default: 1)
# Outputs:
#   A banner, a CSV header and one CSV row per sample on stderr. The value
#   columns are the seven statm fields in file order
#   (size resident shared text lib data dt), all measured in pages.
# Notes:
#   Liveness is checked with `kill -0`, which also fails for live processes
#   the caller is not permitted to signal. Linux only (needs /proc).
#######################################
memory_monitor() {
    local pid=${1:-$$}
    local interval=${2:-1}
    local statm_file="/proc/${pid}/statm"
    local statm

    echo "Monitoring memory usage for PID: $pid (interval: ${interval}s)" >&2
    # Header matches the statm field order, so every row has as many columns
    # as the header.
    echo "Timestamp,PID,Size,Resident,Shared,Text,Lib,Data,Dt" >&2

    while kill -0 "$pid" 2>/dev/null; do
        if [ -f "$statm_file" ]; then
            # The process may exit between the test above and the read; fall
            # back to a row of zeros with the same number of fields.
            { IFS= read -r statm < "$statm_file"; } 2>/dev/null || statm="0 0 0 0 0 0 0"

            # statm is space separated; convert to commas to emit valid CSV.
            echo "$(date '+%Y-%m-%d %H:%M:%S'),$pid,${statm// /,}" >&2
        fi

        sleep "$interval"
    done
}

# Usage
# memory_monitor $$ 2 &
# ./memory_intensive_script.sh

File Descriptor Leak Detection

# Detect file descriptor leaks

# Print the number of entries in an fd directory (0 if it cannot be listed).
# Entry names under /proc/<pid>/fd are plain numbers, so counting lines of ls
# output is safe here.
_fd_leak_count() {
    ls -1 "$1" 2>/dev/null | wc -l
}

#######################################
# Run a command and warn when the number of open file descriptors of the
# monitored process grew noticeably while it ran.
# Usage:
#   fd_leak_detector [PID] command [args...]
# Arguments:
#   PID - optional; recognised when the first argument is all digits, in which
#         case it is removed before the command is executed. Defaults to the
#         shell executing this function (BASHPID, so it is also correct inside
#         subshells and command substitutions).
#   command [args...] - code to monitor; it runs in the current shell.
# Globals:
#   FD_LEAK_SETTLE_SECONDS (read) - pause before the final count (default: 1)
#   FD_LEAK_THRESHOLD (read)      - tolerated growth in descriptors (default: 5)
# Outputs:
#   Baseline/final counts and, on suspicion of a leak, the fd listing on stderr.
# Returns:
#   The exit status of the command (0 when no command was given).
# Notes:
#   Linux only (reads /proc/<pid>/fd).
#######################################
fd_leak_detector() {
    local pid=${BASHPID:-$$}
    # A leading numeric argument selects the PID. It must be shifted away,
    # otherwise the PID itself would be executed as the command.
    if [[ ${1-} =~ ^[0-9]+$ ]]; then
        pid=$1
        shift
    fi

    local settle=${FD_LEAK_SETTLE_SECONDS:-1}
    local threshold=${FD_LEAK_THRESHOLD:-5}
    local fd_dir="/proc/${pid}/fd"
    local baseline_fds
    local current_fds
    local exit_code=0

    if [ ! -d "$fd_dir" ]; then
        echo "Warning: $fd_dir is not available; FD counts will read as 0" >&2
    fi

    baseline_fds=$(_fd_leak_count "$fd_dir")

    echo "Baseline FD count: $baseline_fds" >&2

    # Execute monitored code. The status is captured with || so that the
    # report below is still produced when errexit or an ERR trap is active.
    if [ "$#" -gt 0 ]; then
        "$@" || exit_code=$?
    fi

    sleep "$settle"  # Allow cleanup

    current_fds=$(_fd_leak_count "$fd_dir")

    echo "Final FD count: $current_fds" >&2

    if (( current_fds > baseline_fds + threshold )); then
        echo "Warning: Potential FD leak detected!" >&2
        echo "Open FDs:" >&2
        ls -la "$fd_dir" 2>/dev/null >&2
    fi

    return "$exit_code"
}

🧪 Advanced Logging and Instrumentation

Structured Logging System

#!/usr/bin/env bash
# Advanced structured logging system

# Log levels - numeric severities, lowest (most verbose) first; read-only
readonly LOG_LEVEL_TRACE=0 LOG_LEVEL_DEBUG=1 LOG_LEVEL_INFO=2 \
         LOG_LEVEL_WARN=3 LOG_LEVEL_ERROR=4 LOG_LEVEL_FATAL=5

# Current log level - threshold below which messages are dropped.
# A value already present (e.g. from the environment) wins; default is INFO.
: "${LOG_LEVEL:=$LOG_LEVEL_INFO}"

# Log format - json, text, detailed. Default is json.
: "${LOG_FORMAT:=json}"

# JSON logging function

# Escape a string for use inside a JSON string literal: backslash, double
# quote, newline, carriage return and tab. Other control characters are passed
# through unchanged.
_log_json_escape() {
    local s=$1
    local bs='\' dq='"'
    s=${s//"$bs"/"$bs$bs"}   # must run first so later escapes are not doubled
    s=${s//"$dq"/"$bs$dq"}
    s=${s//$'\n'/"${bs}n"}
    s=${s//$'\r'/"${bs}r"}
    s=${s//$'\t'/"${bs}t"}
    printf '%s' "$s"
}

#######################################
# Emit one log record as a single-line JSON object on stdout.
# Globals:
#   LOG_LEVEL (read) - numeric threshold; records below it are dropped
#                      (treated as 2 when unset)
# Arguments:
#   $1 - level name, $2 - numeric level, $3 - message
#   $4... - extra fields as alternating key/value pairs; a trailing key with
#           no value is emitted with an empty string value
# Outputs:
#   {"timestamp":..,"level":..,"message":..,"pid":..,"script":..,<fields>}
#   All string values are JSON-escaped.
# Returns:
#   0
#######################################
log_json() {
    local level="$1"
    local level_num="$2"
    local message="$3"
    # Never shift by more than is available; a failed shift would leave the
    # fixed arguments in place and they would be logged as extra fields.
    shift "$(( $# < 3 ? $# : 3 ))"

    # Skip if below threshold
    if [ "$level_num" -lt "${LOG_LEVEL:-2}" ]; then
        return 0
    fi

    # Build JSON log entry
    local timestamp
    timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

    local fields="" key value
    while [ $# -gt 0 ]; do
        key="$1"
        value="${2-}"
        # With an odd number of arguments "shift 2" fails and shifts nothing,
        # which would loop forever; consume the lone key with a single shift.
        if [ $# -ge 2 ]; then
            shift 2
        else
            shift
        fi
        fields+=",\"$(_log_json_escape "$key")\":\"$(_log_json_escape "$value")\""
    done

    printf '{"timestamp":"%s","level":"%s","message":"%s","pid":%s,"script":"%s"%s}\n' \
        "$timestamp" \
        "$(_log_json_escape "$level")" \
        "$(_log_json_escape "$message")" \
        "$$" \
        "$(_log_json_escape "${0##*/}")" \
        "$fields"
}

# Text logging function
# Writes "[timestamp] [LEVEL] [pid] message" to stderr.
#   $1 - level name, $2 - numeric level, $3 - message
# Records whose numeric level is below LOG_LEVEL are silently dropped.
log_text() {
    local level=$1 level_num=$2 message=$3

    # Below the threshold: nothing to do
    [ "$level_num" -lt "$LOG_LEVEL" ] && return

    printf '[%s] [%s] [%s] %s\n' \
        "$(date '+%Y-%m-%d %H:%M:%S')" "$level" "$$" "$message" >&2
}

# Main logging function
# Maps a level name to its numeric severity and hands the record to the
# formatter selected by LOG_FORMAT.
#   $1 - level name (TRACE, DEBUG, INFO, WARN, ERROR, FATAL; anything else is
#        given the INFO severity)
#   $2 - message
#   $3... - extra key/value pairs (forwarded to the json formatter only)
log() {
    local level=$1 message=$2
    shift 2

    # Resolve LOG_LEVEL_<NAME> for known names, INFO for everything else
    local level_num
    case "$level" in
        TRACE|DEBUG|INFO|WARN|ERROR|FATAL)
            local level_var="LOG_LEVEL_${level}"
            level_num=${!level_var}
            ;;
        *)
            level_num=$LOG_LEVEL_INFO
            ;;
    esac

    # "json" selects the structured formatter; every other value means text
    if [ "$LOG_FORMAT" = json ]; then
        log_json "$level" "$level_num" "$message" "$@"
    else
        log_text "$level" "$level_num" "$message"
    fi
}

# Convenience functions
# One wrapper per severity; each forwards the message and any key/value
# pairs to log with the level name filled in.
log_trace() {
    log "TRACE" "$@"
}

log_debug() {
    log "DEBUG" "$@"
}

log_info() {
    log "INFO" "$@"
}

log_warn() {
    log "WARN" "$@"
}

log_error() {
    log "ERROR" "$@"
}

log_fatal() {
    log "FATAL" "$@"
}

# Usage examples
# LOG_LEVEL=$LOG_LEVEL_DEBUG LOG_FORMAT=json ./script.sh
# log_info "Starting process" "user" "$USER" "host" "$(hostname)"

🧠 Debugging Complex Workflows

State Machine Debugger

# State machine debugging framework
# state_history maps a running sequence number to "<timestamp>:<EVENT>:<state>";
# state_counter is the next free sequence number.
declare -A state_history
declare -i state_counter=0

# Record that the workflow entered state $1.
# When DEBUG_STATES is "true" the transition is also announced on stderr.
debug_state_enter() {
    local state=$1
    local timestamp
    timestamp=$(date '+%Y-%m-%d %H:%M:%S.%N')

    local entry="${timestamp}:ENTER:${state}"
    state_history[$state_counter]=$entry
    state_counter=$(( state_counter + 1 ))

    # Quiet unless state debugging was requested
    [ "${DEBUG_STATES:-false}" = true ] || return 0
    printf '[STATE] Entering: %s at %s\n' "$state" "$timestamp" >&2
}

# Record that the workflow left state $1 (appends "<timestamp>:EXIT:<state>"
# to state_history and advances state_counter).
# When DEBUG_STATES is "true" the transition is also announced on stderr.
debug_state_exit() {
    local state=$1
    local timestamp
    timestamp=$(date '+%Y-%m-%d %H:%M:%S.%N')

    local entry="${timestamp}:EXIT:${state}"
    state_history[$state_counter]=$entry
    state_counter=$(( state_counter + 1 ))

    # Quiet unless state debugging was requested
    [ "${DEBUG_STATES:-false}" = true ] || return 0
    printf '[STATE] Exiting: %s at %s\n' "$state" "$timestamp" >&2
}

# Print every recorded state transition, oldest first, to stderr.
# Reads the globals state_history and state_counter; modifies nothing.
debug_dump_state_history() {
    # Keep the loop index local: this function typically runs from an ERR
    # trap, where overwriting a caller's global "i" would corrupt whatever
    # loop happened to be interrupted.
    local i

    echo "=== STATE HISTORY ===" >&2
    for ((i = 0; i < state_counter; i++)); do
        echo "${state_history[$i]}" >&2
    done
    echo "=== END STATE HISTORY ===" >&2
}

# Usage in state machine
# Runs the three workflow phases in order, bracketing each one with
# debug_state_enter/debug_state_exit so the history shows exactly which phase
# was active when something failed. initialize_system, process_data and
# cleanup_resources are not defined in this snippet and must be provided by
# the surrounding script.
workflow_engine() {
    debug_state_enter "INIT"
    initialize_system
    debug_state_exit "INIT"

    debug_state_enter "PROCESS"
    process_data
    debug_state_exit "PROCESS"

    debug_state_enter "CLEANUP"
    cleanup_resources
    debug_state_exit "CLEANUP"
}

# Dump history on error
# NOTE: a shell has a single ERR trap; installing this one replaces any ERR
# trap that was set earlier.
trap 'debug_dump_state_history' ERR

🧪 Production Debugging Tools

Remote Debugging Agent

# Remote debugging agent for production systems
#
# SECURITY WARNING: this opens an UNAUTHENTICATED, unencrypted listener that
# hands the process list, the full environment (which often contains secrets)
# and open-file information to anyone who can reach the port. Enable it only
# on isolated or firewalled hosts and only for as long as it is needed.

#######################################
# Serve a tiny read-only diagnostic console over TCP, restarting the listener
# whenever a session ends.
# Globals:
#   REMOTE_DEBUG_PORT (read) - TCP port to listen on (default: 9999)
# Outputs:
#   Status messages on stderr; diagnostics are sent to the connected client.
# Returns:
#   Never returns (endless accept loop); run it in the background.
# Requires:
#   A netcat that supports "-l -p PORT -c COMMAND" (e.g. traditional netcat).
#######################################
remote_debug_agent() {
    local debug_port="${REMOTE_DEBUG_PORT:-9999}"

    # Start netcat listener for remote debugging
    while true; do
        echo "Remote debug agent listening on port $debug_port" >&2

        # The session script is single-quoted, so "$$" inside it would be the
        # PID of the throw-away shell netcat spawns. The PID of the script
        # being debugged is therefore handed over through the environment.
        REMOTE_DEBUG_TARGET_PID=$$ nc -l -p "$debug_port" -c '
            echo "=== REMOTE DEBUG SESSION ==="
            echo "Available commands:"
            echo "  ps     - Show processes"
            echo "  env    - Show environment"
            echo "  files  - Show open files"
            echo "  net    - Show network connections"
            echo "  quit   - Exit session"
            echo ""

            while IFS= read -r cmd; do
                # Clients such as telnet terminate lines with CRLF
                cmd=$(printf "%s" "$cmd" | tr -d "\r")
                case $cmd in
                    ps) ps aux --forest ;;
                    env) env | sort ;;
                    files) lsof -p "$REMOTE_DEBUG_TARGET_PID" ;;
                    net) netstat -tlnp ;;
                    quit) exit 0 ;;
                    *) echo "Unknown command: $cmd" ;;
                esac
                echo ""
                echo "Next command:"
            done
        ' 2>/dev/null

        sleep 5  # Restart if connection drops
    done
}

# Start agent in background
if [ "${ENABLE_REMOTE_DEBUG:-false}" = true ]; then
    remote_debug_agent &
    echo "Remote debug agent started on port ${REMOTE_DEBUG_PORT:-9999}" >&2
fi

Crash Dump Generator

# Automatic crash dump generation

#######################################
# Write a diagnostic snapshot of the running script to a freshly created file.
# Arguments:
#   $1 - name of the signal (or any label) that triggered the dump
# Outputs:
#   A dump file created with mktemp under ${TMPDIR:-/tmp} (unpredictable name,
#   readable only by the owner since it contains the full environment); its
#   path is announced on stderr. Error output of tools that are missing or
#   fail is recorded inside the dump rather than lost.
# Returns:
#   0 on success, 1 if the dump file could not be created.
#######################################
crash_dump() {
    local signal="$1"
    local timestamp
    timestamp=$(date '+%Y%m%d_%H%M%S')

    # mktemp instead of a fixed name: a predictable path in a world-writable
    # directory can be pre-created or symlinked by another user.
    local dump_file
    if ! dump_file=$(mktemp "${TMPDIR:-/tmp}/crash_dump_${timestamp}_$$.XXXXXX"); then
        echo "crash_dump: unable to create dump file" >&2
        return 1
    fi

    {
        echo "=== CRASH DUMP ==="
        echo "Signal: $signal"
        echo "Timestamp: $(date)"
        echo "Script: $0"
        echo "PID: $$"
        echo "PPID: $PPID"
        echo ""

        echo "=== ENVIRONMENT ==="
        env | sort
        echo ""

        echo "=== PROCESS INFORMATION ==="
        # Ask ps for this PID directly; grepping "ps aux" for the number also
        # matches the grep itself and any line that merely contains the digits.
        ps -f -p "$$"
        echo ""

        echo "=== OPEN FILES ==="
        lsof -p "$$"
        echo ""

        echo "=== NETWORK CONNECTIONS ==="
        netstat -tlnp
        echo ""

        echo "=== MEMORY USAGE ==="
        free -h
        echo ""

        echo "=== DISK USAGE ==="
        df -h
        echo ""
    } > "$dump_file" 2>&1

    echo "Crash dump saved to: $dump_file" >&2
    echo "Please include this file when reporting issues." >&2
}

# Install crash handlers
# After the dump each handler restores the default disposition and re-sends
# the signal to itself. Without that step the trap would swallow the signal
# and the script would simply keep running after, for example, SIGTERM.
trap 'crash_dump HUP;  trap - HUP;  kill -s HUP  "$$"' HUP
trap 'crash_dump INT;  trap - INT;  kill -s INT  "$$"' INT
trap 'crash_dump TERM; trap - TERM; kill -s TERM "$$"' TERM
trap 'crash_dump SEGV; trap - SEGV; kill -s SEGV "$$"' SEGV
trap 'crash_dump BUS;  trap - BUS;  kill -s BUS  "$$"' BUS

🧠 Debugging in Containerized Environments

Container-Aware Debugger

# Debugging tools for container environments

# Print a container-oriented diagnostic summary to stderr: detected runtime
# (by marker file), hostname, PID 1's command name, the first cgroup entries
# of PID 1, container-related mounts and environment variables.
container_debug_info() {
    # Decide the runtime message up front from the well-known marker files
    local runtime_msg="Not running in container"
    if [ -f /.dockerenv ]; then
        runtime_msg="Running in Docker container"
    elif [ -f /run/.containerenv ]; then
        runtime_msg="Running in Podman container"
    fi

    # Everything below is diagnostics, so the whole group goes to stderr
    {
        echo "=== CONTAINER DEBUG INFO ==="
        echo "$runtime_msg"

        # Container-specific information
        echo "Hostname: $(hostname)"
        echo "PID 1: $(ps -o comm= -p 1)"

        # Cgroup information
        if [ -f /proc/1/cgroup ]; then
            echo "Cgroups:"
            head -10 /proc/1/cgroup
        fi

        # Mount information
        echo "Interesting mounts:"
        mount | grep -E "(kube|docker|container)"

        # Environment variables
        echo "Container-related env vars:"
        env | grep -E "(KUBE|DOCKER|CONTAINER|POD)" | sort
    } >&2
}

# Enhanced error handler for containers

#######################################
# Produce the standard error report and, when running inside a container,
# append container diagnostics before terminating.
# Arguments:
#   $1 - line number of the failure
#   $2 - exit status of the failed command (default: 1)
#   $3 - text of the failed command
# Outputs:
#   Everything is written to stderr.
# Returns:
#   Does not return; exits with the status given in $2.
# Depends on: error_handler and container_debug_info.
#######################################
container_error_handler() {
    local line_number=$1
    local error_code=${2:-1}
    local command="$3"

    # Standard error handling
    # error_handler ends with "exit", so it is run in a subshell: only the
    # subshell terminates and the diagnostics below are still reached.
    ( error_handler "$line_number" "$error_code" "$command" ) || true

    # Container-specific diagnostics
    if [ -f /.dockerenv ] || [ -f /run/.containerenv ]; then
        echo "=== CONTAINER DIAGNOSTICS ===" >&2
        container_debug_info

        # Check resource limits (cgroup v1 path first, then cgroup v2)
        if [ -f /sys/fs/cgroup/memory/memory.limit_in_bytes ]; then
            echo "Memory limit: $(cat /sys/fs/cgroup/memory/memory.limit_in_bytes)" >&2
        elif [ -f /sys/fs/cgroup/memory.max ]; then
            echo "Memory limit: $(cat /sys/fs/cgroup/memory.max)" >&2
        fi

        # Check disk space
        echo "Disk usage:" >&2
        df -h >&2
    fi

    # Exit with original error code, as the standard handler would have done
    exit "$error_code"
}

🧾 Debugging Best Practices

Debugging Checklist

Enable Tracing Early

# Strict mode: exit on unhandled errors (-e), treat unset variables as errors
# (-u) and fail a pipeline when any stage fails (pipefail)
set -euo pipefail
# Switch on command tracing only when DEBUG is set to "true". Being the left
# side of an && list, a false test here does not trigger errexit.
[[ "${DEBUG:-false}" == true ]] && set -x

Use Structured Logging

# First argument is the message; the remaining arguments are key/value pairs
log_info "Processing item" "item_id" "$item_id" "batch" "$batch_num"

Implement Comprehensive Error Handling

# Single quotes delay expansion until the trap fires, so the handler receives
# the line number, exit status and command text of the command that failed
trap 'error_handler $LINENO $? "$BASH_COMMAND"' ERR

Monitor Resource Usage
# Print every resource limit currently in effect for this shell
ulimit -a # Check limits

Test in Production-Like Environments

# Run the test image under a 512 MB memory cap and half a CPU
docker run --memory=512m --cpus=0.5 myapp:test

Debugging Tools Summary

Tool	Purpose	Usage Example
`set -x`	Command tracing	Basic debugging
`strace`	System call tracing	Deep system analysis
`lsof`	File descriptor monitoring	FD leak detection
`tcpdump`	Network tracing	Network issue debugging
`gdb`	Process debugging	Core dump analysis
Custom logging	Application-level debugging	Production monitoring

🧾 Summary

Key Debugging Principles

Instrument Early - Add logging and tracing from the start
Context is Key - Capture comprehensive environmental information
Layered Approach - Use multiple debugging techniques together
Production-Safe - Ensure debugging tools don't impact production
Automated Response - Implement automatic crash reporting and recovery
Remote Accessibility - Enable debugging in distributed environments
Performance Awareness - Balance debugging detail with overhead

Advanced Debugging Techniques

Custom error handlers with full context
Interactive debugging interfaces
System-level tracing with strace
Memory and resource monitoring
Structured logging systems
State machine debugging
Remote debugging agents
Automatic crash dump generation
Container-aware debugging tools

Debugging Anti-Patterns to Avoid

Leaving debug output in production code
Using overly verbose logging in production
Ignoring error conditions
Not cleaning up debugging artifacts
Relying solely on printf debugging
Forgetting to test debugging tools themselves
Not considering security implications of debugging endpoints

👉 Continue to: POSIX Compatibility