🐞 Advanced Debugging and Tracing
Master sophisticated debugging techniques, tracing mechanisms, and diagnostic tools for complex shell script development and production troubleshooting.
🧭 Debugging Philosophy
Effective debugging requires systematic approaches:
1. Observation - Gather comprehensive information
2. Hypothesis - Formulate theories about root causes
3. Experimentation - Test hypotheses with controlled changes
4. Verification - Confirm fixes and prevent regressions
Shell scripts present unique challenges due to their interpreted nature and system integration.
🧪 Built-in Debugging Features
Trace Execution Modes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 | # Basic tracing - show commands as executed
set -x
# Verbose mode - show input lines as read
set -v
# Combined tracing
set -xv
# Selective tracing: only commands between `set -x` and `set +x` are traced,
# so tracing is switched on before the first command we want to see.
{
  set -x
  echo "This will be traced"
  problematic_function
  set +x
  echo "This won't be traced"
}
# Redirect trace output separately.
# The descriptor must be open BEFORE BASH_XTRACEFD is assigned: bash validates
# the value at assignment time and, if the fd is not open, reports an error and
# keeps writing the trace to stderr.
exec 7>debug.log
BASH_XTRACEFD=7
set -x
|
Conditional Debugging
1
2
3
4
5
6
7
8
9
10
11
12
13 | # Debug flag pattern
DEBUG=${DEBUG:-false}

# Print the arguments, joined into one "[DEBUG] ..." line, on stderr.
# Emits nothing unless DEBUG is exactly the string "true"; always returns 0.
debug_echo() {
  case "$DEBUG" in
    true) printf '[DEBUG] %s\n' "$*" >&2 ;;
  esac
}
# Usage
debug_echo "Processing file: $filename"
complex_operation
debug_echo "Operation completed"
|
Function-Level Tracing
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28 | # Enhanced function tracing
#######################################
# Wrap an existing shell function with entry/exit tracing on stderr.
#
# The current definition of <name> is copied to original_<name>, then <name>
# is redefined as a wrapper that logs its arguments, runs original_<name> in a
# command substitution, logs the exit code and captured stdout, re-emits the
# captured stdout and returns the original exit code.
#
# Arguments: $1 - name of an already-defined function (plain identifier)
# Returns:   0 on success (or if the function is already traced),
#            1 if no such function exists, 2 if the name is not an identifier.
# Notes:     Because the wrapped function runs in a command substitution,
#            variable assignments it makes are not visible to the caller and
#            its output is buffered until it returns.
#######################################
trace_functions() {
  local func_name="$1"
  local original_def

  # The name is interpolated into eval below; accept identifiers only.
  if [[ ! $func_name =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
    echo "trace_functions: invalid function name: '$func_name'" >&2
    return 2
  fi

  # Wrapping twice would copy the wrapper over the saved original.
  if declare -F "original_${func_name}" >/dev/null; then
    return 0
  fi

  if ! original_def=$(declare -f "$func_name"); then
    echo "trace_functions: no such function: $func_name" >&2
    return 1
  fi

  # `declare -f` prints "<name> () { body }"; prefixing it re-creates the same
  # body under original_<name>. (Defining original_<name> as a call to <name>
  # would recurse forever once <name> is overridden below.)
  eval "original_${original_def}"

  # Override with tracing version
  eval "
${func_name}() {
  printf '[TRACE] Entering %s with args: %s\n' '${func_name}' \"\$*\" >&2
  local result exit_code
  result=\$(original_${func_name} \"\$@\")
  exit_code=\$?
  printf '[TRACE] Exiting %s with code: %s, result: %s\n' '${func_name}' \"\$exit_code\" \"\$result\" >&2
  printf '%s\n' \"\$result\"
  return \"\$exit_code\"
}
"
}
# Usage
# Demo target for trace_functions: greets the name given as $1 on stdout.
my_function() {
  printf 'Hello %s\n' "$1"
}
trace_functions my_function
my_function "World"
|
🧠 Advanced Error Diagnosis
Comprehensive Error Handler
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59 | #!/usr/bin/env bash
# Advanced error handling with full context
#######################################
# ERR-trap handler: print a detailed error report on stderr, optionally send
# an alert, then exit with the failing status.
# Arguments: $1 - line number, $2 - exit status, $3 - text of the failed command
# Globals:   SEND_ERROR_ALERTS (read) - "true" invokes send_error_alert
# Outputs:   the report on stderr
# Exits:     with the status passed as $2
#######################################
error_handler() {
  local failed_line=$1
  local status=$2
  local failed_cmd="$3"

  # Header context.
  local now script
  now=$(date '+%Y-%m-%d %H:%M:%S')
  script=$(basename "$0")

  # Walk the call stack until `caller` reports no further frame. Entries are
  # joined with a literal "\n" that `echo -e` turns into newlines below.
  local stack="" depth=0 entry
  while entry=$(caller $depth 2>/dev/null); do
    stack+="${entry}\n"
    depth=$((depth + 1))
  done

  # Log detailed error information
  cat >&2 <<EOF
======================================================================
ERROR REPORT - $now
----------------------------------------------------------------------
Script: $script
Line: $failed_line
Exit Code: $status
Failed Command: $failed_cmd
----------------------------------------------------------------------
Function Call Stack:
$(echo -e "$stack")
----------------------------------------------------------------------
Environment Variables:
$(env | grep -E '^(DEBUG|LOG_|ERROR_)' | sort)
----------------------------------------------------------------------
System Information:
Hostname: $(hostname)
User: $(whoami)
Working Directory: $(pwd)
----------------------------------------------------------------------
EOF

  # Optional alert hook; send_error_alert is expected to be defined elsewhere.
  case "${SEND_ERROR_ALERTS:-false}" in
    true) send_error_alert "$script" "$failed_line" "$status" "$failed_cmd" ;;
  esac

  # Exit with original error code
  # shellcheck disable=SC2086 # left unquoted so an empty status means plain `exit`
  exit $status
}
# Set up error trapping
trap 'error_handler $LINENO $? "$BASH_COMMAND"' ERR
set -E # Inherit error traps in functions
|
Interactive Debugging Mode
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48 | # Interactive debugging interface
#######################################
# Minimal interactive debugger, intended to be called from an ERR trap.
# Reads single-letter commands from stdin until told to continue or quit.
# Arguments: $1 - line number of the failure, $2 - text of the failed command
# Outputs:   prompts and reports on stderr
# Returns:   0 on "c" (continue); 1 on "q" (quit) or when stdin reaches EOF
#######################################
interactive_debug() {
  local error_line=$1
  local error_cmd="$2"
  local cmd frame

  {
    echo "=== DEBUG MODE ==="
    echo "Error at line $error_line: $error_cmd"
    echo "Available commands:"
    echo " v - show variables"
    echo " s - show stack trace"
    echo " c - continue execution"
    echo " q - quit with error"
    echo " h - show this help"
  } >&2

  while true; do
    # A failed read means EOF or closed stdin (cron, CI, pipes). Without this
    # check the empty command would hit the default branch forever.
    if ! read -r -p "debug> " cmd; then
      echo "End of input; leaving debug mode." >&2
      return 1
    fi

    case "$cmd" in
      v)
        echo "=== VARIABLES ===" >&2
        declare -p | grep -E '^[a-zA-Z_]' >&2
        ;;
      s)
        echo "=== STACK TRACE ===" >&2
        frame=0
        while caller "$frame" >/dev/null 2>&1; do
          caller "$frame" >&2
          frame=$((frame + 1))
        done
        ;;
      c)
        echo "Continuing execution..." >&2
        return 0
        ;;
      q)
        echo "Quitting with error..." >&2
        return 1
        ;;
      *)
        # "h" and anything unrecognised show the help line.
        echo "Available commands: (v)ariables, (s)tack, (c)ontinue, (q)uit, (h)elp" >&2
        ;;
    esac
  done
}
# Use in development environment
if [ "${ENVIRONMENT:-development}" = "development" ]; then
trap 'interactive_debug $LINENO "$BASH_COMMAND"' ERR
fi
|
🧪 System-Level Tracing
Process Tracing with strace
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 | # Advanced strace usage for debugging
# Trace specific system calls
strace -e trace=open,read,write,close ./myscript.sh
# Trace with timing information
strace -T ./myscript.sh
# Count system calls
strace -c ./myscript.sh
# Follow child processes
strace -f ./myscript.sh
# Trace specific process
strace -p $PID
# Output to file with detailed info
strace -tt -T -o trace.log ./myscript.sh
|
Network Tracing
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 | # Network activity monitoring
#######################################
# Run a command while a background `sudo tcpdump` captures traffic on all
# interfaces into /tmp/network_trace_<shell pid>.pcap.
# Arguments: the command (and its arguments) to run
# Outputs:   progress messages on stderr
# Returns:   the exit status of the command
#######################################
network_tracer() {
  local self_pid=$$
  local capture="/tmp/network_trace_${self_pid}.pcap"

  echo "Starting network trace for PID: $self_pid" >&2

  # Start the capture in the background and remember the job's PID.
  sudo tcpdump -i any -w "$capture" &
  local capture_job=$!

  # Run the traced command, keeping its status for the caller.
  "$@"
  local rc=$?

  # Best-effort shutdown of the capture; failures here are ignored on purpose.
  sudo kill "$capture_job" 2>/dev/null || true
  wait "$capture_job" 2>/dev/null || true

  printf '%s\n' \
    "Network trace saved to: $capture" \
    "Analyze with: wireshark $capture" >&2
  return "$rc"
}
# Usage
# network_tracer ./network_intensive_script.sh
|
🧠 Memory and Resource Debugging
Memory Usage Monitoring
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 | # Real-time memory monitoring
#######################################
# Periodically sample /proc/<pid>/statm and print it as CSV on stderr until
# the process disappears.
# Arguments: $1 - PID to watch (default: this shell)
#            $2 - seconds between samples (default: 1)
# Outputs:   a banner, a CSV header and one CSV row per sample, all on stderr.
#            The seven statm columns are size, resident, shared, text, lib,
#            data and dt, reported by the kernel in pages (see proc(5)).
# Returns:   0 once the process no longer exists
#######################################
memory_monitor() {
  local pid=${1:-$$}
  local interval=${2:-1}
  local statm

  echo "Monitoring memory usage for PID: $pid (interval: ${interval}s)" >&2
  # Header matches the fields statm really contains, one name per value.
  echo "Timestamp,PID,Size,Resident,Shared,Text,Lib,Data,Dt" >&2

  while kill -0 "$pid" 2>/dev/null; do
    if [ -f "/proc/$pid/statm" ]; then
      # The process may exit between the test and the read; fall back to a
      # row of zeros with the same number of columns.
      statm=$(cat "/proc/$pid/statm" 2>/dev/null) || statm=""
      [ -n "$statm" ] || statm="0 0 0 0 0 0 0"
      # statm is space-separated; convert to commas so rows match the header.
      echo "$(date '+%Y-%m-%d %H:%M:%S'),$pid,${statm// /,}" >&2
    fi
    sleep "$interval"
  done
}
# Usage
# memory_monitor $$ 2 &
# ./memory_intensive_script.sh
|
File Descriptor Leak Detection
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28 | # Detect file descriptor leaks
# Print the number of entries in /proc/<pid>/fd (0 if the directory is missing).
_fd_leak_count() {
  local entry total=0
  for entry in "/proc/$1/fd"/*; do
    # -L keeps descriptors whose link target is not a path (sockets, pipes).
    if [[ -e $entry || -L $entry ]]; then
      total=$((total + 1))
    fi
  done
  printf '%s\n' "$total"
}

#######################################
# Run a command and warn if the watched process holds noticeably more open
# file descriptors afterwards.
#
# Usage: fd_leak_detector [pid] command [args...]
#   pid - optional, all digits; process whose /proc/<pid>/fd is counted
#         (default: this shell). A leading numeric argument is consumed as the
#         PID and is NOT executed as part of the command.
# Globals:   FD_LEAK_SETTLE_SECONDS (read) - pause before the final count,
#            default 1
# Outputs:   counts and warnings on stderr
# Returns:   the exit status of the command (0 if no command was given)
#######################################
fd_leak_detector() {
  local pid=$$
  local settle=${FD_LEAK_SETTLE_SECONDS:-1}
  local baseline_fds current_fds
  local exit_code=0

  if [[ ${1-} =~ ^[0-9]+$ ]]; then
    pid=$1
    shift
  fi

  baseline_fds=$(_fd_leak_count "$pid")
  echo "Baseline FD count: $baseline_fds" >&2

  # Execute monitored code
  if (($# > 0)); then
    "$@"
    exit_code=$?
  fi

  sleep "$settle" # Allow cleanup

  current_fds=$(_fd_leak_count "$pid")
  echo "Final FD count: $current_fds" >&2

  # A margin of 5 descriptors avoids flagging ordinary fluctuation.
  if ((current_fds > baseline_fds + 5)); then
    echo "Warning: Potential FD leak detected!" >&2
    echo "Open FDs:" >&2
    ls -la "/proc/$pid/fd" 2>/dev/null >&2
  fi

  return "$exit_code"
}
|
🧪 Advanced Logging and Instrumentation
Structured Logging System
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93 | #!/usr/bin/env bash
# Advanced structured logging system
# Log levels, ordered from most verbose (0) to most severe (5).
# They are readonly, so re-assigning them later in the same shell is an error.
readonly LOG_LEVEL_TRACE=0
readonly LOG_LEVEL_DEBUG=1
readonly LOG_LEVEL_INFO=2
readonly LOG_LEVEL_WARN=3
readonly LOG_LEVEL_ERROR=4
readonly LOG_LEVEL_FATAL=5
# Current log level (numeric threshold). Taken from the environment when set,
# otherwise INFO. Messages whose numeric level is below it are skipped.
LOG_LEVEL=${LOG_LEVEL:-$LOG_LEVEL_INFO}
# Log format. Taken from the environment when set, otherwise "json".
# log() sends "json" to log_json and every other value to log_text.
LOG_FORMAT="${LOG_FORMAT:-json}" # json, text, detailed
# JSON logging function
# Escape a string for use inside a JSON string literal: backslash, double
# quote, newline, carriage return and tab. Other control characters pass
# through unchanged.
_log_json_escape() {
  local s=$1
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "$s"
}

#######################################
# Emit one JSON log record on stdout.
# Arguments: $1 - level name (e.g. INFO)
#            $2 - numeric level, compared against LOG_LEVEL
#            $3 - message
#            $4... - extra fields as key/value pairs; a trailing key with no
#                    value is emitted with an empty string value
# Globals:   LOG_LEVEL (read) - records below this numeric level are skipped
# Outputs:   a single-line JSON object with timestamp (UTC), level, message,
#            pid, script and the extra fields
# Returns:   0
#######################################
log_json() {
  local level="$1"
  local level_num="$2"
  local message="$3"
  if [ $# -ge 3 ]; then
    shift 3
  else
    set --
  fi

  # Skip if below threshold
  if [ "$level_num" -lt "${LOG_LEVEL:-2}" ]; then
    return 0
  fi

  # Build JSON log entry
  local timestamp
  timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

  local fields="" key value
  while [ $# -gt 0 ]; do
    key=$1
    value=${2-}
    # `shift 2` fails and shifts nothing when only one argument is left, which
    # would loop forever; drop the lone key by itself instead.
    if [ $# -ge 2 ]; then
      shift 2
    else
      shift
    fi
    fields+=",\"$(_log_json_escape "$key")\":\"$(_log_json_escape "$value")\""
  done

  printf '{"timestamp":"%s","level":"%s","message":"%s","pid":%s,"script":"%s"%s}\n' \
    "$timestamp" \
    "$(_log_json_escape "$level")" \
    "$(_log_json_escape "$message")" \
    "$$" \
    "$(_log_json_escape "${0##*/}")" \
    "$fields"
}
# Text logging function
# Write one plain-text log line, "[timestamp] [LEVEL] [pid] message", to stderr.
# Arguments: $1 - level name, $2 - numeric level, $3 - message
# Globals:   LOG_LEVEL (read) - lines below this numeric level are skipped
log_text() {
  local lvl_name="$1"
  local lvl_value="$2"
  local text="$3"

  # Below the configured threshold: nothing to print.
  [ "$lvl_value" -lt "$LOG_LEVEL" ] && return 0

  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] [%s] [%s] %s\n' "$stamp" "$lvl_name" "$$" "$text" >&2
}
# Main logging function
# Dispatch a log record to the formatter selected by LOG_FORMAT.
# Arguments: $1 - level name (TRACE|DEBUG|INFO|WARN|ERROR|FATAL; anything else
#                 is ranked as INFO), $2 - message, $3... - key/value fields
# Globals:   LOG_LEVEL_* (read), LOG_FORMAT (read)
# Note:      the extra fields are forwarded to log_json only.
log() {
  local severity="$1"
  local text="$2"
  shift 2

  # Translate the level name into its numeric rank.
  local level_num
  case "$severity" in
    TRACE) level_num=$LOG_LEVEL_TRACE ;;
    DEBUG) level_num=$LOG_LEVEL_DEBUG ;;
    INFO) level_num=$LOG_LEVEL_INFO ;;
    WARN) level_num=$LOG_LEVEL_WARN ;;
    ERROR) level_num=$LOG_LEVEL_ERROR ;;
    FATAL) level_num=$LOG_LEVEL_FATAL ;;
    *) level_num=$LOG_LEVEL_INFO ;;
  esac

  if [ "$LOG_FORMAT" = json ]; then
    log_json "$severity" "$level_num" "$text" "$@"
  else
    log_text "$severity" "$level_num" "$text"
  fi
}
# Convenience functions
# One thin wrapper per level; each forwards its arguments (message followed by
# optional key/value fields) to log with the level name prepended.
log_trace() {
  log "TRACE" "$@"
}

log_debug() {
  log "DEBUG" "$@"
}

log_info() {
  log "INFO" "$@"
}

log_warn() {
  log "WARN" "$@"
}

log_error() {
  log "ERROR" "$@"
}

log_fatal() {
  log "FATAL" "$@"
}
# Usage examples
# LOG_LEVEL=$LOG_LEVEL_DEBUG LOG_FORMAT=json ./script.sh
# log_info "Starting process" "user" "$USER" "host" "$(hostname)"
|
🧠 Debugging Complex Workflows
State Machine Debugger
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55 | # State machine debugging framework
# Recorded transitions. Declared as an associative array, but the functions
# below only ever key it by the integer value of state_counter; each value is
# "<timestamp>:ENTER|EXIT:<state>".
declare -A state_history
# Number of transitions recorded so far, and the key the next one is stored
# under. The -i attribute makes assignments to it evaluate arithmetically.
declare -i state_counter=0
# Record that the state machine entered state $1.
# Globals: state_history, state_counter (written); DEBUG_STATES (read) - when
#          "true" the transition is also announced on stderr.
debug_state_enter() {
  local state_name="$1"
  local now
  now=$(date '+%Y-%m-%d %H:%M:%S.%N')

  # Append at the current counter position, then advance the counter.
  state_history[$state_counter]="${now}:ENTER:${state_name}"
  state_counter=$((state_counter + 1))

  case "${DEBUG_STATES:-false}" in
    true) echo "[STATE] Entering: $state_name at $now" >&2 ;;
  esac
}
# Record that the state machine left state $1.
# Globals: state_history, state_counter (written); DEBUG_STATES (read) - when
#          "true" the transition is also announced on stderr.
debug_state_exit() {
  local state_name="$1"
  local now
  now=$(date '+%Y-%m-%d %H:%M:%S.%N')

  # Append at the current counter position, then advance the counter.
  state_history[$state_counter]="${now}:EXIT:${state_name}"
  state_counter=$((state_counter + 1))

  case "${DEBUG_STATES:-false}" in
    true) echo "[STATE] Exiting: $state_name at $now" >&2 ;;
  esac
}
# Print every recorded transition, oldest first, on stderr between
# "=== STATE HISTORY ===" markers.
# Globals: state_history, state_counter (read)
debug_dump_state_history() {
  # Keep the loop index local: this runs from an ERR trap, so a global `i`
  # would silently overwrite whatever the interrupted code was using.
  local i
  {
    echo "=== STATE HISTORY ==="
    for ((i = 0; i < state_counter; i++)); do
      echo "${state_history[$i]-}"
    done
    echo "=== END STATE HISTORY ==="
  } >&2
}
# Usage in state machine
# Example workflow: runs three phases in order, bracketing each with
# debug_state_enter/debug_state_exit so the transition history shows which
# phase was active. initialize_system, process_data and cleanup_resources are
# not defined in this snippet and must be supplied by the caller.
# The status of a failing phase is not checked here; the exit marker is
# recorded regardless.
workflow_engine() {
# Phase 1: INIT
debug_state_enter "INIT"
initialize_system
debug_state_exit "INIT"
# Phase 2: PROCESS
debug_state_enter "PROCESS"
process_data
debug_state_exit "PROCESS"
# Phase 3: CLEANUP
debug_state_enter "CLEANUP"
cleanup_resources
debug_state_exit "CLEANUP"
}
# Dump history on error
trap 'debug_dump_state_history' ERR
|
Remote Debugging Agent
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41 | # Remote debugging agent for production systems
#######################################
# Serve a tiny read-only diagnostic menu over TCP using netcat, restarting the
# listener five seconds after each session ends. Never returns.
# Globals:   REMOTE_DEBUG_PORT (read) - listening port, default 9999
# Outputs:   a status line on stderr for every (re)start
#
# NOTE(review): the session is unauthenticated and the "env" command prints the
# whole environment, which may contain secrets. `nc -l -p PORT` is given no
# bind address here - presumably it listens on all interfaces; confirm for the
# installed netcat variant and restrict access before enabling this.
# NOTE(review): `-c` is not available in every netcat implementation - confirm.
#######################################
remote_debug_agent() {
  local debug_port="${REMOTE_DEBUG_PORT:-9999}"

  # Start netcat listener for remote debugging
  while true; do
    echo "Remote debug agent listening on port $debug_port" >&2
    # The session script is single-quoted, so `$$` inside it would be the PID
    # of the shell netcat spawns, not of this script. Hand the real PID over
    # through the environment so "files" lists this script's descriptors.
    REMOTE_DEBUG_TARGET_PID="$$" nc -l -p "$debug_port" -c '
      echo "=== REMOTE DEBUG SESSION ==="
      echo "Available commands:"
      echo " ps - Show processes"
      echo " env - Show environment"
      echo " files - Show open files"
      echo " net - Show network connections"
      echo " quit - Exit session"
      echo ""
      while read -r cmd; do
        case "$cmd" in
          ps) ps aux --forest ;;
          env) env | sort ;;
          files) lsof -p "$REMOTE_DEBUG_TARGET_PID" ;;
          net) netstat -tlnp ;;
          quit) exit 0 ;;
          *) echo "Unknown command: $cmd" ;;
        esac
        echo ""
        echo "Next command:"
      done
    ' 2>/dev/null
    sleep 5 # Restart if connection drops
  done
}
# Start agent in background
if [ "${ENABLE_REMOTE_DEBUG:-false}" = true ]; then
remote_debug_agent &
echo "Remote debug agent started on port ${REMOTE_DEBUG_PORT:-9999}" >&2
fi
|
Crash Dump Generator
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49 | # Automatic crash dump generation
#######################################
# Write a diagnostic snapshot (environment, process, open files, network,
# memory, disk) to a file under /tmp and report its path on stderr.
# Arguments: $1 - name of the signal that triggered the dump
# Outputs:   /tmp/crash_dump_<timestamp>_<pid>.txt, or a uniquely suffixed
#            sibling if that name is already taken; path printed on stderr
# Returns:   0 on success, 1 if no dump file could be created
#######################################
crash_dump() {
  local signal="$1"
  local timestamp dump_file
  timestamp=$(date '+%Y%m%d_%H%M%S')
  dump_file="/tmp/crash_dump_${timestamp}_$$.txt"

  # The dump contains the full environment, so create it owner-only. The name
  # is predictable, so create it exclusively (noclobber) rather than writing
  # through a file or symlink someone else may have planted; if the name is
  # taken, fall back to a unique one instead of overwriting.
  if ! (umask 077 && set -o noclobber && : > "$dump_file") 2>/dev/null; then
    dump_file=$(umask 077 && mktemp "/tmp/crash_dump_${timestamp}_$$.XXXXXX") || {
      echo "crash_dump: unable to create a dump file under /tmp" >&2
      return 1
    }
  fi

  {
    echo "=== CRASH DUMP ==="
    echo "Signal: $signal"
    echo "Timestamp: $(date)"
    echo "Script: $0"
    echo "PID: $$"
    echo "PPID: $PPID"
    echo ""
    echo "=== ENVIRONMENT ==="
    env | sort
    echo ""
    echo "=== PROCESS INFORMATION ==="
    # Match the PID column exactly; a plain grep for the number also matches
    # unrelated processes and the grep itself.
    ps aux | awk -v pid="$$" 'NR == 1 || $2 == pid'
    echo ""
    echo "=== OPEN FILES ==="
    lsof -p "$$" 2>/dev/null
    echo ""
    echo "=== NETWORK CONNECTIONS ==="
    netstat -tlnp 2>/dev/null
    echo ""
    echo "=== MEMORY USAGE ==="
    free -h
    echo ""
    echo "=== DISK USAGE ==="
    df -h
    echo ""
  } >> "$dump_file"

  echo "Crash dump saved to: $dump_file" >&2
  echo "Please include this file when reporting issues." >&2
}
# Install crash handlers
# Each handler writes the dump, restores the default disposition and then
# re-sends the signal to this shell. Without the re-raise the trap would
# swallow the signal: the script would keep running after Ctrl-C or `kill`,
# and the parent would never see a signal exit status.
trap 'crash_dump HUP; trap - HUP; kill -s HUP "$$"' HUP
trap 'crash_dump INT; trap - INT; kill -s INT "$$"' INT
trap 'crash_dump TERM; trap - TERM; kill -s TERM "$$"' TERM
# NOTE(review): these two fire when the signal is sent to the shell; whether a
# handler can still run after a genuine memory fault in bash itself is not
# something this snippet can guarantee.
trap 'crash_dump SEGV; trap - SEGV; kill -s SEGV "$$"' SEGV
trap 'crash_dump BUS; trap - BUS; kill -s BUS "$$"' BUS
|
🧠 Debugging in Containerized Environments
Container-Aware Debugger
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56 | # Debugging tools for container environments
#######################################
# Print container-related facts about the current environment on stderr:
# detected runtime (by marker file), hostname, PID 1 command, the first lines
# of /proc/1/cgroup, container-looking mounts and environment variables.
# Outputs: everything on stderr; nothing on stdout
#######################################
container_debug_info() {
  # Runtime detection relies on the marker files the runtimes drop at fixed paths.
  local where="Not running in container"
  if [ -f /.dockerenv ]; then
    where="Running in Docker container"
  elif [ -f /run/.containerenv ]; then
    where="Running in Podman container"
  fi

  {
    echo "=== CONTAINER DEBUG INFO ==="
    echo "$where"

    # Container-specific information
    echo "Hostname: $(hostname)"
    echo "PID 1: $(ps -o comm= -p 1)"

    # Cgroup information
    if [ -f /proc/1/cgroup ]; then
      echo "Cgroups:"
      head -10 /proc/1/cgroup
    fi

    # Mount information
    echo "Interesting mounts:"
    mount | grep -E "(kube|docker|container)"

    # Environment variables
    echo "Container-related env vars:"
    env | grep -E "(KUBE|DOCKER|CONTAINER|POD)" | sort
  } >&2
}
# Enhanced error handler for containers
#######################################
# ERR-trap handler for containers: the standard error report followed by
# container diagnostics, then exit with the failing status.
# Arguments: $1 - line number, $2 - exit status, $3 - text of the failed command
# Outputs:   reports on stderr
# Exits:     with the status passed as $2
#######################################
container_error_handler() {
  local line_number=$1
  local error_code=$2
  local command="$3"

  # Standard error handling. error_handler finishes by calling `exit`, so it
  # runs in a subshell here; called directly it would end the script before
  # any of the diagnostics below could be printed.
  (error_handler "$line_number" "$error_code" "$command")

  # Container-specific diagnostics
  if [ -f /.dockerenv ] || [ -f /run/.containerenv ]; then
    echo "=== CONTAINER DIAGNOSTICS ===" >&2
    container_debug_info

    # Check resource limits: cgroup v1 path first, then the cgroup v2 one.
    if [ -f /sys/fs/cgroup/memory/memory.limit_in_bytes ]; then
      echo "Memory limit: $(cat /sys/fs/cgroup/memory/memory.limit_in_bytes)" >&2
    elif [ -f /sys/fs/cgroup/memory.max ]; then
      echo "Memory limit: $(cat /sys/fs/cgroup/memory.max)" >&2
    fi

    # Check disk space
    echo "Disk usage:" >&2
    df -h >&2
  fi

  # Same exit contract as error_handler.
  exit "$error_code"
}
|
🧾 Debugging Best Practices
Debugging Checklist
- Enable Tracing Early
| set -euo pipefail
[[ "${DEBUG:-false}" == true ]] && set -x
|
- Use Structured Logging
| log_info "Processing item" "item_id" "$item_id" "batch" "$batch_num"
|
- Implement Comprehensive Error Handling
| trap 'error_handler $LINENO $? "$BASH_COMMAND"' ERR
|
- Monitor Resource Usage
- Test in Production-Like Environments
| docker run --memory=512m --cpus=0.5 myapp:test
|
| Tool | Purpose | Usage Example |
|------|---------|---------------|
| set -x | Command tracing | Basic debugging |
| strace | System call tracing | Deep system analysis |
| lsof | File descriptor monitoring | FD leak detection |
| tcpdump | Network tracing | Network issue debugging |
| gdb | Process debugging | Core dump analysis |
| Custom logging | Application-level debugging | Production monitoring |
🧾 Summary
Key Debugging Principles
- Instrument Early - Add logging and tracing from the start
- Context is Key - Capture comprehensive environmental information
- Layered Approach - Use multiple debugging techniques together
- Production-Safe - Ensure debugging tools don't impact production
- Automated Response - Implement automatic crash reporting and recovery
- Remote Accessibility - Enable debugging in distributed environments
- Performance Awareness - Balance debugging detail with overhead
Advanced Debugging Techniques
- Custom error handlers with full context
- Interactive debugging interfaces
- System-level tracing with strace
- Memory and resource monitoring
- Structured logging systems
- State machine debugging
- Remote debugging agents
- Automatic crash dump generation
- Container-aware debugging tools
Debugging Anti-Patterns to Avoid
- Leaving debug output in production code
- Using overly verbose logging in production
- Ignoring error conditions
- Not cleaning up debugging artifacts
- Relying solely on printf debugging
- Forgetting to test debugging tools themselves
- Not considering security implications of debugging endpoints
👉 Continue to: POSIX Compatibility