📜 Logging and Telemetry Patterns

Effective logging and telemetry are crucial for monitoring, debugging, and understanding system behavior. This pattern establishes consistent approaches for capturing, formatting, and managing operational data.

🎯 Core Principles

Structured Logging

Use structured formats that are easily parseable by machines while remaining readable by humans.

# ❌ Unstructured logging
echo "Starting backup process for database at $(date)"

# ✅ Structured logging
log_info() {
    local message="$1"
    local timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
    echo "{\"timestamp\":\"$timestamp\",\"level\":\"INFO\",\"message\":\"$message\",\"pid\":$$}"
}

log_info "Starting backup process for database"
# {"timestamp":"2024-01-15T10:30:45Z","level":"INFO","message":"Starting backup process for database","pid":12345}

Consistent Severity Levels

Standardize log levels for predictable filtering and routing.

# Standard log levels
LOG_LEVEL_TRACE=5
LOG_LEVEL_DEBUG=4
LOG_LEVEL_INFO=3
LOG_LEVEL_WARN=2
LOG_LEVEL_ERROR=1
LOG_LEVEL_FATAL=0

# Current log level (can be configured)
CURRENT_LOG_LEVEL=$LOG_LEVEL_INFO

# Level checking function
should_log() {
    local level_value="$1"
    [ "$level_value" -le "$CURRENT_LOG_LEVEL" ]
}

# Logging functions with level checking
log_trace() {
    should_log $LOG_LEVEL_TRACE && log_message "TRACE" "$1"
}

log_debug() {
    should_log $LOG_LEVEL_DEBUG && log_message "DEBUG" "$1"
}

log_info() {
    should_log $LOG_LEVEL_INFO && log_message "INFO" "$1"
}

log_warn() {
    should_log $LOG_LEVEL_WARN && log_message "WARN" "$1"
}

log_error() {
    should_log $LOG_LEVEL_ERROR && log_message "ERROR" "$1" >&2
}

log_fatal() {
    should_log $LOG_LEVEL_FATAL && log_message "FATAL" "$1" >&2
}

📋 Log Message Structure

Rich Context Information

Include relevant metadata for better diagnostics.

# Enhanced logging with context
log_with_context() {
    local level="$1"
    local message="$2"
    local timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

    # Build context object
    local context="{"
    context+="\"timestamp\":\"$timestamp\","
    context+="\"level\":\"$level\","
    context+="\"message\":\"$message\","
    context+="\"pid\":$$,"
    context+="\"user\":\"$(whoami)\","
    context+="\"host\":\"$(hostname)\","
    context+="\"script\":\"$(basename "$0")\""

    # Add custom fields if provided
    if [ -n "${LOG_CONTEXT_EXTRA:-}" ]; then
        context+=",$LOG_CONTEXT_EXTRA"
    fi

    context+="}"

    echo "$context"
}

# Usage with additional context
export LOG_CONTEXT_EXTRA="\"component\":\"database\",\"operation\":\"backup\""
log_info "Backup started"

🎨 Output Destinations

Multi-Stream Logging

Route different log levels to appropriate destinations.

# Multi-stream logging setup
setup_logging() {
    local log_dir="${1:-"/var/log/myapp"}"

    # Create log directory
    mkdir -p "$log_dir"

    # Setup log files
    LOG_FILE_ALL="$log_dir/application.log"
    LOG_FILE_ERROR="$log_dir/error.log"
    LOG_FILE_AUDIT="$log_dir/audit.log"

    # Touch files to ensure they exist
    touch "$LOG_FILE_ALL" "$LOG_FILE_ERROR" "$LOG_FILE_AUDIT"
}

# Enhanced logging with routing
log_message_routed() {
    local level="$1"
    local message="$2"
    local log_entry=$(log_with_context "$level" "$message")

    # Always log to main file
    echo "$log_entry" >> "$LOG_FILE_ALL"

    # Route errors to error file
    case "$level" in
        ERROR|FATAL)
            echo "$log_entry" >> "$LOG_FILE_ERROR"
            ;;
    esac

    # Route audit events to audit log
    if [[ "$message" == *"AUDIT"* ]]; then
        echo "$log_entry" >> "$LOG_FILE_AUDIT"
    fi

    # Also output to console based on level
    case "$level" in
        WARN|ERROR|FATAL)
            echo "$log_entry" >&2
            ;;
        INFO)
            echo "$log_entry"
            ;;
    esac
}

📊 Telemetry Integration

Metrics Collection

Collect quantitative data alongside logs.

# Simple metrics collection
declare -A METRICS_COUNTERS
declare -A METRICS_TIMERS

metric_increment() {
    local metric_name="$1"
    local increment="${2:-1}"

    local current_value="${METRICS_COUNTERS[$metric_name]:-0}"
    METRICS_COUNTERS["$metric_name"]=$((current_value + increment))
}

metric_timer_start() {
    local timer_name="$1"
    METRICS_TIMERS["$timer_name"]=$(date +%s%3N)  # milliseconds
}

metric_timer_stop() {
    local timer_name="$1"
    local start_time="${METRICS_TIMERS[$timer_name]:-0}"

    if [ "$start_time" -gt 0 ]; then
        local end_time=$(date +%s%3N)
        local duration=$((end_time - start_time))

        # Log timing metric
        log_info "METRIC:timer.$timer_name:${duration}ms"

        # Clean up timer
        unset METRICS_TIMERS["$timer_name"]
    fi
}

# Usage example
metric_timer_start "database_backup"
# ... backup operation ...
metric_timer_stop "database_backup"
metric_increment "backups.completed"

Health Checks and Probes

Implement logging for system health monitoring.

# Health check logging
health_check_component() {
    local component="$1"
    local check_function="$2"

    log_debug "HEALTH:Checking $component"

    if $check_function; then
        log_info "HEALTH:OK:$component"
        metric_increment "health.check.$component.pass"
        return 0
    else
        log_error "HEALTH:FAILED:$component"
        metric_increment "health.check.$component.fail"
        return 1
    fi
}

# Component-specific health checks
check_database_connection() {
    # Implementation here
    nc -z database-host 5432 2>/dev/null
}

check_disk_space() {
    local threshold="${1:-90}"
    local usage=$(df / | awk 'NR==2 {print $5}' | sed 's/%//')
    [ "$usage" -lt "$threshold" ]
}

# Usage
health_check_component "database" check_database_connection
health_check_component "disk" check_disk_space

🛠️ Advanced Logging Features

Log Rotation and Management

Handle log lifecycle automatically.

# Log rotation helper
rotate_log_if_needed() {
    local log_file="$1"
    local max_size="${2:-10485760}"  # 10MB default

    if [ -f "$log_file" ]; then
        local current_size=$(stat -f%z "$log_file" 2>/dev/null || stat -c%s "$log_file" 2>/dev/null)

        if [ "$current_size" -gt "$max_size" ]; then
            local timestamp=$(date +%Y%m%d_%H%M%S)
            local rotated_name="${log_file}.${timestamp}"

            mv "$log_file" "$rotated_name"
            touch "$log_file"  # Create new empty log

            log_info "Rotated log file: $log_file -> $rotated_name"

            # Compress old logs (optional)
            gzip "$rotated_name" &
        fi
    fi
}

# Periodic log maintenance
maintain_logs() {
    rotate_log_if_needed "$LOG_FILE_ALL"
    rotate_log_if_needed "$LOG_FILE_ERROR"
    rotate_log_if_needed "$LOG_FILE_AUDIT"
}

Contextual Logging

Maintain context across function calls.

# Context stack for nested operations
CONTEXT_STACK=()

push_context() {
    local context="$1"
    CONTEXT_STACK+=("$context")
}

pop_context() {
    if [ ${#CONTEXT_STACK[@]} -gt 0 ]; then
        unset CONTEXT_STACK[$((${#CONTEXT_STACK[@]}-1))]
    fi
}

get_current_context() {
    if [ ${#CONTEXT_STACK[@]} -gt 0 ]; then
        local last_index=$((${#CONTEXT_STACK[@]}-1))
        echo "${CONTEXT_STACK[$last_index]}"
    fi
}

# Context-aware logging
log_with_context_stack() {
    local level="$1"
    local message="$2"
    local context=$(get_current_context)

    if [ -n "$context" ]; then
        message="[$context] $message"
    fi

    log_message_routed "$level" "$message"
}

# Usage example
database_operation() {
    push_context "DATABASE"

    log_info "Starting database operation"
    # ... database work ...
    log_info "Database operation completed"

    pop_context
}

🧾 Summary Best Practices

Logging Principles

✅ Use structured, parseable formats ✅ Include rich contextual information ✅ Implement consistent severity levels ✅ Route logs to appropriate destinations ✅ Handle log rotation automatically ✅ Collect metrics alongside logs

Telemetry Guidelines

✅ Instrument critical operations ✅ Track both business and technical metrics ✅ Implement health checks with logging ✅ Use timers for performance monitoring ✅ Correlate events with unique identifiers

Security Considerations

✅ Never log sensitive data (passwords, keys, PII) ✅ Sanitize log content before output ✅ Control log verbosity in production ✅ Protect log files with appropriate permissions ✅ Implement log retention policies

🧠 Complete Logging Library Template

#!/bin/bash
# lib/logging.sh - Complete logging solution

# Log levels
readonly LOG_LEVEL_TRACE=5
readonly LOG_LEVEL_DEBUG=4
readonly LOG_LEVEL_INFO=3
readonly LOG_LEVEL_WARN=2
readonly LOG_LEVEL_ERROR=1
readonly LOG_LEVEL_FATAL=0

# Configuration
LOG_LEVEL=${LOG_LEVEL:-$LOG_LEVEL_INFO}
LOG_TO_CONSOLE=${LOG_TO_CONSOLE:-true}
LOG_TO_FILE=${LOG_TO_FILE:-false}
LOG_FILE=${LOG_FILE:-"/var/log/application.log"}

# Initialize logging
init_logging() {
    if [ "$LOG_TO_FILE" = true ] && [ -n "$LOG_FILE" ]; then
        mkdir -p "$(dirname "$LOG_FILE")"
        touch "$LOG_FILE"
    fi
}

# Core logging function
_log_output() {
    local level="$1"
    local message="$2"
    local timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
    local log_entry="{\"timestamp\":\"$timestamp\",\"level\":\"$level\",\"message\":\"$message\",\"pid\":$$}"

    # Console output
    if [ "$LOG_TO_CONSOLE" = true ]; then
        case "$level" in
            ERROR|FATAL) echo "$log_entry" >&2 ;;
            *) echo "$log_entry" ;;
        esac
    fi

    # File output
    if [ "$LOG_TO_FILE" = true ] && [ -n "$LOG_FILE" ]; then
        echo "$log_entry" >> "$LOG_FILE"
    fi
}

# Level-specific functions
log_trace() { [ $LOG_LEVEL_TRACE -le $LOG_LEVEL ] && _log_output "TRACE" "$1"; }
log_debug() { [ $LOG_LEVEL_DEBUG -le $LOG_LEVEL ] && _log_output "DEBUG" "$1"; }
log_info() { [ $LOG_LEVEL_INFO -le $LOG_LEVEL ] && _log_output "INFO" "$1"; }
log_warn() { [ $LOG_LEVEL_WARN -le $LOG_LEVEL ] && _log_output "WARN" "$1"; }
log_error() { [ $LOG_LEVEL_ERROR -le $LOG_LEVEL ] && _log_output "ERROR" "$1"; }
log_fatal() { [ $LOG_LEVEL_FATAL -le $LOG_LEVEL ] && _log_output "FATAL" "$1"; }

# Initialize on load
init_logging

🧾 See Also

Patterns: Config and Env Injection
Patterns: CLI Tools and UX
Anti-Patterns: Silent Failures
Recipes: Healthchecks and Probes
Linux Debugging Tools
Observability and Monitoring Best Practices