Skip to content

Go应用监控与告警 - Golang高级面试题

Go应用的监控与告警是生产环境运维的重要组成部分。本章深入探讨Go应用的监控体系、指标收集、告警策略和最佳实践。

📋 重点面试题

面试题 1:Go应用监控体系设计

难度级别:⭐⭐⭐⭐⭐
考察范围:监控设计/运维实践
技术标签:monitoring metrics observability prometheus grafana alerting

详细解答

1. 监控体系设计和指标收集

点击查看完整代码实现
点击查看完整代码实现
go
package main

import (
    "context"
    "fmt"
    "log"
    "net/http"
    "regexp"
    "runtime"
    "sort"
    "sync"
    "time"

    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promhttp"
)

// Prometheus collectors for the demo application.
// All of them are registered with the default registry in init() below.
var (
    // Business metric: request volume by method/endpoint/status.
    requestTotal = prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "http_requests_total",
            Help: "Total number of HTTP requests",
        },
        []string{"method", "endpoint", "status"},
    )
    
    // Request latency histogram using Prometheus' default bucket layout.
    requestDuration = prometheus.NewHistogramVec(
        prometheus.HistogramOpts{
            Name: "http_request_duration_seconds",
            Help: "HTTP request duration in seconds",
            Buckets: prometheus.DefBuckets,
        },
        []string{"method", "endpoint"},
    )
    
    // Gauge of in-flight HTTP requests (Inc on entry, Dec on exit in the middleware).
    activeConnections = prometheus.NewGauge(
        prometheus.GaugeOpts{
            Name: "active_connections",
            Help: "Current number of active connections",
        },
    )
    
    // System metric: live goroutine count, sampled periodically.
    goroutineCount = prometheus.NewGauge(
        prometheus.GaugeOpts{
            Name: "goroutines_count",
            Help: "Current number of goroutines",
        },
    )
    
    // Memory usage by category; labels used: heap_alloc, heap_sys, stack_sys, sys.
    memoryUsage = prometheus.NewGaugeVec(
        prometheus.GaugeOpts{
            Name: "memory_usage_bytes",
            Help: "Memory usage in bytes",
        },
        []string{"type"},
    )
    
    // GC pause durations with coarse hand-picked buckets (1ms .. 10s).
    gcDuration = prometheus.NewHistogram(
        prometheus.HistogramOpts{
            Name: "gc_duration_seconds",
            Help: "GC duration in seconds",
            Buckets: []float64{0.001, 0.01, 0.1, 1.0, 10.0},
        },
    )
    
    // Error counts broken down by error type and severity.
    errorTotal = prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "errors_total",
            Help: "Total number of errors",
        },
        []string{"type", "severity"},
    )
    
    // Custom business metric: orders by status and channel type.
    orderTotal = prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "orders_total",
            Help: "Total number of orders",
        },
        []string{"status", "type"},
    )
    
    // Current stock level per product and warehouse location.
    inventoryLevel = prometheus.NewGaugeVec(
        prometheus.GaugeOpts{
            Name: "inventory_level",
            Help: "Current inventory level",
        },
        []string{"product_id", "location"},
    )
)

// init registers every collector defined above with the default Prometheus
// registry. MustRegister panics on duplicate registration, so this must run
// exactly once.
func init() {
    for _, collector := range []prometheus.Collector{
        requestTotal,
        requestDuration,
        activeConnections,
        goroutineCount,
        memoryUsage,
        gcDuration,
        errorTotal,
        orderTotal,
        inventoryLevel,
    } {
        prometheus.MustRegister(collector)
    }
}

// demonstrateMonitoringSystem runs every monitoring demo in sequence:
// background metric collection, HTTP middleware, business metrics,
// error tracking, and performance measurement.
func demonstrateMonitoringSystem() {
    fmt.Println("=== Go应用监控体系演示 ===")
    
    startMetricsCollection()
    demonstrateHTTPMonitoring()
    demonstrateBusinessMetrics()
    demonstrateErrorTracking()
    demonstratePerformanceMonitoring()
}

// startMetricsCollection launches two background samplers: one that
// periodically snapshots system metrics, and one that watches for new GC
// cycles and records each pause into the gcDuration histogram.
//
// NOTE(review): both goroutines run for the life of the process; there is
// no stop channel/context — acceptable for a demo, verify for production.
func startMetricsCollection() {
    fmt.Println("\n--- 启动系统指标收集 ---")
    
    // Sample goroutine count and memory stats every 5 seconds.
    go func() {
        ticker := time.NewTicker(5 * time.Second)
        defer ticker.Stop()
        
        for range ticker.C {
            collectSystemMetrics()
        }
    }()
    
    // Poll for new GC cycles once a second.
    go func() {
        ticker := time.NewTicker(1 * time.Second)
        defer ticker.Stop()
        
        var lastGC uint32
        
        for range ticker.C {
            var m runtime.MemStats
            runtime.ReadMemStats(&m)
            
            if m.NumGC > lastGC {
                gcCount := m.NumGC - lastGC
                // PauseNs is a 256-entry circular buffer; only the most
                // recent 256 pauses survive, so cap what we attribute.
                if gcCount > 256 {
                    gcCount = 256
                }
                // The pause of GC number n (1-based) lives at
                // PauseNs[(n+255)%256]. The new cycles are numbered
                // lastGC+1 .. m.NumGC. The original indexed with an extra
                // +255, which read the pause of the GC *before* each new
                // one (off by one).
                for i := uint32(0); i < gcCount; i++ {
                    gcNum := m.NumGC - gcCount + i + 1
                    pauseNs := m.PauseNs[(gcNum+255)%256]
                    gcDuration.Observe(float64(pauseNs) / 1e9)
                }
                lastGC = m.NumGC
                
                fmt.Printf("GC事件: 次数=%d, 最近暂停时间=%v\n", 
                    gcCount, time.Duration(m.PauseNs[(m.NumGC+255)%256]))
            }
        }
    }()
}

// collectSystemMetrics snapshots the Go runtime state and pushes the
// values into the corresponding Prometheus gauges.
func collectSystemMetrics() {
    var stats runtime.MemStats
    runtime.ReadMemStats(&stats)
    
    goroutines := runtime.NumGoroutine()
    goroutineCount.Set(float64(goroutines))
    
    // Publish each memory category under its own label.
    for label, value := range map[string]uint64{
        "heap_alloc": stats.HeapAlloc,
        "heap_sys":   stats.HeapSys,
        "stack_sys":  stats.StackSys,
        "sys":        stats.Sys,
    } {
        memoryUsage.WithLabelValues(label).Set(float64(value))
    }
    
    fmt.Printf("系统指标更新 - Goroutines: %d, 堆内存: %d KB\n", 
        goroutines, stats.HeapAlloc/1024)
}

// demonstrateHTTPMonitoring wires a metrics-recording middleware around a
// few demo endpoints, starts an HTTP server on :8080, and replays a batch
// of client requests against it in the background.
func demonstrateHTTPMonitoring() {
    fmt.Println("\n--- HTTP监控中间件演示 ---")
    
    // Middleware: tracks in-flight requests, captures the handler's status
    // code, and records count + latency for every request.
    monitoringMiddleware := func(next http.HandlerFunc) http.HandlerFunc {
        return func(w http.ResponseWriter, r *http.Request) {
            start := time.Now()
            
            activeConnections.Inc()
            defer activeConnections.Dec()
            
            // Wrap the writer so the status code is observable afterwards.
            wrapped := &responseWriter{ResponseWriter: w, statusCode: 200}
            
            next(wrapped, r)
            
            duration := time.Since(start)
            method := r.Method
            endpoint := r.URL.Path
            status := fmt.Sprintf("%d", wrapped.statusCode)
            
            requestTotal.WithLabelValues(method, endpoint, status).Inc()
            requestDuration.WithLabelValues(method, endpoint).Observe(duration.Seconds())
            
            fmt.Printf("HTTP请求: %s %s -> %s, 耗时: %v\n", 
                method, endpoint, status, duration)
        }
    }
    
    http.HandleFunc("/api/health", monitoringMiddleware(func(w http.ResponseWriter, r *http.Request) {
        time.Sleep(10 * time.Millisecond) // simulated handler work
        w.WriteHeader(http.StatusOK)
        w.Write([]byte(`{"status":"ok"}`))
    }))
    
    http.HandleFunc("/api/users", monitoringMiddleware(func(w http.ResponseWriter, r *http.Request) {
        time.Sleep(50 * time.Millisecond) // simulated database query
        w.WriteHeader(http.StatusOK)
        w.Write([]byte(`{"users":[]}`))
    }))
    
    http.HandleFunc("/api/error", monitoringMiddleware(func(w http.ResponseWriter, r *http.Request) {
        // Deliberate failure endpoint for the error metrics.
        errorTotal.WithLabelValues("api_error", "high").Inc()
        w.WriteHeader(http.StatusInternalServerError)
        w.Write([]byte(`{"error":"internal server error"}`))
    }))
    
    // Prometheus scrape endpoint.
    http.Handle("/metrics", promhttp.Handler())
    
    // BUG FIX: the original never started a listener, so every simulated
    // client request below failed with "connection refused". Serve the
    // default mux on :8080 in the background, with basic timeouts.
    server := &http.Server{
        Addr:         ":8080",
        ReadTimeout:  5 * time.Second,
        WriteTimeout: 5 * time.Second,
    }
    go func() {
        if err := server.ListenAndServe(); err != nil && err != http.ErrServerClosed {
            log.Printf("HTTP服务器错误: %v", err)
        }
    }()
    
    // Replay a batch of requests against the demo endpoints.
    go func() {
        time.Sleep(1 * time.Second) // give the listener time to come up
        
        endpoints := []string{"/api/health", "/api/users", "/api/error"}
        client := &http.Client{Timeout: 5 * time.Second}
        
        for i := 0; i < 10; i++ {
            endpoint := endpoints[i%len(endpoints)]
            resp, err := client.Get("http://localhost:8080" + endpoint)
            if err != nil {
                fmt.Printf("请求错误: %v\n", err)
                continue
            }
            resp.Body.Close()
            
            time.Sleep(100 * time.Millisecond)
        }
    }()
    
    fmt.Println("HTTP监控服务已启动: :8080")
}

// responseWriter wraps http.ResponseWriter so middleware can observe the
// status code a handler writes. It is initialized with 200, matching the
// implicit status when WriteHeader is never called explicitly.
type responseWriter struct {
    http.ResponseWriter
    statusCode int // last code passed to WriteHeader, or the 200 default
}

// WriteHeader records the status code, then delegates to the wrapped writer.
func (rw *responseWriter) WriteHeader(code int) {
    rw.statusCode = code
    rw.ResponseWriter.WriteHeader(code)
}

// demonstrateBusinessMetrics replays a fixed sequence of business events
// into the order and inventory metrics, then starts a periodic
// business-rule check in the background.
func demonstrateBusinessMetrics() {
    fmt.Println("\n--- 业务指标监控演示 ---")
    
    type businessEvent struct {
        orderType   string
        orderStatus string
        productID   string
        location    string
        inventory   float64
    }
    
    events := []businessEvent{
        {"online", "completed", "product_001", "warehouse_a", 100},
        {"offline", "pending", "product_002", "warehouse_b", 250},
        {"online", "failed", "product_001", "warehouse_a", 95},
        {"online", "completed", "product_003", "warehouse_c", 80},
        {"offline", "completed", "product_002", "warehouse_b", 240},
    }
    
    for idx, ev := range events {
        // Count the order by status/type and record the latest stock level.
        orderTotal.WithLabelValues(ev.orderStatus, ev.orderType).Inc()
        inventoryLevel.WithLabelValues(ev.productID, ev.location).Set(ev.inventory)
        
        fmt.Printf("业务事件 %d: 订单=%s/%s, 产品=%s, 库存=%v\n",
            idx+1, ev.orderStatus, ev.orderType, ev.productID, ev.inventory)
        
        time.Sleep(200 * time.Millisecond)
    }
    
    // Re-evaluate business alerting rules every three seconds.
    go func() {
        ticker := time.NewTicker(3 * time.Second)
        defer ticker.Stop()
        
        for range ticker.C {
            checkBusinessRules()
        }
    }()
}

func checkBusinessRules() {
    fmt.Println("检查业务规则...")
    
    // 示例:检查库存水平
    metric := &dto.Metric{}
    
    // 这里实际应该从Prometheus查询当前指标值
    // 为了演示,我们模拟一些检查
    
    // 模拟低库存告警
    lowInventoryThreshold := 50.0
    if 45.0 < lowInventoryThreshold { // 模拟库存值
        errorTotal.WithLabelValues("low_inventory", "medium").Inc()
        fmt.Printf("⚠️  库存告警: 产品库存低于阈值 %.0f\n", lowInventoryThreshold)
    }
    
    // 模拟高错误率告警
    errorRateThreshold := 0.1 // 10%
    if 0.15 > errorRateThreshold { // 模拟错误率
        errorTotal.WithLabelValues("high_error_rate", "high").Inc()
        fmt.Printf("🚨 错误率告警: 错误率超过阈值 %.1f%%\n", errorRateThreshold*100)
    }
}

// demonstrateErrorTracking records a batch of classified errors
// concurrently and routes each one to an alerting strategy based on its
// severity (immediate page, aggregation, or log-only).
func demonstrateErrorTracking() {
    fmt.Println("\n--- 错误追踪和告警演示 ---")
    
    type errorEvent struct {
        errorType string
        severity  string
        message   string
    }
    
    events := []errorEvent{
        {"database_error", "high", "数据库连接超时"},
        {"validation_error", "medium", "用户输入验证失败"},
        {"network_error", "high", "外部API调用失败"},
        {"cache_error", "low", "缓存未命中"},
        {"auth_error", "medium", "认证失败"},
    }
    
    var wg sync.WaitGroup
    for idx, ev := range events {
        wg.Add(1)
        go func(id int, event errorEvent) {
            defer wg.Done()
            
            errorTotal.WithLabelValues(event.errorType, event.severity).Inc()
            
            fmt.Printf("错误事件 %d: [%s] %s - %s\n", 
                id+1, event.severity, event.errorType, event.message)
            
            // Route by severity: page immediately, aggregate, or just log.
            switch event.severity {
            case "high":
                triggerImmediateAlert(event.errorType, event.message)
            case "medium":
                aggregateAlert(event.errorType, event.message)
            case "low":
                logAlert(event.errorType, event.message)
            }
            
            time.Sleep(100 * time.Millisecond)
        }(idx, ev)
    }
    
    wg.Wait()
}

// triggerImmediateAlert handles high-severity errors by notifying
// immediately (simulated here with a printed line).
func triggerImmediateAlert(errorType, message string) {
    fmt.Printf("🚨 立即告警: [%s] %s\n", errorType, message)
    
    // A real implementation would push to an alerting system,
    // e.g. PagerDuty, Slack, or email.
}

// aggregateAlert handles medium-severity errors (simulated).
func aggregateAlert(errorType, message string) {
    fmt.Printf("⚠️  聚合告警: [%s] %s\n", errorType, message)
    
    // A real implementation would accumulate occurrences and alert only
    // once a threshold is reached.
}

// logAlert handles low-severity errors: record only, never notify.
func logAlert(errorType, message string) {
    fmt.Printf("📝 日志记录: [%s] %s\n", errorType, message)
    
    // A real implementation would only write to the log, not send alerts.
}

// demonstratePerformanceMonitoring runs a few representative workloads and
// reports wall time, heap delta, GC activity, and goroutine delta for each.
func demonstratePerformanceMonitoring() {
    fmt.Println("\n--- 性能监控演示 ---")
    
    performanceTests := []struct {
        name     string
        testFunc func() time.Duration
    }{
        {"CPU密集型任务", performCPUIntensiveTask},
        {"内存分配任务", performMemoryAllocationTask},
        {"IO密集型任务", performIOIntensiveTask},
        {"并发任务", performConcurrentTask},
    }
    
    // BUG FIX: the original constructed a brand-new HistogramVec on every
    // loop iteration, so each observation landed in a throwaway collector.
    // Create it once, before the loop. NOTE(review): it is still not
    // registered with the default registry (registering here would panic
    // with a duplicate-collector error if this function ran twice), so it
    // will not appear on /metrics — move to package level to expose it.
    performanceMetric := prometheus.NewHistogramVec(
        prometheus.HistogramOpts{
            Name: "performance_test_duration_seconds",
            Help: "Performance test duration",
        },
        []string{"test_name"},
    )
    
    for _, test := range performanceTests {
        fmt.Printf("\n执行性能测试: %s\n", test.name)
        
        // Snapshot runtime state before the workload.
        var before runtime.MemStats
        runtime.ReadMemStats(&before)
        beforeGoroutines := runtime.NumGoroutine()
        
        start := time.Now()
        duration := test.testFunc()
        totalDuration := time.Since(start)
        
        // Snapshot again and diff.
        var after runtime.MemStats
        runtime.ReadMemStats(&after)
        afterGoroutines := runtime.NumGoroutine()
        
        memoryDelta := int64(after.HeapAlloc) - int64(before.HeapAlloc)
        gcDelta := after.NumGC - before.NumGC
        goroutineDelta := afterGoroutines - beforeGoroutines
        
        fmt.Printf("  执行时间: %v\n", duration)
        fmt.Printf("  总耗时: %v\n", totalDuration)
        fmt.Printf("  内存变化: %+d bytes\n", memoryDelta)
        fmt.Printf("  GC次数: %d\n", gcDelta)
        fmt.Printf("  Goroutine变化: %+d\n", goroutineDelta)
        
        performanceMetric.WithLabelValues(test.name).Observe(duration.Seconds())
    }
}

// performCPUIntensiveTask burns CPU on a tight arithmetic loop and returns
// how long the computation took.
func performCPUIntensiveTask() time.Duration {
    start := time.Now()
    
    var acc int
    for n := 0; n < 10000000; n++ {
        acc += n * n
    }
    _ = acc // keep the accumulator referenced
    
    return time.Since(start)
}

// performMemoryAllocationTask allocates ~1 MiB across 1000 separate 1 KiB
// byte slices and returns the elapsed time.
func performMemoryAllocationTask() time.Duration {
    start := time.Now()
    
    buffers := make([][]byte, 1000)
    for i := 0; i < len(buffers); i++ {
        buffers[i] = make([]byte, 1024)
    }
    
    return time.Since(start)
}

// performIOIntensiveTask simulates ten concurrent 10ms I/O waits and
// returns the total wall time (roughly 10ms, since they run in parallel).
func performIOIntensiveTask() time.Duration {
    start := time.Now()
    
    var wg sync.WaitGroup
    wg.Add(10)
    for i := 0; i < 10; i++ {
        go func() {
            defer wg.Done()
            time.Sleep(10 * time.Millisecond)
        }()
    }
    wg.Wait()
    
    return time.Since(start)
}

// performConcurrentTask runs one producer feeding 100 items through a
// buffered channel to five consumer workers and returns the elapsed time.
func performConcurrentTask() time.Duration {
    start := time.Now()
    
    jobs := make(chan int, 100)
    var wg sync.WaitGroup
    
    // Producer: emit 0..99, then close so the consumers drain and exit.
    wg.Add(1)
    go func() {
        defer wg.Done()
        defer close(jobs)
        for n := 0; n < 100; n++ {
            jobs <- n
        }
    }()
    
    // Consumers pull until the channel is closed and drained.
    const workers = 5
    wg.Add(workers)
    for w := 0; w < workers; w++ {
        go func() {
            defer wg.Done()
            for job := range jobs {
                _ = job * 2 // simulated processing
                time.Sleep(1 * time.Millisecond)
            }
        }()
    }
    
    wg.Wait()
    return time.Since(start)
}

:::

面试题 2:告警系统设计和实现

难度级别:⭐⭐⭐⭐⭐
考察范围:告警设计/可靠性工程
技术标签:alerting sre reliability notification escalation

详细解答

1. 告警系统架构和策略

点击查看完整代码实现
点击查看完整代码实现
go
// demonstrateAlertingSystem walks through the alert manager features in
// order: rule configuration, aggregation, escalation, and
// silencing/inhibition.
func demonstrateAlertingSystem() {
    fmt.Println("\n=== 告警系统设计演示 ===")
    
    manager := NewAlertManager()
    
    for _, step := range []func(*AlertManager){
        demonstrateAlertRules,
        demonstrateAlertAggregation,
        demonstrateAlertEscalation,
        demonstrateAlertSilencing,
    } {
        step(manager)
    }
}

// AlertSeverity ranks alerts from informational to emergency; higher
// values demand faster response and wider notification fan-out.
type AlertSeverity int

const (
    SeverityInfo AlertSeverity = iota
    SeverityWarning
    SeverityCritical
    SeverityEmergency
)

// String returns the lowercase name of the severity, or "unknown" for
// values outside the defined range.
func (s AlertSeverity) String() string {
    names := [...]string{
        SeverityInfo:      "info",
        SeverityWarning:   "warning",
        SeverityCritical:  "critical",
        SeverityEmergency: "emergency",
    }
    if s < 0 || int(s) >= len(names) {
        return "unknown"
    }
    return names[s]
}

// AlertRule describes when an alert should fire and how it is labeled.
type AlertRule struct {
    Name        string // unique rule identifier, used for lookup in TriggerAlert
    Expression  string // PromQL expression (stored for documentation; not evaluated here)
    Duration    time.Duration // how long the condition must hold before firing
    Severity    AlertSeverity
    Labels      map[string]string // static labels attached to every alert from this rule
    Annotations map[string]string // human-facing text: summary, description, runbook
}

// Alert is a single instance of a rule in some lifecycle state.
type Alert struct {
    ID          string
    Rule        *AlertRule
    Labels      map[string]string // rule labels merged with per-instance labels
    Annotations map[string]string
    StartsAt    time.Time
    EndsAt      time.Time
    Status      AlertStatus
    Fingerprint string // dedup key derived from rule name + labels
}

// AlertStatus tracks where an alert is in its lifecycle.
type AlertStatus int

const (
    StatusFiring AlertStatus = iota
    StatusResolved
    StatusSilenced
    StatusInhibited
)

// String returns the lowercase lifecycle name, or "unknown" for values
// outside the defined range.
func (s AlertStatus) String() string {
    names := [...]string{
        StatusFiring:    "firing",
        StatusResolved:  "resolved",
        StatusSilenced:  "silenced",
        StatusInhibited: "inhibited",
    }
    if s < 0 || int(s) >= len(names) {
        return "unknown"
    }
    return names[s]
}

// AlertManager owns all alerting state: configured rules, currently
// active alerts keyed by fingerprint, silences, inhibition rules, and the
// set of notification receivers. All mutable fields are guarded by mu.
type AlertManager struct {
    rules       []*AlertRule
    alerts      map[string]*Alert // active alerts keyed by fingerprint
    silences    []*Silence
    inhibitions []*Inhibition
    receivers   map[string]Receiver // notification channels keyed by receiver name
    mu          sync.RWMutex
}

// Silence suppresses matching alerts during a time window
// (e.g. a planned maintenance window).
type Silence struct {
    ID        string
    Matchers  []Matcher // all matchers must match for the silence to apply
    StartsAt  time.Time
    EndsAt    time.Time
    CreatedBy string
    Comment   string
}

// Inhibition suppresses target alerts while a matching source alert is
// firing, provided the labels named in Equal carry identical values on
// both alerts.
type Inhibition struct {
    SourceMatchers []Matcher
    TargetMatchers []Matcher
    Equal          []string // label names that must agree between source and target
}

// Matcher matches a single alert label by name/value.
// IsRegex marks Value as a regular-expression pattern; verify that every
// match site actually honors this flag.
type Matcher struct {
    Name    string
    Value   string
    IsRegex bool
}

// Receiver is a notification channel capable of delivering an alert.
type Receiver interface {
    Send(alert *Alert) error
    Name() string
}

// EmailReceiver delivers alerts by email; delivery is simulated with a
// printed line and a short sleep.
type EmailReceiver struct {
    name      string
    addresses []string // recipient list
    smtpHost  string
    smtpPort  int
}

// Send prints the alert in place of a real SMTP delivery.
func (er *EmailReceiver) Send(alert *Alert) error {
    summary := alert.Annotations["summary"]
    fmt.Printf("📧 邮件告警 [%s]: %s - %s\n", er.name, alert.Rule.Name, summary)
    
    time.Sleep(10 * time.Millisecond) // simulated delivery latency
    return nil
}

// Name reports the receiver's configured identifier.
func (er *EmailReceiver) Name() string {
    return er.name
}

// SlackReceiver posts alerts to a Slack channel; delivery is simulated
// with a printed line and a short sleep.
type SlackReceiver struct {
    name    string
    webhook string // incoming-webhook URL
    channel string
}

// Send prints the alert with an emoji chosen by severity instead of
// performing a real webhook POST.
func (sr *SlackReceiver) Send(alert *Alert) error {
    var emoji string
    switch alert.Rule.Severity {
    case SeverityCritical:
        emoji = "🚨"
    case SeverityEmergency:
        emoji = "🔥"
    default:
        emoji = "⚠️"
    }
    
    fmt.Printf("💬 Slack告警 [%s]: %s %s - %s\n", 
        sr.name, emoji, alert.Rule.Name, alert.Annotations["summary"])
    
    time.Sleep(5 * time.Millisecond) // simulated delivery latency
    return nil
}

// Name reports the receiver's configured identifier.
func (sr *SlackReceiver) Name() string {
    return sr.name
}

// PagerDutyReceiver pages the on-call engineer (simulated), translating
// internal severities into PagerDuty's vocabulary via severityMap.
type PagerDutyReceiver struct {
    name        string
    serviceKey  string
    severityMap map[AlertSeverity]string // internal severity -> PagerDuty severity
}

// Send prints the alert with its mapped severity instead of calling the
// real PagerDuty Events API.
func (pr *PagerDutyReceiver) Send(alert *Alert) error {
    mapped := pr.severityMap[alert.Rule.Severity]
    fmt.Printf("📟 PagerDuty告警 [%s]: [%s] %s - %s\n", 
        pr.name, mapped, alert.Rule.Name, alert.Annotations["summary"])
    
    time.Sleep(20 * time.Millisecond) // simulated delivery latency
    return nil
}

// Name reports the receiver's configured identifier.
func (pr *PagerDutyReceiver) Name() string {
    return pr.name
}

// NewAlertManager builds an AlertManager pre-wired with three default
// notification receivers: an ops email list, a Slack channel, and a
// PagerDuty on-call service.
func NewAlertManager() *AlertManager {
    am := &AlertManager{
        rules:       make([]*AlertRule, 0),
        alerts:      make(map[string]*Alert),
        silences:    make([]*Silence, 0),
        inhibitions: make([]*Inhibition, 0),
        receivers:   make(map[string]Receiver),
    }
    
    defaults := []Receiver{
        &EmailReceiver{
            name:      "email-ops",
            addresses: []string{"ops@company.com"},
            smtpHost:  "smtp.company.com",
            smtpPort:  587,
        },
        &SlackReceiver{
            name:    "slack-alerts",
            webhook: "https://hooks.slack.com/...",
            channel: "#alerts",
        },
        &PagerDutyReceiver{
            name:       "pagerduty-oncall",
            serviceKey: "your-service-key",
            severityMap: map[AlertSeverity]string{
                SeverityInfo:      "info",
                SeverityWarning:   "warning",
                SeverityCritical:  "error",
                SeverityEmergency: "critical",
            },
        },
    }
    // Index each receiver under its own name.
    for _, r := range defaults {
        am.receivers[r.Name()] = r
    }
    
    return am
}

// AddRule registers a new alerting rule. Safe for concurrent use.
func (am *AlertManager) AddRule(rule *AlertRule) {
    am.mu.Lock()
    defer am.mu.Unlock()
    am.rules = append(am.rules, rule)
}

// TriggerAlert fires (or refreshes) an alert for the named rule with the
// given instance labels. Deduplication is by fingerprint: re-triggering an
// existing alert only extends its expiry. New alerts are checked against
// silences and inhibitions before being dispatched asynchronously.
//
// NOTE(review): am.mu is held for the entire body, including the calls to
// isAlertSilenced/isAlertInhibited. Those helpers must not re-acquire
// am.mu (sync.RWMutex is not reentrant) or this path deadlocks — verify
// the lock discipline of the helpers.
func (am *AlertManager) TriggerAlert(ruleName string, labels map[string]string) {
    am.mu.Lock()
    defer am.mu.Unlock()
    
    // Linear scan for the rule by name.
    var rule *AlertRule
    for _, r := range am.rules {
        if r.Name == ruleName {
            rule = r
            break
        }
    }
    
    if rule == nil {
        fmt.Printf("告警规则未找到: %s\n", ruleName)
        return
    }
    
    // Fingerprint = rule name + labels; used as the deduplication key.
    fingerprint := generateFingerprint(rule.Name, labels)
    
    // Refresh rather than duplicate an already-active alert.
    if existingAlert, exists := am.alerts[fingerprint]; exists {
        // 更新现有告警
        existingAlert.EndsAt = time.Now().Add(5 * time.Minute)
        fmt.Printf("更新告警: %s\n", existingAlert.Rule.Name)
        return
    }
    
    // Build the new alert; instance labels override rule labels on collision.
    alert := &Alert{
        ID:          generateAlertID(),
        Rule:        rule,
        Labels:      mergeMaps(rule.Labels, labels),
        Annotations: rule.Annotations,
        StartsAt:    time.Now(),
        EndsAt:      time.Now().Add(5 * time.Minute),
        Status:      StatusFiring,
        Fingerprint: fingerprint,
    }
    
    am.alerts[fingerprint] = alert
    
    // Silenced/inhibited alerts are stored but never dispatched.
    if am.isAlertSilenced(alert) {
        alert.Status = StatusSilenced
        fmt.Printf("告警被静音: %s\n", alert.Rule.Name)
        return
    }
    
    if am.isAlertInhibited(alert) {
        alert.Status = StatusInhibited
        fmt.Printf("告警被抑制: %s\n", alert.Rule.Name)
        return
    }
    
    // Dispatch asynchronously so the caller is not blocked by receivers.
    go am.sendAlert(alert)
}

// sendAlert fans the alert out to every receiver configured for its
// severity. Delivery happens concurrently; a failure on one receiver is
// logged without affecting the others.
func (am *AlertManager) sendAlert(alert *Alert) {
    fmt.Printf("🔔 触发告警: [%s] %s\n", alert.Rule.Severity, alert.Rule.Name)
    
    // Escalation ladder: each severity notifies a superset of the
    // channels used by the level below it (info -> slack only,
    // warning adds email, critical/emergency add PagerDuty).
    names := []string{"slack-alerts"}
    if alert.Rule.Severity >= SeverityWarning {
        names = append(names, "email-ops")
    }
    if alert.Rule.Severity >= SeverityCritical {
        names = append(names, "pagerduty-oncall")
    }
    
    var receivers []Receiver
    for _, name := range names {
        receivers = append(receivers, am.receivers[name])
    }
    
    // Deliver to all receivers concurrently and wait for completion.
    var wg sync.WaitGroup
    for _, receiver := range receivers {
        wg.Add(1)
        go func(r Receiver) {
            defer wg.Done()
            if err := r.Send(alert); err != nil {
                fmt.Printf("发送告警失败 [%s]: %v\n", r.Name(), err)
            }
        }(receiver)
    }
    
    wg.Wait()
}

// isAlertSilenced reports whether an active silence matches all of the
// alert's labels.
//
// Locking: must be called with am.mu held — its only caller, TriggerAlert,
// holds the write lock. BUG FIX: the original re-acquired am.mu.RLock
// here; sync.RWMutex is not reentrant, so that RLock blocked forever
// against the already-held write lock (guaranteed self-deadlock).
func (am *AlertManager) isAlertSilenced(alert *Alert) bool {
    now := time.Now()
    for _, silence := range am.silences {
        // Skip silences outside their active time window.
        if now.Before(silence.StartsAt) || now.After(silence.EndsAt) {
            continue
        }
        
        if am.matchesAllMatchers(alert.Labels, silence.Matchers) {
            return true
        }
    }
    
    return false
}

// isAlertInhibited reports whether some currently-firing alert matches an
// inhibition rule's source matchers while this alert matches its target
// matchers and all Equal labels agree between the two alerts.
//
// Locking: must be called with am.mu held — its only caller, TriggerAlert,
// holds the write lock. BUG FIX: the original re-acquired am.mu.RLock
// here; sync.RWMutex is not reentrant, so that RLock blocked forever
// against the already-held write lock (guaranteed self-deadlock).
func (am *AlertManager) isAlertInhibited(alert *Alert) bool {
    for _, inhibition := range am.inhibitions {
        // Look for a firing source alert (the higher-priority one).
        for _, sourceAlert := range am.alerts {
            if sourceAlert.Status != StatusFiring {
                continue
            }
            
            if am.matchesAllMatchers(sourceAlert.Labels, inhibition.SourceMatchers) &&
               am.matchesAllMatchers(alert.Labels, inhibition.TargetMatchers) {
                
                // Every Equal label must carry the same value on both alerts.
                equal := true
                for _, labelName := range inhibition.Equal {
                    if sourceAlert.Labels[labelName] != alert.Labels[labelName] {
                        equal = false
                        break
                    }
                }
                
                if equal {
                    return true
                }
            }
        }
    }
    
    return false
}

// matchesAllMatchers reports whether every matcher is satisfied by the
// label set. A missing label never matches.
//
// BUG FIX: the original ignored Matcher.IsRegex and always compared by
// plain equality, so regex matchers silently degraded to exact matches.
// Regex matchers are now evaluated as fully-anchored patterns; an invalid
// pattern counts as a non-match.
func (am *AlertManager) matchesAllMatchers(labels map[string]string, matchers []Matcher) bool {
    for _, matcher := range matchers {
        value, exists := labels[matcher.Name]
        if !exists {
            return false
        }
        if matcher.IsRegex {
            matched, err := regexp.MatchString("^(?:"+matcher.Value+")$", value)
            if err != nil || !matched {
                return false
            }
        } else if value != matcher.Value {
            return false
        }
    }
    return true
}

// demonstrateAlertRules registers a representative set of alerting rules
// (error rate, memory, availability, goroutine count) with the manager.
func demonstrateAlertRules(am *AlertManager) {
    fmt.Println("\n--- 告警规则配置演示 ---")
    
    // All demo rules belong to the same service/team; build a fresh label
    // map per rule so rules never share one underlying map.
    backendLabels := func() map[string]string {
        return map[string]string{
            "service": "api",
            "team":    "backend",
        }
    }
    
    rules := []*AlertRule{
        {
            Name:       "HighErrorRate",
            Expression: `rate(http_requests_total{status=~"5.."}[5m]) > 0.1`,
            Duration:   2 * time.Minute,
            Severity:   SeverityCritical,
            Labels:     backendLabels(),
            Annotations: map[string]string{
                "summary":     "High error rate detected",
                "description": "Error rate is above 10% for more than 2 minutes",
                "runbook":     "https://wiki.company.com/runbooks/high-error-rate",
            },
        },
        {
            Name:       "HighMemoryUsage",
            Expression: `memory_usage_bytes{type="heap_alloc"} / memory_usage_bytes{type="heap_sys"} > 0.8`,
            Duration:   5 * time.Minute,
            Severity:   SeverityWarning,
            Labels:     backendLabels(),
            Annotations: map[string]string{
                "summary":     "High memory usage",
                "description": "Memory usage is above 80% for more than 5 minutes",
            },
        },
        {
            Name:       "ServiceDown",
            Expression: `up == 0`,
            Duration:   1 * time.Minute,
            Severity:   SeverityEmergency,
            Labels:     backendLabels(),
            Annotations: map[string]string{
                "summary":     "Service is down",
                "description": "Service has been down for more than 1 minute",
                "runbook":     "https://wiki.company.com/runbooks/service-down",
            },
        },
        {
            Name:       "HighGoroutineCount",
            Expression: `goroutines_count > 1000`,
            Duration:   3 * time.Minute,
            Severity:   SeverityWarning,
            Labels:     backendLabels(),
            Annotations: map[string]string{
                "summary":     "High goroutine count",
                "description": "Goroutine count is above 1000",
            },
        },
    }
    
    for _, rule := range rules {
        am.AddRule(rule)
        fmt.Printf("添加告警规则: %s [%s]\n", rule.Name, rule.Severity)
    }
}

// demonstrateAlertAggregation fires several similar alerts across
// instances to exercise fingerprint-based deduplication, then dumps the
// active alert set.
func demonstrateAlertAggregation(am *AlertManager) {
    fmt.Println("\n--- 告警聚合演示 ---")
    
    type scenario struct {
        ruleName string
        labels   map[string]string
    }
    
    scenarios := []scenario{
        {"HighMemoryUsage", map[string]string{"instance": "server-1", "service": "api"}},
        {"HighMemoryUsage", map[string]string{"instance": "server-2", "service": "api"}},
        {"HighMemoryUsage", map[string]string{"instance": "server-3", "service": "api"}},
        {"HighGoroutineCount", map[string]string{"instance": "server-1", "service": "api"}},
        {"HighErrorRate", map[string]string{"instance": "server-1", "service": "api"}},
    }
    
    for i, sc := range scenarios {
        fmt.Printf("触发告警 %d: %s (instance=%s)\n", 
            i+1, sc.ruleName, sc.labels["instance"])
        
        am.TriggerAlert(sc.ruleName, sc.labels)
        time.Sleep(500 * time.Millisecond)
    }
    
    // Snapshot the active alert set under the read lock.
    am.mu.RLock()
    fmt.Printf("\n当前活跃告警数量: %d\n", len(am.alerts))
    for _, alert := range am.alerts {
        fmt.Printf("  - %s [%s] (实例: %s)\n", 
            alert.Rule.Name, alert.Status, alert.Labels["instance"])
    }
    am.mu.RUnlock()
}

// demonstrateAlertEscalation simulates a single failing instance whose
// alerts escalate from warning (memory) through critical (error rate) up
// to emergency (service down), one second apart.
func demonstrateAlertEscalation(am *AlertManager) {
    fmt.Println("\n--- 告警升级策略演示 ---")
    fmt.Println("模拟服务故障场景...")
    
    serverLabels := map[string]string{
        "instance": "server-1",
        "service":  "api",
    }
    
    // Fire each stage in increasing severity order, pausing between them.
    stages := []string{"HighMemoryUsage", "HighErrorRate", "ServiceDown"}
    for i, ruleName := range stages {
        if i > 0 {
            time.Sleep(1 * time.Second)
        }
        am.TriggerAlert(ruleName, serverLabels)
    }
    
    fmt.Println("告警升级演示完成")
}

// demonstrateAlertSilencing installs a maintenance-window silence and a
// ServiceDown-based inhibition rule, then fires alerts against both a
// silenced and an unsilenced instance to show the difference.
func demonstrateAlertSilencing(am *AlertManager) {
    fmt.Println("\n--- 告警静音和维护模式演示 ---")
    
    // Silence everything from server-2 for the next ten minutes.
    maintenanceWindow := &Silence{
        ID: "silence-1",
        Matchers: []Matcher{
            {Name: "service", Value: "api", IsRegex: false},
            {Name: "instance", Value: "server-2", IsRegex: false},
        },
        StartsAt:  time.Now(),
        EndsAt:    time.Now().Add(10 * time.Minute),
        CreatedBy: "ops-team",
        Comment:   "Maintenance window for server-2",
    }
    
    am.mu.Lock()
    am.silences = append(am.silences, maintenanceWindow)
    am.mu.Unlock()
    
    fmt.Printf("添加静音规则: %s (维护窗口)\n", maintenanceWindow.ID)
    
    // While ServiceDown fires for an instance, suppress that instance's
    // other api alerts.
    downInhibition := &Inhibition{
        SourceMatchers: []Matcher{
            {Name: "alertname", Value: "ServiceDown", IsRegex: false},
        },
        TargetMatchers: []Matcher{
            {Name: "service", Value: "api", IsRegex: false},
        },
        Equal: []string{"instance"},
    }
    
    am.mu.Lock()
    am.inhibitions = append(am.inhibitions, downInhibition)
    am.mu.Unlock()
    
    fmt.Println("添加抑制规则: ServiceDown 抑制其他告警")
    
    fmt.Println("\n测试静音效果:")
    // server-2 falls inside the maintenance window; server-3 does not.
    for _, instance := range []string{"server-2", "server-3"} {
        am.TriggerAlert("HighMemoryUsage", map[string]string{
            "instance": instance,
            "service":  "api",
        })
    }
}

// 辅助函数
func generateFingerprint(ruleName string, labels map[string]string) string {
    // 简化的指纹生成
    fingerprint := ruleName
    for k, v := range labels {
        fingerprint += fmt.Sprintf(":%s=%s", k, v)
    }
    return fingerprint
}

// generateAlertID returns a unique-ish alert identifier derived from the
// current wall clock in nanoseconds.
func generateAlertID() string {
    now := time.Now().UnixNano()
    return fmt.Sprintf("alert-%d", now)
}

// mergeMaps returns a new map holding every entry of map1 overlaid with
// every entry of map2; on key collisions map2 wins. Neither input is
// modified, and nil inputs are treated as empty.
func mergeMaps(map1, map2 map[string]string) map[string]string {
    result := make(map[string]string, len(map1)+len(map2))
    for _, src := range []map[string]string{map1, map2} {
        for k, v := range src {
            result[k] = v
        }
    }
    return result
}

// main runs the monitoring demo first, waits briefly so the background
// collectors produce some data, then runs the alerting demo.
func main() {
    demonstrateMonitoringSystem()
    time.Sleep(2 * time.Second) // let the metric collectors run for a while
    demonstrateAlertingSystem()
}

:::

🎯 核心知识点总结

监控体系要点

  1. 指标分类: 业务指标、系统指标、错误指标、性能指标
  2. 指标收集: 定期收集、事件驱动、推送/拉取模式
  3. 数据存储: 时序数据库、数据保留策略
  4. 可视化: Dashboard设计、图表选择、实时监控

告警系统要点

  1. 告警级别: Info、Warning、Critical、Emergency
  2. 告警规则: 条件表达式、持续时间、标签匹配
  3. 告警路由: 根据标签和严重程度路由到不同接收器
  4. 告警生命周期: 触发、发送、确认、解决

高级特性要点

  1. 告警聚合: 相似告警合并减少噪音
  2. 告警抑制: 高优先级告警抑制低优先级
  3. 告警静音: 维护窗口和已知问题静音
  4. 告警升级: 多级升级策略和通知渠道

最佳实践要点

  1. 合理阈值: 避免告警疲劳和误报
  2. 运行手册: 每个告警配备处理指南
  3. 定期回顾: 分析告警有效性和调整策略
  4. 故障演练: 验证告警系统的可靠性

🔍 面试准备建议

  1. 理解监控原理: 掌握各种监控指标的含义和采集方法
  2. 熟悉监控工具: 了解Prometheus、Grafana等主流工具
  3. 设计告警策略: 能够设计合理的告警规则和升级策略
  4. 故障处理经验: 积累实际的故障处理和告警优化经验
  5. SRE实践: 了解可靠性工程的监控和告警最佳实践

正在精进