Monitoring and Alerting for Go Applications - Advanced Golang Interview Questions
Monitoring and alerting are an essential part of operating Go applications in production. This chapter takes a deep look at the monitoring stack for Go services, metrics collection, alerting strategies, and best practices.
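Before the full interview-style walkthroughs below, here is a minimal, self-contained sketch of the core pattern every Go monitoring setup builds on: define a metric, register it, and expose a /metrics endpoint for Prometheus to scrape. The metric name, handler path, and port are illustrative.

```go
package main

import (
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// httpRequests is an illustrative counter; real services usually add labels
// such as method, path, and status.
var httpRequests = prometheus.NewCounter(prometheus.CounterOpts{
	Name: "demo_http_requests_total",
	Help: "Total number of handled HTTP requests",
})

func main() {
	prometheus.MustRegister(httpRequests)
	http.HandleFunc("/hello", func(w http.ResponseWriter, r *http.Request) {
		httpRequests.Inc() // count every request
		w.Write([]byte("hello"))
	})
	// Prometheus scrapes this endpoint, typically every 15-60 seconds.
	http.Handle("/metrics", promhttp.Handler())
	http.ListenAndServe(":2112", nil)
}
```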
📋 Key Interview Questions
Question 1: Designing a Monitoring System for Go Applications
Difficulty: ⭐⭐⭐⭐⭐
Scope: monitoring design / operations practice
Tags: monitoring metrics observability prometheus grafana alerting
Detailed Answer
1. Monitoring System Design and Metrics Collection
Full code implementation:
```go
package main
import (
	"fmt"
	"net/http"
	"runtime"
	"sort"
	"sync"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)
// Metric definitions
var (
	// Business metrics
requestTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "http_requests_total",
Help: "Total number of HTTP requests",
},
[]string{"method", "endpoint", "status"},
)
requestDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "http_request_duration_seconds",
Help: "HTTP request duration in seconds",
Buckets: prometheus.DefBuckets,
},
[]string{"method", "endpoint"},
)
activeConnections = prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "active_connections",
Help: "Current number of active connections",
},
)
	// System metrics
goroutineCount = prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "goroutines_count",
Help: "Current number of goroutines",
},
)
memoryUsage = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "memory_usage_bytes",
Help: "Memory usage in bytes",
},
[]string{"type"},
)
gcDuration = prometheus.NewHistogram(
prometheus.HistogramOpts{
Name: "gc_duration_seconds",
Help: "GC duration in seconds",
Buckets: []float64{0.001, 0.01, 0.1, 1.0, 10.0},
},
)
	// Error metrics
errorTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "errors_total",
Help: "Total number of errors",
},
[]string{"type", "severity"},
)
	// Custom business metrics
orderTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "orders_total",
Help: "Total number of orders",
},
[]string{"status", "type"},
)
inventoryLevel = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "inventory_level",
Help: "Current inventory level",
},
[]string{"product_id", "location"},
)
)
func init() {
	// Register all metrics
prometheus.MustRegister(
requestTotal,
requestDuration,
activeConnections,
goroutineCount,
memoryUsage,
gcDuration,
errorTotal,
orderTotal,
inventoryLevel,
)
}
func demonstrateMonitoringSystem() {
	fmt.Println("=== Go Application Monitoring Demo ===")
	// Start system metrics collection
	startMetricsCollection()
	// Start the HTTP monitoring middleware demo
	demonstrateHTTPMonitoring()
	// Demonstrate custom business metrics
	demonstrateBusinessMetrics()
	// Demonstrate error tracking and alerting
	demonstrateErrorTracking()
	// Demonstrate performance monitoring
	demonstratePerformanceMonitoring()
}

func startMetricsCollection() {
	fmt.Println("\n--- Starting system metrics collection ---")
	// Collect system metrics periodically
go func() {
ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
for range ticker.C {
collectSystemMetrics()
}
}()
	// Watch for GC events
	go func() {
		ticker := time.NewTicker(1 * time.Second)
		defer ticker.Stop()
		var lastGC uint32
		for range ticker.C {
			var m runtime.MemStats
			runtime.ReadMemStats(&m)
			if m.NumGC > lastGC {
				// New GC cycles have occurred; record each pause.
				// PauseNs[(k+255)%256] holds the pause of GC number k.
				for k := lastGC + 1; k <= m.NumGC; k++ {
					pauseNs := m.PauseNs[(k+255)%256]
					gcDuration.Observe(float64(pauseNs) / 1e9)
				}
				gcCount := m.NumGC - lastGC
				lastGC = m.NumGC
				fmt.Printf("GC events: count=%d, last pause=%v\n",
					gcCount, time.Duration(m.PauseNs[(m.NumGC+255)%256]))
			}
		}
	}()
}
func collectSystemMetrics() {
var m runtime.MemStats
runtime.ReadMemStats(&m)
	// Update goroutine count
	goroutineCount.Set(float64(runtime.NumGoroutine()))
	// Update memory usage
	memoryUsage.WithLabelValues("heap_alloc").Set(float64(m.HeapAlloc))
	memoryUsage.WithLabelValues("heap_sys").Set(float64(m.HeapSys))
	memoryUsage.WithLabelValues("stack_sys").Set(float64(m.StackSys))
	memoryUsage.WithLabelValues("sys").Set(float64(m.Sys))
	fmt.Printf("System metrics updated - goroutines: %d, heap alloc: %d KB\n",
		runtime.NumGoroutine(), m.HeapAlloc/1024)
}
func demonstrateHTTPMonitoring() {
	fmt.Println("\n--- HTTP monitoring middleware demo ---")
	// HTTP monitoring middleware
	monitoringMiddleware := func(next http.HandlerFunc) http.HandlerFunc {
		return func(w http.ResponseWriter, r *http.Request) {
			start := time.Now()
			// Track active connections
			activeConnections.Inc()
			defer activeConnections.Dec()
			// Wrap the ResponseWriter to capture the status code
			wrapped := &responseWriter{ResponseWriter: w, statusCode: 200}
			// Invoke the actual handler
			next(wrapped, r)
			// Record metrics
			duration := time.Since(start)
			method := r.Method
			endpoint := r.URL.Path
			status := fmt.Sprintf("%d", wrapped.statusCode)
			requestTotal.WithLabelValues(method, endpoint, status).Inc()
			requestDuration.WithLabelValues(method, endpoint).Observe(duration.Seconds())
			fmt.Printf("HTTP request: %s %s -> %s, took %v\n",
				method, endpoint, status, duration)
		}
	}
	// Example HTTP handlers
	http.HandleFunc("/api/health", monitoringMiddleware(func(w http.ResponseWriter, r *http.Request) {
		time.Sleep(10 * time.Millisecond) // simulate processing time
		w.WriteHeader(http.StatusOK)
		w.Write([]byte(`{"status":"ok"}`))
	}))
	http.HandleFunc("/api/users", monitoringMiddleware(func(w http.ResponseWriter, r *http.Request) {
		time.Sleep(50 * time.Millisecond) // simulate a database query
		w.WriteHeader(http.StatusOK)
		w.Write([]byte(`{"users":[]}`))
	}))
	http.HandleFunc("/api/error", monitoringMiddleware(func(w http.ResponseWriter, r *http.Request) {
		// Simulate an error
		errorTotal.WithLabelValues("api_error", "high").Inc()
		w.WriteHeader(http.StatusInternalServerError)
		w.Write([]byte(`{"error":"internal server error"}`))
	}))
	// Prometheus metrics endpoint
	http.Handle("/metrics", promhttp.Handler())
	// Start the HTTP server so the simulated requests below can actually succeed
	go func() {
		if err := http.ListenAndServe(":8080", nil); err != nil {
			fmt.Printf("HTTP server error: %v\n", err)
		}
	}()
	// Simulate HTTP traffic
	go func() {
		time.Sleep(1 * time.Second) // wait for the server to start
		endpoints := []string{"/api/health", "/api/users", "/api/error"}
		client := &http.Client{Timeout: 5 * time.Second}
		for i := 0; i < 10; i++ {
			endpoint := endpoints[i%len(endpoints)]
			resp, err := client.Get("http://localhost:8080" + endpoint)
			if err != nil {
				fmt.Printf("request error: %v\n", err)
				continue
			}
			resp.Body.Close()
			time.Sleep(100 * time.Millisecond)
		}
	}()
	fmt.Println("HTTP monitoring server started on :8080")
}
type responseWriter struct {
http.ResponseWriter
statusCode int
}
func (rw *responseWriter) WriteHeader(code int) {
rw.statusCode = code
rw.ResponseWriter.WriteHeader(code)
}
func demonstrateBusinessMetrics() {
	fmt.Println("\n--- Business metrics demo ---")
	// Simulated business events
	businessEvents := []struct {
		orderType   string
		orderStatus string
		productID   string
		location    string
		inventory   float64
	}{
		{"online", "completed", "product_001", "warehouse_a", 100},
		{"offline", "pending", "product_002", "warehouse_b", 250},
		{"online", "failed", "product_001", "warehouse_a", 95},
		{"online", "completed", "product_003", "warehouse_c", 80},
		{"offline", "completed", "product_002", "warehouse_b", 240},
	}
	for i, event := range businessEvents {
		// Record order metrics
		orderTotal.WithLabelValues(event.orderStatus, event.orderType).Inc()
		// Update the inventory level
		inventoryLevel.WithLabelValues(event.productID, event.location).Set(event.inventory)
		fmt.Printf("Business event %d: order=%s/%s, product=%s, inventory=%v\n",
			i+1, event.orderStatus, event.orderType, event.productID, event.inventory)
		time.Sleep(200 * time.Millisecond)
	}
	// Periodically evaluate business rules
	go func() {
		ticker := time.NewTicker(3 * time.Second)
		defer ticker.Stop()
		for range ticker.C {
			checkBusinessRules()
		}
	}()
}

func checkBusinessRules() {
	fmt.Println("Checking business rules...")
	// In a real system these values would be queried from Prometheus;
	// here we use simulated values for the demo.
	// Simulated low-inventory alert
	lowInventoryThreshold := 50.0
	currentInventory := 45.0 // simulated inventory level
	if currentInventory < lowInventoryThreshold {
		errorTotal.WithLabelValues("low_inventory", "medium").Inc()
		fmt.Printf("⚠️ Inventory alert: stock below threshold %.0f\n", lowInventoryThreshold)
	}
	// Simulated high-error-rate alert
	errorRateThreshold := 0.1 // 10%
	currentErrorRate := 0.15  // simulated error rate
	if currentErrorRate > errorRateThreshold {
		errorTotal.WithLabelValues("high_error_rate", "high").Inc()
		fmt.Printf("🚨 Error-rate alert: error rate above threshold %.1f%%\n", errorRateThreshold*100)
	}
}
func demonstrateErrorTracking() {
	fmt.Println("\n--- Error tracking and alerting demo ---")
	// Error categories and severities
	errorTypes := []struct {
		errorType string
		severity  string
		message   string
	}{
		{"database_error", "high", "database connection timeout"},
		{"validation_error", "medium", "user input validation failed"},
		{"network_error", "high", "external API call failed"},
		{"cache_error", "low", "cache miss"},
		{"auth_error", "medium", "authentication failed"},
	}
	var wg sync.WaitGroup
	for i, errorEvent := range errorTypes {
		wg.Add(1)
		go func(id int, event struct {
			errorType string
			severity  string
			message   string
		}) {
			defer wg.Done()
			// Record the error metric
			errorTotal.WithLabelValues(event.errorType, event.severity).Inc()
			// Simulate error handling
			fmt.Printf("Error event %d: [%s] %s - %s\n",
				id+1, event.severity, event.errorType, event.message)
			// Choose an alerting strategy based on severity
			switch event.severity {
			case "high":
				triggerImmediateAlert(event.errorType, event.message)
			case "medium":
				aggregateAlert(event.errorType, event.message)
			case "low":
				logAlert(event.errorType, event.message)
			}
			time.Sleep(100 * time.Millisecond)
		}(i, errorEvent)
	}
	wg.Wait()
}

func triggerImmediateAlert(errorType, message string) {
	fmt.Printf("🚨 Immediate alert: [%s] %s\n", errorType, message)
	// A real implementation would notify an alerting system
	// such as PagerDuty, Slack, or email.
}

func aggregateAlert(errorType, message string) {
	fmt.Printf("⚠️ Aggregated alert: [%s] %s\n", errorType, message)
	// A real implementation would accumulate errors and alert once a threshold is reached.
}

func logAlert(errorType, message string) {
	fmt.Printf("📝 Logged only: [%s] %s\n", errorType, message)
	// A real implementation would only write to the log without notifying anyone.
}
func demonstratePerformanceMonitoring() {
	fmt.Println("\n--- Performance monitoring demo ---")
	// In production this histogram would be defined and registered once at package
	// level; creating it inside the loop (as a naive version might) would either go
	// unregistered or panic on duplicate registration. Register it once here.
	performanceMetric := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name: "performance_test_duration_seconds",
			Help: "Performance test duration",
		},
		[]string{"test_name"},
	)
	prometheus.MustRegister(performanceMetric)
	// Performance tests for different workload types
	performanceTests := []struct {
		name     string
		testFunc func() time.Duration
	}{
		{"cpu_intensive", performCPUIntensiveTask},
		{"memory_allocation", performMemoryAllocationTask},
		{"io_intensive", performIOIntensiveTask},
		{"concurrent", performConcurrentTask},
	}
	for _, test := range performanceTests {
		fmt.Printf("\nRunning performance test: %s\n", test.name)
		// Capture the state before the test
		var before runtime.MemStats
		runtime.ReadMemStats(&before)
		beforeGoroutines := runtime.NumGoroutine()
		// Run the test
		start := time.Now()
		duration := test.testFunc()
		totalDuration := time.Since(start)
		// Capture the state after the test
		var after runtime.MemStats
		runtime.ReadMemStats(&after)
		afterGoroutines := runtime.NumGoroutine()
		// Derive performance indicators
		memoryDelta := int64(after.HeapAlloc) - int64(before.HeapAlloc)
		gcDelta := after.NumGC - before.NumGC
		goroutineDelta := afterGoroutines - beforeGoroutines
		fmt.Printf("  measured duration: %v\n", duration)
		fmt.Printf("  total elapsed: %v\n", totalDuration)
		fmt.Printf("  memory delta: %+d bytes\n", memoryDelta)
		fmt.Printf("  GC cycles: %d\n", gcDelta)
		fmt.Printf("  goroutine delta: %+d\n", goroutineDelta)
		// Record the result as a metric
		performanceMetric.WithLabelValues(test.name).Observe(duration.Seconds())
	}
}
func performCPUIntensiveTask() time.Duration {
start := time.Now()
	// CPU-intensive computation
sum := 0
for i := 0; i < 10000000; i++ {
sum += i * i
}
return time.Since(start)
}
func performMemoryAllocationTask() time.Duration {
start := time.Now()
	// Allocate a large amount of memory
slices := make([][]byte, 1000)
for i := range slices {
slices[i] = make([]byte, 1024)
}
return time.Since(start)
}
func performIOIntensiveTask() time.Duration {
start := time.Now()
	// Simulate I/O operations
var wg sync.WaitGroup
for i := 0; i < 10; i++ {
wg.Add(1)
go func() {
defer wg.Done()
time.Sleep(10 * time.Millisecond)
}()
}
wg.Wait()
return time.Since(start)
}
func performConcurrentTask() time.Duration {
start := time.Now()
	// Concurrent producer/consumer workload
var wg sync.WaitGroup
ch := make(chan int, 100)
	// Producer
wg.Add(1)
go func() {
defer wg.Done()
defer close(ch)
for i := 0; i < 100; i++ {
ch <- i
}
}()
	// Consumers
numWorkers := 5
for i := 0; i < numWorkers; i++ {
wg.Add(1)
go func() {
defer wg.Done()
for data := range ch {
			// Simulate processing
_ = data * 2
time.Sleep(1 * time.Millisecond)
}
}()
}
wg.Wait()
return time.Since(start)
}
```
Question 2: Designing and Implementing an Alerting System
Difficulty: ⭐⭐⭐⭐⭐
Scope: alerting design / reliability engineering
Tags: alerting sre reliability notification escalation
Detailed Answer
1. Alerting System Architecture and Strategy
Full code implementation:
```go
func demonstrateAlertingSystem() {
	fmt.Println("\n=== Alerting System Demo ===")
	// Create the alert manager
	alertManager := NewAlertManager()
	// Demonstrate alert rule configuration
	demonstrateAlertRules(alertManager)
	// Demonstrate alert aggregation and inhibition
	demonstrateAlertAggregation(alertManager)
	// Demonstrate alert escalation
	demonstrateAlertEscalation(alertManager)
	// Demonstrate silencing and maintenance mode
	demonstrateAlertSilencing(alertManager)
}

// Alert severity levels
type AlertSeverity int
const (
SeverityInfo AlertSeverity = iota
SeverityWarning
SeverityCritical
SeverityEmergency
)
func (s AlertSeverity) String() string {
switch s {
case SeverityInfo:
return "info"
case SeverityWarning:
return "warning"
case SeverityCritical:
return "critical"
case SeverityEmergency:
return "emergency"
default:
return "unknown"
}
}
// Alert rule
type AlertRule struct {
Name string
Expression string // PromQL表达式
Duration time.Duration
Severity AlertSeverity
Labels map[string]string
Annotations map[string]string
}
// Alert instance
type Alert struct {
ID string
Rule *AlertRule
Labels map[string]string
Annotations map[string]string
StartsAt time.Time
EndsAt time.Time
Status AlertStatus
Fingerprint string
}
type AlertStatus int
const (
StatusFiring AlertStatus = iota
StatusResolved
StatusSilenced
StatusInhibited
)
func (s AlertStatus) String() string {
switch s {
case StatusFiring:
return "firing"
case StatusResolved:
return "resolved"
case StatusSilenced:
return "silenced"
case StatusInhibited:
return "inhibited"
default:
return "unknown"
}
}
// Alert manager
type AlertManager struct {
rules []*AlertRule
alerts map[string]*Alert
silences []*Silence
inhibitions []*Inhibition
receivers map[string]Receiver
mu sync.RWMutex
}
// Silence rule
type Silence struct {
ID string
Matchers []Matcher
StartsAt time.Time
EndsAt time.Time
CreatedBy string
Comment string
}
// Inhibition rule
type Inhibition struct {
SourceMatchers []Matcher
TargetMatchers []Matcher
Equal []string
}
// Label matcher
type Matcher struct {
Name string
Value string
IsRegex bool
}
// Alert receiver interface
type Receiver interface {
Send(alert *Alert) error
Name() string
}
// Email receiver
type EmailReceiver struct {
name string
addresses []string
smtpHost string
smtpPort int
}
func (er *EmailReceiver) Send(alert *Alert) error {
fmt.Printf("📧 邮件告警 [%s]: %s - %s\n",
er.name, alert.Rule.Name, alert.Annotations["summary"])
// 实际实现中会发送真实邮件
time.Sleep(10 * time.Millisecond) // 模拟发送延迟
return nil
}
func (er *EmailReceiver) Name() string {
return er.name
}
// Slack receiver
type SlackReceiver struct {
name string
webhook string
channel string
}
func (sr *SlackReceiver) Send(alert *Alert) error {
emoji := "⚠️"
if alert.Rule.Severity == SeverityCritical {
emoji = "🚨"
} else if alert.Rule.Severity == SeverityEmergency {
emoji = "🔥"
}
fmt.Printf("💬 Slack告警 [%s]: %s %s - %s\n",
sr.name, emoji, alert.Rule.Name, alert.Annotations["summary"])
time.Sleep(5 * time.Millisecond) // 模拟发送延迟
return nil
}
func (sr *SlackReceiver) Name() string {
return sr.name
}
// PagerDuty receiver
type PagerDutyReceiver struct {
name string
serviceKey string
severityMap map[AlertSeverity]string
}
func (pr *PagerDutyReceiver) Send(alert *Alert) error {
severity := pr.severityMap[alert.Rule.Severity]
fmt.Printf("📟 PagerDuty告警 [%s]: [%s] %s - %s\n",
pr.name, severity, alert.Rule.Name, alert.Annotations["summary"])
time.Sleep(20 * time.Millisecond) // 模拟发送延迟
return nil
}
func (pr *PagerDutyReceiver) Name() string {
return pr.name
}
func NewAlertManager() *AlertManager {
am := &AlertManager{
rules: make([]*AlertRule, 0),
alerts: make(map[string]*Alert),
silences: make([]*Silence, 0),
inhibitions: make([]*Inhibition, 0),
receivers: make(map[string]Receiver),
}
	// Configure default receivers
am.receivers["email-ops"] = &EmailReceiver{
name: "email-ops",
addresses: []string{"ops@company.com"},
smtpHost: "smtp.company.com",
smtpPort: 587,
}
am.receivers["slack-alerts"] = &SlackReceiver{
name: "slack-alerts",
webhook: "https://hooks.slack.com/...",
channel: "#alerts",
}
am.receivers["pagerduty-oncall"] = &PagerDutyReceiver{
name: "pagerduty-oncall",
serviceKey: "your-service-key",
severityMap: map[AlertSeverity]string{
SeverityInfo: "info",
SeverityWarning: "warning",
SeverityCritical: "error",
SeverityEmergency: "critical",
},
}
return am
}
func (am *AlertManager) AddRule(rule *AlertRule) {
am.mu.Lock()
defer am.mu.Unlock()
am.rules = append(am.rules, rule)
}
func (am *AlertManager) TriggerAlert(ruleName string, labels map[string]string) {
	am.mu.Lock()
	defer am.mu.Unlock()
	// Look up the rule
	var rule *AlertRule
	for _, r := range am.rules {
		if r.Name == ruleName {
			rule = r
			break
		}
	}
	if rule == nil {
		fmt.Printf("alert rule not found: %s\n", ruleName)
		return
	}
	// Compute the alert fingerprint
	fingerprint := generateFingerprint(rule.Name, labels)
	// Check whether this alert already exists
	if existingAlert, exists := am.alerts[fingerprint]; exists {
		// Refresh the existing alert instead of creating a duplicate
		existingAlert.EndsAt = time.Now().Add(5 * time.Minute)
		fmt.Printf("updated existing alert: %s\n", existingAlert.Rule.Name)
		return
	}
	// Create a new alert; include the rule name as the "alertname" label so
	// that inhibition matchers (e.g. alertname="ServiceDown") can match it.
	alertLabels := mergeMaps(rule.Labels, labels)
	alertLabels["alertname"] = rule.Name
	alert := &Alert{
		ID:          generateAlertID(),
		Rule:        rule,
		Labels:      alertLabels,
		Annotations: rule.Annotations,
		StartsAt:    time.Now(),
		EndsAt:      time.Now().Add(5 * time.Minute),
		Status:      StatusFiring,
		Fingerprint: fingerprint,
	}
	am.alerts[fingerprint] = alert
	// Check silences and inhibitions (am.mu is already held here)
	if am.isAlertSilenced(alert) {
		alert.Status = StatusSilenced
		fmt.Printf("alert silenced: %s\n", alert.Rule.Name)
		return
	}
	if am.isAlertInhibited(alert) {
		alert.Status = StatusInhibited
		fmt.Printf("alert inhibited: %s\n", alert.Rule.Name)
		return
	}
	// Dispatch the alert asynchronously
	go am.sendAlert(alert)
}

func (am *AlertManager) sendAlert(alert *Alert) {
	fmt.Printf("🔔 Firing alert: [%s] %s\n", alert.Rule.Severity, alert.Rule.Name)
	// Choose receivers based on severity
	var receivers []Receiver
	switch alert.Rule.Severity {
	case SeverityInfo:
		receivers = append(receivers, am.receivers["slack-alerts"])
	case SeverityWarning:
		receivers = append(receivers, am.receivers["slack-alerts"])
		receivers = append(receivers, am.receivers["email-ops"])
	case SeverityCritical:
		receivers = append(receivers, am.receivers["slack-alerts"])
		receivers = append(receivers, am.receivers["email-ops"])
		receivers = append(receivers, am.receivers["pagerduty-oncall"])
	case SeverityEmergency:
		receivers = append(receivers, am.receivers["slack-alerts"])
		receivers = append(receivers, am.receivers["email-ops"])
		receivers = append(receivers, am.receivers["pagerduty-oncall"])
		// Emergencies may warrant additional channels such as phone calls
	}
	// Fan out to all receivers concurrently
	var wg sync.WaitGroup
	for _, receiver := range receivers {
		wg.Add(1)
		go func(r Receiver) {
			defer wg.Done()
			if err := r.Send(alert); err != nil {
				fmt.Printf("failed to send alert [%s]: %v\n", r.Name(), err)
			}
		}(receiver)
	}
	wg.Wait()
}
// isAlertSilenced reports whether the alert matches an active silence.
// The caller must hold am.mu; re-acquiring it here (as a naive RLock would) deadlocks
// because sync.RWMutex is not reentrant.
func (am *AlertManager) isAlertSilenced(alert *Alert) bool {
now := time.Now()
for _, silence := range am.silences {
if now.Before(silence.StartsAt) || now.After(silence.EndsAt) {
continue
}
if am.matchesAllMatchers(alert.Labels, silence.Matchers) {
return true
}
}
return false
}
// isAlertInhibited reports whether a higher-priority firing alert inhibits this one.
// The caller must hold am.mu; re-acquiring it here would deadlock.
func (am *AlertManager) isAlertInhibited(alert *Alert) bool {
	for _, inhibition := range am.inhibitions {
		// Look for a matching, higher-priority source alert that is firing
for _, sourceAlert := range am.alerts {
if sourceAlert.Status != StatusFiring {
continue
}
if am.matchesAllMatchers(sourceAlert.Labels, inhibition.SourceMatchers) &&
am.matchesAllMatchers(alert.Labels, inhibition.TargetMatchers) {
				// Check that the Equal labels match between source and target
equal := true
for _, labelName := range inhibition.Equal {
if sourceAlert.Labels[labelName] != alert.Labels[labelName] {
equal = false
break
}
}
if equal {
return true
}
}
}
}
return false
}
// matchesAllMatchers checks exact label matches; regex matchers (IsRegex) are not implemented in this sketch.
func (am *AlertManager) matchesAllMatchers(labels map[string]string, matchers []Matcher) bool {
	for _, matcher := range matchers {
value, exists := labels[matcher.Name]
if !exists || value != matcher.Value {
return false
}
}
return true
}
func demonstrateAlertRules(am *AlertManager) {
	fmt.Println("\n--- Alert rule configuration demo ---")
	// Configure a set of alert rules
rules := []*AlertRule{
{
Name: "HighErrorRate",
Expression: `rate(http_requests_total{status=~"5.."}[5m]) > 0.1`,
Duration: 2 * time.Minute,
Severity: SeverityCritical,
Labels: map[string]string{
"service": "api",
"team": "backend",
},
Annotations: map[string]string{
"summary": "High error rate detected",
"description": "Error rate is above 10% for more than 2 minutes",
"runbook": "https://wiki.company.com/runbooks/high-error-rate",
},
},
{
Name: "HighMemoryUsage",
Expression: `memory_usage_bytes{type="heap_alloc"} / memory_usage_bytes{type="heap_sys"} > 0.8`,
Duration: 5 * time.Minute,
Severity: SeverityWarning,
Labels: map[string]string{
"service": "api",
"team": "backend",
},
Annotations: map[string]string{
"summary": "High memory usage",
"description": "Memory usage is above 80% for more than 5 minutes",
},
},
{
Name: "ServiceDown",
Expression: `up == 0`,
Duration: 1 * time.Minute,
Severity: SeverityEmergency,
Labels: map[string]string{
"service": "api",
"team": "backend",
},
Annotations: map[string]string{
"summary": "Service is down",
"description": "Service has been down for more than 1 minute",
"runbook": "https://wiki.company.com/runbooks/service-down",
},
},
{
Name: "HighGoroutineCount",
Expression: `goroutines_count > 1000`,
Duration: 3 * time.Minute,
Severity: SeverityWarning,
Labels: map[string]string{
"service": "api",
"team": "backend",
},
Annotations: map[string]string{
"summary": "High goroutine count",
"description": "Goroutine count is above 1000",
},
},
}
	for _, rule := range rules {
		am.AddRule(rule)
		fmt.Printf("added alert rule: %s [%s]\n", rule.Name, rule.Severity)
	}
}

func demonstrateAlertAggregation(am *AlertManager) {
	fmt.Println("\n--- Alert aggregation demo ---")
	// Trigger several similar alerts
scenarios := []struct {
ruleName string
labels map[string]string
}{
{"HighMemoryUsage", map[string]string{"instance": "server-1", "service": "api"}},
{"HighMemoryUsage", map[string]string{"instance": "server-2", "service": "api"}},
{"HighMemoryUsage", map[string]string{"instance": "server-3", "service": "api"}},
{"HighGoroutineCount", map[string]string{"instance": "server-1", "service": "api"}},
{"HighErrorRate", map[string]string{"instance": "server-1", "service": "api"}},
}
	for i, scenario := range scenarios {
		fmt.Printf("triggering alert %d: %s (instance=%s)\n",
			i+1, scenario.ruleName, scenario.labels["instance"])
		am.TriggerAlert(scenario.ruleName, scenario.labels)
		time.Sleep(500 * time.Millisecond)
	}
	// Show currently active alerts
	am.mu.RLock()
	fmt.Printf("\ncurrently active alerts: %d\n", len(am.alerts))
	for _, alert := range am.alerts {
		fmt.Printf("  - %s [%s] (instance: %s)\n",
			alert.Rule.Name, alert.Status, alert.Labels["instance"])
	}
	am.mu.RUnlock()
}
func demonstrateAlertEscalation(am *AlertManager) {
	fmt.Println("\n--- Alert escalation demo ---")
	// Simulate a progressively worsening service failure
	fmt.Println("simulating a service failure scenario...")
	// 1. First a high-memory alert (warning level)
	am.TriggerAlert("HighMemoryUsage", map[string]string{
		"instance": "server-1",
		"service":  "api",
	})
	time.Sleep(1 * time.Second)
	// 2. Then a high-error-rate alert (critical level)
	am.TriggerAlert("HighErrorRate", map[string]string{
		"instance": "server-1",
		"service":  "api",
	})
	time.Sleep(1 * time.Second)
	// 3. Finally a service-down alert (emergency level)
	am.TriggerAlert("ServiceDown", map[string]string{
		"instance": "server-1",
		"service":  "api",
	})
	fmt.Println("alert escalation demo finished")
}

func demonstrateAlertSilencing(am *AlertManager) {
	fmt.Println("\n--- Silencing and maintenance-mode demo ---")
	// Add a silence
	silence := &Silence{
		ID: "silence-1",
		Matchers: []Matcher{
			{Name: "service", Value: "api", IsRegex: false},
			{Name: "instance", Value: "server-2", IsRegex: false},
		},
		StartsAt:  time.Now(),
		EndsAt:    time.Now().Add(10 * time.Minute),
		CreatedBy: "ops-team",
		Comment:   "Maintenance window for server-2",
	}
	am.mu.Lock()
	am.silences = append(am.silences, silence)
	am.mu.Unlock()
	fmt.Printf("added silence: %s (maintenance window)\n", silence.ID)
	// Add an inhibition rule: ServiceDown suppresses other alerts for the same instance
	inhibition := &Inhibition{
		SourceMatchers: []Matcher{
			{Name: "alertname", Value: "ServiceDown", IsRegex: false},
		},
		TargetMatchers: []Matcher{
			{Name: "service", Value: "api", IsRegex: false},
		},
		Equal: []string{"instance"},
	}
	am.mu.Lock()
	am.inhibitions = append(am.inhibitions, inhibition)
	am.mu.Unlock()
	fmt.Println("added inhibition rule: ServiceDown suppresses other alerts")
	// Test the silence
	fmt.Println("\ntesting the silence:")
	am.TriggerAlert("HighMemoryUsage", map[string]string{
		"instance": "server-2", // silenced instance
		"service":  "api",
	})
	am.TriggerAlert("HighMemoryUsage", map[string]string{
		"instance": "server-3", // not silenced
		"service":  "api",
	})
}
// Helper functions

func generateFingerprint(ruleName string, labels map[string]string) string {
	// Simplified fingerprint generation; keys are sorted so the same label set
	// always produces the same fingerprint (map iteration order is random).
	keys := make([]string, 0, len(labels))
	for k := range labels {
		keys = append(keys, k)
	}
	sort.Strings(keys)
	fingerprint := ruleName
	for _, k := range keys {
		fingerprint += fmt.Sprintf(":%s=%s", k, labels[k])
	}
	return fingerprint
}
func generateAlertID() string {
return fmt.Sprintf("alert-%d", time.Now().UnixNano())
}
func mergeMaps(map1, map2 map[string]string) map[string]string {
result := make(map[string]string)
for k, v := range map1 {
result[k] = v
}
for k, v := range map2 {
result[k] = v
}
return result
}
func main() {
	demonstrateMonitoringSystem()
	time.Sleep(2 * time.Second) // let metrics collection run for a while
	demonstrateAlertingSystem()
}
```
🎯 Key Takeaways
Monitoring system essentials
- Metric categories: business metrics, system metrics, error metrics, performance metrics
- Collection: periodic scraping, event-driven updates, push vs. pull models (see the sketch after this list)
- Storage: time-series databases and data-retention policies
- Visualization: dashboard design, chart selection, real-time views
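As referenced in the collection bullet above, long-running services are normally scraped (pull model), while short-lived batch jobs can push their final values to a Pushgateway before exiting. A minimal sketch using the prometheus/push package; the Pushgateway URL, job name, and metric values are placeholders.

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/push"
)

func main() {
	// Long-running services expose /metrics and let Prometheus scrape it.
	// A batch job may exit before the next scrape, so it pushes its results instead.
	lastRun := prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "batch_job_last_run_timestamp_seconds",
		Help: "Unix timestamp of the last successful batch run",
	})
	lastRun.SetToCurrentTime()

	duration := prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "batch_job_duration_seconds",
		Help: "Duration of the last batch run",
	})
	duration.Set(42.5) // illustrative value

	// "http://pushgateway:9091" and the job name are placeholders.
	if err := push.New("http://pushgateway:9091", "nightly_report").
		Collector(lastRun).
		Collector(duration).
		Push(); err != nil {
		fmt.Println("push failed:", err)
	}
}
```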
Alerting system essentials
- Severity levels: Info, Warning, Critical, Emergency
- Alert rules: condition expression, "for" duration, label matching (see the sketch after this list)
- Routing: deliver alerts to different receivers based on labels and severity
- Lifecycle: fire, notify, acknowledge, resolve
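The AlertManager demo above fires as soon as TriggerAlert is called. A minimal, standalone sketch of how a rule's "for" duration can be honored: the condition must stay true for the whole window before the alert fires. The type and values are illustrative and not part of the code above.

```go
package main

import (
	"fmt"
	"time"
)

// pendingRule tracks how long a rule's condition has been continuously true.
type pendingRule struct {
	name        string
	forDuration time.Duration // the "for" clause of the rule
	activeSince time.Time     // zero value means the condition is currently false
}

// evaluate is called on every evaluation tick with the current condition result.
// It returns true only when the condition has held for the full duration.
func (p *pendingRule) evaluate(conditionTrue bool, now time.Time) bool {
	if !conditionTrue {
		p.activeSince = time.Time{} // condition broke: reset the pending window
		return false
	}
	if p.activeSince.IsZero() {
		p.activeSince = now // condition just became true: start pending
	}
	return now.Sub(p.activeSince) >= p.forDuration
}

func main() {
	rule := &pendingRule{name: "HighErrorRate", forDuration: 2 * time.Minute}
	now := time.Now()
	// Simulated evaluation ticks, one per minute: true, true, true, false, true.
	results := []bool{true, true, true, false, true}
	for i, cond := range results {
		t := now.Add(time.Duration(i) * time.Minute)
		if rule.evaluate(cond, t) {
			fmt.Printf("tick %d: %s fires\n", i, rule.name)
		} else {
			fmt.Printf("tick %d: %s pending/inactive\n", i, rule.name)
		}
	}
}
```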
Advanced features
- Aggregation: group similar alerts to cut down noise
- Inhibition: higher-priority alerts suppress lower-priority ones
- Silencing: mute alerts during maintenance windows or for known issues
- Escalation: multi-level escalation policies and notification channels (see the sketch after this list)
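The demo above routes by severity but does not implement time-based escalation. A minimal, standalone sketch of one common approach: notify a level, wait for an acknowledgement, and escalate to the next tier on timeout. Levels, timeouts, and the acknowledgement callback are illustrative.

```go
package main

import (
	"fmt"
	"time"
)

// escalationLevel describes who gets notified and how long to wait for an
// acknowledgement before moving on to the next level.
type escalationLevel struct {
	receiver  string
	ackWithin time.Duration
}

// waitForAck polls the acknowledgement callback until it returns true or the timeout expires.
func waitForAck(timeout time.Duration, acked func() bool) bool {
	deadline := time.Now().Add(timeout)
	for time.Now().Before(deadline) {
		if acked() {
			return true
		}
		time.Sleep(50 * time.Millisecond) // poll interval, shortened for the demo
	}
	return false
}

// escalate walks the levels in order until the alert is acknowledged.
func escalate(alertName string, levels []escalationLevel, acked func() bool) {
	for i, level := range levels {
		fmt.Printf("level %d: notifying %s about %q\n", i+1, level.receiver, alertName)
		if waitForAck(level.ackWithin, acked) {
			fmt.Println("alert acknowledged, stopping escalation")
			return
		}
		fmt.Printf("no ack within %v, escalating\n", level.ackWithin)
	}
	fmt.Println("all escalation levels exhausted")
}

func main() {
	levels := []escalationLevel{
		{receiver: "slack #alerts", ackWithin: 300 * time.Millisecond}, // demo-scale timeouts
		{receiver: "on-call engineer (PagerDuty)", ackWithin: 300 * time.Millisecond},
		{receiver: "engineering manager", ackWithin: 300 * time.Millisecond},
	}
	start := time.Now()
	// Simulated acknowledgement that arrives after ~500ms, i.e. during level 2.
	acked := func() bool { return time.Since(start) > 500*time.Millisecond }
	escalate("ServiceDown", levels, acked)
}
```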
Best practices
- Sensible thresholds: avoid alert fatigue and false positives (see the hysteresis sketch after this list)
- Runbooks: every alert should link to a handling guide
- Regular review: analyze alert effectiveness and tune the rules
- Failure drills: verify that the alerting pipeline itself is reliable
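One practical way to reduce flapping and alert fatigue, sketched below with illustrative thresholds: use hysteresis, i.e. separate fire and resolve thresholds, so an alert does not toggle every time a value hovers around a single cutoff.

```go
package main

import "fmt"

// hysteresisAlert fires above fireAt and only resolves once the value drops
// below resolveAt, which prevents flapping around a single threshold.
type hysteresisAlert struct {
	fireAt    float64
	resolveAt float64
	firing    bool
}

// observe feeds a new value and reports whether the alert is currently firing.
func (h *hysteresisAlert) observe(value float64) bool {
	switch {
	case !h.firing && value > h.fireAt:
		h.firing = true
	case h.firing && value < h.resolveAt:
		h.firing = false
	}
	return h.firing
}

func main() {
	// Fire above a 10% error rate, resolve only below 5%.
	alert := &hysteresisAlert{fireAt: 0.10, resolveAt: 0.05}
	for _, rate := range []float64{0.04, 0.12, 0.09, 0.11, 0.04} {
		fmt.Printf("error rate %.2f -> firing=%v\n", rate, alert.observe(rate))
	}
}
```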
🔍 Interview Preparation Tips
- Understand monitoring fundamentals: know what each class of metric means and how it is collected
- Know the mainstream tools: Prometheus, Grafana, and the surrounding ecosystem
- Be able to design alerting strategies: sensible rules and escalation paths
- Build real incident experience: hands-on troubleshooting and alert tuning
- Study SRE practice: monitoring and alerting best practices from reliability engineering
