Prometheus Metrics Collection in Depth
Metrics collection is the foundation of the Prometheus monitoring stack: its diverse collection mechanisms and service discovery capabilities make comprehensive monitoring coverage of complex distributed systems possible.
🎯 Service Discovery Mechanisms in Depth
Kubernetes Service Discovery
```yaml
# Prometheus configuration - pod-level discovery
scrape_configs:
- job_name: 'kubernetes-pods'
kubernetes_sd_configs:
- role: pod
namespaces:
names:
- default
- kube-system
- monitoring
relabel_configs:
# Only scrape pods annotated with prometheus.io/scrape="true"
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
action: keep
regex: true
# Use a custom port from the prometheus.io/port annotation
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
action: replace
target_label: __address__
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
# Use a custom metrics path from the prometheus.io/path annotation
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
# Use a custom scheme (http/https) from the prometheus.io/scheme annotation
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scheme]
action: replace
target_label: __scheme__
regex: (https?)
# Attach useful identifying labels
- source_labels: [__meta_kubernetes_pod_name]
target_label: pod
- source_labels: [__meta_kubernetes_namespace]
target_label: namespace
- source_labels: [__meta_kubernetes_pod_label_app]
target_label: app
- source_labels: [__meta_kubernetes_pod_label_version]
target_label: version
# Map all pod labels onto the target as label_<name>
- regex: __meta_kubernetes_pod_label_(.+)
action: labelmap
replacement: label_${1}
```
```yaml
# Service-level discovery
- job_name: 'kubernetes-services'
kubernetes_sd_configs:
- role: service
relabel_configs:
# Control scraping via Service annotations
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
action: keep
regex: true
# Build the scrape address from service name, namespace, and the port annotation
- source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_namespace, __meta_kubernetes_service_annotation_prometheus_io_port]
action: replace
target_label: __address__
regex: (.+);(.+);(.+)
replacement: $1.$2.svc.cluster.local:$3
# Service identity labels
- source_labels: [__meta_kubernetes_service_name]
target_label: service
- source_labels: [__meta_kubernetes_namespace]
target_label: namespace
# Map Service labels onto the target
- regex: __meta_kubernetes_service_label_(.+)
action: labelmap
replacement: service_label_${1}
```
Advanced Service Discovery Patterns
```yaml
# Endpoints discovery - more precise target selection
- job_name: 'kubernetes-endpoints'
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
# Keep only endpoints in the ready state
- source_labels: [__meta_kubernetes_endpoint_ready]
action: keep
regex: true
# Use the Service annotation to decide whether to scrape
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
action: keep
regex: true
# Match by port name
- source_labels: [__meta_kubernetes_endpoint_port_name]
action: keep
regex: (metrics|prometheus|monitoring)
# Rewrite the target address to the fixed metrics port 9090
- source_labels: [__address__, __meta_kubernetes_endpoint_port_name]
action: replace
target_label: __address__
regex: ([^:]+)(?::\d+)?;(.*)
replacement: ${1}:9090
# Attach endpoint metadata
- source_labels: [__meta_kubernetes_endpoint_hostname]
target_label: hostname
- source_labels: [__meta_kubernetes_endpoint_port_name]
target_label: port_name
```
```yaml
# Node-level system monitoring (kubelet)
- job_name: 'kubernetes-nodes'
kubernetes_sd_configs:
- role: node
relabel_configs:
# Map node labels onto the target
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
# Rewrite the address to the kubelet port (10250)
- source_labels: [__address__]
action: replace
target_label: __address__
regex: ([^:]+):(.*)
replacement: ${1}:10250
# Set the metrics path
- target_label: __metrics_path__
replacement: /metrics
# Scrape over HTTPS
- target_label: __scheme__
replacement: https
# Tag the node role (only matches when the label value is literally "true")
- source_labels: [__meta_kubernetes_node_label_node_role_kubernetes_io_master]
action: replace
target_label: node_role
regex: true
replacement: master
# TLS configuration for scraping the kubelet
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true  # only needed when the kubelet serving cert is not signed by the cluster CA
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
```
Dynamic Configuration Management
Configuration Hot Reload and Management
```yaml
configuration_management:
hot_reload:
methods:
api_call:
endpoint: "POST /-/reload"
description: "通过HTTP API触发配置重载"
example: "curl -X POST http://localhost:9090/-/reload"
signal:
signal: "SIGHUP"
description: "发送信号触发重载"
example: "kill -HUP <prometheus_pid>"
file_watch:
mechanism: "文件系统监控"
description: "自动检测配置文件变更"
configuration: "--web.enable-lifecycle"
validation:
syntax_check:
- "YAML格式验证"
- "配置项合法性检查"
- "引用文件存在性验证"
dry_run:
command: "promtool check config prometheus.yml"
description: "配置文件语法检查"
rule_validation:
command: "promtool check rules alert.rules.yml"
description: "告警规则语法检查"
dynamic_service_discovery:
file_sd:
configuration:
scrape_configs:
- job_name: 'file-sd'
file_sd_configs:
- files:
- '/etc/prometheus/file_sd/*.json'
- '/etc/prometheus/file_sd/*.yml'
refresh_interval: 30s
target_file_example: |
[
{
"targets": [
"web-server-1:9100",
"web-server-2:9100"
],
"labels": {
"env": "production",
"team": "backend",
"datacenter": "us-west-1"
}
},
{
"targets": [
"db-server-1:9104",
"db-server-2:9104"
],
"labels": {
"env": "production",
"team": "database",
"role": "mysql"
}
}
]
consul_sd:
configuration:
scrape_configs:
- job_name: 'consul'
consul_sd_configs:
- server: 'consul.service.consul:8500'
services: ['web', 'api', 'database']
tags: ['prometheus', 'monitoring']
relabel_configs:
- source_labels: ['__meta_consul_tags']
regex: '.*,prometheus,.*'
action: keep
- source_labels: ['__meta_consul_service']
target_label: job
dns_sd:
configuration:
scrape_configs:
- job_name: 'dns-sd'
dns_sd_configs:
- names:
- 'web.example.com'
- 'api.example.com'
type: 'A'
port: 9100
refresh_interval: 30s
configuration_templating:
helm_templates:
values_yaml: |
prometheus:
scrapeConfigs:
kubernetesServices:
enabled: true
namespaces: ["default", "kube-system"]
kubernetesIngresses:
enabled: true
relabelConfigs:
- sourceLabels: [__meta_kubernetes_ingress_annotation_prometheus_io_scrape]
action: keep
regex: true
template_example: |
{{- if .Values.prometheus.scrapeConfigs.kubernetesServices.enabled }}
- job_name: 'kubernetes-services'
kubernetes_sd_configs:
- role: service
namespaces:
names:
{{- range .Values.prometheus.scrapeConfigs.kubernetesServices.namespaces }}
- {{ . }}
{{- end }}
{{- end }}
```
📊 The Exporter Ecosystem
Official Exporters
```yaml
node_exporter:
description: "System and hardware metrics collector"
installation:
binary: "Run the binary directly"
docker: "Containerized deployment"
systemd: "Managed as a systemd service"
key_metrics:
system:
- "node_cpu_seconds_total: CPU time spent, by mode"
- "node_memory_MemTotal_bytes: total memory"
- "node_memory_MemAvailable_bytes: available memory"
- "node_disk_io_time_seconds_total: disk I/O time"
network:
- "node_network_receive_bytes_total: bytes received"
- "node_network_transmit_bytes_total: bytes transmitted"
- "node_network_up: interface operational state"
filesystem:
- "node_filesystem_size_bytes: total filesystem size"
- "node_filesystem_avail_bytes: filesystem space available"
- "node_filesystem_files: total filesystem inodes"
configuration:
collectors:
default_enabled:
- "cpu"
- "diskstats"
- "filesystem"
- "loadavg"
- "meminfo"
- "netdev"
- "stat"
optional:
- "systemd: systemd unit states"
- "processes: process information"
- "interrupts: interrupt statistics"
- "tcpstat: TCP connection statistics"
startup_flags:
- "--collector.systemd"
- "--collector.processes"
- "--no-collector.hwmon"
- "--collector.filesystem.ignored-mount-points='^/(sys|proc|dev|host|etc)($|/)'"
```
```yaml
cadvisor_monitoring:
description: "容器指标收集器"
deployment:
kubernetes_daemonset: |
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: cadvisor
namespace: monitoring
spec:
  selector:
    matchLabels:
      app: cadvisor
  template:
    metadata:
      labels:
        app: cadvisor
    spec:
containers:
- name: cadvisor
image: gcr.io/cadvisor/cadvisor:latest
ports:
- containerPort: 8080
protocol: TCP
resources:
requests:
memory: 200Mi
cpu: 150m
limits:
memory: 2000Mi
cpu: 300m
volumeMounts:
- name: rootfs
mountPath: /rootfs
readOnly: true
- name: var-run
mountPath: /var/run
readOnly: true
- name: sys
mountPath: /sys
readOnly: true
- name: docker
mountPath: /var/lib/docker
readOnly: true
volumes:
- name: rootfs
hostPath:
path: /
- name: var-run
hostPath:
path: /var/run
- name: sys
hostPath:
path: /sys
- name: docker
hostPath:
path: /var/lib/docker
key_metrics:
container_resource:
- "container_cpu_usage_seconds_total"
- "container_memory_usage_bytes"
- "container_memory_working_set_bytes"
- "container_fs_usage_bytes"
container_network:
- "container_network_receive_bytes_total"
- "container_network_transmit_bytes_total"
- "container_network_receive_packets_dropped_total"应用程序指标导出
```yaml
go_application_metrics:
prometheus_client:
installation: 'go get github.com/prometheus/client_golang'
basic_setup: |
package main
import (
"net/http"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
)
var (
httpRequestsTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "http_requests_total",
Help: "Total number of HTTP requests.",
},
[]string{"method", "status"},
)
httpRequestDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "http_request_duration_seconds",
Help: "HTTP request latency in seconds.",
Buckets: prometheus.DefBuckets,
},
[]string{"method", "endpoint"},
)
)
func init() {
prometheus.MustRegister(httpRequestsTotal)
prometheus.MustRegister(httpRequestDuration)
}
func handler(w http.ResponseWriter, r *http.Request) {
timer := prometheus.NewTimer(httpRequestDuration.WithLabelValues(r.Method, r.URL.Path))
defer timer.ObserveDuration()
// business logic goes here
httpRequestsTotal.WithLabelValues(r.Method, "200").Inc()
}
func main() {
http.HandleFunc("/", handler)
http.Handle("/metrics", promhttp.Handler())
http.ListenAndServe(":8080", nil)
}
custom_metrics:
business_metrics: |
var (
userRegistrations = prometheus.NewCounter(
prometheus.CounterOpts{
Name: "user_registrations_total",
Help: "Total number of user registrations",
},
)
activeUsers = prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "active_users_current",
Help: "Current number of active users",
},
)
orderProcessingTime = prometheus.NewHistogram(
prometheus.HistogramOpts{
Name: "order_processing_seconds",
Help: "Time spent processing orders",
Buckets: []float64{0.1, 0.5, 1, 2, 5, 10},
},
)
)
```
```yaml
java_application_metrics:
micrometer_integration:
dependencies: |
<dependency>
<groupId>io.micrometer</groupId>
<artifactId>micrometer-registry-prometheus</artifactId>
<version>1.11.0</version>
</dependency>
spring_boot_config: |
@Configuration
public class MetricsConfig {
@Bean
public MeterRegistry meterRegistry() {
return new PrometheusMeterRegistry(PrometheusConfig.DEFAULT);
}
@Bean
public TimedAspect timedAspect(MeterRegistry registry) {
return new TimedAspect(registry);
}
}
@RestController
public class UserController {
private final Counter userRegistrationCounter;
private final Timer orderProcessingTimer;
public UserController(MeterRegistry meterRegistry) {
this.userRegistrationCounter = Counter.builder("user.registrations")
.description("Number of user registrations")
.tag("version", "v1")
.register(meterRegistry);
this.orderProcessingTimer = Timer.builder("order.processing.time")
.description("Order processing time")
.register(meterRegistry);
}
@PostMapping("/register")
@Timed(value = "user.registration.time", description = "User registration time")
public ResponseEntity<User> registerUser(@RequestBody User user) {
Timer.Sample sample = Timer.start();
try {
// user registration logic
userRegistrationCounter.increment();
return ResponseEntity.ok(user);
} finally {
sample.stop(orderProcessingTimer);
}
}
}
```
Database and Middleware Monitoring
```yaml
mysql_monitoring:
mysqld_exporter:
installation:
docker: |
docker run -d \
--name mysql-exporter \
-p 9104:9104 \
-e DATA_SOURCE_NAME="user:password@(localhost:3306)/" \
prom/mysqld-exporter
configuration: |
# my.cnf-style client configuration used by the exporter
[client]
user=prometheus
password=secret
host=localhost
port=3306
key_metrics:
connection:
- "mysql_global_status_threads_connected"
- "mysql_global_status_max_used_connections"
- "mysql_global_variables_max_connections"
performance:
- "mysql_global_status_queries"
- "mysql_global_status_slow_queries"
- "mysql_global_status_innodb_buffer_pool_read_requests"
replication:
- "mysql_slave_lag_seconds"
- "mysql_slave_sql_running"
- "mysql_slave_io_running"
custom_queries:
note: "mysqld_exporter does not run arbitrary SQL; use a companion tool such as sql_exporter for queries like this"
database_sizes: |
# custom-queries.sql
SELECT
table_schema,
SUM(data_length + index_length) AS size_bytes
FROM information_schema.tables
WHERE table_schema NOT IN ('information_schema', 'performance_schema', 'mysql', 'sys')
GROUP BY table_schema;
```
```yaml
redis_monitoring:
redis_exporter:
deployment: |
apiVersion: apps/v1
kind: Deployment
metadata:
name: redis-exporter
spec:
  replicas: 1
  selector:
    matchLabels:
      app: redis-exporter
  template:
    metadata:
      labels:
        app: redis-exporter
    spec:
containers:
- name: redis-exporter
image: oliver006/redis_exporter:latest
env:
- name: REDIS_ADDR
value: "redis://redis-service:6379"
- name: REDIS_PASSWORD
valueFrom:
secretKeyRef:
name: redis-secret
key: password
ports:
- containerPort: 9121
name: metrics
key_metrics:
memory:
- "redis_memory_used_bytes"
- "redis_memory_used_rss_bytes"
- "redis_memory_max_bytes"
connections:
- "redis_connected_clients"
- "redis_blocked_clients"
- "redis_rejected_connections_total"
operations:
- "redis_commands_processed_total"
- "redis_keyspace_hits_total"
- "redis_keyspace_misses_total"
persistence:
- "redis_rdb_last_save_timestamp_seconds"
- "redis_aof_last_rewrite_duration_sec"🔧 高级收集模式
Using the Pushgateway
Pushgateway Use Cases
```yaml
pushgateway_usage:
use_cases:
batch_jobs:
description: "批处理作业指标推送"
example: |
#!/bin/bash
# 批处理脚本
JOB_NAME="backup-job"
INSTANCE="backup-server-1"
PUSHGATEWAY_URL="http://pushgateway:9091"
# Start timestamp
start_time=$(date +%s)
echo "backup_job_start_time ${start_time}" | \
curl --data-binary @- \
"${PUSHGATEWAY_URL}/metrics/job/${JOB_NAME}/instance/${INSTANCE}"
# Run the backup
if perform_backup; then
status=1 # success
echo "backup_job_success ${status}" | \
curl --data-binary @- \
"${PUSHGATEWAY_URL}/metrics/job/${JOB_NAME}/instance/${INSTANCE}"
else
status=0 # failure
echo "backup_job_success ${status}" | \
curl --data-binary @- \
"${PUSHGATEWAY_URL}/metrics/job/${JOB_NAME}/instance/${INSTANCE}"
fi
# End timestamp
end_time=$(date +%s)
duration=$((end_time - start_time))
echo "backup_job_duration_seconds ${duration}" | \
curl --data-binary @- \
"${PUSHGATEWAY_URL}/metrics/job/${JOB_NAME}/instance/${INSTANCE}"
cronjobs:
description: "定时任务监控"
kubernetes_example: |
apiVersion: batch/v1
kind: CronJob
metadata:
name: data-sync-job
spec:
schedule: "0 2 * * *"
jobTemplate:
spec:
template:
spec:
containers:
- name: data-sync
image: data-sync:latest
env:
- name: PUSHGATEWAY_URL
value: "http://pushgateway.monitoring:9091"
command:
- /bin/bash
- -c
- |
start_time=$(date +%s)
if /app/sync-data.sh; then
success=1
else
success=0
fi
end_time=$(date +%s)
duration=$((end_time - start_time))
cat <<EOF | curl --data-binary @- \
${PUSHGATEWAY_URL}/metrics/job/data-sync/instance/${HOSTNAME}
data_sync_last_success_timestamp ${end_time}
data_sync_duration_seconds ${duration}
data_sync_success ${success}
EOF
restartPolicy: OnFailure
best_practices:
naming_conventions:
- "Use descriptive job names"
- "Include information about the execution environment"
- "Avoid dynamically generated labels"
data_management:
- "Delete stale metric groups promptly"
- "Choose a sensible push frequency"
- "Avoid pushing large numbers of high-cardinality metrics"
error_handling:
- "Retry pushes that fail"
- "Handle network failures gracefully"
- "Cache metric data locally when the gateway is unreachable"
pushgateway_configuration:
prometheus_config: |
scrape_configs:
- job_name: 'pushgateway'
static_configs:
- targets: ['pushgateway:9091']
scrape_interval: 15s
honor_labels: true # keep the job/instance labels pushed by clients
deployment: |
apiVersion: apps/v1
kind: Deployment
metadata:
name: pushgateway
spec:
  replicas: 1
  selector:
    matchLabels:
      app: pushgateway
  template:
    metadata:
      labels:
        app: pushgateway
    spec:
containers:
- name: pushgateway
image: prom/pushgateway:latest
args:
- "--persistence.file=/pushgateway/metrics"
ports:
- containerPort: 9091
resources:
requests:
memory: "64Mi"
cpu: "50m"
limits:
memory: "512Mi"
cpu: "200m"
volumeMounts:
- name: data
mountPath: /pushgateway
volumes:
- name: data
persistentVolumeClaim:
claimName: pushgateway-data
```
Custom Metrics Collection
```yaml
jmx_monitoring:
jmx_exporter:
description: "Java JMX指标导出"
configuration: |
# jmx_config.yml
rules:
- pattern: 'java.lang<type=Memory><>HeapMemoryUsage: (\w+)'
name: jvm_memory_heap_$1
type: GAUGE
- pattern: 'java.lang<type=GarbageCollector, name=(.+)><>CollectionCount'
name: jvm_gc_collections_total
labels:
collector: '$1'
type: COUNTER
- pattern: 'java.lang<type=Threading><>ThreadCount'
name: jvm_threads_current
type: GAUGE
# Tomcat metrics
- pattern: 'Catalina<type=Manager, host=(.+), context=(.+)><>activeSessions'
name: tomcat_sessions_active
labels:
host: '$1'
context: '$2'
type: GAUGE
java_agent: |
# JVM startup flags
java -javaagent:jmx_prometheus_javaagent-0.17.0.jar=8080:jmx_config.yml \
-jar your-application.jar
kubernetes_javaagent: |
apiVersion: apps/v1
kind: Deployment
spec:
template:
spec:
containers:
- name: app
image: java-app:latest
env:
- name: JAVA_OPTS
value: "-javaagent:/opt/jmx_exporter/jmx_prometheus_javaagent.jar=8080:/opt/jmx_exporter/config.yml"
ports:
- containerPort: 8080
name: metrics
```
```yaml
snmp_monitoring:
snmp_exporter:
description: "SNMP设备监控"
generator_config: |
# generator.yml
modules:
if_mib:
walk:
- 1.3.6.1.2.1.2.2.1.2 # ifDescr
- 1.3.6.1.2.1.2.2.1.10 # ifInOctets
- 1.3.6.1.2.1.2.2.1.16 # ifOutOctets
- 1.3.6.1.2.1.2.2.1.8 # ifOperStatus
metrics:
- name: ifInOctets
oid: 1.3.6.1.2.1.2.2.1.10
type: counter
indexes:
- labelname: ifIndex
type: gauge
- name: ifOutOctets
oid: 1.3.6.1.2.1.2.2.1.16
type: counter
indexes:
- labelname: ifIndex
type: gauge
prometheus_config: |
scrape_configs:
- job_name: 'snmp'
static_configs:
- targets:
- 192.168.1.1 # switch IP
- 192.168.1.2 # router IP
metrics_path: /snmp
params:
module: [if_mib]
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: snmp-exporter:9116
```
📋 Metrics Collection Interview Topics
Fundamentals
What are the advantages of Prometheus's pull model?
- Centralized configuration management
- Built-in health checking of targets
- Simpler network topology
- Visible target state (see the minimal scrape job sketched below)
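As a minimal illustration of the pull model, here is a sketch of a static scrape job; the job name, targets, and intervals are hypothetical. Prometheus records an `up` series for every scraped target, which is what makes target state directly visible.

```yaml
# Minimal pull-model sketch (hypothetical job and target names)
scrape_configs:
  - job_name: 'demo-app'          # logical group of targets
    scrape_interval: 15s          # how often Prometheus pulls /metrics
    scrape_timeout: 10s
    static_configs:
      - targets: ['demo-app-1:8080', 'demo-app-2:8080']
        labels:
          env: staging
# Each scrape also produces up{job="demo-app", instance="..."} = 1 or 0,
# so target health can be alerted on without extra instrumentation.
```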
How is automatic service discovery implemented in Kubernetes?
- Pod, Service, and Endpoints discovery roles
- Relabeling and filtering rules
- Annotation-driven configuration (the workload side is sketched below)
- Dynamic target management
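For reference, a sketch of the workload side of annotation-driven discovery, assuming the conventional `prometheus.io/*` annotation names used by the relabel rules earlier in this section; the Deployment name and image are placeholders.

```yaml
# Pod template annotations matching the relabel rules shown earlier
apiVersion: apps/v1
kind: Deployment
metadata:
  name: demo-app                        # hypothetical workload name
spec:
  selector:
    matchLabels:
      app: demo-app
  template:
    metadata:
      labels:
        app: demo-app
      annotations:
        prometheus.io/scrape: "true"    # picked up by the keep rule
        prometheus.io/port: "8080"      # rewritten into __address__
        prometheus.io/path: "/metrics"  # rewritten into __metrics_path__
    spec:
      containers:
        - name: app
          image: demo-app:latest        # hypothetical image
          ports:
            - containerPort: 8080
```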
When should the Pushgateway be used?
- Short-lived jobs and batch tasks
- Environments constrained by firewalls
- Services that cannot expose metrics themselves
- Network-isolated scenarios
Implementation
How do you instrument an application with Prometheus metrics?
- Choosing and using a client library
- Applying the right metric types
- Label design best practices
- Controlling the performance overhead
What are the common relabel_configs patterns?
- Target filtering with the keep/drop actions
- Label rewriting with the replace action
- Label mapping with labelmap
- Regular-expression matching techniques (see the combined sketch below)
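A compact sketch that puts the common actions side by side; the namespace pattern, label names, and regexes are illustrative only.

```yaml
relabel_configs:
  # keep: scrape only targets whose annotation is "true"
  - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
    action: keep
    regex: true
  # drop: exclude anything in a namespace ending in -test
  - source_labels: [__meta_kubernetes_namespace]
    action: drop
    regex: .*-test
  # replace: copy a discovered label into a stable target label
  - source_labels: [__meta_kubernetes_pod_name]
    action: replace
    target_label: pod
  # labelmap: bulk-copy pod labels, prefixing them with label_
  - action: labelmap
    regex: __meta_kubernetes_pod_label_(.+)
    replacement: label_${1}
```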
How do you monitor databases and middleware?
- Selecting and configuring official exporters
- Designing custom metrics
- Managing connection pools and credentials
- Identifying the key performance indicators
Practical Application
How do you optimize metrics collection in large environments?
- Tuning scrape intervals and timeouts
- Target sharding and load distribution
- Reducing network bandwidth usage
- Limiting the impact on storage (see the sketch below)
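One way to express these knobs in the scrape configuration; the job name and numeric limits below are illustrative values, not recommendations.

```yaml
global:
  scrape_interval: 30s        # a longer interval lowers the ingest rate
  scrape_timeout: 10s
scrape_configs:
  - job_name: 'bulk-targets'  # hypothetical job
    sample_limit: 5000        # fail the scrape if a target exposes too many samples
    label_limit: 30           # guard against label explosions
    kubernetes_sd_configs:
      - role: pod
```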
How do you handle high-cardinality metrics?
- Label design principles
- Metric aggregation strategies
- Sampling and filtering techniques
- Cost control methods (see the metric_relabel_configs sketch below)
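`metric_relabel_configs` is the usual place to drop or trim high-cardinality series after a scrape; the metric and label names in this sketch are hypothetical.

```yaml
scrape_configs:
  - job_name: 'demo-app'
    static_configs:
      - targets: ['demo-app:8080']
    metric_relabel_configs:
      # drop an entire high-cardinality metric
      - source_labels: [__name__]
        action: drop
        regex: http_request_duration_seconds_bucket
      # remove a per-request label such as a raw URL path
      - action: labeldrop
        regex: request_path
```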
How do you secure the monitoring system?
- Authentication and authorization mechanisms
- Network isolation and encryption
- Protecting sensitive information
- Access control policies (see the sketch below)
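A sketch of scrape-side authentication and TLS, assuming credentials are mounted at the placeholder paths shown; adapt it to your own secret management.

```yaml
scrape_configs:
  - job_name: 'secured-app'            # hypothetical job
    scheme: https
    tls_config:
      ca_file: /etc/prometheus/certs/ca.crt
      insecure_skip_verify: false
    basic_auth:
      username: prometheus
      password_file: /etc/prometheus/secrets/scrape-password
    static_configs:
      - targets: ['secured-app:8443']
```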
🔗 Related Content
- Prometheus Architecture - overall architecture and core concepts
- Advanced PromQL - in-depth use of the query language
- Alerting Rule Design - metric-based alert configuration
- Performance Optimization - collection performance tuning strategies
Mastering Prometheus's metrics collection mechanisms is the basis for building an effective monitoring system. With well-designed service discovery configuration and metric design, even complex distributed systems can be monitored comprehensively.
