---
# ============================================================
# Prometheus alerting rules
# ============================================================

groups:
# ==================== Host alerts ====================
- name: host_alerts
  rules:
  # Average CPU utilisation per instance above 80% for 5 minutes.
  - alert: HighCPUUsage
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "主机 CPU 使用率过高 ({{ $labels.instance }})"
      description: "CPU 使用率超过 80%,当前值: {{ $value | printf \"%.2f\" }}%"

  # Average CPU utilisation per instance above 95% for 2 minutes.
  - alert: CriticalCPUUsage
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "主机 CPU 使用率严重过高 ({{ $labels.instance }})"
      description: "CPU 使用率超过 95%,当前值: {{ $value | printf \"%.2f\" }}%"

  # Memory utilisation (1 - available / total) above 85% for 5 minutes.
  - alert: HighMemoryUsage
    expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "主机内存使用率过高 ({{ $labels.instance }})"
      description: "内存使用率超过 85%,当前值: {{ $value | printf \"%.2f\" }}%"

  # Root filesystem ("/") utilisation above 90% for 5 minutes.
  - alert: HighDiskUsage
    expr: (1 - (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"})) * 100 > 90
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "主机磁盘空间不足 ({{ $labels.instance }})"
      description: "磁盘使用率超过 90%,当前值: {{ $value | printf \"%.2f\" }}%"

  # Disk IO wait above 10% for 5 minutes.
  # node_cpu_seconds_total is reported per CPU core, so the rate is averaged
  # across cores per instance (as the CPU usage rules above do); otherwise the
  # rule raises one alert per core and evaluates single cores, not the host.
  - alert: HighDiskIOWait
    expr: avg by(instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "磁盘 IO 等待过高 ({{ $labels.instance }})"
      description: "IO 等待超过 10%,当前值: {{ $value | printf \"%.2f\" }}%"

# ==================== Container alerts ====================
- name: container_alerts
  rules:
  # Container CPU usage above 80% (of one core, summed per container name)
  # for 5 minutes.
  - alert: ContainerHighCPU
    expr: (sum by(name) (rate(container_cpu_usage_seconds_total{name!=""}[5m])) * 100) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "容器 CPU 使用率过高 ({{ $labels.name }})"
      description: "容器 {{ $labels.name }} CPU 使用率超过 80%"

  # Container memory usage above 85% of its configured limit for 5 minutes.
  # The "> 0" filter drops containers without a memory limit: their limit is
  # reported as 0, which would make the ratio +Inf and fire permanently.
  - alert: ContainerHighMemory
    expr: (container_memory_usage_bytes{name!=""} / (container_spec_memory_limit_bytes{name!=""} > 0) * 100) > 85
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "容器内存使用率过高 ({{ $labels.name }})"
      description: "容器 {{ $labels.name }} 内存使用率超过 85%"

  # Container restarted more than 3 times within the last hour.
  # Restarts are counted as changes of the container start timestamp.
  # container_last_seen is a "last seen" Unix timestamp that advances
  # continuously for every running container, so increase() on it always
  # exceeds the threshold and cannot be used to count restarts.
  - alert: ContainerFrequentRestart
    expr: changes(container_start_time_seconds{name!=""}[1h]) > 3
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "容器频繁重启 ({{ $labels.name }})"
      description: "容器 {{ $labels.name }} 在过去1小时内重启超过3次"

# ==================== Service alerts ====================
- name: service_alerts
  rules:
  # Scrape target of a "wm-*" job has been down for 1 minute.
  - alert: ServiceDown
    expr: up{job=~"wm-.*"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "服务宕机 ({{ $labels.job }})"
      description: "服务 {{ $labels.job }} 已停止响应超过1分钟"

  # 95th percentile response time of a service above 5s for 5 minutes.
  # Buckets are summed per job (keeping "le") before histogram_quantile so the
  # quantile is computed over all requests of the service, not separately for
  # every label combination of the bucket series.
  - alert: ServiceSlowResponse
    expr: histogram_quantile(0.95, sum by(job, le) (rate(http_server_requests_seconds_bucket{job=~"wm-.*"}[5m]))) > 5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "服务响应缓慢 ({{ $labels.job }})"
      description: "服务 {{ $labels.job }} 95% 请求响应时间超过5秒"

  # Share of HTTP 5xx responses per job above 5% for 5 minutes.
  - alert: HighErrorRate
    expr: |
      (
        sum by(job) (rate(http_server_requests_seconds_count{status=~"5.."}[5m]))
        /
        sum by(job) (rate(http_server_requests_seconds_count[5m]))
      ) * 100 > 5
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "服务错误率过高 ({{ $labels.job }})"
      description: "服务 {{ $labels.job }} 5xx 错误率超过 5%"

# ==================== Database alerts ====================
- name: database_alerts
  rules:
  # PostgreSQL connections above 80% of max_connections for 5 minutes.
  # pg_stat_activity_count carries per-database/per-state labels that
  # pg_settings_max_connections does not have, so a plain division matches no
  # series and never fires. Both sides are aggregated to (instance, job) first.
  - alert: PostgresHighConnections
    expr: |
      (
        sum by(instance, job) (pg_stat_activity_count)
        /
        min by(instance, job) (pg_settings_max_connections)
      ) * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "PostgreSQL 连接数过高"
      description: "数据库连接数使用率超过 80%"

  # Any PostgreSQL deadlock recorded within the last 5 minutes.
  - alert: PostgresDeadlock
    expr: increase(pg_stat_database_deadlocks[5m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "PostgreSQL 检测到死锁"
      description: "数据库发生死锁"

  # Redis memory usage above 85% of maxmemory for 5 minutes.
  # The "> 0" filter skips instances without a configured maxmemory: they
  # report 0, which would make the ratio +Inf and fire permanently.
  - alert: RedisHighMemory
    expr: redis_memory_used_bytes / (redis_memory_max_bytes > 0) * 100 > 85
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Redis 内存使用率过高"
      description: "Redis 内存使用率超过 85%"

# ==================== Message queue alerts ====================
- name: mq_alerts
  rules:
  # Kafka consumer group lag above 10000 messages for 10 minutes.
  # kafka_consumergroup_lag_sum (kafka_exporter) labels the group as
  # "consumergroup"; "$labels.group" would render as an empty string.
  # NOTE(review): if relabeling renames this label in your setup, adjust here.
  - alert: KafkaConsumerLag
    expr: kafka_consumergroup_lag_sum > 10000
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "Kafka 消费延迟过高"
      description: "消费者组 {{ $labels.consumergroup }} 延迟超过 10000 条消息"

  # EMQX connection count above 10000 for 5 minutes.
  - alert: EmqxHighConnections
    expr: emqx_connections_count > 10000
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "EMQX 连接数过高"
      description: "MQTT 连接数超过 10000"