# ============================================================
# Prometheus alerting rules
#
# Each group below bundles the alerts for one layer of the stack
# (hosts, containers, services, databases, message queues).
# Annotation strings are user-facing and intentionally left in
# their original language.
# ============================================================

groups:
  # ==================== Host alerts ====================
  - name: host_alerts
    rules:
      # CPU usage > 80% sustained for 5 minutes.
      # Usage is derived as 100 minus the per-instance average idle share.
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "主机 CPU 使用率过高 ({{ $labels.instance }})"
          description: "CPU 使用率超过 80%,当前值: {{ $value | printf \"%.2f\" }}%"

      # CPU usage > 95% sustained for 2 minutes.
      - alert: CriticalCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "主机 CPU 使用率严重过高 ({{ $labels.instance }})"
          description: "CPU 使用率超过 95%,当前值: {{ $value | printf \"%.2f\" }}%"

      # Memory usage > 85% (based on MemAvailable vs. MemTotal).
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "主机内存使用率过高 ({{ $labels.instance }})"
          description: "内存使用率超过 85%,当前值: {{ $value | printf \"%.2f\" }}%"

      # Root filesystem usage > 90%.
      - alert: HighDiskUsage
        expr: |
          (
            1 - (
              node_filesystem_avail_bytes{mountpoint="/"}
              / node_filesystem_size_bytes{mountpoint="/"}
            )
          ) * 100 > 90
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "主机磁盘空间不足 ({{ $labels.instance }})"
          description: "磁盘使用率超过 90%,当前值: {{ $value | printf \"%.2f\" }}%"

      # Disk IO wait > 10%.
      # Averaged per instance (like the CPU rules above) so the alert
      # describes the host as a whole instead of firing once per CPU core.
      - alert: HighDiskIOWait
        expr: avg by(instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "磁盘 IO 等待过高 ({{ $labels.instance }})"
          description: "IO 等待超过 10%,当前值: {{ $value | printf \"%.2f\" }}%"

  # ==================== Container alerts ====================
  - name: container_alerts
    rules:
      # Container CPU usage > 80% (summed over all series sharing a name;
      # 100 corresponds to one CPU-second consumed per second).
      - alert: ContainerHighCPU
        expr: (sum by(name) (rate(container_cpu_usage_seconds_total{name!=""}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "容器 CPU 使用率过高 ({{ $labels.name }})"
          description: "容器 {{ $labels.name }} CPU 使用率超过 80%"

      # Container memory usage > 85% of its configured limit.
      # The `> 0` filter drops containers whose limit is reported as 0;
      # dividing by 0 would yield +Inf and fire the alert spuriously.
      - alert: ContainerHighMemory
        expr: |
          (
            container_memory_usage_bytes{name!=""}
            / (container_spec_memory_limit_bytes{name!=""} > 0)
            * 100
          ) > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "容器内存使用率过高 ({{ $labels.name }})"
          description: "容器 {{ $labels.name }} 内存使用率超过 85%"

      # Container restarted more than 3 times within the last hour.
      # Counts how often the start timestamp changed. The previous
      # increase(container_last_seen[1h]) was applied to a timestamp gauge
      # and was therefore above the threshold for every running container.
      # NOTE(review): assumes the deployed cAdvisor updates
      # container_start_time_seconds on restart -- confirm in your setup.
      - alert: ContainerFrequentRestart
        expr: changes(container_start_time_seconds{name!=""}[1h]) > 3
        # Fire immediately; the 1h window already provides the smoothing.
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "容器频繁重启 ({{ $labels.name }})"
          description: "容器 {{ $labels.name }} 在过去1小时内重启超过3次"

  # ==================== Service alerts ====================
  - name: service_alerts
    rules:
      # Service down: scrape target of any wm-* job unreachable for 1 minute.
      - alert: ServiceDown
        expr: up{job=~"wm-.*"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "服务宕机 ({{ $labels.job }})"
          description: "服务 {{ $labels.job }} 已停止响应超过1分钟"

      # 95th percentile response time > 5s.
      # NOTE(review): no aggregation is applied before histogram_quantile,
      # so the quantile is evaluated per bucket label set rather than per
      # job -- confirm this granularity is intended.
      - alert: ServiceSlowResponse
        expr: histogram_quantile(0.95, rate(http_server_requests_seconds_bucket{job=~"wm-.*"}[5m])) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "服务响应缓慢 ({{ $labels.job }})"
          description: "服务 {{ $labels.job }} 95% 请求响应时间超过5秒"

      # HTTP 5xx error rate > 5% of all requests, per job.
      - alert: HighErrorRate
        expr: |
          (
            sum by(job) (rate(http_server_requests_seconds_count{status=~"5.."}[5m]))
            /
            sum by(job) (rate(http_server_requests_seconds_count[5m]))
          ) * 100 > 5
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "服务错误率过高 ({{ $labels.job }})"
          description: "服务 {{ $labels.job }} 5xx 错误率超过 5%"

  # ==================== Database alerts ====================
  - name: database_alerts
    rules:
      # PostgreSQL connections > 80% of max_connections.
      # Both sides are aggregated to the same label set so the division
      # can match; label sets that differ between the two metrics would
      # otherwise produce an empty result and the alert would never fire.
      # NOTE(review): assumes postgres_exporter-style labels
      # (job/instance/server) -- confirm against the scraped series.
      - alert: PostgresHighConnections
        expr: |
          sum by(job, instance, server) (pg_stat_activity_count)
          / max by(job, instance, server) (pg_settings_max_connections)
          * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PostgreSQL 连接数过高"
          description: "数据库连接数使用率超过 80%"

      # PostgreSQL deadlock detected within the last 5 minutes.
      - alert: PostgresDeadlock
        expr: increase(pg_stat_database_deadlocks[5m]) > 0
        # Fire immediately on the first detected deadlock.
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "PostgreSQL 检测到死锁"
          description: "数据库发生死锁"

      # Redis memory usage > 85% of the configured maximum.
      # The `> 0` filter skips instances whose maximum is reported as 0;
      # dividing by 0 would yield +Inf and fire the alert spuriously.
      - alert: RedisHighMemory
        expr: redis_memory_used_bytes / (redis_memory_max_bytes > 0) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Redis 内存使用率过高"
          description: "Redis 内存使用率超过 85%"

  # ==================== Message queue alerts ====================
  - name: mq_alerts
    rules:
      # Kafka consumer lag above 10000 messages for 10 minutes.
      # NOTE(review): the description reads $labels.group; verify that the
      # exporter really exposes a `group` label on this metric (some
      # exporters name it `consumergroup`), otherwise it renders empty.
      - alert: KafkaConsumerLag
        expr: kafka_consumergroup_lag_sum > 10000
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Kafka 消费延迟过高"
          description: "消费者组 {{ $labels.group }} 延迟超过 10000 条消息"

      # EMQX (MQTT broker) connection count above 10000.
      - alert: EmqxHighConnections
        expr: emqx_connections_count > 10000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "EMQX 连接数过高"
          description: "MQTT 连接数超过 10000"