智慧水务管理系统 - 精河县供水工程综合管理平台

alert_rules.yml 6.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184
  # ============================================================
  # Prometheus alerting rules
  #
  # Root of the rule file: every entry under `groups` is a named
  # rule group containing a list of alerting rules.
  # ============================================================
  groups:
  # ==================== Host alerts ====================
  # Node-level rules built on node_cpu_*, node_memory_* and
  # node_filesystem_* series. Every rule reports per `instance`.
  - name: host_alerts
    rules:
    # CPU usage above 80% sustained for 5 minutes.
    # Usage is derived as 100 minus the average idle percentage of all cores.
    - alert: HighCPUUsage
      expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "主机 CPU 使用率过高 ({{ $labels.instance }})"
        description: "CPU 使用率超过 80%,当前值: {{ $value | printf \"%.2f\" }}%"
    # CPU usage above 95% sustained for 2 minutes.
    - alert: CriticalCPUUsage
      expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: "主机 CPU 使用率严重过高 ({{ $labels.instance }})"
        description: "CPU 使用率超过 95%,当前值: {{ $value | printf \"%.2f\" }}%"
    # Memory usage above 85% (based on MemAvailable, not MemFree).
    - alert: HighMemoryUsage
      expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "主机内存使用率过高 ({{ $labels.instance }})"
        description: "内存使用率超过 85%,当前值: {{ $value | printf \"%.2f\" }}%"
    # Disk usage above 90%. Only the filesystem mounted at "/" is checked.
    - alert: HighDiskUsage
      expr: (1 - (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"})) * 100 > 90
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: "主机磁盘空间不足 ({{ $labels.instance }})"
        description: "磁盘使用率超过 90%,当前值: {{ $value | printf \"%.2f\" }}%"
    # Disk IO wait above 10%.
    # The iowait rate is averaged across cores per instance so the rule
    # yields one alert per host instead of one alert per CPU core.
    - alert: HighDiskIOWait
      expr: avg by(instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "磁盘 IO 等待过高 ({{ $labels.instance }})"
        description: "IO 等待超过 10%,当前值: {{ $value | printf \"%.2f\" }}%"
  # ==================== Container alerts ====================
  # Per-container rules; `name!=""` drops series that carry no
  # container name. Every rule reports per `name`.
  - name: container_alerts
    rules:
    # Container CPU usage above 80%.
    # The value is CPU seconds consumed per second times 100, summed per
    # container name, so 100 corresponds to one fully used core.
    - alert: ContainerHighCPU
      expr: (sum by(name) (rate(container_cpu_usage_seconds_total{name!=""}[5m])) * 100) > 80
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "容器 CPU 使用率过高 ({{ $labels.name }})"
        description: "容器 {{ $labels.name }} CPU 使用率超过 80%"
    # Container memory usage above 85% of its configured limit.
    # The `> 0` filter drops containers without a memory limit: their limit
    # is reported as 0, which would make the ratio +Inf and fire permanently.
    - alert: ContainerHighMemory
      expr: (container_memory_usage_bytes{name!=""} / (container_spec_memory_limit_bytes{name!=""} > 0) * 100) > 85
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "容器内存使用率过高 ({{ $labels.name }})"
        description: "容器 {{ $labels.name }} 内存使用率超过 85%"
    # Container restarted more than 3 times within the last hour.
    # Restarts are counted as changes of the container start timestamp.
    # container_last_seen must not be used here: it is a last-seen timestamp
    # gauge, so increase() over it is always far greater than 3.
    - alert: ContainerFrequentRestart
      expr: changes(container_start_time_seconds{name!=""}[1h]) > 3
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: "容器频繁重启 ({{ $labels.name }})"
        description: "容器 {{ $labels.name }} 在过去1小时内重启超过3次"
  # ==================== Service alerts ====================
  # Rules for application services. The availability and latency rules
  # select scrape jobs whose name matches `wm-.*`.
  - name: service_alerts
    rules:
    # Service down: the scrape target has been unreachable for 1 minute.
    - alert: ServiceDown
      expr: up{job=~"wm-.*"} == 0
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: "服务宕机 ({{ $labels.job }})"
        description: "服务 {{ $labels.job }} 已停止响应超过1分钟"
    # 95th percentile response time above 5 seconds.
    # Buckets are summed by (job, le) before histogram_quantile so the
    # percentile is computed per service rather than per individual
    # bucket series (one per extra label combination).
    - alert: ServiceSlowResponse
      expr: histogram_quantile(0.95, sum by(job, le) (rate(http_server_requests_seconds_bucket{job=~"wm-.*"}[5m]))) > 5
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "服务响应缓慢 ({{ $labels.job }})"
        description: "服务 {{ $labels.job }} 95% 请求响应时间超过5秒"
    # HTTP 5xx error rate above 5% of all requests, per job.
    - alert: HighErrorRate
      expr: |
        (
          sum by(job) (rate(http_server_requests_seconds_count{status=~"5.."}[5m]))
          /
          sum by(job) (rate(http_server_requests_seconds_count[5m]))
        ) * 100 > 5
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: "服务错误率过高 ({{ $labels.job }})"
        description: "服务 {{ $labels.job }} 5xx 错误率超过 5%"
  # ==================== Database alerts ====================
  # Rules for PostgreSQL and Redis.
  - name: database_alerts
    rules:
    # PostgreSQL connections above 80% of max_connections.
    # Both sides are aggregated to `instance` first: the activity count is
    # split across extra labels (database, state) that the max_connections
    # setting does not carry, so a direct division matches no series and
    # the alert could never fire.
    - alert: PostgresHighConnections
      expr: sum by(instance) (pg_stat_activity_count) / max by(instance) (pg_settings_max_connections) * 100 > 80
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "PostgreSQL 连接数过高"
        description: "数据库连接数使用率超过 80%"
    # PostgreSQL deadlock: the deadlock counter grew in the last 5 minutes.
    - alert: PostgresDeadlock
      expr: increase(pg_stat_database_deadlocks[5m]) > 0
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: "PostgreSQL 检测到死锁"
        description: "数据库发生死锁"
    # Redis memory usage above 85% of maxmemory.
    # The `> 0` filter skips instances with no maxmemory configured; they
    # report a limit of 0, which would make the ratio +Inf and fire forever.
    - alert: RedisHighMemory
      expr: redis_memory_used_bytes / (redis_memory_max_bytes > 0) * 100 > 85
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Redis 内存使用率过高"
        description: "Redis 内存使用率超过 85%"
  # ==================== Message queue alerts ====================
  # Rules for Kafka consumer lag and EMQX (MQTT) connection count.
  - name: mq_alerts
    rules:
    # Kafka consumer lag above 10000 messages for 10 minutes.
    # The description reads the `consumergroup` label.
    # NOTE(review): this assumes the metric comes from kafka_exporter, which
    # names the label `consumergroup` rather than `group` - confirm against
    # the deployed exporter.
    - alert: KafkaConsumerLag
      expr: kafka_consumergroup_lag_sum > 10000
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: "Kafka 消费延迟过高"
        description: "消费者组 {{ $labels.consumergroup }} 延迟超过 10000 条消息"
    # EMQX connection count above 10000 for 5 minutes.
    - alert: EmqxHighConnections
      expr: emqx_connections_count > 10000
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "EMQX 连接数过高"
        description: "MQTT 连接数超过 10000"