# Alerting rule group "pods.rules": the alert entries that follow belong to it.
# NOTE(review): `rules:` below sits at the same column as `groups:`, so a YAML
# parser reads it as a separate top-level key instead of a field of the
# "pods.rules" group entry. The nesting looks lost - confirm against the
# deployed rule file.
groups:
- name: pods.rules
rules:
# Fires when a pod has reported phase "Failed" for at least 30 seconds.
# The series are aggregated per (pod, namespace), which are the only labels
# available to the summary template.
- alert: PodFailed
  expr: >-
    sum by(pod, namespace) (kube_pod_status_phase{phase="Failed"})
    > 0
  for: 30s
  labels:
    severity: error
  annotations:
    summary: >-
      命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }}
      Pod状态Failed (当前值: {{ $value }})
# Fires when a scrape target has reported `up == 0` for at least 1 minute.
# The description states the same duration as `for:` so the notification
# does not overstate how long the target has been down.
- alert: InstanceDown
  expr: up == 0
  for: 1m
  labels:
    severity: error
  annotations:
    description: '{{ $labels.instance }} job {{ $labels.job }} 已经停止1分钟以上.'
    # Quoted so the template braces can never be read as YAML flow syntax.
    summary: 'Instance {{ $labels.instance }} 停止工作'
# Fires when a container's CPU usage stays above the threshold for 5 minutes.
# Series whose `cluster` label matches "test" or "job" are excluded, as are
# series with an empty `image` label.
# NOTE(review): rate(cpu_seconds) * 100 is the percentage of ONE CPU core,
# not of the container's CPU limit - confirm this is the intended meaning
# of "90%".
- alert: PodCPUUsage
  expr: >-
    sum by(pod, namespace, cluster, container)
    (rate(container_cpu_usage_seconds_total{cluster!~"(test|job)",image!=""}[10m]) * 100)
    > 90
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: >-
      命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }}
      容器:{{ $labels.container }} CPU使用大于90% (当前值: {{ $value }})
# Fires when a container's RSS stays above 80% of its memory limit for
# 5 minutes. The ratio is RSS divided by the spec memory limit, times 100.
# `!= +Inf` discards results where the division produced infinity
# (presumably containers whose limit is reported as 0 - verify against the
# exporter) before the `> 80` threshold is applied.
- alert: PodMemoryUsage
  expr: >-
    sum by(pod, namespace, container) (container_memory_rss{image!=""}) /
    sum by(pod, namespace, container) (container_spec_memory_limit_bytes{image!=""}) *
    100 != +Inf > 80
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: >-
      命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }}
      容器:{{ $labels.container }} 内存使用大于80% (当前值: {{ $value }})
# Fires when a pod's inbound traffic stays above the threshold for 5 minutes.
# The byte rate is divided by 1000, so the value is in kB/s and the 30000
# threshold corresponds to the 30MB/s quoted in the summary.
- alert: PodNetworkReceive
  expr: >-
    sum by(pod, namespace)
    (rate(container_network_receive_bytes_total{image!="",name=~"^k8s_.*"}[5m]) / 1000)
    > 30000
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: >-
      命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }}
      入口流量大于30MB/s (当前值: {{ $value }}K/s)
# Fires when a pod's outbound traffic stays above the threshold for 5 minutes.
# The byte rate is divided by 1000, so the value is in kB/s and the 30000
# threshold corresponds to the 30MB/s quoted in the summary.
- alert: PodNetworkTransmit
  expr: >-
    sum by(pod, namespace)
    (rate(container_network_transmit_bytes_total{image!="",name=~"^k8s_.*"}[5m]) / 1000)
    > 30000
  for: 5m
  labels:
    severity: warning
  annotations:
    # Unit suffix is "K/s" (no leading slash), matching PodNetworkReceive.
    summary: >-
      命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }}
      出口流量大于30MB/s (当前值: {{ $value }}K/s)
# Fires when a pod has reported phase "Pending" for at least 1 minute.
# The series are aggregated per (pod, namespace), which are the only labels
# available to the summary template.
- alert: PodPending
  expr: >-
    sum by(pod, namespace) (kube_pod_status_phase{phase="Pending"})
    > 0
  for: 1m
  labels:
    severity: error
  annotations:
    summary: >-
      命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }}
      Pod状态Pending (当前值: {{ $value }})
# Fires when the container restart counter of a pod has changed recently.
# The lookback window ([5m]) is deliberately wider than the `for:` duration.
# With a [1m] window and `for: 1m`, a single restart dropped out of the window
# before the pending period could elapse, so the alert could only fire when
# restarts happened back-to-back.
# NOTE(review): 5m is a judgement call - tune it to the scrape interval and
# the desired sensitivity.
- alert: PodRestart
  expr: >-
    sum by(pod, namespace)
    (changes(kube_pod_container_status_restarts_total[5m]))
    > 0
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: >-
      命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }}
      Pod重启 (当前值: {{ $value }})