# cat prometheus-rules.yaml
# ConfigMap "prometheus-rules" (namespace "monitoring").
# Every key under `data` is one Prometheus rule file stored as a literal block
# scalar; each file defines a single rule group containing a single alert.
# Comments are kept outside the block scalars so the stored rule text itself
# carries no extra lines.
apiVersion: v1
data:
  # CPU usage per instance: 100 minus the average idle percentage over all
  # matched series. Pages when the result stays above 80 for 5m.
  cpu-usage.rules: |
    groups:
    - name: cpu-usage.rules
      rules:
      - alert: CPU使用率过高(大于80%)
        expr: (100 - (avg by(instance) (irate(node_cpu{mode="idle",name="node-exporter"}[5m]))* 100)) > 80
        for: 5m
        labels:
          severity: page
        annotations:
          description: 'CPU使用率过高: {{ $value }}'
          summary: 'CPU使用率过高: {{ $value }}'
          value: '{{ $value }}'

  # MySQL availability: warns when mysql_up has been anything other than 1
  # for 2m.
  mysql-usage.rules: |
    groups:
    - name: mysql-usage.rules
      rules:
      - alert: mysql状态没有正常up
        expr: mysql_up != 1
        for: 2m
        labels:
          severity: warning
        annotations:
          description: 'mysql状态没有正常up: {{ $value }}'
          summary: 'mysql状态没有正常up: {{ $value }}'
          value: '{{ $value }}'

  # Warns when the absolute difference between ODL_ESTABLISHED_NUMBER_6640 and
  # ODL_ESTABLISHED_NUMBER_6633 stays above 200 for 5m.
  odl.rules: |
    groups:
    - name: odl.rules
      rules:
      - alert: OVSDB跟OPENFLOW数量差异过大
        expr: abs(ODL_ESTABLISHED_NUMBER_6640 - ODL_ESTABLISHED_NUMBER_6633) > 200
        for: 5m
        labels:
          severity: warning
        annotations:
          description: 'OVSDB跟OPENFLOW数量差异过大: {{ $value }}'
          summary: 'OVSDB跟OPENFLOW数量差异过大: {{ $value }}'
          value: '{{ $value }}'

  # Memory usage: (MemTotal - MemFree - Cached - Buffers) / MemTotal as a
  # percentage. Warns when it stays above 75 for 5m.
  memory.rules: |
    groups:
    - name: memory.rules
      rules:
      - alert: 内存使用率过高(>75%)
        expr: (node_memory_MemTotal - node_memory_MemFree - node_memory_Cached - node_memory_Buffers) / node_memory_MemTotal* 100 > 75
        for: 5m
        labels:
          severity: warning
        annotations:
          description: '内存使用率过高: {{ $value }}'
          summary: '内存使用率过高: {{ $value }}'
          value: '{{ $value }}'

  # Warns when rate(mysql_global_status_max_used_connections[5m]) stays above
  # 40 for 2m.
  # NOTE(review): rate() returns a per-second value, so 40 means 40 per
  # second; if the intent is "grew by more than 40 within 5m", delta() would
  # be the matching function - confirm before changing the threshold.
  mysql-used-connections.rules: |
    groups:
    - name: mysql-used-connections.rules
      rules:
      - alert: mysql的used_connections变化过大
        expr: rate(mysql_global_status_max_used_connections[5m])> 40
        for: 2m
        labels:
          severity: warning
        annotations:
          description: 'mysql的used_connections变化过大: {{ $value }}'
          summary: 'mysql的used_connections变化过大: {{ $value }}'
          value: '{{ $value }}'

  # Disk usage of the filesystem mounted at "/": (size - avail) / size as a
  # percentage. Warns when it stays above 75 for 5m.
  diskhighuse.rules: |
    groups:
    - name: diskhighuse.rules
      rules:
      - alert: 磁盘使用率过高(>75%)
        expr: (node_filesystem_size{mountpoint="/"} - node_filesystem_avail{mountpoint="/"}) / node_filesystem_size{mountpoint="/"}* 100 > 75
        for: 5m
        labels:
          severity: warning
        annotations:
          description: '磁盘使用率过高: {{ $value }}'
          summary: '磁盘使用率过高: {{ $value }}'
          value: '{{ $value }}'

  # Warns when node_disk_io_now for device "sda" stays above 50 for 1m.
  # NOTE(review): the alert name and annotations talk about an iowait
  # percentage, but the expression compares node_disk_io_now against 50 -
  # confirm this metric is the one intended.
  diskiohighuse.rules: |
    groups:
    - name: diskiohighuse.rules
      rules:
      - alert: iowait过高(>50%)
        expr: node_disk_io_now{device="sda"} > 50
        for: 1m
        labels:
          severity: warning
        annotations:
          description: '磁盘iowait过高: {{ $value }}'
          summary: '磁盘iowait过高: {{ $value }}'
          value: '{{ $value }}'

  # Warns when ODL_CLOSEWAIT_NUMBER_6640 exceeds 200. There is no `for`
  # clause, so the alert fires on the first evaluation that matches.
  ODL_OVSDB_closewait.rules: |
    groups:
    - name: ODL_OVSDB_closewait.rules
      rules:
      - alert: ODL_OVSDB_closewait数量过多
        expr: ODL_CLOSEWAIT_NUMBER_6640 > 200
        labels:
          severity: warning
        annotations:
          description: 'ODL_OVSDB_closewait数量过多: {{ $value }}'
          summary: 'ODL_OVSDB_closewait数量过多: {{ $value }}'
          value: '{{ $value }}'

  # Warns when ODL_ESTABLISHED_NUMBER_6633 deviates from its own average over
  # the range window by more than 2000 for 5m.
  # NOTE(review): the alert name and annotations say "previous 12 hours", but
  # the range selector is [1h] - confirm which one is intended and align them.
  Openflow.rules: |
    groups:
    - name: openflow.rules
      rules:
      - alert: OPENFLOW数量跟前12小时的平均数量差距过大
        expr: abs(ODL_ESTABLISHED_NUMBER_6633-avg_over_time(ODL_ESTABLISHED_NUMBER_6633[1h])) > 2000
        for: 5m
        labels:
          severity: warning
        annotations:
          description: 'OPENFLOW数量跟前12小时的平均数量差距过大: {{ $value }}'
          summary: 'OPENFLOW数量跟前12小时的平均数量差距过大: {{ $value }}'
          value: '{{ $value }}'

  # Warns when ODL_CLOSEWAIT_NUMBER_6633 exceeds 200. There is no `for`
  # clause, so the alert fires on the first evaluation that matches.
  ODL_OPENFLOW_closewait.rules: |
    groups:
    - name: ODL_OPENFLOW_closewait.rules
      rules:
      - alert: ODL_OPENFLOW_closewait数量过多
        expr: ODL_CLOSEWAIT_NUMBER_6633 > 200
        labels:
          severity: warning
        annotations:
          description: 'ODL_OPENFLOW_closewait数量过多: {{ $value }}'
          summary: 'ODL_OPENFLOW_closewait数量过多: {{ $value }}'
          value: '{{ $value }}'

  # Warns when node_boot_time changed at least once within the last 5m.
  # No `for` clause: fires on the first evaluation that matches.
  node_reboot.rules: |
    groups:
    - name: node_reboot.rules
      rules:
      - alert: node被重启
        expr: changes(node_boot_time[5m]) > 0
        labels:
          severity: warning
        annotations:
          description: 'node被重启: {{ $value }}'
          summary: 'node被重启: {{ $value }}'
          value: '{{ $value }}'

  # Warns when no redis_cluster_nodes_fail series has existed for 1m, i.e.
  # the metric is not being received at all.
  redis-cluster.rules: |
    groups:
    - name: redis-cluster.rules
      rules:
      - alert: redis_cluster一分钟内没有收到数据
        expr: absent(redis_cluster_nodes_fail)
        for: 1m
        labels:
          severity: warning
        annotations:
          description: 'redis_cluster一分钟内没有收到数据: {{ $value }}'
          summary: 'redis_cluster一分钟内没有收到数据: {{ $value }}'
          value: '{{ $value }}'

  # Warns when redis_cluster_nodes_connected changed within the last 5m and
  # the condition holds for 1m. changes() counts value transitions in either
  # direction; rate() is meant for counters and treats a decrease as a counter
  # reset, which would hide a drop to 0.
  redis_cluster_nodes.rules: |
    groups:
    - name: redis_cluster_nodes.rules
      rules:
      - alert: redis_cluster_nodes数量发生变化
        expr: changes(redis_cluster_nodes_connected[5m]) > 0
        for: 1m
        labels:
          severity: warning
        annotations:
          description: 'redis_cluster_nodes数量发生变化: {{ $value }}'
          summary: 'redis_cluster_nodes数量发生变化: {{ $value }}'
          value: '{{ $value }}'

  # Warns when redis_cluster_nodes_fail stays above 0 for 1m.
  redis_fail.rules: |
    groups:
    - name: redis_fail.rules
      rules:
      - alert: redis_cluster_nodes存在fail的数量
        expr: redis_cluster_nodes_fail > 0
        for: 1m
        labels:
          severity: warning
        annotations:
          description: 'redis_cluster_nodes存在fail的数量: {{ $value }}'
          summary: 'redis_cluster_nodes存在fail的数量: {{ $value }}'
          value: '{{ $value }}'
kind: ConfigMap
metadata:
  creationTimestamp: null
  name: prometheus-rules
  namespace: monitoring