# Reference: https://prometheus.io/docs/prometheus/latest/configuration/configuration/
# The rules below monitor the memory, CPU, disk and other state of hosts and containers.
# Prometheus alerting rules in three groups:
#   example   - target liveness (InstanceDown)
#   Host      - host memory, CPU, load, filesystem, disk I/O and network
#   Container - per-container CPU and memory
# Host summaries render the `appname` label and container summaries the `name`
# label. NOTE(review): confirm the scraped series actually carry those labels.
# Alert names containing a space are kept unchanged because they become the
# `alertname` label that routing and silences may match on.
groups:
  # Rule group definition.
  - name: example
    rules:
      # Alert name.
      - alert: InstanceDown
        # PromQL expression that triggers the rule.
        expr: up == 0
        for: 1m  # one minute
        # Labels carry the alert's severity and the kind of object affected.
        labels:
          name: instance
          severity: Critical
        annotations:
          # Alert summary: the `appname` label of the alerting series.
          summary: " {{ $labels.appname }}"
          # Alert message.
          description: " 服务停止运行 "
          # Current value of the alert expression. The expression compares
          # `up` with 0, so the value is not a percentage and no "%" is added.
          value: "{{ $value }}"

  - name: Host
    rules:
      # Memory in use (total minus free, buffers and cache) as a percentage
      # of total memory.
      - alert: "HostMemory Usage"
        expr: >-
          (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes
          + node_memory_Buffers_bytes + node_memory_Cached_bytes))
          / node_memory_MemTotal_bytes * 100 > 80
        for: 1m
        labels:
          name: Memory
          severity: Warning
        annotations:
          summary: " {{ $labels.appname }} "
          description: "宿主机内存使用率超过80%."
          value: "{{ $value }}"

      # Non-idle CPU time as a 0-1 fraction: averaged across the `cpu` label,
      # then summed per (instance, appname). rate() is used instead of irate()
      # so that a single quiet sample pair does not reset the `for` timer.
      - alert: "HostCPU Usage"
        expr: >-
          sum(avg without (cpu)(rate(node_cpu_seconds_total{mode!='idle'}[5m])))
          by (instance,appname) > 0.65
        for: 1m
        labels:
          name: CPU
          severity: Warning
        annotations:
          summary: " {{ $labels.appname }} "
          description: "宿主机CPU使用率超过65%."
          value: "{{ $value }}"

      # Fires when node_load5 stays above 4.
      - alert: HostLoad
        expr: node_load5 > 4
        for: 1m
        labels:
          name: Load
          severity: Warning
        annotations:
          summary: "{{ $labels.appname }} "
          description: " 主机负载5分钟超过4."
          value: "{{ $value }}"

      # Used share of each filesystem. Scaled to percent so that the "%"
      # suffix in the `value` annotation matches the number it is attached to
      # (the threshold is equivalent to the former ratio > 0.8).
      - alert: "HostFilesystem Usage"
        expr: >-
          (1 - node_filesystem_free_bytes / node_filesystem_size_bytes) * 100 > 80
        for: 1m
        labels:
          name: Disk
          severity: Warning
        annotations:
          summary: " {{ $labels.appname }} "
          description: " 宿主机 [ {{ $labels.mountpoint }} ]分区使用超过80%."
          value: "{{ $value }}%"

      # Completed write operations per second, averaged over 1 minute, for
      # series whose `job` label matches "Host".
      - alert: HostDiskio
        expr: >-
          rate(node_disk_writes_completed_total{job=~"Host"}[1m]) > 10
        for: 1m
        labels:
          name: Diskio
          severity: Warning
        annotations:
          summary: " {{ $labels.appname }} "
          description: " 宿主机 [{{ $labels.device }}]磁盘1分钟平均写入IO负载较高."
          value: "{{ $value }}iops"

      # Receive throughput averaged over 5 minutes. Bytes per second divided
      # by 1048576 gives MiB/s, so the annotations use that unit. The device
      # regex excludes the listed loopback/bond/bridge/veth-style interfaces.
      - alert: Network_receive
        expr: >-
          rate(node_network_receive_bytes_total{
          device!~"lo|bond[0-9]|cbr[0-9]|veth.*|virbr.*|ovs-system"
          }[5m]) / 1048576 > 3
        for: 1m
        labels:
          name: Network_receive
          severity: Warning
        annotations:
          summary: " {{ $labels.appname }} "
          description: " 宿主机 [{{ $labels.device }}] 网卡5分钟平均接收流量超过3MiB/s."
          value: "{{ $value }}MiB/s"

      # Transmit throughput averaged over 5 minutes, in MiB/s; same device
      # filter as Network_receive.
      - alert: Network_transmit
        expr: >-
          rate(node_network_transmit_bytes_total{
          device!~"lo|bond[0-9]|cbr[0-9]|veth.*|virbr.*|ovs-system"
          }[5m]) / 1048576 > 3
        for: 1m
        labels:
          name: Network_transmit
          severity: Warning
        annotations:
          summary: " {{ $labels.appname }} "
          description: " 宿主机 [{{ $labels.device }}] 网卡5分钟内平均发送流量超过3MiB/s."
          value: "{{ $value }}MiB/s"

  - name: Container
    rules:
      # Per-container CPU usage rate over 5 minutes, times 100, summed per
      # (name, instance); series with an empty `image` label are excluded.
      - alert: "ContainerCPU Usage"
        expr: >-
          (sum by(name,instance)
          (rate(container_cpu_usage_seconds_total{image!=""}[5m])) * 100) > 60
        for: 1m
        labels:
          name: CPU
          severity: Warning
        annotations:
          summary: "{{ $labels.name }} "
          description: " 容器CPU使用超过60%."
          value: "{{ $value }}%"

      # Memory usage of named containers in MiB (bytes / 1048576); fires
      # above 1024 MiB. The `value` annotation therefore carries a MiB suffix.
      - alert: "ContainerMem Usage"
        # Disabled alternative: usage net of cache as a percentage of the limit.
        # expr: (container_memory_usage_bytes - container_memory_cache) / container_spec_memory_limit_bytes * 100 > 10
        expr: >-
          container_memory_usage_bytes{name=~".+"} / 1048576 > 1024
        for: 1m
        labels:
          name: Memory
          severity: Warning
        annotations:
          summary: "{{ $labels.name }} "
          description: " 容器内存使用超过1GB."
          value: "{{ $value }}MiB"