• Prometheus+Grafana+Alertmanager监控告警(四)


    rule监控规则配置

    语法检查规则

    promtool check rules /path/to/example.rules.yml

    nodes.rules:

    # Alerting rules for the nodes.rules group.
    groups:
    - name: nodes.rules
      rules:
      # Fires when 100 minus the free/size percentage of an ext4 or xfs
      # filesystem has been above 80 for one minute.
      - alert: NodeFilesystemUsage
        for: 1m
        expr: >-
          100 -
          (node_filesystem_free_bytes{fstype=~"ext4|xfs"}
          / node_filesystem_size_bytes{fstype=~"ext4|xfs"})
          * 100 > 80
        annotations:
          description: >-
            节点: {{ $labels.instance }}: {{ $labels.mountpoint }}
            分区使用大于80% (当前值: {{ $value }})
          summary: >-
            Instance {{ $labels.instance }} : {{ $labels.mountpoint }}
            分区使用率过高
        labels:
          severity: warning
           
      # Fires when 100 minus the percentage of (MemFree + Cached + Buffers)
      # over MemTotal has been above 90 for one minute.
      - alert: NodeMemoryUsage
        expr: 100 - (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 > 90
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: 'Instance {{ $labels.instance }} 内存使用率过高'
          # The percentage in this text must match the threshold used in expr (90).
          description: '节点: {{ $labels.instance }}内存使用大于90% (当前值: {{ $value }})'
            
      # Fires when 100 minus the average idle-CPU rate percentage, grouped by
      # instance and cluster, has been above 80 for one minute.
      - alert: NodeCPUUsage
        expr: 100 - (avg by(instance, cluster) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: 'Instance {{ $labels.instance }} CPU使用率过高'
          # The percentage in this text must match the threshold used in expr (80).
          description: '节点: {{ $labels.instance }}CPU使用大于80% (当前值: {{ $value }})'
          
      # Fires when the Ready/true node condition series has been 0 for one minute.
      - alert: KubeNodeNotReady
        expr: kube_node_status_condition{condition="Ready",status="true"} == 0
        for: 1m
        labels:
          severity: error
        annotations:
          # The duration in this text must match the `for` clause above (1m).
          # NOTE(review): if a 10-minute grace period was intended, raise `for`
          # to 10m and restore the original wording instead.
          summary: '节点: {{ $labels.node }} 已经有1分钟以上没有准备好了.'
    View Code

    pods.rules:

    # Alerting rules for the pods.rules group.
    groups:
    - name: pods.rules
      rules:
      # Fires when the per-pod, per-namespace sum of the Failed phase series
      # has been above 0 for 30 seconds.
      - alert: PodFailed
        for: 30s
        expr: 'sum by(pod, namespace) (kube_pod_status_phase{phase="Failed"}) > 0'
        annotations:
          summary: >-
            命名空间: {{ $labels.namespace }} |
            Pod名称: {{ $labels.pod }}
            Pod状态Failed (当前值: {{ $value }})
        labels:
          severity: error
            
      # Fires when a scrape target's `up` series has been 0 for one minute.
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: error
        annotations:
          # The duration in this text must match the `for` clause above (1m).
          # NOTE(review): if a 5-minute grace period was intended, raise `for`
          # to 5m and restore the original wording instead.
          description: '{{ $labels.instance }} job {{ $labels.job }} 已经停止1分钟以上.'
          # Quoted like the other templated annotations in this file.
          summary: 'Instance {{ $labels.instance }} 停止工作'
          
      # Fires when the summed 10-minute CPU usage rate (times 100), grouped by
      # pod, namespace, cluster and container, has been above 90 for five
      # minutes. Series whose cluster label matches test or job, or whose image
      # label is empty, are excluded by the selector.
      - alert: PodCPUUsage
        for: 5m
        expr: >-
          sum by(pod, namespace, cluster, container)
          (rate(container_cpu_usage_seconds_total{cluster!~"(test|job)",image!=""}[10m]) * 100)
          > 90
        annotations:
          summary: >-
            命名空间: {{ $labels.namespace }} |
            Pod名称: {{ $labels.pod }}
            容器:{{ $labels.container }} CPU使用大于90% (当前值: {{ $value }})
        labels:
          severity: warning
            
      # Fires when RSS as a percentage of the memory limit, grouped by pod,
      # namespace and container, has been above 80 for five minutes.
      # The `!= +Inf` comparison filters out infinite ratios (presumably
      # containers whose limit is 0, i.e. unset - confirm).
      - alert: PodMemoryUsage
        for: 5m
        expr: >-
          sum by(pod, namespace, container) (container_memory_rss{image!=""})
          / sum by(pod, namespace, container)
          (container_spec_memory_limit_bytes{image!=""})
          * 100
          != +Inf > 80
        annotations:
          summary: >-
            命名空间: {{ $labels.namespace }} |
            Pod名称: {{ $labels.pod }}
            容器:{{ $labels.container }} 内存使用大于80% (当前值: {{ $value }})
        labels:
          severity: warning
            
      # Fires when the summed 5-minute receive byte rate divided by 1000,
      # grouped by pod and namespace, has been above 30000 for five minutes.
      - alert: PodNetworkReceive
        for: 5m
        expr: >-
          sum by(pod, namespace)
          (rate(container_network_receive_bytes_total{image!="",name=~"^k8s_.*"}[5m]) / 1000)
          > 30000
        annotations:
          summary: >-
            命名空间: {{ $labels.namespace }} |
            Pod名称: {{ $labels.pod }}
            入口流量大于30MB/s (当前值: {{ $value }}K/s)
        labels:
          severity: warning
            
      # Fires when the summed 5-minute transmit byte rate divided by 1000,
      # grouped by pod and namespace, has been above 30000 for five minutes.
      - alert: PodNetworkTransmit
        expr: sum
          by(pod, namespace) (rate(container_network_transmit_bytes_total{image!="",name=~"^k8s_.*"}[5m])
          / 1000) > 30000
        for: 5m
        labels:
          severity: warning
        annotations:
          # Unit suffix is K/s (no leading slash), matching the division by
          # 1000 in expr.
          summary: '命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod
            }} 出口流量大于30MB/s (当前值: {{ $value }}K/s)'
            
      # Fires when the per-pod, per-namespace sum of the Pending phase series
      # has been above 0 for one minute.
      - alert: PodPending
        for: 1m
        expr: 'sum by(pod, namespace) (kube_pod_status_phase{phase="Pending"}) > 0'
        annotations:
          summary: >-
            命名空间: {{ $labels.namespace }} |
            Pod名称: {{ $labels.pod }}
            Pod状态Pending (当前值: {{ $value }})
        labels:
          severity: error
            
      # Fires when the restart counter changed within the last minute, summed
      # by pod and namespace, and that has held for one minute.
      # NOTE(review): a 1m changes() window combined with `for: 1m` may rarely
      # stay true long enough to fire - confirm against the evaluation interval.
      - alert: PodRestart
        for: 1m
        expr: >-
          sum by(pod, namespace)
          (changes(kube_pod_container_status_restarts_total{}[1m]))
          > 0
        annotations:
          summary: >-
            命名空间: {{ $labels.namespace }} |
            Pod名称: {{ $labels.pod }}
            Pod重启 (当前值: {{ $value }})
        labels:
          severity: warning
    View Code
  • 相关阅读:
    入门OJ 4187【周末舞会】
    入门OJ 1532【排队取款】
    洛谷 P3029 [USACO11NOV]【牛的阵容Cow Lineup】
    洛谷 P1638【逛画展】
    入门OJ 1256【排队】
    PKU 1945【Power Hungry Cows】
    RocketMQ重试机制和消息
    Java操作RocketMQ
    RocketMQ概述
    重定向机制
  • 原文地址:https://www.cnblogs.com/litzhiai/p/15429801.html
Copyright © 2020-2023  润新知