• Kubernetes Prometheus rule


    告警规则


    常用告警规则配置

    • alerts

      # CPU alert rules
      groups:
      - name: CpuAlertRule
        rules:
        # Pod CPU: fires when any of the recorded series onecore:pod,
        # twocore:pod / 2 or squarecore:pod / 4 stays above 80 for 2 minutes.
        - alert: PodCPU告警
          for: 2m
          expr: 'onecore:pod > 80 or twocore:pod / 2 > 80 or squarecore:pod / 4 > 80'
          annotations:
            value: '{{$value}}%'
            description: 'CPU使用率大于80%'
            # Disabled alternative summary, kept verbatim for reference:
            #summary: 'CPU使用率大于80%,当前值为{{.Value}}%,CPU使用率: {{ printf `ceil(100 - ((avg by (instance)(irate(node_cpu_seconds_total{mode="idle",instance="%s"}[1m]))) *100))` $labels.instance | query | first | value }}%'
          labels:
            severity: warning
        # Node CPU: 100 minus the average idle percentage per kubernetes_node
        # (irate over 5m), rounded; fires when above 80 for 2 minutes.
        - alert: NodeCPU告警
          for: 2m
          expr: 'round(100-avg(irate(node_cpu_seconds_total{mode="idle"}[5m]))by(kubernetes_node)*100) > 80'
          annotations:
            value: '{{$value}}%'
            description: 'CPU使用率大于80%'
            # Disabled alternative summary, kept verbatim for reference:
            #summary: 'CPU使用率大于80%,当前值为{{.Value}}%,CPU使用率: {{ printf `ceil(100 - ((avg by (instance)(irate(node_cpu_seconds_total{mode="idle",instance="%s"}[1m]))) *100))` $labels.instance | query | first | value }}%'
          labels:
            severity: warning
      
      # Disk alert rules
      - name: DiskAlertRule
        rules:
        # Pod disk: container filesystem usage converted to GiB and expressed
        # as a percentage of a fixed 10 GiB; fires when above 85 for 1 minute.
        - alert: Pod磁盘告警
          for: 1m
          expr: 'round(container_fs_usage_bytes{container=~".+",container!~"POD"}/1024/1024/1024/10*100) > 85'
          annotations:
            value: '{{$value}}%'
            description: '磁盘使用率大于85%'
          labels:
            severity: warning
        # Node disk: used percentage (1 - avail/size) of ext*/nfs* filesystems
        # whose mountpoint does not contain "docker"; fires above 85 for 1 minute.
        - alert: Node磁盘告警
          for: 1m
          expr: 'round((1- node_filesystem_avail_bytes{fstype=~"ext.+|nfs.+",mountpoint!~".*docker.*"}/node_filesystem_size_bytes{fstype=~"ext.+|nfs.+",mountpoint!~".*docker.*"})*100) > 85'
          annotations:
            value: '{{$value}}%'
            description: '磁盘使用率大于85%'
          labels:
            severity: warning
      
      # Memory alert rules
      - name: MemAlertRule
        rules:
        # Pod memory: container memory usage as a percentage of its memory limit.
        # The limit series is filtered with "> 0" because a container without a
        # memory limit reports a limit of 0; dividing by it yields +Inf, which
        # would always satisfy "> 85" and raise a permanent false alert.
        # Both sides use the same exclusion regex (POD, *reload containers, csi pods).
        - alert: Pod内存告警
          expr: round(container_memory_usage_bytes{container=~".+",container!~"POD|.+reload",pod!~"^csi.+"} / (container_spec_memory_limit_bytes{container=~".+",container!~"POD|.+reload",pod!~"^csi.+"} > 0) * 100) > 85
          for: 2m
          labels:
            severity: warning
          annotations:
            description: "内存使用率大于85%"
            value: "{{$value}}%"
        # Node memory: 100 minus the MemAvailable percentage of MemTotal.
        # The description states the same 80% threshold the expression uses.
        - alert: Node内存告警
          expr: round(100-((node_memory_MemAvailable_bytes*100)/node_memory_MemTotal_bytes)) > 80
          for: 2m
          labels:
            severity: warning
          annotations:
            description: "内存使用率大于80%"
            value: "{{$value}}%"
      
      # Unexpected pod restarts
      - name: PodRestartAlertRule
        rules:
        # kube_pod_container_status_restarts_total is a counter, so the change
        # over the last minute is measured with increase() (counter-aware,
        # handles resets) rather than delta(), which is meant for gauges only.
        - alert: Pod重启告警
          expr: increase(kube_pod_container_status_restarts_total[1m]) > 0
          for: 1s
          labels:
            severity: warning
          annotations:
            description: "Pod发生意外重启事件"
      
      # JVM old-generation usage (CMS old GC)
      - name: PodJvmOldGCAlertRule
        rules:
        # Used / max bytes of every memory pool whose name ends in "Old Gen",
        # as a rounded percentage; fires when above 89 for 5 seconds.
        - alert: PodJvmCMSOldGC
          for: 5s
          expr: 'round((jvm_memory_pool_bytes_used{pool=~".+Old Gen"}/jvm_memory_pool_bytes_max{pool=~".+Old Gen"})*100) > 89'
          annotations:
            value: '{{$value}}%'
            description: 'Pod堆内存触发CMSOldGC'
          labels:
            severity: warning
      
      # Abnormal pod instance
      - name: ContainerInstanceAlertRule
        rules:
        # Fires for containers that are running (running == 1) but not ready
        # (ready == 0). The operands are ordered running - ready because a
        # container cannot be ready without running, so "ready - running > 0"
        # would never be true and the alert would never fire.
        # NOTE(review): confirm "running but not ready" is the intended condition.
        - alert: Pod实例异常
          expr: kube_pod_container_status_running - kube_pod_container_status_ready > 0
          for: 20s
          labels:
            severity: warning
          annotations:
            description: "Container实例异常"
      
      # Pod instance OOM
      - name: ContainerOOMAlertRule
        rules:
        # Fires when a container's terminated-reason series for "OOMKilled"
        # is above 0 for 1 second.
        - alert: Pod实例OOM
          for: 1s
          expr: 'kube_pod_container_status_terminated_reason{reason="OOMKilled"} > 0'
          annotations:
            description: 'Container实例OOM'
          labels:
            severity: warning
      
      # Pod instance eviction
      - name: ContainerEvictionAlertRule
        rules:
        # Fires when a container's terminated-reason series for "Evicted"
        # is above 0 for 1 second.
        - alert: Pod实例驱逐
          for: 1s
          expr: 'kube_pod_container_status_terminated_reason{reason="Evicted"} > 0'
          annotations:
            description: 'Container实例驱逐'
          labels:
            severity: warning
      
      # RabbitMQ memory alerts
      - name: MQMemoryAlertRule
        rules:
        # Fires when RabbitMQ's own memory alarm flag is set on a node.
        # The summary uses a plain Prometheus template, matching the
        # "{{$value}}" style used by the other annotations in this file; the
        # Helm-escaped form would be rendered as literal text by Prometheus.
        - alert: MQ内存水位线
          expr: rabbitmq_node_mem_alarm{job=~".*rabbitmq.*"} == 1
          for: 1s
          labels:
            severity: warning
          annotations:
            description: "RabbitMQ内存高水位线告警"
            summary: "RabbitMQ {{ $labels.instance }} High Memory Alarm is going off.  Which means the node hit highwater mark and has cut off network connectivity, see RabbitMQ WebUI"
        # Average used/limit memory ratio per (node, kubernetes_namespace) as a
        # rounded percentage. The aggregation keeps only those two labels, so
        # the summary refers to $labels.node ($labels.instance would be empty).
        - alert: MQ内存使用告警
          expr: round(avg(rabbitmq_node_mem_used{job=~".*rabbitmq.*"} / rabbitmq_node_mem_limit{job=~".*rabbitmq.*"})by(node,kubernetes_namespace)*100) > 90
          for: 10s
          labels:
            severity: warning
          annotations:
            description: "RabbitMQ使用告警"
            value: "{{$value}}%"
            summary: "RabbitMQ {{ $labels.node }} Memory Usage > 90%"
      
      # Pod Java process down
      - name: PodJavaProcessAlertRule
        rules:
        # Sums the "up" series of the kubernetes-pods-jvm scrape job per
        # container/pod name; fires when that sum is 0 for 10 seconds.
        - alert: PodJava进程异常
          for: 10s
          expr: 'sum(up{job="kubernetes-pods-jvm"})by(kubernetes_container_name,kubernetes_pod_name) == 0'
          annotations:
            summary: '赶快看看吧,顶不住了'
            description: 'PodJava进程异常'
          labels:
            severity: warning
    • recording_rules

      groups:
      - name: CpuRecordRules
        rules:
        # Each rule records the rounded per-container CPU usage in percent of
        # one core (irate of container_cpu_usage_seconds_total over 5m, * 100),
        # summed by pod, container, instance, namespace and name. The three
        # series differ only in which container names they select.
        #
        # onecore:pod - every container except the empty name, POD, and the
        # containers selected by the two rules below.
        - record: onecore:pod
          expr: 'round(sum by(pod, container, instance, namespace, name) (irate(container_cpu_usage_seconds_total{container!~"|POD|prod-xianxiang-edu-loan|prod-xy-fund|prod-common-callcenter|prod-risk-service|prod-qn-web-api|prod-xc-fund|prod-xc-user|sys-ingress|etcd|prod-qn-mp|prod-xc-common|prod-xianxiang-zuul|prod-qn-user|prod-xc-riskapi|prod-common-message|prod-common-trust-service|prod-xc-collection|kube-controller-manager|prod-qn-risk|prod-xy-zuul|metrics-server|prod-nflow-manager|kube-scheduler|prod-qn-gateway|prod-xc-pay|coredns|kube-apiserver|prod-qn-oms|prod-common-service|prod-nfsp-service|pord-ingress|prod-qn-cms|prod-internal-ingress|prod-xc-loan|prod-rabbitmq|prometheus-server"}[5m]) * 100))'
        # twocore:pod - the explicitly listed containers (presumably those
        # allotted two cores; verify against the workloads' CPU limits).
        - record: twocore:pod
          expr: 'round(sum by(pod, container, instance, namespace, name) (irate(container_cpu_usage_seconds_total{container=~"prod-xianxiang-edu-loan|prod-xy-fund|prod-common-callcenter|prod-risk-service|prod-qn-web-api|prod-xc-fund|prod-xc-user|sys-ingress|etcd|prod-qn-mp|prod-xc-common|prod-xianxiang-zuul|prod-qn-user|prod-xc-riskapi|prod-common-message|prod-common-trust-service|prod-xc-collection|kube-controller-manager|prod-qn-risk|prod-xy-zuul|metrics-server|prod-nflow-manager|kube-scheduler|prod-qn-gateway|prod-xc-pay|coredns|kube-apiserver|prod-qn-oms|prod-common-service|prod-nfsp-service|pord-ingress|prod-qn-cms|prod-internal-ingress|prod-xc-loan"}[5m]) * 100))'
        # squarecore:pod - prod-rabbitmq and prometheus-server only (presumably
        # the four-core containers; verify against the workloads' CPU limits).
        - record: squarecore:pod
          expr: 'round(sum by(pod, container, instance, namespace, name) (irate(container_cpu_usage_seconds_total{container=~"prod-rabbitmq|prometheus-server"}[5m]) * 100))'
  • 相关阅读:
    游戏开发之游戏策划的基本原则
    Lua游戏脚本语言入门
    游戏策划之游戏心理学理论深入浅出
    微博的10大特征包括哪些?
    普米族求助,十万火急!!! 请大家给力!!!
    剑指微博营销,速创品牌传奇
    将网络推广进行到底
    浅谈如何利用微博进行网站推广(转)
    “土风计划”,陈哲另一个伟大事业
    快速增加微博粉丝的十六大技巧
  • 原文地址:https://www.cnblogs.com/apink/p/16367699.html
Copyright © 2020-2023  润新知