• Prometheus 自定义监控项的报警规则


    prometheus rules:

      # Rule group holding the node-level and HTTP API alerts.
      # The list item is indented so that `name` and `rules` are sibling keys
      # of the same group mapping, matching the `rabbitmq-monitoring` group.
      - name: basic-and-important
        rules:
        # Fires when the per-instance CPU usage, computed as 100 minus the
        # average idle percentage over a 5m irate window, stays above 80%
        # for 10 minutes.
        - alert: NodeCPUUsage
          expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[5m])) * 100) > 80
          for: 10m
          labels:
            severity: critical
          # A single `annotations` mapping: duplicate keys are invalid YAML
          # and most parsers silently keep only the last one.
          annotations:
            description: '{{$labels.instance}} CPU usage is above 80% (current value is {{ $value }})'
        # Fires when memory usage, computed as 1 - MemAvailable / MemTotal
        # and expressed as a percentage, stays above 80% for 10 minutes.
        - alert: NodeMEMUsage
          # Folded scalar: the line break below folds to a single space.
          expr: >-
            ((1 - (node_memory_MemAvailable_bytes{job="node-exporter"}
            / (node_memory_MemTotal_bytes{job="node-exporter"}))) * 100) > 80
          for: 10m
          labels:
            severity: critical
          annotations:
            description: >-
              {{$labels.instance}} MEM usage is above 80%
              (current value is {{ $value }})
        # Fires when an ext4/xfs filesystem's usage, computed as
        # 1 - free / size and expressed as a percentage, stays above 80%
        # for 10 minutes.
        - alert: NodeDiskUsage
          # Folded scalar: the line break below folds to a single space.
          expr: >-
            (1-(node_filesystem_free_bytes{job="node-exporter",fstype=~"ext4|xfs"}
            / node_filesystem_size_bytes{job="node-exporter",fstype=~"ext4|xfs"}))*100 > 80
          for: 10m
          labels:
            severity: critical
          annotations:
            description: >-
              {{$labels.instance}} Disk usage is above 80%
              (current value is {{ $value }})
        # Fires when the mean request duration over the last minute
        # (increase of the _sum series divided by increase of the _count
        # series) exceeds 2 seconds for 1 minute. The health-check endpoint
        # is excluded.
        - alert: API response time per min
          expr: increase(http_server_requests_seconds_sum{uri!="/actuator/health"}[1m])/increase(http_server_requests_seconds_count{uri!="/actuator/health"}[1m])>2
          for: 1m
          labels:
            severity: critical
          annotations:
            # The series carry a `uri` label (the one filtered on in `expr`);
            # referencing `url` would render as an empty string.
            description: '{{$labels.job}} {{$labels.uri}}  response time more than 2s. current value is {{ $value }}'
        # Fires when more than one request with a status other than "200"
        # was counted in the last minute, sustained for 1 minute. The
        # health-check and Prometheus scrape endpoints are excluded.
        - alert: Count of API request times per min
          expr: increase(http_server_requests_seconds_count{uri!="/actuator/health",uri!="/actuator/prometheus",status!="200"}[1m])>1
          for: 1m
          labels:
            severity: critical
          annotations:
            # The series carry a `uri` label (the one filtered on in `expr`);
            # referencing `url` would render as an empty string.
            description: '{{$labels.job}} {{$labels.uri}}  request error times is {{ $value }} in recent one min'
      # Rule group holding the RabbitMQ alerts.
      - name: rabbitmq-monitoring
        rules:
        # Fires when a queue whose name does not end in "_DL" holds more
        # than 10 messages for 5 minutes.
        - alert: rabbitmq_queue_messages
          expr: >-
            rabbitmq_queue_messages{queue!~".*_DL"} > 10
          for: 5m
          labels:
            severity: critical
          annotations:
            description: >-
              queue name:{{$labels.queue}} is blocked.
              current count is {{ $value }}
        # Fires when the rabbitmq_consumer_error_total counter grew by more
        # than 10 within the last minute, sustained for 1 minute.
        - alert: rabbitmq_consumer_error_total
          expr: >-
            increase(rabbitmq_consumer_error_total[1m]) > 10
          for: 1m
          labels:
            severity: critical
          annotations:
            description: >-
              service name:{{$labels.job}} cannot consume the queues.
              current count is {{ $value }}
        # Fires when the rabbitmq_connection_recovery_total counter grew by
        # more than 10 within the last minute, sustained for 1 minute.
        - alert: rabbitmq_connection_recovery_total
          expr: >-
            increase(rabbitmq_connection_recovery_total[1m]) > 10
          for: 1m
          labels:
            severity: critical
          annotations:
            description: >-
              service name:{{$labels.job}} connection recovery total
              is {{ $value }}
  • 相关阅读:
    Jumpserver之安装在CentOS主机步骤
    Nginx负载均衡后端健康检查(支持HTTP和TCP)
    Nginx负载均衡后端健康检查
    ELK之使用packetbeat分析网络包流量
    ELK之使用heartbeat监控WEB站点
    ELK之elasticsearch导致CPU居高不下系统慢解决办法
    Saltstack如何修改主机名或者minion id
    mac中使用 sourcetree 的快速配置和git服务器登录
    [转]从三层架构迈向领域驱动设计 — 关于领域设计中描述相对简要及明了的一篇文章
    io-nio 区别示意图
  • 原文地址:https://www.cnblogs.com/malukang/p/12786507.html
Copyright © 2020-2023  润新知