• redis监控规则


    其他说明参考host主机监控规则:https://www.cnblogs.com/sanduzxcvbnm/p/13589848.html

    # Prometheus alerting rules.
    # Every description uses "\n" escapes inside a double-quoted scalar so that
    # the firing value and the label set are rendered on their own lines.
    # NOTE(review): the group is named after Redis but also carries Blackbox,
    # Traefik, etcd and OpenEBS rules; the name is kept unchanged.
    groups:
    - name: Redis monitoring
      rules:
      # --- Blackbox exporter ---
      # Probe reported failure (probe_success is 0) for 5 minutes.
      - alert: BlackboxProbeFailed
        expr: probe_success == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Blackbox probe failed (instance {{ $labels.instance }})"
          description: "Probe failed\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # 1-minute average probe duration above 1 second.
      - alert: BlackboxSlowProbe
        expr: avg_over_time(probe_duration_seconds[1m]) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Blackbox slow probe (instance {{ $labels.instance }})"
          description: "Blackbox probe took more than 1s to complete\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # HTTP status code outside the 200-399 range.
      - alert: BlackboxProbeHttpFailure
        expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Blackbox probe HTTP failure (instance {{ $labels.instance }})"
          description: "HTTP status code is not 200-399\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # Earliest certificate expiry is less than 30 days (86400 s * 30) away.
      - alert: BlackboxSslCertificateWillExpireSoon
        expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})"
          description: "SSL certificate expires in 30 days\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # Same alert name at critical severity: less than 3 days left.
      - alert: BlackboxSslCertificateWillExpireSoon
        expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})"
          description: "SSL certificate expires in 3 days\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # Expiry timestamp is at or before the current time.
      - alert: BlackboxSslCertificateExpired
        expr: probe_ssl_earliest_cert_expiry - time() <= 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Blackbox SSL certificate expired (instance {{ $labels.instance }})"
          description: "SSL certificate has expired already\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # 1-minute average HTTP probe duration above 1 second.
      - alert: BlackboxProbeSlowHttp
        expr: avg_over_time(probe_http_duration_seconds[1m]) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Blackbox probe slow HTTP (instance {{ $labels.instance }})"
          description: "HTTP request took more than 1s\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # 1-minute average ICMP probe duration above 1 second.
      - alert: BlackboxProbeSlowPing
        expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Blackbox probe slow ping (instance {{ $labels.instance }})"
          description: "Blackbox ping took more than 1s\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # --- Redis ---
      # redis_up has been 0 for 5 minutes.
      - alert: RedisDown
        expr: redis_up == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Redis down (instance {{ $labels.instance }})"
          description: "Redis instance is down\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # count() over an empty selection yields no sample at all, so a bare
      # "count(...) == 0" can never fire. "or vector(0)" supplies an explicit 0
      # when no series with role="master" exists.
      - alert: RedisMissingMaster
        expr: (count(redis_instance_info{role="master"}) or vector(0)) < 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Redis missing master (instance {{ $labels.instance }})"
          description: "Redis cluster has no node marked as master.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # More than one series reports role="master".
      - alert: RedisTooManyMasters
        expr: count(redis_instance_info{role="master"}) > 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Redis too many masters (instance {{ $labels.instance }})"
          description: "Redis cluster has too many nodes marked as master.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # Compares the number of redis_connected_slaves series with the sum of
      # their values, aggregated across instance and job.
      - alert: RedisDisconnectedSlaves
        expr: count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Redis disconnected slaves (instance {{ $labels.instance }})"
          description: "Redis not replicating for all slaves. Consider reviewing the redis replication status.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # The delta is taken over a 1-minute window, so after a single replica
      # loss it is negative for about one minute only. A 5-minute "for" could
      # therefore never be satisfied; the alert fires immediately instead.
      - alert: RedisReplicationBroken
        expr: delta(redis_connected_slaves[1m]) < 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Redis replication broken (instance {{ $labels.instance }})"
          description: "Redis instance lost a slave\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # More than 2 changes of the connected-replica count within 5 minutes.
      - alert: RedisClusterFlapping
        expr: changes(redis_connected_slaves[5m]) > 2
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Redis cluster flapping (instance {{ $labels.instance }})"
          description: "Changes have been detected in Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping).\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # Last RDB save is older than 24 hours (60 * 60 * 24 seconds).
      - alert: RedisMissingBackup
        expr: time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 24
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Redis missing backup (instance {{ $labels.instance }})"
          description: "Redis has not been backuped for 24 hours\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # Used memory exceeds 90% of total system memory.
      - alert: RedisOutOfMemory
        expr: redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Redis out of memory (instance {{ $labels.instance }})"
          description: "Redis is running out of memory (> 90%)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # More than 100 connected clients.
      - alert: RedisTooManyConnections
        expr: redis_connected_clients > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Redis too many connections (instance {{ $labels.instance }})"
          description: "Redis instance has too many connections\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # Fewer than 5 connected clients.
      - alert: RedisNotEnoughConnections
        expr: redis_connected_clients < 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Redis not enough connections (instance {{ $labels.instance }})"
          description: "Redis instance should have more connections (> 5)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # Rejected-connection counter increased during the last minute.
      - alert: RedisRejectedConnections
        expr: increase(redis_rejected_connections_total[1m]) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Redis rejected connections (instance {{ $labels.instance }})"
          description: "Some connections to Redis has been rejected\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # --- Traefik ---
      # count() counts series regardless of their value, so it is never 0 for
      # a backend that has series. sum() adds the gauge values instead and is
      # 0 only when every server of the backend reports 0.
      # NOTE(review): assumes traefik_backend_server_up is a 0/1 gauge per
      # server - confirm against the Traefik version in use.
      - alert: TraefikBackendDown
        expr: sum(traefik_backend_server_up) by (backend) == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Traefik backend down (instance {{ $labels.instance }})"
          description: "All Traefik backends are down\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # Share of requests with a 4xx code per backend, over 3 minutes, above 5%.
      - alert: TraefikHighHttp4xxErrorRateBackend
        expr: sum(rate(traefik_backend_requests_total{code=~"4.*"}[3m])) by (backend) / sum(rate(traefik_backend_requests_total[3m])) by (backend) * 100 > 5
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Traefik high HTTP 4xx error rate backend (instance {{ $labels.instance }})"
          description: "Traefik backend 4xx error rate is above 5%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # Share of requests with a 5xx code per backend, over 3 minutes, above 5%.
      - alert: TraefikHighHttp5xxErrorRateBackend
        expr: sum(rate(traefik_backend_requests_total{code=~"5.*"}[3m])) by (backend) / sum(rate(traefik_backend_requests_total[3m])) by (backend) * 100 > 5
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Traefik high HTTP 5xx error rate backend (instance {{ $labels.instance }})"
          description: "Traefik backend 5xx error rate is above 5%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # --- etcd ---
      # Fires when the number of etcd_server_id series is even.
      - alert: EtcdInsufficientMembers
        expr: count(etcd_server_id) % 2 == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Etcd insufficient Members (instance {{ $labels.instance }})"
          description: "Etcd cluster should have an odd number of members\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # etcd_server_has_leader has been 0 for 5 minutes.
      - alert: EtcdNoLeader
        expr: etcd_server_has_leader == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Etcd no Leader (instance {{ $labels.instance }})"
          description: "Etcd cluster have no leader\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # More than 3 leader changes seen within the last hour.
      - alert: EtcdHighNumberOfLeaderChanges
        expr: increase(etcd_server_leader_changes_seen_total[1h]) > 3
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Etcd high number of leader changes (instance {{ $labels.instance }})"
          description: "Etcd leader changed more than 3 times during last hour\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # Ratio of non-OK gRPC responses per service and method above 1%.
      - alert: EtcdHighNumberOfFailedGrpcRequests
        expr: sum(rate(grpc_server_handled_total{grpc_code!="OK"}[5m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[5m])) BY (grpc_service, grpc_method) > 0.01
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Etcd high number of failed GRPC requests (instance {{ $labels.instance }})"
          description: "More than 1% GRPC request failure detected in Etcd for 5 minutes\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # Same alert name at critical severity: ratio above 5%.
      - alert: EtcdHighNumberOfFailedGrpcRequests
        expr: sum(rate(grpc_server_handled_total{grpc_code!="OK"}[5m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[5m])) BY (grpc_service, grpc_method) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Etcd high number of failed GRPC requests (instance {{ $labels.instance }})"
          description: "More than 5% GRPC request failure detected in Etcd for 5 minutes\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # 99th percentile of unary gRPC handling time above 0.15 s.
      - alert: EtcdGrpcRequestsSlow
        expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le)) > 0.15
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Etcd GRPC requests slow (instance {{ $labels.instance }})"
          description: "GRPC requests slowing down, 99th percentil is over 0.15s for 5 minutes\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # Ratio of failed to received HTTP requests per method above 1%.
      - alert: EtcdHighNumberOfFailedHttpRequests
        expr: sum(rate(etcd_http_failed_total[5m])) BY (method) / sum(rate(etcd_http_received_total[5m])) BY (method) > 0.01
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Etcd high number of failed HTTP requests (instance {{ $labels.instance }})"
          description: "More than 1% HTTP failure detected in Etcd for 5 minutes\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # Same alert name at critical severity: ratio above 5%.
      - alert: EtcdHighNumberOfFailedHttpRequests
        expr: sum(rate(etcd_http_failed_total[5m])) BY (method) / sum(rate(etcd_http_received_total[5m])) BY (method) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Etcd high number of failed HTTP requests (instance {{ $labels.instance }})"
          description: "More than 5% HTTP failure detected in Etcd for 5 minutes\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # 99th percentile of successful HTTP request duration above 0.15 s.
      - alert: EtcdHttpRequestsSlow
        expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Etcd HTTP requests slow (instance {{ $labels.instance }})"
          description: "HTTP requests slowing down, 99th percentil is over 0.15s for 5 minutes\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # 99th percentile of peer round-trip time above 0.15 s.
      - alert: EtcdMemberCommunicationSlow
        expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) > 0.15
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Etcd member communication slow (instance {{ $labels.instance }})"
          description: "Etcd member communication slowing down, 99th percentil is over 0.15s for 5 minutes\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # More than 5 failed proposals within the last hour.
      - alert: EtcdHighNumberOfFailedProposals
        expr: increase(etcd_server_proposals_failed_total[1h]) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Etcd high number of failed proposals (instance {{ $labels.instance }})"
          description: "Etcd server got more than 5 failed proposals past hour\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # 99th percentile of WAL fsync duration above 0.5 s.
      - alert: EtcdHighFsyncDurations
        expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Etcd high fsync durations (instance {{ $labels.instance }})"
          description: "Etcd WAL fsync duration increasing, 99th percentil is over 0.5s for 5 minutes\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # 99th percentile of backend commit duration above 0.25 s.
      - alert: EtcdHighCommitDurations
        expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Etcd high commit durations (instance {{ $labels.instance }})"
          description: "Etcd commit duration increasing, 99th percentil is over 0.25s for 5 minutes\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
      # --- OpenEBS ---
      # Pool usage above 80%. The VALUE/LABELS trailer is emitted once; the
      # previous text repeated it twice in the same description.
      - alert: OpenebsUsedPoolCapacity
        expr: (openebs_used_pool_capacity_percent) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "OpenEBS used pool capacity (instance {{ $labels.instance }})"
          description: "OpenEBS Pool use more than 80% of his capacity\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
    
  • 相关阅读:
    解决在Linux下安装Oracle时的中文乱码问题
    oracle linux 安装过程错误 :Error in invoking target ‘agent nmhs’ of makefile
    mysql 远程连接速度慢的解决方案
    nginx : TCP代理和负载均衡的stream模块
    Xshell同时向多个会话发送指令的方法
    迪杰斯特拉算法
    全周期课程体系
    亲手安装RabbitMq 3.7.2 并安装Trace插件
    山东省枣庄市台儿庄区云平台运维故障处理一例
    开始学习算法
  • 原文地址:https://www.cnblogs.com/sanduzxcvbnm/p/13590513.html
Copyright © 2020-2023  润新知