• Skywalking支持的告警指标


    网上看了很多,发现对于Skywalking支持哪些指标名称(metrics),官方文档和博客几乎都只是指明了一个路径,没有人详细解释到底支持哪些指标、这些指标各自有什么作用,导致大家自定义告警指标的时候有很多困难。

    所以这里给大家总结一下,如有错误,请及时指正:

    Skywalking的指标定义(OAL脚本)存放在:/apache-skywalking-apm-bin-es78/config/oal/ 目录下的 *.oal 文件中

    先来看第一个oal文件:

    core.oal

     1 // All scope metrics
     2 all_percentile = from(All.latency).percentile(10);  // Multiple values including p50, p75, p90, p95, p99
     3 all_heatmap = from(All.latency).histogram(100, 20); // 所有请求的响应时间热力图(步长100ms,共20个区间)
     4 
     5 // Service scope metrics 服务
     6 service_resp_time = from(Service.latency).longAvg(); // 服务的平均响应时间
     7 service_sla = from(Service.*).percent(status == true); // 服务的请求成功率
     8 service_cpm = from(Service.*).cpm(); //服务的每分钟调用次数
     9 service_percentile = from(Service.latency).percentile(10); // Multiple values including p50, p75, p90, p95, p99
    10 service_apdex = from(Service.latency).apdex(name, status); // 服务的应用性能指数(Apdex),衡量的是满意的响应时间与不满意的响应时间的比率,默认的请求满意时间阈值是500ms
    11 
    12 // Service relation scope metrics for topology 服务与服务间调用的调用度量指标
    13 service_relation_client_cpm = from(ServiceRelation.*).filter(detectPoint == DetectPoint.CLIENT).cpm();//在客户端检测到的每分钟调用次数
    14 service_relation_server_cpm = from(ServiceRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();//在服务端检测到的每分钟调用的次数
    15 service_relation_client_call_sla = from(ServiceRelation.*).filter(detectPoint == DetectPoint.CLIENT).percent(status == true);//在客户端检测到成功率
    16 service_relation_server_call_sla = from(ServiceRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);//在服务端检测到的成功率
    17 service_relation_client_resp_time = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).longAvg();//在客户端检测到的平均响应时间
    18 service_relation_server_resp_time = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.SERVER).longAvg();//在服务端检测到的平均响应时间
    19 service_relation_client_percentile = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).percentile(10); // Multiple values including p50, p75, p90, p95, p99
    20 service_relation_server_percentile = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99
    21 
    22 // Service Instance relation scope metrics for topology 服务实例与服务实例之间的调用度量指标
    23 service_instance_relation_client_cpm = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.CLIENT).cpm();//在客户端实例检测到的每分钟调用次数
    24 service_instance_relation_server_cpm = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();//在服务端实例检测到的每分钟调用次数
    25 service_instance_relation_client_call_sla = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.CLIENT).percent(status == true);//在客户端实例检测到的成功率
    26 service_instance_relation_server_call_sla = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);//在服务端实例检测到的成功率
    27 service_instance_relation_client_resp_time = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).longAvg();//在客户端实例检测到的平均响应时间
    28 service_instance_relation_server_resp_time = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.SERVER).longAvg();//在服务端实例检测到的平均响应时间
    29 service_instance_relation_client_percentile = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).percentile(10); // Multiple values including p50, p75, p90, p95, p99
    30 service_instance_relation_server_percentile = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99
    31 
    32 // Service Instance Scope metrics
    33 service_instance_sla = from(ServiceInstance.*).percent(status == true);//服务实例的成功率
    34 service_instance_resp_time= from(ServiceInstance.latency).longAvg();//服务实例的平均响应时间
    35 service_instance_cpm = from(ServiceInstance.*).cpm();//服务实例的每分钟调用次数
    36 
    37 // Endpoint scope metrics
    38 endpoint_cpm = from(Endpoint.*).cpm();//端点的每分钟调用次数
    39 endpoint_avg = from(Endpoint.latency).longAvg();//端点的平均响应时间
    40 endpoint_sla = from(Endpoint.*).percent(status == true);//端点的成功率
    41 endpoint_percentile = from(Endpoint.latency).percentile(10); // Multiple values including p50, p75, p90, p95, p99
    42 
    43 // Endpoint relation scope metrics
    44 endpoint_relation_cpm = from(EndpointRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();//在服务端端点检测到的每分钟调用次数
    45 endpoint_relation_resp_time = from(EndpointRelation.rpcLatency).filter(detectPoint == DetectPoint.SERVER).longAvg();//在服务端检测到的rpc调用的平均耗时
    46 endpoint_relation_sla = from(EndpointRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);//在服务端检测到的请求成功率
    47 endpoint_relation_percentile = from(EndpointRelation.rpcLatency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99
    48 
    49 database_access_resp_time = from(DatabaseAccess.latency).longAvg();//数据库的处理平均响应时间
    50 database_access_sla = from(DatabaseAccess.*).percent(status == true);//数据库的请求成功率
    51 database_access_cpm = from(DatabaseAccess.*).cpm();//数据库的每分钟调用次数
    52 database_access_percentile = from(DatabaseAccess.latency).percentile(10);

    java-agent.oal

    // JVM instance metrics
    instance_jvm_cpu = from(ServiceInstanceJVMCPU.usePercent).doubleAvg();//jvm 的平均cpu使用率(百分比)
    instance_jvm_memory_heap = from(ServiceInstanceJVMMemory.used).filter(heapStatus == true).longAvg();//jvm 堆空间的平均使用空间
    instance_jvm_memory_noheap = from(ServiceInstanceJVMMemory.used).filter(heapStatus == false).longAvg();//jvm 非堆空间的平均使用空间
    instance_jvm_memory_heap_max = from(ServiceInstanceJVMMemory.max).filter(heapStatus == true).longAvg();//jvm 最大堆内存的平均值
    instance_jvm_memory_noheap_max = from(ServiceInstanceJVMMemory.max).filter(heapStatus == false).longAvg();//jvm 最大非堆内存的平均值
    instance_jvm_young_gc_time = from(ServiceInstanceJVMGC.time).filter(phrase == GCPhrase.NEW).sum();//年轻代gc的耗时
    instance_jvm_old_gc_time = from(ServiceInstanceJVMGC.time).filter(phrase == GCPhrase.OLD).sum();//老年代gc的耗时
    instance_jvm_young_gc_count = from(ServiceInstanceJVMGC.count).filter(phrase == GCPhrase.NEW).sum();//年轻代gc的次数
    instance_jvm_old_gc_count = from(ServiceInstanceJVMGC.count).filter(phrase == GCPhrase.OLD).sum();//老年代gc的次数
    instance_jvm_thread_live_count = from(ServiceInstanceJVMThread.liveCount).longAvg();//存活的线程数
    instance_jvm_thread_daemon_count = from(ServiceInstanceJVMThread.daemonCount).longAvg();//守护线程数
    instance_jvm_thread_peak_count = from(ServiceInstanceJVMThread.peakCount).longAvg();//峰值线程数
    

      

    告警的设置

    rules:
      # 告警规则名称,必须唯一,且必须以 _rule 结尾
      service_resp_time_rule:
        # 度量名称(即oal脚本中定义的指标名),只支持 int、long、double 类型
        metrics-name: service_resp_time
        # 操作符
        op: ">"
        # 阈值 ms
        threshold: 1000
        # 评估度量的时间长度
        period: 10
        # 度量有多少次符合告警条件后,才会触发告警
        count: 2
        # 静默时间(分钟):告警触发后,在这段时间内同一告警不会重复触发;默认与 period 相同
        silence-period: 10
        message: 服务【{name}】的平均响应时间在最近10分钟内有2分钟超过1秒
      service_sla_rule:
        metrics-name: service_sla
        op: "<"
        threshold: 8000
        period: 10
        count: 2
        silence-period: 10
        message: 服务【{name}】的成功率在最近10分钟内有2分钟低于80%
    composite-rules:
      # 规则名称:在告警信息中显示的唯一名称,必须以_rule结尾
      comp_rule:
        # 指定如何组成规则,支持&&, ||, ()操作符
        expression: service_resp_time_rule && service_sla_rule
        message: 服务【{name}】在最近10分钟内有2分钟平均响应时间超过1秒并且成功率低于80%
  • 相关阅读:
    *VC编程规范
    C++的va_start() va_end()函数应用(转)
    * C++类的分解,抽象类与纯虚函数的需要性
    *C++中的回调
    *C++中使用接口
    C++模版使用
    *获取mac地址的方法
    *数字——字符之间的转换(转)
    eclipse雕虫小技一:eclipse打开文件目录
    Hibernate升级后注解方式的对象关系映射
  • 原文地址:https://www.cnblogs.com/xwzp/p/15577122.html
Copyright © 2020-2023  润新知