# Docker Compose stack for SkyWalking: Elasticsearch (storage), OAP server
# (collector/analysis backend) and the web UI.
version: '3.3'

services:
  # Single-node Elasticsearch used by the OAP server as its storage backend.
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:7.5.0
    container_name: elasticsearch
    restart: always
    ports:
      - "9200:9200"
    environment:
      - discovery.type=single-node
      - bootstrap.memory_lock=true
      - "ES_JAVA_OPTS=-Xms256m -Xmx256m"
    ulimits:
      # Unlimited memlock goes together with bootstrap.memory_lock=true above.
      memlock:
        soft: -1
        hard: -1

  # SkyWalking OAP server; 11800 and 12800 are published for agents and the UI
  # (the agent config in this guide reports to :11800, the UI reads :12800).
  oap:
    image: apache/skywalking-oap-server:8.8.0
    container_name: oap
    depends_on:
      - elasticsearch
    links:
      - elasticsearch
    restart: always
    ports:
      - "11800:11800"
      - "12800:12800"
    environment:
      SW_STORAGE: elasticsearch
      SW_STORAGE_ES_CLUSTER_NODES: elasticsearch:9200

  # SkyWalking web UI. The tag is pinned to the same release as the OAP server;
  # an untagged image resolves to `latest`, which can drift away from the
  # pinned OAP version.
  ui:
    image: apache/skywalking-ui:8.8.0
    container_name: ui
    depends_on:
      - oap
    links:
      - oap
    restart: always
    ports:
      - "8080:8080"
    environment:
      SW_OAP_ADDRESS: http://oap:12800
2添加依赖
nuget 包 SkyAPM.Agent.AspNetCore
3编辑Skywalking配置文件skyapm.json
{ "SkyWalking": { "ServiceName": "Cnsns.SiteStarV6", "Namespace": "", "HeaderVersions": [ "sw8" ], "Sampling": { "SamplePer3Secs": -1, "Percentage": -1.0 }, "Logging": { "Level": "Information", "FilePath": "logs\\skyapm-{Date}.log" }, "Transport": { "Interval": 3000, "ProtocolVersion": "v8", "QueueSize": 30000, "BatchSize": 3000, "gRPC": { "Servers": "111.111.13.11:11800", "Timeout": 10000, "ConnectTimeout": 10000, "ReportTimeout": 600000, "Authentication": "" } } } }
配置文件生成如下
1、安装CLI(SkyAPM.DotNet.CLI):
dotnet tool install -g SkyAPM.DotNet.CLI
2、自动生成skyapm.json文件:
dotnet skyapm config [service name] [server]:11800
例如:dotnet skyapm config MySkyWalking_OrderService 111.111.13.11:11800
其中 service name 指的就是您配置的 SKYWALKING__SERVICENAME,server 指的是您 SkyWalking 服务端的 IP 地址。
4 在launchSettings.json文件中配置SkyWalking环境变量
"profiles": { // 项目 "IIS Express": { // IIS部署项 "commandName": "IISExpress", "launchBrowser": true, "launchUrl": "weatherforecast", "environmentVariables": { "ASPNETCORE_ENVIRONMENT": "Development", "ASPNETCORE_HOSTINGSTARTUPASSEMBLIES": "SkyAPM.Agent.AspNetCore", "SKYWALKING__SERVICENAME": "MySkyWalkingDemoTest" } }, "SkyWalkingDemo": { // castrol部署项 "commandName": "Project", "launchBrowser": true, "launchUrl": "weatherforecast", "applicationUrl": "http://localhost:5000", "environmentVariables": { "ASPNETCORE_ENVIRONMENT": "Development", "ASPNETCORE_HOSTINGSTARTUPASSEMBLIES": "SkyAPM.Agent.AspNetCore", // 必须配置 "SKYWALKING__SERVICENAME": "MySkyWalkingDemoTest" // 必须配置,在skywalking做标识 } } }
5 在Program.cs 注册
#region 注册Skywalking
// Register the SkyWalking-related services/configuration on the DI container.
builder.Services.AddSkyApmExtensions();
#endregion
6 调用获取链路追踪的Id
// Accessor for the entry segment context; supplied through constructor injection.
private readonly IEntrySegmentContextAccessor _segContext;

/// <summary>
/// Creates the controller and stores the injected entry segment context accessor.
/// </summary>
/// <param name="segContext">Entry segment context accessor.</param>
public SkywalkingController(IEntrySegmentContextAccessor segContext)
{
    _segContext = segContext;
}

/// <summary>
/// Gets the trace ID of the current entry segment context.
/// </summary>
/// <returns>A content result whose body is the trace ID.</returns>
public IActionResult GetSkywalkingTraceId()
{
    // Reads the same field the constructor assigns; the previous code referenced
    // an undeclared `_segContextAccessor` and could not compile.
    return Content(_segContext.Context.TraceId.ToString());
}
7自定义链路追踪
/// <summary>
/// Demonstrates custom tracing: writes two log events to the current span,
/// one before and one after a one-second pause.
/// </summary>
/// <returns>An OK result whose body contains the current trace ID.</returns>
[HttpGet]
public async Task<IActionResult> SkywalkingTest()
{
    // Trace ID of the current entry segment context.
    var traceId = _segContext.Context.TraceId;
    Console.WriteLine($"TraceId={traceId}");

    _segContext.Context.Span.AddLog(LogEvent.Message($"SkywalkingTest---Worker running at: {DateTime.Now}"));

    // Awaited delay instead of Thread.Sleep: the method is declared async, so the
    // pause should not block a request thread (and the method now actually awaits).
    await Task.Delay(1000);

    _segContext.Context.Span.AddLog(LogEvent.Message($"SkywalkingTest---Worker running at--end: {DateTime.Now}"));

    return Ok($"Ok,SkywalkingTest-TraceId={traceId} ");
}
网关和服务之间调用,同上配置。
有时候我们需要发通知(比如哪个服务实例出问题了)
8 配置告警规则
进入容器
docker exec -it 12f053748e85 /bin/sh
通过cat alarm-settings.yml可以查阅文件内容,如下:
docker cp 12f053748e85:/skywalking/config/alarm-settings.yml .
# Sample alarm rules.
rules:
  # Rule unique name, must be ended with `_rule`.
  service_resp_time_rule:
    metrics-name: service_resp_time
    op: ">"
    threshold: 1000
    period: 10
    count: 3
    silence-period: 5
    message: Response time of service {name} is more than 1000ms in 3 minutes of last 10 minutes.
  service_sla_rule:
    # Metrics value need to be long, double or int
    metrics-name: service_sla
    op: "<"
    threshold: 8000
    # The length of time to evaluate the metrics
    period: 10
    # How many times after the metrics match the condition, will trigger alarm
    count: 2
    # How many times of checks, the alarm keeps silence after alarm triggered, default as same as period.
    silence-period: 3
    message: Successful rate of service {name} is lower than 80% in 2 minutes of last 10 minutes
  service_resp_time_percentile_rule:
    # Metrics value need to be long, double or int
    metrics-name: service_percentile
    op: ">"
    threshold: 1000,1000,1000,1000,1000
    period: 10
    count: 3
    silence-period: 5
    message: Percentile response time of service {name} alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 1000, p75 > 1000, p90 > 1000, p95 > 1000, p99 > 1000
  service_instance_resp_time_rule:
    metrics-name: service_instance_resp_time
    op: ">"
    threshold: 1000
    period: 10
    count: 2
    silence-period: 5
    message: Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes
  database_access_resp_time_rule:
    metrics-name: database_access_resp_time
    threshold: 1000
    op: ">"
    period: 10
    count: 2
    message: Response time of database access {name} is more than 1000ms in 2 minutes of last 10 minutes
  endpoint_relation_resp_time_rule:
    metrics-name: endpoint_relation_resp_time
    threshold: 1000
    op: ">"
    period: 10
    count: 2
    message: Response time of endpoint relation {name} is more than 1000ms in 2 minutes of last 10 minutes
#  Active endpoint related metrics alarm will cost more memory than service and service instance metrics alarm.
#  Because the number of endpoint is much more than service and instance.
#
#  endpoint_avg_rule:
#    metrics-name: endpoint_avg
#    op: ">"
#    threshold: 1000
#    period: 10
#    count: 2
#    silence-period: 5
#    message: Response time of endpoint {name} is more than 1000ms in 2 minutes of last 10 minutes

webhooks:
#  - http://127.0.0.1/notify/
#  - http://127.0.0.1/go-wechat/
规则常用指标解读:
rule name: 规则名称,必须唯一,必须以 **_rule**结尾;
metrics-name: oal(Observability Analysis Language)脚本中的度量名;名称在SkyWalking后端服务中已经定义,进入容器skywalking-oap之后,在 /skywalking/config/oal 目录下的 .oal 脚本中就可以找到。
include-names: 本规则告警生效的实体名称,如服务名,终端名;
exclude-names:将此规则作用于不匹配的实体名称上,如服务名,终端名;
threshold: 阈值,可以是一个数组,即可以配置多个值;
op: 操作符, 可以设定 >, <, =;
period: 多久检查一次当前的指标数据是否符合告警规则;以分钟为单位
count: 超过阈值条件,达到**count**次数,触发告警;
silence period:在同一个周期,指定的**silence period**时间内,忽略相同的告警消息;
更多告警规则详情,请参照这个地址:https://github.com/apache/skywalking/blob/master/docs/en/setup/backend/backend-alarm.md
修改告警规则
rules:
  service_test_sal_rule:
    # Metric to evaluate. It has to name an existing metric; `service_sla` is the
    # success-rate metric used by the stock alarm-settings.yml
    # (the previous value `service_test_sal` was a transposed "sla").
    metrics-name: service_sla
    # Trigger when the value is lower than the threshold.
    op: "<"
    # Threshold value (8000 is described as 80% by the message below).
    threshold: 8000
    # Evaluate this rule over a 2-minute period.
    period: 2
    # Raise the alarm as soon as the condition matched once.
    count: 1
    # Do not repeat an identical alarm within 3 minutes.
    silence-period: 3
    # Alarm text; wording follows period=2 / count=1 above.
    message: Successful rate of service {name} is lower than 80% in the last 2 minutes
概要:服务成功率在过去2分钟内低于80%
告警API编写
这个本质还是SkyWalking根据规则进行检查,如果符合规则条件,就通过**WebHook、gRPCHook、WeChat Hook、Dingtalk Hook**等方式进行消息通知;接收到告警数据信息之后,可以自行处理消息。这里为了方便,就采用**WebHook**的方式进行演示,即触发告警条件之后,SkyWalking会调用配置的WebHook 接口,并传递对应的告警信息;
定义数据模型
/// <summary>
/// One alarm entry of the JSON array that SkyWalking posts to the webhook.
/// Property names are lower camel case so they match the JSON field names.
/// </summary>
public class AlarmMsg
{
    public int scopeId { get; set; }
    public string? scope { get; set; }
    public string? name { get; set; }
    public string? id0 { get; set; }
    public string? id1 { get; set; }
    public string? ruleName { get; set; }
    public string? alarmMessage { get; set; }
    // Alarm time as sent in the webhook payload (epoch milliseconds per the
    // SkyWalking alarm documentation); optional for existing callers.
    public long startTime { get; set; }
}
定义WebHook调用API
/// <summary>
/// Alarm webhook endpoint: receives the alarm list, writes it to the console
/// and forwards it by mail.
/// </summary>
/// <param name="msgs">Alarm entries posted by SkyWalking; may be null or empty.</param>
[HttpPost("AlarmMsg")]
public void AlarmMsg([FromBody] List<AlarmMsg> msgs)
{
    // Nothing to report: avoid a NullReferenceException and an empty mail.
    if (msgs == null || msgs.Count == 0)
    {
        return;
    }

    // One webhook call can carry several alarms; report all of them rather than
    // only the first. With a single alarm the text is identical to before.
    var alarmTexts = msgs
        .Where(m => m != null && !string.IsNullOrEmpty(m.alarmMessage))
        .Select(m => m.alarmMessage);

    string msg = "触发告警:" + string.Join(";", alarmTexts);
    Console.WriteLine(msg);
    SendMail(msg);
}
配置webHook http://192.168.3.105:7900/api/Skywalking/AlarmMsg
# Variant 1: complete alarm-settings.yml, i.e. the sample rules plus the webhook.
# Use either variant 1 or variant 2 (after the `---` below) as the file content,
# never both: repeating the top-level `rules:` / `webhooks:` keys in a single
# document is invalid YAML and most parsers silently keep only the last one.
# Sample alarm rules.
rules:
  # Rule unique name, must be ended with `_rule`.
  service_resp_time_rule:
    metrics-name: service_resp_time
    op: ">"
    threshold: 1000
    period: 10
    count: 3
    silence-period: 5
    message: Response time of service {name} is more than 1000ms in 3 minutes of last 10 minutes.
  service_sla_rule:
    # Metrics value need to be long, double or int
    metrics-name: service_sla
    op: "<"
    threshold: 8000
    # The length of time to evaluate the metrics
    period: 10
    # How many times after the metrics match the condition, will trigger alarm
    count: 2
    # How many times of checks, the alarm keeps silence after alarm triggered, default as same as period.
    silence-period: 3
    message: Successful rate of service {name} is lower than 80% in 2 minutes of last 10 minutes
  service_resp_time_percentile_rule:
    # Metrics value need to be long, double or int
    metrics-name: service_percentile
    op: ">"
    threshold: 1000,1000,1000,1000,1000
    period: 10
    count: 3
    silence-period: 5
    message: Percentile response time of service {name} alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 1000, p75 > 1000, p90 > 1000, p95 > 1000, p99 > 1000
  service_instance_resp_time_rule:
    metrics-name: service_instance_resp_time
    op: ">"
    threshold: 1000
    period: 10
    count: 2
    silence-period: 5
    message: Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes
  database_access_resp_time_rule:
    metrics-name: database_access_resp_time
    threshold: 1000
    op: ">"
    period: 10
    count: 2
    message: Response time of database access {name} is more than 1000ms in 2 minutes of last 10 minutes
  endpoint_relation_resp_time_rule:
    metrics-name: endpoint_relation_resp_time
    threshold: 1000
    op: ">"
    period: 10
    count: 2
    message: Response time of endpoint relation {name} is more than 1000ms in 2 minutes of last 10 minutes
#  Active endpoint related metrics alarm will cost more memory than service and service instance metrics alarm.
#  Because the number of endpoint is much more than service and instance.
#
#  endpoint_avg_rule:
#    metrics-name: endpoint_avg
#    op: ">"
#    threshold: 1000
#    period: 10
#    count: 2
#    silence-period: 5
#    message: Response time of endpoint {name} is more than 1000ms in 2 minutes of last 10 minutes

webhooks:
  - http://192.168.3.105:7900/api/Skywalking/AlarmMsg
#  - http://127.0.0.1/go-wechat/
---
# Variant 2: minimal alarm-settings.yml that keeps only the success-rate rule.
rules:
  # Alarm rule name; must be unique and end with `_rule`.
  service_sla_rule:
    # Metric the rule evaluates.
    metrics-name: service_sla
    # Trigger when the value is lower than the threshold.
    op: "<"
    # Threshold value.
    threshold: 8000
    # Evaluate the rule over a 10-minute period.
    period: 10
    # Raise the alarm after the condition matched 2 times.
    count: 2
    # Do not repeat an identical alarm within 3 minutes.
    silence-period: 3
    # Alarm message text.
    message: Successful rate of service {name} is lower than 80% in 2 minutes of last 10 minutes

webhooks:
  - http://192.168.3.105:7900/api/Skywalking/AlarmMsg
SkyWalking 为什么能做到无侵入?因为通过 ASPNETCORE_HOSTINGSTARTUPASSEMBLIES 指定的 SkyAPM.Agent.AspNetCore 程序集,会在应用自身进行 IoC 注册之前就已经完成注册。