• Prometheus Alertmanager 钉钉告警


    一、安装prometheus-webhook-dingtalk

    1.1 安装go环境

    # Install the Go toolchain and print its environment (includes GOPATH).
    yum -y install go
    go env
    # Create the source directory under GOPATH (this guide uses /root/go).
    # mkdir -p builds the whole path, so it also works when /root/go does not
    # exist yet, which is the case right after a fresh Go install.
    mkdir -p /root/go/src/github.com/timonwong/
    cd /root/go

    1.2 安装钉钉插件

    # Enter the GOPATH source directory and fetch the plugin sources; the
    # checkout lands in ./prometheus-webhook-dingtalk.
    cd /root/go/src/github.com/timonwong/
    git clone https://github.com/timonwong/prometheus-webhook-dingtalk.git prometheus-webhook-dingtalk
     

    1.3 安装nodejs

    # Download the Node.js v12.16.1 binary distribution and unpack it to
    # /usr/local/nodejs.
    wget https://nodejs.org/dist/v12.16.1/node-v12.16.1-linux-x64.tar.xz
    tar xf node-v12.16.1-linux-x64.tar.xz -C /usr/local/
    cd /usr/local
    mv node-v12.16.1-linux-x64 nodejs
    # Write the profile snippet non-interactively instead of editing it by
    # hand. Single quotes keep $PATH and $NODE_HOME unexpanded until the
    # snippet is sourced.
    echo 'export NODE_HOME=/usr/local/nodejs' > /etc/profile.d/nodejs.sh
    echo 'export PATH=$PATH:$NODE_HOME/bin' >> /etc/profile.d/nodejs.sh
    # Reload the profile so the new PATH takes effect in the current shell.
    source /etc/profile

    1.4 安装yarn环境

    # Install yarn globally (-g) via npm.
    npm install -g yarn

    1.5 编译

    # git clone creates the checkout directory without the ".git" suffix, so
    # the build must run in prometheus-webhook-dingtalk, not
    # prometheus-webhook-dingtalk.git.
    cd /root/go/src/github.com/timonwong/prometheus-webhook-dingtalk
    make build

    1.6 启动配置

    # -p also creates /data when it is missing and does not fail on re-runs.
    mkdir -p /data/dingtalk
    # config.example.yml is a relative path: run this from the plugin's
    # repository directory (where the build step was executed).
    cp config.example.yml /data/dingtalk/config.yml
    # Link the built binary into /usr/local/bin so it can be started by name.
    ln -s /root/go/src/github.com/timonwong/prometheus-webhook-dingtalk/prometheus-webhook-dingtalk /usr/local/bin/

    config.yml

    请将下面的 access_token 和 secret 换成自己钉钉机器人的配置;其中 secret 是钉钉机器人安全设置中"加签"生成的密钥。

    ## Request timeout
    # timeout: 5s

    ## Uncomment following line in order to write template from scratch (be careful!)
    # no_builtin_template: true

    ## Customizable templates path
    # templates:
    #   - contrib/templates/legacy/template.tmpl
    templates:
      - /data/dingtalk/template.tmpl
    ## You can also override default template using `default_message`
    ## The following example to use the 'legacy' template from v0.3.0
    # default_message:
    #   title: '{{ template "legacy.title" . }}'
    #   text: '{{ template "legacy.content" . }}'

    ## Targets, previously was known as "profiles"
    targets:
      # The target name ("webhook") is the path segment used in the
      # Alertmanager webhook URL: /dingtalk/webhook/send.
      webhook:
        # Replace <YOUR_ACCESS_TOKEN> with the token from your own DingTalk
        # robot's webhook URL. Never publish a real token.
        url: 'https://oapi.dingtalk.com/robot/send?access_token=<YOUR_ACCESS_TOKEN>'
        # secret for signature: the robot's signing secret. Never publish a
        # real secret.
        secret: '<YOUR_SIGNING_SECRET>'
        message:
          title: '{{ template "ding.link.title" . }}'
          text: '{{ template "ding.link.content" . }}'

    template.tmpl

    {{/* __subject: "[STATUS]" (with ":<firing count>" appended while firing), then the group label values, then the remaining common label values in parentheses. */}}
    {{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }}
    {{/* __alertmanagerURL: external URL of the alerts page, filtered by this receiver. */}}
    {{ define "__alertmanagerURL" }}{{ .ExternalURL }}/#/alerts?receiver={{ .Receiver }}{{ end }}
    
    {{/* __text_alert_list: for each alert, lists every label, every annotation and the generator URL. */}}
    {{ define "__text_alert_list" }}{{ range . }}
    **Labels**
    {{ range .Labels.SortedPairs }} - {{ .Name }}: {{ .Value | markdown | html }}
    {{ end }}
    **Annotations**
    {{ range .Annotations.SortedPairs }} - {{ .Name }}: {{ .Value | markdown | html }}
    {{ end }}
    **Source:** [{{ .GeneratorURL }}]({{ .GeneratorURL }})
    {{ end }}{{ end }}
    
    {{/* default.__text_alert_list: for each alert, prints severity, team, start time (Asia/Shanghai), every annotation, and every label except severity, summary and team. */}}
    {{ define "default.__text_alert_list" }}{{ range . }}
    ---
    **告警级别:** {{ .Labels.severity | upper }}
    
    **运营团队:** {{ .Labels.team | upper }}
    
    **触发时间:** {{ dateInZone "2006.01.02 15:04:05" (.StartsAt) "Asia/Shanghai" }}
    
    **事件信息:** 
    {{ range .Annotations.SortedPairs }} - {{ .Name }}: {{ .Value | markdown | html }}
    
    
    {{ end }}
    
    **事件标签:**
    {{ range .Labels.SortedPairs }}{{ if and (ne (.Name) "severity") (ne (.Name) "summary") (ne (.Name) "team") }} - {{ .Name }}: {{ .Value | markdown | html }}
    {{ end }}{{ end }}
    {{ end }}
    {{ end }}
    {{/* default.__text_alertresovle_list: same layout as default.__text_alert_list, plus the end time (Asia/Shanghai) of each alert. */}}
    {{ define "default.__text_alertresovle_list" }}{{ range . }}
    ---
    **告警级别:** {{ .Labels.severity | upper }}
    
    **运营团队:** {{ .Labels.team | upper }}
    
    **触发时间:** {{ dateInZone "2006.01.02 15:04:05" (.StartsAt) "Asia/Shanghai" }}
    
    **结束时间:** {{ dateInZone "2006.01.02 15:04:05" (.EndsAt) "Asia/Shanghai" }}
    
    **事件信息:**
    {{ range .Annotations.SortedPairs }} - {{ .Name }}: {{ .Value | markdown | html }}
    
    
    {{ end }}
    
    **事件标签:**
    {{ range .Labels.SortedPairs }}{{ if and (ne (.Name) "severity") (ne (.Name) "summary") (ne (.Name) "team") }} - {{ .Name }}: {{ .Value | markdown | html }}
    {{ end }}{{ end }}
    {{ end }}
    {{ end }}
    
    {{/* Default: title and content. The content renders the firing list and the resolved list, each only when it is non-empty. */}}
    {{ define "default.title" }}{{ template "__subject" . }}{{ end }}
    {{ define "default.content" }}#### [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] **[{{ index .GroupLabels "alertname" }}]({{ template "__alertmanagerURL" . }})**
    {{ if gt (len .Alerts.Firing) 0 -}}
    
    {{ template "default.__text_alert_list" .Alerts.Firing }}
    
    
    {{- end }}
    
    {{ if gt (len .Alerts.Resolved) 0 -}}
    {{ template "default.__text_alertresovle_list" .Alerts.Resolved }}
    
    
    {{- end }}
    {{- end }}
    
    {{/* Legacy: title and content. The content lists firing alerts only. */}}
    {{ define "legacy.title" }}{{ template "__subject" . }}{{ end }}
    {{ define "legacy.content" }}#### [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] **[{{ index .GroupLabels "alertname" }}]({{ template "__alertmanagerURL" . }})**
    {{ template "__text_alert_list" .Alerts.Firing }}
    {{- end }}
    
    {{/* Following names for compatibility: ding.link.* delegate to the default.* templates. */}}
    {{ define "ding.link.title" }}{{ template "default.title" . }}{{ end }}
    {{ define "ding.link.content" }}{{ template "default.content" . }}{{ end }}

    编写启动文件

    start.sh

    #!/bin/bash
    # Start prometheus-webhook-dingtalk in the background from the directory
    # that contains this script, so ./config.yml resolves next to start.sh.

    # Quote $0 so paths with spaces work, and stop if the directory cannot be
    # resolved rather than launching from an unrelated working directory.
    basedir=$(cd "$(dirname "$0")" && pwd) || exit 1
    cd "$basedir" || exit 1
    # nohup keeps the process alive after the shell exits.
    nohup /usr/local/bin/prometheus-webhook-dingtalk --config.file=./config.yml &
    exit 0

     

    二、Alertmanager设置

    alertmanager.yml

    global:
      resolve_timeout: 5m

    route:
      group_by: ['alertname']
      group_wait: 10s
      group_interval: 10s
      repeat_interval: 1h
      receiver: 'web.hook'
    receivers:
      - name: 'web.hook'
        webhook_configs:
          # The "webhook" path segment is the target name defined under
          # `targets:` in the plugin's config.yml.
          - url: 'http://localhost:8060/dingtalk/webhook/send'
            # Resolved notifications are not forwarded. Set this to true if
            # the resolved section of template.tmpl should ever be rendered.
            send_resolved: false
    inhibit_rules:
      # While ApplicationDown (critical) is firing, mute warning alerts that
      # share the same target, job and instance.
      - source_match:
          alertname: 'ApplicationDown'
          severity: 'critical'
        target_match:
          severity: 'warning'
        # 'alertname' must not be listed here. The source is pinned to
        # ApplicationDown, so requiring an equal alertname would restrict
        # targets to ApplicationDown alerts with severity warning, which do
        # not exist, and the rule would never inhibit anything.
        equal: ['target', 'job', 'instance']

    测试

    # Post a JSON test payload to the plugin's "webhook" target endpoint.
    curl http://localhost:8060/dingtalk/webhook/send   -H 'Content-Type: application/json'    -d '{"msgtype": "text","text": {"content": "监控告警"}}'
    # Response shown by the article for this request:
    OK

    三、Prometheus设置

    3.1 配置文件新增告警内容

    # Deliver alerts to the Alertmanager at 127.0.0.1:9093.
    alerting:
      alertmanagers:
        - static_configs:
            - targets:
                - '127.0.0.1:9093'
    # Load every rule file matching rules/*.yml.
    rule_files:
      - 'rules/*.yml'

    3.2 新增rules

    cat rules/memory.yml

    groups:
      - name: memory
        rules:
          # Fires when a scrape target has been down (up == 0) for 30s.
          - alert: ApplicationDown
            expr: up==0
            for: 30s
            labels:
              severity: critical
              target: "{{$labels.job}}"
            annotations:
              summary: "应用服务{{$labels.job}}已停止"
              description: "{{$labels.instance}} of job {{$labels.job}} 已经停止."

          # Heap usage above 80% for 1m -> warning.
          - alert: Heap_Memory_Usage_Warning
            expr: (jvm_memory_bytes_used{area="heap"}) * 100 /( jvm_memory_bytes_max{ area="heap"} )> 80
            for: 1m
            labels:
              severity: warning
              target: "{{$labels.job}}"
            annotations:
              summary: "应用 {{$labels.job}} Heap_Memory_Usage > 80%"
              description: "{{$labels.instance}} of job {{$labels.job}} has been in status [Heap_Usage > 80%] for more than 1m.Current usage ({{humanize $value}}%)"

          # Heap usage above 80% sustained for 5m -> critical.
          - alert: Heap_Memory_Usage_Critical
            expr: (jvm_memory_bytes_used{area="heap"}) * 100 /( jvm_memory_bytes_max{ area="heap"} )> 80
            for: 5m
            labels:
              severity: critical
              target: "{{$labels.job}}"
            annotations:
              summary: "应用 {{$labels.job}} Heap_Memory_Usage > 80%"
              description: "{{$labels.instance}} of job {{$labels.job}} has been in status [Heap_Usage > 80%] for more than 5m.Current usage ({{humanize $value}}%)"

          # Heap usage above 90% for 1m -> critical (same alert name as the
          # rule above, with a higher threshold and a shorter hold time).
          - alert: Heap_Memory_Usage_Critical
            expr: (jvm_memory_bytes_used{area="heap"}) * 100 /( jvm_memory_bytes_max{ area="heap"} )> 90
            for: 1m
            labels:
              severity: critical
              target: "{{$labels.job}}"
            annotations:
              summary: "应用 {{$labels.job}} Heap_Memory_Usage > 90%"
              description: "{{$labels.instance}} of job {{$labels.job}} has been in status [Heap_Usage > 90%] for more than 1m.Current usage ({{humanize $value}}%)"

    效果:

       

  • 相关阅读:
    windows10安装pycharm,以及pycharm教程和破解码
    windows 10安装python3和python2
    Git之仓库管理
    Python操作 Excel表格
    ansible 基础操作
    Flask-Migrate
    flask-script
    flask-sqlalchemy
    基于数字证书的二次登录认证流程
    摘录:识别系统原理(转)
  • 原文地址:https://www.cnblogs.com/bigberg/p/13673033.html
Copyright © 2020-2023  润新知