• [Monitoring] Prometheus monitoring installation


    Deployment

    wget https://github.com/prometheus/prometheus/releases/download/v2.28.0/prometheus-2.28.0.linux-amd64.tar.gz

    tar xf prometheus-2.28.0.linux-amd64.tar.gz

    mv prometheus-2.28.0.linux-amd64 /usr/local/prometheus-2.28.0

    vim /usr/local/prometheus-2.28.0/prometheus.yml 

    # my global config
    global:
      scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
      evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
      # scrape_timeout is set to the global default (10s).
    
    # Alertmanager configuration
    alerting:
      alertmanagers:
      - static_configs:
        - targets:
          # - alertmanager:9093
    
    # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
    rule_files:
      # - "first_rules.yml"
      # - "second_rules.yml"
    
    # A scrape configuration containing exactly one endpoint to scrape:
    # Here it's Prometheus itself.
    scrape_configs:
      # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
      - job_name: 'prometheus'
    
        # metrics_path defaults to '/metrics'
        # scheme defaults to 'http'.
    
        static_configs:
        - targets: ['localhost:9090']
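
    Before wiring this config into systemd, the promtool binary shipped in the same tarball can be used to validate it (paths follow the layout above):

    /usr/local/prometheus-2.28.0/promtool check config /usr/local/prometheus-2.28.0/prometheus.yml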
    

    vim /usr/lib/systemd/system/prometheus.service

    [Unit]
    Description=Prometheus Services
    After=network.target remote-fs.target
    
    [Service]
    Type=simple
    ExecStart=/usr/local/prometheus-2.28.0/prometheus --config.file=/usr/local/prometheus-2.28.0/prometheus.yml --storage.tsdb.path=/usr/local/prometheus-2.28.0/
    Restart=on-failure
    RestartSec=5
    
    [Install]
    WantedBy=multi-user.target
    

    systemctl daemon-reload

    systemctl restart prometheus.service
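
    A quick way to confirm Prometheus came up is the systemd status and its health endpoint (returns HTTP 200 with a short message when the server is healthy):

    systemctl status prometheus.service --no-pager

    curl -s http://localhost:9090/-/healthy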

    Monitoring (node_exporter)

    wget https://github.com/prometheus/node_exporter/releases/download/v1.1.2/node_exporter-1.1.2.linux-amd64.tar.gz

    tar xf node_exporter-1.1.2.linux-amd64.tar.gz

    mv node_exporter-1.1.2.linux-amd64 /usr/local/node_exporter

    cat > /usr/lib/systemd/system/node_exporter.service << EOF

    [Unit]
    Description=Prometheus Node Exporter Services
    After=network.target remote-fs.target

    [Service]
    Type=simple
    ExecStart=/usr/local/node_exporter/node_exporter
    Restart=on-failure
    RestartSec=5

    [Install]
    WantedBy=multi-user.target

    EOF

    systemctl daemon-reload

    systemctl start node_exporter
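
    node_exporter now serves metrics on port 9100, but Prometheus only scrapes targets listed in prometheus.yml, and that step is not shown above. A quick local check plus a sketch of the scrape job to add (the target address is a placeholder):

    # Confirm the exporter answers locally.
    curl -s http://localhost:9100/metrics | head

    # Then add a job for it under scrape_configs in /usr/local/prometheus-2.28.0/prometheus.yml, e.g.:
    #   - job_name: 'node_exporter'
    #     static_configs:
    #     - targets: ['192.168.1.10:9100']   # placeholder host
    # and restart Prometheus so the new target is picked up:
    systemctl restart prometheus.service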

    Telegram alerting

    git clone https://github.com/nopp/alertmanager-webhook-telegram-python.git

    yum install -y python3 python3-pip

    cd alertmanager-webhook-telegram-python/

    pip3 install -r requirements.txt

    pip3 install python-dateutil

    vim flaskAlert.py

    import telegram, json, logging
    from time import sleep
    from dateutil import parser
    from flask import Flask
    from flask import request
    from flask_basicauth import BasicAuth
    from telegram.error import NetworkError, RetryAfter, TimedOut
    
    app = Flask(__name__)
    app.secret_key = 'lAlAlA123'
    basic_auth = BasicAuth(app)
    
    # Chat ID of the target group (group chat IDs start with "-"), change it!
    chatID = "" # change this
    
    # Basic-auth credentials protecting the webhook, change them!
    app.config['BASIC_AUTH_FORCE'] = True
    app.config['BASIC_AUTH_USERNAME'] = '' # change this
    app.config['BASIC_AUTH_PASSWORD'] = '' # change this
    
    # Bot token, change it!
    bot = telegram.Bot(token="")  # change this
    
    @app.route('/alert', methods=['POST'])
    def postAlertmanager():
    
        try:
            content = json.loads(request.get_data())
            # Build one Telegram message per alert in the webhook payload.
            for alert in content['alerts']:
                message = "Status: "+alert['status']+"\n"
                if 'name' in alert['labels']:
                    message += "Instance: "+alert['labels']['instance']+"("+alert['labels']['name']+")\n"
                else:
                    message += "Instance: "+alert['labels']['instance']+"\n"
                if 'info' in alert['annotations']:
                    message += "Info: "+alert['annotations']['info']+"\n"
                if 'summary' in alert['annotations']:
                    message += "Summary: "+alert['annotations']['summary']+"\n"
                if 'description' in alert['annotations']:
                    message += "Description: "+alert['annotations']['description']+"\n"
                if alert['status'] == "resolved":
                    correctDate = parser.parse(alert['endsAt']).strftime('%Y-%m-%d %H:%M:%S')
                    message += "Resolved: "+correctDate
                elif alert['status'] == "firing":
                    correctDate = parser.parse(alert['startsAt']).strftime('%Y-%m-%d %H:%M:%S')
                    message += "Started: "+correctDate
                bot.sendMessage(chat_id=chatID, text=message)
            return "Alert OK", 200
        except RetryAfter:
            # Telegram rate limit hit; back off and retry once.
            sleep(30)
            bot.sendMessage(chat_id=chatID, text=message)
            return "Alert OK", 200
        except TimedOut:
            sleep(60)
            bot.sendMessage(chat_id=chatID, text=message)
            return "Alert OK", 200
        except NetworkError:
            sleep(60)
            bot.sendMessage(chat_id=chatID, text=message)
            return "Alert OK", 200
        except Exception as error:
            bot.sendMessage(chat_id=chatID, text="Error: "+str(error))
            app.logger.info("\t%s", error)
            return "Alert fail", 200
    
    if __name__ == '__main__':
        logging.basicConfig(level=logging.INFO)
        app.run(host='0.0.0.0', port=9119)
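
    Before starting the webhook, fill in chatID, the basic-auth username/password, and the bot token above. One common way to find a group's chat ID (assuming the bot has already been added to the group and a message has been sent there) is the Bot API getUpdates call; the token below is a placeholder:

    curl -s "https://api.telegram.org/bot<YOUR_BOT_TOKEN>/getUpdates"
    # look for "chat":{"id":-123456789,...} in the response; group chat IDs are negative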
    

      

    nohup python3 flaskAlert.py &
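
    A quick check that the webhook receiver came up and is listening on the port configured in flaskAlert.py:

    ss -lntp | grep 9119

    tail nohup.out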

    Test

    curl -XPOST --data '{"status":"resolved","groupLabels":{"alertname":"instance_down"},"commonAnnotations":{"description":"i-0d7188fkl90bac100 of job ec2-sp-node_exporter has been down for more than 2 minutes.","summary":"Instance i-0d7188fkl90bac100 down"},"alerts":[{"status":"resolved","labels":{"name":"olokinho01-prod","instance":"i-0d7188fkl90bac100","job":"ec2-sp-node_exporter","alertname":"instance_down","os":"linux","severity":"page"},"endsAt":"2019-07-01T16:16:19.376244942-03:00","generatorURL":"http://pmts.io:9090","startsAt":"2019-07-01T16:02:19.376245319-03:00","annotations":{"description":"i-0d7188fkl90bac100 of job ec2-sp-node_exporter has been down for more than 2 minutes.","summary":"Instance i-0d7188fkl90bac100 down"}}],"version":"4","receiver":"infra-alert","externalURL":"http://alm.io:9093","commonLabels":{"name":"olokinho01-prod","instance":"i-0d7188fkl90bac100","job":"ec2-sp-node_exporter","alertname":"instance_down","os":"linux","severity":"page"}}' http://username:password@flaskAlert:9119/alert
    

      

    Install Alertmanager

    wget https://github.com/prometheus/alertmanager/releases/download/v0.22.2/alertmanager-0.22.2.linux-amd64.tar.gz

    tar xf alertmanager-0.22.2.linux-amd64.tar.gz

    mv alertmanager-0.22.2.linux-amd64 /usr/local/alertmanager

    cd /usr/local/alertmanager/

    vim /usr/local/alertmanager/alertmanager.yml

    route:
      group_by: ['alertname']
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 1h
      receiver: 'alertmanager-bot'
    receivers:
    - name: 'alertmanager-bot'
      webhook_configs:
      - send_resolved: true
        url: http://127.0.0.1:9119/alert
        http_config:
          basic_auth:
            username: 'goroutine'
            password: 'goroutine-12345'
    templates:
      - '/usr/local/alertmanager/test.tmpl'
    
    
    ########### /usr/local/alertmanager/test.tmpl ############
    {{ define "test.html" }}
      {{ range .Alerts }}
     <pre>
    Instance: {{ .Labels.instance }}
    Summary: {{ .Annotations.summary }}
    Description: {{ .Annotations.description }}
    Severity: {{ .Labels.severity }}
    Started at: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
     </pre>
      {{ end }}
    {{ end }}
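
    The alertmanager tarball also ships amtool, which can validate alertmanager.yml (including the receiver and template references) before the daemon is started:

    /usr/local/alertmanager/amtool check-config /usr/local/alertmanager/alertmanager.yml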
    

      

    nohup /usr/local/alertmanager/alertmanager --config.file=/usr/local/alertmanager/alertmanager.yml --storage.path=/usr/local/alertmanager/data > /usr/local/alertmanager/alertmanager.log &
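
    Alertmanager exposes a health endpoint similar to Prometheus, which is handy for confirming the nohup start worked:

    curl -s http://127.0.0.1:9093/-/healthy

    tail /usr/local/alertmanager/alertmanager.log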

    Test

    #!/usr/bin/env bash
    
    alerts_message='[
      {
        "labels": {
           "alertname": "DiskRunningFull",
           "dev": "sda1",
           "instance": "example1",
           "msgtype": "testing"
         },
         "annotations": {
            "info": "The disk sda1 is running full",
            "summary": "please check the instance example1"
          }
      },
      {
        "labels": {
           "alertname": "DiskRunningFull",
           "dev": "sda2",
           "instance": "example1",
           "msgtype": "testing"
         },
         "annotations": {
            "info": "The disk sda2 is running full",
            "summary": "please check the instance example1",
            "runbook": "the following link http://test-url should be clickable"
          }
      }
    ]'
    
    curl -XPOST -d"$alerts_message" http://127.0.0.1:9093/api/v1/alerts

      

    Prometheus configuration changes

    /usr/local/prometheus-2.28.0/prometheus.yml

    alerting:
      alertmanagers:
      - static_configs:
        - targets:
       - 127.0.0.1:9093
    
    # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
    rule_files:
      - "rules/*.yml"
    

      

    rules/base_rules.yml (the node_exporter:* expressions in these alerts are custom recording rules; a sketch of how they might be defined follows the rule file)

    groups:
      - name: node-exporter-alert
        rules:
        - alert: node-exporter-down
          expr: node_exporter:up == 0 
          for: 1m
          labels: 
            severity: info
          annotations: 
            summary: "instance: {{ $labels.instance }} 宕机了"  
            description: "instance: {{ $labels.instance }} 
    - job: {{ $labels.job }} 关机了, 时间已经1分钟了。" 
            value: "{{ $value }}"
            instance: "{{ $labels.instance }}"
            grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
            type: "google-cloud"
    
        - alert: node-exporter-cpu-high 
          expr:  node_exporter:cpu:total:percent > 80
          for: 3m
          labels: 
            severity: info
          annotations: 
            summary: "instance: {{ $labels.instance }} cpu 使用率高于 {{ $value }}"  
            description: ""    
            value: "{{ $value }}"
            instance: "{{ $labels.instance }}"
            grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
            type: "google-cloud"
    
        - alert: node-exporter-cpu-iowait-high 
          expr:  node_exporter:cpu:iowait:percent >= 12
          for: 3m
          labels: 
            severity: info
          annotations: 
            summary: "instance: {{ $labels.instance }} cpu iowait 使用率高于 {{ $value }}"  
            description: ""    
            value: "{{ $value }}"
            instance: "{{ $labels.instance }}"
            grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
            type: "google-cloud"
    
        - alert: node-exporter-load-load1-high 
          expr:  (node_exporter:load:load1) > (node_exporter:cpu:count) * 1.2
          for: 3m
          labels: 
            severity: info
          annotations: 
            summary: "instance: {{ $labels.instance }} load1 使用率高于 {{ $value }}"  
            description: ""    
            value: "{{ $value }}"
            instance: "{{ $labels.instance }}"
            grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
            type: "google-cloud"
    
        - alert: node-exporter-memory-high
          expr:  node_exporter:memory:used:percent > 85
          for: 3m
          labels: 
            severity: info
          annotations: 
            summary: "instance: {{ $labels.instance }} memory 使用率高于 {{ $value }}"  
            description: ""    
            value: "{{ $value }}"
            instance: "{{ $labels.instance }}"
            grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
            type: "google-cloud"
    
        - alert: node-exporter-disk-high
          expr:  node_exporter:disk:used:percent > 88
          for: 10m
          labels: 
            severity: info
          annotations: 
            summary: "instance: {{ $labels.instance }} disk 使用率高于 {{ $value }}"  
            description: ""    
            value: "{{ $value }}"
            instance: "{{ $labels.instance }}"
            grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
            type: "google-cloud"
    
        - alert: node-exporter-disk-read-count-high
          expr:  node_exporter:disk:read:count:rate > 3000
          for: 2m
          labels: 
            severity: info
          annotations: 
            summary: "instance: {{ $labels.instance }} iops read 使用率高于 {{ $value }}"  
            description: ""    
            value: "{{ $value }}"
            instance: "{{ $labels.instance }}"
            grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
            type: "google-cloud"
    
        - alert: node-exporter-disk-write-count-high
          expr:  node_exporter:disk:write:count:rate > 3000
          for: 2m
          labels: 
            severity: info
          annotations: 
            summary: "instance: {{ $labels.instance }} iops write 使用率高于 {{ $value }}"  
            description: ""    
            value: "{{ $value }}"
            instance: "{{ $labels.instance }}"
            grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
            type: "google-cloud"
    
    
        - alert: node-exporter-disk-read-mb-high
          expr:  node_exporter:disk:read:mb:rate > 60 
          for: 2m
          labels: 
            severity: info
          annotations: 
            summary: "instance: {{ $labels.instance }} 读取字节数 高于 {{ $value }}"  
            description: ""    
            instance: "{{ $labels.instance }}"
            value: "{{ $value }}"
            grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
            type: "google-cloud"
    
        - alert: node-exporter-disk-write-mb-high
          expr:  node_exporter:disk:write:mb:rate > 60
          for: 2m
          labels: 
            severity: info
          annotations: 
            summary: "instance: {{ $labels.instance }} 写入字节数 高于 {{ $value }}"  
            description: ""    
            value: "{{ $value }}"
            instance: "{{ $labels.instance }}"
            grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
            type: "google-cloud"
    
        - alert: node-exporter-filefd-allocated-percent-high 
          expr:  node_exporter:filefd_allocated:percent > 80
          for: 10m
          labels: 
            severity: info
          annotations: 
            summary: "instance: {{ $labels.instance }} 打开文件描述符 高于 {{ $value }}"  
            description: ""    
            value: "{{ $value }}"
            instance: "{{ $labels.instance }}"
            grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
            type: "google-cloud"
    
        - alert: node-exporter-network-netin-error-rate-high
          expr:  node_exporter:network:netin:error:rate > 4
          for: 1m
          labels: 
            severity: info
          annotations: 
            summary: "instance: {{ $labels.instance }} 包进入的错误速率 高于 {{ $value }}"  
            description: ""    
            value: "{{ $value }}"
            instance: "{{ $labels.instance }}"
            grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
            type: "google-cloud"
        - alert: node-exporter-network-netin-packet-rate-high
          expr:  node_exporter:network:netin:packet:rate > 35000
          for: 1m
          labels: 
            severity: info
          annotations: 
            summary: "instance: {{ $labels.instance }} 包进入速率 高于 {{ $value }}"  
            description: ""    
            value: "{{ $value }}"
            instance: "{{ $labels.instance }}"
            grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
            type: "google-cloud"
    
        - alert: node-exporter-network-netout-packet-rate-high
          expr:  node_exporter:network:netout:packet:rate > 35000
          for: 1m
          labels: 
            severity: info
          annotations: 
            summary: "instance: {{ $labels.instance }} 包流出速率 高于 {{ $value }}"  
            description: ""    
            value: "{{ $value }}"
            instance: "{{ $labels.instance }}"
            grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
            type: "google-cloud"
    
        - alert: node-exporter-network-tcp-total-count-high
          expr:  node_exporter:network:tcp:total:count > 40000
          for: 1m
          labels: 
            severity: info
          annotations: 
            summary: "instance: {{ $labels.instance }} tcp连接数量 高于 {{ $value }}"  
            description: ""    
            value: "{{ $value }}"
            instance: "{{ $labels.instance }}"
            grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
            type: "google-cloud"
    
        - alert: node-exporter-process-zoom-total-count-high 
          expr:  node_exporter:process:zoom:total:count > 10
          for: 10m
          labels: 
            severity: info
          annotations: 
            summary: "instance: {{ $labels.instance }} 僵死进程数量 高于 {{ $value }}"  
            description: ""    
            value: "{{ $value }}"
            instance: "{{ $labels.instance }}"
            grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
            type: "google-cloud"
    
        - alert: node-exporter-time-offset-high
          expr:  node_exporter:time:offset > 0.03
          for: 2m
          labels: 
            severity: info
          annotations:
            summary: "instance: {{ $labels.instance }} {{ $labels.desc }}  {{ $value }} {{ $labels.unit }}"  
            description: ""    
            value: "{{ $value }}"
            instance: "{{ $labels.instance }}"
            grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }} "
            type: "google-cloud"
    

      

    systemctl restart prometheus
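
    After the restart, the Prometheus HTTP API can confirm that the rule files were loaded and that the Alertmanager target was picked up:

    curl -s http://localhost:9090/api/v1/rules | head -c 500; echo

    curl -s http://localhost:9090/api/v1/alertmanagers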

  • Original article: https://www.cnblogs.com/wangshuyang/p/14984258.html