• prometheus 生产环境部署


    https://prometheus.io/docs/introduction/overview
    https://docs.timescale.com/timescaledb/latest/tutorials/promscale
    https://hanlingsha.cn/2021/03/31/%E4%BD%BF%E7%94%A8Prometheus%E6%89%B9%E9%87%8F%E9%83%A8%E7%BD%B2%E7%9B%91%E6%8E%A7%E4%B8%BB%E6%9C%BA/

    1. Bridge

    # Create the user-defined bridge network named promscale-timescaledb.
    docker network create -d bridge promscale-timescaledb
    

    2. TimescaleDB

    # Start TimescaleDB in the background on the promscale-timescaledb network,
    # publishing port 5432 and passing POSTGRES_PASSWORD into the container.
    docker run \
      --detach \
      --name timescaledb \
      --network promscale-timescaledb \
      --publish 5432:5432 \
      --env POSTGRES_PASSWORD=abcd1234 \
      192.168.100.198:5000/timescale/timescaledb-ha:pg13-latest
    

    3. Promscale

    # Start the Promscale connector in the background on the
    # promscale-timescaledb network and publish port 9201.
    # The connection URI is single-quoted so the shell never treats the `?`
    # of the query string as a glob character (zsh would abort with
    # "no matches found"; bash with failglob/nullglob would misbehave).
    docker run --name promscale -d \
      --network promscale-timescaledb \
      -p 9201:9201 \
      192.168.100.198:5000/timescale/promscale:0.6.2 \
      -db-uri 'postgres://postgres:abcd1234@timescaledb:5432/postgres?sslmode=allow'
    

    4. node_exporter

    # Start node_exporter in the background on the promscale-timescaledb
    # network and publish port 9100.
    docker run \
      --detach \
      --name node_exporter \
      --network promscale-timescaledb \
      --publish 9100:9100 \
      192.168.100.198:5000/prom/node-exporter:v1.3.0
    

    5. Prometheus

    mkdir -p /etc/prometheus/{targets,rules}
    
    cat > /etc/prometheus/prometheus.yml << EOF
    # Prometheus全局配置项
    global:
      scrape_interval:     15s # 设定抓取数据的周期,默认为1min
      evaluation_interval: 15s # 设定更新rules文件的周期,默认为1min
      scrape_timeout: 15s      # 设定抓取数据的超时时间,默认为10s
      external_labels:         # 额外的属性,会添加到拉取的数据并存到数据库中
    
    # Alertmanager配置
    alerting:
     alertmanagers:
     - static_configs:
       - targets: ["192.168.100.181:9093"] # 设定alertmanager和prometheus交互的接口,即alertmanager监听的ip地址和端口
         
    # rule配置,首次读取默认加载,之后根据evaluation_interval设定的周期加载
    rule_files:
     - "/etc/prometheus/rules/*.yml"
    
    # scrape配置
    scrape_configs:
    - job_name: file_sd_configs
      file_sd_configs:
      - files:
        - /etc/prometheus/targets/*.json
    
    remote_write:
    - url: http://192.168.100.181:9201/write
    remote_read:
    - url: http://192.168.100.181:9201/read
    EOF
    
    # Write the target files picked up by the file_sd_configs job
    # (/etc/prometheus/targets/*.json).

    # node_exporter on vm-181, published on port 9100.
    cat > /etc/prometheus/targets/vm-181.json << EOF
    [{"targets": ["192.168.100.181:9100"], "labels": {"hostname": "vm-181", "service": "node"}}]
    EOF
    
    # Prometheus itself on vm-181. It is published on port 9090; pointing this
    # target at 9100 would scrape node_exporter a second time under the wrong
    # "service" label.
    cat > /etc/prometheus/targets/prometheus.json << EOF
    [{"targets": ["192.168.100.181:9090"], "labels": {"hostname": "vm-181", "service": "prometheus"}}]
    EOF
    
    # Start Prometheus in the background on the promscale-timescaledb network,
    # publish port 9090 and bind-mount the host's /etc/prometheus over the
    # container's /etc/prometheus.
    docker run \
      --detach \
      --name prometheus \
      --network promscale-timescaledb \
      --publish 9090:9090 \
      --volume /etc/prometheus:/etc/prometheus \
      192.168.100.198:5000/prom/prometheus:v2.31.1
    
    http://192.168.100.181:9090/targets
    

    6. alertmanager

    mkdir -p /etc/alertmanager/template
    
    cat > /etc/alertmanager/alertmanager.yml << EOF
    # 全局配置项
    global: 
      resolve_timeout: 5m #处理超时时间,默认为5min
      smtp_smarthost: 'smtp.sina.com:25' # 邮箱smtp服务器代理
      smtp_from: '******@sina.com' # 发送邮箱名称
      smtp_auth_username: '******@sina.com' # 邮箱名称
      smtp_auth_password: '******' # 邮箱密码或授权码
      wechat_api_url: 'https://qyapi.weixin.qq.com/cgi-bin/' # 企业微信地址
    
    
    # 定义模板信息
    templates:
      - 'template/*.tmpl'
    
    # 定义路由树信息
    route:
      group_by: ['alertname'] # 报警分组依据
      group_wait: 10s # 最初即第一次等待多久时间发送一组警报的通知
      group_interval: 10s # 在发送新警报前的等待时间
      repeat_interval: 1m # 发送重复警报的周期 对于email配置中,此项不可以设置过低,否则将会由于邮件发送太多频繁,被smtp服务器拒绝
      receiver: 'email' # 发送警报的接收者的名称,以下receivers name的名称
    
    # 定义警报接收者信息
    receivers:
      - name: 'email' # 警报
        email_configs: # 邮箱配置
        - to: '******@163.com'  # 接收警报的email配置
          html: '{{ template "test.html" . }}' # 设定邮箱的内容模板
          headers: { Subject: "[WARN] 报警邮件"} # 接收邮件的标题
        webhook_configs: # webhook配置
        - url: 'http://127.0.0.1:5001'
          send_resolved: true
        wechat_configs: # 企业微信报警配置
        - send_resolved: true
          to_party: '1' # 接收组的id
          agent_id: '1000002' # (企业微信-->自定应用-->AgentId)
          corp_id: '******' # 企业信息(我的企业-->CorpId[在底部])
          api_secret: '******' # 企业微信(企业微信-->自定应用-->Secret)
          message: '{{ template "test_wechat.html" . }}' # 发送消息模板的设定
    
    # 一个inhibition规则是在与另一组匹配器匹配的警报存在的条件下,使匹配一组匹配器的警报失效的规则。两个警报必须具有一组相同的标签。 
    inhibit_rules: 
      - source_match: 
         severity: 'critical' 
        target_match: 
         severity: 'warning' 
        equal: ['alertname', 'dev', 'instance']
    EOF
    
    # Start Alertmanager in the background on the promscale-timescaledb
    # network, publish port 9093 and bind-mount the host's /etc/alertmanager
    # over the container's /etc/alertmanager.
    docker run \
      --detach \
      --name alertmanager \
      --network promscale-timescaledb \
      --publish 9093:9093 \
      --volume /etc/alertmanager:/etc/alertmanager \
      192.168.100.198:5000/prom/alertmanager:v0.23.0
    

    7. pushgateway

    # Start Pushgateway in the background on the promscale-timescaledb
    # network and publish port 9091.
    docker run \
      --detach \
      --name pushgateway \
      --network promscale-timescaledb \
      --publish 9091:9091 \
      192.168.100.198:5000/prom/pushgateway:v1.4.2
    

    8. docker-compose

    cat > docker-compose.yaml << EOF
    version: '3.0'
    
    services:
      db:
        image: timescaledev/timescaledb-ha:pg12-latest
        ports:
          - 5432:5432/tcp
        environment:
          POSTGRES_PASSWORD: password
          POSTGRES_USER: postgres
    
      prometheus:
        image: prom/prometheus:latest
        ports:
          - 9090:9090/tcp
        volumes:
          - ${PWD}/prometheus.yml:/etc/prometheus/prometheus.yml
    
      promscale:
        image: timescale/promscale:latest
        ports:
          - 9201:9201/tcp
        restart: on-failure
        depends_on:
          - db
          - prometheus
        environment:
          PROMSCALE_DB_CONNECT_RETRIES: 10
          PROMSCALE_WEB_TELEMETRY_PATH: /metrics-text
          PROMSCALE_DB_URI: postgres://postgres:password@db:5432/postgres?sslmode=allow
    
      node_exporter:
        image: quay.io/prometheus/node-exporter
        ports:
          - "9100:9100"
    EOF
    
    docker-compose up
    

    9. grafana

    # Prepare the host data directory and hand it to uid/gid 472 before it is
    # bind-mounted (presumably the user the Grafana image runs as - confirm
    # against the image documentation).
    mkdir -p /var/lib/grafana \
      && chown -R 472:472 /var/lib/grafana
    
    # Start Grafana in the background on the promscale-timescaledb network,
    # publish port 3000, mount the data directory and pass the plugin list
    # through GF_INSTALL_PLUGINS.
    docker run \
      --detach \
      --name grafana \
      --network promscale-timescaledb \
      --publish 3000:3000 \
      --volume /var/lib/grafana:/var/lib/grafana \
      --env GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource \
      192.168.100.198:5000/grafana/grafana:8.2.5
    
    # node-exporter
    https://grafana.com/grafana/dashboards/8919
    https://grafana.com/grafana/dashboards/8919
    
    kubernetes
    https://grafana.com/grafana/dashboards/8685
    
    ceph
    https://grafana.com/grafana/dashboards/2842
    https://grafana.com/grafana/dashboards/5342
    https://grafana.com/grafana/dashboards/5336
    https://grafana.com/grafana/dashboards/9966
    

    10. grafana loki

    192.168.100.198:5000/grafana/loki:main-88feda4
    

    11. run args

    /bin/prometheus --config.file=/etc/prometheus/prometheus.yml --storage.tsdb.path=/prometheus --web.console.libraries=/usr/share/prometheus/console_libraries --web.console.templates=/usr/share/prometheus/consoles
    /bin/alertmanager --config.file=/etc/alertmanager/alertmanager.yml --storage.path=/alertmanager
    /bin/pushgateway
    /bin/node_exporter
    

    12. 报警群发

    https://work.weixin.qq.com 注册企业微信,自建应用,获得企业id和应用secret
    
    # Write the e-mail notification template that defines "test.html"
    # (the name used by the email_configs html setting in alertmanager.yml).
    # `>` is required: without it cat only tries to read the file and nothing
    # is written. The heredoc delimiter is quoted so the shell does not expand
    # $i and $alert, which belong to the Go template, into empty strings.
    cat > /etc/alertmanager/template/email.tmpl << 'EOF'
    {{ define "test.html" }}
    <table border="1">
      <tr>
        <td>报警项</td>
        <td>实例</td>
        <td>报警阀值</td>
        <td>开始时间</td>
      </tr>
      {{ range $i, $alert := .Alerts }}
      <tr>
        <td>{{ index $alert.Labels "alertname" }}</td>
        <td>{{ index $alert.Labels "instance" }}</td>
        <td>{{ index $alert.Annotations "value" }}</td>
        <td>{{ $alert.StartsAt }}</td>
      </tr>
      {{ end }}
    </table>
    {{ end }}
    EOF
    
    # Write the WeChat notification template.
    # - `>` is required: without it cat only tries to read the file and
    #   nothing is written.
    # - The heredoc delimiter is quoted so the shell does not expand $i and
    #   $alert, which belong to the Go template, into empty strings.
    # - The template is named "test_wechat.html" because that is the name the
    #   wechat_configs message setting in alertmanager.yml refers to; the
    #   previous name "webchat.html" was never referenced, so the lookup failed.
    cat > /etc/alertmanager/template/wechat.tmpl << 'EOF'
    {{ define "test_wechat.html" }}
    <table border="1">
      <tr>
        <td>报警项</td>
        <td>实例</td>
        <td>报警阀值</td>
        <td>开始时间</td>
      </tr>
      {{ range $i, $alert := .Alerts }}
      <tr>
        <td>{{ index $alert.Labels "alertname" }}</td>
        <td>{{ index $alert.Labels "instance" }}</td>
        <td>{{ index $alert.Annotations "value" }}</td>
        <td>{{ $alert.StartsAt }}</td>
      </tr>
      {{ end }}
    </table>
    {{ end }}
    EOF
    
    # Write the alerting rule file matched by rule_files
    # (/etc/prometheus/rules/*.yml).
    # The heredoc delimiter is quoted so the shell leaves $labels and $value
    # untouched for Prometheus' template engine. The `value` annotation is
    # quoted because a bare {{ ... }} is parsed by YAML as a flow mapping.
    cat > /etc/prometheus/rules/alertmanager_rule.yml << 'EOF'
    groups:
     - name: test-rules
       rules:
       - alert: InstanceDown # alert name
         expr: up == 0 # firing condition, written as a PromQL expression
         for: 2m # how long the condition must hold before the alert is sent
         labels: # extra labels attached to the alert
           team: node
         annotations: # descriptive details about the alert
           summary: "{{$labels.instance}}: has been down"
           description: "{{$labels.instance}}: job {{$labels.job}} has been down "
           value: "{{$value}}"
    EOF
    
  • 相关阅读:
    《剑指offer》 面试题43 n个骰子的点数 (java)
    《剑指offer》面试题45 圆圈中最后剩下的数字(Java版本)
    《剑指offer》面试题39 二叉树的深度(java)
    《剑指offer》面试题32----从1到n整数中1出现的次数
    快速排序思路整理
    《Java程序猿面试宝典》之字符串
    Tomcat的server.xml
    easyui combobox 清除选中项目 和 清空option选项
    2019
    throw UnsupportedOperationException
  • 原文地址:https://www.cnblogs.com/liujitao79/p/15576935.html
Copyright © 2020-2023  润新知