• ansible部署prometheus+node-exporter


    ansible部署prometheus+node-exporter

    简单部署prometheus监控系统

    yum安装ansible

    yum install ansible
    

    ansible的hosts文件

    # Host that runs the Prometheus server.
    [prometheus]
    10.9.119.1
    
    # Host(s) that run Alertmanager.
    [alertmanagers]
    10.9.119.1
    
    # Hosts that run node_exporter, written as an inventory range (.1 through .3).
    [node-exporter]
    10.9.119.[1:3]
    

    文件层次格式如下:

    prometheus

    prometheus.yaml

    # Play: unpack Prometheus 2.24.0 under /opt/prometheus, install its systemd
    # unit, config and alert rules, then make sure the service is enabled and
    # running. Any change to the release link, unit file, config or rules notifies
    # the restart handler, which runs once at the end of the play.
    - hosts: prometheus
      remote_user: root
      tasks:
        - name: create dir
          file:
            path: /opt/prometheus
            state: directory  # created when it does not exist yet
        - name: copy file
          # src is an archive on the control node; it is extracted into dest.
          unarchive:
            src: prometheus-2.24.0.linux-amd64.tar.gz
            dest: /opt/prometheus
        - name: create link
          # Version-independent path; re-pointing it (an upgrade) restarts the service.
          file:
            src: /opt/prometheus/prometheus-2.24.0.linux-amd64
            dest: /opt/prometheus/prometheus
            state: link  # symbolic link
          notify:
            - restart prometheus
        - name: copy service file
          template:
            src: prometheus.service.j2
            dest: /usr/lib/systemd/system/prometheus.service
          # A changed unit file only takes effect after daemon-reload + restart.
          notify:
            - restart prometheus
        - name: copy config yaml
          template:
            src: prometheus.yml.j2
            dest: /opt/prometheus/prometheus/prometheus.yml
          notify:
            - restart prometheus
        - name: create rules dir
          file:
            path: /opt/prometheus/prometheus/rules
            state: directory
        # node.yml contains literal double-brace expressions meant for Prometheus,
        # so it is shipped with copy instead of template to keep Jinja away from it.
        - name: copy rules yaml
          copy:
            src: node.yml
            dest: /opt/prometheus/prometheus/rules/node.yml
          notify:  # triggers the handler below
            - restart prometheus
        - name: start prometheus
          systemd:
            name: prometheus
            state: started
            enabled: true
            # Make systemd re-read unit files before it acts on the service.
            daemon_reload: true
      handlers:
        - name: restart prometheus
          systemd:
            name: prometheus
            state: restarted
            daemon_reload: true
    

    prometheus.service.j2 可以使用copy模块,这里使用了template

    # systemd unit for the Prometheus server installed at /opt/prometheus/prometheus.
    [Unit]
    Description=Prometheus
    Documentation=https://prometheus.io/docs/introduction/overview/
    After=network.target
    
    [Service]
    Type=simple
    # ExecStart passes no flags, so relative default paths are resolved against
    # this directory (presumably prometheus.yml and the data directory - confirm).
    WorkingDirectory=/opt/prometheus/prometheus
    ExecStart=/opt/prometheus/prometheus/prometheus
    # SIGHUP asks Prometheus to reload its configuration.
    ExecReload=/bin/kill -HUP $MAINPID
    # No ExecStop on purpose: systemd then stops the service with SIGTERM, which
    # lets Prometheus shut down cleanly, and only escalates to SIGKILL after the
    # stop timeout. An unconditional "kill -KILL" would skip that clean shutdown.
    KillMode=control-group
    Restart=on-failure
    RestartSec=3s
    
    [Install]
    WantedBy=multi-user.target
    

    prometheus.yml.j2

    # Prometheus server configuration, rendered by Ansible. Target lists are
    # generated from the inventory groups 'alertmanagers', 'prometheus' and
    # 'node-exporter'. The for/endfor tags sit at column 0 so that only the
    # target lines themselves end up in the rendered file.
    
    # Global settings
    global:
      scrape_interval:     30s  # how often targets are scraped
      evaluation_interval: 30s  # how often rules are evaluated
      query_log_file: ./promql.log
    
    # Alerting: where firing alerts are sent
    alerting:
      alertmanagers:
      - static_configs:
        # One "<host>:9093" entry per host in the 'alertmanagers' group.
        - targets:
    {% for alertmanager in groups['alertmanagers'] %}
          - "{{ alertmanager }}:9093"
    {% endfor %}
    # Rule files, relative to this configuration file
    rule_files:
      - "rules/*.yml"
    
    # Scrape jobs
    scrape_configs:
      # Prometheus scraping its own metrics endpoint.
      - job_name: 'prometheus'
        static_configs:
        - targets:
    {% for prometheus_host in groups['prometheus'] %}
          - "{{ prometheus_host }}:9090"
    {% endfor %}
      # node_exporter on every host of the 'node-exporter' group.
      - job_name: "node"
        static_configs:
        - targets:
    {% for node in groups['node-exporter'] %}
          - "{{ node }}:9100"
    {% endfor %}
    

    node-rules规则文件node.yml

    # Alerting rules for hosts scraped through node_exporter.
    # Each rule must hold for its "for" duration before the alert fires.
    groups:
    - name: node.rules  # rule group name
      rules:
      # Target could not be scraped.
      - alert: node is Down
        expr: up == 0
        for: 30s  # fires after 30 seconds without a successful scrape
        labels:
          # 'critical' is the severity the Alertmanager inhibit rule uses as its source.
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 30 seconds."
    
      # Used space on ext4/xfs filesystems above 80%.
      - alert: node Filesystem
        expr: 100 - (node_filesystem_free_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"} * 100) > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "{{$labels.instance}}: {{$labels.mountpoint }} 分区使用过高"
          description: "{{$labels.instance}}: {{$labels.mountpoint }} 分区使用大于 80% (当前值: {{ $value }})"
    
      # Memory in use (total minus free, cached and buffers) above 80%.
      - alert: node Memory
        expr: 100 - (node_memory_MemFree_bytes+node_memory_Cached_bytes+node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "{{$labels.instance}}: 内存使用过高"
          description: "{{$labels.instance}}: 内存使用大于 80% (当前值: {{ $value }})"
    
      # Non-idle CPU share per instance above 80%.
      - alert: node CPU
        expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "{{$labels.instance}}: CPU使用过高"
          description: "{{$labels.instance}}: CPU使用大于 80% (当前值: {{ $value }})"
    

    node-exporter

    node-exporter.yaml

    # Play: unpack node_exporter 1.0.1 under /opt/prometheus, install its systemd
    # unit and make sure the service is enabled and running. The service is
    # restarted only when the release link or the unit file actually changed,
    # instead of on every run.
    - hosts: node-exporter
      remote_user: root
      tasks:
        - name: create dir
          file:
            path: /opt/prometheus
            state: directory
        - name: copy file
          # src is an archive on the control node; it is extracted into dest.
          unarchive:
            src: node_exporter-1.0.1.linux-amd64.tar.gz
            dest: /opt/prometheus
        - name: create link
          # Version-independent path; re-pointing it (an upgrade) restarts the service.
          file:
            src: /opt/prometheus/node_exporter-1.0.1.linux-amd64
            dest: /opt/prometheus/node_exporter
            state: link
          notify:
            - restart node_exporter
        - name: copy service file
          template:
            src: node_exporter.service.j2
            dest: /usr/lib/systemd/system/node_exporter.service
          notify:
            - restart node_exporter
        - name: start node_exporter
          systemd:
            name: node_exporter
            state: started
            enabled: true
            # Make systemd re-read unit files before it acts on the service.
            daemon_reload: true
      handlers:
        - name: restart node_exporter
          systemd:
            name: node_exporter
            state: restarted
            daemon_reload: true
    

    node_exporter.service.j2

    # systemd unit for node_exporter installed at /opt/prometheus/node_exporter.
    [Unit]
    Description=Node Exporter
    After=network.target
    Documentation=
    
    [Service]
    Type=simple
    WorkingDirectory=/opt/prometheus/node_exporter/
    ExecStart=/opt/prometheus/node_exporter/node_exporter
    # Stopping sends SIGKILL straight to the main process.
    ExecStop=/bin/kill -KILL $MAINPID
    KillMode=control-group
    # Restart three seconds after an unclean exit.
    Restart=on-failure
    RestartSec=3s
    
    [Install]
    WantedBy=multi-user.target
    

    alertmanager

    alertmanager.yaml

    # Play: unpack Alertmanager 0.21.0 under /opt/prometheus, install its systemd
    # unit and config, and make sure the service is enabled and running. The
    # service is restarted through the handler only when the release link, unit
    # file or config changed, rather than unconditionally on every run.
    - hosts: alertmanagers
      remote_user: root
      tasks:
        - name: create dir
          file:
            path: /opt/prometheus
            state: directory
        - name: copy file
          # src is an archive on the control node; it is extracted into dest.
          unarchive:
            src: alertmanager-0.21.0.linux-amd64.tar.gz
            dest: /opt/prometheus
        - name: create link
          # Version-independent path; re-pointing it (an upgrade) restarts the service.
          file:
            src: /opt/prometheus/alertmanager-0.21.0.linux-amd64
            dest: /opt/prometheus/alertmanager
            state: link
          notify:
            - restart alertmanager
        - name: copy service file
          template:
            src: alertmanager.service.j2
            dest: /usr/lib/systemd/system/alertmanager.service
          notify:
            - restart alertmanager
        - name: copy config yaml
          template:
            src: alertmanager.yml.j2
            dest: /opt/prometheus/alertmanager/alertmanager.yml
          notify:
            - restart alertmanager
        - name: start server
          systemd:
            name: alertmanager
            state: started
            enabled: true
            # Make systemd re-read unit files before it acts on the service.
            daemon_reload: true
      handlers:
        - name: restart alertmanager
          systemd:
            name: alertmanager
            state: restarted
            daemon_reload: true
    

    alertmanager.service.j2

    # systemd unit for Alertmanager installed at /opt/prometheus/alertmanager.
    [Unit]
    Description=AlertManager
    Documentation=https://prometheus.io/docs/alerting/latest/alertmanager/
    After=network.target
    
    [Service]
    Type=simple
    # ExecStart passes no flags, so relative default paths are resolved against
    # this directory (presumably alertmanager.yml and the data directory - confirm).
    WorkingDirectory=/opt/prometheus/alertmanager/
    ExecStart=/opt/prometheus/alertmanager/alertmanager
    # SIGHUP asks Alertmanager to reload its configuration.
    ExecReload=/bin/kill -HUP $MAINPID
    # No ExecStop on purpose: systemd then stops the service with SIGTERM and
    # only escalates to SIGKILL after the stop timeout, so the process gets a
    # chance to shut down cleanly instead of being killed outright.
    KillMode=control-group
    Restart=on-failure
    RestartSec=3s
    
    [Install]
    WantedBy=multi-user.target
    

    alertmanager.yml.j2 这里使用了邮箱告警

    # Alertmanager configuration: e-mail notifications through QQ mail, a default
    # receiver plus two child routes ('db' and 'web'), and one inhibit rule.
    global:
      # How long an alert that is no longer reported is kept before it is
      # declared resolved.
      resolve_timeout: 5m
      smtp_from: "123456789@qq.com"
      smtp_smarthost: 'smtp.qq.com:465'
      smtp_auth_username: "123456789@qq.com"  # mailbox account
      # Mail authorization code, not the QQ login password.
      # NOTE(review): this is a credential in plain text; consider supplying it
      # from Ansible Vault instead of hard-coding it in the template.
      smtp_auth_password: "bcvizcgqbgojjjeb"
      smtp_require_tls: false  # set to false because port 465 is used
    route:
      group_by: ['alertname']  # label used to group alerts
      group_wait: 10s  # wait before the first notification of a new group
      group_interval: 10s  # wait before notifying about changes to a group
      repeat_interval: 24h  # wait before repeating a still-firing notification
      receiver: 'default-receiver'  # receiver of the root route
    
      # Alerts that match none of the child routes below stay on the root route
      # and are sent to 'default-receiver'.
      routes:
      - receiver: 'db'
        group_wait: 10s
        match_re:
          # Regex match: service=mysql or service=redis goes to 'db'.
          service: mysql|redis
    
      - receiver: 'web'
        group_by: [product, environment]  # grouped by product and environment
        match:
          team: frontend  # alerts labelled team=frontend go to 'web'
    
    receivers:
    - name: 'default-receiver'
      email_configs:
      - to: '123456789@qq.com'  # alert recipient
    
    - name: 'db'
      # Notify by e-mail.
      email_configs:
      - to: '111111111@qq.com'
    
    - name: 'web'
      email_configs:
      - to: '222222222@qq.com'
    
    # Inhibition: while a 'critical' alert is firing, matching 'warning' alerts
    # are muted so that only the critical one is sent.
    inhibit_rules:
      - source_match:
          severity: 'critical'
        target_match:
          severity: 'warning'
        # NOTE(review): source and target must agree on all of these labels,
        # including alertname, so alerts with different names never inhibit
        # each other - confirm this is the intended scope.
        equal: ['alertname', 'dev', 'instance']
    

    ansible-playbook部署

    可以先使用 ansible-playbook -C(check 模式)进行测试,该模式只做检查,不会实际修改目标主机
    
    # Apply the three playbooks one after another.
    for playbook in prometheus.yaml node-exporter.yaml alertmanager.yaml; do
      ansible-playbook "$playbook"
    done
    

    后续会改成roles方式

    每天进步一点点
  • 相关阅读:
    BZOJ 1631 Cow Party
    BZOJ 1927 星际竞速
    BZOJ 4059 Non-boring sequences
    BZOJ 1562 变换序列
    BZOJ 4417 超级跳马
    484586
    背板问题之满包问题
    对01背包路径的记录
    带权值的图 BFS
    漫步校园 杭电1428
  • 原文地址:https://www.cnblogs.com/Otiger/p/14470488.html
Copyright © 2020-2023  润新知