• prometheus监控ES【转】


    prometheus监控es,同样采用exporter的方案。

    项目地址:

    elasticsearch_exporter:https://github.com/justwatchcom/elasticsearch_exporter

    默认端口 9114

    1、安装部署

    【1.0】封装成系统服务、一键部署

    前提:把 elasticsearch_exporter-1.1.0.linux-amd64.tar.gz 安装包复制到脚本所在的当前目录

    vim es_exporter_install.sh

    #!/bin/bash
    # Locate the running Elasticsearch node's config directory, derive the HTTP
    # endpoint the exporter should query, then unpack the exporter and create
    # its service account.
    # Globals:  es_path_config, configfile, ip, port (all written; run() reads
    #           ip and port)
    # Requires: elasticsearch_exporter-1.1.0.linux-amd64.tar.gz in the current
    #           directory, and an Elasticsearch process started with
    #           -Des.path.conf=<dir>
    # Exits:    1 when ip/port cannot be determined or the tarball cannot be
    #           unpacked
    init(){
        # Put every command-line argument on its own line, then keep only the
        # value of -Des.path.conf=<dir>.
        es_path_config="$(ps -ef | grep elastic | tr ' ' '\n' \
            | grep -e '-Des\.path\.conf=' | awk -F'=' '{print $2}' | head -n 1)/"
        configfile=elasticsearch.yml
        # Anchor on the key so commented-out samples such as
        # "#network.host: 192.168.0.1" are skipped; use the first real entry.
        ip=$(awk -F':' '/^[ \t]*network\.host[ \t]*:/ {gsub(/[ \t\r]/, "", $2); print $2; exit}' \
            "${es_path_config}${configfile}")
        port=$(awk -F':' '/^[ \t]*http\.port[ \t]*:/ {gsub(/[ \t\r]/, "", $2); print $2; exit}' \
            "${es_path_config}${configfile}")
        if [ -z "$ip" ] || [ -z "$port" ]; then
            echo "init is error,can't get the es's ip and port!"
            exit 1
        fi
        # A wildcard bind address is not connectable; use loopback instead.
        if [ "$ip" = '0.0.0.0' ]; then
            ip=127.0.0.1
        fi
        tar -zxf elasticsearch_exporter-1.1.0.linux-amd64.tar.gz \
            || { echo "unpack elasticsearch_exporter-1.1.0.linux-amd64.tar.gz failed!" >&2; exit 1; }
        mv elasticsearch_exporter-1.1.0.linux-amd64 /usr/local/elasticsearch_exporter
        # Only create the account when it is missing so the script can be re-run.
        getent group prometheus >/dev/null 2>&1 || groupadd prometheus
        id -u prometheus >/dev/null 2>&1 \
            || useradd -g prometheus -m -d /var/lib/prometheus -s /sbin/nologin prometheus
        chown -R prometheus:prometheus /usr/local/elasticsearch_exporter
    }
    
    
    
    # Register the exporter as a boot-time service and start it: a systemd
    # unit on EL7, a SysV init script on EL6.
    # Globals:  ip, port (read; must already hold the Elasticsearch endpoint)
    # Returns:  1 when the OS release is neither el7 nor el6
    #
    # The service files are written with printf and single-quoted lines so
    # that runtime variables such as $MAINPID stay literal in the generated
    # file; only the ${ip}/${port} lines are expanded at generation time.
    run(){
        # Match on the kernel release only; `uname -a` also prints the hostname,
        # which could contain "el7"/"el6" by accident.
        case "$(uname -r)" in
        *el7*)
            printf '%s\n' \
                '[Unit]' \
                'Description=The es_exporter' \
                'After=network.target' \
                '' \
                '[Service]' \
                'PrivateTmp=true' \
                'Type=simple' \
                'User=prometheus' \
                "ExecStart=/usr/local/elasticsearch_exporter/elasticsearch_exporter --es.uri=http://${ip}:${port}" \
                'Restart=on-failure' \
                'ExecStop=/bin/kill -s QUIT $MAINPID' \
                '' \
                '[Install]' \
                'WantedBy=multi-user.target' \
                > /lib/systemd/system/es_exporter.service

            systemctl daemon-reload
            systemctl start es_exporter
            systemctl enable es_exporter
            ;;
        *el6*)
            # The endpoint is baked in at install time. The init script honours
            # start/stop so that a "stop" at shutdown does not launch a second
            # exporter, and it captures stderr as well as stdout.
            printf '%s\n' \
                '#!/bin/bash' \
                '# chkconfig: 2345 10 90' \
                "# description: es's exporter" \
                'exporter=/usr/local/elasticsearch_exporter/elasticsearch_exporter' \
                'logfile=/var/log/es_exporter.log' \
                'case "$1" in' \
                'start)' \
                '    touch "$logfile"' \
                '    chown prometheus:prometheus "$logfile"' \
                "    su prometheus -s /bin/bash -c \"\$exporter --es.uri=http://${ip}:${port} >> \$logfile 2>&1 &\"" \
                '    ;;' \
                'stop)' \
                '    pkill -f "$exporter"' \
                '    ;;' \
                'restart)' \
                '    "$0" stop' \
                '    sleep 1' \
                '    "$0" start' \
                '    ;;' \
                'status)' \
                '    pgrep -f "$exporter" >/dev/null' \
                '    ;;' \
                '*)' \
                '    echo "Usage: $0 {start|stop|restart|status}"' \
                '    exit 2' \
                '    ;;' \
                'esac' \
                > /etc/init.d/es_exporter
            # The init script is executed by root, so it stays root-owned; it
            # must not be writable by the unprivileged service account.
            chmod 755 /etc/init.d/es_exporter
            chkconfig --add es_exporter
            chkconfig --level 3 es_exporter on
            service es_exporter start
            ;;
        *)
            echo "your os not rel7/rel6,operator fail!"
            return 1
            ;;
        esac
    }
    # Entry point: run init first, then run. The return status is that of run.
    main(){
        init
        run
    }
    main "$@"
    # Show the exporter process. The [e] bracket stops grep from matching its
    # own command line, so an empty result really means "not running".
    ps -ef | grep '[e]lasticsearch_exporter'

    sh es_exporter_install.sh

    【1.1】简便一键部署、脚本方式

    前提:把 elasticsearch_exporter 可执行文件直接放到脚本所在的当前目录

    vim install_es.sh

    #!/bin/bash
    # Quick installer: moves the elasticsearch_exporter binary from the current
    # directory to /bin, points it at the locally running Elasticsearch node,
    # starts it in the background on port 9114 and registers the same command
    # in /etc/rc.local so it is started again at boot.

    mv elasticsearch_exporter /bin/elasticsearch_exporter
    chmod +x /bin/elasticsearch_exporter
    if [ ! -x /bin/elasticsearch_exporter ]; then
        echo "/bin/elasticsearch_exporter is missing or not executable!" >&2
        exit 1
    fi
    # Put every command-line argument on its own line, then keep only the value
    # of -Des.path.conf=<dir>.
    es_path_config="$(ps -ef | grep elastic | tr ' ' '\n' \
        | grep -e '-Des\.path\.conf=' | awk -F'=' '{print $2}' | head -n 1)/"
    configfile=elasticsearch.yml
    # Anchor on the key so commented-out samples such as
    # "#network.host: 192.168.0.1" are skipped; use the first real entry.
    ip=$(awk -F':' '/^[ \t]*network\.host[ \t]*:/ {gsub(/[ \t\r]/, "", $2); print $2; exit}' \
        "${es_path_config}${configfile}")
    port=$(awk -F':' '/^[ \t]*http\.port[ \t]*:/ {gsub(/[ \t\r]/, "", $2); print $2; exit}' \
        "${es_path_config}${configfile}")
    if [ -z "$ip" ] || [ -z "$port" ]; then
        echo "init is error,can't get the es's ip and port!"
        exit 1
    fi
    # A wildcard bind address is not connectable; use loopback instead.
    if [ "$ip" = '0.0.0.0' ]; then
        ip=127.0.0.1
    fi

    start_cmd="nohup /bin/elasticsearch_exporter --es.uri=http://${ip}:${port} --web.listen-address=0.0.0.0:9114 >>/var/log/es_exporter.log 2>&1 &"
    echo "$start_cmd"
    nohup /bin/elasticsearch_exporter --es.uri="http://${ip}:${port}" --web.listen-address="0.0.0.0:9114" >> /var/log/es_exporter.log 2>&1 &
    # Register for boot only once, so re-running the script does not stack
    # duplicate start lines in rc.local.
    grep -qF -- "$start_cmd" /etc/rc.local 2>/dev/null || echo "$start_cmd" >> /etc/rc.local
    # The [e] bracket stops grep from matching its own command line.
    ps -ef | grep '[e]lasticsearch_exporter'

    【1.2】详细步骤

    接着分别在如上三台主机上进行如下配置:

    wget https://github.com/justwatchcom/elasticsearch_exporter/releases/download/v1.1.0/elasticsearch_exporter-1.1.0.linux-amd64.tar.gz
    tar -zxf elasticsearch_exporter-1.1.0.linux-amd64.tar.gz
    mv elasticsearch_exporter-1.1.0.linux-amd64 /usr/local/elasticsearch_exporter

    创建用户等

    groupadd prometheus
    useradd -g prometheus -m -d /var/lib/prometheus -s /sbin/nologin prometheus
    chown -R prometheus.prometheus /usr/local/elasticsearch_exporter

    启动监控客户端:

    nohup ./elasticsearch_exporter --web.listen-address ":9114"  --es.uri http://192.168.75.21:9200 &

    使用systemd管理:

    # Write (not append) the unit file, so re-running this step cannot leave a
    # duplicated unit behind. The quoted delimiter keeps $MAINPID literal;
    # systemd substitutes it when the unit is stopped.
    # ExecStart uses the same listen address and Elasticsearch URI as the
    # manual start command shown earlier.
    cat << 'eof' > /lib/systemd/system/es_exporter.service
    [Unit]
    Description=The es_exporter
    After=network.target

    [Service]
    Type=simple
    User=prometheus
    ExecStart=/usr/local/elasticsearch_exporter/elasticsearch_exporter --web.listen-address=:9114 --es.uri=http://192.168.75.21:9200
    Restart=on-failure
    ExecStop=/bin/kill -s QUIT $MAINPID

    [Install]
    WantedBy=multi-user.target
    eof

    启动:

    systemctl daemon-reload
    systemctl start es_exporter
    systemctl enable es_exporter

    查看metrics:

    curl 127.0.0.1:9114/metrics

    2、配置 prometheus.yml 添加监控目标

    复制代码
    vim /usr/local/prometheus/prometheus.yml
    
      # Scrape job for elasticsearch_exporter. The target port must equal the
      # exporter's --web.listen-address: 9114 as started in section 1 (use
      # 9308 instead if the exporter is started with ":9308").
      - job_name: 'elasticsearch'
        scrape_interval: 60s
        scrape_timeout:  30s
        metrics_path: "/metrics"
        static_configs:
        - targets: ['192.168.75.21:9114']
          labels:
            service: elasticsearch
    复制代码

    重启服务。

    systemctl restart prometheus

    或者通过命令热加载:

    curl  -XPOST localhost:9090/-/reload

    3、配置 Grafana 的模板

    模板通过json文件进行导入,文件就在解压的包内。

    参考地址:https://shenshengkun.github.io/posts/550bdf86.html

    或者通过如下ID进行导入:2322以及其他。

     

     

    4、开启认证的启动方式

    如果es开启了认证,那么启动的时候需要将用户名密码加载进去:

    elasticsearch_exporter --web.listen-address ":9308"  --es.uri http://username:password@192.168.75.21:9200 & 

    其中使用的是monitoring的用户密码。

    当然,除去这种命令行的启动方式之外,还可以像上边一样,基于systemd进行管理,只需将认证的参数信息写入到如下内容当中:

    参考网址:https://github.com/justwatchcom/elasticsearch_exporter

    复制代码
    cat /lib/systemd/system/es_exporter.service
    
    [Unit]
    Description=The es_exporter
    After=network.target
    
    [Service]
    Type=simple
    User=prometheus
    ExecStart=/usr/local/elasticsearch_exporter/elasticsearch_exporter --web.listen-address ":9308" --es.uri=http://username:password@192.168.75.21:9200
    Restart=on-failure
    
    [Install]
    WantedBy=multi-user.target
    复制代码

    【5】【最佳实践】es_alert.yml

    # Alerting rules for metrics exposed by elasticsearch_exporter.
    # Multi-line descriptions use the "\n" escape inside double quotes: a raw
    # line break inside a double-quoted YAML scalar is folded into a space
    # (and a continuation line at column 0 is rejected by strict parsers).
    groups:
    - name: ES告警
      rules:
      - alert: ES-集群状态变红
        expr: elasticsearch_cluster_health_status{color="red"}==1
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "主/副本分片分配有误,该问题发生在集群:{{ $labels.cluster }}"

      - alert: ES-集群状态变黄
        expr: elasticsearch_cluster_health_status{color="yellow"}==1
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "主/副本分片分配有误,该问题发生在集群:{{ $labels.cluster }}."

      # The text states the same threshold (85%) that the expression checks.
      - alert: ES-JVM堆内存使用过高
        expr: round(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}*100,0.01)>85
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "JVM堆内存使用率超过85%\n当前:{{ $value }}"

      - alert: ES-集群健康状态获取失败
        expr: elasticsearch_cluster_health_up!=1
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "该ES节点,获取集群监控状态失败 in cluster:[ {{ $labels.cluster }} ]"

      - alert: ES-太少节点运行
        expr: elasticsearch_cluster_health_number_of_nodes < 5
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "ES集群运行的节点<5个(total 7) in cluster:[ {{ $labels.cluster }} ]\n当前运行节点个数:{{ $value }}"

      # rate() is taken over 5m, so the text says 5m as well.
      - alert: ES-GC平均执行次数过多
        expr: rate(elasticsearch_jvm_gc_collection_seconds_count{}[5m])>5
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "JVM GC 5m内平均执行次数>5/s in cluster:[ {{ $labels.cluster }} ]\n当前:{{ $value }}/s"

      # Seconds spent in GC per second of wall time. The previous expression
      # measured node_exporter filesystem usage, which is unrelated to GC.
      - alert: ES-GC平均运行时间过长
        expr: rate(elasticsearch_jvm_gc_collection_seconds_sum{}[5m])>0.3
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "ES 5m 内平均运行时间>0.3/s in cluster:[ {{ $labels.cluster }} ]\n当前:{{ $value }}/s"

      - alert: ES-JSON解析失败
        expr: elasticsearch_cluster_health_json_parse_failures>0
        for: 5m
        labels:
          severity: warning
        annotations:
          description: "ES节点解析json失败数 > 0 in cluster:[ {{ $labels.cluster }} ]\n当前:{{ $value }}"

      - alert: ES-断路器触发
        expr: rate(elasticsearch_breakers_tripped{}[5m])>0
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "ES 断路器触发数 in cluster:[ {{ $labels.cluster }} ]> 0\n当前:{{ $value }}"

      - alert: ES-等待进程过多
        expr: elasticsearch_cluster_health_number_of_pending_tasks>10
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "ES pending_tasks in cluster:[ {{ $labels.cluster }} ] > 10\n当前:{{ $value }}"

      # number_of_nodes is a gauge, so increase() (a counter function) treats
      # every drop as a counter reset and would fire on both joins and leaves.
      # Compare the current value with the value 5 minutes ago instead; the
      # 5m look-back assumes a scrape interval of at most a few minutes.
      - alert: ES-增加集群节点
        expr: (elasticsearch_cluster_health_number_of_nodes - elasticsearch_cluster_health_number_of_nodes offset 5m) > 0
        for: 1s
        labels:
          severity: warning
        annotations:
          description: "ES-增加集群节点 in cluster:[ {{ $labels.cluster }} ]\n增加个数:{{ $value }}"

      # Mirror of the rule above: positive value = number of nodes lost.
      - alert: ES-减少集群节点
        expr: (elasticsearch_cluster_health_number_of_nodes offset 5m - elasticsearch_cluster_health_number_of_nodes) > 0
        for: 1s
        labels:
          severity: warning
        annotations:
          description: "ES-减少集群节点 in cluster:[ {{ $labels.cluster }} ]\n减少个数:{{ $value }}"

    【6】【最佳实践】grafana模板

    模版:链接:https://pan.baidu.com/s/1mAtVhko18gD4LxdSkuCGEg  密码:3mtd

    【参考文档】

    prometheus 监控 ES:https://blog.csdn.net/aa1215018028/article/details/87003907?utm_medium=distribute.pc_relevant.none-task-blog-2~default~BlogCommendFromMachineLearnPai2~default-1.control&depth_1-utm_source=distribute.pc_relevant.none-task-blog-2~default~BlogCommendFromMachineLearnPai2~default-1.control

    基本安装:转自:https://www.cnblogs.com/fat-girl-spring/p/13143603.html

  • 相关阅读:
    c++常用库
    boost
    android
    UITableView 多选
    c++ 比较两个集合
    事件加不上的另一种原因
    ios多线程
    ubuntu android
    jna StdCallCallback 回调问题查证
    java
  • 原文地址:https://www.cnblogs.com/gered/p/14769403.html
Copyright © 2020-2023  润新知