wget https://github.com/oliver006/redis_exporter/releases/download/v0.15.0/redis_exporter-v0.15.0.linux-amd64.tar.gz
tar -zxvf redis_exporter-v0.15.0.linux-amd64.tar.gz -C /usr/local/
启动命令:
nohup /usr/local/redis_exporter -web.listen-address=:9121 -redis.addr 10.3.225.66:6379 -redis.password xxx > /dev/null 2>&1 &
k8s配置redis自动发现,依托于前面的node自动发现
[root@gw_qa_250_10 test]# ls
host_discovery_data  main.py
[root@gw_qa_250_10 test]# ls host_discovery_data/
redis-server.list
[root@gw_qa_250_10 test]# cat host_discovery_data/redis-server.list
10.3.225.66 redis03-066
[root@gw_qa_250_10 test]# python main.py
^CTraceback (most recent call last):
  File "main.py", line 67, in <module>
    run()
  File "main.py", line 64, in run
    time.sleep(10)
KeyboardInterrupt
[root@gw_qa_250_10 test]# cat host_discovery_data/redis-server.json
[{"labels": {"cluster": "redis-server", "alias": "redis03-066", "instance": "10.3.225.66"}, "targets": ["10.3.225.66:9121"]}][root@gw_qa_250_10 test]#
redis-server.list是配置redis服务列表,main.py将列表转化为Prometheus能认的json
main.py
# -*- coding: utf-8 -*-
"""Convert ``*.list`` host inventories into Prometheus file_sd JSON files.

Every ``<cluster>.list`` file in the data directory holds one
``<ip> <alias>`` pair per line.  On each pass the matching
``<cluster>.json`` file is rewritten with one target entry per host.
"""
import os
import logging
import json
import time

DATA_DIR = 'host_discovery_data'
EXPORTER_PORT = 9121
LIST_SUFFIX = '.list'
LOG_FORMAT = '%(asctime)s - %(levelname)s %(message)s'
LOG_DATEFMT = '%Y-%m-%d,%H:%M:%S'

# os.replace only exists on Python 3.3+; os.rename gives the same atomic
# overwrite on POSIX, which keeps the script usable on Python 2 as well.
_atomic_rename = getattr(os, 'replace', os.rename)


def log_level(level):
    """Configure the root logger.

    ``level == "DEBUG"`` enables debug output; any other value selects INFO.
    """
    if level == "DEBUG":
        logging.basicConfig(level=logging.DEBUG,
                            format=LOG_FORMAT,
                            datefmt=LOG_DATEFMT)
        # Log the requested level itself (not this function object).
        logging.info("log_level:%s", level)
        logging.info("Debug mode")
    else:
        logging.basicConfig(level=logging.INFO,
                            format=LOG_FORMAT,
                            datefmt=LOG_DATEFMT)


def _parse_targets(lines, cluster, port, source_name):
    """Return the list of target dicts described by *lines*.

    Blank lines and lines starting with ``#`` are ignored.  Lines with fewer
    than two whitespace-separated fields are skipped with a warning instead
    of aborting the whole pass.
    """
    targets = []
    for line_no, raw_line in enumerate(lines, 1):
        fields = raw_line.split()
        if not fields or fields[0].startswith('#'):
            continue
        if len(fields) < 2:
            logging.warning("%s:%d: expected '<ip> <alias>', got %r - skipped",
                            source_name, line_no, raw_line.strip())
            continue
        node_ip, node_alias = fields[0], fields[1]
        targets.append({
            'targets': ['%s:%s' % (node_ip, port)],
            'labels': {
                'cluster': cluster,
                'instance': node_ip,
                'alias': node_alias,
            },
        })
    return targets


def _write_atomically(path, text):
    """Write *text* to *path* via a temp file and rename.

    A reader of *path* therefore never observes a partially written file.
    """
    tmp_path = path + '.tmp'
    with open(tmp_path, 'w') as fw:
        fw.write(text)
    _atomic_rename(tmp_path, path)


def read_list_file(dirnames=DATA_DIR, port=EXPORTER_PORT):
    """Regenerate ``<name>.json`` for every ``<name>.list`` in *dirnames*.

    :param dirnames: directory holding the ``.list`` inventories; the JSON
        files are written next to them.
    :param port: port appended to each host address in ``targets``.
    """
    for list_name in os.listdir(dirnames):
        if not list_name.endswith(LIST_SUFFIX):
            continue
        # Strip only the trailing suffix; str.replace would also remove
        # any '.list' occurring in the middle of the name.
        cluster = list_name[:-len(LIST_SUFFIX)]

        with open(os.path.join(dirnames, list_name), 'r') as fo:
            node_dict_list = _parse_targets(fo, cluster, port, list_name)

        node_json = json.dumps(node_dict_list)
        logging.debug("node_json: %s - type %s", node_json, type(node_json))

        # Write the JSON to the file.
        _write_atomically(os.path.join(dirnames, cluster + '.json'), node_json)


def run(level="INFO", interval=10):
    """Configure logging, then regenerate the JSON files forever.

    :param level: log level name passed to :func:`log_level`.
    :param interval: seconds to sleep between passes.
    """
    # Set the log level.
    log_level(level)

    # Keep running.
    while True:
        read_list_file()
        time.sleep(interval)


if __name__ == '__main__':
    run()
监控规则redis.rules:
# Prometheus alerting rules for Redis instances scraped through redis_exporter.
groups:
  - name: Redis.rules
    rules:
      # redis_up has been 0 for 5 minutes.
      - alert: RedisDown
        expr: redis_up == 0
        for: 5m
        labels:
          severity: error
        annotations:
          summary: "Redis down (instance {{ $labels.instance }})"
          description: "Redis 挂了啊,mmp VALUE = {{ $value }} LABELS: {{ $labels }}"

      # The number of connected slaves dropped within the last minute.
      # The 1m delta is only negative while the drop is inside the window,
      # so the condition cannot stay true for 5m; fire immediately instead.
      - alert: ReplicationBroken
        expr: delta(redis_connected_slaves[1m]) < 0
        for: 0m
        labels:
          severity: error
        annotations:
          summary: "Replication broken (instance {{ $labels.instance }})"
          description: "Redis instance lost a slave VALUE = {{ $value }} LABELS: {{ $labels }}"

      # More than 1000 connected clients for 5 minutes.
      - alert: TooManyConnections
        expr: redis_connected_clients > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Too many connections (instance {{ $labels.instance }})"
          description: "Redis instance has too many connections VALUE = {{ $value }} LABELS: {{ $labels }}"

      # At least one connection was rejected within the last minute.
      # Same reasoning as ReplicationBroken: a one-off rejection leaves the
      # 1m window long before a 5m hold could elapse, so fire immediately.
      - alert: RejectedConnections
        expr: increase(redis_rejected_connections_total[1m]) > 0
        for: 0m
        labels:
          severity: error
        annotations:
          summary: "Rejected connections (instance {{ $labels.instance }})"
          description: "Some connections to Redis has been rejected VALUE = {{ $value }} LABELS: {{ $labels }}"