• ceph-状态监测-脚本


    http://www.tang-lei.com/2018/06/05/ceph-%E7%8A%B6%E6%80%81%E7%9B%91%E6%B5%8B-%E8%84%9A%E6%9C%AC/

    为了能够实时获取ceph集群状态,以便在发生故障时可以及时知晓,及时处理,这里结合公司使用的owl监控编写了python脚本。如果使用其他监控系统或者crontab,可以酌情修改返回格式。

    cat ceph_status.py

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    117
    118
    119
    120
    121
    122
    123
    124
    125
    126
    127
    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    import os
    import re
    import subprocess
    import sys
    try:
    import simplejson as json
    except:
    import json
     
    ## Get the cluster health: HEALTH_ERR, HEALTH_WARN or HEALTH_OK
    def get_ceph_status():
        """Run ``ceph -s -f json`` and report the cluster health as a code.

        Returns:
            dict: ``{'HEALTH': mark}`` where ``mark`` is 1 for HEALTH_ERR,
            2 for HEALTH_WARN and 0 for HEALTH_OK. Any other status value
            also maps to 0. The same dict is printed to stdout.
        """
        p = subprocess.Popen("ceph -s -f json", shell=True, stdout=subprocess.PIPE)
        # communicate() drains the pipe and waits for the child, so no
        # zombie process is left behind.
        output = p.communicate()[0]
        health = json.loads(output).get('health')
        # Ceph Luminous and later report the real state under 'status';
        # 'overall_status' is the older key and is kept only as a fallback.
        status = health.get('status') or health.get('overall_status')
        marks = {'HEALTH_ERR': 1, 'HEALTH_WARN': 2, 'HEALTH_OK': 0}
        # 0 means "normal"; it is also the default for unrecognized values.
        mark = marks.get(status, 0)
        result = {'HEALTH': mark}
        print(result)
        return result
     
    ## Check per-OSD utilization
    def get_osd_usage():
        """Collect columns 1 and 7 of ``ceph osd df`` for every OSD row.

        The parsing, printing and JSON encoding are delegated to
        ``_change_data_format`` instead of duplicating that loop here.

        Returns:
            str: JSON object (``indent=4``) mapping the first column of each
            numeric row to its seventh column.
        """
        # NOTE(review): column 7 is presumably the utilization percentage;
        # the column layout of `ceph osd df` may differ between releases -
        # confirm against the deployed version.
        p = subprocess.Popen("ceph osd df | awk '{print $1,$7}'", shell=True, stdout=subprocess.PIPE)
        return _change_data_format(p)
     
    ## Get OSD status: 0 means no OSD is down/out, 1 means at least one is
    def get_osd_status():
        """Compare the OSD counters reported by ``ceph osd stat -f json``.

        Returns:
            int: 0 when ``num_osds``, ``num_up_osds`` and ``num_in_osds`` are
            all present and equal; 1 otherwise. The value is also printed.
        """
        p = subprocess.Popen("ceph osd stat -f json", shell=True, stdout=subprocess.PIPE)
        data = json.loads(p.communicate()[0])
        # NOTE(review): some Ceph releases appear to nest these counters
        # under an "osdmap" object - confirm against the deployed version.
        nested = data.get('osdmap')
        if isinstance(nested, dict):
            data = nested
        total = data.get('num_osds')
        up = data.get('num_up_osds')
        osds_in = data.get('num_in_osds')
        # Missing counters must not compare equal as None == None == None,
        # which would report a healthy cluster without any evidence.
        healthy = total is not None and total == up == osds_in
        code = 0 if healthy else 1
        print(code)
        return code
     
     
    ## Get PG status: 0 means active+clean, 1 means there is a problem
    def get_pg_status():
        """Inspect the PG state summary from ``ceph pg stat -f json``.

        Returns:
            int: 0 when every listed state is ``active+clean`` (or no state
            is listed); 1 when more than one state is listed, when any state
            has a different name, or when the state list is missing. The
            value is also printed.
        """
        p = subprocess.Popen("ceph pg stat -f json", shell=True, stdout=subprocess.PIPE)
        data = json.loads(p.communicate()[0])
        # NOTE(review): some Ceph releases appear to wrap the summary in a
        # "pg_summary" object - confirm against the deployed version.
        summary = data.get("pg_summary")
        if isinstance(summary, dict):
            data = summary
        states = data.get("num_pg_by_state")
        if states is None:
            # The state cannot be verified, so report a problem rather than
            # crashing on len(None).
            code = 1
        elif len(states) > 1:
            code = 1
        elif any(state.get("name") != "active+clean" for state in states):
            # A single state that is not active+clean (e.g. all PGs
            # peering) is a problem too.
            code = 1
        else:
            code = 0
        print(code)
        return code
     
     
    ## Get per-OSD latency information
    def get_osd_latency():
        """Feed columns 1 and 3 of ``ceph osd perf`` to the shared formatter.

        Returns:
            Whatever ``_change_data_format`` returns for the spawned process.
        """
        command = "ceph osd perf |awk '{print $1,$3}'"
        perf_proc = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        formatted = _change_data_format(perf_proc)
        return formatted
     
     
    ## Convert "<id> <value>" command output into a JSON object
    def _change_data_format(p):
        """Turn whitespace-separated ``id value`` lines into a JSON string.

        Args:
            p: object whose ``stdout.readlines()`` yields the output lines
               (text or bytes), e.g. a ``subprocess.Popen`` with a stdout pipe.

        Returns:
            str: JSON object (``indent=4``) mapping the first field of every
            row whose first field parses as an integer to that row's second
            field. The string is also printed to stdout.
        """
        dicts = {}
        for line in p.stdout.readlines():
            # Pipes opened in binary mode yield bytes on Python 3.
            if isinstance(line, bytes):
                line = line.decode("utf-8", "replace")
            fields = line.split()
            try:
                int(fields[0])
            except (IndexError, ValueError):
                # Header, summary and blank rows end up here. Report them on
                # stderr so stdout carries nothing but the JSON document.
                sys.stderr.write("Not number\n")
                continue
            if len(fields) < 2:
                # Numeric id without a value column: nothing to record.
                continue
            dicts[fields[0]] = fields[1]
        j_data = json.dumps(dicts, indent=4)
        print(j_data)
        return j_data
     
    ## Monitor status
    def get_mon_status():
        """Run ``ceph mon_status`` and return its output parsed as JSON.

        Returns:
            The decoded JSON value; it is also printed to stdout.
        """
        p = subprocess.Popen("ceph mon_status", shell=True, stdout=subprocess.PIPE)
        # communicate() drains the pipe and waits for the child to exit.
        usage = p.communicate()[0]
        json_usage = json.loads(usage)
        # print() with one argument behaves the same on Python 2 and 3.
        print(json_usage)
        return json_usage
     
     
    ## Disk usage of the whole Ceph cluster
    def get_ceph_disk_usage():
        """Run ``ceph df -f json`` and return its output parsed as JSON.

        Returns:
            The decoded JSON value; it is also printed to stdout.
        """
        p = subprocess.Popen("ceph df -f json", shell=True, stdout=subprocess.PIPE)
        # communicate() drains the pipe and waits for the child to exit.
        usage = p.communicate()[0]
        json_usage = json.loads(usage)
        # print() with one argument behaves the same on Python 2 and 3.
        print(json_usage)
        return json_usage
     
    if __name__ == '__main__':
        # Map the command-line selector to the check it runs.
        checks = {
            '1': get_ceph_status,
            '2': get_osd_usage,
            '3': get_osd_status,
            '4': get_osd_latency,
            '5': get_pg_status,
            '6': get_mon_status,
            '7': get_ceph_disk_usage,
        }
        # A missing selector used to raise IndexError and an unknown one was
        # silently ignored; both now produce a usage message instead.
        inputs = sys.argv[1] if len(sys.argv) > 1 else None
        check = checks.get(inputs)
        if check is None:
            sys.stderr.write(
                "usage: %s <1-7>\n"
                "  1 cluster health   2 osd usage   3 osd status   4 osd latency\n"
                "  5 pg status        6 mon status  7 cluster disk usage\n"
                % sys.argv[0]
            )
            sys.exit(2)
        check()
  • 相关阅读:
    【CF103D】Time to Raid Cowavans-分块+离线处理
    【BZOJ3992】序列统计(SDOI2015)-NTT+循环卷积+快速幂
    【BZOJ3527】力(ZJOI2014)-FFT
    【HDU4609】3-idiots-FFT+生成函数
    【LuoguP3803】多项式乘法-FFT/NTT模板题(附带FFT/NTT简单介绍)
    网络流24题解题总结(更新中)
    【BZOJ3531】旅行(SDOI2014)-树链剖分+动态加点线段树
    [Noip2012]借教室
    bzoj3394:[Usaco2009 Jan]Best Spot 最佳牧场
    [NOIP2014]无线网站发射器选址
  • 原文地址:https://www.cnblogs.com/wangmo/p/11573611.html
Copyright © 2020-2023  润新知