之前前辈用 Java 写的 JVM 信息收集脚本不太方便组内小伙伴维护，遂用 Python 重写了。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Filename: jvm_monitor
# Description: collect jvm info
# Author: quke
# Date: 2018/8/22
"""Collect statistics for every JVM running on this host and publish them.

For each JVM listed by ``jps -mlv`` the script gathers the start time
(``ps``), GC/heap counters (``jstat -gc``) and thread counters
(``jstat -snap``), then writes a summary row to the ``jvmmonitordata`` MySQL
table and a time-series row to the ``jvm`` table of an HBase REST server.

The code avoids Python-3-only syntax so it keeps running on the Python 2
interpreters it was originally written for.
"""
import base64
import datetime
import json
import logging.handlers
import os
import random
import re
import socket
import time
from subprocess import Popen, PIPE

import MySQLdb
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

LOG_FORMAT = '%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)s - %(funcName)20s() ] %(message)s'
LOG_DATE_FORMAT = '%Y-%m-%d %H:%M:%S'

logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, datefmt=LOG_DATE_FORMAT)
# Kept as a module-level name for compatibility; it is intentionally not
# attached, because basicConfig() already installed a console handler on the
# root logger and attaching this one would print every record twice.
console_handler = logging.StreamHandler()
file_handler = logging.handlers.RotatingFileHandler('jvm_monitor.log', maxBytes=10485760, backupCount=5)
# Without a formatter the rotated file would contain bare messages only.
file_handler.setFormatter(logging.Formatter(LOG_FORMAT, LOG_DATE_FORMAT))
logger = logging.getLogger(__name__)
logger.addHandler(file_handler)

hostname = socket.gethostname()

# Directory-name fragments that mark a directory under /apps as a Java module.
_JAVA_MODULE_KEYWORDS = ('java', 'boot', 'tomcat', 'mycat')

# jps entries that are tooling rather than monitored services.
_IGNORED_MAIN_CLASSES = ('sun.tools.jps.Jps', 'com.lagou.jmonitor.AgentWatcher')

# Output key -> accepted ``jstat -gc`` header names. Java 8 replaced the
# permanent generation columns (PC/PU) with metaspace (MC/MU); both are
# reported under the historical ``pc``/``pu`` keys.
_JSTAT_COLUMNS = (
    ('s0c', ('S0C',)),
    ('s1c', ('S1C',)),
    ('s0u', ('S0U',)),
    ('s1u', ('S1U',)),
    ('ec', ('EC',)),
    ('eu', ('EU',)),
    ('oc', ('OC',)),
    ('ou', ('OU',)),
    ('pc', ('PC', 'MC')),
    ('pu', ('PU', 'MU')),
    ('jvmYgc', ('YGC',)),
    ('jvmYgct', ('YGCT',)),
    ('jvmFgc', ('FGC',)),
    ('jvmFgct', ('FGCT',)),
    ('jvmGct', ('GCT',)),
)

_HBASE_COLUMNS = ('jvmEc', 'jvmEu', 'jvmOc', 'jvmOu', 'jvmPc', 'jvmPu', 'jvmSc', 'jvmSu',
                  'totalThreadCount', 'activeThreadCount')


def _to_text(data):
    """Return subprocess output as text (bytes are decoded on Python 3)."""
    if isinstance(data, str):
        return data
    return data.decode('utf-8', 'replace')


def _b64(text):
    """Base64-encode ``text`` and return the result as ASCII text."""
    raw = text if isinstance(text, bytes) else text.encode('utf-8')
    return base64.b64encode(raw).decode('ascii')


def run_command(cmd):
    """Run ``cmd`` through the shell and return its stdout as a list of lines.

    Any output on stderr is treated as fatal: it is logged and ``SystemExit``
    is raised, which terminates the script because no caller catches it.
    """
    process = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
    stdout, stderr = process.communicate()
    stdout, stderr = _to_text(stdout), _to_text(stderr)
    if stderr:
        logger.error('Excepiton with run %s:%s' % (cmd, stderr))
        raise SystemExit
    # Callers index the result by line (ret[0] is the header, ret[1] the first
    # data row), so split on newlines rather than on spaces.
    return stdout.strip('\n').split('\n')


def requests_retry(
        retries=3,
        backoff_factor=0.3,
        status_forcelist=(500, 502, 504),
        session=None,
):
    """Return a requests session that retries failed HTTP(S) requests.

    ``session`` is reused when given, otherwise a new one is created. The
    retry policy is mounted for both ``http://`` and ``https://`` URLs.
    """
    session = session or requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session


def execute_sql(sql, host='192.168.1.1', user='user', password='password', db='db', params=None):
    """Execute one SQL statement and return all fetched rows.

    ``params`` (optional) is handed to the driver for ``%s`` placeholder
    substitution, so values never need to be interpolated into ``sql`` by
    hand. Statements containing ``insert`` or ``update`` (any case) are
    committed. The cursor and connection are closed even when the statement
    fails; driver exceptions propagate to the caller.

    NOTE(review): the connection defaults are hard-coded in source; consider
    loading real credentials from configuration instead.
    """
    connection = MySQLdb.connect(host, user, password, db)
    try:
        cursor = connection.cursor()
        try:
            cursor.execute(sql, params)
            lowered = sql.lower()
            if 'insert' in lowered or 'update' in lowered:
                connection.commit()
            return cursor.fetchall()
        finally:
            cursor.close()
    finally:
        connection.close()


def get_all_mixed_info():
    """Return ``{mixhost: {module: alias}}`` built from ``cmdb_mixed_relation``."""
    sql = 'select mixhost,module,alias from cmdb_mixed_relation'
    db_detail = execute_sql(sql, host='192.168.1.1', user='user', password='password', db='db')
    ret = {}
    for mixhost, modulename, alias in db_detail:
        ret.setdefault(mixhost, {})[modulename] = alias
    return ret


def get_java_module(args):
    """Return the first Java module directory under /apps named in ``args``.

    A directory qualifies when its name contains one of the Java module
    keywords and occurs as a substring of ``args`` (a ``jps`` output line).
    Returns ``None`` when nothing matches.
    """
    cur_dir = '/apps'
    for d in os.listdir(cur_dir):
        if not os.path.isdir(os.path.join(cur_dir, d)):
            continue
        if any(keyword in d for keyword in _JAVA_MODULE_KEYWORDS) and d in args:
            return d
    return None


def get_alias(module_name):
    """Return the alias registered for ``module_name`` on this host, or ``'null'``."""
    all_alias = get_all_mixed_info()
    return all_alias.get(hostname, {}).get(module_name) or 'null'


def _first_flag(line, flags, default):
    """Return the first of ``flags`` contained in ``line``, else ``default``."""
    for flag in flags:
        if flag in line:
            return flag
    return default


def get_gc_collector_name(line):
    """Return ``(young_gc, old_gc)`` collector names found in a JVM command line.

    Falls back to ``'ParNew'`` / ``'CMS'`` when no known collector flag is
    present in ``line``.
    """
    ygc = _first_flag(line, ['UseParNewGC', 'UseG1GC', 'UseSerialGC', 'UseParallelGC'], 'ParNew')
    ogc = _first_flag(line, ['UseConcMarkSweepGC', 'UseG1GC', 'UseParallelOldGC', 'UseSerialGC'], 'CMS')
    return ygc, ogc


def get_start_time(pid):
    """Return the start time of process ``pid`` as ``YYYY-mm-dd HH:MM:SS``."""
    ret = run_command('ps -o lstart -p %s' % pid)
    # ret[0] is the column header printed by ps; ret[1] is the timestamp.
    started = time.strptime(ret[1].strip(), '%a %b %d %H:%M:%S %Y')
    return time.strftime('%Y-%m-%d %H:%M:%S', started)


def get_jstat_info(pid):
    """Return the ``jstat -gc`` counters of ``pid`` as a dict of strings.

    Keys: s0c, s1c, s0u, s1u, ec, eu, oc, ou, pc, pu, jvmYgc, jvmYgct,
    jvmFgc, jvmFgct, jvmGct. Columns are located through the header row
    instead of by position, so layouts with additional columns are handled.

    Raises:
        ValueError: if the output has no data row or lacks a required column.
    """
    ret = run_command('jstat -gc %s' % pid)
    if len(ret) < 2:
        raise ValueError('unexpected jstat -gc output for pid %s: %r' % (pid, ret))
    columns = dict(zip(ret[0].split(), ret[1].split()))
    gc_statistics = {}
    for key, candidates in _JSTAT_COLUMNS:
        for name in candidates:
            if name in columns:
                gc_statistics[key] = columns[name]
                break
        else:
            raise ValueError('jstat -gc output for pid %s lacks column %s'
                             % (pid, '/'.join(candidates)))
    return gc_statistics


def get_thread_count(pid):
    """Return ``(active_thread_count, total_thread_count)`` as strings.

    The values are taken from the third-from-last and the last ``name=value``
    lines printed by ``jstat -snap``.
    """
    ret = run_command('jstat -snap %s' % pid)
    # NOTE(review): positional lookup; presumably these lines are the live and
    # started thread counters -- confirm against the JVM versions in use.
    active_thread_count = ret[-3].split('=')[1]
    total_thread_count = ret[-1].split('=')[1]
    return active_thread_count, total_thread_count


def get_jvm_info():
    """Return one dict of collected data per monitored JVM on this host.

    JVMs that cannot be matched to a module directory under /apps, or whose
    alias cannot be resolved, are logged and skipped.
    """
    instances = []
    for line in run_command('jps -mlv'):
        if not line or any(name in line for name in _IGNORED_MAIN_CLASSES):
            continue
        module = get_java_module(line)
        if module is None:
            # ``module in hostname`` below would raise TypeError for None.
            logger.error('can not match jvm to a module directory under /apps, continue: %s' % line)
            continue
        alias = hostname if module in hostname else get_alias(module)
        if 'null' == alias:
            logger.error('[%s] can not get mixed module alias name , continue' % module)
            continue
        ygc, ogc = get_gc_collector_name(line)
        instances_list = line.split(' ')
        pid = instances_list[0]
        start_time = get_start_time(pid)
        gc_statistics = get_jstat_info(pid)
        active_thread_count, total_thread_count = get_thread_count(pid)
        instances.append(
            dict(
                pid=pid,
                module=module,
                alias=alias,
                start_time=start_time,
                gc_statistics=gc_statistics,
                active_thread_count=active_thread_count,
                total_thread_count=total_thread_count,
                ygc=ygc,
                ogc=ogc,
                main_function=instances_list[1] if len(instances_list) > 1 else '',
                main_args=' '.join(instances_list[2:]),
            )
        )
    return instances


def push_to_oss(jvm):
    """Insert or update the ``jvmmonitordata`` row for one JVM.

    A row counts as existing when the JVM alias appears in any row selected
    by module name or by hostname; it is then updated with the current pid
    and GC counters, otherwise a new row is inserted. All values are passed
    as query parameters so quotes in JVM arguments cannot break the SQL.
    """
    modulename = jvm.get('module')
    host_alias = jvm.get('alias')
    pid = int(jvm.get('pid'))
    stats = jvm['gc_statistics']
    gclist = json.dumps(
        [dict(useTime=stats['jvmYgct'], name=jvm['ygc'], times=stats['jvmYgc']),
         dict(useTime=stats['jvmFgct'], name=jvm['ogc'], times=stats['jvmFgc'])])
    fgcygc = json.dumps(dict(jvmFgc=stats['jvmFgc'], jvmYgc=stats['jvmYgc'],
                             jvmFgct=stats['jvmFgct'], jvmYgct=stats['jvmYgct']))

    get_hostnames_sql = 'select hostname,modulename from jvmmonitordata where modulename=%s'
    ignore_hostname_ne_modulename = 'select hostname from jvmmonitordata where hostname=%s'
    logger.info('execute sql :%s params=%r', get_hostnames_sql, (modulename,))
    rows = list(execute_sql(get_hostnames_sql, params=(modulename,)))
    rows += list(execute_sql(ignore_hostname_ne_modulename, params=(host_alias,)))
    is_existing = any(host_alias in row for row in rows)

    if is_existing:
        sql = 'update jvmmonitordata set pid=%s,gclist=%s,fgcygc=%s where hostname=%s'
        params = (pid, gclist, fgcygc, host_alias)
    else:
        sql = ('insert into jvmmonitordata'
               '(hostname,modulename,mainclassname,pid,vmparam,gclist,updated,fgcygc) '
               'values (%s,%s,%s,%s,%s,%s,%s,%s)')
        params = (host_alias, modulename, jvm.get('main_function'), pid,
                  jvm.get('main_args'), gclist, jvm.get('start_time'), fgcygc)
    logger.info('execute sql :%s params=%r', sql, params)
    execute_sql(sql, params=params)


def get_hbase_svr():
    """Return the URL of an HBase REST server that answers a HEAD request.

    Up to 10 randomly chosen servers are probed. When none answers with
    HTTP 200 an error is logged and the last URL tried is still returned.
    """
    hbase_list = ["http://192.168.100.1:8080", "http://192.168.100.2:8080", "http://192.168.100.3:8080"]
    hbase_url = None
    retry = 10
    while retry > 0:
        hbase_url = random.choice(hbase_list)
        try:
            r = requests.head(hbase_url, timeout=2)
        except requests.RequestException:
            logger.info("connect" + hbase_url + "error, try another")
        else:
            if r.status_code == 200:
                break
        retry -= 1
    if retry == 0:
        logger.error("connect hbase failed with 10 times")
    return hbase_url


def build_hbase_data(jvm):
    """Return the integer metrics stored in HBase for one JVM.

    Each jstat size counter is multiplied by 1000 and truncated to ``int``;
    the two survivor spaces are summed into ``jvmSc`` / ``jvmSu``.
    """
    stats = jvm['gc_statistics']

    def scaled(*keys):
        return int(sum(float(stats[key]) for key in keys) * 1000)

    return dict(
        hostName=jvm['alias'],
        jvmEc=scaled('ec'),
        jvmEu=scaled('eu'),
        jvmOc=scaled('oc'),
        jvmOu=scaled('ou'),
        jvmPc=scaled('pc'),
        jvmPu=scaled('pu'),
        jvmSc=scaled('s0c', 's1c'),
        jvmSu=scaled('s0u', 's1u'),
        totalThreadCount=int(jvm['total_thread_count']),
        activeThreadCount=int(jvm['active_thread_count']),
    )


def jvm_hbase_constructor(jvm):
    """Build the HBase REST payload for one JVM.

    Returns ``(row_key, json_rows)``. ``row_key`` is the base64 form of
    ``<alias>:<YYYYmmddHHMM>``; ``json_rows`` holds a single row whose cells
    carry the ``jvm:<metric>`` columns, with names and values base64-encoded.
    """
    data = build_hbase_data(jvm)
    row_key = _b64(data['hostName'] + ":" + datetime.datetime.now().strftime('%Y%m%d%H%M'))
    cell = [{"column": _b64('jvm' + ":" + column), "$": _b64(str(data[column]))}
            for column in _HBASE_COLUMNS]
    json_rows = {"Row": [{'key': row_key, 'Cell': cell}]}
    return row_key, json_rows


def push_to_hbase(jvm):
    """POST the metrics of one JVM to HBase, trying up to 10 times.

    Best effort: payload construction errors and HTTP failures are logged,
    never raised.
    """
    table_name = 'jvm'
    try:
        row_key, json_rows = jvm_hbase_constructor(jvm)
    except Exception as e:
        logger.error("construct hbase data error %s" % str(e))
        return

    payload = json.dumps(json_rows)
    headers = {"Content-Type": "application/json", "Accept": "application/json"}
    for _ in range(10):
        hbase_url = get_hbase_svr()
        try:
            response = requests.post(hbase_url + '/' + table_name + '/' + row_key,
                                     data=payload, headers=headers, timeout=60)
        except requests.RequestException as e:
            logger.warning("post to hbase %s failed: %s" % (hbase_url, e))
            continue
        if response.status_code == 200:
            break
        logger.warning("post to hbase %s returned status %s" % (hbase_url, response.status_code))
    else:
        logger.error("try to save hbase failed with 10 times,exit")


def push_data(jvm_infos):
    """Publish every collected JVM record to MySQL and HBase."""
    for jvm in jvm_infos:
        push_to_oss(jvm)
        push_to_hbase(jvm)


if __name__ == '__main__':
    jvm_infos = get_jvm_info()
    push_data(jvm_infos)