• Reading data from Hive and pushing it to Kafka


    Implemented in Python 2.7; the libraries it depends on are fairly old as well.

    # -*- coding: utf-8 -*-
    # Version: 1.0.0
    # Description: py_Hive2Kafka
    # Author: wqbin
    # Create_date:20191026
    
    import re
    import sys
    import os
    import logging
    import string
    import datetime
    import time
    import random
    import subprocess as sp
    
    from logging import handlers
    from time import strftime, localtime
    from pyhs2.haconnection import HAConnection
    from kafka import KafkaProducer
    
    
    ################################ Environment variables ############################################
    #1. Set the encoding
    os.environ['NLS_LANG'] = 'SIMPLIFIED CHINESE_CHINA.UTF8'
    
    #2. Load the FI client environment variables
    ENV_FILE='/home/root/ficlient/bigdata_env'
    
    # Load ENV_FILE, equivalent to: source /home/root/ficlient/bigdata_env
    proc = sp.Popen(['bash','-c','source {0} && env'.format(ENV_FILE)], stdout=sp.PIPE)
    for tup in map(lambda s: s.strip().split('=',1), proc.stdout):
        if len(tup) != 2:
            # skip env output lines that are not simple KEY=VALUE pairs
            continue
        k = tup[0].strip()
        v = tup[1].strip()
        os.environ[k] = v
    
    
    #3. Kerberos authentication
    KERBEROS_USER = "rootuser"
    KERBEROS_KEYTAB = "/home/root/rootuser.keytab" 
    TGT_PATH="/home/root/tagidentity.tgt"
    os.environ['KRB5CCNAME'] = TGT_PATH
    os.system("kinit -kt %s %s" % (KERBEROS_KEYTAB,KERBEROS_USER))
    
    #4. Script, log, and config paths
    #MAIN_PATH = os.path.abspath(os.path.join(os.path.dirname("__file__"),os.path.pardir))
    MAIN_PATH = "/ETL/pybin/py_Hive2Kafka"
    
    LOG_PATH = MAIN_PATH + "/log"
    CONF_PATH = MAIN_PATH + "/conf"
    #5. Argument 1: batch date, e.g. 20180721
    batch_date = sys.argv[1]
    
    ################################ Logging ######################################################
    # Timestamp format used in log records
    ISOTIMEFORMAT = '%Y-%m-%d %H:%M:%S'
    
    # Log file path
    logfile = "%s/%s.log" % (LOG_PATH,batch_date)
    
    # Integration-layer logger
    LOGGER = logging.getLogger("data_life_manager")
    
    LOGGER_HANDLER = logging.handlers.RotatingFileHandler(logfile, maxBytes=20*1024*1024, backupCount=10)
    FORMATTER = logging.Formatter("%(asctime)s [%(levelname)s] %(message)s", ISOTIMEFORMAT)
    LOGGER_HANDLER.setFormatter(FORMATTER)
    
    LOGGER.setLevel(logging.INFO)
    LOGGER.addHandler(LOGGER_HANDLER)
    
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    console.setFormatter(FORMATTER)
    LOGGER.addHandler(console)
    logger = LOGGER
    logger.info(MAIN_PATH)
    
    
    ################################### Read settings from the config file ###################################
    
    def get_conf(conf_file):
        """
        Get conf from a file having attribute which relatives by equal sign.
        Then, it will create and return  a dic with conf.
        """
        conf = {}
    
        def add_conf(key, value):
            conf[key] = value
    
        map(lambda _: add_conf(_[0:_.index('=')], _[_.index('=') + 1:]),
            map(lambda _: _.replace('"', '').replace('\n', ''),
                # keep only the valid config lines (contain '=' and are not comments)
                filter(lambda _: "=" in _ and not _.startswith('#'),
                       open(conf_file).readlines()
                       )
                )
            )
        return conf
    
    
    db_config = get_conf(MAIN_PATH + '/conf/database.conf')
    
    
    # Hive connection settings
    HIVE_HOSTS = db_config.get('HIVE_HOSTS').split(',')
    HIVE_PORT = int(db_config.get('HIVE_PORT'))
    queue_name = db_config.get('QUEUE_NAME')
    
    
    
    ################################### Connect to Hive and run the SQL ###################################
    # SQL that queries the aggregated results
    sql=''
    if batch_date[6:8]=='03':
        print 'batch_date[6:8]:%s'%batch_date[6:8]
        sql = "select column1,column2,column3,column4 from table1 where batch_date=%s ;" % (batch_date)
    else:
        print 'batch_date[6:8]:%s'%batch_date[6:8]
        sql = "select column1,column2 from table1 where batch_date=%s ;" % (batch_date)
    database = "dt"
    templatecode = "001"
    transcode = "002"
    orsenderid = "003"
    orsenderchannel = "004"
    
    def select_hive(queue_name, database, sql, logger):
        v_queue_name = "set mapred.job.queue.name=%s" % queue_name
        v_database = "use %s" % database
        sql = sql.encode('UTF-8')
        v_sql = re.sub(r';$', '', sql)
        timeout_ms = 3 * 60 * 60 * 1000  # 3 hours, in milliseconds
        conf = {"krb_host": "hadoop001", "krb_service": "hive"}
        print v_queue_name
        print v_database
        print v_sql
        try:
            with HAConnection(hosts=HIVE_HOSTS,
                              port=HIVE_PORT,
                              timeout=timeout_ms,
                              authMechanism="KERBEROS",
                              user='rootuser',
                              configuration=conf) as haConn:
                with haConn.getConnection() as conn:
                    with conn.cursor() as cur:
                        print v_queue_name
                        logger.info(v_queue_name)
                        cur.execute(v_queue_name)
                        print v_database
                        logger.info(v_database)
                        cur.execute(v_database)
                        print v_sql
                        logger.info(v_sql)
                        cur.execute(v_sql)
                        tuple_dic = cur.fetchall()
                        if len(tuple_dic) == 0:
                            tuple_dic = None
        except Exception as e:
            logger.error(e)
            raise Exception(e)
        return tuple_dic
    
    
    
    #################################### Custom exception class ###################################
    
    class UserDefException(Exception):
        def __init__(self,msg):
            self.msg=msg
        def __str__(self):
            return self.msg
    
    
    
    
    #################################### Build the JSON string and send it to Kafka ###################################
    
    def send_json_to_Kafka(batch_date):
        data_cnt_tuple_dic = select_hive(queue_name, database, sql, logger)
        print data_cnt_tuple_dic
    
        result_list = []
        try:
            # Only the first row of the result set is used.
            for a in data_cnt_tuple_dic:
                if len(a)==2:
                    result_list.append(a[0])
                    result_list.append(a[1])
                    break
                elif len(a)==4:
                    result_list.append(a[0])
                    result_list.append(a[1])
                    result_list.append(a[2])
                    result_list.append(a[3])
                    break
                else:
                    raise UserDefException("select returned neither 2 nor 4 columns")
        except Exception as e:
            result_list = []
            logger.error(e)
        print result_list
        
        
        orSenderSN = ''.join(random.sample(string.ascii_letters + string.digits, 22))
        agentSerialNo = ''.join(random.sample(string.ascii_letters + string.digits, 8))
        verison_name = "abc"
        model_plat = "1"
        
        msg_head = '{"TemplateCode":"%s","TransCode":"%s","orSenderID":"%s","orSenderChannel":"%s","orSenderSN":"%s",' 
               '"orSenderDate":"%s","curTime":"%d","agentSerialNo":"%s"}' 
               % (templatecode, transcode, orsenderid, orsenderchannel, orSenderSN,
                  time.strftime("%Y%m%d", time.localtime()), int(round(time.time() * 1000)), agentSerialNo)
        start_time = batch_date
        end_time = batch_date
        if batch_date[6:8]=='03':
            end_time=datetime.datetime.combine(datetime.date(int(batch_date[0:4]),int(batch_date[4:6]),int(batch_date[6:8]))-datetime.timedelta(days=30),datetime.time.min).strftime("%Y%m%d")
        try:
            if len(result_list) == 4:
                msg_result = '{' \
                             '"%s":%s,' \
                             '"%s":%s,' \
                             '"%s":%s,' \
                             '"%s":%s' \
                             '}' % ("column1", result_list[0], "column2", result_list[1],
                                    "column3", result_list[2], "column4", result_list[3])
            elif len(result_list) == 2:
                msg_result = '{' \
                             '"%s":%s,' \
                             '"%s":%s' \
                             '}' % ("column1", result_list[0], "column2", result_list[1])
            else:
                raise UserDefException("select returned neither 2 nor 4 columns")
        except Exception as e:
            logger.error(e)
            raise Exception(e)
        
        msg_body = '{"verison_name":"%s","version":"","model_plat":"%s","event_start_tm":"%s","event_end_tm":"%s","result":%s}' 
               % (verison_name, model_plat, start_time, end_time, str(msg_result).replace("'", '"'))
        msg = '{"head":%s,"body":%s}' % (msg_head, msg_body)
        logger.info(msg)
    
        try:
            send_kafka(msg)
        except Exception as e:
            logger.error(e)
            raise Exception(e)
    
    
    
    
    bootstrap_servers = '192.168.164.202:9092,192.168.164.203:9092,192.168.164.204:9092'
    topic = 'topic1'
    retries = 2
    
    # Send data to Kafka
    def send_kafka(msg):
        try:
            producer = KafkaProducer(bootstrap_servers=bootstrap_servers, retries=retries)
        except Exception as e:
            logger.error(e)
            raise Exception("catch an exception when create KafkaProducer")
        try:
            producer.send(topic, msg)
            producer.flush()
            producer.close()
    
        except Exception as e:
            logger.error(e)
            if producer:
                producer.close()
            raise Exception("catch an exception when send message:%s" % msg)
    
    
    
    if __name__ == '__main__':
        send_json_to_Kafka(batch_date)
        print "data from hive to kafka has all successed"

    The conf file (conf/database.conf) is as follows:

    #Hive
    HIVE_HOSTS=192.168.154.201
    HIVE_PORT=10000
    QUEUE_NAME=NO1
    PUB_BIGDATA_ENV_FILE=/home/root/bigdata_env
    PUB_HIVE_ENV_FILE=/home/root/Hive/component_env
    PUB_COMMON_PATH=/etl/pub
    PUB_KEYTAB_PATH=/etl/pub/keytab_file
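
    For illustration, get_conf applied to the file above returns a plain dict whose values are all strings (a hypothetical interpreter session):

    >>> conf = get_conf('/ETL/pybin/py_Hive2Kafka/conf/database.conf')
    >>> conf['HIVE_HOSTS'].split(',')
    ['192.168.154.201']
    >>> conf['HIVE_PORT']
    '10000'
    >>> conf['QUEUE_NAME']
    'NO1'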
  • Original article: https://www.cnblogs.com/wqbin/p/11279292.html