Background: our company runs a Mycat cluster in an on-premises ETC server room that backs the order system. We now need a heterogeneous data pipeline that replicates the MySQL data in (near) real time into another database, which will serve read traffic and data archiving.
Technology choices: binlog parsing tool: Alibaba's open-source canal; message middleware: Kafka; streaming framework: Spark Streaming.
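Before the client code below can subscribe to anything, canal itself needs an instance configuration pointing at the MySQL source (and the source must run with binlog-format=ROW). A minimal sketch of conf/example/instance.properties, with placeholder host, credentials and filter regex:

```properties
# MySQL master that canal connects to as if it were a replica (placeholder address)
canal.instance.master.address=127.0.0.1:3306
# replication account created for canal (placeholder credentials)
canal.instance.dbUsername=canal
canal.instance.dbPassword=canal
canal.instance.connectionCharset=UTF-8
# only parse the table we want to sync, e.g. orderdb.t_order
canal.instance.filter.regex=databaseName\\.tableName
```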
On to the code.
Canal parses the MySQL binlog and writes it to Kafka in real time:
```java
package kafka;

import com.alibaba.fastjson.JSONObject;
import com.alibaba.otter.canal.client.CanalConnector;
import com.alibaba.otter.canal.protocol.CanalEntry.*;
import com.alibaba.otter.canal.protocol.Message;
import com.google.protobuf.InvalidProtocolBufferException;
import org.apache.commons.lang.SystemUtils;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.MDC;
import org.springframework.util.Assert;
import org.springframework.util.CollectionUtils;

import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.concurrent.ExecutionException;

public class AbstractCanalClient {

    protected final static Logger logger = LoggerFactory.getLogger(AbstractCanalClient.class);
    protected static final String SEP = SystemUtils.LINE_SEPARATOR;
    protected static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
    protected volatile boolean running = false;
    protected Thread.UncaughtExceptionHandler handler = new Thread.UncaughtExceptionHandler() {
        public void uncaughtException(Thread t, Throwable e) {
            logger.error("parse events has an error", e);
        }
    };
    protected Thread thread = null;
    protected CanalConnector connector;
    protected static String context_format = null;
    protected static String row_format = null;
    protected static String transaction_format = null;
    protected String destination;
    protected Producer<String, String> kafkaProducer = null;
    protected String topic;
    protected String table;

    static {
        context_format = SEP + "****************************************************" + SEP;
        context_format += "* Batch Id: [{}] ,count : [{}] , memsize : [{}] , Time : {}" + SEP;
        context_format += "* Start : [{}] " + SEP;
        context_format += "* End : [{}] " + SEP;
        context_format += "****************************************************" + SEP;

        row_format = SEP
                + "----------------> binlog[{}:{}] , name[{},{}] , eventType : {} , executeTime : {} , delay : {}ms"
                + SEP;

        transaction_format = SEP
                + "================> binlog[{}:{}] , executeTime : {} , delay : {}ms"
                + SEP;
    }

    public AbstractCanalClient(String destination) {
        this(destination, null);
    }

    public AbstractCanalClient(String destination, CanalConnector connector) {
        this(destination, connector, null);
    }

    public AbstractCanalClient(String destination, CanalConnector connector, Producer<String, String> kafkaProducer) {
        this.connector = connector;
        this.destination = destination;
        this.kafkaProducer = kafkaProducer;
    }

    protected void start() {
        Assert.notNull(connector, "connector is null");
        Assert.notNull(kafkaProducer, "Kafka producer configuration is null");
        Assert.notNull(topic, "kafka topic is null");
        Assert.notNull(table, "table is null");
        thread = new Thread(new Runnable() {
            public void run() {
                process();
            }
        });
        thread.setUncaughtExceptionHandler(handler);
        thread.start();
        running = true;
    }

    protected void stop() {
        if (!running) {
            return;
        }
        running = false;
        if (thread != null) {
            try {
                thread.join();
            } catch (InterruptedException e) {
                // ignore
            }
        }
        kafkaProducer.close();
        MDC.remove("destination");
    }

    protected void process() {
        int batchSize = 1024;
        while (running) {
            try {
                MDC.put("destination", destination);
                connector.connect();
                // subscribe to the configured table filter, e.g. "databaseName\\.tableName"
                connector.subscribe(table);
                while (running) {
                    Message message = connector.getWithoutAck(batchSize); // fetch up to batchSize entries
                    long batchId = message.getId();
                    try {
                        int size = message.getEntries().size();
                        if (batchId == -1 || size == 0) {
                            try {
                                Thread.sleep(100);
                            } catch (InterruptedException e) {
                            }
                        } else {
                            kafkaEntry(message.getEntries());
                        }
                        connector.ack(batchId); // acknowledge the batch
                    } catch (Exception e) {
                        connector.rollback(batchId); // processing failed, roll back the batch
                    }
                }
            } catch (Exception e) {
                logger.error("process error!", e);
            } finally {
                connector.disconnect();
                MDC.remove("destination");
            }
        }
    }

    private void printSummary(Message message, long batchId, int size) {
        long memsize = 0;
        for (Entry entry : message.getEntries()) {
            memsize += entry.getHeader().getEventLength();
        }
        String startPosition = null;
        String endPosition = null;
        if (!CollectionUtils.isEmpty(message.getEntries())) {
            startPosition = buildPositionForDump(message.getEntries().get(0));
            endPosition = buildPositionForDump(message.getEntries().get(message.getEntries().size() - 1));
        }
        SimpleDateFormat format = new SimpleDateFormat(DATE_FORMAT);
        logger.info(context_format,
                new Object[]{batchId, size, memsize, format.format(new Date()), startPosition, endPosition});
    }

    protected String buildPositionForDump(Entry entry) {
        long time = entry.getHeader().getExecuteTime();
        Date date = new Date(time);
        SimpleDateFormat format = new SimpleDateFormat(DATE_FORMAT);
        return entry.getHeader().getLogfileName() + ":" + entry.getHeader().getLogfileOffset() + ":"
                + entry.getHeader().getExecuteTime() + "(" + format.format(date) + ")";
    }

    private void kafkaEntry(List<Entry> entrys) throws InterruptedException, ExecutionException {
        for (Entry entry : entrys) {
            if (entry.getEntryType() == EntryType.TRANSACTIONBEGIN
                    || entry.getEntryType() == EntryType.TRANSACTIONEND) {
                continue;
            }
            RowChange rowChage = null;
            try {
                rowChage = RowChange.parseFrom(entry.getStoreValue());
            } catch (Exception e) {
                throw new RuntimeException(
                        "ERROR ## parser of eromanga-event has an error , data:" + entry.toString(), e);
            }
            String logfileName = entry.getHeader().getLogfileName();
            Long logfileOffset = entry.getHeader().getLogfileOffset();
            String dbName = entry.getHeader().getSchemaName();
            String tableName = entry.getHeader().getTableName();
            EventType eventType = rowChage.getEventType();
            if (eventType == EventType.DELETE || eventType == EventType.UPDATE || eventType == EventType.INSERT) {
                for (RowData rowData : rowChage.getRowDatasList()) {
                    String tmpstr = "";
                    if (eventType == EventType.DELETE) {
                        tmpstr = getDeleteJson(rowData.getBeforeColumnsList());
                    } else if (eventType == EventType.INSERT) {
                        tmpstr = getInsertJson(rowData.getAfterColumnsList());
                    } else if (eventType == EventType.UPDATE) {
                        tmpstr = getUpdateJson(rowData.getBeforeColumnsList(), rowData.getAfterColumnsList());
                    } else {
                        continue;
                    }
                    logger.info(this.topic + tmpstr);
                    // synchronous send: wait for the broker ack before moving on
                    kafkaProducer.send(new ProducerRecord<String, String>(this.topic, tmpstr)).get();
                }
            }
        }
    }

    private JSONObject columnToJson(List<Column> columns) {
        JSONObject json = new JSONObject();
        for (Column column : columns) {
            json.put(column.getName(), column.getValue());
        }
        return json;
    }

    private String getInsertJson(List<Column> columns) {
        JSONObject json = new JSONObject();
        json.put("type", "insert");
        json.put("data", this.columnToJson(columns));
        return json.toJSONString();
    }

    private String getUpdateJson(List<Column> befcolumns, List<Column> columns) {
        // note: only the after-image is emitted; befcolumns holds the before-image if it is ever needed
        JSONObject json = new JSONObject();
        json.put("type", "update");
        json.put("data", this.columnToJson(columns));
        return json.toJSONString();
    }

    private String getDeleteJson(List<Column> columns) {
        JSONObject json = new JSONObject();
        json.put("type", "delete");
        json.put("data", this.columnToJson(columns));
        return json.toJSONString();
    }

    protected void printEntry(List<Entry> entrys) {
        for (Entry entry : entrys) {
            long executeTime = entry.getHeader().getExecuteTime();
            long delayTime = new Date().getTime() - executeTime;

            if (entry.getEntryType() == EntryType.TRANSACTIONBEGIN
                    || entry.getEntryType() == EntryType.TRANSACTIONEND) {
                if (entry.getEntryType() == EntryType.TRANSACTIONBEGIN) {
                    TransactionBegin begin = null;
                    try {
                        begin = TransactionBegin.parseFrom(entry.getStoreValue());
                    } catch (InvalidProtocolBufferException e) {
                        throw new RuntimeException("parse event has an error , data:" + entry.toString(), e);
                    }
                    // log the transaction header: binlog position, execute time and the executing thread id
                    logger.info(transaction_format,
                            new Object[]{entry.getHeader().getLogfileName(),
                                    String.valueOf(entry.getHeader().getLogfileOffset()),
                                    String.valueOf(entry.getHeader().getExecuteTime()),
                                    String.valueOf(delayTime)});
                    logger.info(" BEGIN ----> Thread id: {}", begin.getThreadId());
                } else if (entry.getEntryType() == EntryType.TRANSACTIONEND) {
                    TransactionEnd end = null;
                    try {
                        end = TransactionEnd.parseFrom(entry.getStoreValue());
                    } catch (InvalidProtocolBufferException e) {
                        throw new RuntimeException("parse event has an error , data:" + entry.toString(), e);
                    }
                    // log the transaction commit info and the transaction id
                    logger.info("---------------- ");
                    logger.info(" END ----> transaction id: {}", end.getTransactionId());
                    logger.info(transaction_format,
                            new Object[]{entry.getHeader().getLogfileName(),
                                    String.valueOf(entry.getHeader().getLogfileOffset()),
                                    String.valueOf(entry.getHeader().getExecuteTime()),
                                    String.valueOf(delayTime)});
                }
                continue;
            }

            if (entry.getEntryType() == EntryType.ROWDATA) {
                RowChange rowChage = null;
                try {
                    rowChage = RowChange.parseFrom(entry.getStoreValue());
                } catch (Exception e) {
                    throw new RuntimeException("parse event has an error , data:" + entry.toString(), e);
                }
                EventType eventType = rowChage.getEventType();
                logger.info(row_format,
                        new Object[]{entry.getHeader().getLogfileName(),
                                String.valueOf(entry.getHeader().getLogfileOffset()),
                                entry.getHeader().getSchemaName(),
                                entry.getHeader().getTableName(),
                                eventType,
                                String.valueOf(entry.getHeader().getExecuteTime()),
                                String.valueOf(delayTime)});
                if (eventType == EventType.QUERY || rowChage.getIsDdl()) {
                    logger.info(" sql ----> " + rowChage.getSql() + SEP);
                    continue;
                }
                for (RowData rowData : rowChage.getRowDatasList()) {
                    if (eventType == EventType.DELETE) {
                        printColumn(rowData.getBeforeColumnsList());
                    } else if (eventType == EventType.INSERT) {
                        printColumn(rowData.getAfterColumnsList());
                    } else {
                        printColumn(rowData.getAfterColumnsList());
                    }
                }
            }
        }
    }

    protected void printColumn(List<Column> columns) {
        for (Column column : columns) {
            StringBuilder builder = new StringBuilder();
            builder.append(column.getName() + " : " + column.getValue());
            builder.append(" type=" + column.getMysqlType());
            if (column.getUpdated()) {
                builder.append(" update=" + column.getUpdated());
            }
            builder.append(SEP);
            logger.info(builder.toString());
        }
    }

    public void setConnector(CanalConnector connector) {
        this.connector = connector;
    }

    public void setKafkaProducer(Producer<String, String> kafkaProducer) {
        this.kafkaProducer = kafkaProducer;
    }

    public void setKafkaTopic(String topic) {
        this.topic = topic;
    }

    public void setFilterTable(String table) {
        this.table = table;
    }
}
```
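For reference, a record produced by kafkaEntry() above looks roughly like the sample below (column names other than biid and chdt are placeholders; as noted in the code, updates currently carry only the after-image):

```json
{"type":"insert","data":{"biid":"1001","chdt":"2018-05-13 12:00:00","status":"NEW"}}
```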
```java
package kafka;

import com.alibaba.otter.canal.client.CanalConnector;
import com.alibaba.otter.canal.client.CanalConnectors;
import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.kafka.clients.producer.KafkaProducer;

import java.net.InetSocketAddress;
import java.util.Properties;

public class ClusterCanalClient extends AbstractCanalClient {

    public ClusterCanalClient(String destination) {
        super(destination);
    }

    public static void main(String args[]) {
        String destination = null; // "example"
        String topic = null;
        // String canalhazk = null;
        String kafka = null;
        String hostname = null;
        String table = null;
        if (args.length != 5) {
            logger.error("input param must : hostname destination topic kafka table, "
                    + "for example: localhost example topic 192.168.0.163:9092 tablename");
            System.exit(1);
        } else {
            hostname = args[0];
            destination = args[1];
            topic = args[2];
            // canalhazk = args[2];
            kafka = args[3];
            table = args[4];
        }

        // Connect to a single canal server. To discover canal servers dynamically via ZooKeeper
        // (so the client can fail over when one server crashes), use newClusterConnector instead.
        CanalConnector connector = CanalConnectors.newSingleConnector(
                new InetSocketAddress(hostname, 11111), destination, "canal", "canal");
        // CanalConnector connector = CanalConnectors.newClusterConnector(
        //         canalhazk, destination, "userName", "passwd");

        Properties props = new Properties();
        props.put("bootstrap.servers", kafka);
        // props.put("request.required.acks", 1); // old-producer setting, superseded by "acks" below
        props.put("acks", "all");
        props.put("retries", 0);
        props.put("batch.size", 16384);
        props.put("linger.ms", 1);
        props.put("buffer.memory", 33554432); // 32 MB
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        KafkaProducer<String, String> producer = new KafkaProducer<>(props);

        final ClusterCanalClient clientTest = new ClusterCanalClient(destination);
        clientTest.setConnector(connector);
        clientTest.setKafkaProducer(producer);
        clientTest.setKafkaTopic(topic);
        clientTest.setFilterTable(table);
        clientTest.start();

        Runtime.getRuntime().addShutdownHook(new Thread() {
            public void run() {
                try {
                    logger.info("## stop the canal client");
                    clientTest.stop();
                } catch (Throwable e) {
                    logger.warn("## something goes wrong when stopping canal: {}",
                            ExceptionUtils.getFullStackTrace(e));
                } finally {
                    logger.info("## canal client is down.");
                }
            }
        });
    }
}
```

Spark Streaming writes the Kafka data into HBase:

```scala
package bcw.etl.syncdata

import java.util

import com.alibaba.fastjson.{JSON, JSONObject}
import example.utils.KafkaOffset_ZKManager
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put, Table}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.log4j.Logger
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.HasOffsetRanges
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

/**
  * @author zhuqitian
  * 2018-5-13
  * Kafka to HBase
  * create 'wms_schedule_main','info'
  * ./kafka-topics.sh --create --topic wms_schedule_main --zookeeper ip:2181/kafka0.9 --partitions 3 --replication-factor 1
  *
  * note:
  */
object Kafka_to_HBase extends App {

  var logger: Logger = Logger.getLogger(Kafka_to_HBase.getClass)

  val conf = new SparkConf()
    .setAppName("Kafka_to_HBASE")
    .setMaster("local")
    .set("spark.streaming.kafka.maxRatePerPartition", "100000")

  val kafkaParams = Map[String, String](
    "metadata.broker.list" -> "ip:9092",
    "auto.offset.reset" -> "smallest"
  )
  val topicSet = "wms_schedule_main".split(",").toSet
  val groupName = "wms_mysql_test"

  val sc = new SparkContext(conf)
  val ssc = new StreamingContext(sc, Seconds(2))

  val config = HBaseConfiguration.create
  config.set("hbase.zookeeper.quorum", "ip")
  config.set("hbase.zookeeper.property.clientPort", "2181")
  val conn = ConnectionFactory.createConnection(config)
  val table: Table = conn.getTable(TableName.valueOf("wms_schedule_main"))

  // Note: this list lives on the driver; mutating it from inside rdd.map only works in local mode.
  var puts = new util.ArrayList[Put]

  val DStream: InputDStream[(String, String)] = KafkaOffset_ZKManager.createMyDirectKafkaStream(
    ssc, kafkaParams, topicSet, groupName)

  DStream.foreachRDD((rdd, btime) => {
    if (!rdd.isEmpty()) {
      val startTime = System.currentTimeMillis
      println(s">>>>>>>>>>>>>>>>>>>>>>start : $startTime")
      val message: RDD[JSONObject] = rdd.map(line => JSON.parseObject(line._2))
        .map(json => json.getJSONObject("data"))
        .filter(x => x.getString("biid") != null)
      val recordCount = message.map(jsondata => {
        val rowKey = jsondata.getString("biid").reverse + jsondata.getString("chdt")
        val put = new Put(rowKey.getBytes) // md5
        val columns = jsondata.keySet().toArray()
        for (key <- columns) {
          put.addColumn(Bytes.toBytes("info"), Bytes.toBytes(key.toString),
            Bytes.toBytes(jsondata.getString(key.toString)))
        }
        puts.add(put)
      }).count()
      println(" puts size : " + puts.size())
      table.put(puts)
      val endTime = System.currentTimeMillis
      println(">>>>>>>>>>>>>>>>>>>>>>this batch took " + (endTime - startTime)
        + " milliseconds. data size is " + recordCount)
      println("##################################" + btime)
      puts.clear()
      println("puts clear after size : " + puts.size())
    }
    KafkaOffset_ZKManager.storeOffsets(rdd.asInstanceOf[HasOffsetRanges].offsetRanges, groupName)
  })

  ssc.start()
  ssc.awaitTermination()
}
```
Summary: 1. Parse the MySQL binlog, wrap each row change as JSON, and write it to Kafka in real time. 2. Spark Streaming consumes the Kafka data and parses the JSON. 3. Iterate over the JSON keys as column names and write each key/value into HBase (pay attention to the HBase rowkey design).
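On the rowkey point: the job above reverses biid and appends chdt, and the // md5 comment hints at the intended next step. A hedged sketch (the helper name is mine) of an MD5-salted rowkey that spreads monotonically increasing order ids across regions while staying deterministic, so a replayed message simply overwrites the same row:

```scala
import java.security.MessageDigest

// Deterministic, salted rowkey: a short MD5 prefix of the business id spreads
// writes across regions; the raw id plus change time keeps the key unique.
def buildRowKey(biid: String, chdt: String): String = {
  val md5 = MessageDigest.getInstance("MD5")
    .digest(biid.getBytes("UTF-8"))
    .map("%02x".format(_))
    .mkString
  md5.substring(0, 4) + "_" + biid + "_" + chdt
}
```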
Further questions: how to achieve exactly-once consumption semantics? Idempotent writes? SQL on HBase? How to deal with the request latency caused by HBase region splits, which are heavyweight operations?
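A few notes on those questions, as I currently understand them: with a deterministic rowkey the HBase Put is idempotent, so committing offsets only after table.put() turns at-least-once delivery into effectively exactly-once; Apache Phoenix is the usual option for SQL on HBase; and split pauses can be largely avoided by pre-splitting the table when it is created. A hedged sketch of pre-splitting via the HBase Admin API, with split points that assume the 4-character hex salt prefix sketched above:

```scala
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.util.Bytes

// Create the regions up front along the hex salt prefix, so they are not
// split (and briefly taken offline) later under write load.
val config = HBaseConfiguration.create
config.set("hbase.zookeeper.quorum", "ip")
val admin = ConnectionFactory.createConnection(config).getAdmin
val desc = new HTableDescriptor(TableName.valueOf("wms_schedule_main"))
desc.addFamily(new HColumnDescriptor("info"))
val splitKeys = Array("2000", "4000", "6000", "8000", "a000", "c000", "e000").map(s => Bytes.toBytes(s))
admin.createTable(desc, splitKeys)
admin.close()
```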
Reposted from: https://www.aboutyun.com/forum.php?mod=viewthread&tid=24702