• Hudi: Flink consumes Kafka and writes incremental data to Hudi in real time (Java)


    0. Steps

    Overall flow: first verify that Flink SQL can consume the Kafka topic, then implement a Flink job that consumes order data from Kafka, transforms it, and writes it into a Hudi table in real time.

    I. Flink SQL Integration with Kafka

     

    1. Create a Kafka topic (one partition, one replica)

    flink-topic
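
    The topic can be created with the kafka-topics.sh CLI or programmatically. A minimal sketch using the Kafka AdminClient (the broker address localhost:9092 matches the rest of this post; the class name is only illustrative):

    import java.util.Collections;
    import java.util.Properties;

    import org.apache.kafka.clients.admin.AdminClient;
    import org.apache.kafka.clients.admin.AdminClientConfig;
    import org.apache.kafka.clients.admin.NewTopic;

    public class CreateFlinkTopic {
        public static void main(String[] args) throws Exception {
            Properties props = new Properties();
            props.put(AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
            try (AdminClient admin = AdminClient.create(props)) {
                // flink-topic: 1 partition, replication factor 1
                admin.createTopics(Collections.singletonList(new NewTopic("flink-topic", 1, (short) 1)))
                        .all().get();
            }
        }
    }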
     

    2. Download flink-sql-connector-kafka_2.12-1.13.1.jar and place it under flink/lib

     

    3. Start the SQL client, specifying the connector jar

    ./sql-client.sh embedded -j ../lib/flink-sql-connector-kafka_2.12-1.13.1.jar shell
    Set the result display mode: set execution.result-mode=tableau;
     

    4. Create a table mapped to the Kafka topic

    The data in the Kafka topic is in CSV format with three fields: user_id, item_id and behavior. When consuming from Kafka, start from the latest offset.
    CREATE TABLE test_kafka(
        `user_id` BIGINT,
        `item_id` BIGINT,
        `behavior` STRING
    )
    WITH(
        'connector' = 'kafka',
        'topic'='flink-topic',
        'properties.bootstrap.servers' = 'localhost:9092',
        'properties.group.id' = 'test-group-10001',
        'scan.startup.mode' = 'latest-offset',
        'format' = 'csv'
    );
     
    Flink SQL> select * from test_kafka;
    +----+----------------------+----------------------+--------------------------------+
    | op |              user_id |              item_id |                       behavior |
    +----+----------------------+----------------------+--------------------------------+

    5. Write data to Kafka

    kafka-console-producer.sh --broker-list localhost:9092 --topic flink-topic
    1001,90001,click
    1001,90001,browser
    1001,90001,click
    1002,90002,click
    1002,90003,click
    1003,90001,order
    1004,90001,order
     
     
    MacBook-Pro:bin FengZhen$ kafka-console-producer.sh --broker-list localhost:9092 --topic flink-topic
    >1001,90001,click
    1001,90001,browser
    1001,90001,click
    1002,90002,click
    1002,90003,click
    1003,90001,order
    1004,90001,order
    >>>>>>>
     
    The data can be queried and processed in real time:
    Flink SQL> select * from test_kafka;
    +----+----------------------+----------------------+--------------------------------+
    | op |              user_id |              item_id |                       behavior |
    +----+----------------------+----------------------+--------------------------------+
    | +I |                 1001 |                90001 |                          click |
    | +I |                 1001 |                90001 |                        browser |
    | +I |                 1001 |                90001 |                          click |
    | +I |                 1002 |                90002 |                          click |
    | +I |                 1002 |                90003 |                          click |
    | +I |                 1003 |                90001 |                          order |
    | +I |                 1004 |                90001 |                          order |

    II. Code Implementation

    package com.zhen.hudi;
    
    import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
    import org.apache.flink.table.api.EnvironmentSettings;
    import org.apache.flink.table.api.Table;
    import org.apache.flink.table.api.TableEnvironment;
    import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
    
    import static org.apache.flink.table.api.Expressions.$;
    
    /**
     * @Author FengZhen
     * @Date 3/9/22 10:17 PM
     * @Description Consume data from a Kafka topic in real time with the Flink SQL connectors, transform it, and write it to a Hudi table in real time
     */
    public class FlinkSQLHudiDemo {
        public static void main(String[] args) {
    
            // 1. Create the stream execution environment
            StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment();
            // Set parallelism to 1
            env.setParallelism(1);
            // Data is written to the Hudi table incrementally, so Flink checkpointing must be enabled
            env.enableCheckpointing(5 * 1000);
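            // Note: the Hudi Flink sink commits data when a checkpoint completes, so this 5s interval
            // also bounds how quickly newly written records become queryable in the Hudi table.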
    
    
            EnvironmentSettings settings = EnvironmentSettings
                    .newInstance()
                    .inStreamingMode() // streaming mode
                    .build();
            StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env, settings);
    
    
            // 2. Create the input table that consumes data from the Kafka topic
            tableEnv.executeSql(
                    "CREATE TABLE order_kafka_source(\n" +
                            "    `orderId` STRING,\n" +
                            "    `userId` STRING,\n" +
                            "    `orderTime` STRING,\n" +
                            "    `ip` STRING,\n" +
                            "    `orderMoney` DOUBLE,\n" +
                            "    `orderStatus` INT\n" +
                            ")\n" +
                            "WITH(\n" +
                            "    'connector' = 'kafka',\n" +
                            "    'topic'='order-topic',\n" +
                            "    'properties.bootstrap.servers' = 'localhost:9092',\n" +
                            "    'properties.group.id' = 'gid-1001',\n" +
                            "    'scan.startup.mode' = 'latest-offset',\n" +
                            "    'format' = 'json',\n" +
                            "    'json.fail-on-missing-field' = 'false',\n" +
                            "    'json.ignore-parse-errors' = 'true'\n" +
                            ")\n"
            );
    
            // 3. Transform the data (either SQL or the Table API can be used)
            Table etlTable = tableEnv
                    .from("order_kafka_source")
                    // Add column ts: the timestamp Hudi uses as the precombine field when merging records.
                    // The first 17 characters of orderId form a yyyyMMddHHmmssSSS timestamp (see the generator below).
                    .addColumns(
                            $("orderId").substring(0,17).as("ts")
                    )
                    // Add column partition_day: the Hudi partition field, e.g. orderTime 2022-03-09 22:21:13.124 -> 2022-03-09
                    .addColumns(
                            $("orderTime").substring(0, 10).as("partition_day")
                    );
    
    
            tableEnv.createTemporaryView("view_order", etlTable);
    
            // 4. Create the output table bound to the Hudi table: table name, storage path, field names, etc.
            tableEnv.executeSql(
                    "CREATE TABLE order_hudi_sink(\n" +
                            "    `orderId` STRING PRIMARY KEY NOT ENFORCED,\n" +
                            "    `userId` STRING,\n" +
                            "    `orderTime` STRING,\n" +
                            "    `ip` STRING,\n" +
                            "    `orderMoney` DOUBLE,\n" +
                            "    `orderStatus` INT,\n" +
                            "    `ts` STRING,\n" +
                            "    `partition_day` STRING\n" +
                            ")\n" +
                            "PARTITIONED BY (partition_day)\n" +
                            "WITH(\n" +
                            "    'connector' = 'hudi',\n" +
                            "    'path'='hdfs://localhost:9000/hudi-warehouse/flink_hudi_order',\n" +
                            "    'table.type' = 'MERGE_ON_READ',\n" +
                            "    'write.operation' = 'upsert',\n" +
                            "    'hoodie.datasource.write.recordkey.field' = 'orderId',\n" +
                            "    'write.precombine.field' = 'ts',\n" +
                            "    'write.tasks' = '1'\n" +
                            ")\n"
            );
    
            // 5. Write the data into the output table via a sub-query
            tableEnv.executeSql(
                    "INSERT INTO order_hudi_sink " +
                            "SELECT orderId, userId, orderTime, ip, orderMoney, orderStatus, ts, partition_day FROM view_order"
            );
    
        }
    }
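
    Running the job also requires the Hudi Flink bundle jar (matching the Hudi and Scala versions in use) on the Flink classpath, in addition to the Kafka connector from step 2. Once the generator below is producing into order-topic and a checkpoint has completed, the result can be spot-checked by reading the same Hudi path back. A minimal verification sketch (the table name order_hudi_read and this helper class are assumptions for illustration, not part of the original job):

    package com.zhen.hudi;

    import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
    import org.apache.flink.table.api.EnvironmentSettings;
    import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

    public class FlinkSQLHudiReadDemo {
        public static void main(String[] args) {
            StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment();
            env.setParallelism(1);
            StreamTableEnvironment tableEnv = StreamTableEnvironment.create(
                    env, EnvironmentSettings.newInstance().inStreamingMode().build());

            // Same schema and path as order_hudi_sink above
            tableEnv.executeSql(
                    "CREATE TABLE order_hudi_read(\n" +
                            "    `orderId` STRING PRIMARY KEY NOT ENFORCED,\n" +
                            "    `userId` STRING,\n" +
                            "    `orderTime` STRING,\n" +
                            "    `ip` STRING,\n" +
                            "    `orderMoney` DOUBLE,\n" +
                            "    `orderStatus` INT,\n" +
                            "    `ts` STRING,\n" +
                            "    `partition_day` STRING\n" +
                            ")\n" +
                            "PARTITIONED BY (partition_day)\n" +
                            "WITH(\n" +
                            "    'connector' = 'hudi',\n" +
                            "    'path' = 'hdfs://localhost:9000/hudi-warehouse/flink_hudi_order',\n" +
                            "    'table.type' = 'MERGE_ON_READ'\n" +
                            ")"
            );

            // Print whatever has been committed to the Hudi table so far
            tableEnv.executeSql("SELECT * FROM order_hudi_read").print();
        }
    }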

    Kafka data generator utility (Scala)

    package com.zhen.hudi.streaming
    
    import java.util.Properties
    
    import org.apache.commons.lang3.time.FastDateFormat
    import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
    import org.apache.kafka.common.serialization.StringSerializer
    import org.json4s.jackson.Json
    
    import scala.util.Random
    
    /**
      * Order record (case class)
      *
      * @param orderId     order ID
      * @param userId      user ID
      * @param orderTime   order date and time
      * @param ip          IP address used to place the order
      * @param orderMoney  order amount
      * @param orderStatus order status
      */
    case class OrderRecord(
      orderId: String,
      userId: String,
      orderTime: String,
      ip: String,
      orderMoney: Double,
      orderStatus: Int
    )
    
    /**
      * @Author FengZhen
      * @Date 3/3/22 9:54 PM 
      * @Description
      * Generates mock order data and sends it to a Kafka topic.
      * Each message is a String containing JSON-formatted data.
      * Conversion: OrderRecord instances are serialized to JSON strings (the json4s library can be used).
      */
    object MockOrderProducer {
    
      def main(args: Array[String]): Unit = {
    
        var producer: KafkaProducer[String, String] = null
        try {
          // 1. Kafka producer configuration
          val props = new Properties()
          props.put("bootstrap.servers", "localhost:9092")
          props.put("acks", "1")
          props.put("retries", "3")
    
    //      props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    //      props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
          props.put("key.serializer", classOf[StringSerializer].getName)
          props.put("value.serializer", classOf[StringSerializer].getName)
    
          // 2. Create the KafkaProducer with the configuration
          producer = new KafkaProducer[String, String](props)
    
          // Random number generator
          val random: Random = new Random()
          // Order status: 0 = open, 1 = cancelled, 2 = closed, 3 = completed (this array is heavily weighted toward 0)
          val allStatus = Array(0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
    
          while (true) {
            // Number of orders to generate in this iteration (nextInt(1) is always 0, so this is always 1)
            val batchNumber: Int = random.nextInt(1) + 1
            (1 to batchNumber).foreach { number =>
              val currentTime: Long = System.currentTimeMillis()
              val orderId: String = s"${getDate(currentTime)}%06d".format(number)
              val userId: String = s"${1 + random.nextInt(5)}%08d".format(random.nextInt(1000))
              val orderTime: String = getDate(currentTime, format = "yyyy-MM-dd HH:mm:ss.SSS")
              val orderMoney: String = s"${5 + random.nextInt(500)}.%02d".format(random.nextInt(100))
              val orderStatus: Int = allStatus(random.nextInt(allStatus.length))
              // 3. Build the order record
              val orderRecord: OrderRecord = OrderRecord(
                orderId, userId, orderTime, getRandomIp, orderMoney.toDouble, orderStatus
              )
              // Convert to a JSON-formatted string
              val orderJson = new Json(org.json4s.DefaultFormats).write(orderRecord)
              println(orderJson)
              // 4. Build the ProducerRecord
              val record = new ProducerRecord[String, String]("order-topic", orderId, orderJson)
              // 5. Send the record to the topic
              producer.send(record)
            }
            Thread.sleep(random.nextInt(500) + 5000)
          }
        } catch {
          case e: Exception => e.printStackTrace()
        } finally {
          if (null != producer) producer.close()
        }
      }
    
      /** Format the given timestamp as a date string */
      def getDate(time: Long, format: String = "yyyyMMddHHmmssSSS"): String = {
        val fastFormat: FastDateFormat = FastDateFormat.getInstance(format)
        val formatDate: String = fastFormat.format(time) // format the timestamp
        formatDate
      }
    
      /** Generate a random IP address */
      def getRandomIp: String = {
        // IP address ranges (encoded as Int values)
        val range: Array[(Int, Int)] = Array(
          (607649792, 608174079), //36.56.0.0-36.63.255.255
          (1038614528, 1039007743), //61.232.0.0-61.237.255.255
          (1783627776, 1784676351), //106.80.0.0-106.95.255.255
          (2035023872, 2035154943), //121.76.0.0-121.77.255.255
          (2078801920, 2079064063), //123.232.0.0-123.235.255.255
          (-1950089216, -1948778497), //139.196.0.0-139.215.255.255
          (-1425539072, -1425014785), //171.8.0.0-171.15.255.255
          (-1236271104, -1235419137), //182.80.0.0-182.92.255.255
          (-770113536, -768606209), //210.25.0.0-210.47.255.255
          (-569376768, -564133889) //222.16.0.0-222.95.255.255
        )
        // Random index into the IP ranges
        val random = new Random()
        val index = random.nextInt(10)
        val ipNumber: Int = range(index)._1 + random.nextInt(range(index)._2 - range(index)._1)
    
        // Convert the Int IP value to dotted IPv4 notation
        number2IpString(ipNumber)
      }
    
      /** Convert an Int IPv4 address to its dotted-string form */
      def number2IpString(ip: Int): String = {
        val buffer: Array[Int] = new Array[Int](4)
        buffer(0) = (ip >> 24) & 0xff
        buffer(1) = (ip >> 16) & 0xff
        buffer(2) = (ip >> 8) & 0xff
        buffer(3) = ip & 0xff
        // Return the IPv4 address string
        buffer.mkString(".")
      }
    
    }
     
     