• Hudi: Flink consumes Kafka and writes incremental data to Hudi in real time (Java)


    0. Steps

    Overall flow: first verify that Flink SQL can consume the Kafka topic, then implement a Flink job that consumes order data from Kafka, transforms it, and writes it into a Hudi table in real time.

    I. Flink SQL Integration with Kafka

     

    1. Create a Kafka topic (one partition, one replica)

    flink-topic
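
    The topic can be created with the kafka-topics.sh CLI or programmatically. A minimal sketch using the Kafka AdminClient (the broker address localhost:9092 matches the rest of this post; the class name is only illustrative):

    import java.util.Collections;
    import java.util.Properties;

    import org.apache.kafka.clients.admin.AdminClient;
    import org.apache.kafka.clients.admin.AdminClientConfig;
    import org.apache.kafka.clients.admin.NewTopic;

    public class CreateFlinkTopic {
        public static void main(String[] args) throws Exception {
            Properties props = new Properties();
            props.put(AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
            try (AdminClient admin = AdminClient.create(props)) {
                // flink-topic: 1 partition, replication factor 1
                admin.createTopics(Collections.singletonList(new NewTopic("flink-topic", 1, (short) 1)))
                        .all().get();
            }
        }
    }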
     

    2. Download flink-sql-connector-kafka_2.12-1.13.1.jar and place it under flink/lib

     

    3. Start the SQL client, specifying the connector jar

    ./sql-client.sh embedded -j ../lib/flink-sql-connector-kafka_2.12-1.13.1.jar shell
    Set the result display mode: set execution.result-mode=tableau;
     

    4. Create a table mapped to the Kafka topic

    The data in the Kafka topic is in CSV format with three fields: user_id, item_id and behavior. When consuming from Kafka, start from the latest offset.
    CREATE TABLE test_kafka(
        `user_id` BIGINT,
        `item_id` BIGINT,
        `behavior` STRING
    )
    WITH(
        'connector' = 'kafka',
        'topic'='flink-topic',
        'properties.bootstrap.servers' = 'localhost:9092',
        'properties.group.id' = 'test-group-10001',
        'scan.startup.mode' = 'latest-offset',
        'format' = 'csv'
    );
     
    Flink SQL> select * from test_kafka;
    +----+----------------------+----------------------+--------------------------------+
    | op |              user_id |              item_id |                       behavior |
    +----+----------------------+----------------------+--------------------------------+

    5. Write data to Kafka

    kafka-console-producer.sh --broker-list localhost:9092 --topic flink-topic
    1001,90001,click
    1001,90001,browser
    1001,90001,click
    1002,90002,click
    1002,90003,click
    1003,90001,order
    1004,90001,order
     
     
    MacBook-Pro:bin FengZhen$ kafka-console-producer.sh --broker-list localhost:9092 --topic flink-topic
    >1001,90001,click
    1001,90001,browser
    1001,90001,click
    1002,90002,click
    1002,90003,click
    1003,90001,order
    1004,90001,order
    >>>>>>>
     
    The data can be queried and processed in real time:
    Flink SQL> select * from test_kafka;
    +----+----------------------+----------------------+--------------------------------+
    | op |              user_id |              item_id |                       behavior |
    +----+----------------------+----------------------+--------------------------------+
    | +I |                 1001 |                90001 |                          click |
    | +I |                 1001 |                90001 |                        browser |
    | +I |                 1001 |                90001 |                          click |
    | +I |                 1002 |                90002 |                          click |
    | +I |                 1002 |                90003 |                          click |
    | +I |                 1003 |                90001 |                          order |
    | +I |                 1004 |                90001 |                          order |

    II. Code Implementation

    package com.zhen.hudi;
    
    import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
    import org.apache.flink.table.api.EnvironmentSettings;
    import org.apache.flink.table.api.Table;
    import org.apache.flink.table.api.TableEnvironment;
    import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
    
    import static org.apache.flink.table.api.Expressions.$;
    
    /**
     * @Author FengZhen
     * @Date 3/9/22 10:17 PM
     * @Description Consume data from a Kafka topic in real time with the Flink SQL connectors, transform it, and write it to a Hudi table in real time
     */
    public class FlinkSQLHudiDemo {
        public static void main(String[] args) {
    
            // 1. Create the stream execution environment
            StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment();
            // Set parallelism to 1
            env.setParallelism(1);
            // Data is written to the Hudi table incrementally, so Flink checkpointing must be enabled
            env.enableCheckpointing(5 * 1000);
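            // Note: the Hudi Flink sink commits data when a checkpoint completes, so this 5s interval
            // also bounds how quickly newly written records become queryable in the Hudi table.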
    
    
            EnvironmentSettings settings = EnvironmentSettings
                    .newInstance()
                    .inStreamingMode() // streaming mode
                    .build();
            StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env, settings);
    
    
            // 2. Create the input table that consumes data from the Kafka topic
            tableEnv.executeSql(
                    "CREATE TABLE order_kafka_source(\n" +
                            "    `orderId` STRING,\n" +
                            "    `userId` STRING,\n" +
                            "    `orderTime` STRING,\n" +
                            "    `ip` STRING,\n" +
                            "    `orderMoney` DOUBLE,\n" +
                            "    `orderStatus` INT\n" +
                            ")\n" +
                            "WITH(\n" +
                            "    'connector' = 'kafka',\n" +
                            "    'topic'='order-topic',\n" +
                            "    'properties.bootstrap.servers' = 'localhost:9092',\n" +
                            "    'properties.group.id' = 'gid-1001',\n" +
                            "    'scan.startup.mode' = 'latest-offset',\n" +
                            "    'format' = 'json',\n" +
                            "    'json.fail-on-missing-field' = 'false',\n" +
                            "    'json.ignore-parse-errors' = 'true'\n" +
                            ")\n"
            );
    
            // 3. Transform the data (either SQL or the Table API can be used)
            Table etlTable = tableEnv
                    .from("order_kafka_source")
                    // Add column ts: the timestamp Hudi uses as the precombine field when merging records.
                    // The first 17 characters of orderId form a yyyyMMddHHmmssSSS timestamp (see the generator below).
                    .addColumns(
                            $("orderId").substring(0,17).as("ts")
                    )
                    // Add column partition_day: the Hudi partition field, e.g. orderTime 2022-03-09 22:21:13.124 -> 2022-03-09
                    .addColumns(
                            $("orderTime").substring(0, 10).as("partition_day")
                    );
    
    
            tableEnv.createTemporaryView("view_order", etlTable);
    
            // 4. Create the output table bound to the Hudi table: table name, storage path, field names, etc.
            tableEnv.executeSql(
                    "CREATE TABLE order_hudi_sink(\n" +
                            "    `orderId` STRING PRIMARY KEY NOT ENFORCED,\n" +
                            "    `userId` STRING,\n" +
                            "    `orderTime` STRING,\n" +
                            "    `ip` STRING,\n" +
                            "    `orderMoney` DOUBLE,\n" +
                            "    `orderStatus` INT,\n" +
                            "    `ts` STRING,\n" +
                            "    `partition_day` STRING\n" +
                            ")\n" +
                            "PARTITIONED BY (partition_day)\n" +
                            "WITH(\n" +
                            "    'connector' = 'hudi',\n" +
                            "    'path'='hdfs://localhost:9000/hudi-warehouse/flink_hudi_order',\n" +
                            "    'table.type' = 'MERGE_ON_READ',\n" +
                            "    'write.operation' = 'upsert',\n" +
                            "    'hoodie.datasource.write.recordkey.field' = 'orderId',\n" +
                            "    'write.precombine.field' = 'ts',\n" +
                            "    'write.tasks' = '1'\n" +
                            ")\n"
            );
    
            // 5. Write the data into the output table via a sub-query
            tableEnv.executeSql(
                    "INSERT INTO order_hudi_sink " +
                            "SELECT orderId, userId, orderTime, ip, orderMoney, orderStatus, ts, partition_day FROM view_order"
            );
    
        }
    }
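
    Running the job also requires the Hudi Flink bundle jar (matching the Hudi and Scala versions in use) on the Flink classpath, in addition to the Kafka connector from step 2. Once the generator below is producing into order-topic and a checkpoint has completed, the result can be spot-checked by reading the same Hudi path back. A minimal verification sketch (the table name order_hudi_read and this helper class are assumptions for illustration, not part of the original job):

    package com.zhen.hudi;

    import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
    import org.apache.flink.table.api.EnvironmentSettings;
    import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

    public class FlinkSQLHudiReadDemo {
        public static void main(String[] args) {
            StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment();
            env.setParallelism(1);
            StreamTableEnvironment tableEnv = StreamTableEnvironment.create(
                    env, EnvironmentSettings.newInstance().inStreamingMode().build());

            // Same schema and path as order_hudi_sink above
            tableEnv.executeSql(
                    "CREATE TABLE order_hudi_read(\n" +
                            "    `orderId` STRING PRIMARY KEY NOT ENFORCED,\n" +
                            "    `userId` STRING,\n" +
                            "    `orderTime` STRING,\n" +
                            "    `ip` STRING,\n" +
                            "    `orderMoney` DOUBLE,\n" +
                            "    `orderStatus` INT,\n" +
                            "    `ts` STRING,\n" +
                            "    `partition_day` STRING\n" +
                            ")\n" +
                            "PARTITIONED BY (partition_day)\n" +
                            "WITH(\n" +
                            "    'connector' = 'hudi',\n" +
                            "    'path' = 'hdfs://localhost:9000/hudi-warehouse/flink_hudi_order',\n" +
                            "    'table.type' = 'MERGE_ON_READ'\n" +
                            ")"
            );

            // Print whatever has been committed to the Hudi table so far
            tableEnv.executeSql("SELECT * FROM order_hudi_read").print();
        }
    }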

    Kafka data generator utility (Scala)

    package com.zhen.hudi.streaming
    
    import java.util.Properties
    
    import org.apache.commons.lang3.time.FastDateFormat
    import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
    import org.apache.kafka.common.serialization.StringSerializer
    import org.json4s.jackson.Json
    
    import scala.util.Random
    
    /**
      * Order record (case class)
      *
      * @param orderId     order ID
      * @param userId      user ID
      * @param orderTime   order date and time
      * @param ip          IP address used to place the order
      * @param orderMoney  order amount
      * @param orderStatus order status
      */
    case class OrderRecord(
      orderId: String,
      userId: String,
      orderTime: String,
      ip: String,
      orderMoney: Double,
      orderStatus: Int
    )
    
    /**
      * @Author FengZhen
      * @Date 3/3/22 9:54 PM 
      * @Description
      * Generates mock order data and sends it to a Kafka topic.
      * Each message is a String containing JSON-formatted data.
      * Conversion: OrderRecord instances are serialized to JSON strings (the json4s library can be used).
      */
    object MockOrderProducer {
    
      def main(args: Array[String]): Unit = {
    
        var producer: KafkaProducer[String, String] = null
        try {
          // 1. Kafka producer configuration
          val props = new Properties()
          props.put("bootstrap.servers", "localhost:9092")
          props.put("acks", "1")
          props.put("retries", "3")
    
    //      props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    //      props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
          props.put("key.serializer", classOf[StringSerializer].getName)
          props.put("value.serializer", classOf[StringSerializer].getName)
    
          // 2. Create the KafkaProducer with the configuration
          producer = new KafkaProducer[String, String](props)
    
          // Random number generator
          val random: Random = new Random()
          // Order status: 0 = open, 1 = cancelled, 2 = closed, 3 = completed (this array is heavily weighted toward 0)
          val allStatus = Array(0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
    
          while (true) {
            // Number of orders to generate in this iteration (nextInt(1) is always 0, so this is always 1)
            val batchNumber: Int = random.nextInt(1) + 1
            (1 to batchNumber).foreach { number =>
              val currentTime: Long = System.currentTimeMillis()
              val orderId: String = s"${getDate(currentTime)}%06d".format(number)
              val userId: String = s"${1 + random.nextInt(5)}%08d".format(random.nextInt(1000))
              val orderTime: String = getDate(currentTime, format = "yyyy-MM-dd HH:mm:ss.SSS")
              val orderMoney: String = s"${5 + random.nextInt(500)}.%02d".format(random.nextInt(100))
              val orderStatus: Int = allStatus(random.nextInt(allStatus.length))
              // 3. Build the order record
              val orderRecord: OrderRecord = OrderRecord(
                orderId, userId, orderTime, getRandomIp, orderMoney.toDouble, orderStatus
              )
              // Convert to a JSON-formatted string
              val orderJson = new Json(org.json4s.DefaultFormats).write(orderRecord)
              println(orderJson)
              // 4. Build the ProducerRecord
              val record = new ProducerRecord[String, String]("order-topic", orderId, orderJson)
              // 5. Send the record to the topic
              producer.send(record)
            }
            Thread.sleep(random.nextInt(500) + 5000)
          }
        } catch {
          case e: Exception => e.printStackTrace()
        } finally {
          if (null != producer) producer.close()
        }
      }
    
      /** Format the given timestamp as a date string */
      def getDate(time: Long, format: String = "yyyyMMddHHmmssSSS"): String = {
        val fastFormat: FastDateFormat = FastDateFormat.getInstance(format)
        val formatDate: String = fastFormat.format(time) // format the timestamp
        formatDate
      }
    
      /** Generate a random IP address */
      def getRandomIp: String = {
        // IP address ranges (encoded as Int values)
        val range: Array[(Int, Int)] = Array(
          (607649792, 608174079), //36.56.0.0-36.63.255.255
          (1038614528, 1039007743), //61.232.0.0-61.237.255.255
          (1783627776, 1784676351), //106.80.0.0-106.95.255.255
          (2035023872, 2035154943), //121.76.0.0-121.77.255.255
          (2078801920, 2079064063), //123.232.0.0-123.235.255.255
          (-1950089216, -1948778497), //139.196.0.0-139.215.255.255
          (-1425539072, -1425014785), //171.8.0.0-171.15.255.255
          (-1236271104, -1235419137), //182.80.0.0-182.92.255.255
          (-770113536, -768606209), //210.25.0.0-210.47.255.255
          (-569376768, -564133889) //222.16.0.0-222.95.255.255
        )
        // Random index into the IP ranges
        val random = new Random()
        val index = random.nextInt(10)
        val ipNumber: Int = range(index)._1 + random.nextInt(range(index)._2 - range(index)._1)
    
        // Convert the Int IP value to dotted IPv4 notation
        number2IpString(ipNumber)
      }
    
      /** Convert an Int IPv4 address to its dotted-string form */
      def number2IpString(ip: Int): String = {
        val buffer: Array[Int] = new Array[Int](4)
        buffer(0) = (ip >> 24) & 0xff
        buffer(1) = (ip >> 16) & 0xff
        buffer(2) = (ip >> 8) & 0xff
        buffer(3) = ip & 0xff
        // Return the IPv4 address string
        buffer.mkString(".")
      }
    
    }
     
     