• Hudi: streaming writes to Hudi with Structured Streaming


    Scenario

    Data produced in real time is written to Kafka; Spark reads it from Kafka in real time and writes it into Hudi.
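
    The original post does not show the message format. Judging from the fields extracted in process() below, each Kafka record carries the order id as the message key and a JSON payload as the value. Below is a minimal producer sketch under that assumption: the field names, broker address, and topic name are taken from the streaming job, while the payload values are made up for illustration.

    import java.util.Properties
    import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

    object OrderProducerSketch {
      def main(args: Array[String]): Unit = {
        val props = new Properties()
        props.put("bootstrap.servers", "localhost:9092")
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
        val producer = new KafkaProducer[String, String](props)

        //the message key becomes order_id in the streaming job; the JSON field names match the
        //get_json_object calls in process()
        val orderId = java.util.UUID.randomUUID().toString
        val message =
          """{"userId":"u_1001","orderTime":"2022-03-03 22:16:00.000","ip":"127.0.0.1","orderMoney":"99.99","orderStatus":"0"}"""
        producer.send(new ProducerRecord[String, String]("order-topic", orderId, message))
        producer.close()
      }
    }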

    Implementation

    package com.zhen.hudi.streaming
    
    import com.zhen.hudi.didi.SparkUtils
    import org.apache.hudi.DataSourceWriteOptions.{PARTITIONPATH_FIELD, PRECOMBINE_FIELD, RECORDKEY_FIELD}
    import org.apache.hudi.config.HoodieWriteConfig.TBL_NAME
    import org.apache.spark.sql.functions._
    import org.apache.spark.sql.streaming.OutputMode
    import org.apache.spark.sql.{DataFrame, Dataset, Row, SaveMode, SparkSession}
    
    /**
      * @Author FengZhen
      * @Date 3/3/22 10:16 PM 
      * @Description Consume data from Kafka in real time with Structured Streaming, run it through an ETL transformation, and store it in a Hudi table
      */
    object HudiStructureDemo {
    
      /**
        * Consume data in real time from the given Kafka topic
        * @param spark
        * @param topicName
        * @return
        */
      def readFromKafka(spark: SparkSession, topicName: String): DataFrame = {
    
        spark.readStream
          .format("kafka")
          .option("kafka.bootstrap.servers", "localhost:9092")
          .option("subscribe", topicName)
          //starting offsets for consumption
          .option("startingOffsets", "latest")
          //process at most 100,000 records per trigger
          .option("maxOffsetsPerTrigger", 100000)
          //whether the query should fail on data loss
          .option("failOnDataLoss", "false")
          .load()
    
      }
    
      /**
        * Transform the data read from Kafka: extract the needed field values and cast them to String so they can be saved to the Hudi table
        * @param streamDF
        * @return
        */
      def process(streamDF: DataFrame): DataFrame = {
    
        streamDF
          //select fields
          .selectExpr(
            "CAST(key AS STRING) AS order_id",
            "CAST(value AS STRING) AS message",
            "topic", "partition", "offset", "timestamp"
          )
          //parse the message JSON and extract field values
          .withColumn("user_id", get_json_object(col("message"), "$.userId"))
          .withColumn("order_time", get_json_object(col("message"), "$.orderTime"))
          .withColumn("ip", get_json_object(col("message"), "$.ip"))
          .withColumn("order_money", get_json_object(col("message"), "$.orderMoney"))
          .withColumn("order_status", get_json_object(col("message"), "$.orderStatus"))
          //drop the message column
          .drop(col("message"))
          //convert the order time string to a timestamp, used as the precombine (merge) field in the Hudi table
          .withColumn("ts", to_timestamp(col("order_time"), "yyyy-MM-dd HH:mm:ss.SSS"))
          //extract the partition date (yyyy-MM-dd) from the order time
          .withColumn("day", substring(col("order_time"), 0, 10))
      }
    
      /**
        * Save the streaming DataFrame to the Hudi table
        * @param streamDF
        */
      def saveToHudi(streamDF: DataFrame): Unit = {
        streamDF.writeStream
          .outputMode(OutputMode.Append())
          .queryName("query-hudi-streaming")
          .foreachBatch((batchDF: Dataset[Row], batchId: Long) =>{
            println(s"=============== BatchId: ${batchId} start =============== ")
            import org.apache.hudi.DataSourceWriteOptions._
            import org.apache.hudi.config.HoodieWriteConfig._
            import org.apache.hudi.keygen.constant.KeyGeneratorOptions._
    
            batchDF.write
              .mode(SaveMode.Append)
              .format("hudi")
              .option("hoodie.insert.shuffle.parallelism", "2")
              .option("hoodie.upsert.shuffle.parallelism", "2")
              //Hudi table properties
              //record key (primary key)
              .option(RECORDKEY_FIELD.key(), "order_id")
              //precombine field
              .option(PRECOMBINE_FIELD.key(), "ts")
              //partition path field
              .option(PARTITIONPATH_FIELD.key(), "day")
              //partition directory naming, consistent with the Hive partitioning convention
              .option(HIVE_STYLE_PARTITIONING_ENABLE.key(), "true")
              //table name
              .option(TBL_NAME.key(), "tbl_hudi_order")
              //table type: Merge On Read
              .option(TABLE_TYPE.key(), "MERGE_ON_READ")
              .save("/hudi-warehouse/tbl_hudi_order")
          })
          .option("checkpointLocation", "/datas/hudi-spark/struct-ckpt-1001")
          .start()
      }
    
      def main(args: Array[String]): Unit = {
        //1. Build the SparkSession instance
        val spark: SparkSession = SparkUtils.createSparkSession(this.getClass)
    
        //2. Consume data from Kafka in real time
        val kafkaStreamDF: DataFrame = readFromKafka(spark, "order-topic")
    
        //3. Extract fields and convert data types
        val streamDF: DataFrame = process(kafkaStreamDF)
    
        //4. Save the data to the Hudi table: MOR type, data files are merged on read
        saveToHudi(streamDF)
    
        //5. After the streaming query starts, wait for termination
        spark.streams.active.foreach(query => println(s"Query: ${query.name} is Running"))
        spark.streams.awaitAnyTermination()
    
      }
    
    }
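
    SparkUtils.createSparkSession (imported from com.zhen.hudi.didi) is a project helper that is not shown in this post. A minimal sketch of what it needs to provide, assuming a local master and the Kryo serializer that Hudi recommends for Spark jobs, could look like this; it is not the original implementation:

    package com.zhen.hudi.didi

    import org.apache.spark.sql.SparkSession

    object SparkUtils {
      //minimal sketch (assumption): the original SparkUtils implementation is not shown in the post
      def createSparkSession(clazz: Class[_]): SparkSession = {
        SparkSession.builder()
          .appName(clazz.getSimpleName.stripSuffix("$"))
          .master("local[*]")
          //Hudi recommends Kryo serialization for Spark writes
          .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
          .getOrCreate()
      }
    }

    Once the query is running and a few batches have committed, the MOR table can be spot-checked with a plain batch (snapshot) read, for example:

    val df = spark.read.format("hudi").load("/hudi-warehouse/tbl_hudi_order")
    df.select("order_id", "user_id", "order_money", "order_status", "day").show(10, truncate = false)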