• Spark Streaming: read from Kafka and write to Hive tables


    sparkStreaming:

    package hive
     
    import org.apache.kafka.clients.consumer.ConsumerRecord
    import org.apache.kafka.common.serialization.StringDeserializer
    import org.apache.log4j.{Level, Logger}
    import org.apache.spark.sql.{Row, SparkSession}
    import org.apache.spark.streaming.{Seconds, StreamingContext}
    import org.apache.spark.streaming.dstream.InputDStream
    import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
     
    /**
      * Spark Streaming consumes data from multiple Kafka topics and writes each one to a different Hive table
      */
    object SparkToHive {
      def main(args: Array[String]): Unit = {
        Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
        Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.WARN)
        Logger.getLogger("org.apache.kafka.clients.consumer").setLevel(Level.WARN)
        // Point spark.sql.warehouse.dir at the HDFS warehouse path directly; wrapping an
        // hdfs:// URI in java.io.File would turn it into a bogus local path
        val warehouseLocation = "hdfs://cluster/hive/warehouse"
        @transient
        val spark = SparkSession
          .builder()
          .appName("Spark SQL To Hive")
          .config("spark.sql.warehouse.dir", warehouseLocation)
          // Kryo has to be configured before the SparkContext exists; calling
          // spark.conf.set("spark.serializer", ...) after getOrCreate() has no effect
          .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
          .enableHiveSupport()
          .getOrCreate()
     
        @transient
        val sc = spark.sparkContext
        val scc = new StreamingContext(sc, Seconds(1))
        val kafkaParams = Map[String, Object](
          "bootstrap.servers" -> "10.200.10.24:6667,10.200.10.26:6667,10.200.10.29:6667",
          "key.deserializer" -> classOf[StringDeserializer],
          "value.deserializer" -> classOf[StringDeserializer],
          "group.id" -> "test_jason",
          "auto.offset.reset" -> "latest", // latest or earliest
          "enable.auto.commit" -> (true: java.lang.Boolean)
        )
     
        val topics = Array("test", "test1", "test2")
     
        val stream: InputDStream[ConsumerRecord[String, String]] =
          KafkaUtils.createDirectStream[String, String](
            scc,
            LocationStrategies.PreferConsistent,
            ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
          )
     
        stream.foreachRDD(rdd => {
          if (!rdd.isEmpty()) {
            val cache_rdd = rdd.map(_.value()).cache()
            // records destined for table a
            val a = cache_rdd.filter(_.contains("hello"))
            // records destined for table b
            val b = cache_rdd.filter(_.contains("jason"))
            // Printing is enough to verify the split; the actual Hive insert is omitted here,
            // see the sketch after this listing (and the earlier post on writing to Hive)
            a.foreach(println)
            b.foreach(println)
          }
        })
        scc.start()
        scc.awaitTermination()
      }
    }
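
    The listing above only prints the two filtered streams; the Hive insert itself is deferred to
    another post. Below is a minimal sketch of how that step might look, assuming the target tables
    (the placeholder names test.table_a and test.table_b are not from the original post) already
    exist in Hive with a single STRING column named line; adapt the schema and table names to your
    own setup:

    package hive
     
    import org.apache.spark.rdd.RDD
    import org.apache.spark.sql.SparkSession
     
    object HiveSink {
     
      /** Append one micro-batch of raw Kafka values into an existing Hive table. */
      def appendToHive(spark: SparkSession, lines: RDD[String], table: String): Unit = {
        import spark.implicits._
        if (!lines.isEmpty()) {
          // Expose the batch as a temporary view, then let Spark SQL run the insert
          lines.toDF("line").createOrReplaceTempView("batch_tmp")
          spark.sql(s"INSERT INTO TABLE $table SELECT line FROM batch_tmp")
        }
      }
    }

    Inside foreachRDD, the two filtered RDDs would then be written with
    HiveSink.appendToHive(spark, a, "test.table_a") and HiveSink.appendToHive(spark, b, "test.table_b")
    instead of (or after) the println calls.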
  • Original post: https://www.cnblogs.com/30go/p/11676670.html