• Spark Streaming自定义Receiver


    一 背景

    Spark社区为Spark Streaming提供了很多数据源接口,但一些比较小众的数据源并没有覆盖。由于公司技术栈的选择,我们使用了阿里云的MQ服务ONS;要在其上实现实时需求,就需要自己编写Receiver。

    二 技术实现

    1.官网的例子已经比较详细,但真正用于实践还需要慢慢调试,可参考官方文档《Spark Streaming Custom Receivers》。

    2.实现代码,由三部分组成,receiver,inputstream,util

    3.receiver代码

    import java.io.Serializable
    import java.util.Properties
    
    import com.aliyun.openservices.ons.api._
    import com.aliyun.openservices.ons.api.impl.ONSFactoryImpl
    import org.apache.spark.internal.Logging
    import org.apache.spark.storage.StorageLevel
    import org.apache.spark.streaming.receiver.Receiver
    
    /**
     * Spark Streaming receiver that subscribes to an Aliyun ONS topic and stores every
     * consumed message, converted by `func`, into Spark with
     * `StorageLevel.MEMORY_AND_DISK_2`.
     *
     * @param cid       value set for `PropertyKeyConst.ConsumerId`
     * @param accessKey value set for `PropertyKeyConst.AccessKey`
     * @param secretKey value set for `PropertyKeyConst.SecretKey`
     * @param addr      value set for `PropertyKeyConst.ONSAddr`
     * @param topic     topic passed to `Consumer.subscribe`
     * @param tag       tag expression passed to `Consumer.subscribe`
     * @param func      converts a consumed `Message` into the bytes handed to `store`
     */
    class OnsReceiver(
        cid: String,
        accessKey: String,
        secretKey: String,
        addr: String,
        topic: String,
        tag: String,
        func: Message => Array[Byte])
      extends Receiver[Array[Byte]](StorageLevel.MEMORY_AND_DISK_2) with Serializable with Logging {
      receiver =>

      // Written by the startup thread and read by whichever thread calls onStop(),
      // hence @volatile. @transient because neither is meaningful after serialization.
      @transient @volatile private var consumer: Consumer = null
      @transient @volatile private var workerThread: Thread = null

      /**
       * Starts a daemon thread that creates the consumer, subscribes it to `topic`/`tag`
       * and starts it. onStart() itself returns right after launching that thread.
       */
      override def onStart(): Unit = {
        val starter = new Thread(new Runnable {
          override def run(): Unit = {
            try {
              val created = new ONSFactoryImpl().createConsumer(consumerProperties())
              consumer = created
              created.subscribe(topic, tag, new MessageListener() {
                override def consume(message: Message, context: ConsumeContext): Action = {
                  try {
                    receiver.store(func(message))
                    Action.CommitMessage
                  } catch {
                    // Only non-fatal failures are turned into a redelivery request;
                    // fatal errors (e.g. OutOfMemoryError) must not be swallowed here.
                    case scala.util.control.NonFatal(e) =>
                      logError(s"Failed to store ONS message for streamId $streamId", e)
                      Action.ReconsumeLater
                  }
                }
              })
              created.start()
            } catch {
              // Without this the startup thread would die silently and the receiver
              // would stay registered but never receive anything.
              case scala.util.control.NonFatal(e) =>
                receiver.restart(s"Error starting ONS consumer for streamId $streamId", e)
            }
          }
        })
        starter.setName(s"Aliyun ONS Receiver $streamId")
        starter.setDaemon(true)
        workerThread = starter
        starter.start()
      }

      /**
       * Shuts the consumer down and waits for the startup thread to finish.
       * Does nothing when onStart() has not launched a thread.
       */
      override def onStop(): Unit = {
        val thread = workerThread
        if (thread != null) {
          shutdownConsumer()

          thread.join()
          // The startup thread may have created the consumer after the first
          // shutdown attempt above; make sure it does not keep running.
          shutdownConsumer()
          workerThread = null
          logInfo(s"Stopped receiver for streamId $streamId")
        }
      }

      /** Builds the consumer configuration from the constructor arguments. */
      private def consumerProperties(): Properties = {
        val properties = new Properties
        properties.put(PropertyKeyConst.ConsumerId, cid)
        properties.put(PropertyKeyConst.AccessKey, accessKey)
        properties.put(PropertyKeyConst.SecretKey, secretKey)
        properties.put(PropertyKeyConst.ONSAddr, addr)
        properties.put(PropertyKeyConst.MessageModel, "CLUSTERING")
        properties.put(PropertyKeyConst.ConsumeThreadNums, "50")
        properties
      }

      /** Shuts down the current consumer, if any, and clears the reference. */
      private def shutdownConsumer(): Unit = {
        val current = consumer
        if (current != null) {
          consumer = null
          current.shutdown()
        }
      }
    }

    inputstream代码

    import com.aliyun.openservices.ons.api.Message
    import org.apache.spark.streaming.StreamingContext
    import org.apache.spark.streaming.dstream.ReceiverInputDStream
    import org.apache.spark.streaming.receiver.Receiver
    
    /**
     * Receiver-based input stream whose receiver is an `OnsReceiver` built from the
     * constructor arguments.
     *
     * @param _ssc      streaming context passed to `ReceiverInputDStream`
     * @param cid       consumer ID forwarded to the receiver
     * @param topic     topic forwarded to the receiver
     * @param tag       tag expression forwarded to the receiver
     * @param accessKey access key forwarded to the receiver
     * @param secretKey secret key forwarded to the receiver
     * @param addr      ONS address forwarded to the receiver
     * @param func      message-to-bytes conversion forwarded to the receiver
     */
    class OnsInputDStream(
        @transient _ssc: StreamingContext,
        cid: String,
        topic: String,
        tag: String,
        accessKey: String,
        secretKey: String,
        addr:String,
        func: Message => Array[Byte]
      ) extends ReceiverInputDStream[Array[Byte]](_ssc) {

      // Named arguments: the receiver's parameter order differs from this class's,
      // so spelling the names out keeps each value tied to the right parameter.
      override def getReceiver(): Receiver[Array[Byte]] =
        new OnsReceiver(
          cid = cid,
          accessKey = accessKey,
          secretKey = secretKey,
          addr = addr,
          topic = topic,
          tag = tag,
          func = func)

    }

    util代码

    import com.aliyun.openservices.ons.api.Message
    import org.apache.spark.annotation.Experimental
    import org.apache.spark.streaming.StreamingContext
    import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
    
    /** Factory methods for creating ONS-backed input streams. */
    object OnsUtils {

      /**
       * Creates a single input stream backed by one `OnsInputDStream`.
       *
       * @param ssc       streaming context the stream is attached to
       * @param cid       consumer ID
       * @param topic     topic to subscribe to
       * @param tag       tag expression of the subscription
       * @param accessKey access key
       * @param secretKey secret key
       * @param addr      ONS address
       * @param func      converts a consumed `Message` into the bytes emitted by the stream
       * @return the created input stream
       */
      @Experimental
      def createStream(
                        ssc: StreamingContext,
                        cid: String,
                        topic: String,
                        tag: String,
                        accessKey: String,
                        secretKey: String,
                        addr: String,
                        func: Message => Array[Byte]): ReceiverInputDStream[Array[Byte]] =
        new OnsInputDStream(ssc, cid, topic, tag, accessKey, secretKey, addr, func)

      /**
       * Creates one input stream per `(consumerId, topic, tags)` tuple and unions them.
       *
       * Every occurrence of a consumer ID must carry the same `(topic, tags)` pair.
       * Repeating an identical tuple is accepted; a consumer ID that appears with
       * differing subscriptions is rejected.
       *
       * @param ssc                 streaming context the streams are attached to
       * @param consumerIdTopicTags one `(consumerId, topic, tags)` tuple per stream to create
       * @param accessKey           access key shared by all streams
       * @param secretKey           secret key shared by all streams
       * @param addr                ONS address shared by all streams
       * @param func                converts a consumed `Message` into the emitted bytes
       * @return the union of all created streams
       * @throws RuntimeException if a consumer ID is listed with differing subscriptions
       */
      @Experimental
      def createStreams(
                         ssc: StreamingContext,
                         consumerIdTopicTags: Array[(String, String, String)],
                         accessKey: String,
                         secretKey: String,
                         addr: String,
                         func: Message => Array[Byte]): DStream[Array[Byte]] = {
        // The previous check compared the number of offending groups with `> 1`,
        // so exactly one inconsistent consumer ID slipped through validation.
        val hasInconsistentSubscription = consumerIdTopicTags
          .groupBy { case (consumerId, _, _) => consumerId }
          .exists { case (_, entries) =>
            entries.map { case (_, topic, tags) => (topic, tags) }.distinct.length > 1
          }
        if (hasInconsistentSubscription) {
          throw new RuntimeException("Inconsistent consumer subscription.")
        }

        ssc.union(consumerIdTopicTags.map({
          case (consumerId, topic, tags) =>
            createStream(ssc, consumerId, topic, tags, accessKey, secretKey, addr, func)
        }))
      }

    }

    三 调用

    // Build three ONS input streams; each createStream call yields its own receiver.
    // All three are created with the same consumer ID, topic and tag literals.
    // NOTE(review): `ssc`, `config` and `func` are defined outside this snippet.
    val stream = (0 until 3).map(i => {
          OnsUtils.createStream(ssc,
            "CID",
            "BI_CALL",
            "call_log_ons",
            config.getString("ons.access_key"),
            // NOTE(review): the key is spelled "sercet_key" — confirm it matches the config file.
            config.getString("ons.sercet_key"),
            config.getString("ons.ons_addr"),
            func)
        })
        // Union the three streams and process each batch; `(...)` is a placeholder for the handler.
        // NOTE(review): foreachRDD returns Unit, so `unionStream` does not hold a DStream.
        val unionStream = ssc.union(stream).foreachRDD(...)

    上面的 (0 until 3) 决定了创建多少个receiver(此处为3个)。这个数量必须小于等于spark on yarn的num-executors;此外每个receiver会长期占用一个core,应用的总core数必须大于receiver的数量,否则没有剩余的core来处理接收到的数据。receiver的内存默认占用executors的内存的一半。

  • 相关阅读:
    homebrew 安装 mpv
    Spring JdbcTemplate 两种方法的区别
    git .gitignore失效的解决办法
    git 分支修改bug应用场景
    url编码实践
    escape encodeuri encodeURIComponent 区别
    mysql命令gruop by报错this is incompatible with sql_mode=only_full_group_by
    服务器病毒问题解决- 阿里云 挖矿病毒,Circle_MI.png
    trim和replace的陷阱实践
    mysql 5.7.15 union order by 子查询排序不生效
  • 原文地址:https://www.cnblogs.com/ChouYarn/p/7992724.html
Copyright © 2020-2023  润新知