• spark streaming 1: StreamingContext


    StreamingContext serves much the same purpose as SparkContext: it is the entry point for Spark Streaming, providing configuration, DStream creation, and related functionality.
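
    As a quick orientation, here is a minimal usage sketch (the application name, host and port are illustrative assumptions, not taken from the source): create a StreamingContext with a batch duration, build a DStream, then start the computation.

    import org.apache.spark.SparkConf
    import org.apache.spark.streaming.{Seconds, StreamingContext}

    object NetworkWordCount {
      def main(args: Array[String]): Unit = {
        val conf = new SparkConf().setMaster("local[2]").setAppName("NetworkWordCount")
        // The batch duration given here becomes the interval of the DStreamGraph discussed below.
        val ssc = new StreamingContext(conf, Seconds(2))

        // Build and transform a DStream from a TCP text source.
        val lines = ssc.socketTextStream("localhost", 9999)
        lines.flatMap(_.split(" ")).countByValue().print()

        ssc.start()            // start the JobScheduler and the receivers
        ssc.awaitTermination() // block until stop() or an exception
      }
    }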

    Overall, Spark Streaming is organized into the following pieces; we start with StreamingContext itself:


    /**
     * Main entry point for Spark Streaming functionality. It provides methods used to create
     * [[org.apache.spark.streaming.dstream.DStream]]s from various input sources. It can be either
     * created by providing a Spark master URL and an appName, or from a org.apache.spark.SparkConf
     * configuration (see core Spark documentation), or from an existing org.apache.spark.SparkContext.
     * The associated SparkContext can be accessed using `context.sparkContext`. After
     * creating and transforming DStreams, the streaming computation can be started and stopped
     * using `context.start()` and `context.stop()`, respectively.
     * `context.awaitTermination()` allows the current thread to wait for the termination
     * of the context by `stop()` or by an exception.
     */
    class StreamingContext private[streaming] (
        sc_ : SparkContext,
        cp_ : Checkpoint,
        batchDur_ : Duration
      ) extends Logging {
    An important field: DStreamGraph is a bit like a stripped-down DAGScheduler. It generates a JobSet for each batch interval, and it serializes the DStreams according to their dependencies (for checkpointing); a conceptual toy sketch follows the snippet below.
    private[streaming] val graph: DStreamGraph = {
      if (isCheckpointPresent) {
        cp_.graph.setContext(this)
        cp_.graph.restoreCheckpointData()
        cp_.graph
      } else {
        assert(batchDur_ != null, "Batch duration for streaming context cannot be null")
        val newGraph = new DStreamGraph()
        newGraph.setBatchDuration(batchDur_)
        newGraph
      }
    }
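
    To make the role of DStreamGraph concrete, here is a toy model (my own simplification, not Spark code): at each batch time, every registered output stream is asked to produce a job, and the collected jobs form the JobSet handed to the JobScheduler.

    // Toy model only: simplified stand-ins for Spark's internal Time, Job and DStreamGraph.
    import scala.collection.mutable.ArrayBuffer

    case class BatchTime(millis: Long)
    case class ToyJob(time: BatchTime, run: () => Unit)

    class ToyDStreamGraph(val batchDurationMs: Long) {
      // Each output stream knows how to (maybe) build a job for a given batch time.
      private val outputStreams = new ArrayBuffer[BatchTime => Option[ToyJob]]()

      def addOutputStream(gen: BatchTime => Option[ToyJob]): Unit = outputStreams += gen

      // Analogue of DStreamGraph.generateJobs(time): one pass over all output streams per
      // interval; the JobScheduler would wrap the result into a JobSet and submit it.
      def generateJobs(time: BatchTime): Seq[ToyJob] = outputStreams.flatMap(_(time)).toSeq
    }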

    private val nextReceiverInputStreamId = new AtomicInteger(0)

    The JobScheduler, which generates and submits the streaming jobs:
    private[streaming] val scheduler = new JobScheduler(this)

    private[streaming] val waiter = new ContextWaiter

    private[streaming] val progressListener = new StreamingJobProgressListener(this)

    Adding a listener: StreamingListeners receive system events about the streaming computation (for example, batch completion), rather than the records flowing through the input streams; see the sketch after this method.
    /** Add a [[org.apache.spark.streaming.scheduler.StreamingListener]] object for
     * receiving system events related to streaming.
     */
    def addStreamingListener(streamingListener: StreamingListener) {
      scheduler.listenerBus.addListener(streamingListener)
    }
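
    A small sketch of registering a listener (the class name and the printed fields are my choices): it observes scheduling events such as batch completion, not the input data itself.

    import org.apache.spark.streaming.scheduler.{StreamingListener, StreamingListenerBatchCompleted}

    class BatchStatsListener extends StreamingListener {
      override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted): Unit = {
        val info = batchCompleted.batchInfo
        // processingDelay is an Option[Long] in milliseconds.
        println(s"batch ${info.batchTime} finished, processing delay = ${info.processingDelay}")
      }
    }

    // ssc.addStreamingListener(new BatchStatsListener())
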
    Starting the scheduler:
    /**
     * Start the execution of the streams.
     *
     * @throws SparkException if the context has already been started or stopped.
     */
    def start(): Unit = synchronized {
      if (state == Started) {
        throw new SparkException("StreamingContext has already been started")
      }
      if (state == Stopped) {
        throw new SparkException("StreamingContext has already been stopped")
      }
      validate()
      sparkContext.setCallSite(DStream.getCreationSite())
      scheduler.start()
      state = Started
    }
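
    A sketch of the guard above, assuming the version quoted here (which throws rather than warns): a second start(), or a start() after stop(), raises SparkException. The queueStream input is only there so the context has an output operation for validate() to accept.

    import scala.collection.mutable
    import org.apache.spark.{SparkConf, SparkException}
    import org.apache.spark.rdd.RDD
    import org.apache.spark.streaming.{Seconds, StreamingContext}

    object StartTwice {
      def main(args: Array[String]): Unit = {
        val ssc = new StreamingContext(
          new SparkConf().setMaster("local[2]").setAppName("StartTwice"), Seconds(1))
        val queue = mutable.Queue[RDD[Int]](ssc.sparkContext.makeRDD(1 to 10))
        ssc.queueStream(queue).print() // at least one output operation is required

        ssc.start()
        try ssc.start() catch {
          case e: SparkException => println(s"second start() rejected: ${e.getMessage}")
        }
        ssc.stop(stopSparkContext = true, stopGracefully = false)
      }
    }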

    The various input DStreams; these will be examined in more detail later.
    class PluggableInputDStream[T: ClassTag](
        @transient ssc_ : StreamingContext,
        receiver: Receiver[T]) extends ReceiverInputDStream[T](ssc_) {

      def getReceiver(): Receiver[T] = {
        receiver
      }
    }

    class SocketInputDStream[T: ClassTag](
        @transient ssc_ : StreamingContext,
        host: String,
        port: Int,
        bytesToObjects: InputStream => Iterator[T],
        storageLevel: StorageLevel
      ) extends ReceiverInputDStream[T](ssc_) {

      def getReceiver(): Receiver[T] = {
        new SocketReceiver(host, port, bytesToObjects, storageLevel)
      }
    }
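
    The public way to obtain a SocketInputDStream is ssc.socketTextStream, or the lower-level ssc.socketStream where you supply the bytesToObjects converter yourself. A sketch of the latter (host, port and the helper names are assumptions):

    import java.io.{BufferedReader, InputStream, InputStreamReader}
    import java.nio.charset.StandardCharsets

    import org.apache.spark.storage.StorageLevel
    import org.apache.spark.streaming.StreamingContext

    object CustomSocketStream {
      // Plays the role of bytesToObjects: turn the raw socket InputStream into lines.
      def linesFrom(in: InputStream): Iterator[String] = {
        val reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))
        Iterator.continually(reader.readLine()).takeWhile(_ != null)
      }

      def attach(ssc: StreamingContext): Unit = {
        val stream = ssc.socketStream[String](
          "localhost", 9999, linesFrom, StorageLevel.MEMORY_AND_DISK_SER)
        stream.print()
      }
    }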

    /**
     * An input stream that reads blocks of serialized objects from a given network address.
     * The blocks will be inserted directly into the block store. This is the fastest way to get
     * data into Spark Streaming, though it requires the sender to batch data and serialize it
     * in the format that the system is configured with.
     */
    private[streaming]
    class RawInputDStream[T: ClassTag](
        @transient ssc_ : StreamingContext,
        host: String,
        port: Int,
        storageLevel: StorageLevel
      ) extends ReceiverInputDStream[T](ssc_) with Logging {

      def getReceiver(): Receiver[T] = {
        new RawNetworkReceiver(host, port, storageLevel).asInstanceOf[Receiver[T]]
      }
    }
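
    RawInputDStream is normally reached through ssc.rawSocketStream; since the blocks go straight into the block store, the sender must have serialized them with the serializer Spark is configured to use. A brief sketch (host and port are assumptions):

    import org.apache.spark.storage.StorageLevel
    import org.apache.spark.streaming.StreamingContext

    object RawStreamExample {
      def attach(ssc: StreamingContext): Unit = {
        // Each received block is already a serialized batch of String records.
        val raw = ssc.rawSocketStream[String]("localhost", 9999, StorageLevel.MEMORY_ONLY_SER)
        raw.count().print()
      }
    }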


    /**
     * Create an input stream that monitors a Hadoop-compatible filesystem
     * for new files and reads them using the given key-value types and input format.
     * Files must be written to the monitored directory by "moving" them from another
     * location within the same file system. File names starting with . are ignored.
     * @param directory HDFS directory to monitor for new files
     * @tparam K Key type for reading HDFS file
     * @tparam V Value type for reading HDFS file
     * @tparam F Input format for reading HDFS file
     */
    def fileStream[
      K: ClassTag,
      V: ClassTag,
      F <: NewInputFormat[K, V]: ClassTag
    ] (directory: String): InputDStream[(K, V)] = {
      new FileInputDStream[K, V, F](this, directory)
    }


    private[streaming]
    class TransformedDStream[U: ClassTag] (
        parents: Seq[DStream[_]],
        transformFunc: (Seq[RDD[_]], Time) => RDD[U]
      ) extends DStream[U](parents.head.ssc) {

      require(parents.length > 0, "List of DStreams to transform is empty")
      require(parents.map(_.ssc).distinct.size == 1, "Some of the DStreams have different contexts")
      require(parents.map(_.slideDuration).distinct.size == 1,
        "Some of the DStreams have different slide durations")

      override def dependencies = parents.toList

      override def slideDuration: Duration = parents.head.slideDuration

      override def compute(validTime: Time): Option[RDD[U]] = {
        val parentRDDs = parents.map(_.getOrCompute(validTime).orNull).toSeq
        Some(transformFunc(parentRDDs, validTime))
      }
    }
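
    TransformedDStream is what the public StreamingContext.transform (and DStream.transformWith) APIs construct. A sketch with two parent streams (the stream names are assumptions): for each batch, an arbitrary RDD computation combines the parents' RDDs.

    import org.apache.spark.rdd.RDD
    import org.apache.spark.streaming.{StreamingContext, Time}
    import org.apache.spark.streaming.dstream.DStream

    object MultiTransform {
      // For every batch, keep only the lines that occur in both input streams.
      def common(ssc: StreamingContext,
                 left: DStream[String],
                 right: DStream[String]): DStream[String] =
        ssc.transform(Seq(left, right), (rdds: Seq[RDD[_]], _: Time) => {
          val l = rdds(0).asInstanceOf[RDD[String]]
          val r = rdds(1).asInstanceOf[RDD[String]]
          l.intersection(r)
        })
    }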












  • Original article: https://www.cnblogs.com/zwCHAN/p/4274802.html