QueueStream (mainly used for experimentation)
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.mutable.Queue

object QueueStream {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("QueueStream")
    val sc = new SparkContext(sparkConf)
    // Create the context
    val ssc = new StreamingContext(sc, Seconds(1))

    // Create a queue holding RDD[Int] elements
    val rddQueue = new Queue[RDD[Int]]()

    // Create the QueueInputDStream; it receives and processes the RDDs in rddQueue
    val inputStream = ssc.queueStream(rddQueue)

    // Count how many elements fall into each residue class modulo 10
    val mappedStream = inputStream.map(x => (x % 10, 1))
    val reducedStream = mappedStream.reduceByKey(_ + _)
    reducedStream.print()
    ssc.start()

    // Push RDDs into the queue so the running streaming job picks them up batch by batch.
    // Sleeping between pushes (and before stopping) gives the batches time to actually run.
    for (i <- 1 to 10) {
      rddQueue.synchronized {
        rddQueue += ssc.sparkContext.makeRDD(1 to 1000, 10)
      }
      Thread.sleep(1000)
    }
    ssc.stop(false)
  }
}
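As a side note, queueStream also takes an oneAtATime flag: true (the default) dequeues exactly one RDD per batch interval, while false unions everything queued since the previous batch into a single batch. A minimal sketch of the latter; the object name and the push/sleep timings are only illustrative:

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.mutable.Queue

object QueueStreamOneAtATime {
  def main(args: Array[String]) {
    val ssc = new StreamingContext(new SparkConf().setAppName("QueueStreamOneAtATime"), Seconds(1))
    val rddQueue = new Queue[RDD[Int]]()
    // oneAtATime = false: each batch consumes all RDDs currently in the queue,
    // not just the one at the head
    val inputStream = ssc.queueStream(rddQueue, oneAtATime = false)
    inputStream.count().print()
    ssc.start()
    for (i <- 1 to 6) {
      rddQueue.synchronized {
        rddQueue += ssc.sparkContext.makeRDD(1 to 100, 4)
      }
      Thread.sleep(300) // push faster than the 1s batch interval
    }
    Thread.sleep(3000) // let the remaining batches drain before stopping
    ssc.stop(false)
  }
}

Because the RDDs are pushed faster than the batch interval, the printed per-batch counts grow with however many RDDs were queued since the previous batch, which makes the dequeue semantics easy to observe.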
HdfsFileStream
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Counts words in new text files created in the given directory.
 * 1. All files in the monitored directory must share the same format.
 * 2. Nested subdirectories are not supported.
 * 3. Once a file has been moved into the monitored directory it must not change;
 *    data appended to the file afterwards will not be read.
 *
 * spark-shell --master spark://master:7077 --total-executor-cores 4 --executor-cores 2
 * hadoop fs -copyFromLocal test1-process.txt hdfs://master:9999/user/hadoop-twq/spark-course/streaming/filestream
 */
object HdfsFileStream {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("HdfsFileStream")
    val sc = new SparkContext(sparkConf)
    // Create the context
    val ssc = new StreamingContext(sc, Seconds(2))

    val filePath = "hdfs://master:9999/user/hadoop-twq/spark-course/streaming/filestream"

    // Create the FileInputDStream on the directory and use the
    // stream to count words in new files created.
    // fileStream[LongWritable, Text, TextInputFormat]:
    //   key type LongWritable, value type Text, input format TextInputFormat
    // Arguments:
    //   filePath - the directory to monitor
    //   (path: Path) => path.toString.contains("process") - only paths accepted by
    //     this filter are processed
    //   newFilesOnly = false - files already visible when the streaming app starts
    //     are processed as well

    // Simpler form, without a filter:
    val linesWithText = ssc.textFileStream(filePath)

    val lines = ssc.fileStream[LongWritable, Text, TextInputFormat](
      filePath,
      (path: Path) => path.toString.contains("process"),
      false
    ).map(_._2.toString)

    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
    wordCounts.print()

    ssc.start()
    ssc.awaitTermination()
    ssc.stop(false)
  }
}

/* spark.streaming.fileStream.minRememberDuration = 60s (the default)
 * The HDFS cluster's clock must be in sync with the clock of the machine running the streaming app.
 * A new file is only processed if it becomes visible within the remember window.
 * Once a file is visible, its modification time must not change; content appended to it will not be read.
 */
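Because only files that become visible inside the remember window are processed, it can help to widen spark.streaming.fileStream.minRememberDuration when files land with a delay (e.g. slow copies into HDFS). A minimal sketch, reusing the directory from the example above; the 120s value is only an illustration:

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object FileStreamRememberWindow {
  def main(args: Array[String]) {
    val conf = new SparkConf()
      .setAppName("FileStreamRememberWindow")
      // Widen the remember window from its 60s default so files whose
      // visibility lags a little are still treated as new
      .set("spark.streaming.fileStream.minRememberDuration", "120s")
    val ssc = new StreamingContext(conf, Seconds(2))

    // Monitor the same directory as HdfsFileStream above
    val lines = ssc.textFileStream("hdfs://master:9999/user/hadoop-twq/spark-course/streaming/filestream")
    lines.count().print()

    ssc.start()
    ssc.awaitTermination()
  }
}

Note that widening the window only helps if the clocks of HDFS and the streaming machine are in sync, as stated above; otherwise the modification-time comparison that drives file discovery is unreliable.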