QueueStream (mainly used for experimentation)
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.mutable.Queue

object QueueStream {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("QueueStream")
    val sc = new SparkContext(sparkConf)
    // Create the context
    val ssc = new StreamingContext(sc, Seconds(1))

    // Create a queue holding RDD[Int] elements
    val rddQueue = new Queue[RDD[Int]]()

    // Create the QueueInputDStream; it receives and processes the RDDs in rddQueue
    val inputStream = ssc.queueStream(rddQueue)

    // Count how many elements fall into each residue class modulo 10
    val mappedStream = inputStream.map(x => (x % 10, 1))
    val reducedStream = mappedStream.reduceByKey(_ + _)
    reducedStream.print()
    ssc.start()

    // Push RDDs into the queue so the running streaming job picks them up batch by batch.
    // Sleeping between pushes (and before stopping) gives the batches time to actually run.
    for (i <- 1 to 10) {
      rddQueue.synchronized {
        rddQueue += ssc.sparkContext.makeRDD(1 to 1000, 10)
      }
      Thread.sleep(1000)
    }
    ssc.stop(false)
  }
}
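As a side note, queueStream also takes an oneAtATime flag: true (the default) dequeues exactly one RDD per batch interval, while false unions everything queued since the previous batch into a single batch. A minimal sketch of the latter; the object name and the push/sleep timings are only illustrative:

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.mutable.Queue

object QueueStreamOneAtATime {
  def main(args: Array[String]) {
    val ssc = new StreamingContext(new SparkConf().setAppName("QueueStreamOneAtATime"), Seconds(1))
    val rddQueue = new Queue[RDD[Int]]()
    // oneAtATime = false: each batch consumes all RDDs currently in the queue,
    // not just the one at the head
    val inputStream = ssc.queueStream(rddQueue, oneAtATime = false)
    inputStream.count().print()
    ssc.start()
    for (i <- 1 to 6) {
      rddQueue.synchronized {
        rddQueue += ssc.sparkContext.makeRDD(1 to 100, 4)
      }
      Thread.sleep(300) // push faster than the 1s batch interval
    }
    Thread.sleep(3000) // let the remaining batches drain before stopping
    ssc.stop(false)
  }
}

Because the RDDs are pushed faster than the batch interval, the printed per-batch counts grow with however many RDDs were queued since the previous batch, which makes the dequeue semantics easy to observe.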
HdfsFileStream
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Counts words in new text files created in the given directory.
 * 1. All files in the monitored directory must share the same format.
 * 2. Nested subdirectories are not supported.
 * 3. Once a file has been moved into the monitored directory it must not change;
 *    data appended to the file afterwards will not be read.
 *
 * spark-shell --master spark://master:7077 --total-executor-cores 4 --executor-cores 2
 * hadoop fs -copyFromLocal test1-process.txt hdfs://master:9999/user/hadoop-twq/spark-course/streaming/filestream
 */
object HdfsFileStream {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("HdfsFileStream")
    val sc = new SparkContext(sparkConf)
    // Create the context
    val ssc = new StreamingContext(sc, Seconds(2))

    val filePath = "hdfs://master:9999/user/hadoop-twq/spark-course/streaming/filestream"

    // Create the FileInputDStream on the directory and use the
    // stream to count words in new files created.
    // fileStream[LongWritable, Text, TextInputFormat]:
    //   key type LongWritable, value type Text, input format TextInputFormat
    // Arguments:
    //   filePath - the directory to monitor
    //   (path: Path) => path.toString.contains("process") - only paths accepted by
    //     this filter are processed
    //   newFilesOnly = false - files already visible when the streaming app starts
    //     are processed as well

    // Simpler form, without a filter:
    val linesWithText = ssc.textFileStream(filePath)

    val lines = ssc.fileStream[LongWritable, Text, TextInputFormat](
      filePath,
      (path: Path) => path.toString.contains("process"),
      false
    ).map(_._2.toString)

    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
    wordCounts.print()

    ssc.start()
    ssc.awaitTermination()
    ssc.stop(false)
  }
}

/* spark.streaming.fileStream.minRememberDuration = 60s (the default)
 * The HDFS cluster's clock must be in sync with the clock of the machine running the streaming app.
 * A new file is only processed if it becomes visible within the remember window.
 * Once a file is visible, its modification time must not change; content appended to it will not be read.
 */
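Because only files that become visible inside the remember window are processed, it can help to widen spark.streaming.fileStream.minRememberDuration when files land with a delay (e.g. slow copies into HDFS). A minimal sketch, reusing the directory from the example above; the 120s value is only an illustration:

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object FileStreamRememberWindow {
  def main(args: Array[String]) {
    val conf = new SparkConf()
      .setAppName("FileStreamRememberWindow")
      // Widen the remember window from its 60s default so files whose
      // visibility lags a little are still treated as new
      .set("spark.streaming.fileStream.minRememberDuration", "120s")
    val ssc = new StreamingContext(conf, Seconds(2))

    // Monitor the same directory as HdfsFileStream above
    val lines = ssc.textFileStream("hdfs://master:9999/user/hadoop-twq/spark-course/streaming/filestream")
    lines.count().print()

    ssc.start()
    ssc.awaitTermination()
  }
}

Note that widening the window only helps if the clocks of HDFS and the streaming machine are in sync, as stated above; otherwise the modification-time comparison that drives file discovery is unreliable.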