• spark本地体验WordCount


    一、流程分析

    二、代码部分

     1 package wc
     2 
     3 import org.apache.spark.rdd.RDD
     4 import org.apache.spark.{SparkConf, SparkContext}
     5 
     6 object Spark_WordCount {
     7 
     8   def main(args: Array[String]): Unit = {
     9     //建立和spark框架的连接
    10     val sparConf = new SparkConf().setMaster("local").setAppName("WordCount")
    11     val sc = new SparkContext(sparConf)
    12 
    13     //执行业务操作
    14     //1.读取文件,获取一行一行的数据
    15     //hello world
    16     val lines:RDD[String] = sc.textFile("datas")
    17 
    18     //2.将一行一行的数据进行拆分,形成一个一个的单词(分词)
    19     //扁平化:将整体拆分成个体的操作
    20     //"hello world"=>hello,world
    21     val words: RDD[String] = lines.flatMap(_.split(" "))
    22 
    23     //3.将数据根据单词进行分组,便于统计
    24     //(hello,hello,hello),(world,world)
    25     val wordGroup: RDD[(String, Iterable[String])] = words.groupBy(word => word)
    26 
    27     //4.对分组后的数据进行转换
    28     //(hello,hello,hello),(world,world)=>(hello,3),(world,2)
    29     val wordToCount = wordGroup.map{
    30       case (word,list)=>{
    31         (word,list.size)
    32       }
    33     }
    34 
    35     //5.将转换结果采集到控制台打印出来
    36     val array: Array[(String, Int)] = wordToCount.collect()
    37     array.foreach(println)
    38     //关闭连接
    39     sc.stop()
    40   }
    41 }

    三、运行结果

     -------------------------------------------------------------------------------------分割线----------------------------------------------------------------------------------------------

    另一种方式:

    一、流程变化

     二、代码

     1 package wc
     2 
     3 import org.apache.spark.rdd.RDD
     4 import org.apache.spark.{SparkConf, SparkContext}
     5 
     6 object Spark_WordCount02 {
     7 
     8   def main(args: Array[String]): Unit = {
     9 
    10     val sparConf = new SparkConf().setMaster("local").setAppName("WordCount")
    11     val sc = new SparkContext(sparConf)
    12 
    13     val lines:RDD[String] = sc.textFile("datas")
    14 
    15     val words: RDD[String] = lines.flatMap(_.split(" "))
    16 
    17     val wordToOne: RDD[(String, Int)] = words.map(
    18       word => (word, 1)
    19     )
    20 
    21     val wordGroup: RDD[(String, Iterable[(String, Int)])] = wordToOne.groupBy(
    22       t => t._1
    23     )
    24 
    25     val wordToCount: RDD[(String, Int)] = wordGroup.map {
    26       case (word, list) => {
    27         list.reduce(
    28           (t1, t2) => {
    29             (t1._1, t1._2 + t2._2)
    30           }
    31         )
    32       }
    33     }
    34     //5.将转换结果采集到控制台打印出来
    35     val array: Array[(String, Int)] = wordToCount.collect()
    36     array.foreach(println)
    37     //关闭连接
    38     sc.stop()
    39   }
    40 }

      -------------------------------------------------------------------------------------分割线----------------------------------------------------------------------------------------------

    spark提供了更多的功能,可以将分组和聚合用一个方法来实现
     1 package wc
     2 
     3 import org.apache.spark.rdd.RDD
     4 import org.apache.spark.{SparkConf, SparkContext}
     5 
     6 object Spark_WordCount03 {
     7 
     8   def main(args: Array[String]): Unit = {
     9 
    10     val sparConf = new SparkConf().setMaster("local").setAppName("WordCount")
    11     val sc = new SparkContext(sparConf)
    12 
    13     val lines:RDD[String] = sc.textFile("datas")
    14 
    15     val words: RDD[String] = lines.flatMap(_.split(" "))
    16 
    17     val wordToOne: RDD[(String, Int)] = words.map(
    18       word => (word, 1)
    19     )
    20 
    21     //spark提供了更多的功能,可以将分组和聚合用一个方法来实现
    22     //reduceByKey:对相同的key的数据,可以对value进行reduce整合
    23     val wordToCount=wordToOne.reduceByKey(_+_)
    24 
    25     val array: Array[(String, Int)] = wordToCount.collect()
    26     array.foreach(println)
    27     //关闭连接
    28     sc.stop()
    29   }
    30 }

     -------------------------------------------------------------------------------------分割线--------------------------------------------------------------------------------------------------------

    cmd中运行wordCount

    sc.textFile("data/word.txt").flatMap(_.split(" ")).map((_,1)).reduceByKey(_+_).collect

  • 相关阅读:
    js数组
    关于编程,程序员的一些语录
    css心得
    js函数
    一些电脑基础知识
    gnome3安装
    C学习小记
    ubuntu重装系统后
    elinks文字浏览器
    快捷方式
  • 原文地址:https://www.cnblogs.com/dd110343/p/14289241.html
Copyright © 2020-2023  润新知