• 大三寒假学习进度笔记(二十)—— 模型提升和Spark中WordCount的11种实现方法


    写在前面

    今天主要学习了机器学习十讲的第四讲,然后把SparkCore中的几种常用算子都学习完毕,用WordCount做了一个小总结。

    机器学习部分

    今天的学习中,首先系统地分析了模型误差出现的原因:

    用我自己理解的话说,模型空间限制了模型的表达能力,使得模型与真实数据之间存在一个客观的误差,叫做逼近误差。
    在了解了误差的存在原因后,我们就可以讨论如何去提升模型的表达能力了,即模型提升。今天的课中提到了模型集成和深度学习的方法,并对模型集成进行了详细讲解。具体的算法有决策树算法、随机森林算法以及AdaBoost算法。算法的具体解释我这里就不再赘述(以我的表达能力也很难讲解清楚算法)。今天的内容就这些了。

    Spark部分

    直接上代码,不废话

     // groupBy
      /**
       * Word count built with `groupBy`: identical words are grouped together and
       * the size of each group is taken as that word's count.
       *
       * The resulting RDD is only bound to a local value; it is neither returned
       * nor printed.
       *
       * @param sc Spark context used to create the sample RDD
       */
      def wordCount1(sc: SparkContext): Unit = {
        val lines: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark"))
        // Split every line on single spaces to obtain the individual words.
        val tokens: RDD[String] = lines.flatMap(line => line.split(" "))
        // Key each word by itself so equal words end up in the same group.
        val grouped: RDD[(String, Iterable[String])] = tokens.groupBy(token => token)
        val counts: RDD[(String, Int)] = grouped.mapValues(_.size)
      }
    
      // groupByKey
      /**
       * Word count built with `groupByKey`: every word is paired with 1, the
       * pairs are grouped by word, and the number of grouped values is the count.
       *
       * The resulting RDD is only bound to a local value; it is neither returned
       * nor printed.
       *
       * @param sc Spark context used to create the sample RDD
       */
      def wordCount2(sc: SparkContext): Unit = {
        val lines: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark"))
        val tokens: RDD[String] = lines.flatMap(line => line.split(" "))
        val pairs: RDD[(String, Int)] = tokens.map(token => (token, 1))
        val grouped: RDD[(String, Iterable[Int])] = pairs.groupByKey()
        // Each grouped value is a 1, so the group size equals the occurrence count.
        val counts: RDD[(String, Int)] = grouped.mapValues(_.size)
      }
    
      // reduceByKey
      /**
       * Word count built with `reduceByKey`: every word is paired with 1 and the
       * values of equal words are summed.
       *
       * The resulting RDD is only bound to a local value; it is neither returned
       * nor printed.
       *
       * @param sc Spark context used to create the sample RDD
       */
      def wordCount3(sc: SparkContext): Unit = {
        val lines: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark"))
        val tokens: RDD[String] = lines.flatMap(line => line.split(" "))
        val pairs: RDD[(String, Int)] = tokens.map(token => (token, 1))
        val counts: RDD[(String, Int)] = pairs.reduceByKey((left, right) => left + right)
      }
    
      // aggregateByKey
      /**
       * Word count built with `aggregateByKey`: starts from a zero value of 0 and
       * uses addition both for folding values in and for merging partial sums.
       *
       * The resulting RDD is only bound to a local value; it is neither returned
       * nor printed.
       *
       * @param sc Spark context used to create the sample RDD
       */
      def wordCount4(sc: SparkContext): Unit = {
        val lines: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark"))
        val tokens: RDD[String] = lines.flatMap(line => line.split(" "))
        val pairs: RDD[(String, Int)] = tokens.map(token => (token, 1))
        val counts: RDD[(String, Int)] = pairs.aggregateByKey(0)(
          (acc, value) => acc + value, // fold one value into the running sum
          (left, right) => left + right // merge two running sums
        )
      }
    
      // foldByKey
      /**
       * Word count built with `foldByKey`: starts from a zero value of 0 and sums
       * the values of equal words.
       *
       * The resulting RDD is only bound to a local value; it is neither returned
       * nor printed.
       *
       * @param sc Spark context used to create the sample RDD
       */
      def wordCount5(sc: SparkContext): Unit = {
        val lines: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark"))
        val tokens: RDD[String] = lines.flatMap(line => line.split(" "))
        val pairs: RDD[(String, Int)] = tokens.map(token => (token, 1))
        val counts: RDD[(String, Int)] = pairs.foldByKey(0)((left, right) => left + right)
      }
    
      // combineByKey
      /**
       * Word count built with `combineByKey`: the first value of a word is used
       * as-is, and addition is used both for folding further values in and for
       * merging partial sums.
       *
       * The resulting RDD is only bound to a local value; it is neither returned
       * nor printed.
       *
       * @param sc Spark context used to create the sample RDD
       */
      def wordCount6(sc: SparkContext): Unit = {
        val lines: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark"))
        val tokens: RDD[String] = lines.flatMap(line => line.split(" "))
        val pairs: RDD[(String, Int)] = tokens.map(token => (token, 1))

        // The three functions handed to combineByKey, named for readability.
        val createCombiner: Int => Int = first => first
        val mergeValue: (Int, Int) => Int = (acc, value) => acc + value
        val mergeCombiners: (Int, Int) => Int = (left, right) => left + right

        val counts: RDD[(String, Int)] =
          pairs.combineByKey(createCombiner, mergeValue, mergeCombiners)
      }
    
      // countByKey
      /**
       * Word count built with `countByKey`: every word is paired with 1 and
       * `countByKey` yields a map from word to its number of occurrences.
       *
       * The resulting map is only bound to a local value; it is neither returned
       * nor printed.
       *
       * @param sc Spark context used to create the sample RDD
       */
      def wordCount7(sc: SparkContext): Unit = {
        val lines: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark"))
        val tokens: RDD[String] = lines.flatMap(line => line.split(" "))
        val pairs: RDD[(String, Int)] = tokens.map(token => (token, 1))
        val counts: collection.Map[String, Long] = pairs.countByKey()
      }
    
      // countByValue
      /**
       * Word count built with `countByValue`: the words themselves are counted
       * directly, without first pairing them with 1.
       *
       * The resulting map is only bound to a local value; it is neither returned
       * nor printed.
       *
       * @param sc Spark context used to create the sample RDD
       */
      def wordCount8(sc: SparkContext): Unit = {
        val lines: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark"))
        val tokens: RDD[String] = lines.flatMap(line => line.split(" "))
        val counts: collection.Map[String, Long] = tokens.countByValue()
      }
    
    
      // reduce (aggregate and fold can merge the per-word maps in the same way)
      /**
       * Word count built with `reduce`: every word becomes a single-entry mutable
       * map (word -> 1), and the maps are merged pairwise by summing the counts
       * of shared words.
       *
       * The resulting map is only bound to a local value; it is neither returned
       * nor printed.
       *
       * @param sc Spark context used to create the sample RDD
       */
      def wordCount9(sc: SparkContext): Unit = {
        val lines: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark"))
        val tokens: RDD[String] = lines.flatMap(line => line.split(" "))
        // Wrap each word in its own one-entry map so the maps can be merged pairwise.
        val singletons: RDD[mutable.Map[String, Long]] =
          tokens.map(token => mutable.Map[String, Long](token -> 1L))
        val counts: mutable.Map[String, Long] = singletons.reduce { (acc, other) =>
          // Fold the entries of `other` into `acc`; `acc` is mutated and reused as the result.
          for ((word, count) <- other) {
            acc(word) = acc.getOrElse(word, 0L) + count
          }
          acc
        }
      }
    

    代码难度不大,都是可以看懂的。

    总结

    今天少见地听懂了机器学习中的内容,倒是让我很有成就感。SparkCore的部分也就告一段落了。

  • 相关阅读:
    AC日记——[ZJOI2012]网络 bzoj 2816
    [USACO08FEB]酒店Hotel 线段树
    divisors 数学
    Count on a tree 树上主席树
    STL备忘
    [TJOI2013]松鼠聚会 曼哈顿距离
    斐波那契数列 矩阵乘法优化DP
    [TJOI2013]奖学金 乱搞
    铁轨 清北学堂 线段树
    P3939 数颜色 线段树动态开点
  • 原文地址:https://www.cnblogs.com/wushenjiang/p/14347301.html
Copyright © 2020-2023  润新知