• spark04


    spark04

    join leftOuterJoin rightOuterJoin cogroup

    scala> var arr = Array(("zhangsan",200),("lisi",300),("wangwu",350))

    arr: Array[(String, Int)] = Array((zhangsan,200), (lisi,300), (wangwu,350))

    scala> var arr1 = Array(("zhangsan",10),("lisi",15),("zhaosi",20))

    arr1: Array[(String, Int)] = Array((zhangsan,10), (lisi,15), (zhaosi,20))

    scala> sc.makeRDD(arr,3)

    res0: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[0] at makeRDD at <console>:27

    scala> sc.makeRDD(arr1,3)

    res1: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[1] at makeRDD at <console>:27

    scala>

    scala>

    scala> res0 join res1

    res2: org.apache.spark.rdd.RDD[(String, (Int, Int))] = MapPartitionsRDD[4] at join at <console>:33

    scala> res2.mapValues(t=>t._1*t._2)

    res3: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[5] at mapValues at <console>:35

    scala> res3.collect

    res4: Array[(String, Int)] = Array((zhangsan,2000), (lisi,4500))                

    scala> res0 leftOuterJoin res1

    res5: org.apache.spark.rdd.RDD[(String, (Int, Option[Int]))] = MapPartitionsRDD[8] at leftOuterJoin at <console>:33

    scala> res5.mapValues(t=>t._1*t._2.getOrElse(0))

    res6: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[9] at mapValues at <console>:35

    scala> res6.collect

    res7: Array[(String, Int)] = Array((zhangsan,2000), (wangwu,0), (lisi,4500))    

    scala>

    scala> res0 rightOuterJoin res1

    res8: org.apache.spark.rdd.RDD[(String, (Option[Int], Int))] = MapPartitionsRDD[12] at rightOuterJoin at <console>:33

    scala> res8.mapValues(t=>t._1.getOrElse(0)*t._2)

    res9: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[13] at mapValues at <console>:35

    scala> res9.collect

    res10: Array[(String, Int)] = Array((zhangsan,2000), (lisi,4500), (zhaosi,0))   

    scala>

    scala> res0 cogroup res1

    res11: org.apache.spark.rdd.RDD[(String, (Iterable[Int], Iterable[Int]))] = MapPartitionsRDD[15] at cogroup at <console>:33

    scala> res11.mapValues(t=>t._1.sum*t._2.sum)

    res12: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[16] at mapValues at <console>:35

    scala> res12.collect

    res13: Array[(String, Int)] = Array((zhangsan,2000), (wangwu,0), (lisi,4500), (zhaosi,0))

    cartesian笛卡儿积

    scala> var arr = Array(1,2,3,4,5,6)

    arr: Array[Int] = Array(1, 2, 3, 4, 5, 6)

    scala> var arr1 = Array("a","b","c")

    arr1: Array[String] = Array(a, b, c)

    scala> sc.makeRDD(arr,3)

    res14: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[17] at makeRDD at <console>:27

    scala> sc.makeRDD(arr1,3)

    res15: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[18] at makeRDD at <console>:27

    scala> res14 cartesian res15

    res16: org.apache.spark.rdd.RDD[(Int, String)] = CartesianRDD[19] at cartesian at <console>:33

    scala> res16.collect

    res17: Array[(Int, String)] = Array((1,a), (2,a), (1,b), (2,b), (1,c), (2,c), (3,a), (4,a), (3,b), (4,b), (3,c), (4,c), (5,a), (6,a), (5,b), (6,b), (5,c), (6,c))

    repartition == coalesce

    修改分区

    scala> var arr = Array(1,2,3,4,5,6,7,8,9)

    arr: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7, 8, 9)

    scala> sc.makeRDD(arr,3)

    res18: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[20] at makeRDD at <console>:27

    scala> res18.partitions.size

    res19: Int = 3

    scala> res18.repartition(4)

    res20: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[24] at repartition at <console>:29

    scala> res20.partitions.size

    res21: Int = 4

    scala> res18.repartition(2)

    res22: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[28] at repartition at <console>:29

    scala> res22.partitions.size

    res23: Int = 2

    repartition任意的改变分区

    scala> res18.coalesce(2)

    res24: org.apache.spark.rdd.RDD[Int] = CoalescedRDD[29] at coalesce at <console>:29

    scala> res24.partitions.size

    res25: Int = 2

    scala> res18.coalesce(6)

    res26: org.apache.spark.rdd.RDD[Int] = CoalescedRDD[30] at coalesce at <console>:29

    scala> res26.partitions.size

    res27: Int = 3

    coalesce默认(shuffle=false)只能缩小分区,不能增加分区数量;需要增加分区时要用coalesce(n, shuffle = true)或者repartition(n)

     

    repartition底层调用的是coalesce,但是coalesce中加入的是shuffle=true

    所以repartition含有shuffle流程

     

    repartition存在shuffle

    repartition(n) = coalesce(n, shuffle = true)

    repartition存在shuffle   coalesce默认(shuffle=false)不存在shuffle

    最后一个rdd的分区数量才是这个阶段的task任务的个数

    scala> var arr = Array(1,2,3,4,5,6)

    arr: Array[Int] = Array(1, 2, 3, 4, 5, 6)

    scala> sc.makeRDD(arr,3)

    res35: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[35] at makeRDD at <console>:27

    scala> res35.repartition(4)

    res36: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[39] at repartition at <console>:29

    scala> res36.collect

    res37: Array[Int] = Array(6, 4, 2, 1, 3, 5)

    DAG有向无环图中,按照stage进行切分(shuffle流程),总共的task数量

    stage中的所有的task任务的总和,每个stage中最后一个rdd的分区数量的和

    aggregate  aggregateByKey

    scala> res41.aggregate(10)(_+_,_+_)

    res42: Int = 85

    scala> res41.aggregate(0)(_+_,_+_)

    res43: Int = 45

    每个分区单独加一次,整体聚合加一次

    第二个聚合函数是全体聚合

    scala> var arr = Array(("a",1),("a",2),("a",5),("a",6),("b",3),("b",4))

    arr: Array[(String, Int)] = Array((a,1), (a,2), (a,5), (a,6), (b,3), (b,4))

    scala> sc.makeRDD(arr,2)

    res45: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[43] at makeRDD at <console>:27

    scala> res45.aggregateByKey(0)(_+_,_+_)

    res46: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[44] at aggregateByKey at <console>:29

    scala> res46.collect

    res47: Array[(String, Int)] = Array((b,7), (a,14))

    scala> res45.aggregateByKey(10)(_+_,_+_)

    res48: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[45] at aggregateByKey at <console>:29

    scala> res48.collect

    res49: Array[(String, Int)] = Array((b,17), (a,34))     

    aggregateByKey()按key进行聚合:初始值在每个分区内对每个key各加一次(上例中a出现在2个分区,所以14+10×2=34;b只出现在1个分区,所以7+10=17),分区之间的全局聚合不再加初始值

    aggregate是行动类算子 aggregateByKey是转换类算子

    算子分为两类

    行动类和转换类算子

    行动类算子立即执行  转换类算子是惰性的,遇到行动类算子才会真正执行  转换类算子又分为含有shuffle和没有shuffle两种

    如果是shuffle类的算子会切分阶段,没有shuffle的算子都处于一个阶段

    RDD的五大特性(重点)

     

    五大特性

     

    1. 每个rdd都是默认被分区的。含有一系列的分区列表
    2. 专门存在一个函数用来处理每一个rdd的分区
    3. 每个rdd之间存在依赖关系
    4. key-value键值对的形式的rdd上面存在可选择的分区器
    5. 优先位置进行计算每一个分区/分片

     

     

    分区是一个特质,其中存在index下标  hashcode方法 equals方法

    分区就是一个rdd上面的分岔路,每个路口流动的数据要交给一个task线程进行处理

    每个rdd都会存在一个或者多个分区,读取hdfs文件的时候,每个分区对应的是一个block块,其实分区就是记录了数据的位置在哪里,每个分区应该从哪个block中读取数据。

    每个分区交给一个executor的一个线程处理

     

     

    compute函数

     

     

    每个rdd上面都会存在一个compute方法专门来计算每一个分区中的数据

    所有的算子进行数据处理的时候都会交给compute方法进行统一计算

     


    compute方法可以统一处理每一个算子中的逻辑,用compute使用算子中的函数,将rdd每个分区中的数据进行迭代处理

     

    每个rdd之间存在依赖关系

    依赖关系存在两种,一对一的(pipeline) 多对多shuffle

    什么是依赖关系?算子

     

    每个rdd上面都存在一个获取依赖关系的函数getDependencies

     

    根据两个rdd之间调用的算子不一样产生的依赖关系主要分为两种

    宽依赖和窄依赖

    存在shuffle的就是宽依赖  没有shuffle的,一对一的就是窄依赖

    窄依赖存在两种关系  OneToOne  RangeDependency

     

     

    OneToOneDependency一对一的依赖  map mapValues flatMap...

    RangeDependency 用于union(UnionRDD);sortByKey存在shuffle,属于宽依赖(ShuffleDependency),并不是RangeDependency

     

    比如map算子

     

     

     

     

    可以看出map算子就是窄依赖

     

    宽依赖shuffleDependency

     

     

     

     

    最后可以得到reduceByKey产生的依赖是ShuffleDependency

    可选择的分区器partitioner:只有在k-v类型的rdd上面才可能存在分区器,默认的rdd上面不存在分区器

     

    分区器自带得主要分为两种,hashPartitioner  rangePartitioner

    reduceByKey  groupByKey  distinct 使用得都是hashPartitioner

    sortByKey 使用的是rangePartitioner,依赖关系同样是宽依赖shuffleDependency(不是rangeDependency)

     

    rdd上面自带得属性,分区器partitioner,默认是None

    scala> sc.makeRDD(Array(1,2,3,4,5,6,7,8,9),3)

    res0: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[0] at makeRDD at <console>:25

    scala> res0.partitioner

    res1: Option[org.apache.spark.Partitioner] = None

    scala>

    scala> sc.makeRDD(Array(("a",1),("a",1),("b",1),("b",1)))

    res2: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[1] at makeRDD at <console>:25

    scala> res2.partitioner

    res3: Option[org.apache.spark.Partitioner] = None

    scala> res2.reduceByKey(_+_)

    res5: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[2] at reduceByKey at <console>:27

    scala> res5.partitioner

    res6: Option[org.apache.spark.Partitioner] = Some(org.apache.spark.HashPartitioner@18)

    scala> res5.sortByKey()

    res7: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[5] at sortByKey at <console>:29

    scala> res7.partitioner

    res8: Option[org.apache.spark.Partitioner] = Some(org.apache.spark.RangePartitioner@1f12d)

    reduceByKey使用的是HashPartitioner所以两个rdd之间得依赖宽依赖shuffleDependency

    sortByKey使用的是RangePartitioner,同样需要shuffle(上面得到的是ShuffledRDD),所以依赖关系也是宽依赖ShuffleDependency,而不是RangeDependency

    scala> var arr = Array(1,2,3,4,5,6,7,8,9)

    arr: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7, 8, 9)

    scala> arr.zipWithIndex

    res9: Array[(Int, Int)] = Array((1,0), (2,1), (3,2), (4,3), (5,4), (6,5), (7,6), (8,7), (9,8))

    scala> sc.makeRDD(res9,3)

    res10: org.apache.spark.rdd.RDD[(Int, Int)] = ParallelCollectionRDD[6] at makeRDD at <console>:29

    scala> res10.reduceByKey(_+_)

    res11: org.apache.spark.rdd.RDD[(Int, Int)] = ShuffledRDD[7] at reduceByKey at <console>:31

    scala> res11.mapPartitionsWithIndex((a,b)=>b.map((a,_)))

    res12: org.apache.spark.rdd.RDD[(Int, (Int, Int))] = MapPartitionsRDD[8] at mapPartitionsWithIndex at <console>:33

    scala> res12.collect

    res13: Array[(Int, (Int, Int))] = Array((0,(6,5)), (0,(3,2)), (0,(9,8)), (1,(4,3)), (1,(1,0)), (1,(7,6)), (2,(8,7)), (2,(5,4)), (2,(2,1)))

    reduceByKey使用得hashpartitioner

    scala> var arr = Array(1,2,10000,20000,30000,40000,5000,60000,70000,8,9000000)

    arr: Array[Int] = Array(1, 2, 10000, 20000, 30000, 40000, 5000, 60000, 70000, 8, 9000000)

    scala> sc.makeRDD(arr,3)

    res17: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[13] at makeRDD at <console>:27

    scala> res17.zipWithIndex

    res18: org.apache.spark.rdd.RDD[(Int, Long)] = ZippedWithIndexRDD[14] at zipWithIndex at <console>:29

    scala> res18.sortByKey()

    res19: org.apache.spark.rdd.RDD[(Int, Long)] = ShuffledRDD[17] at sortByKey at <console>:31

    scala> res19.mapPartitionsWithIndex((a,b)=>b.map((a,_)))

    res20: org.apache.spark.rdd.RDD[(Int, (Int, Long))] = MapPartitionsRDD[18] at mapPartitionsWithIndex at <console>:33

    scala> res20.collect

    res21: Array[(Int, (Int, Long))] = Array((0,(1,0)), (0,(2,1)), (0,(8,9)), (0,(5000,6)), (1,(10000,2)), (1,(20000,3)), (1,(30000,4)), (1,(40000,5)), (2,(60000,7)), (2,(70000,8)), (2,(9000000,10)))

    RangePartitioner尽量保证了数据在数值的范围和数值的个数两个因素上面分布平均

    scala> res18.sortByKey(true,20)

    res23: org.apache.spark.rdd.RDD[(Int, Long)] = ShuffledRDD[21] at sortByKey at <console>:31

    scala> res23.partitions.size

    res24: Int = 12

    sortByKey使用的是RangePartitioner,重新分区的时候,如果指定的分区数比元素的个数还要大,那么指定的分区数就不能完全起作用:实际的分区数由采样得到的边界个数决定(上例11个元素,指定20个分区,实际只得到12个分区),目的是尽量避免产生大量的空分区

    优先位置

     

    rdd === aa.txt= ===hdfs  ===== 2 blk ==== 2 partition ===每个分区得读取文件得位置

    找到对应得blk块得位置,在本地进行运算

    移动计算比移动数据本身更划算

    以上为五个特性(重点)

    分区列表

    一个compute方法用于计算

    依赖关系

    k-vrdd上存在一个可选择得分区器

    优先位置进行计算

    自定义分区器:

    /**
     * Word-count job that writes its result through a custom partitioner.
     *
     * Reads "aa.txt", splits each line on single spaces, counts the occurrences
     * of every word, repartitions the counts with [[MyPartitioner]] and saves
     * them as text files under "wcresult".
     */
    object WordCountPartition {
      def main(args: Array[String]): Unit = {
        val conf = new SparkConf()
          .setMaster("local[*]")
          .setAppName("wc")
        val sc = new SparkContext(conf)
        try {
          sc.textFile("aa.txt")
            .flatMap(_.split(" "))
            .map((_, 1))
            .reduceByKey(_ + _)
            // Redistribute the (word, count) pairs according to MyPartitioner,
            // so the output directory contains one part file per partition.
            .partitionBy(new MyPartitioner)
            .saveAsTextFile("wcresult")
        } finally {
          // Always release the context, even when the job fails.
          sc.stop()
        }
      }
    }

    /**
     * Partitioner with exactly two partitions: the key "hello" is sent to
     * partition 0 and every other key (including null and non-String keys)
     * is sent to partition 1.
     */
    class MyPartitioner extends Partitioner {
      override def numPartitions: Int = 2

      /**
       * @param key the record key
       * @return 0 when the key equals "hello", otherwise 1
       */
      override def getPartition(key: Any): Int = key match {
        // Matching on the literal avoids the unchecked cast, so a null or
        // non-String key no longer throws and simply falls into partition 1.
        case "hello" => 0
        case _       => 1
      }

      // All instances partition identically, so they compare equal; this lets
      // Spark recognise RDDs that are already partitioned by this partitioner.
      override def equals(other: Any): Boolean = other match {
        case _: MyPartitioner => true
        case _                => false
      }

      override def hashCode(): Int = numPartitions
    }

    老师得访问量,专业得topN,将每个老师对应得专业进行分区,每个分区中都是一个专业得全部老师,然后将分区中得数据进行排序,就可以得到topN

    自定partitioner 然后mapPartitions每次遍历一个分区中得数据,得到topN

    作业:

    分区器得形式进行教师得专业排名?

    spark版本得电影推荐算法,每个人最喜欢得类型?

    wordcount中得rdd个数

    通过println(rdd.toDebugString)

     

    本地模式是5rdd

     

    wordcount中存在6rdd

  • 相关阅读:
    【项目 · Wonderland】UML设计
    【项目 · Wonderland】预则立 && 他山之石
    【项目 · Wonderland】需求规格说明书 · 终版
    【项目 · 学生部门互选系统】项目展示
    【项目 · Wonderland】立项报告
    React 封装Form表单组件
    前端算法题解析 《四》
    前端算法题解析 《三》
    前端算法题解析 《二》
    前端算法题解析 《一》
  • 原文地址:https://www.cnblogs.com/JBLi/p/11527320.html
Copyright © 2020-2023  润新知