• RDD transformation operations (continued)


    1.mapValues[Pair]
      def mapValues[U](f: V => U): RDD[(K, U)]
      Description: transforms an RDD[(K, V)] into an RDD[(K, U)] by applying (f: V => U) to each value; keys are left unchanged
      val a = sc.parallelize(List("dog", "tiger", "lion", "cat", "panther", "eagle"), 2)
      val b = a.map(x => (x.length, x))
      b.mapValues("x" + _ + "x").collect
      res5: Array[(Int, String)] = Array((3,xdogx), (5,xtigerx), (4,xlionx), (3,xcatx), (7,xpantherx), (5,xeaglex))
    2.flatMapValues[Pair]
      def flatMapValues[U](f: V => TraversableOnce[U]): RDD[(K, U)]
      Description: like mapValues, but f returns a collection for each value; the results are flattened, so each element is paired with the original key
      val a = sc.parallelize(List("dog", "tiger", "lion", "cat", "panther", "eagle"), 2)
      val b = a.map(x => (x.length, x))
      b.flatMapValues("x" + _ + "x").collect
      res6: Array[(Int, Char)] = Array((3,x), (3,d), (3,o), (3,g), (3,x), (5,x), (5,t), (5,i), (5,g), (5,e), (5,r), (5,x), (4,x), (4,l), (4,i), (4,o), (4,n), (4,x), (3,x), (3,c), (3,a), (3,t), (3,x), (7,x), (7,p), (7,a), (7,n), (7,t), (7,h), (7,e), (7,r), (7,x), (5,x), (5,e), (5,a), (5,g), (5,l), (5,e), (5,x))
    3.subtractByKey[Pair]
      def subtractByKey[W: ClassTag](other: RDD[(K, W)]): RDD[(K, V)]
      Description: removes from this RDD every element whose key also appears in the other RDD
      val a = sc.parallelize(List("dog", "tiger", "lion", "cat", "spider", "eagle"), 2)
      val b = a.keyBy(_.length)
      val c = sc.parallelize(List("ant", "falcon", "squid"), 2)
      val d = c.keyBy(_.length)
      b.subtractByKey(d).collect
      res15: Array[(Int, String)] = Array((4,lion))

    4.combineByKey[Pair]
      def combineByKey[C](createCombiner: V => C, mergeValue: (C, V) => C, mergeCombiners: (C, C) => C): RDD[(K, C)]
      Description: createCombiner: invoked the first time a key appears within a partition
         mergeValue: invoked for each subsequent occurrence of that key within the same partition
         mergeCombiners: invoked to merge the combined values for the same key across different partitions
      Examples:
        When the RDD has a single partition:
          scala> var rdd1 = sc.makeRDD(Array(("A",1),("A",2),("B",1),("B",2),("C",1)))
          rdd1: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[0] at makeRDD at <console>:24

          scala> rdd1.combineByKey(x=>x+"_",(x:String,y:Int)=>x+"@"+y,(x:String,y:String)=>x+"$"+y)
          res0: org.apache.spark.rdd.RDD[(String, String)] = ShuffledRDD[1] at combineByKey at <console>:27

          scala> res0.collect
          res1: Array[(String, String)] = Array((B,1_@2), (A,1_@2), (C,1_))
        When the RDD has two partitions:
          scala> val rdd2 = rdd1.repartition(2)
          rdd2: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[5] at repartition at <console>:26

          scala> rdd2.partitions.size
          res2: Int = 2

          scala> rdd2.glom.collect
          res3: Array[Array[(String, Int)]] = Array(Array((A,1), (B,1), (C,1)), Array((A,2), (B,2)))

          scala> rdd2.combineByKey(x=>x+"_",(x:String,y:Int)=>x+"@"+y,(x:String,y:String)=>x+"$"+y)
          res4: org.apache.spark.rdd.RDD[(String, String)] = ShuffledRDD[7] at combineByKey at <console>:29

          scala> res4.collect
          res6: Array[(String, String)] = Array((B,1_$2_), (A,1_$2_), (C,1_))
        When the RDD has three partitions:
          scala> val rdd3 = rdd1.partitionBy(new org.apache.spark.HashPartitioner(3))
          rdd3: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[8] at partitionBy at <console>:26

          scala> rdd3.partitions.size
          res7: Int = 3

          scala> rdd3.glom.collect
          res8: Array[Array[(String, Int)]] = Array(Array((B,1), (B,2)), Array((C,1)), Array((A,1), (A,2)))

          scala> rdd3.combineByKey(x=>x+"_",(x:String,y:Int)=>x+"@"+y,(x:String,y:String)=>x+"$"+y)
          res9: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[10] at combineByKey at <console>:29

          scala> res9.collect
          res10: Array[(String, String)] = Array((B,1_@2), (C,1_), (A,1_@2))
        Another example, collecting values into per-key lists:
          val a = sc.parallelize(List("dog","cat","gnu","salmon","rabbit","turkey","wolf","bear","bee"), 3)
          val b = sc.parallelize(List(1,1,2,2,2,1,2,2,2), 3)
          val c = b.zip(a)
          val d = c.combineByKey(List(_), (x:List[String], y:String) => y :: x, (x:List[String], y:List[String]) => x ::: y)
          d.collect
          res16: Array[(Int, List[String])] = Array((1,List(cat, dog, turkey)), (2,List(gnu, rabbit, salmon, bee, bear, wolf)))

    5.foldByKey[Pair]
      def foldByKey(zeroValue: V)(func: (V, V) => V): RDD[(K, V)]
      Description: similar to reduceByKey, but curried, so an initial zeroValue must be supplied first
      val a = sc.parallelize(List("dog", "cat", "owl", "gnu", "ant"), 2)
      val b = a.map(x => (x.length, x))
      b.foldByKey("")(_ + _).collect
      res84: Array[(Int, String)] = Array((3,dogcatowlgnuant))

      val a = sc.parallelize(List("dog", "tiger", "lion", "cat", "panther", "eagle"), 2)
      val b = a.map(x => (x.length, x))
      b.foldByKey("")(_ + _).collect
      res85: Array[(Int, String)] = Array((4,lion), (3,dogcat), (7,panther), (5,tigereagle))
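
      Note that zeroValue is folded in once per key per partition, not once overall, so a non-empty zeroValue can appear multiple times in the result. A minimal sketch (hypothetical run; the exact string depends on how the 2 partitions split the data, and order may vary):
      val a = sc.parallelize(List("dog", "cat", "owl", "gnu", "ant"), 2)
      val b = a.map(x => (x.length, x))
      b.foldByKey("x")(_ + _).collect
      // each partition contributes its own leading "x" for key 3, e.g.:
      // Array((3,xdogcatxowlgnuant))
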
    6.reduceByKeyLocally (an action)
      def reduceByKeyLocally(func: (V, V) => V): Map[K, V]
      Description: like reduceByKey, but an action that returns the merged result to the driver as a Map rather than an RDD, so it is only suitable when the set of distinct keys is small
      scala> val a = sc.parallelize(List("dog", "cat", "owl", "gnu", "ant"), 2)
      a: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[0] at parallelize at <console>:24

      scala> val b = a.map(x => (x.length, x))
      b: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[1] at map at <console>:26

      scala> b.reduceByKeyLocally(_+_)
      res0: scala.collection.Map[Int,String] = Map(3 -> dogcatowlgnuant)

    7.join
      def join[W](other: RDD[(K, W)]): RDD[(K, (V, W))]
      Description: performs an inner join of two pair RDDs on their keys
      val a = sc.parallelize(List("dog", "salmon", "salmon", "rat", "elephant"), 3)
      val b = a.keyBy(_.length)
      val c = sc.parallelize(List("dog","cat","gnu","salmon","rabbit","turkey","wolf","bear","bee"), 3)
      val d = c.keyBy(_.length)
      b.join(d).collect
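      // only keys present in both RDDs survive (here 3 and 6); elephant (length 8)
      // and wolf/bear (length 4) are dropped. Sample pairs, order may vary:
      // (3,(dog,cat)), (3,(rat,gnu)), (6,(salmon,turkey)), ...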
    8.rightOuterJoin
      def rightOuterJoin[W](other: RDD[(K, W)]): RDD[(K, (Option[V], W))]
      Description: joins two RDDs keeping every key of the second (other) RDD; keys missing from the first RDD pair with None (right outer join)
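      A minimal sketch reusing b and d from the join example above (output order may vary):
      b.rightOuterJoin(d).collect
      // every key of d is kept; length 4 occurs only in d, so wolf and bear pair with None:
      // (4,(None,wolf)), (4,(None,bear)), (3,(Some(dog),cat)), (6,(Some(salmon),turkey)), ...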
    9.leftOuterJoin
      def leftOuterJoin[W](other: RDD[(K, W)]): RDD[(K, (V, Option[W]))]
      Description: joins two RDDs keeping every key of the first RDD; keys missing from the other RDD pair with None (left outer join)
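      Again with b and d (output order may vary):
      b.leftOuterJoin(d).collect
      // every key of b is kept; length 8 occurs only in b, so elephant pairs with None:
      // (8,(elephant,None)), (3,(dog,Some(cat))), (6,(salmon,Some(turkey))), ...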
    10.cogroup
      def cogroup[W](other: RDD[(K, W)]): RDD[(K, (Iterable[V], Iterable[W]))]
      Description: groups the values sharing the same key from both RDDs together; every key from either RDD appears in the result, like a full outer join
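      With b and d once more (output order may vary):
      b.cogroup(d).collect
      // one entry per key holding the values from each side; an empty Iterable marks a key
      // absent from that RDD, e.g.:
      // (8,(CompactBuffer(elephant),CompactBuffer())), (4,(CompactBuffer(),CompactBuffer(wolf, bear))), ...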
