1、countByValue
scala> val a = sc.parallelize(List(1,2,3,4,5,2,3,1,1,2))
a: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[0] at parallelize at <console>:27

scala> val b = a.countByValue()
b: scala.collection.Map[Int,Long] = Map(5 -> 1, 1 -> 3, 2 -> 3, 3 -> 2, 4 -> 1)

scala> b.foreach(println)
(5,1)
(1,3)
(2,3)
(3,2)
(4,1)
2、zip
scala> val a = sc.parallelize(List(1,2,3,4,5))
a: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[4] at parallelize at <console>:27

scala> val b = sc.parallelize(List('a','b','c','d','e'))
b: org.apache.spark.rdd.RDD[Char] = ParallelCollectionRDD[5] at parallelize at <console>:27

scala> val c = a.zip(b)
c: org.apache.spark.rdd.RDD[(Int, Char)] = ZippedPartitionsRDD2[6] at zip at <console>:31

// Note: RDD.foreach runs on each partition independently, so the print order below is not
// guaranteed; use c.collect().foreach(println) to print the pairs in element order.
scala> c.foreach(println)
(2,b)
(3,c)
(5,e)
(1,a)
(4,d)