union/intersection/subtract:
union merges two RDDs without deduplicating; intersection returns the elements common to both RDDs, deduplicated; subtract returns the elements that appear in the first RDD but not in the second, without deduplicating. The demo below exercises all three:
import org.apache.spark.SparkContext
import org.apache.spark.sql.SparkSession

object TransformationsDemo {
  def main(args: Array[String]): Unit = {
    val sparkSession = SparkSession.builder()
      .appName("TransformationsDemo")
      .master("local[1]")
      .getOrCreate()
    val sc = sparkSession.sparkContext
    testUnion(sc)
    testIntersection(sc)
    testSubtract(sc)
    sparkSession.stop()
  }

  private def testUnion(sc: SparkContext): Unit = {
    val rdd1 = sc.parallelize(1 to 3, 1)
    val rdd2 = sc.parallelize(3 to 5, 1)
    // union merges the two RDDs; duplicates are kept.
    rdd1.union(rdd2).collect().foreach(println)

    // Elements can be any type, e.g. lists, which compare by value.
    val rdd3 = sc.parallelize(List(List(1, 2, 3), List(4, 5, 6)), 1)
    val rdd4 = sc.parallelize(List(List(4, 5, 6), List(7, 8, 9)), 1)
    rdd3.union(rdd4).collect().foreach(println)
  }

  private def testIntersection(sc: SparkContext): Unit = {
    val rdd1 = sc.parallelize(1 to 2, 1)
    val rdd2 = sc.parallelize(3 to 5, 1)
    // intersection returns the elements common to both RDDs, deduplicated.
    // Here the two ranges do not overlap, so nothing is printed.
    rdd1.intersection(rdd2).collect().foreach(println)
    // The optional numPartitions argument controls the result's partitioning.
    println(s"partitions: ${rdd1.intersection(rdd2, 1).partitions.size}")
    println(s"partitions: ${rdd1.intersection(rdd2, 2).partitions.size}")

    val rdd3 = sc.parallelize(List(List(1, 2, 3), List(4, 5, 6)), 1)
    val rdd4 = sc.parallelize(List(List(4, 5, 6), List(7, 8, 9)), 1)
    rdd3.intersection(rdd4).collect().foreach(println)
  }

  private def testSubtract(sc: SparkContext): Unit = {
    val rdd1 = sc.parallelize(1 to 3, 1)
    val rdd2 = sc.parallelize(3 to 5, 1)
    // subtract returns the elements of this RDD that do not appear in the
    // other RDD; duplicates are kept.
    rdd1.subtract(rdd2).collect().foreach(println)
    // subtract also takes an optional numPartitions argument.
    println(s"partitions: ${rdd1.subtract(rdd2, 1).partitions.size}")
    println(s"partitions: ${rdd1.subtract(rdd2, 2).partitions.size}")

    val rdd3 = sc.parallelize(List(List(1, 2, 3), List(4, 5, 6)), 1)
    val rdd4 = sc.parallelize(List(List(4, 5, 6), List(7, 8, 9)), 1)
    rdd3.subtract(rdd4).collect().foreach(println)
  }
}
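A side note on partitioning, which the partitions: lines above hint at: union is a narrow transformation, so the result simply carries the sum of the inputs' partitions, while intersection and subtract shuffle the data and therefore accept an optional numPartitions argument. A minimal sketch (my own addition, assuming the same sc as above):

val a = sc.parallelize(1 to 3, 2)
val b = sc.parallelize(4 to 6, 3)
// No shuffle happens here, so the union has 2 + 3 = 5 partitions.
println(a.union(b).partitions.size) // 5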
Run results:
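Running the demo locally should print roughly the following. Note that the intersection of 1 to 2 and 3 to 5 is empty, so it prints nothing, and that element order after shuffle-based transformations such as intersection and subtract is not guaranteed:

1
2
3
3
4
5
List(1, 2, 3)
List(4, 5, 6)
List(4, 5, 6)
List(7, 8, 9)
partitions: 1
partitions: 2
List(4, 5, 6)
1
2
partitions: 1
partitions: 2
List(1, 2, 3)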