/** * # _*_ coding:utf-8 _*_ * # Author:xiaoshubiao * # Time : 2020/5/14 8:33 **/ import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function2; import scala.Tuple2; import java.util.ArrayList; import java.util.Arrays; import java.util.List; public class union_test { public static void main(String[] args) { SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("union_test"); JavaSparkContext sc = new JavaSparkContext(conf); List<String> list = Arrays.asList("a","b","c","a"); JavaRDD<String> parallelize = sc.parallelize(list,2); JavaPairRDD javaPairRDD = parallelize.mapToPair(x -> new Tuple2(x, 1)); javaPairRDD.groupByKey().collect().forEach(x->System.out.println(x)); // reduceBykey 没有默认值,在某个key只有一条数据的时候是不会执行该函数的,所以当前情况下“执行”只打印了一次。 // 最终执行的结果是(b,1) (a,执行) (c,1) 并不是(b,执行) (a,执行) (c,执行) javaPairRDD.reduceByKey( new Function2<Integer, Integer, String>() { @Override public String call(Integer integer, Integer integer2) throws Exception { System.out.println("执行"); return "执行"; } },2).collect().forEach(x->System.out.println(x)); // aggregateBykey 参数1:进行分区内计算的默认值 // 参数2:分区内计算逻辑 //参数3:分区间计算逻辑 javaPairRDD.aggregateByKey(1, new Function2<Integer,Integer,Integer>() { @Override public Integer call(Integer o, Integer o2) throws Exception { System.out.println("分区内"); return o+o2; } }, new Function2<Integer,Integer,Integer>() { @Override public Integer call(Integer o, Integer o2) throws Exception { System.out.println("分区间"); return o+o2; } }).collect().forEach(x->System.out.println(x)); } }