spark pair转化操作
Spark 的 Pair RDD 是一种由键值对(key-value)组成的 RDD,常见的转化函数如下图
下面以单词统计(word count)为例进行说明
使用mapToPair方法,返回二元组
// Turn every word into a (word, 1) tuple so the counts can later be summed per key.
JavaPairRDD<String, Integer> wordPairs = words.mapToPair(
        new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String word) throws Exception {
                // Each occurrence of a word contributes a count of one.
                return new Tuple2<>(word, 1);
            }
        });
使用reduceByKey方法归并相同的键,把同一个键对应的值相加,得到每个单词出现的次数
// Merge all tuples that share a key by adding their counts together.
JavaPairRDD<String, Integer> wordredues = wordPairs.reduceByKey(
        new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer left, Integer right) throws Exception {
                // Combine two partial counts for the same word.
                return left + right;
            }
        });
调用Pair RDD自带的sortByKey函数,按键(即单词)对统计结果排序
// Default sort test: sortByKey(true) sorts ascending; passing false sorts descending.
System.out.println("test sort:");
// A lambda is used instead of System.out::println on purpose: the method reference
// would capture the PrintStream instance, which is not serializable.
wordredues
        .sortByKey(true)
        .foreach(entry -> System.out.println(entry));
例子源码
package com.learn.hadoop.spark.doc.analysis.chpater.rdd;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Iterator;
/**
 * Word-count example built on Spark pair RDDs.
 *
 * <p>The program splits a few in-memory sentences into words, maps every word
 * to a {@code (word, 1)} pair, sums the counts per word with
 * {@code reduceByKey}, and prints the intermediate and final results,
 * including a listing sorted by key.
 */
public class RddTest03 {

    /**
     * Runs the word-count pipeline on a local Spark master and prints each stage.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("RddTest03");

        // JavaSparkContext is Closeable; try-with-resources guarantees the context is
        // shut down even if one of the jobs below throws.
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<String> rdd = sc.parallelize(Arrays.asList(
                    "welcome", "welcome hell world", "welcome python world", "welcome java world"));

            // Split every sentence on single spaces and flatten the result into one RDD of words.
            JavaRDD<String> words = rdd.flatMap(new FlatMapFunction<String, String>() {
                @Override
                public Iterator<String> call(String s) throws Exception {
                    return Arrays.asList(s.split(" ")).iterator();
                }
            });

            // Print every word.
            System.out.println("console all word");
            words.foreach(s -> System.out.println(s));

            // Map each word to a (word, 1) tuple.
            JavaPairRDD<String, Integer> wordPairs = words.mapToPair(
                    new PairFunction<String, String, Integer>() {
                        @Override
                        public Tuple2<String, Integer> call(String s) throws Exception {
                            return new Tuple2<>(s, 1);
                        }
                    });

            // Print every pair; the anonymous VoidFunction is the long form of a lambda.
            System.out.println("console all pair");
            wordPairs.foreach(new VoidFunction<Tuple2<String, Integer>>() {
                @Override
                public void call(Tuple2<String, Integer> stringIntegerTuple2) throws Exception {
                    System.out.println(stringIntegerTuple2);
                }
            });

            // Reduce: add up the counts of tuples that share the same key.
            JavaPairRDD<String, Integer> wordredues = wordPairs.reduceByKey(
                    new Function2<Integer, Integer, Integer>() {
                        @Override
                        public Integer call(Integer integer, Integer integer2) throws Exception {
                            return integer + integer2;
                        }
                    });

            // Print the per-word totals (unsorted).
            System.out.println("console all");
            wordredues.foreach(stringIntegerTuple2 -> System.out.println(stringIntegerTuple2));

            // TODO: sorting with a custom Comparator via sortByKey(Comparator) is not
            // implemented yet.
            // NOTE(review): the comparator presumably has to be Serializable as well so
            // Spark can ship it to executors - confirm before enabling.

            // Default sort test: sortByKey(true) is ascending; pass false for descending.
            System.out.println("test sort:");
            wordredues.sortByKey(true)
                    .foreach(stringIntegerTuple2 -> System.out.println(stringIntegerTuple2));
        }
    }
}
结果输出
console all word
welcome
welcome
hell
world
welcome
python
world
welcome
java
world
console all pair
20/03/03 10:41:41 INFO Executor: Running task 0.0 in stage 1.0 (TID 1)
(welcome,1)
(welcome,1)
(hell,1)
(world,1)
(welcome,1)
(python,1)
(world,1)
(welcome,1)
(java,1)
(world,1)
console all
(python,1)
(hell,1)
(java,1)
(welcome,4)
(world,3)
test sort:
(hell,1)
(java,1)
(python,1)
(welcome,4)
(world,3)