• Spark Transformations 算子


    在 Java 中,RDD 分为 JavaRDD 和 JavaPairRDD 两类,下面按这两大类分别介绍。

    无论使用哪一类 RDD,都必须先完成下面这一步(创建 SparkConf 和 JavaSparkContext):

    SparkConf conf = new SparkConf().setMaster("local").setAppName("test");
    JavaSparkContext sc = new JavaSparkContext(conf);
    

      

    一、JavaRDD

     1         String[] ayys = {"a","b","c"};
     2         List<String> strings = Arrays.asList(ayys);
     3         
     4         JavaRDD<String> rdd1 = sc.parallelize(strings);
     5         strings.add("d");
     6         JavaRDD<String> rdd2 = sc.parallelize(strings);
     7 
     8 
     9         JavaRDD<Tuple2<String, Integer>> parallelize = sc.parallelize(Arrays.asList(
    10                 new Tuple2<String, Integer>("asd", 11),
    11                 new Tuple2<String, Integer>("asd", 11),
    12                 new Tuple2<String, Integer>("asd", 11)
    13         ));
    14 
    // map: replace every "a" with "qqq" in each element of rdd1.
    JavaRDD<String> replaced = rdd1.map(new Function<String, String>() {
        @Override
        public String call(String element) throws Exception {
            return element.replace("a", "qqq");
        }
    });
    // foreach: print each mapped element.
    replaced.foreach(new VoidFunction<String>() {
        @Override
        public void call(String element) throws Exception {
            System.out.println(element);
        }
    });


    // filter: keep only the elements of rdd1 that contain "a".
    JavaRDD<String> containingA = rdd1.filter(new Function<String, Boolean>() {
        @Override
        public Boolean call(String element) throws Exception {
            return element.contains("a");
        }
    });
    // collect: gather the surviving elements into a local List and print it.
    List<String> a = containingA.collect();

    System.out.println(a);

    34         JavaRDD<String> rdd22 = rdd1.flatMap(new FlatMapFunction<String, String>() {
    35             public Iterable<String> call(String s) throws Exception {
    36                 return Arrays.asList(s.split(" "));
    37             }
    38         });
    39 
    40         JavaPairRDD<String, Integer> rdd4 = rdd2.mapToPair(new PairFunction<String, String, Integer>() {
    41             public Tuple2<String, Integer> call(String s) throws Exception {
    42                 return new Tuple2<String, Integer>(s, 1);
    43             }
    44         });
    45 
    46          JavaRDD<String> rdd11 = rdd2.mapPartitions(new FlatMapFunction<Iterator<String>, String>() {
    47             public Iterable<String> call(Iterator<String> stringIterator) throws Exception {
    48                 ArrayList<String> strings = new ArrayList<String>();
    49                 while (stringIterator.hasNext()){
    50                     strings.add(stringIterator.next());
    51                 }
    52                 return strings;
    53             }
    54         });
    55 
    56         JavaRDD<String> stringJavaRDD = rdd1.mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() {
    57             public Iterator<String> call(Integer integer, Iterator<String> stringIterator) throws Exception {
    58                 ArrayList<String> strings = new ArrayList<String>();
    59                 while (stringIterator.hasNext()){
    60                     strings.add(stringIterator.next());
    61                 }
    62                 return strings.iterator();
    63             }
    64         },false);
    65 
    66         JavaRDD<String> sample = rdd1.sample(false, 0.3);
    67 
    68         JavaRDD<String> union = rdd1.union(rdd2);
    69 
    70         JavaRDD<String> intersection = rdd1.intersection(rdd2);
    71 
    72         JavaRDD<String> distinct = rdd1.distinct();

    二、JavaPairRDD

      

            // rdd1: three identical ("asd", 111) pairs.
            List<Tuple2<String, Integer>> asdPairs = Arrays.asList(
                    new Tuple2<String, Integer>("asd", 111),
                    new Tuple2<String, Integer>("asd", 111),
                    new Tuple2<String, Integer>("asd", 111));
            JavaPairRDD<String, Integer> rdd1 = sc.parallelizePairs(asdPairs);

            // rdd2: three identical ("sdfsd", 222) pairs; its key differs from rdd1's.
            List<Tuple2<String, Integer>> sdfsdPairs = Arrays.asList(
                    new Tuple2<String, Integer>("sdfsd", 222),
                    new Tuple2<String, Integer>("sdfsd", 222),
                    new Tuple2<String, Integer>("sdfsd", 222));
            JavaPairRDD<String, Integer> rdd2 = sc.parallelizePairs(sdfsdPairs);
    
            // groupByKey: one entry per key, holding an Iterable of that key's values.
            JavaPairRDD<String, Iterable<Integer>> stringIterableJavaPairRDD = rdd1.groupByKey();

            // reduceByKey: combine the values of each key with addition.
            Function2<Integer, Integer, Integer> add = new Function2<Integer, Integer, Integer>() {
                @Override
                public Integer call(Integer left, Integer right) throws Exception {
                    return left + right;
                }
            };
            JavaPairRDD<String, Integer> rdd = rdd1.reduceByKey(add);
    
            // aggregateByKey(zeroValue, seqFunc, combFunc): starting from 0, the first function
            // folds each value into a running maximum and the second function adds the partial
            // results together.
            // NOTE(review): presumably the first function runs within a partition and the second
            // merges partitions -- confirm against the Spark docs. With 0 as the start value the
            // maximum is only meaningful for non-negative values.
            JavaPairRDD<String, Integer> rdd3 = rdd1.aggregateByKey(0, new Function2<Integer, Integer, Integer>() {
                @Override
                public Integer call(Integer runningMax, Integer value) throws Exception {
                    // Qualified as Math.max: a bare max(...) does not resolve without a static import.
                    return Math.max(runningMax, value);
                }
            }, new Function2<Integer, Integer, Integer>() {
                @Override
                public Integer call(Integer left, Integer right) throws Exception {
                    return left + right;
                }
            });
    
            // sortByKey: same pair type as rdd1; per the operator name, ordered by key.
            JavaPairRDD<String, Integer> rdd111 = rdd1.sortByKey();
    
            // join: each result value is a Tuple2 holding one value from rdd1 and one from rdd2.
            JavaPairRDD<String, Tuple2<Integer, Integer>> join = rdd1.join(rdd2);
            // leftOuterJoin: the rdd2 side of the value is wrapped in Optional.
            JavaPairRDD<String, Tuple2<Integer, Optional<Integer>>> stringTuple2JavaPairRDD = rdd1.leftOuterJoin(rdd2);
            // rightOuterJoin: the rdd1 side of the value is wrapped in Optional.
            JavaPairRDD<String, Tuple2<Optional<Integer>, Integer>> stringTuple2JavaPairRDD1 = rdd1.rightOuterJoin(rdd2);
            // fullOuterJoin: both sides of the value are wrapped in Optional.
            JavaPairRDD<String, Tuple2<Optional<Integer>, Optional<Integer>>> stringTuple2JavaPairRDD2 = rdd1.fullOuterJoin(rdd2);
    
            // cogroup: per key, a Tuple2 of the Iterable of rdd1 values and the Iterable of rdd2 values.
            JavaPairRDD<String, Tuple2<Iterable<Integer>, Iterable<Integer>>> cogroup = rdd1.cogroup(rdd2);
    
            // coalesce(3, false): request 3 partitions; the second argument is false.
            // NOTE(review): presumably false means "no shuffle", in which case the partition
            // count can only shrink -- confirm rdd1 has more than 3 partitions.
            JavaPairRDD<String, Integer> coalesce = rdd1.coalesce(3, false);
    
            // repartition(3): request 3 partitions for rdd1's data.
            JavaPairRDD<String, Integer> repartition = rdd1.repartition(3);
    
            // repartitionAndSortWithinPartitions: partition with a HashPartitioner built with 2;
            // per the operator name, records are sorted within each resulting partition.
            JavaPairRDD<String, Integer> rdd5 = rdd1.repartitionAndSortWithinPartitions(new HashPartitioner(2));
    
            // cartesian: each result element pairs one (key, value) tuple from rdd1 with one from rdd2.
            JavaPairRDD<Tuple2<String, Integer>, Tuple2<String, Integer>> cartesian = rdd1.cartesian(rdd2);
    
            // pipe: hand rdd1 to an external command; the result is a JavaRDD of Strings.
            // An empty command string leaves no program to launch, so any action on the
            // result would fail; "cat" is used as a simple placeholder command.
            // NOTE(review): "cat" assumes a Unix-like environment -- substitute a command
            // available on the target platform.
            JavaRDD<String> pipe = rdd1.pipe("cat");
    

      

    zip:

      

            // zip: pair the (key, value) tuples of rdd1 with those of rdd2; each result element
            // holds one tuple from each RDD.
            // NOTE(review): Spark presumably requires both RDDs to have the same number of
            // partitions and elements per partition -- confirm before zipping other inputs.
            JavaPairRDD<Tuple2<String, Integer>, Tuple2<String, Integer>> zip = rdd1.zip(rdd2);
    
            // zipWithIndex: pair each (key, value) tuple of rdd1 with a Long.
            JavaPairRDD<Tuple2<String, Integer>, Long> tuple2LongJavaPairRDD =     rdd1.zipWithIndex();
    

      

    最后都要加上下面这行代码,用来停止 SparkContext:

      

            // Stop the JavaSparkContext once all examples have run.
            sc.stop();
    

     

    aggregateByKey算子详解

    repartitionAndSortWithinPartitions算子详解

      

  • 相关阅读:
    java_爬虫_从腾讯视频播放界面爬取视频真实地址
    杂_小技巧_将网页上的内容通过亚马逊邮箱传到kindle中
    java_基础_接口和抽象类
    知乎上的50道SQL练习题
    第 4 章 WebDriver API
    第 2 章 测试环境搭建
    第 1 章 自动化测试基础
    【软件测试】9.QC管理学习(类禅道)学习
    01 Python简介、环境安装、变量、数据类型
    【MySQL面试指南】
  • 原文地址:https://www.cnblogs.com/dhName/p/10641183.html
Copyright © 2020-2023  润新知