• Spark Operators (Part 2)


    1.collect

    * Using the foreach action, the RDD's elements are traversed out on the remote cluster.

    * Using collect, the data distributed across the remote cluster is pulled back to the local driver.
    * This approach is not recommended: with a large data volume it consumes a lot of network bandwidth.

    package kw.test.action;
    
    import java.util.Arrays;
    import java.util.Iterator;
    import java.util.List;
    
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.FlatMapFunction;
    
    public class Collect {
    	public static void main(String[] args) {
    		SparkConf sparkConf = new SparkConf().setAppName("Collect").setMaster("local");
    		JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
    		List<String> list = Arrays.asList("wo am xues","ni shi la ji ","zha zs","zha df");
    		JavaRDD<String> javaRDD = javaSparkContext.parallelize(list);
    		JavaRDD<String> result = javaRDD.flatMap(new FlatMapFunction<String, String>() {
    
    			@Override
    			public Iterator<String> call(String arg0) throws Exception {
    				// First split each line on spaces, then flatten the pieces into individual words.
    				return Arrays.asList(arg0.split(" ")).iterator();
    			}
    		});
    		List<String> str = result.collect();
    		for(String s : str)
    		{
    			System.out.println(s);
    		}
    		
    	}
    }
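
    * As the note above suggests, for a large RDD it is safer to keep the iteration on the executors (foreach) or to pull back only a few elements (take) instead of collecting everything. A minimal sketch of what could replace the collect-and-print part inside main above, reusing the `result` RDD (it additionally needs the org.apache.spark.api.java.function.VoidFunction import):

    		// foreach runs on the executors, so nothing is pulled back to the driver
    		// (in local mode the executor and the driver share the same console).
    		result.foreach(new VoidFunction<String>() {
    			@Override
    			public void call(String s) throws Exception {
    				System.out.println(s);
    			}
    		});
    		// take(2) only brings two elements back to the driver.
    		List<String> firstTwo = result.take(2);
    		System.out.println(firstTwo);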
    

     2.count

    * count is an action: it returns a value (the number of elements) to the driver.

    package kw.test.action;
    
    import java.util.Arrays;
    import java.util.List;
    
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    
    /*
     * count is an action: it returns the number of elements to the driver.
     */
    public class count {
        public static void main(String[] args) {
            SparkConf sparkConf = new SparkConf().setAppName("Count").setMaster("local");
            JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
            List<Integer> list = Arrays.asList(1,2,3,4,5,6,7,8,9);
            JavaRDD<Integer> javaRDD = javaSparkContext.parallelize(list);
            long num = javaRDD.count();
            System.out.println(num);
        }
    
    }

    3.filter

    * filter keeps the elements for which the function returns true.

    package kw.test.action;
    
    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;
    
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.Function;
    import org.apache.spark.api.java.function.VoidFunction;
    
    public class Filter {
        public static void main(String[] args) {
            SparkConf sparkConf = new SparkConf().setAppName("Filter").setMaster("local");
            JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
            List<Integer> list = Arrays.asList(1,2,3,4,5,6);
            JavaRDD<Integer> javaRDD = javaSparkContext.parallelize(list);
            JavaRDD<Integer> result = javaRDD.filter(new Function<Integer, Boolean>() {
    
                @Override
                public Boolean call(Integer arg0) throws Exception {
                    // Keep only the elements for which this returns true (here: the even numbers).
                    return arg0 % 2 == 0;
                }
            });
            result.foreach(new VoidFunction<Integer>() {
                
                @Override
                public void call(Integer arg0) throws Exception {
                    System.out.println(arg0);
                }
            });
        }
    }

    4.coalesce

    package kw.test.action;
    
    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.Iterator;
    import java.util.List;
    
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.Function2;
    import org.apache.spark.api.java.function.VoidFunction;
    /*
     * coalesce returns a new RDD whose number of partitions is the value passed in.
     * 
     * Reducing the number of partitions (for example from 100 down to 10) is a narrow
     * dependency, so no shuffle is performed.
     * 
     * When collapsing many partitions down to very few (or one), shuffle can be set to true
     * so that the data is redistributed; this does transfer data over the network, so it is
     * used less often.
     * 
     * coalesce is not only for reducing partitions: it can also increase the partition count,
     * but in that case shuffle must be set to true.
     */
    
    
    public class Coalesce {
    	public static void main(String[] args) {
    		SparkConf sparkConf = new SparkConf().setAppName("Coalesce").setMaster("local");
    		JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
    		List<Integer> list = Arrays.asList(1,2,3,4,5,6);
    		JavaRDD<Integer> numbers = javaSparkContext.parallelize(list,6);
    		//After a filter, some partitions can end up with very little data, which easily causes data skew; coalesce can be used to rebalance the partitions.
    		JavaRDD<String> result = numbers.mapPartitionsWithIndex(new Function2<Integer, Iterator<Integer>, Iterator<String>>() {
    
    			@Override
    			public Iterator<String> call(Integer arg0, Iterator<Integer> arg1)
    					throws Exception {
    				List<String> list = new ArrayList<String>();
    				while(arg1.hasNext())
    				{
    					list.add(arg0+"      "+arg1.next());
    				}
    				return list.iterator();
    			}
    		}, true);
    		
    		result.foreach(new VoidFunction<String>() {
    
    			@Override
    			public void call(String arg1) throws Exception {
    				System.out.println(arg1+"    ");
    			}
    			
    		});
    		JavaRDD<String> javaCoalesce = result.coalesce(3);
    		JavaRDD<String> coalesce = javaCoalesce.mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() {
    
    			@Override
    			public Iterator<String> call(Integer arg0, Iterator<String> arg1)
    					throws Exception {
    				List<String> list = new ArrayList<String>();
    				while(arg1.hasNext())
    				{
    					list.add(arg1.next()+"      "+arg0);
    					
    				}
    				return list.iterator();
    			}
    		}, true); // this boolean is the preservesPartitioning flag of mapPartitionsWithIndex; coalesce's shuffle flag is a separate argument and defaults to false
    		coalesce.foreach(new VoidFunction<String>() {
    
    			@Override
    			public void call(String arg0) throws Exception {
    				System.out.println("   "+ arg0);
    			}
    		});
    	}
    		
    
    }
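
    * The comment above notes that coalesce can also increase the partition count, but only with shuffle = true. A minimal sketch of both directions, reusing the `numbers` RDD (6 partitions) from inside main above; the variable names are illustrative:

    		JavaRDD<Integer> fewer = numbers.coalesce(2);         // 6 -> 2, narrow dependency, no shuffle
    		JavaRDD<Integer> more = numbers.coalesce(12, true);   // 6 -> 12, only works because shuffle = true
    		// Without shuffle = true, coalesce(12) would silently keep the original 6 partitions.
    		System.out.println(fewer.partitions().size());        // 2
    		System.out.println(more.partitions().size());         // 12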
    

     5.flatMap

    package kw.test.action;
    
    import java.util.Arrays;
    import java.util.Iterator;
    import java.util.List;
    
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.FlatMapFunction;
    import org.apache.spark.api.java.function.VoidFunction;
    
    /*
     * flatMap first applies a map to every element, then flattens the results into one RDD.
     */
    public class FlatMap {
    	public static void main(String[] args) {
    		SparkConf sparkConf = new SparkConf().setAppName("FlatMap").setMaster("local");
    		JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
    		List<String> list = Arrays.asList("wo am xues","ni shi la ji ","zha zs","zha df");
    		JavaRDD<String> javaRDD = javaSparkContext.parallelize(list);
    		JavaRDD<String> result = javaRDD.flatMap(new FlatMapFunction<String, String>() {
    
    			@Override
    			public Iterator<String> call(String arg0) throws Exception {
    				// First split each line on spaces, then flatten the pieces into individual words.
    				return Arrays.asList(arg0.split(" ")).iterator();
    			}
    		});
    		result.foreach(new VoidFunction<String>() {
    
    			@Override
    			public void call(String arg0) throws Exception {
    				System.out.println(arg0);
    			}
    		});
    		
    		
    	}
    
    }
    

    6.mapPartitions

    package kw.test.action;
    
    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.HashMap;
    import java.util.Iterator;
    import java.util.List;
    import java.util.Map;
    
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.FlatMapFunction;
    import org.apache.spark.api.java.function.VoidFunction;
    
    public class MapPartition {
    	public static void main(String[] args) {
    		SparkConf sparkConf = new SparkConf().setAppName("MapPartition").setMaster("local");
    		JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
    		List<String> list = Arrays.asList("wo","shi","lal","woowj","dffdds");
    		JavaRDD<String> javaRDD = sparkContext.parallelize(list);
    		
    
    		JavaRDD<String> result = javaRDD.mapPartitions(new FlatMapFunction<Iterator<String>, String>() {
    
    			//The whole partition is handed to the function at once and processed through an iterator.
    			@Override
    			public Iterator<String> call(Iterator<String> arg0)throws Exception {
    			   List<String> list = new ArrayList<String>();
    				while(arg0.hasNext() )
    				{
    					list.add(arg0.next());
    				}
    				return  list.iterator();
    			}
    		});
    		result.foreach(new VoidFunction<String>() {
    
    			@Override
    			public void call(String arg0) throws Exception {
    				System.out.println(arg0);
    			}});
    
    		 
    	}
    
    }
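
    * Unlike map, the function passed to mapPartitions is called once per partition, not once per element. A minimal sketch, assuming the `javaRDD` above were built with two partitions, e.g. sparkContext.parallelize(list, 2); the "partition start" line would then be printed only twice for the five elements:

    		JavaRDD<String> tagged = javaRDD.mapPartitions(new FlatMapFunction<Iterator<String>, String>() {
    			@Override
    			public Iterator<String> call(Iterator<String> it) throws Exception {
    				System.out.println("partition start");   // printed once per partition
    				List<String> out = new ArrayList<String>();
    				while (it.hasNext()) {
    					out.add("tagged: " + it.next());
    				}
    				return out.iterator();
    			}
    		});
    		System.out.println(tagged.collect());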
    

      7.MapPartitionsWithIndex

    package kw.test.action;
    
    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.Iterator;
    import java.util.List;
    
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.Function2;
    import org.apache.spark.api.java.function.VoidFunction;
    
    
    /*
     * mapPartitionsWithIndex also exposes the index of the partition being processed.
     */
    public class MapPartitionsWithIndex {
    	public static void main(String[] args) {
    		SparkConf sparkConf = new SparkConf().setAppName("MapPartitionsWithIndex").setMaster("local");
    		JavaSparkContext sparkContext =new JavaSparkContext(sparkConf);
    		
    		List<String> list = Arrays.asList("kang","wang","ddd","kang1","wang2","ddd3");
    		JavaRDD<String> javaRDD = sparkContext.parallelize(list,4);
    		
    		JavaRDD<String> javaRDD2 = javaRDD.mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() {
    
    			@Override
    			public Iterator<String> call(Integer arg0, Iterator<String> arg1)
    					throws Exception {
    				List<String> list = new ArrayList<String>();
    				while(arg1.hasNext())
    				{
    					list.add(arg1.next()+"  "+ arg0);
    				}
    				return list.iterator();
    			}
    		}, true);
    		javaRDD2.foreach(new VoidFunction<String>() {
    			
    			@Override
    			public void call(String arg0) throws Exception {
    				System.out.println(arg0);
    			}
    		});
    	}
    }
    

      8.Reduce 

    package kw.test.action;
    
    import java.util.Arrays;
    import java.util.List;
    
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.Function2;
    
    public class Reduce {
    	public static void main(String[] args) {
    		SparkConf sparkConf = new SparkConf().setAppName("Reduce").setMaster("local");
    		JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
    		List<Integer> list= Arrays.asList(1,2,3,4,5,6,7);   
    		JavaRDD<Integer> num = javaSparkContext.parallelize(list);
    		int sum = num.reduce(new Function2<Integer, Integer, Integer>() {
    			
    			@Override
    			public Integer call(Integer arg0, Integer arg1) throws Exception {
    				return arg0+arg1;
    			}
    		});
    		System.out.println("最终结果:"+sum);
    	}
    
    }
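
    * The function passed to reduce must be commutative and associative, because Spark first reduces inside each partition and then merges the per-partition results. A minimal sketch, reusing the `num` RDD from inside main above, this time computing the maximum instead of the sum:

    		int max = num.reduce(new Function2<Integer, Integer, Integer>() {
    			@Override
    			public Integer call(Integer arg0, Integer arg1) throws Exception {
    				return Math.max(arg0, arg1);
    			}
    		});
    		System.out.println("Max: " + max); // 7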
    

    9.ReduceByKey

    package kw.test.action;
    
    import java.util.Arrays;
    import java.util.List;
    
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaPairRDD;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.Function2;
    import org.apache.spark.api.java.function.VoidFunction;
    
    import scala.Tuple2;
    
    /*
     * reduceByKey is a shuffle operation.
     * 
     * A shuffle has a map side and a reduce side.
     * 
     * Spark's reduceByKey has a built-in combiner on the map side: values are
     * pre-aggregated locally before being sent, which reduces network traffic
     * and makes it more efficient.
     */
    public class ReduceByKey {
        public static void main(String[] args) {
            SparkConf sparkConf = new SparkConf().setAppName("ReduceByKey").setMaster("local");
            JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
            List<Tuple2<String, Integer>> list= Arrays.asList(new Tuple2<String,Integer>("kang",100),
                    new Tuple2<String,Integer>("kang",100),
                    new Tuple2<String,Integer>("kang",100),
                    new Tuple2<String,Integer>("kang",100),
                    new Tuple2<String,Integer>("wang",100));
            JavaPairRDD<String, Integer> num = javaSparkContext.parallelizePairs(list);
            JavaPairRDD<String, Integer> rr = num.reduceByKey(new Function2<Integer, Integer, Integer>() {
                
                @Override
                public Integer call(Integer arg0, Integer arg1) throws Exception {
                    return arg0+arg1;
                }
            });
            rr.foreach(new VoidFunction<Tuple2<String,Integer>>() {
                
                @Override
                public void call(Tuple2<String, Integer> arg0) throws Exception {
                    System.out.println(arg0);
                }
            });
        }
    
    }
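
    * To see why the map-side combiner described above matters, compare with grouping first and summing afterwards (groupByKey is mentioned here only for contrast; it is not part of the original example): groupByKey has no map-side combiner, so every individual value crosses the network before it is summed, whereas reduceByKey ships only one partial sum per key per partition. A minimal sketch, reusing the `num` pair RDD from inside main above (it additionally needs the org.apache.spark.api.java.function.Function import):

            JavaPairRDD<String, Integer> viaGroup = num.groupByKey().mapValues(new Function<Iterable<Integer>, Integer>() {
                @Override
                public Integer call(Iterable<Integer> values) throws Exception {
                    int sum = 0;
                    for (Integer v : values) {
                        sum += v;
                    }
                    return sum;
                }
            });
            // Same totals as reduceByKey, e.g. (kang,400) and (wang,100), but more data is shuffled.
            System.out.println(viaGroup.collect());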

    10.RePartition

    package kw.test.action;
    
    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.Iterator;
    import java.util.List;
    
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.Function2;
    import org.apache.spark.api.java.function.VoidFunction;
    
    
    /*
     * repartition can increase or decrease the number of partitions of any RDD.
     * coalesce (without shuffle) only reduces the number of partitions
     * (strictly speaking, with shuffle = true it can also increase them).
     * 
     * A typical scenario: when Spark SQL loads data from Hive, the number of
     * partitions of the resulting RDD is determined by the number of HDFS blocks
     * of the underlying files, and that default partition count cannot be set by us.
     * 
     * Sometimes that automatically chosen partition count is too low; to optimize,
     * the degree of parallelism can be raised by applying repartition to the RDD.
     * 
     * In general, when we want to reduce the number of partitions while avoiding a
     * shuffle, coalesce is the first choice, because it can avoid the shuffle operation.
     */
    public class RePartition {
    	public static void main(String[] args) {
    		SparkConf sparkConf = new SparkConf().setAppName("RePartition").setMaster("local");
    		JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
    		List<Integer> list = Arrays.asList(1,2,3,4,5,6);
    		
    		JavaRDD<Integer> numbers = javaSparkContext.parallelize(list);
    		
    		JavaRDD<String> result = numbers.mapPartitionsWithIndex(new Function2<Integer, Iterator<Integer>, Iterator<String>>() {
    
    			@Override
    			public Iterator<String> call(Integer arg0, Iterator<Integer> arg1)
    					throws Exception {
    			
    				List<String> list = new ArrayList<String>();
    				while(arg1.hasNext())
    				{
    					list.add(arg1.next()+"      "+arg0);
    				}
    				return list.iterator();
    			}
    		}, true);
    		result.foreach(new VoidFunction<String>() {
    			
    			@Override
    			public void call(String arg0) throws Exception {
    				System.out.println(arg0+"           ");
    			}
    		});
    		
    
    		JavaRDD<String> repartitioned = result.repartition(5); // repartition returns a new RDD; the original is left unchanged
    		JavaRDD<String> result2 = repartitioned.mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() {
    
    			@Override
    			public Iterator<String> call(Integer arg0, Iterator<String> arg1)
    					throws Exception {
    				List<String> list = new ArrayList<String>();
    				while(arg1.hasNext())
    				{
    					list.add(arg1.next()+"        "+arg0);
    				}
    				return list.iterator();
    			}
    		
    		}, true);
    		result2.foreach(new VoidFunction<String>() {
    			
    			@Override
    			public void call(String arg0) throws Exception {
    				System.out.println(arg0+"           ");
    			}
    		});
    	}
    
    }
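
    * A sketch of the Spark SQL / Hive scenario described in the comment above. It assumes a Spark 2.x build with Hive support; the table name some_table, the class name and the configuration are hypothetical and not part of the original post:

    package kw.test.action;
    
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;
    
    public class RePartitionFromHive {
        public static void main(String[] args) {
            // Hypothetical setup: requires a Hive-enabled Spark build and an existing table `some_table`.
            SparkSession spark = SparkSession.builder()
                    .appName("RePartitionFromHive")
                    .master("local")
                    .enableHiveSupport()
                    .getOrCreate();
    
            Dataset<Row> df = spark.sql("SELECT * FROM some_table");
            JavaRDD<Row> rows = df.javaRDD();              // partition count is driven by the HDFS blocks of the table
            JavaRDD<Row> tuned = rows.repartition(100);    // explicitly raise the parallelism
    
            System.out.println(tuned.partitions().size()); // 100
            spark.stop();
        }
    }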
    

    11.Sample

    package kw.test.action;
    
    import java.util.Arrays;
    import java.util.List;
    
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.VoidFunction;
    
    /*
     * sample draws a random sample. The fraction argument (a double) controls the sample size:
     * 0.3 samples roughly 30% of the elements. For example, with the 5 words used in this test,
     * roughly 1 element comes back; the fraction can of course be changed.
     * 
     * The RDD may also have several partitions; each partition contributes roughly its own
     * size * fraction elements, so the total is still about numElements * fraction.
     */
    public class Sample {
    	public static void main(String[] args) {
    		SparkConf sparkConf = new SparkConf().setAppName("sample").setMaster("local");
    		JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
    		List<String> list = Arrays.asList("kwang","wang","is","a","student");
    		JavaRDD<String> num = javaSparkContext.parallelize(list);
    		//First argument: false = sample without replacement (no duplicates in the result);
    		//true = sample with replacement (the result may contain duplicates).
    		//An optional third argument is the random seed. If it is not given, a seed is generated
    		//automatically, so every run differs; with a fixed seed the result is the same on every
    		//run, which is mainly useful for testing.
    		num.sample(false, 0.8).foreach(new VoidFunction<String>() {
    			
    			@Override
    			public void call(String arg0) throws Exception {
    				System.out.println(arg0);
    			}
    		});
    	}
    
    }
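
    * As the comment inside the example notes, passing a fixed seed makes the sample reproducible. A minimal sketch, reusing the `num` RDD from inside main above (the seed value 42 is arbitrary):

    		// With a fixed seed the same elements are drawn on every run.
    		List<String> sampled = num.sample(false, 0.8, 42L).collect();
    		System.out.println(sampled);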
    

    12.Take

    package kw.test.action;
    
    import java.util.Arrays;
    import java.util.List;
    
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    
    /*
     * take(n) returns the first n elements of the RDD to the driver.
     */
    public class Take {
        public static void main(String[] args) {
            SparkConf sparkConf = new SparkConf().setAppName("take").setMaster("local");
            JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
            List<String> list = Arrays.asList("kang","wang","lala");
            JavaRDD<String> result = javaSparkContext.parallelize(list);
            /*
             *         List<String> list1 = Arrays.asList("kang","wang","lala");
            List<Integer> list = Arrays.asList(1,2,3,4,5);
            JavaRDD<Integer> result = javaSparkContext.parallelize(list);
            JavaRDD<String> result1 = javaSparkContext.parallelize(list1);
            result.take(2);
            result1.take(num);
            
             */
            List<String> name = result.take(2);
            for(String value :name)
            {
                System.out.println(value);
            }
        }
    
    }

    13.TakeSample

    package kw.test.action;
    
    import java.util.Arrays;
    import java.util.List;
    
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    
    /*
     * take returns the first elements; sample draws elements at random.
     * 
     * takeSample combines the two: it samples the RDD at random and returns
     * the requested number of elements directly to the driver.
     */
    public class TakeSample {
    	public static void main(String[] args) {
    		SparkConf sparkConf = new SparkConf().setAppName("takesample").setMaster("local");
    		JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
    		List<String> list = Arrays.asList("kang1","wang1","lala1","kang2","wang2","lala2","kang3","wang3","lala3");
    		JavaRDD<String> result = javaSparkContext.parallelize(list);
    		//First argument: whether to sample with replacement. Second: how many elements to return. Third (optional): the seed; with the same seed the result is identical on every run, otherwise it changes.
    		List<String> value = result.takeSample(false, 3);
    		for(String v:value)
    		{
    			System.out.println(v);
    		}
    	}
    }
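
    * A minimal sketch of the three-argument form mentioned in the comment above, reusing the `result` RDD from inside main (the seed value 42 is arbitrary):

    		// With a fixed seed, takeSample returns the same 3 elements on every run.
    		List<String> fixed = result.takeSample(false, 3, 42L);
    		for (String v : fixed) {
    			System.out.println(v);
    		}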
    

    14.Union

    package kw.test.action;
    
    import java.util.Arrays;
    import java.util.List;
    
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.VoidFunction;
    
    /*
     * union combines two RDDs into one without a shuffle. The partitions are simply
     * concatenated: if both inputs start with two partitions, the result has four.
     */
    public class Union {
    	public static void main(String[] args) {
    		
    
    		// TODO Auto-generated method stub
    		SparkConf sparkConf = new SparkConf().setAppName("union").setMaster("local");
    		JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
    		List<String> list1 = Arrays.asList("kang1","wang1","lala1","kang2","wang2","lala2");
    		List<String> list2 = Arrays.asList("kang3","wang3","lala3");
    		JavaRDD<String> result1 = javaSparkContext.parallelize(list1,2);
    		JavaRDD<String> result2 = javaSparkContext.parallelize(list2,2);
    		JavaRDD <String> unionre = result1.union(result2);
    		unionre.foreach(new VoidFunction<String>() {
    			
    			@Override
    			public void call(String arg0) throws Exception {
    				System.out.println(arg0);
    			}
    		});
    	}
    }
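
    * To check the claim about partition counts, a minimal sketch reusing result1, result2 and unionre from inside main above:

    		System.out.println(result1.partitions().size());  // 2
    		System.out.println(result2.partitions().size());  // 2
    		System.out.println(unionre.partitions().size());  // 4 = 2 + 2, no shuffle involved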
    

      
