• sample采样倾斜key并单独进行join代码


            /*
             * Skew mitigation: sample the data to find the most frequent key, then
             * join the records of that key separately from all other records.
             */

            // Sample the source RDD (withReplacement = false, fraction = 0.1, seed = 9)
            // so that key frequencies can be counted on a subset of the data.
            JavaPairRDD<Long, String> sampledRDD =
                    userid2PartAggrInfoRDD.sample(
                            false,  // withReplacement
                            0.1,    // fraction
                            9);     // seed
            
            // Turn every sampled record into (key, 1) so occurrences can be summed per key.
            JavaPairRDD<Long, Long> mappedSampledRDD = sampledRDD.mapToPair(
                    new PairFunction<Tuple2<Long, String>, Long, Long>() {

                        private static final long serialVersionUID = 1L;

                        @Override
                        public Tuple2<Long, Long> call(Tuple2<Long, String> sampled)
                                throws Exception {
                            Long key = sampled._1();
                            Long one = 1L;
                            return new Tuple2<Long, Long>(key, one);
                        }

                    });
            
            // Add up the per-record ones to get each key's occurrence count in the sample.
            JavaPairRDD<Long, Long> computedSampledRDD = mappedSampledRDD.reduceByKey(
                    new Function2<Long, Long, Long>() {

                        private static final long serialVersionUID = 1L;

                        @Override
                        public Long call(Long left, Long right) throws Exception {
                            long total = left + right;
                            return total;
                        }

                    });
            
            // Swap each (key, count) pair into (count, key) so the RDD can be sorted by count.
            JavaPairRDD<Long, Long> reversedSampledRDD = computedSampledRDD.mapToPair(
                    new PairFunction<Tuple2<Long, Long>, Long, Long>() {

                        private static final long serialVersionUID = 1L;

                        @Override
                        public Tuple2<Long, Long> call(Tuple2<Long, Long> keyAndCount)
                                throws Exception {
                            Long key = keyAndCount._1();
                            Long count = keyAndCount._2();
                            return new Tuple2<Long, Long>(count, key);
                        }

                    });
            
            // Sort by count in descending order and fetch at most the single top entry.
            List<Tuple2<Long, Long>> topCountAndKey = reversedSampledRDD.sortByKey(false).take(1);

            // The sample can be empty (e.g. a small or empty input), in which case there is
            // no top entry. Fall back to null instead of failing with an
            // IndexOutOfBoundsException: the filters below compare keys with
            // equals(skewedUserid), which is false for null, so every record then takes the
            // ordinary (non-salted) join path.
            final Long skewedUserid = topCountAndKey.isEmpty() ? null : topCountAndKey.get(0)._2;
            
            // Keep only the records whose key equals skewedUserid.
            JavaPairRDD<Long, String> skewedRDD = userid2PartAggrInfoRDD.filter(
                    new Function<Tuple2<Long, String>, Boolean>() {

                        private static final long serialVersionUID = 1L;

                        @Override
                        public Boolean call(Tuple2<Long, String> record) throws Exception {
                            Long key = record._1();
                            boolean isSkewedKey = key.equals(skewedUserid);
                            return isSkewedKey;
                        }

                    });
                
            // Keep every record whose key is NOT skewedUserid (the complement of skewedRDD).
            JavaPairRDD<Long, String> commonRDD = userid2PartAggrInfoRDD.filter(
                    new Function<Tuple2<Long, String>, Boolean>() {

                        private static final long serialVersionUID = 1L;

                        @Override
                        public Boolean call(Tuple2<Long, String> record) throws Exception {
                            if (record._1().equals(skewedUserid)) {
                                return false;
                            }
                            return true;
                        }

                    });
            
            // Build the user-info side of the salted join for the skewed key: keep only the
            // rows of skewedUserid and replicate each of them once per salt value, keyed by
            // "<salt>_<userid>".
            JavaPairRDD<String, Row> skewedUserid2infoRDD = userid2InfoRDD.filter(
                    
                    new Function<Tuple2<Long,Row>, Boolean>() {
    
                        private static final long serialVersionUID = 1L;
            
                        @Override
                        public Boolean call(Tuple2<Long, Row> tuple) throws Exception {
                            return tuple._1.equals(skewedUserid);
                        }
                        
                    }).flatMapToPair(new PairFlatMapFunction<Tuple2<Long,Row>, String, Row>() {
    
                        private static final long serialVersionUID = 1L;
    
                        @Override
                        public Iterable<Tuple2<String, Row>> call(
                                Tuple2<Long, Row> tuple) throws Exception {
                            List<Tuple2<String, Row>> list = new ArrayList<Tuple2<String, Row>>(100);
                            
                            // Emit every salt 0..99 exactly once. The other side of the join
                            // prefixes each record with a random salt from random.nextInt(100),
                            // so each possible salt must exist here exactly once: a missing
                            // salt would drop records from the join, and a repeated salt would
                            // duplicate them. (Drawing the prefix at random here, as before,
                            // caused both.)
                            for(int prefix = 0; prefix < 100; prefix++) {
                                list.add(new Tuple2<String, Row>(prefix + "_" + tuple._1, tuple._2));
                            }
                            
                            return list;
                        }
                        
                    });
            
            // Salted join for the skewed key: prefix every skewed record with a random salt
            // in [0, 100), join on the salted string key, then strip the salt again to
            // recover the numeric key.
            JavaPairRDD<Long, Tuple2<String, Row>> joinedRDD1 = skewedRDD.mapToPair(

                    new PairFunction<Tuple2<Long, String>, String, String>() {

                        private static final long serialVersionUID = 1L;

                        @Override
                        public Tuple2<String, String> call(Tuple2<Long, String> record)
                                throws Exception {
                            Random saltSource = new Random();
                            int salt = saltSource.nextInt(100);
                            String saltedKey = salt + "_" + record._1();
                            return new Tuple2<String, String>(saltedKey, record._2());
                        }

                    }).join(skewedUserid2infoRDD).mapToPair(

                            new PairFunction<Tuple2<String, Tuple2<String, Row>>, Long, Tuple2<String, Row>>() {

                                private static final long serialVersionUID = 1L;

                                @Override
                                public Tuple2<Long, Tuple2<String, Row>> call(
                                        Tuple2<String, Tuple2<String, Row>> joined)
                                        throws Exception {
                                    // The salted key has the form "<salt>_<key>"; the part after
                                    // the first underscore is the original key.
                                    String[] saltAndKey = joined._1().split("_");
                                    long originalKey = Long.parseLong(saltAndKey[1]);
                                    return new Tuple2<Long, Tuple2<String, Row>>(originalKey, joined._2());
                                }

                            });
            
            // Join all records other than the skewed key directly on their key.
            JavaPairRDD<Long, Tuple2<String, Row>> joinedRDD2 =
                    commonRDD.join(userid2InfoRDD);

            // Combine the separately joined skewed-key records with the ordinary join result.
            JavaPairRDD<Long, Tuple2<String, Row>> joinedRDD =
                    joinedRDD1.union(joinedRDD2);
            
            JavaPairRDD<String, String> sessionid2FullAggrInfoRDD = joinedRDD.mapToPair(
                    
                    new PairFunction<Tuple2<Long,Tuple2<String,Row>>, String, String>() {
    
                        private static final long serialVersionUID = 1L;
    
                        @Override
                        public Tuple2<String, String> call(
                                Tuple2<Long, Tuple2<String, Row>> tuple)
                                throws Exception {
                            String partAggrInfo = tuple._2._1;
                            Row userInfoRow = tuple._2._2;
                            
                            String sessionid = StringUtils.getFieldFromConcatString(
                                    partAggrInfo, "\|", Constants.FIELD_SESSION_ID);
                            
                            int age = userInfoRow.getInt(3);
                            String professional = userInfoRow.getString(4);
                            String city = userInfoRow.getString(5);
                            String sex = userInfoRow.getString(6);
                            
                            String fullAggrInfo = partAggrInfo + "|"
                                    + Constants.FIELD_AGE + "=" + age + "|"
                                    + Constants.FIELD_PROFESSIONAL + "=" + professional + "|"
                                    + Constants.FIELD_CITY + "=" + city + "|"
                                    + Constants.FIELD_SEX + "=" + sex;
                            
                            return new Tuple2<String, String>(sessionid, fullAggrInfo);
                        }
                        
                    });
  • 相关阅读:
    Silverlight 2学习笔记一:初识Silverlight
    全面解析布局(Grid & Canvas &StackPanel &Wrappanel) 转
    WPF-使用面板控制内容布局,比较Canvas,WrapPanel,StackPanel,Grid,ScrollViewer
    C# winform通过按钮上移下移 解决了datasource绑定问题
    经典.net面试题目
    RAID5配置及服务器2003系统安装方法。2000系统的安装要使用7.9版本的引导盘
    (线段树)hdoj1394-Minimum Inversion Number 逆序对
    (线段树)hdoj1754-I Hate It
    (线段树)hdoj1166-敌兵布阵
    Codeforces Round #393 (Div. 2) E题Nikita and stack(线段树)解题报告
  • 原文地址:https://www.cnblogs.com/gentle-awen/p/10144882.html
Copyright © 2020-2023  润新知