1. Two-stage ("double key") aggregation to solve data skew: when a few keys carry a disproportionate share of the rows, the tasks processing those keys become stragglers. The fix is to prepend a random prefix to each key, run a first (partial) aggregation on the salted keys so a hot key's rows are spread across many tasks, then strip the prefix and aggregate a second time to produce the final per-key result. For example, a hot key A with 1,000,000 rows salted with a random prefix in [0, 10) becomes ten keys 0_A ... 9_A of roughly 100,000 rows each.
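The rest of this section implements the idea with Spark SQL and UDFs. For intuition, here is a minimal standalone sketch of the same two-stage aggregation at the RDD level; the method name and the salt range of 10 are illustrative assumptions, not part of the original code.

import java.util.Random;

import org.apache.spark.api.java.JavaPairRDD;

import scala.Tuple2;

public class TwoStageAggSketch {

    // pairRDD holds (key, count) pairs in which a few keys are heavily skewed.
    public static JavaPairRDD<String, Long> aggregate(JavaPairRDD<String, Long> pairRDD) {
        // Stage 1: salt each key with a random prefix in [0, 10) so one hot key
        // is spread over up to 10 salted keys (and therefore up to 10 tasks).
        JavaPairRDD<String, Long> salted = pairRDD.mapToPair(kv ->
                new Tuple2<>(new Random().nextInt(10) + "_" + kv._1(), kv._2()));

        // Partial aggregation on the salted keys.
        JavaPairRDD<String, Long> partial = salted.reduceByKey(Long::sum);

        // Stage 2: strip the salt and aggregate again for the final per-key totals.
        return partial
                .mapToPair(kv -> new Tuple2<>(kv._1().substring(kv._1().indexOf("_") + 1), kv._2()))
                .reduceByKey(Long::sum);
    }
}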
// Register the custom UDFs/UDAF used by the SQL below.
spark.udf().register("concat_String_string", new ConcatStringStringUDF(), DataTypes.StringType);
spark.udf().register("random_prefix", new RandomPrefixUDF(), DataTypes.StringType);
spark.udf().register("remove_random_prefix", new RemoveRandomPrefixUDF(), DataTypes.StringType);
spark.udf().register("group_concat_distinct", new GroupConcatDistinctUDAF());
package com.bjsxt.spark.areaRoadFlow;

import java.util.Random;

import org.apache.spark.sql.api.java.UDF2;

/**
 * Prepends a random integer prefix in [0, ranNum) to the key,
 * e.g. "鼓楼区:49" -> "7_鼓楼区:49", so that a hot key is spread across tasks.
 */
public class RandomPrefixUDF implements UDF2<String, Integer, String> {

    private static final long serialVersionUID = 1L;

    @Override
    public String call(String area_name_road_id, Integer ranNum) throws Exception {
        Random random = new Random();
        int prefix = random.nextInt(ranNum);
        return prefix + "_" + area_name_road_id;
    }
}
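The registration code above also references ConcatStringStringUDF and RemoveRandomPrefixUDF, which this excerpt does not show. The sketches below are assumptions consistent with how the SQL uses them: concat_String_string(area_name, road_id, ':') joins two strings with a separator, and remove_random_prefix strips the "<prefix>_" salt added by RandomPrefixUDF. The original course files may differ in detail.

// ConcatStringStringUDF.java (sketch)
package com.bjsxt.spark.areaRoadFlow;

import org.apache.spark.sql.api.java.UDF3;

public class ConcatStringStringUDF implements UDF3<String, String, String, String> {

    private static final long serialVersionUID = 1L;

    @Override
    public String call(String s1, String s2, String split) throws Exception {
        // e.g. ("鼓楼区", "49", ":") -> "鼓楼区:49"
        return s1 + split + s2;
    }
}

// RemoveRandomPrefixUDF.java (sketch)
package com.bjsxt.spark.areaRoadFlow;

import org.apache.spark.sql.api.java.UDF1;

public class RemoveRandomPrefixUDF implements UDF1<String, String> {

    private static final long serialVersionUID = 1L;

    @Override
    public String call(String val) throws Exception {
        // e.g. "7_鼓楼区:49" -> "鼓楼区:49"
        return val.substring(val.indexOf("_") + 1);
    }
}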
private static void generateTempAreaRoadFlowTable(SparkSession spark) {
    /**
     * Schema of the source table tmp_car_flow_basic:
     *   area_id    StringType
     *   area_name  StringType
     *   road_id    StringType
     *   monitor_id StringType
     *   car        StringType
     */

    // Baseline aggregation: traffic volume per (area_name, road_id).
    // group_concat_distinct rolls the per-monitor car counts on each road
    // into a string such as "0001=20|0002=30".
    String sql = "SELECT "
            + "area_name,"
            + "road_id,"
            + "count(*) car_count,"
            + "group_concat_distinct(monitor_id) monitor_infos " // e.g. 0001=20|0002=30
            + "FROM tmp_car_flow_basic "
            + "GROUP BY area_name,road_id";

    /**
     * When one road in an area has far more cars than the others, the GROUP BY
     * above skews. The query below applies the two-stage aggregation: salt the
     * key with random_prefix, pre-aggregate per salted key, strip the salt with
     * remove_random_prefix, then aggregate again on the original key.
     */
    String sqlText = ""
            + "SELECT "
            + "area_name_road_id,"
            + "sum(car_count) car_count,"
            + "group_concat_distinct(monitor_infos) monitor_infos "
            + "FROM ("
            +   "SELECT "
            +   "remove_random_prefix(prefix_area_name_road_id) area_name_road_id,"
            +   "car_count,"
            +   "monitor_infos "
            +   "FROM ("
            +     "SELECT "
            +     "prefix_area_name_road_id," // e.g. 1_鼓楼区:49
            +     "count(*) car_count,"
            +     "group_concat_distinct(monitor_id) monitor_infos "
            +     "FROM ("
            +       "SELECT "
            +       "monitor_id,"
            +       "car,"
            +       "random_prefix(concat_String_string(area_name,road_id,':'),10) prefix_area_name_road_id "
            +       "FROM tmp_car_flow_basic"
            +     ") t1 "
            +     "GROUP BY prefix_area_name_road_id"
            +   ") t2 "
            + ") t3 "
            + "GROUP BY area_name_road_id";

    // The baseline query is executed here; on skewed data, run sqlText instead.
    // Note that sqlText keys its output by the combined area_name_road_id column
    // rather than by separate area_name/road_id columns.
    Dataset<Row> df = spark.sql(sql);
    df.createOrReplaceTempView("tmp_area_road_flow_count");
}
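group_concat_distinct is a UDAF whose implementation is not included in this excerpt. Note that the salted query applies it twice: the inner GROUP BY feeds it bare monitor_id values, while the outer GROUP BY feeds it the already-aggregated monitor_infos strings. For the two-stage result to be correct, the second pass must sum the counts per monitor_id rather than merely concatenate the partial strings. The following is a hedged sketch that handles both cases; the original course implementation may differ.

package com.bjsxt.spark.areaRoadFlow;

import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.Map;

import org.apache.spark.sql.Row;
import org.apache.spark.sql.expressions.MutableAggregationBuffer;
import org.apache.spark.sql.expressions.UserDefinedAggregateFunction;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class GroupConcatDistinctUDAF extends UserDefinedAggregateFunction {

    private static final long serialVersionUID = 1L;

    @Override
    public StructType inputSchema() {
        return DataTypes.createStructType(Arrays.asList(
                DataTypes.createStructField("value", DataTypes.StringType, true)));
    }

    @Override
    public StructType bufferSchema() {
        return DataTypes.createStructType(Arrays.asList(
                DataTypes.createStructField("concat", DataTypes.StringType, true)));
    }

    @Override
    public DataType dataType() { return DataTypes.StringType; }

    @Override
    public boolean deterministic() { return true; }

    @Override
    public void initialize(MutableAggregationBuffer buffer) { buffer.update(0, ""); }

    @Override
    public void update(MutableAggregationBuffer buffer, Row input) {
        // Accepts either a bare monitor_id ("0001" counts as 1) or an
        // already-aggregated string ("0001=20|0002=30"), so the UDAF works
        // in both the inner and the outer GROUP BY of the salted query.
        buffer.update(0, mergeInto(buffer.getString(0), input.getString(0)));
    }

    @Override
    public void merge(MutableAggregationBuffer buffer, Row row) {
        buffer.update(0, mergeInto(buffer.getString(0), row.getString(0)));
    }

    @Override
    public Object evaluate(Row buffer) { return buffer.getString(0); }

    // Parse "id=count|id=count" (or a bare "id") entries, sum counts per id,
    // and serialize back to the "id=count|id=count" format.
    private String mergeInto(String agg, String value) {
        if (value == null || value.isEmpty()) return agg;
        Map<String, Long> counts = new LinkedHashMap<>();
        for (String part : (agg + "|" + value).split("\\|")) {
            if (part.isEmpty()) continue;
            String[] kv = part.split("=");
            long c = kv.length > 1 ? Long.parseLong(kv[1]) : 1L;
            counts.merge(kv[0], c, Long::sum);
        }
        StringBuilder sb = new StringBuilder();
        for (Map.Entry<String, Long> e : counts.entrySet()) {
            if (sb.length() > 0) sb.append("|");
            sb.append(e.getKey()).append("=").append(e.getValue());
        }
        return sb.toString();
    }
}

This sketch uses the Spark 2.x UserDefinedAggregateFunction API, matching the SparkSession-era style of this section; in Spark 3.x that API is deprecated in favor of org.apache.spark.sql.expressions.Aggregator.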