• Deduplicating data in multiple columns of a Hive table with Spark


    This article covers the following scenario: given the data in a Hive table, check several of its columns for duplicate values (deduplication).

    1. First, sort out the dependencies: all the Spark-related packages go into pom.xml.

    spark-hive is the key dependency for processing Hive tables with Spark.

    <dependencies>
            <dependency>
                <groupId>org.apache.spark</groupId>
                <artifactId>spark-core_2.10</artifactId>
                <version>1.6.0</version>
                <scope>provided</scope>
            </dependency>
            <dependency>
                <groupId>org.apache.spark</groupId>
                <artifactId>spark-hive_2.10</artifactId>
                <version>1.6.0</version>
                <scope>provided</scope>
            </dependency>
            <dependency>
                <groupId>org.apache.spark</groupId>
                <artifactId>spark-sql_2.10</artifactId>
                <version>1.6.0</version>
                <scope>provided</scope>
            </dependency>
            <dependency>
                <groupId>com.alibaba</groupId>
                <artifactId>fastjson</artifactId>
                <version>1.2.19</version>
            </dependency>
        </dependencies>

    2. The Spark client

    package com.xiaoju.kangaroo.duplicate;
    
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.sql.SQLContext;
    import org.apache.spark.sql.hive.HiveContext;
    
    import java.io.Serializable;
    
    // A thin wrapper that builds the SparkConf/JavaSparkContext and hands out SQL and Hive contexts.
    public class SparkClient implements Serializable{
        private SparkConf sparkConf;
        private JavaSparkContext javaSparkContext;
    
        public SparkClient() {
            initSparkConf();
            javaSparkContext = new JavaSparkContext(sparkConf);
        }
    
    
        public SQLContext getSQLContext() {
            return new SQLContext(javaSparkContext);
        }
    
        public HiveContext getHiveContext() {
            return new HiveContext(javaSparkContext);
        }
    
        // Configure the application: name, warehouse directory, and yarn-client master.
        private void initSparkConf() {
            try {
                String warehouseLocation = System.getProperty("user.dir");
                sparkConf = new SparkConf()
                        .setAppName("duplicate")
                        .set("spark.sql.warehouse.dir", warehouseLocation)
                        .setMaster("yarn-client");
            } catch (Exception ex) {
                ex.printStackTrace();
            }
        }
    
    }
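
    Used on its own, the client simply hands back a HiveContext for running SQL against Hive. Below is a minimal usage sketch (not part of the original code; the class name and the table in the query are placeholders):

    package com.xiaoju.kangaroo.duplicate;

    import org.apache.spark.sql.DataFrame;
    import org.apache.spark.sql.hive.HiveContext;

    public class SparkClientDemo {
        public static void main(String[] args) {
            // Obtain a HiveContext from the client and run a simple query against Hive.
            HiveContext hiveContext = new SparkClient().getHiveContext();
            DataFrame df = hiveContext.sql("select * from some_db.some_table limit 10");
            df.show();
        }
    }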

    3. The duplicate-checking flow

    package com.xiaoju.kangaroo.duplicate;
    
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.function.FlatMapFunction;
    import org.apache.spark.api.java.function.Function;
    import org.apache.spark.api.java.function.Function2;
    import org.apache.spark.api.java.function.PairFunction;
    import org.apache.spark.sql.DataFrame;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.hive.HiveContext;
    import scala.Tuple2;
    
    import java.io.Serializable;
    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    
    public class SparkDuplicate implements Serializable  {
    
        private transient SparkClient sparkClient;
        private transient HiveContext hiveContext;
        private String db;
        private String tb;
        private String pt;
        private String cols;
    
        public SparkDuplicate(String db, String tb, String pt, String cols) {
            this.db = db;
            this.tb = tb;
            this.pt = pt;
            this.cols = cols;
            this.sparkClient = new SparkClient();
            this.hiveContext = sparkClient.getHiveContext();
        }
    
        public void duplicate() {
            // Build the partition filter and load the whole partition as a DataFrame.
            String partition = formatPartition(pt);
            String query = String.format("select * from %s.%s where %s", db, tb, partition);
            System.out.println(query);
            DataFrame rows = hiveContext.sql(query);
            JavaRDD<Row> rdd = rows.toJavaRDD();
            // For every row, emit one "column@value" string per checked column, count the
            // occurrences of each string, keep only those seen more than once, and merge
            // the per-key results into a single map on the driver.
            Map<String, Integer> repeatRetMap = rdd.flatMap(new FlatMapFunction<Row, String>() {
                public Iterable<String> call(Row row) throws Exception {
                    HashMap<String, Object> rowMap = formatRowMap(row);
                    List<String> sList = new ArrayList<String>();
                    String[] colList = cols.split(",");
                    for (String col : colList) {
                        sList.add(col + "@" + rowMap.get(col));
                    }
                    return sList;
                }
            }).mapToPair(new PairFunction<String, String, Integer>() {
                public Tuple2<String, Integer> call(String s) throws Exception {
                    return new Tuple2<String, Integer>(s, 1);
                }
            }).reduceByKey(new Function2<Integer, Integer, Integer>() {
                public Integer call(Integer integer, Integer integer2) throws Exception {
                    return integer + integer2;
                }
            }).map(new Function<Tuple2<String,Integer>, Map<String, Integer>>() {
                public Map<String, Integer> call(Tuple2<String, Integer> stringIntegerTuple2) throws Exception {
                    Map<String, Integer> retMap = new HashMap<String, Integer>();
                    if (stringIntegerTuple2._2 > 1) {
                        retMap.put(stringIntegerTuple2._1, stringIntegerTuple2._2);
                    }
                    return retMap;
                }
            }).reduce(new Function2<Map<String, Integer>, Map<String, Integer>, Map<String, Integer>>() {
                public Map<String, Integer> call(Map<String, Integer> stringIntegerMap, Map<String, Integer> stringIntegerMap2) throws Exception {
                    stringIntegerMap.putAll(stringIntegerMap2);
                    return stringIntegerMap;
                }
            });

            // Print every value that appears more than once together with its count.
            for (Map.Entry<String, Integer> entry : repeatRetMap.entrySet()) {
                if (entry.getValue() > 1) {
                    System.out.println("Duplicate value: " + entry.getKey() + ", count: " + entry.getValue());
                }
            }
        }
    
        // Turn the partition argument into a HiveQL WHERE clause. Supports
        // "pt=<value>" / "dt=<value>", "<year>/w=<week>", and "<year>/<month>/<day>[/<hour>]".
        private String formatPartition(String partition) {
            String format = "";
            if (partition.startsWith("pt") || partition.startsWith("dt")) {
                String[] items = partition.split("=");
                for (int i = 0; i < items.length; i++) {
                    if (items[i].equals("pt") || items[i].equals("dt")) {
                        format += items[i];
                    } else {
                        format += "='" + items[i] + "'";
                    }
                }
            } else {
                String[] keys;
                if (partition.contains("w=")){
                    keys = new String[] {"year", "week"};
                    partition = partition.replace("w=", "");
                }
                else{
                    keys = new String[] {"year","month","day", "hour"};
                }
                String[] items = partition.split("/");
                for(int i=0; i<items.length; i++) {
                    if (i == items.length-1) {
                        format += keys[i] + "='" + items[i] + "'";
                    } else {
                        format += keys[i] + "='" + items[i] + "' and ";
                    }
                }
            }
            return format;
        }
    
        // Build a column-name -> value map for one row, based on the row's schema.
        private HashMap<String, Object> formatRowMap(Row row){
            HashMap<String, Object> rowMap = new HashMap<String, Object>();
            try {
                for (int i=0; i<row.schema().fields().length; i++) {
                    String colName = row.schema().fields()[i].name();
                    Object colValue = row.get(i);
                    rowMap.put(colName, colValue);
    
                }
            }catch (Exception ex) {
                ex.printStackTrace();
            }
            return rowMap;
        }
    
        public static void main(String[] args) {
            // args: <db> <table> <partition> <comma-separated columns to check>
            String db = args[0];
            String tb = args[1];
            String pt = args[2];
            String cols = args[3];
            SparkDuplicate sparkDuplicate = new SparkDuplicate(db, tb, pt, cols);
            sparkDuplicate.duplicate();
        }
    }
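
    For comparison, the same per-column duplicate counts can also be expressed with the DataFrame API instead of hand-written RDD transformations. The sketch below is not the author's implementation; it assumes the same Spark 1.6 HiveContext setup, and the helper class and its parameters are illustrative only:

    package com.xiaoju.kangaroo.duplicate;

    import org.apache.spark.sql.DataFrame;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.hive.HiveContext;

    public class DataFrameDuplicateSketch {
        // Print every value of the given column that occurs more than once, with its count.
        public static void printDuplicates(HiveContext hiveContext, String table, String where, String col) {
            DataFrame rows = hiveContext.sql("select * from " + table + " where " + where);
            DataFrame dups = rows.groupBy(col)   // one group per distinct value of the column
                    .count()                     // adds a "count" column
                    .filter("`count` > 1");      // keep only values that repeat
            for (Row r : dups.collect()) {
                System.out.println("Duplicate value: " + col + "@" + r.get(0) + ", count: " + r.getLong(1));
            }
        }
    }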

    4. How to run it

    The submission script:

    #!/bin/bash
    source /etc/profile
    source ~/.bash_profile
    db=$1
    table=$2
    partition=$3
    cols=$4
    spark-submit \
        --queue=root.zhiliangbu_prod_datamonitor \
        --driver-memory 500M \
        --executor-memory 13G \
        --num-executors 50 \
        spark-duplicate-1.0-SNAPSHOT-jar-with-dependencies.jar ${db} ${table} ${partition} ${cols}

    Run it, passing the database, table, partition and the comma-separated columns to check (the partition 2017/07/11 is expanded into year='2017' and month='07' and day='11'):

    sh run.sh gulfstream_ods g_order 2017/07/11 area,type

    The output:

    Duplicate value: area@179, count: 225
    Duplicate value: area@80, count: 7398
    Duplicate value: area@82, count: 69823
    Duplicate value: area@81, count: 98317
    Duplicate value: area@84, count: 91775
    Duplicate value: area@83, count: 72053
    Duplicate value: area@180, count: 2362
    Duplicate value: area@86, count: 264487
    Duplicate value: area@181, count: 2927
    Duplicate value: area@85, count: 230484
    Duplicate value: area@88, count: 87527
    Duplicate value: area@87, count: 74987
    Duplicate value: area@89, count: 130297
    Duplicate value: area@188, count: 24463
    Duplicate value: area@189, count: 15699
    Duplicate value: area@186, count: 13517
    Duplicate value: area@187, count: 4774
    Duplicate value: area@184, count: 5022
    Duplicate value: area@185, count: 6737
    Duplicate value: area@182, count: 12705
    Duplicate value: area@183, count: 18961
    Duplicate value: area@289, count: 20715
    Duplicate value: area@168, count: 15179
    Duplicate value: area@169, count: 1276
    Duplicate value: area@91, count: 31664
    Duplicate value: area@90, count: 61261
    Duplicate value: area@93, count: 32496
    Duplicate value: area@92, count: 55877
    Duplicate value: area@95, count: 40933
    Duplicate value: area@94, count: 32564
    Duplicate value: area@290, count: 300
    Duplicate value: area@97, count: 21405
    Duplicate value: area@170, count: 37696
    Duplicate value: area@291, count: 212
    Duplicate value: area@96, count: 12442
    Duplicate value: area@99, count: 2526
    Duplicate value: area@98, count: 17456
    Duplicate value: area@298, count: 12688
    Duplicate value: area@177, count: 17285
    Duplicate value: area@178, count: 11511
    Duplicate value: area@299, count: 6622
    Duplicate value: area@175, count: 9573
    Duplicate value: area@296, count: 2416
    Duplicate value: area@176, count: 8109
    Duplicate value: area@297, count: 27915
    Duplicate value: area@173, count: 58942
    Duplicate value: area@294, count: 18842
    Duplicate value: area@295, count: 3482
    Duplicate value: area@174, count: 31452
    Duplicate value: area@292, count: 11436
    Duplicate value: area@171, count: 656
    Duplicate value: area@172, count: 31557
    Duplicate value: area@293, count: 1726
    Duplicate value: type@1, count: 288479
    Duplicate value: type@0, count: 21067365
  • Original article: https://www.cnblogs.com/kangoroo/p/7221600.html