• CIB Training Scripts For TPC-H Benchmark


    http://52.11.56.155:7180/
    http://52.11.56.155:8888/

    impala-shell -i 172.31.25.244

    sudo -u hdfs hdfs dfs -du -h /

    http://54.149.20.119:25000/

    第一步:
    准备工作
    生成tpc-h测试数据
    在TPC-H的官网http://www.tpc.org/tpch/上下载dbgen工具http://www.tpc.org/tpch/spec/tpch_2_17_0.zip
    [root@ip-172-31-25-243 training]# wget http://www.tpc.org/tpch/spec/tpch_2_17_0.zip

    确认下载后的结果:
    [root@ip-172-31-25-243 training]# ls
    tpch_2_17_0.zip

    如果没有安装解压工具,请安装解压工具
    [root@ip-172-31-25-243 training]# yum install unzip
    解压tpc-h数据生成工具,确认加压结果:
    [root@ip-172-31-25-243 training]# unzip tpch_2_17_0.zip
    [root@ip-172-31-25-243 training]# ls
    __MACOSX tpch_2_17_0 tpch_2_17_0.zip

    切换到相应目录并修改参数
    [root@ip-172-31-25-243 training]# cd tpch_2_17_0
    [root@ip-172-31-25-243 tpch_2_17_0]# ls
    dbgen dev-tools ref_data
    [root@ip-172-31-25-243 tpch_2_17_0]# cd dbgen
    [root@ip-172-31-25-243 dbgen]# ls
    answers bm_utils.c check_answers dbgen.dsp dss.ddl dsstypes.h makefile.suite PORTING.NOTES qgen.vcproj reference rnd.h shared.h text.c tpch.sln variants
    bcd2.c BUGS column_split.sh dists.dss dss.h HISTORY permute.c print.c queries release.h rng64.c speed_seed.c tpcd.h tpch.vcproj varsub.c
    bcd2.h build.c config.h driver.c dss.ri load_stub.c permute.h qgen.c README rnd.c rng64.h tests tpch.dsw update_release.sh

    [root@ip-172-31-25-243 dbgen]# cp makefile.suite makefile
    [root@ip-172-31-25-243 dbgen]# vi makefile
    将其中的如下几行:
    CC =
    DATABASE=
    MACHINE =
    WORKLOAD =
    修改成:
    CC = gcc
    DATABASE= ORACLE
    MACHINE = LINUX
    WORKLOAD = TPCH

    构建生成工具
    [root@ip-172-31-25-243 dbgen]# make
    查看结果
    [root@ip-172-31-25-243 dbgen]# ls
    answers bm_utils.c build.o dbgen driver.o dsstypes.h makefile permute.o qgen queries rnd.c rng64.h speed_seed.o tpcd.h update_release.sh
    bcd2.c bm_utils.o check_answers dbgen.dsp dss.ddl HISTORY makefile.suite PORTING.NOTES qgen.c README rnd.h rng64.o tests tpch.dsw variants
    bcd2.h BUGS column_split.sh dists.dss dss.h load_stub.c permute.c print.c qgen.o reference rnd.o shared.h text.c tpch.sln varsub.c
    bcd2.o build.c config.h driver.c dss.ri load_stub.o permute.h print.o qgen.vcproj release.h rng64.c speed_seed.c text.o tpch.vcproj varsub.o
    发现多了两个两个可执行文件dbgen和ggen

    切换到dbgen目录下并运行生生数据程序
    [root@ip-172-31-25-243 dbgen]# ./dbgen -s 10

    查看运行结果:
    [root@ip-172-31-25-243 dbgen]# ls *.tbl
    customer.tbl lineitem.tbl nation.tbl orders.tbl partsupp.tbl part.tbl region.tbl supplier.tbl


    第二步:
    将数据写入hdfs
    [root@ip-172-31-25-243 dbgen]# hdfs dfs -mkdir /user/training
    [root@ip-172-31-25-243 dbgen]# hdfs dfs -mkdir /user/training/tpch
    [root@ip-172-31-25-243 dbgen]# hdfs dfs -mkdir /user/training/tpch/customer
    [root@ip-172-31-25-243 dbgen]# hdfs dfs -mkdir /user/training/tpch/lineitem
    [root@ip-172-31-25-243 dbgen]# hdfs dfs -mkdir /user/training/tpch/nation
    [root@ip-172-31-25-243 dbgen]# hdfs dfs -mkdir /user/training/tpch/orders
    [root@ip-172-31-25-243 dbgen]# hdfs dfs -mkdir /user/training/tpch/part
    [root@ip-172-31-25-243 dbgen]# hdfs dfs -mkdir /user/training/tpch/partsupp
    [root@ip-172-31-25-243 dbgen]# hdfs dfs -mkdir /user/training/tpch/region
    [root@ip-172-31-25-243 dbgen]# hdfs dfs -mkdir /user/training/tpch/supplier
    [root@ip-172-31-25-243 dbgen]# hdfs dfs -put customer.tbl /user/training/tpch/customer
    [root@ip-172-31-25-243 dbgen]# hdfs dfs -put lineitem.tbl /user/training/tpch/lineitem
    [root@ip-172-31-25-243 dbgen]# hdfs dfs -put nation.tbl /user/training/tpch/nation
    [root@ip-172-31-25-243 dbgen]# hdfs dfs -put orders.tbl /user/training/tpch/orders
    [root@ip-172-31-25-243 dbgen]# hdfs dfs -put part.tbl /user/training/tpch/part
    [root@ip-172-31-25-243 dbgen]# hdfs dfs -put partsupp.tbl /user/training/tpch/partsupp
    [root@ip-172-31-25-243 dbgen]# hdfs dfs -put region.tbl /user/training/tpch/region
    [root@ip-172-31-25-243 dbgen]# hdfs dfs -put supplier.tbl /user/training/tpch/supplier
    大家可以看一下数据大小以及数据的分布情况,每个表的大小

    第三步:
    进入hue界面
    http://52.11.56.155:8888/
    用户名admin,密码admin
    进入Query Editors —> Hive
    同时打开另一个标签页,进入进入Query Editors —> Impala


    创建数据库和表
    —-创建数据库
    在hive标签页中输入
    create database training

    在界面左边的DataBase部分,点击刷新之后就可以看到新建的数据库training
    选中该数据库,然后再右边的编辑窗口中输入以下建表语句:


    在hive标签页执行
    -- create tables and load data
    create external table customer (C_CUSTKEY INT, C_NAME STRING, C_ADDRESS STRING, C_NATIONKEY INT, C_PHONE STRING, C_ACCTBAL DOUBLE, C_MKTSEGMENT STRING, C_COMMENT STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXTFILE LOCATION '/user/training/tpch/customer';
    create external table lineitem (L_ORDERKEY INT, L_PARTKEY INT, L_SUPPKEY INT, L_LINENUMBER INT, L_QUANTITY DOUBLE, L_EXTENDEDPRICE DOUBLE, L_DISCOUNT DOUBLE, L_TAX DOUBLE, L_RETURNFLAG STRING, L_LINESTATUS STRING, L_SHIPDATE STRING, L_COMMITDATE STRING, L_RECEIPTDATE STRING, L_SHIPINSTRUCT STRING, L_SHIPMODE STRING, L_COMMENT STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXTFILE LOCATION '/user/training/tpch/lineitem';
    create external table nation (N_NATIONKEY INT, N_NAME STRING, N_REGIONKEY INT, N_COMMENT STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXTFILE LOCATION '/user/training/tpch/nation';
    create external table orders (O_ORDERKEY INT, O_CUSTKEY INT, O_ORDERSTATUS STRING, O_TOTALPRICE DOUBLE, O_ORDERDATE STRING, O_ORDERPRIORITY STRING, O_CLERK STRING, O_SHIPPRIORITY INT, O_COMMENT STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXTFILE LOCATION '/user/training/tpch/orders';
    create external table part (P_PARTKEY INT, P_NAME STRING, P_MFGR STRING, P_BRAND STRING, P_TYPE STRING, P_SIZE INT, P_CONTAINER STRING, P_RETAILPRICE DOUBLE, P_COMMENT STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXTFILE LOCATION '/user/training/tpch/part';
    create external table partsupp (PS_PARTKEY INT, PS_SUPPKEY INT, PS_AVAILQTY INT, PS_SUPPLYCOST DOUBLE, PS_COMMENT STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXTFILE LOCATION'/user/training/tpch/partsupp';
    create external table region (R_REGIONKEY INT, R_NAME STRING, R_COMMENT STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXTFILE LOCATION '/user/training/tpch/region';
    create external table supplier (S_SUPPKEY INT, S_NAME STRING, S_ADDRESS STRING, S_NATIONKEY INT, S_PHONE STRING, S_ACCTBAL DOUBLE, S_COMMENT STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXTFILE LOCATION '/user/training/tpch/supplier';

    -- create orc tables;
    create table customer_orc (C_CUSTKEY INT, C_NAME STRING, C_ADDRESS STRING, C_NATIONKEY INT, C_PHONE STRING, C_ACCTBAL DOUBLE, C_MKTSEGMENT STRING, C_COMMENT STRING) STORED AS orc;
    create table lineitem_orc (L_ORDERKEY INT, L_PARTKEY INT, L_SUPPKEY INT, L_LINENUMBER INT, L_QUANTITY DOUBLE, L_EXTENDEDPRICE DOUBLE, L_DISCOUNT DOUBLE, L_TAX DOUBLE, L_RETURNFLAG STRING, L_LINESTATUS STRING, L_SHIPDATE STRING, L_COMMITDATE STRING, L_RECEIPTDATE STRING, L_SHIPINSTRUCT STRING, L_SHIPMODE STRING, L_COMMENT STRING) STORED AS orc;
    create table nation_orc (N_NATIONKEY INT, N_NAME STRING, N_REGIONKEY INT, N_COMMENT STRING) STORED AS orc;
    create table orders_orc (O_ORDERKEY INT, O_CUSTKEY INT, O_ORDERSTATUS STRING, O_TOTALPRICE DOUBLE, O_ORDERDATE STRING, O_ORDERPRIORITY STRING, O_CLERK STRING, O_SHIPPRIORITY INT, O_COMMENT STRING) STORED AS orc;
    create table part_orc (P_PARTKEY INT, P_NAME STRING, P_MFGR STRING, P_BRAND STRING, P_TYPE STRING, P_SIZE INT, P_CONTAINER STRING, P_RETAILPRICE DOUBLE, P_COMMENT STRING) STORED AS orc;
    create table partsupp_orc (PS_PARTKEY INT, PS_SUPPKEY INT, PS_AVAILQTY INT, PS_SUPPLYCOST DOUBLE, PS_COMMENT STRING) STORED AS orc;
    create table region_orc (R_REGIONKEY INT, R_NAME STRING, R_COMMENT STRING) STORED AS orc;
    create table supplier_orc (S_SUPPKEY INT, S_NAME STRING, S_ADDRESS STRING, S_NATIONKEY INT, S_PHONE STRING, S_ACCTBAL DOUBLE, S_COMMENT STRING) STORED AS orc;


    在Impala标签页执行

    invalidate metadata

    然后点击屏幕左边的刷新按钮,可以看到刚才在hive中建立的database training.
    -- create parquet tables
    create table customer_parquet (C_CUSTKEY INT, C_NAME STRING, C_ADDRESS STRING, C_NATIONKEY INT, C_PHONE STRING, C_ACCTBAL DOUBLE, C_MKTSEGMENT STRING, C_COMMENT STRING) STORED AS parquet;
    create table lineitem_parquet (L_ORDERKEY INT, L_PARTKEY INT, L_SUPPKEY INT, L_LINENUMBER INT, L_QUANTITY DOUBLE, L_EXTENDEDPRICE DOUBLE, L_DISCOUNT DOUBLE, L_TAX DOUBLE, L_RETURNFLAG STRING, L_LINESTATUS STRING, L_SHIPDATE STRING, L_COMMITDATE STRING, L_RECEIPTDATE STRING, L_SHIPINSTRUCT STRING, L_SHIPMODE STRING, L_COMMENT STRING) STORED AS parquet;
    create table nation_parquet (N_NATIONKEY INT, N_NAME STRING, N_REGIONKEY INT, N_COMMENT STRING) STORED AS parquet;
    create table orders_parquet (O_ORDERKEY INT, O_CUSTKEY INT, O_ORDERSTATUS STRING, O_TOTALPRICE DOUBLE, O_ORDERDATE STRING, O_ORDERPRIORITY STRING, O_CLERK STRING, O_SHIPPRIORITY INT, O_COMMENT STRING) STORED AS parquet;
    create table part_parquet (P_PARTKEY INT, P_NAME STRING, P_MFGR STRING, P_BRAND STRING, P_TYPE STRING, P_SIZE INT, P_CONTAINER STRING, P_RETAILPRICE DOUBLE, P_COMMENT STRING) STORED AS parquet;
    create table partsupp_parquet (PS_PARTKEY INT, PS_SUPPKEY INT, PS_AVAILQTY INT, PS_SUPPLYCOST DOUBLE, PS_COMMENT STRING) STORED AS parquet;
    create table region_parquet (R_REGIONKEY INT, R_NAME STRING, R_COMMENT STRING) STORED AS parquet;
    create table supplier_parquet (S_SUPPKEY INT, S_NAME STRING, S_ADDRESS STRING, S_NATIONKEY INT, S_PHONE STRING, S_ACCTBAL DOUBLE, S_COMMENT STRING) STORED AS parquet;

    —hive: load data to orc format
    insert overwrite table lineitem_orc select * from lineitem;
    insert overwrite table customer_orc select * from customer;
    insert overwrite table nation_orc select * from nation;
    insert overwrite table orders_orc select * from orders;
    insert overwrite table part_orc select * from part;
    insert overwrite table partsupp_orc select * from partsupp;
    insert overwrite table region_orc select * from region;
    insert overwrite table supplier_orc select * from supplier;

    —impala: load data to parquet format
    insert overwrite table lineitem_parquet select * from lineitem;
    insert overwrite table customer_parquet select * from customer;
    insert overwrite table nation_parquet select * from nation;
    insert overwrite table orders_parquet select * from orders;
    insert overwrite table part_parquet select * from part;
    insert overwrite table partsupp_parquet select * from partsupp;
    insert overwrite table region_parquet select * from region;
    insert overwrite table supplier_parquet select * from supplier;


    —impala compute stats
    compute stats customer;
    compute stats lineitem;
    compute stats nation;
    compute stats orders;
    compute stats part;
    compute stats partsupp;
    compute stats region;
    compute stats supplier;

    compute stats customer_parquet;
    compute stats lineitem_parquet;
    compute stats nation_parquet;
    compute stats orders_parquet;
    compute stats part_parquet;
    compute stats partsupp_parquet;
    compute stats region_parquet;
    compute stats supplier_parquet;

    第四步:
    --Query1:单表统计分析

    --hive

    -- hive+text
    SELECT
    L_RETURNFLAG, L_LINESTATUS, SUM(L_QUANTITY), SUM(L_EXTENDEDPRICE), SUM(L_EXTENDEDPRICE*(1-L_DISCOUNT)), SUM(L_EXTENDEDPRICE*(1-L_DISCOUNT)*(1+L_TAX)), AVG(L_QUANTITY), AVG(L_EXTENDEDPRICE), AVG(L_DISCOUNT), COUNT(1)
    FROM
    lineitem
    WHERE
    L_SHIPDATE<='1998-09-02'
    GROUP BY L_RETURNFLAG, L_LINESTATUS
    ORDER BY L_RETURNFLAG, L_LINESTATUS;

    -- hive+orc
    SELECT
    L_RETURNFLAG, L_LINESTATUS, SUM(L_QUANTITY), SUM(L_EXTENDEDPRICE), SUM(L_EXTENDEDPRICE*(1-L_DISCOUNT)), SUM(L_EXTENDEDPRICE*(1-L_DISCOUNT)*(1+L_TAX)), AVG(L_QUANTITY), AVG(L_EXTENDEDPRICE), AVG(L_DISCOUNT), COUNT(1)
    FROM
    lineitem_orc
    WHERE
    L_SHIPDATE<='1998-09-02'
    GROUP BY L_RETURNFLAG, L_LINESTATUS
    ORDER BY L_RETURNFLAG, L_LINESTATUS;

    -- impala+text
    SELECT
    L_RETURNFLAG, L_LINESTATUS, SUM(L_QUANTITY), SUM(L_EXTENDEDPRICE), SUM(L_EXTENDEDPRICE*(1-L_DISCOUNT)), SUM(L_EXTENDEDPRICE*(1-L_DISCOUNT)*(1+L_TAX)), AVG(L_QUANTITY), AVG(L_EXTENDEDPRICE), AVG(L_DISCOUNT), COUNT(1)
    FROM
    lineitem
    WHERE
    L_SHIPDATE<='1998-09-02'
    GROUP BY L_RETURNFLAG, L_LINESTATUS
    ORDER BY L_RETURNFLAG, L_LINESTATUS;

    --impala+parquet
    SELECT
    L_RETURNFLAG, L_LINESTATUS, SUM(L_QUANTITY), SUM(L_EXTENDEDPRICE), SUM(L_EXTENDEDPRICE*(1-L_DISCOUNT)), SUM(L_EXTENDEDPRICE*(1-L_DISCOUNT)*(1+L_TAX)), AVG(L_QUANTITY), AVG(L_EXTENDEDPRICE), AVG(L_DISCOUNT), cast(COUNT(1) as int)
    FROM
    lineitem_parquet
    WHERE
    L_SHIPDATE<='1998-09-02'
    GROUP BY L_RETURNFLAG, L_LINESTATUS
    ORDER BY L_RETURNFLAG, L_LINESTATUS;

    第五步:
    --Query2:多表联合查询
    --hive+text
    select
    l_orderkey, sum(l_extendedprice*(1-l_discount)) as revenue, o_orderdate, o_shippriority
    from
    customer c join orders o
    on c.c_mktsegment = 'BUILDING' and c.c_custkey = o.o_custkey
    join lineitem l
    on l.l_orderkey = o.o_orderkey
    where
    o_orderdate < '1995-03-15' and l_shipdate > '1995-03-15'
    group by l_orderkey, o_orderdate, o_shippriority
    order by revenue desc, o_orderdate
    limit 10;

    --hive+orc
    select
    l_orderkey, sum(l_extendedprice*(1-l_discount)) as revenue, o_orderdate, o_shippriority
    from
    customer_orc c join orders_orc o
    on c.c_mktsegment = 'BUILDING' and c.c_custkey = o.o_custkey
    join lineitem_orc l
    on l.l_orderkey = o.o_orderkey
    where
    o_orderdate < '1995-03-15' and l_shipdate > '1995-03-15'
    group by l_orderkey, o_orderdate, o_shippriority
    order by revenue desc, o_orderdate
    limit 10;

    --impala+text
    select
    l_orderkey, sum(l_extendedprice*(1-l_discount)) as revenue, o_orderdate, o_shippriority
    from
    customer c join orders o
    on c.c_mktsegment = 'BUILDING' and c.c_custkey = o.o_custkey
    join lineitem l
    on l.l_orderkey = o.o_orderkey
    where
    o_orderdate < '1995-03-15' and l_shipdate > '1995-03-15'
    group by l_orderkey, o_orderdate, o_shippriority
    order by revenue desc, o_orderdate
    limit 10;

    --impala+parquet
    select
    l_orderkey, sum(l_extendedprice*(1-l_discount)) as revenue, o_orderdate, o_shippriority
    from
    customer_parquet c join orders_parquet o
    on c.c_mktsegment = 'BUILDING' and c.c_custkey = o.o_custkey
    join lineitem_parquet l
    on l.l_orderkey = o.o_orderkey
    where
    o_orderdate < '1995-03-15' and l_shipdate > '1995-03-15'
    group by l_orderkey, o_orderdate, o_shippriority
    order by revenue desc, o_orderdate
    limit 10;

    —-hdfs
    数据的块分布

    --hive演示要点:
    1. Hive的Shell执行方式和Hue界面对比
    2. Hive建表语句,Hive查询,统计分析,多表关联查询和统计
    3. Hive的运行原理
    4. Hive的SQL执行速度问题
    5. 系统资源使用情况
    6. 并发访问的问题,资源使用

    --impala演示
    1. impala shell与Hue
    2. impala与hive执行速度的对比
    3. orc,parquet的压缩比
    4. parquet的运行效率提升
    5. 系统资源使用情况
    6. impala的运行原理
    7. impala运行时统计信息,impala的诊断和调优

  • 相关阅读:
    JVM -- Full GC触发条件总结以及解决策略
    java实现-图的相关操作
    Integer的intValue()方法
    Java transient关键字
    Redis 单线程模型介绍
    String类的intern()方法 -- 重用String对象,节省内存消耗
    数据库阿里连接池 druid配置详解
    redis 实现发布/订阅模式
    Redis实现队列
    redis 实现分布式锁
  • 原文地址:https://www.cnblogs.com/littlesuccess/p/4524730.html
Copyright © 2020-2023  润新知