• Hive in Practice (Hive 0.12)


    Versions: CDH 5.0.0 + Hadoop 2.3.0 + Hive 0.12

    I. Raw data:

    1. Local data

    [root@node33 data]# ll
    total 12936
    -rw-r--r--. 1 root root 13245467 May  1 17:08 hbase-data.csv
    [root@node33 data]# head -n 3 hbase-data.csv 
    1,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0,0,1
    2,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0,0,1
    3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0,0,1

    2. HDFS data:

    [root@node33 data]# hadoop fs -ls /input
    Found 1 items
    -rwxrwxrwx   1 hdfs supergroup   13245467 2014-05-01 17:09 /input/hbase-data.csv
    [root@node33 data]# hadoop fs -cat /input/* | head -n 3
    1,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0,0,1
    2,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0,0,1
    3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0,0,1
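
    The upload step is not shown here; assuming the standard HDFS CLI, something like the following would have placed the file in /input:

    [root@node33 data]# hadoop fs -mkdir /input
    [root@node33 data]# hadoop fs -put hbase-data.csv /input/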


    II. Creating Hive tables:

    1. Hive external table:

    [root@node33 hive]# cat employees_ext.sql 
    create external table if not exists employees_ext(
    	id	int,
    	x1	float,
    	x2	float,
    	x3	float,
    	x4	float,
    	x5	float,
    	x6	float,
    	x7	float,
    	x8	float,
    	x9	float,
    	y	int)
    row format delimited fields terminated by ','
    location '/input/';


    Create the table by running on the client: hive -f employees_ext.sql
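
    To sanity-check that the external table reads the CSV in place, a quick query can be run from the client (a minimal check, not part of the original post):

    [root@node33 hive]# hive -e "select * from employees_ext limit 3;"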

    2. Hive table (partitioned)

    [root@node33 hive]# cat employees.sql 
    create table employees(
    	id	int,
    	x1	float,
    	x2	float,
    	x3	float,
    	x4	float,
    	x5	float,
    	x6	float,
    	x7	float,
    	x8	float,
    	x9	float
    )
    partitioned by (y int);

    Create the table by running on the client: hive -f employees.sql
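
    Because y is declared as a partition column, it belongs to the table schema but is not stored inside the data files. A describe (a quick check, not part of the original post) should list y under the partition information:

    [root@node33 hive]# hive -e "describe employees;"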

    3. Hive table (stored as ORC)

    [root@node33 hive]# cat employees_orc.sql 
    create table employees_orc(
    	id	int,
    	x1	float,
    	x2	float,
    	x3	float,
    	x4	float,
    	x5	float,
    	x6	float,
    	x7	float,
    	x8	float,
    	x9	float
    )
    partitioned by (y int)
    row format serde "org.apache.hadoop.hive.ql.io.orc.OrcSerde"
    stored as orc;


    Run: hive -f employees_orc.sql
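
    Note that stored as orc already selects the ORC serde, so the explicit row format serde line above is redundant, though harmless. To confirm the storage format after creation (a quick check, not part of the original post):

    [root@node33 hive]# hive -e "describe formatted employees_orc;"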

    III. Loading data:

    1. Loading employees_ext into employees:

    [root@node33 hive]# cat employees_ext-to-employees.sql 
    
    set hive.exec.dynamic.partition=true;
    set hive.exec.dynamic.partition.mode=nonstrict;
    set hive.exec.max.dynamic.partitions.pernode=1000;
    
    insert overwrite table employees 
    	partition(y)
    select 
    	emp_ext.id,
    	emp_ext.x1,
    	emp_ext.x2,
    	emp_ext.x3,
    	emp_ext.x4,
    	emp_ext.x5,
    	emp_ext.x6,
    	emp_ext.x7,
    	emp_ext.x8,
    	emp_ext.x9,
    	emp_ext.y
    from employees_ext emp_ext;


    Run: hive -f employees_ext-to-employees.sql. The first two settings enable dynamic partitioning and relax it to nonstrict mode, so the partition values for y are taken entirely from the query result; the third raises the per-node cap on dynamically created partitions. Part of the log is shown below:

    Partition default.employees{y=1} stats: [num_files: 1, num_rows: 0, total_size: 3622, raw_data_size: 0]
    Partition default.employees{y=2} stats: [num_files: 1, num_rows: 0, total_size: 4060, raw_data_size: 0]
    Partition default.employees{y=3} stats: [num_files: 1, num_rows: 0, total_size: 910, raw_data_size: 0]
    Partition default.employees{y=5} stats: [num_files: 1, num_rows: 0, total_size: 699, raw_data_size: 0]
    Partition default.employees{y=6} stats: [num_files: 1, num_rows: 0, total_size: 473, raw_data_size: 0]
    Partition default.employees{y=7} stats: [num_files: 1, num_rows: 0, total_size: 13561851, raw_data_size: 0]
    Table default.employees stats: [num_partitions: 6, num_files: 6, num_rows: 0, total_size: 13571615, raw_data_size: 0]
    MapReduce Jobs Launched: 
    Job 0: Map: 1   Cumulative CPU: 6.78 sec   HDFS Read: 13245660 HDFS Write: 13571615 SUCCESS
    Total MapReduce CPU Time Spent: 6 seconds 780 msec
    OK
    Time taken: 186.743 seconds
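
    The log above shows that six partitions (y=1, 2, 3, 5, 6, 7) were created dynamically from the values in the y column. They can be listed with a quick check (not part of the original post):

    [root@node33 hive]# hive -e "show partitions employees;"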

    Check the HDFS file sizes:

    [root@node33 hive]# hadoop fs -count /user/hive/warehouse/employees
               7            6           13571615 /user/hive/warehouse/employees

    Check the HDFS file contents:

    bash-4.1$ hadoop fs -cat /user/hive/warehouse/employees/y=1/* | head -n 1
    11.5210113.644.491.171.780.068.750.00.0

    (The output above was copied from a screenshot. The fields appear concatenated because employees was created without a row format clause, so Hive stored the data with its default, non-printable field delimiter \001 (Ctrl-A). Note also that y is absent from the file: partition-column values are encoded in the directory path, not in the data.)
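
    One way to read such files from the shell is to translate the \001 delimiter into commas (a sketch, not part of the original post):

    bash-4.1$ hadoop fs -cat /user/hive/warehouse/employees/y=1/* | head -n 1 | tr '\001' ','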

    2. Loading employees_ext into employees_orc:

    [root@node33 hive]# cat employees_ext-to-employees_orc.sql 
    
    set hive.exec.dynamic.partition=true;
    set hive.exec.dynamic.partition.mode=nonstrict;
    set hive.exec.max.dynamic.partitions.pernode=1000;
    
    insert overwrite table employees_orc 
    	partition(y)
    select 
    	emp_ext.id,
    	emp_ext.x1,
    	emp_ext.x2,
    	emp_ext.x3,
    	emp_ext.x4,
    	emp_ext.x5,
    	emp_ext.x6,
    	emp_ext.x7,
    	emp_ext.x8,
    	emp_ext.x9,
    	emp_ext.y
    from employees_ext emp_ext;


    Run: hive -f employees_ext-to-employees_orc.sql. Part of the log:

    Partition default.employees_orc{y=1} stats: [num_files: 1, num_rows: 0, total_size: 2355, raw_data_size: 0]
    Partition default.employees_orc{y=2} stats: [num_files: 1, num_rows: 0, total_size: 2539, raw_data_size: 0]
    Partition default.employees_orc{y=3} stats: [num_files: 1, num_rows: 0, total_size: 1290, raw_data_size: 0]
    Partition default.employees_orc{y=5} stats: [num_files: 1, num_rows: 0, total_size: 1165, raw_data_size: 0]
    Partition default.employees_orc{y=6} stats: [num_files: 1, num_rows: 0, total_size: 955, raw_data_size: 0]
    Partition default.employees_orc{y=7} stats: [num_files: 1, num_rows: 0, total_size: 1424599, raw_data_size: 0]
    Table default.employees_orc stats: [num_partitions: 6, num_files: 6, num_rows: 0, total_size: 1432903, raw_data_size: 0]
    MapReduce Jobs Launched: 
    Job 0: Map: 1   Cumulative CPU: 7.84 sec   HDFS Read: 13245660 HDFS Write: 1432903 SUCCESS
    Total MapReduce CPU Time Spent: 7 seconds 840 msec
    OK
    Time taken: 53.014 seconds


    Check the HDFS file sizes:

    [root@node33 hive]# hadoop fs -count /user/hive/warehouse/employees_orc
               7            6            1432903 /user/hive/warehouse/employees_orc


    Check the HDFS file contents:

    ORC is a binary, compressed format, so hadoop fs -cat on these files does not produce readable text (the original post showed the output as a screenshot here).
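
    Querying through Hive is the reliable way to inspect ORC data (a minimal check, not part of the original post):

    [root@node33 hive]# hive -e "select * from employees_orc where y=1 limit 1;"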

    3. Comparing the two:

    Table              Time       Compression ratio (vs. 13245660 bytes read)
    employees          186.7 s    13571615 / 13245660 = 1.0246
    employees_orc       53.0 s     1432903 / 13245660 = 0.108

    Time-wise, ORC performs much better, and its compression ratio is much better as well.

    That said, this test was run on a single node in a personal VM, so the timing numbers carry limited weight; the compression ratio, however, is still a reasonable reference.

    IV. Exporting data

    1. The employees table:

    [root@node33 hive]# cat export_employees.sql 
    
    insert overwrite local directory '/opt/hivedata/employees.dat'
    row format delimited
    fields terminated by ','
    select 
    	emp.id,
    	emp.x1, 
    	emp.x2, 
    	emp.x3, 
    	emp.x4, 
    	emp.x5, 
    	emp.x6, 
    	emp.x7, 
    	emp.x8, 
    	emp.x9, 
    	emp.y
    from employees emp;

    Run: hive -f export_employees.sql. (Note: insert overwrite local directory replaces any existing contents of the target directory, so it should point at a dedicated path.)
    Part of the log:

    MapReduce Total cumulative CPU time: 9 seconds 630 msec
    Ended Job = job_1398958404577_0007
    Copying data to local directory /opt/hivedata/employees.dat
    Copying data to local directory /opt/hivedata/employees.dat
    MapReduce Jobs Launched: 
    Job 0: Map: 1   Cumulative CPU: 9.63 sec   HDFS Read: 13572220 HDFS Write: 13978615 SUCCESS
    Total MapReduce CPU Time Spent: 9 seconds 630 msec
    OK
    Time taken: 183.841 seconds

    Inspect the data:

    [root@node33 hive]# ll /opt/hivedata/employees.dat/
    total 13652
    -rw-r--r--. 1 root root 13978615 May  2 05:15 000000_0
    [root@node33 hive]# head -n 1 /opt/hivedata/employees.dat/000000_0 
    1,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
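
    A line count is a quick way to confirm the export is complete (a minimal check, not part of the original post):

    [root@node33 hive]# wc -l /opt/hivedata/employees.dat/000000_0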


    2. The employees_orc table:

    [root@node33 hive]# cat export_employees_orc.sql 
    
    insert overwrite local directory '/opt/hivedata/employees_orc.dat'
    row format delimited
    fields terminated by ','
    select 
    	emp.id,
    	emp.x1, 
    	emp.x2, 
    	emp.x3, 
    	emp.x4, 
    	emp.x5, 
    	emp.x6, 
    	emp.x7, 
    	emp.x8, 
    	emp.x9, 
    	emp.y
    from employees_orc emp;

    Run: hive -f export_employees_orc.sql

    Part of the log:

    MapReduce Total cumulative CPU time: 4 seconds 920 msec
    Ended Job = job_1398958404577_0008
    Copying data to local directory /opt/hivedata/employees_orc.dat
    Copying data to local directory /opt/hivedata/employees_orc.dat
    MapReduce Jobs Launched: 
    Job 0: Map: 1   Cumulative CPU: 4.92 sec   HDFS Read: 1451352 HDFS Write: 13978615 SUCCESS
    Total MapReduce CPU Time Spent: 4 seconds 920 msec
    OK
    Time taken: 41.686 seconds


    Inspect the data:

    [root@node33 hive]# head -n 1 /opt/hivedata/employees_orc.dat/000000_0 
    1,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
    [root@node33 hive]# ll /opt/hivedata/employees_orc.dat/
    total 13652
    -rw-r--r--. 1 root root 13978615 May  2 05:18 000000_0


    The exported file differs in size from the raw data: the original is 13245467 bytes, while the local export is 13978615 bytes. This is a formatting effect of the float columns: for example, 0 in the raw data is written back as 0.0 (compare the first raw line in section I with the first exported line above).


    Share, grow, and be happy.

    Please credit the original blog when reposting: http://blog.csdn.net/fansy1990
