创建分桶的表
-- Bucketed table: rows are hashed on id into 4 buckets,
-- and rows inside each bucket are kept sorted by id.
create table t_buck (
    id   int,
    name string
)
clustered by (id)
sorted by (id)
into 4 buckets;
分桶表的数据不能直接用 load 导入,而是从其他表查询出来再插入的;
插入时需要在语句中加入分桶约束(cluster by,或 distribute by 加 sort by)。
hive的存储格式介绍
hive 的DML操作
https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DML
插入
Standard syntax: INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...) [IF NOT EXISTS]] select_statement1 FROM from_statement; INSERT INTO TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...)] select_statement1 FROM from_statement; Hive extension (multiple inserts): FROM from_statement INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...) [IF NOT EXISTS]] select_statement1 [INSERT OVERWRITE TABLE tablename2 [PARTITION ... [IF NOT EXISTS]] select_statement2] [INSERT INTO TABLE tablename2 [PARTITION ...] select_statement2] ...; FROM from_statement INSERT INTO TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...)] select_statement1 [INSERT INTO TABLE tablename2 [PARTITION ...] select_statement2] [INSERT OVERWRITE TABLE tablename2 [PARTITION ... [IF NOT EXISTS]] select_statement2] ...; Hive extension (dynamic partition inserts): INSERT OVERWRITE TABLE tablename PARTITION (partcol1[=val1], partcol2[=val2] ...) select_statement FROM from_statement; INSERT INTO TABLE tablename PARTITION (partcol1[=val1], partcol2[=val2] ...) select_statement FROM from_statement;
设置
-- Make Hive enforce bucketing when inserting into a bucketed table.
-- NOTE(review): this property was removed/always-on in Hive 2.x — confirm
-- against the target Hive version.
set hive.enforce.bucketing=true;
-- Set the number of reduce tasks equal to the table's bucket count (4 here).
set mapreduce.job.reduces = 4;
-- Legacy (MR1-era) property name for the same setting.
-- Fixed typo: was "mapred.resuce.tasks".
set mapred.reduce.tasks=2;
-- Populate the bucketed table from a staging table; cluster by (id)
-- both distributes and sorts rows by id (ascending) so each reducer
-- writes one bucket file.
insert into table t_buck select id,name from t_sz02 cluster by (id) ; -- observe the reduce-task count at this point
-- Equivalent to cluster by, but distribute by + sort by lets you choose
-- the sort direction (desc here); cluster by only sorts ascending.
insert into table t_buck select id,name from t_sz02 distribute by (id) sort by (id desc);
使用 sort by 可以指定排序方向(asc/desc);使用 cluster by 则不能指定方向,默认按升序排列。
最后就能得到分桶数据
分桶相当于 MapReduce 中的分区(按 hash 把记录分发给不同的 reducer);而 hive 中的分区表则是把数据放在按分区值划分的不同文件夹里。
保存数据的方式
将结果保存到一张已经存在的hive表中
-- Save a query result into a brand-new table (CTAS); the "as" keyword is mandatory here.
create table t_tmp as select id,name from t_sz02; -- "as" is required
-- Save the result into an already-existing table; no "as" keyword is used.
-- Use "insert overwrite" instead of "insert into" to replace the existing data.
insert into table t_buck select id,name from t_sz02 distribute by (id) sort by (id desc);
-- Export the result to the local filesystem (drop "local" to write to HDFS).
-- "overwrite" is mandatory here: TextOutputFormat cannot append, only overwrite.
insert overwrite local directory '/home/hadoop/study' select * from t_tmp;
没有分桶的表也可以使用 sort by 来查询或导出数据。同理,分桶表里的数据,正是把未分桶表的数据经过 distribute by / sort by(或 cluster by)查询之后插入得到的。