• hadoop-hive学习笔记


    create table hive_1(id string,name string ,gender string)
    row format delimited fields terminated by ','
    stored as TEXTFILE;

    load data local inpath '/luozt/hive_001.txt' into table hive_1 ;

    create EXTERNAL table hive_2(id string,name string ,gender string)
    row format delimited fields terminated by ','
    stored as TEXTFILE;

    load data inpath '/luo/hive_001.txt' into table hive_2 ;
    //查询记录数
    count 'hive_1'
    //清空表
    truncate table log_struct;

    //删除表
    drop table log_struct;

    create table partition_table
    (name string ,salary float,gender string,level string)
    partitioned by(dt string,dept string)
    row format delimited fields terminated by ','
    stored as TEXTFILE;

    desc partition_table

    show partitions partition_table;//查看分区表的分区信息
    //给分区表插入数据
    load data local inpath '/luozt/par.txt' into table partition_table partition(dt='2014-04-01',dept='yonyu');

    添加分区:
    alter table partition_table add partition(dt='2014-04-03',dept='yonyou3') location '/user/hive/warehouse/luo.db/partition_table/dt=2014-04-03/dept=yonyou3';

    删除分区:
    alter table partition_table drop partition(dt='2014-04-03',dept='yonyou4')


    select * from partition_table where salary>7600;
    //嵌套
    from (select name,salary from partition_table)e select e.name,e.salary where e.salary>7600;

    //in 的用法

    select * from partition_table where salary in(7000,6700);

    //case的用法
    select name,salary,
    case
    when salary<6800 then 'L1'
    when salary>6800 and salary <8000 then 'L2'
    when salary>8100 then 'L3'
    else 'L0'
    end as salary_level
    from partition_table;

    //having的用法

    select gender,sum(salary) from partition_table group by gender;


    //练习join
    create table group1 (user string,score int)
    row format delimited fields terminated by ','
    stored as TEXTFILE;

    //
    create table group_join (user string,class string)
    row format delimited fields terminated by ','
    stored as TEXTFILE;

    //普通的join
    select b.class,a.score from group1 a join group_join b on (a.user=b.user);
    //有个表很小时用mapjoin(b) b为小表
    select /*+MAPJOIN(b)*/ b.class,a.score from group1 a join group_join b on (a.user=b.user);
    //left Semi join

    //分组
    select user ,sum(score) from group1 group by user; ----分组时select的字段要全部作为group字段
    //优化
    set hive.map.aggr=true

    //order by
    create table orderby_test (user string,class string,math int,english int)
    row format delimited fields terminated by ','
    stored as TEXTFILE;

    //默认升序
    select * from orderby_test order by math; --desc改为降序,若将set hive.mapred.mode=strict;则要加上limit

    //sort by 不受set hive.mapred.mode=strict的影响 可以指定 set mapred.reduce.tasks=<number> sort by 只会在么给reduce上进行排序,reduce输出的数据时有序的,提高全局排序的效率

    //union all hive不支持顶层union ,只能将union封装在子查询中,且必须为union的查询输出定义别名
    select * from (select count(*) from group1 union all select count(*) from orderby_test) temp;

    //记得这种用法
    select name,height,mark
    from
    (select name,height,'0' as mark from a
    union all
    select name height,'1' as mark from b) t;


    //索引
    create table index_test(id int,name string) partitioned by (dt string) row format delimited fields terminated by ',' stored as TEXTFILE;
    //先创建一个临时表
    create table temp(id int,name string ,dt string ) row format delimited fields terminated by ',' stored as TEXTFILE;
    //动态分区
    set hive.exec.dynamic.partition.mode=nonstrict;
    set hive.exec.dynamic.partition=true;

    insert overwrite table index_test partition(dt) select id,name,dt from temp;

    //索引 创建索引时要有partition 否则不行
    create index index1 on table index_test(id) AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler' with DEFERRED REBUILD;

    alter index index1 on index_test rebuild;

    show index on index_test;
    show partitions index_test;

    //Bucket 桶 就是抽样

    create table tb_tmp(id int,age int,name string,timeflag bigint) row format delimited fields terminated by ',' ;
    create table tb_stu(id int,age int,name string,timeflag bigint) clustered by(id) sorted by (age) into 5 buckets row format delimited fields terminated by ','

    insert into table tb_stu select id,age,name,timeflag from tb_tmp;

    利用桶进行查询
    select * from tb_stu tablesample(bucket 1 out of 5 on id);


    //存储类型和复合数据类型
    rcfile 是直接load不进去的 要用临时表insert进去

    //array
    create table log_array(ip string,uid array<bigint>) partitioned by (dt string) row format delimited fields terminated by ',' collection items terminated by '|' stored AS TEXTFILE;

    load data local inpath '/luo/log_array.txt' into table log_array partition(dt=20150902);
    //查询array中的值
    select uid[1] from log_array;
    select ip,size(uid) from log_array where dt=20150902;
    select ip from log_array where dt=20150902 and array_contains(uid,4732974)

    //map
    create table log_map(ts string,ip string,type string,logtype string,request Map<string,string>,response Map<string,string>)
    row format delimited fields terminated by '#' collection items terminated by '&' Map keys terminated by '=' stored as TEXTFILE;

    //查询
    select request['src'] from log_map;

    //struct
    create table log_struct(ip string,user struct<name:string,age:int>)
    row format delimited fields terminated by ','
    collection items terminated by '#'
    stored as TEXTFILE;

    数据:192.168.1.1,wow#23
    192.168.1.1,wow#23
    192.168.1.1,wow#23
    192.168.1.1,wow#23
    192.168.1.1,wow#23
    192.168.1.1,wow#23

    select user.name from log_struct;

  • 相关阅读:
    k8s使用私有镜像仓库
    spark client 配置lzo
    jvm系列(四):jvm调优-命令篇
    mysqldump 备份还原数据库
    df 卡死及ls无法查看文件
    记录一次服务器断电,直接进入救援模式
    nginx开机自启脚本
    mongodb启动关闭脚本
    mongo数据备份恢复
    centos 快速配置网络
  • 原文地址:https://www.cnblogs.com/luo-mao/p/5872420.html
Copyright © 2020-2023  润新知