• Basic usage of Hive (1)


    create table t_a(name string,numb int)
    row format delimited
    fields terminated by ',';
    
    create table t_b(name string,nick string)
    row format delimited
    fields terminated by ',';
    
    load data local inpath '/root/hivetest/a.txt' into table t_a;
    load data local inpath '/root/hivetest/b.txt' into table t_b;
    
    
    -- Types of joins
    -- 1) Inner join
    -- Without an ON condition, an inner join degenerates into a Cartesian product
    select a.*,b.*
    from t_a a inner join t_b b;
    
    
    -- Inner join with an explicit join condition
    select a.*,b.*
    from 
    t_a a join t_b b on a.name=b.name;
    
    -- 2) Left outer join (left join)
    select a.*,b.*
    from 
    t_a a left outer join t_b b on a.name=b.name;
    
    
    -- 3) Right outer join (right join)
    select a.*,b.*
    from 
    t_a a right outer join t_b b on a.name=b.name;
    
    -- 4) Full outer join
    select a.*,b.*
    from
    t_a a full outer join t_b b on a.name=b.name;
    
    
    -- 5) Left semi join: returns only left-table columns, and each left row at most once if it has a match on the right (similar to an IN/EXISTS subquery)
    select a.*
    from 
    t_a a left semi join t_b b on a.name=b.name;
    
    
    -- Grouped aggregation queries
    
    -- Row-by-row expressions
    select ip,upper(url),access_time  -- these expressions are evaluated once for every row of the data
    from t_pv_log;
    
    -- Total number of visits per URL
    
    select url,count(1) as cnts   -- these expressions are evaluated once per group of rows
    from t_pv_log 
    group by url;
    
    -- For each URL, the largest IP address among its visitors
    
    select url,max(ip)
    from t_pv_log
    group by url;
    
    -- For each user and page, the latest access time among all of that user's visits to that page
    
    select ip,url,max(access_time) 
    from  t_pv_log
    group by ip,url;
    
    
    -- A fuller grouped-aggregation example
    -- Given the following data:
    /*
    192.168.33.3,http://www.edu360.cn/stu,2017-08-04 15:30:20
    192.168.33.3,http://www.edu360.cn/teach,2017-08-04 15:35:20
    192.168.33.4,http://www.edu360.cn/stu,2017-08-04 15:30:20
    192.168.33.4,http://www.edu360.cn/job,2017-08-04 16:30:20
    192.168.33.5,http://www.edu360.cn/job,2017-08-04 15:40:20
    
    
    192.168.33.3,http://www.edu360.cn/stu,2017-08-05 15:30:20
    192.168.44.3,http://www.edu360.cn/teach,2017-08-05 15:35:20
    192.168.33.44,http://www.edu360.cn/stu,2017-08-05 15:30:20
    192.168.33.46,http://www.edu360.cn/job,2017-08-05 16:30:20
    192.168.33.55,http://www.edu360.cn/job,2017-08-05 15:40:20
    
    
    192.168.133.3,http://www.edu360.cn/register,2017-08-06 15:30:20
    192.168.111.3,http://www.edu360.cn/register,2017-08-06 15:35:20
    192.168.34.44,http://www.edu360.cn/pay,2017-08-06 15:30:20
    192.168.33.46,http://www.edu360.cn/excersize,2017-08-06 16:30:20
    192.168.33.55,http://www.edu360.cn/job,2017-08-06 15:40:20
    192.168.33.46,http://www.edu360.cn/excersize,2017-08-06 16:30:20
    192.168.33.25,http://www.edu360.cn/job,2017-08-06 15:40:20
    192.168.33.36,http://www.edu360.cn/excersize,2017-08-06 16:30:20
    192.168.33.55,http://www.edu360.cn/job,2017-08-06 15:40:20
    
    */
    -- Create a table mapped to the data above
    create table t_access(ip string,url string,access_time string)
    partitioned by (dt string)
    row format delimited fields terminated by ',';
    
    
    -- Load the data
    load data local inpath '/root/hivetest/access.log.0804' into table t_access partition(dt='2017-08-04');
    load data local inpath '/root/hivetest/access.log.0805' into table t_access partition(dt='2017-08-05');
    load data local inpath '/root/hivetest/access.log.0806' into table t_access partition(dt='2017-08-06');
    
    -- Show the table's partitions
    show partitions t_access;
    
    -- For each day after 2017-08-04, the total number of visits to http://www.edu360.cn/job and the largest visitor IP address (several equivalent formulations follow)
    select dt,'http://www.edu360.cn/job',count(1),max(ip)
    from t_access
    where url='http://www.edu360.cn/job'
    group by dt having dt>'2017-08-04';
    
    
    select dt,max(url),count(1),max(ip)
    from t_access
    where url='http://www.edu360.cn/job'
    group by dt having dt>'2017-08-04';
    
    
    select dt,url,count(1),max(ip)
    from t_access
    where url='http://www.edu360.cn/job'
    group by dt,url having dt>'2017-08-04';
    
    
    
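    -- The same query, with the dt filter moved into the WHERE clause; since dt is the partition column, this lets Hive prune partitions before grouping: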
    select dt,url,count(1),max(ip)
    from t_access
    where url='http://www.edu360.cn/job' and dt>'2017-08-04'
    group by dt,url;
    
    
    -- For each day after 2017-08-04, the total number of visits to each page and the largest visitor IP address
    
    select dt,url,count(1),max(ip)
    from t_access
    where dt>'2017-08-04'
    group by dt,url;
    
    -- Same as above, but only return rows whose total visit count is greater than 2
    -- Approach 1: HAVING on the aggregate alias
    select dt,url,count(1) as cnts,max(ip)
    from t_access
    where dt>'2017-08-04'
    group by dt,url having cnts>2;
    
    
    -- Approach 2: a subquery
    select dt,url,cnts,max_ip
    from
    (select dt,url,count(1) as cnts,max(ip) as max_ip
    from t_access
    where dt>'2017-08-04'
    group by dt,url) tmp
    where cnts>2;
    
    
    -- For reference, the rows of t_access with url='http://www.edu360.cn/job' and dt>'2017-08-04':
    +----------------+---------------------------------+-----------------------+--------------+--+
    |  t_access.ip   |          t_access.url           | t_access.access_time  | t_access.dt  |
    +----------------+---------------------------------+-----------------------+--------------+--+
    | 192.168.33.46  | http://www.edu360.cn/job        | 2017-08-05 16:30:20   | 2017-08-05   |
    | 192.168.33.55  | http://www.edu360.cn/job        | 2017-08-05 15:40:20   | 2017-08-05   |
    | 192.168.33.55  | http://www.edu360.cn/job        | 2017-08-06 15:40:20   | 2017-08-06   |
    | 192.168.33.25  | http://www.edu360.cn/job        | 2017-08-06 15:40:20   | 2017-08-06   |
    | 192.168.33.55  | http://www.edu360.cn/job        | 2017-08-06 15:40:20   | 2017-08-06   |
    +----------------+---------------------------------+-----------------------+--------------+--+
    
    
    /*
    Complex data types in Hive
    */
    -- Arrays
    -- Given the following data:
    战狼2,吴京:吴刚:龙母,2017-08-16
    三生三世十里桃花,刘亦菲:痒痒,2017-08-20
    普罗米修斯,苍老师:小泽老师:波多老师,2017-09-17
    美女与野兽,吴刚:加藤鹰,2017-09-17
    
    -- Create a table mapped to it:
    create table t_movie(movie_name string,actors array<string>,first_show date)
    row format delimited fields terminated by ','
    collection items terminated by ':';
    
    -- Load the data
    load data local inpath '/root/hivetest/actor.dat' into table t_movie;
    load data local inpath '/root/hivetest/actor.dat.2' into table t_movie;
    
    -- Queries
    select movie_name,actors[0],first_show from t_movie;
    
    select movie_name,actors,first_show
    from t_movie where array_contains(actors,'吴刚');
    
    select movie_name
    ,size(actors) as actor_number
    ,first_show
    from t_movie;
    
    
    -- Maps
    -- Given the following data:
    1,zhangsan,father:xiaoming#mother:xiaohuang#brother:xiaoxu,28
    2,lisi,father:mayun#mother:huangyi#brother:guanyu,22
    3,wangwu,father:wangjianlin#mother:ruhua#sister:jingtian,29
    4,mayun,father:mayongzhen#mother:angelababy,26
    
    -- Create a table mapped to the data above
    create table t_family(id int,name string,family_members map<string,string>,age int)
    row format delimited fields terminated by ','
    collection items terminated by '#'
    map keys terminated by ':';
    
    -- Load the data
    load data local inpath '/root/hivetest/fm.dat' into table t_family;
    
    +--------------+----------------+----------------------------------------------------------------+---------------+--+
    | t_family.id  | t_family.name  |                    t_family.family_members                     | t_family.age  |
    +--------------+----------------+----------------------------------------------------------------+---------------+--+
    | 1            | zhangsan       | {"father":"xiaoming","mother":"xiaohuang","brother":"xiaoxu"}  | 28            |
    | 2            | lisi           | {"father":"mayun","mother":"huangyi","brother":"guanyu"}       | 22            |
    | 3            | wangwu         | {"father":"wangjianlin","mother":"ruhua","sister":"jingtian"}  | 29            |
    | 4            | mayun          | {"father":"mayongzhen","mother":"angelababy"}                  | 26            |
    +--------------+----------------+----------------------------------------------------------------+---------------+--+
    
    -- Each person's father and sister
    select id,name,family_members["father"] as father,family_members["sister"] as sister,age
    from t_family;
    
    -- The family relations each person has
    select id,name,map_keys(family_members) as relations,age
    from  t_family;
    
    -- The names of each person's family members
    select id,name,map_values(family_members) as relations,age
    from  t_family;
    
    -- The number of family members each person has
    select id,name,size(family_members) as relations,age
    from  t_family;
    
    -- Everyone who has a brother, and who that brother is
    -- Approach 1: a single query
    select id,name,age,family_members['brother']
    from t_family  where array_contains(map_keys(family_members),'brother');
    
    
    -- Approach 2: a subquery
    select id,name,age,family_members['brother']
    from
    (select id,name,age,map_keys(family_members) as relations,family_members 
    from t_family) tmp 
    where array_contains(relations,'brother');
    
    
    /*  Hive data type: struct
    
    Given the following data:
    1,zhangsan,18:male:深圳
    2,lisi,28:female:北京
    3,wangwu,38:male:广州
    4,赵六,26:female:上海
    5,钱琪,35:male:杭州
    6,王八,48:female:南京
    */
    
    -- Create a table mapped to the data above
    
    drop table if exists t_user;
    create table t_user(id int,name string,info struct<age:int,sex:string,addr:string>)
    row format delimited fields terminated by ','
    collection items terminated by ':';
    
    -- Load the data
    load data local inpath '/root/hivetest/user.dat' into table t_user;
    
    -- Each person's id, name, and address
    select id,name,info.addr
    from t_user;
    
    
    /*
        Conditional function: case when
    */
    
    0: jdbc:hive2://localhost:10000> select * from t_user;
    +------------+--------------+----------------------------------------+--+
    | t_user.id  | t_user.name  |              t_user.info               |
    +------------+--------------+----------------------------------------+--+
    | 1          | zhangsan     | {"age":18,"sex":"male","addr":"深圳"}    |
    | 2          | lisi         | {"age":28,"sex":"female","addr":"北京"}  |
    | 3          | wangwu       | {"age":38,"sex":"male","addr":"广州"}    |
    | 4          | 赵六           | {"age":26,"sex":"female","addr":"上海"}  |
    | 5          | 钱琪           | {"age":35,"sex":"male","addr":"杭州"}    |
    | 6          | 王八           | {"age":48,"sex":"female","addr":"南京"}  |
    +------------+--------------+----------------------------------------+--+
    
    Requirement: return each user's id, name, and an age bracket (under 30: young; 30 to 40: middle-aged; 40 and above: senior)
    select id,name,
    case
    when info.age<30 then '青年'
    when info.age>=30 and info.age<40 then '中年'
    else '老年'
    end
    from t_user;
    
    
    
    -- The if() function
    0: jdbc:hive2://localhost:10000> select * from t_movie;
    +---------------------+------------------------+---------------------+--+
    | t_movie.movie_name  |     t_movie.actors     | t_movie.first_show  |
    +---------------------+------------------------+---------------------+--+
    | 战狼2                 | ["吴京","吴刚","龙母"]       | 2017-08-16          |
    | 三生三世十里桃花            | ["刘亦菲","痒痒"]           | 2017-08-20          |
    | 普罗米修斯               | ["苍老师","小泽老师","波多老师"]  | 2017-09-17          |
    | 美女与野兽               | ["吴刚","加藤鹰"]           | 2017-09-17          |
    +---------------------+------------------------+---------------------+--+
    
    -- Requirement: list each movie, labelled '好片儿' (good movie) if 吴刚 is among its actors, otherwise '烂片儿' (bad movie)
    
    select movie_name,actors,first_show,
    if(array_contains(actors,'吴刚'),'好片儿','烂片儿')
    from t_movie;
    
    
    
    
    
    -- The row_number() over() function
    -- Sample data:
    
    1,18,a,male
    2,19,b,male
    3,22,c,female
    4,16,d,female
    5,30,e,male
    6,26,f,female
    
    create table t_rn(id int,age int,name string,sex string)
    row format delimited fields terminated by ',';
    
    load data local inpath '/root/hivetest/rn.dat' into table t_rn;
    
    -- Number the rows within each group, then keep the top 2 of each sex by age (the two oldest per sex)
    
    select * 
    from 
    (select id,age,name,sex,
    row_number() over(partition by sex order by age desc) as rn 
    from t_rn) tmp
    where rn<3
    ;
    
    
    -- Window analytics function sum() over(): accumulates values row by row within a window
    
    0: jdbc:hive2://localhost:10000> select * from  t_access_amount;
    +----------------------+------------------------+-------------------------+--+
    | t_access_amount.uid  | t_access_amount.month  | t_access_amount.amount  |
    +----------------------+------------------------+-------------------------+--+
    | A                    | 2015-01                | 33                      |
    | A                    | 2015-02                | 10                      |
    | A                    | 2015-03                | 20                      |
    | B                    | 2015-01                | 30                      |
    | B                    | 2015-02                | 15                      |
    | B                    | 2015-03                | 45                      |
    | C                    | 2015-01                | 30                      |
    | C                    | 2015-02                | 40                      |
    | C                    | 2015-03                | 30                      |
    +----------------------+------------------------+-------------------------+--+
    
    -- Requirement: for each user, the running total of amount up to and including each month
    
    select uid,month,amount,
    sum(amount) over(partition by uid order by month rows between unbounded preceding and current row) as accumulate
    from t_access_amount;
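    -- the frame 'rows between unbounded preceding and current row' makes the window grow from each uid's first month to the current row, producing a running total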
    
    
    
    
    -- User-defined functions (UDF)
    /*
    Given the following JSON data (rating.json):
    {"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"}
    {"movie":"661","rate":"3","timeStamp":"978302109","uid":"1"}
    {"movie":"914","rate":"3","timeStamp":"978301968","uid":"1"}
    {"movie":"3408","rate":"4","timeStamp":"978300275","uid":"1"}
    
    We want to load it into Hive for analysis.
    */
    
    -- Create a table that maps each raw JSON line to a single string column
    create table t_ratingjson(json string);
    
    load data local inpath '/root/hivetest/rating.json' into table t_ratingjson;
    
    We want to turn the raw data above into the following form:
    1193,5,978300760,1
    661,3,978302109,1
    914,3,978301968,1
    3408,4,978300275,1
    
    Idea: this becomes straightforward if we can define a JSON-parsing function:
    create table t_rate
    as
    select myjson(json,1) as movie,cast(myjson(json,2) as int) as rate,myjson(json,3) as ts,myjson(json,4) as uid from t_ratingjson;
    
    Solution:
    How to define your own function in Hive:
    1. Write a Java class (extends UDF and defines a method public C evaluate(A a, B b)) that implements the desired logic: here it takes a JSON string and a 1-based field index and returns that field's value. (A hedged sketch follows this list.)
    2. Package the Java program as a jar and upload it to the machine where Hive runs.
    3. In the Hive CLI, add the jar to the classpath:
            hive> add jar /root/hivetest/myjson.jar;
    4. In the Hive CLI, create a function named myjson and bind it to your Java class:
            hive> create temporary function myjson as 'cn.edu360.hive.udf.MyJsonParser';
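    
    Below is a minimal sketch of what such a class might look like. It is an illustrative assumption, not the original author's jar: it handles only the flat, single-level JSON shown above (no nested objects, no commas or colons inside the values), takes a 1-based field index, and extends the classic org.apache.hadoop.hive.ql.exec.UDF base class mentioned in step 1.
    
    package cn.edu360.hive.udf;
    
    import org.apache.hadoop.hive.ql.exec.UDF;
    
    // Sketch of the myjson UDF: myjson(json, 1) returns "1193" for the first rating.json line.
    public class MyJsonParser extends UDF {
    
        public String evaluate(String json, int index) {
            if (json == null) {
                return null;
            }
            // strip the surrounding braces, then split into "key":"value" pairs
            String body = json.trim().replaceAll("^\\{|\\}$", "");
            String[] pairs = body.split(",");
            if (index < 1 || index > pairs.length) {
                return null;
            }
            // take the index-th pair and return its value with the quotes removed
            String[] kv = pairs[index - 1].split(":", 2);
            if (kv.length < 2) {
                return null;
            }
            return kv[1].replaceAll("\"", "");
        }
    }
    
    Once the jar is added and the temporary function is created (steps 3 and 4), the create table t_rate as select myjson(json,1) ... statement above runs unchanged.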