create table t_a(name string,numb int)
row format delimited
fields terminated by ',';

create table t_b(name string,nick string)
row format delimited
fields terminated by ',';

load data local inpath '/root/hivetest/a.txt' into table t_a;
load data local inpath '/root/hivetest/b.txt' into table t_b;


-- The various kinds of join

-- 1. Inner join

-- Cartesian product (no join condition)
select a.*,b.*
from t_a a inner join t_b b;

-- Inner join with a join condition
select a.*,b.*
from t_a a join t_b b on a.name=b.name;

-- 2. Left outer join (left join)
select a.*,b.*
from t_a a left outer join t_b b on a.name=b.name;

-- 3. Right outer join (right join)
select a.*,b.*
from t_a a right outer join t_b b on a.name=b.name;

-- 4. Full outer join
select a.*,b.*
from t_a a full outer join t_b b on a.name=b.name;

-- 5. Left semi join
select a.*
from t_a a left semi join t_b b on a.name=b.name;


-- Grouping and aggregation

-- Expressions evaluated once per row
select ip,upper(url),access_time   -- these expressions are computed for every single row
from t_pv_log;

-- Total number of visits for each URL
select url,count(1) as cnts        -- these expressions are computed once for each group
from t_pv_log
group by url;

-- For each URL, the largest IP address among its visitors
select url,max(ip)
from t_pv_log
group by url;

-- For each user and each page, the latest access time among that user's visits to the page
select ip,url,max(access_time)
from t_pv_log
group by ip,url;


-- A fuller grouping/aggregation example
-- Given the following data:
/*
192.168.33.3,http://www.edu360.cn/stu,2017-08-04 15:30:20
192.168.33.3,http://www.edu360.cn/teach,2017-08-04 15:35:20
192.168.33.4,http://www.edu360.cn/stu,2017-08-04 15:30:20
192.168.33.4,http://www.edu360.cn/job,2017-08-04 16:30:20
192.168.33.5,http://www.edu360.cn/job,2017-08-04 15:40:20
192.168.33.3,http://www.edu360.cn/stu,2017-08-05 15:30:20
192.168.44.3,http://www.edu360.cn/teach,2017-08-05 15:35:20
192.168.33.44,http://www.edu360.cn/stu,2017-08-05 15:30:20
192.168.33.46,http://www.edu360.cn/job,2017-08-05 16:30:20
192.168.33.55,http://www.edu360.cn/job,2017-08-05 15:40:20
192.168.133.3,http://www.edu360.cn/register,2017-08-06 15:30:20
192.168.111.3,http://www.edu360.cn/register,2017-08-06 15:35:20
192.168.34.44,http://www.edu360.cn/pay,2017-08-06 15:30:20
192.168.33.46,http://www.edu360.cn/excersize,2017-08-06 16:30:20
192.168.33.55,http://www.edu360.cn/job,2017-08-06 15:40:20
192.168.33.46,http://www.edu360.cn/excersize,2017-08-06 16:30:20
192.168.33.25,http://www.edu360.cn/job,2017-08-06 15:40:20
192.168.33.36,http://www.edu360.cn/excersize,2017-08-06 16:30:20
192.168.33.55,http://www.edu360.cn/job,2017-08-06 15:40:20
*/

-- Create a table that maps the data above
create table t_access(ip string,url string,access_time string)
partitioned by (dt string)
row format delimited
fields terminated by ',';

-- Load the data
load data local inpath '/root/hivetest/access.log.0804' into table t_access partition(dt='2017-08-04');
load data local inpath '/root/hivetest/access.log.0805' into table t_access partition(dt='2017-08-05');
load data local inpath '/root/hivetest/access.log.0806' into table t_access partition(dt='2017-08-06');

-- List the table's partitions
show partitions t_access;

-- After 2017-08-04: for each day, the total number of visits to http://www.edu360.cn/job
-- and the largest visitor IP address (four equivalent ways to write it)
select dt,'http://www.edu360.cn/job',count(1),max(ip)
from t_access
where url='http://www.edu360.cn/job'
group by dt having dt>'2017-08-04';

select dt,max(url),count(1),max(ip)
from t_access
where url='http://www.edu360.cn/job'
group by dt having dt>'2017-08-04';

select dt,url,count(1),max(ip)
from t_access
where url='http://www.edu360.cn/job'
group by dt,url having dt>'2017-08-04';

select dt,url,count(1),max(ip)
from t_access
where url='http://www.edu360.cn/job' and dt>'2017-08-04'
group by dt,url;

-- After 2017-08-04: for each day and each page, the total number of visits
-- and the largest visitor IP address
select dt,url,count(1),max(ip)
from t_access
where dt>'2017-08-04'
group by dt,url;

-- Same as above, but only return groups whose total number of visits is > 2
-- Option 1: having
select dt,url,count(1) as cnts,max(ip)
from t_access
where dt>'2017-08-04'
group by dt,url having cnts>2;

-- Option 2: a subquery
select dt,url,cnts,max_ip
from
(select dt,url,count(1) as cnts,max(ip) as max_ip
 from t_access
 where dt>'2017-08-04'
 group by dt,url) tmp
where cnts>2;

-- For reference, the rows of t_access for http://www.edu360.cn/job after 2017-08-04:
+---------------+--------------------------+----------------------+-------------+--+
| t_access.ip   | t_access.url             | t_access.access_time | t_access.dt |
+---------------+--------------------------+----------------------+-------------+--+
| 192.168.33.46 | http://www.edu360.cn/job | 2017-08-05 16:30:20  | 2017-08-05  |
| 192.168.33.55 | http://www.edu360.cn/job | 2017-08-05 15:40:20  | 2017-08-05  |
| 192.168.33.55 | http://www.edu360.cn/job | 2017-08-06 15:40:20  | 2017-08-06  |
| 192.168.33.25 | http://www.edu360.cn/job | 2017-08-06 15:40:20  | 2017-08-06  |
| 192.168.33.55 | http://www.edu360.cn/job | 2017-08-06 15:40:20  | 2017-08-06  |
+---------------+--------------------------+----------------------+-------------+--+


/*
Complex data types in Hive
*/

-- Arrays
-- Given the following data:
/*
战狼2,吴京:吴刚:龙母,2017-08-16
三生三世十里桃花,刘亦菲:痒痒,2017-08-20
普罗米修斯,苍老师:小泽老师:波多老师,2017-09-17
美女与野兽,吴刚:加藤鹰,2017-09-17
*/

-- Create a table that maps the data:
create table t_movie(movie_name string,actors array<string>,first_show date)
row format delimited fields terminated by ','
collection items terminated by ':';

-- Load the data
load data local inpath '/root/hivetest/actor.dat' into table t_movie;
load data local inpath '/root/hivetest/actor.dat.2' into table t_movie;

-- Queries
select movie_name,actors[0],first_show from t_movie;

select movie_name,actors,first_show
from t_movie
where array_contains(actors,'吴刚');

select movie_name,size(actors) as actor_number,first_show
from t_movie;


-- Maps
-- Given the following data:
/*
1,zhangsan,father:xiaoming#mother:xiaohuang#brother:xiaoxu,28
2,lisi,father:mayun#mother:huangyi#brother:guanyu,22
3,wangwu,father:wangjianlin#mother:ruhua#sister:jingtian,29
4,mayun,father:mayongzhen#mother:angelababy,26
*/

-- Create a table that maps the data above
create table t_family(id int,name string,family_members map<string,string>,age int)
row format delimited fields terminated by ','
collection items terminated by '#'
map keys terminated by ':';

-- Load the data
load data local inpath '/root/hivetest/fm.dat' into table t_family;

+-------------+---------------+---------------------------------------------------------------+--------------+--+
| t_family.id | t_family.name |                    t_family.family_members                    | t_family.age |
+-------------+---------------+---------------------------------------------------------------+--------------+--+
| 1           | zhangsan      | {"father":"xiaoming","mother":"xiaohuang","brother":"xiaoxu"} | 28           |
| 2           | lisi          | {"father":"mayun","mother":"huangyi","brother":"guanyu"}      | 22           |
| 3           | wangwu        | {"father":"wangjianlin","mother":"ruhua","sister":"jingtian"} | 29           |
| 4           | mayun         | {"father":"mayongzhen","mother":"angelababy"}                 | 26           |
+-------------+---------------+---------------------------------------------------------------+--------------+--+

-- Each person's father and sister
select id,name,family_members["father"] as father,family_members["sister"] as sister,age
from t_family;

-- The kinds of relatives each person has
select id,name,map_keys(family_members) as relations,age
from t_family;

-- The names of each person's relatives
select id,name,map_values(family_members) as relations,age
from t_family;

-- The number of relatives each person has
select id,name,size(family_members) as relations,age
from t_family;

-- Everyone who has a brother, and who that brother is
-- Option 1: a single statement
select id,name,age,family_members['brother']
from t_family
where array_contains(map_keys(family_members),'brother');

-- Option 2: a subquery
select id,name,age,family_members['brother']
from
(select id,name,age,map_keys(family_members) as relations,family_members
 from t_family) tmp
where array_contains(relations,'brother');
/*
The Hive struct data type
Given the following data:
1,zhangsan,18:male:深圳
2,lisi,28:female:北京
3,wangwu,38:male:广州
4,赵六,26:female:上海
5,钱琪,35:male:杭州
6,王八,48:female:南京
*/

-- Create a table that maps the data above
drop table if exists t_user;
create table t_user(id int,name string,info struct<age:int,sex:string,addr:string>)
row format delimited fields terminated by ','
collection items terminated by ':';

-- Load the data
load data local inpath '/root/hivetest/user.dat' into table t_user;

-- Query each person's id, name and address
select id,name,info.addr from t_user;


/*
Conditional control function: case when
*/
0: jdbc:hive2://localhost:10000> select * from t_user;
+-----------+-------------+---------------------------------------+--+
| t_user.id | t_user.name | t_user.info                           |
+-----------+-------------+---------------------------------------+--+
| 1         | zhangsan    | {"age":18,"sex":"male","addr":"深圳"}   |
| 2         | lisi        | {"age":28,"sex":"female","addr":"北京"} |
| 3         | wangwu      | {"age":38,"sex":"male","addr":"广州"}   |
| 4         | 赵六        | {"age":26,"sex":"female","addr":"上海"} |
| 5         | 钱琪        | {"age":35,"sex":"male","addr":"杭州"}   |
| 6         | 王八        | {"age":48,"sex":"female","addr":"南京"} |
+-----------+-------------+---------------------------------------+--+

-- Requirement: return each user's id, name and an age bracket
-- (under 30: show 青年 "young"; 30 to 40: show 中年 "middle-aged"; 40 and above: show 老年 "elderly")
select id,name,
case
  when info.age<30 then '青年'
  when info.age>=30 and info.age<40 then '中年'
  else '老年'
end
from t_user;


-- IF
0: jdbc:hive2://localhost:10000> select * from t_movie;
+---------------------+------------------------+---------------------+--+
| t_movie.movie_name  | t_movie.actors         | t_movie.first_show  |
+---------------------+------------------------+---------------------+--+
| 战狼2               | ["吴京","吴刚","龙母"]           | 2017-08-16          |
| 三生三世十里桃花    | ["刘亦菲","痒痒"]              | 2017-08-20          |
| 普罗米修斯          | ["苍老师","小泽老师","波多老师"]     | 2017-09-17          |
| 美女与野兽          | ["吴刚","加藤鹰"]              | 2017-09-17          |
+---------------------+------------------------+---------------------+--+

-- Requirement: return the movie information, showing '好片儿' (good movie) when 吴刚 is among the actors,
-- and '烂片儿' (bad movie) otherwise
select movie_name,actors,first_show,
if(array_contains(actors,'吴刚'),'好片儿','烂片儿')
from t_movie;


-- The row_number() over() function
-- Sample data:
/*
1,18,a,male
2,19,b,male
3,22,c,female
4,16,d,female
5,30,e,male
6,26,f,female
*/
create table t_rn(id int,age int,name string,sex string)
row format delimited fields terminated by ',';

load data local inpath '/root/hivetest/rn.dat' into table t_rn;

-- Assign a sequence number within each group, then keep the two oldest people of each sex
select *
from
(select id,age,name,sex,
 row_number() over(partition by sex order by age desc) as rn
 from t_rn) tmp
where rn<3;


-- Window analysis function sum() over(): accumulates values row by row within a window
0: jdbc:hive2://localhost:10000> select * from t_access_amount;
+----------------------+------------------------+-------------------------+--+
| t_access_amount.uid  | t_access_amount.month  | t_access_amount.amount  |
+----------------------+------------------------+-------------------------+--+
| A                    | 2015-01                | 33                      |
| A                    | 2015-02                | 10                      |
| A                    | 2015-03                | 20                      |
| B                    | 2015-01                | 30                      |
| B                    | 2015-02                | 15                      |
| B                    | 2015-03                | 45                      |
| C                    | 2015-01                | 30                      |
| C                    | 2015-02                | 40                      |
| C                    | 2015-03                | 30                      |
+----------------------+------------------------+-------------------------+--+

-- Requirement: for each user, the running total up to and including each month
select uid,month,amount,
sum(amount) over(partition by uid order by month rows between unbounded preceding and current row) as accumulate
from t_access_amount;


-- User-defined functions (UDF)
/*
Given the following JSON data (rating.json):
{"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"}
{"movie":"661","rate":"3","timeStamp":"978302109","uid":"1"}
{"movie":"914","rate":"3","timeStamp":"978301968","uid":"1"}
{"movie":"3408","rate":"4","timeStamp":"978300275","uid":"1"} 需要导入hive中进行数据分析 */ -- 建表映射上述数据 create table t_ratingjson(json string); load data local inpath '/root/hivetest/rating.json' into table t_ratingjson; 想把上面的原始数据变成如下形式: 1193,5,978300760,1 661,3,978302109,1 914,3,978301968,1 3408,4,978300275,1 思路:如果能够定义一个json解析函数,则很方便了 create table t_rate as select myjson(json,1) as movie,cast(myjson(json,2) as int) as rate,myjson(json,3) as ts,myjson(json,4) as uid from t_ratingjson; 解决: hive中如何定义自己的函数: 1、先写一个java类(extends UDF,重载方法public C evaluate(A a,B b)),实现你所想要的函数的功能(传入一个json字符串和一个脚标,返回一个值) 2、将java程序打成jar包,上传到hive所在的机器 3、在hive命令行中将jar包添加到classpath : hive>add jar /root/hivetest/myjson.jar; 4、在hive命令中用命令创建一个函数叫做myjson,关联你所写的这个java类 hive> create temporary function myjson as 'cn.edu360.hive.udf.MyJsonParser';