项目实战从0到1之hive（14）关于hive统计连续的案例

一:例题1

统计用户连续交易的总额、连续登陆天数、连续登陆开始和结束时间、间隔天数

1.建表:

create table deal_tb (id int, daystr varchar(10), amount decimal(10,2));

2.插入数据:

insert into deal_tb values
(1, "2019-02-08", 6214.23), 
(1, "2019-02-08", 6247.32), 
(1, "2019-02-09", 85.63), 
(1, "2019-02-09", 967.36), 
(1, "2019-02-10", 85.69), 
(1, "2019-02-12", 769.85), 
(1, "2019-02-13", 943.86), 
(1, "2019-02-14", 538.42),
(1, "2019-02-15", 369.76),
(1, "2019-02-16", 369.76),
(1, "2019-02-18", 795.15),
(1, "2019-02-19", 715.65),
(1, "2019-02-21", 537.71),
(2, "2019-02-08", 6214.23), 
(2, "2019-02-08", 6247.32), 
(2, "2019-02-09", 85.63), 
(2, "2019-02-09", 967.36), 
(2, "2019-02-10", 85.69), 
(2, "2019-02-12", 769.85), 
(2, "2019-02-13", 943.86), 
(2, "2019-02-14", 943.18),
(2, "2019-02-15", 369.76),
(2, "2019-02-18", 795.15),
(2, "2019-02-19", 715.65),
(2, "2019-02-21", 537.71),
(3, "2019-02-08", 6214.23), 
(3, "2019-02-08", 6247.32), 
(3, "2019-02-09", 85.63), 
(3, "2019-02-09", 967.36), 
(3, "2019-02-10", 85.69), 
(3, "2019-02-12", 769.85), 
(3, "2019-02-13", 943.86), 
(3, "2019-02-14", 276.81),
(3, "2019-02-15", 369.76),
(3, "2019-02-16", 369.76),
(3, "2019-02-18", 795.15),
(3, "2019-02-19", 715.65),
(3, "2019-02-21", 537.71);

3.1.因为每个用户每天会有多天记录，所以按用户和日期分组求和，使每个用户每天只有一条数据。

select 
 id, daystr, sum(amount) amount
from
 deal_tb
group by
 id, daystr;

结果

id    daystr    amount
1    2019-02-08    12461.55
1    2019-02-09    1052.99
1    2019-02-10    85.69
1    2019-02-12    769.85
1    2019-02-13    943.86
1    2019-02-14    538.42
1    2019-02-15    369.76
1    2019-02-16    369.76
1    2019-02-18    795.15
1    2019-02-19    715.65
1    2019-02-21    537.71
2    2019-02-08    12461.55
2    2019-02-09    1052.99
2    2019-02-10    85.69
2    2019-02-12    769.85
2    2019-02-13    943.86
2    2019-02-14    943.18
2    2019-02-15    369.76
2    2019-02-18    795.15
2    2019-02-19    715.65
2    2019-02-21    537.71
3    2019-02-08    12461.55
3    2019-02-09    1052.99
3    2019-02-10    85.69
3    2019-02-12    769.85
3    2019-02-13    943.86
3    2019-02-14    276.81
3    2019-02-15    369.76
3    2019-02-16    369.76
3    2019-02-18    795.15
3    2019-02-19    715.65
3    2019-02-21    537.71

3.2.根据用户ID分组按日期排序，将日期和分组序号相减得到连续登陆的开始日期(from_day)，如果开始日期相同说明连续登陆？

select
id, daystr, amount, date_sub(daystr, row_number() over(partition by id order by daystr)) from_day 
from
(
select 
 id, daystr, sum(amount) amount
from
 deal_tb
group by
 id, daystr
) a;

结果：

id    daystr    amount    from_day
1    2019-02-08    12461.55    2019-02-07
1    2019-02-09    1052.99    2019-02-07
1    2019-02-10    85.69    2019-02-07
1    2019-02-12    769.85    2019-02-08
1    2019-02-13    943.86    2019-02-08
1    2019-02-14    538.42    2019-02-08
1    2019-02-15    369.76    2019-02-08
1    2019-02-16    369.76    2019-02-08
1    2019-02-18    795.15    2019-02-09
1    2019-02-19    715.65    2019-02-09
1    2019-02-21    537.71    2019-02-10
2    2019-02-08    12461.55    2019-02-07
2    2019-02-09    1052.99    2019-02-07
2    2019-02-10    85.69    2019-02-07
2    2019-02-12    769.85    2019-02-08
2    2019-02-13    943.86    2019-02-08
2    2019-02-14    943.18    2019-02-08
2    2019-02-15    369.76    2019-02-08
2    2019-02-18    795.15    2019-02-10
2    2019-02-19    715.65    2019-02-10
2    2019-02-21    537.71    2019-02-11
3    2019-02-08    12461.55    2019-02-07
3    2019-02-09    1052.99    2019-02-07
3    2019-02-10    85.69    2019-02-07
3    2019-02-12    769.85    2019-02-08
3    2019-02-13    943.86    2019-02-08
3    2019-02-14    276.81    2019-02-08
3    2019-02-15    369.76    2019-02-08
3    2019-02-16    369.76    2019-02-08
3    2019-02-18    795.15    2019-02-09
3    2019-02-19    715.65    2019-02-09
3    2019-02-21    537.71    2019-02-10

3.3.统计用户连续交易的总额、连续登陆天数、连续登陆开始和结束时间、间隔天数

select
 id, 
 case when count(1) >= 3 then sum(amount) else -100 end sum_amount, -- 连续大于三天的交易总额，不满足的赋值为-100
 min(daystr) start_date, -- 连续登陆的开始时间
 max(daystr) end_date, -- 连续登陆的结束时间
 count(1) continuous_day, -- 连续登陆的天数
 datediff(from_day, lag(from_day, 1, from_day) over(partition by id order by from_day)) interval_day-- 间隔多少天没交易
from
(
-- 根据用户ID分组按日期排序，将日期和分组序号相减得到连续登陆的开始日期，如果开始日期相同说明连续登陆
select
id, daystr, amount, date_sub(daystr, row_number() over(partition by id order by daystr)) from_day -- 日期减去分区排序的序号，如果新日期相等，则连续登陆
from
(
-- 按用户和日期分区求和，使每个用户每天只有一条数据
select 
 id, daystr, sum(amount) amount
from
 deal_tb
group by
 id, daystr
) a
) b
group by
id, from_day;

结果：

id    sum_amount    start_date    end_date    continuous_day    interval_day

1    13600.23    2019-02-08    2019-02-10    3    0

1    2991.65    2019-02-12    2019-02-16    5    1

1    -100.00    2019-02-18    2019-02-19    2    1

1    -100.00    2019-02-21    2019-02-21    1    1

2    13600.23    2019-02-08    2019-02-10    3    0

2    3026.65    2019-02-12    2019-02-15    4    1

2    -100.00    2019-02-18    2019-02-19    2    2

2    -100.00    2019-02-21    2019-02-21    1    1

3    13600.23    2019-02-08    2019-02-10    3    0

3    2730.04    2019-02-12    2019-02-16    5    1

3    -100.00    2019-02-18    2019-02-19    2    1

3    -100.00    2019-02-21    2019-02-21    1    1

二.例题2：

一、求单月访问次数和总访问次数

1、数据说明

数据字段说明

用户名，月份，访问次数

数据格式

A,2015-01,5
A,2015-01,15
B,2015-01,5
A,2015-01,8
B,2015-01,25
A,2015-01,5
A,2015-02,4
A,2015-02,6
B,2015-02,10
B,2015-02,5
A,2015-03,16
A,2015-03,22
B,2015-03,23
B,2015-03,10
B,2015-03,1

2、数据准备

（1）创建表

use myhive;
create external table if not exists t_access(
uname string comment '用户名',
umonth string comment '月份',
ucount int comment '访问次数'
) comment '用户访问表' 
row format delimited fields terminated by "," 
location "/hive/t_access";

（2）导入数据

load data local inpath "/home/hadoop/access.txt" into table t_access;

（3）验证数据

select * from t_access;

3、结果需求

现要求出：
每个用户截止到每月为止的最大单月访问次数和累计到该月的总访问次数，结果数据格式如下

4、需求分析

此结果需要根据用户+月份进行分组

（1）先求出当月访问次数

--求当月访问次数
create table tmp_access(
name string,
mon string,
num int
); 

insert into table tmp_access 
select uname,umonth,sum(ucount)
 from t_access t group by t.uname,t.umonth;

select * from tmp_access;

（2）tmp_access进行自连接视图

create view tmp_view as 
select a.name anme,a.mon amon,a.num anum,b.name bname,b.mon bmon,b.num bnum from tmp_access a join tmp_access b 
on a.name=b.name;

select * from tmp_view;

（3）进行比较统计

select anme,amon,anum,max(bnum) as max_access,sum(bnum) as sum_access 
from tmp_view 
where amon>=bmon 
group by anme,amon,anum;

二、学生课程成绩

1、说明

use myhive;
CREATE TABLE `course` (
  `id` int,
  `sid` int ,
  `course` string,
  `score` int 
) ;

// 插入数据
// 字段解释：id, 学号， 课程， 成绩
INSERT INTO `course` VALUES (1, 1, 'yuwen', 43);
INSERT INTO `course` VALUES (2, 1, 'shuxue', 55);
INSERT INTO `course` VALUES (3, 2, 'yuwen', 77);
INSERT INTO `course` VALUES (4, 2, 'shuxue', 88);
INSERT INTO `course` VALUES (5, 3, 'yuwen', 98);
INSERT INTO `course` VALUES (6, 3, 'shuxue', 65);

2、需求

求：所有数学课程成绩大于语文课程成绩的学生的学号

1、使用case…when…将不同的课程名称转换成不同的列

create view tmp_course_view as
select sid, case course when "shuxue" then score else 0 end  as shuxue,  
case course when "yuwen" then score else 0 end  as yuwen from course;  

select * from tmp_course_view;

2、以sid分组合并取各成绩最大值

create view tmp_course_view1 as
select aa.sid, max(aa.shuxue) as shuxue, max(aa.yuwen) as yuwen from tmp_course_view aa group by sid;  

select * from tmp_course_view1;

3、比较结果

select * from tmp_course_view1 where shuxue > yuwen;

三、求每一年最大气温的那一天 + 温度

1、说明

数据格式

2010012325

具体数据

数据解释

2010012325表示在2010年01月23日的气温为25度

2、需求

比如：2010012325表示在2010年01月23日的气温为25度。现在要求使用hive，计算每一年出现过的最大气温的日期+温度。
要计算出每一年的最大气温。我用
select substr(data,1,4),max(substr(data,9,2)) from table2 group by substr(data,1,4);
出来的是年份 + 温度这两列数据例如 2015 99

但是如果我是想select 的是：具体每一年最大气温的那一天 + 温度。例如 20150109 99
请问该怎么执行hive语句。。
group by 只需要substr(data,1,4)，
但是select substr(data,1,8)，又不在group by 的范围内。
是我陷入了思维死角。一直想不出所以然。。求大神指点一下。
在select 如果所需要的。不在group by的条件里。这种情况如何去分析？

3、解析

（1）创建一个临时表tmp_weather，将数据切分

create table tmp_weather as 
select substr(data,1,4) years,substr(data,5,2) months,substr(data,7,2) days,substr(data,9,2) temp from weather;

select * from tmp_weather;

（2）创建一个临时表tmp_year_weather

create table tmp_year_weather as 
select substr(data,1,4) years,max(substr(data,9,2)) max_temp from weather group by substr(data,1,4);

select * from tmp_year_weather;

（3）将2个临时表进行连接查询

select * from tmp_year_weather a join tmp_weather b on a.years=b.years and a.max_temp=b.temp;

四、求学生选课情况

1、数据说明

（1）数据格式

id course 
1,a 
1,b 
1,c 
1,e 
2,a 
2,c 
2,d 
2,f 
3,a 
3,b 
3,c 
3,e

（2）字段含义

表示有id为1,2,3的学生选修了课程a,b,c,d,e,f中其中几门。

2、数据准备

（1）建表t_course

create table t_course(id int,course string)
row format delimited fields terminated by ",";

（2）导入数据

load data local inpath "/home/hadoop/course/course.txt" into table t_course;

3、需求

编写Hive的HQL语句来实现以下结果：表中的1表示选修，表中的0表示未选修

id    a    b    c    d    e    f
1     1    1    1    0    1    0
2     1    0    1    1    0    1
3     1    1    1    0    1    0

4、解析

第一步：

select collect_set(course) as courses from id_course;

第二步：

set hive.strict.checks.cartesian.product=false;

create table id_courses as select t1.id as id,t1.course as id_courses,t2.course courses 
from 
( select id as id,collect_set(course) as course from id_course group by id ) t1 
join 
(select collect_set(course) as course from id_course) t2;

启用严格模式：hive.mapred.mode = strict // Deprecated
hive.strict.checks.large.query = true
该设置会禁用：1. 不指定分页的orderby
　　　　　　 2. 对分区表不指定分区进行查询
　　　　　　 3. 和数据量无关，只是一个查询模式

hive.strict.checks.type.safety = true
严格类型安全，该属性不允许以下操作：1. bigint和string之间的比较
　　　　　　　　　　　　　　　　　　2. bigint和double之间的比较

hive.strict.checks.cartesian.product = true
该属性不允许笛卡尔积操作

第三步：得出最终结果：
思路：
拿出course字段中的每一个元素在id_courses中进行判断，看是否存在。

select id,
case when array_contains(id_courses, courses[0]) then 1 else 0 end as a,
case when array_contains(id_courses, courses[1]) then 1 else 0 end as b,
case when array_contains(id_courses, courses[2]) then 1 else 0 end as c,
case when array_contains(id_courses, courses[3]) then 1 else 0 end as d,
case when array_contains(id_courses, courses[4]) then 1 else 0 end as e,
case when array_contains(id_courses, courses[5]) then 1 else 0 end as f 
from id_courses;

五、求月销售额和总销售额

1、数据说明

（1）数据格式

a,01,150
a,01,200
b,01,1000
b,01,800
c,01,250
c,01,220
b,01,6000
a,02,2000
a,02,3000
b,02,1000
b,02,1500
c,02,350
c,02,280
a,03,350
a,03,250

（2）字段含义

店铺，月份，金额

2、数据准备

（1）创建数据库表t_store

use class;
create table t_store(
name string,
months int,
money int
) 
row format delimited fields terminated by ",";

（2）导入数据

load data local inpath "/home/hadoop/store.txt" into table t_store;

3、需求

编写Hive的HQL语句求出每个店铺的当月销售额和累计到当月的总销售额

4、解析

（1）按照商店名称和月份进行分组统计

create table tmp_store1 as 
select name,months,sum(money) as money from t_store group by name,months;
select * from tmp_store1;

（2）对tmp_store1 表里面的数据进行自连接

create table tmp_store2 as 
select a.name aname,a.months amonths,a.money amoney,b.name bname,b.months bmonths,b.money bmoney from tmp_store1 a 
join tmp_store1 b on a.name=b.name order by aname,amonths;

select * from tmp_store2;

（3）比较统计

select aname,amonths,amoney,sum(bmoney) as total from tmp_store2 where amonths >= bmonths group by aname,amonths,amoney;

作者：大码王

出处：https://www.cnblogs.com/huanghanyu/

-------------------------------------------

个性签名：独学而无友，则孤陋而寡闻。做一个灵魂有趣的人！

如果觉得这篇文章对你有小小的帮助的话，记得在右下角点个“推荐”哦，博主在此感谢！

万水千山总是情，打赏一分行不行，所以如果你心情还比较高兴，也是可以扫码打赏博主，哈哈哈(っ•?ω•?)っ???！

相关阅读:
main函数的一些特性
 确保函数的操作不超出数组实参的边界
 今天学习了一点sed
libevent 与事件驱动
 mvc3 action验证失败后的自定义处理
 使用spring.net+nibernate时如何用aspnet_regiis加密数据库连接字符串
 C# 中 IList IEnumable 转换成 List类型
 Nhibernate 过长的字符串报错 dehydration property
小论接口(interface)和抽象类(abstract class)的区别
 C# 语言在函数参数列表中出现this关键词的作用
原文地址：https://www.cnblogs.com/huanghanyu/p/13637778.html

项目实战从0到1之hive（14）关于hive统计连续的案例

一:例题1

二.例题2：

一、求单月访问次数和总访问次数

1、数据说明

2、数据准备

（1）创建表

（2）导入数据

（3）验证数据

3、结果需求

4、需求分析

（1）先求出当月访问次数

（2）tmp_access进行自连接视图

（3）进行比较统计

二、学生课程成绩

1、说明

2、需求

1、使用case…when…将不同的课程名称转换成不同的列

2、以sid分组合并取各成绩最大值

3、比较结果

三、求每一年最大气温的那一天 + 温度

1、说明

2、 需求

3、解析

（1）创建一个临时表tmp_weather，将数据切分

（2）创建一个临时表tmp_year_weather

（3）将2个临时表进行连接查询

四、求学生选课情况

1、数据说明

（1）数据格式

（2）字段含义

2、数据准备

（1）建表t_course

（2）导入数据

3、需求

4、解析

五、求月销售额和总销售额

1、数据说明

（1）数据格式

（2）字段含义

2、数据准备

（1）创建数据库表t_store

（2）导入数据

3、需求

4、解析

2、需求