蚂蚁森林案例背景说明
- 原始数据样例
user_low_carbon.txt 记录用户每天的蚂蚁森林低碳生活领取的流水
数据样例
u_001 2017/1/1 10
u_001 2017/1/2 150
u_001 2017/1/2 110
plant_carbon.txt 记录申领环保植物所需要减少的碳排放量
数据样例
p001 梭梭树 17
p002 沙柳 19
p003 樟子树 146
p004 胡杨 215
- 以上原始数据样例建表格式如下
表名:user_low_carbon
字段说明
user_id:用户
data_dt:日期
low_carbon:减少碳排放(g)
表名:plant_carbon
字段说明
plant_id:植物编号
plant_name:植物名
low_carbon:换购植物所需要的碳
创建表
drop table if exists user_low_carbon;
create table user_low_carbon(
user_id String,
data_dt String,
low_carbon int
)
row format delimited fields terminated by ' ';
drop table if exists plant_carbon;
create table plant_carbon(
plant_id String,
low_carbon int
)
row format delimited fields terminated by ' ';
导入数据
hive (default)> load data local inpath "/opt/module/data/user_low_carbon.txt" into table user_low_carbon;
hive (default)> load data local inpath "/opt/module/data/plant_carbon.txt" into table plant_carbon;
设置本地模式
hive (default)> set hive.exec.mode.local.auto=true;
1 需求一:蚂蚁森林植物申领统计
假设2017年1月1日开始记录低碳数据(user_low_carbon),假设2017年10月1日之前满足申领条件的用户都申领了一颗p004-胡杨,剩余的能量全部用来领取“p002-沙柳” 。统计在10月1日累计申领“p002-沙柳” 排名前10的用户信息;以及他比后一名多领了几颗沙柳。
step1 统计每个用户截止到2017-10-01日之前收集的总碳量
hive (default)> select user_id, sum(low_carbon) sum_carbon
from user_low_carbon
where date_format(regexp_replace(data_dt, '/', '-'), 'yyyy-MM-dd')<'2017-10-01'
group by user_id;
输出结果:
user_id sum_carbon
u_001 475
u_002 659
u_003 620
u_004 640
u_005 1100
u_006 830
u_007 1470
u_008 1240
u_009 930
u_010 1080
u_011 960
u_012 250
u_013 1430
u_014 1060
u_015 290
step2 获取胡杨和沙柳的能量
hive (default)> select low_carbon from plant_carbon where plant_id='004';
hive (default)> select low_carbon from plant_carbon where plant_id='002';
step3 计算每个用户申领沙柳的棵数
hive (default)> select user_id,
floor((t1.sum_carbon-t2.low_carbon)/t3.low_carbon) count_p002
from (
select user_id, sum(low_carbon) sum_carbon
from user_low_carbon
where date_format(regexp_replace(data_dt, '/', '-'), 'yyyy-MM-dd')<'2017-10-01'
group by user_id
) t1,
(
select low_carbon
from plant_carbon
where plant_id='p004'
) t2,
(
select low_carbon
from plant_carbon
where plant_id='p002'
) t3;
输出结果:
user_id count_p002
u_001 13
u_002 23
u_003 21
u_004 22
u_005 46
u_006 32
u_007 66
u_008 53
u_009 37
u_010 45
u_011 39
u_012 1
u_013 63
u_014 44
u_015 3
step4 按照每个人领取的沙柳棵数倒序排序,并获取当前用户的后一名用户领取的沙柳的棵数
统计在10月1日累计申领“p002-沙柳” 排名前10的用户信息
hive (default)> select user_id,
count_p002,
lead(count_p002, 1) over (order by count_p002 desc) lead_1_p002
from (
select user_id,
floor((t1.sum_carbon - t2.low_carbon) / t3.low_carbon) count_p002
from (
select user_id, sum(low_carbon) sum_carbon
from user_low_carbon
where date_format(regexp_replace(data_dt, '/', '-'), 'yyyy-MM-dd') < '2017-10-01'
group by user_id
) t1,
(
select low_carbon
from plant_carbon
where plant_id = 'p004'
) t2,
(
select low_carbon
from plant_carbon
where plant_id = 'p002'
) t3
) t4
limit 10;
输出结果:
user_id count_p002 lead_1_p002
u_007 66 63
u_013 63 53
u_008 53 46
u_005 46 45
u_010 45 44
u_014 44 39
u_011 39 37
u_009 37 32
u_006 32 23
u_002 23 22
step5 统计当前用户比他后一名多领了几颗沙柳
hive (default)> select user_id,
count_p002,
(count_p002-lead_1_p002) less_count
from (
select user_id,
count_p002,
lead(count_p002, 1) over (order by count_p002 desc) lead_1_p002
from (
select user_id,
floor((t1.sum_carbon-t2.low_carbon) / t3.low_carbon) count_p002
from (
select user_id, sum(low_carbon) sum_carbon
from user_low_carbon
where date_format(regexp_replace(data_dt, '/', '-'), 'yyyy-MM-dd')<'2017-10-01'
group by user_id
) t1,
(
select low_carbon
from plant_carbon
where plant_id='p004'
) t2,
(
select low_carbon
from plant_carbon
where plant_id='p002'
) t3
) t4
limit 10
) t5
order by count_p002 desc;
输出结果:
user_id count_p002 less_count
u_007 66 3
u_013 63 10
u_008 53 7
u_005 46 1
u_010 45 1
u_014 44 5
u_011 39 2
u_009 37 5
u_006 32 9
u_002 23 1
2 需求二:蚂蚁森林低碳用户排名分析
问题查询user_low_carbon表中每日流水记录,条件为:
用户在2017年,连续三天(或以上)的天数里,每天减少碳排放(low_carbon)都超过100g的用户低碳流水。
需要查询返回满足以上条件的user_low_carbon表中的记录流水。
例如用户u_002符合条件的记录如下,因为2017/1/2~2017/1/5连续四天的碳排放量之和都大于等于100
seq(key) user_id data_dt low_carbon
xxxxx10 u_002 2017/1/2 150
xxxxx11 u_002 2017/1/2 70
xxxxx12 u_002 2017/1/3 30
xxxxx13 u_002 2017/1/3 80
xxxxx14 u_002 2017/1/4 150
xxxxx14 u_002 2017/1/5 101
备注:统计方法不限于sql、procedure、python,java等
解法一
step1 将2017年,每个用户在一天内减少碳排放量大于100g的日期过滤出来
hive (default)> select user_id,
date_format(regexp_replace(data_dt, '/', '-'), 'yyyy-MM-dd') data_dt,
sum(low_carbon) sum_carbon
from user_low_carbon
where substring(data_dt, 1, 4) = '2017'
group by user_id, data_dt
having sum_carbon >= 100;
输出结果
user_id data_dt sum_carbon
u_001 2017-01-02 270
u_001 2017-01-06 135
u_002 2017-01-02 220
u_002 2017-01-03 110
u_002 2017-01-04 150
u_002 2017-01-05 101
u_003 2017-01-02 160
u_003 2017-01-03 160
u_003 2017-01-05 120
u_003 2017-01-07 120
u_004 2017-01-01 110
u_004 2017-01-03 120
u_004 2017-01-06 120
u_004 2017-01-07 130
......
step2 对于日期这一列的数据 将前两行数据以及后两行数据的日期放置当前行
hive (default)> select user_id,
data_dt,
lag(data_dt, 2, '1970-01-01') over (partition by user_id order by data_dt) lag2,
lag(data_dt, 1, '1970-01-01') over (partition by user_id order by data_dt) lag1,
lead(data_dt, 1, '9999-01-01') over (partition by user_id order by data_dt) lead1,
lead(data_dt, 2, '9999-01-01') over (partition by user_id order by data_dt) lead2
from (
select user_id,
date_format(regexp_replace(data_dt, '/', '-'), 'yyyy-MM-dd') data_dt,
sum(low_carbon) sum_carbon
from user_low_carbon
where substring(data_dt, 1, 4) = '2017'
group by user_id, data_dt
having sum_carbon >= 100
) t1;
输出结果:
user_id data_dt lag2 lag1 lead1 lead2
u_001 2017-01-02 1970-01-01 1970-01-01 2017-01-06 9999-01-01
u_001 2017-01-06 1970-01-01 2017-01-02 9999-01-01 9999-01-01
u_002 2017-01-02 1970-01-01 1970-01-01 2017-01-03 2017-01-04
u_002 2017-01-03 1970-01-01 2017-01-02 2017-01-04 2017-01-05
u_002 2017-01-04 2017-01-02 2017-01-03 2017-01-05 9999-01-01
u_002 2017-01-05 2017-01-03 2017-01-04 9999-01-01 9999-01-01
u_003 2017-01-02 1970-01-01 1970-01-01 2017-01-03 2017-01-05
u_003 2017-01-03 1970-01-01 2017-01-02 2017-01-05 2017-01-07
u_003 2017-01-05 2017-01-02 2017-01-03 2017-01-07 9999-01-01
......
step3 计算当前日期跟前后两行时间的差值
hive (default)> select user_id,
data_dt,
datediff(data_dt, lag2) lag2_diff,
datediff(data_dt, lag1) lag1_diff,
datediff(data_dt, lead1) lead1_diff,
datediff(data_dt, lead2) lead2_diff
from (
select user_id,
data_dt,
lag(data_dt, 2, '1970-01-01') over (partition by user_id order by data_dt) lag2,
lag(data_dt, 1, '1970-01-01') over (partition by user_id order by data_dt) lag1,
lead(data_dt, 1, '9999-01-01') over (partition by user_id order by data_dt) lead1,
lead(data_dt, 2, '9999-01-01') over (partition by user_id order by data_dt) lead2
from (
select user_id,
date_format(regexp_replace(data_dt, '/', '-'), 'yyyy-MM-dd') data_dt,
sum(low_carbon) sum_carbon
from user_low_carbon
where substring(data_dt, 1, 4) = '2017'
group by user_id, data_dt
having sum_carbon >= 100
) t1
) t2;
输出结果:
user_id data_dt lag2_diff lag1_diff lead1_diff lead2_diff
u_001 2017-01-02 17168 17168 -4 -2915364
u_001 2017-01-06 17172 4 -2915360 -2915360
u_002 2017-01-02 17168 17168 -1 -2
u_002 2017-01-03 17169 1 -1 -2
u_002 2017-01-04 2 1 -1 -2915362
u_002 2017-01-05 2 1 -2915361 -2915361
u_003 2017-01-02 17168 17168 -1 -3
u_003 2017-01-03 17169 1 -2 -4
u_003 2017-01-05 3 2 -2 -2915361
......
step4 过滤出最少连续3天超过100g的用户
hive (default)> select user_id,
data_dt
from (
select user_id,
data_dt,
datediff(data_dt, lag2) lag2_diff,
datediff(data_dt, lag1) lag1_diff,
datediff(data_dt, lead1) lead1_diff,
datediff(data_dt, lead2) lead2_diff
from (
select user_id,
data_dt,
lag(data_dt, 2, '1970-01-01') over (partition by user_id order by data_dt) lag2,
lag(data_dt, 1, '1970-01-01') over (partition by user_id order by data_dt) lag1,
lead(data_dt, 1, '9999-01-01') over (partition by user_id order by data_dt) lead1,
lead(data_dt, 2, '9999-01-01') over (partition by user_id order by data_dt) lead2
from (
select user_id,
date_format(regexp_replace(data_dt, '/', '-'), 'yyyy-MM-dd') data_dt,
sum(low_carbon) sum_carbon
from user_low_carbon
where substring(data_dt, 1, 4) = '2017'
group by user_id, data_dt
having sum_carbon >= 100
) t1
) t2
) t3
where (lead1_diff = -1 and lead2_diff = -2)
or (lag1_diff = 1 and lead1_diff = -1)
or (lag2_diff = 2 and lag1_diff = 1);
输出结果:
user_id data_dt
u_002 2017-01-02
u_002 2017-01-03
u_002 2017-01-04
u_005 2017-01-02
u_005 2017-01-03
u_008 2017-01-04
u_008 2017-01-05
u_008 2017-01-06
u_009 2017-01-02
u_009 2017-01-03
u_010 2017-01-04
u_010 2017-01-05
u_010 2017-01-06
u_011 2017-01-01
u_011 2017-01-02
u_011 2017-01-03
u_011 2017-01-04
u_011 2017-01-05
u_011 2017-01-06
u_013 2017-01-02
u_013 2017-01-03
u_013 2017-01-04
u_014 2017-01-05
u_014 2017-01-06
step5 和原表进行内连接
hive (default)> select u.user_id,
u.data_dt,
u.low_carbon
from (
select user_id,
data_dt
from (
select user_id,
data_dt,
datediff(data_dt, lag2) lag2_diff,
datediff(data_dt, lag1) lag1_diff,
datediff(data_dt, lead1) lead1_diff,
datediff(data_dt, lead2) lead2_diff
from (
select user_id,
data_dt,
lag(data_dt, 2, '1970-01-01') over (partition by user_id order by data_dt) lag2,
lag(data_dt, 1, '1970-01-01') over (partition by user_id order by data_dt) lag1,
lead(data_dt, 1, '9999-01-01') over (partition by user_id order by data_dt) lead1,
lead(data_dt, 2, '9999-01-01') over (partition by user_id order by data_dt) lead2
from (
select user_id,
date_format(regexp_replace(data_dt, '/', '-'), 'yyyy-MM-dd') data_dt,
sum(low_carbon) sum_carbon
from user_low_carbon
where substring(data_dt, 1, 4) = '2017'
group by user_id, data_dt
having sum_carbon >= 100
) t1
) t2
) t3
where (lead1_diff = -1 and lead2_diff = -2)
or (lag1_diff = 1 and lead1_diff = -1)
or (lag2_diff = 2 and lag1_diff = 1)
) t4
join user_low_carbon u on t4.user_id = u.user_id and t4.data_dt = date_format(regexp_replace(u.data_dt, '/', '-'), 'yyyy-MM-dd');
输出结果:
u.user_id u.data_dt u.low_carbon
u_002 2017/1/2 150
u_002 2017/1/2 70
u_002 2017/1/3 30
u_002 2017/1/3 80
u_002 2017/1/4 150
u_005 2017/1/2 50
u_005 2017/1/2 80
u_005 2017/1/3 180
u_008 2017/1/4 260
u_008 2017/1/5 360
u_008 2017/1/6 160
u_009 2017/1/2 70
u_009 2017/1/2 70
u_009 2017/1/3 170
u_010 2017/1/4 90
u_010 2017/1/4 80
u_010 2017/1/5 90
u_010 2017/1/5 90
u_010 2017/1/6 190
u_011 2017/1/1 110
u_011 2017/1/2 100
u_011 2017/1/2 100
u_011 2017/1/3 120
u_011 2017/1/4 100
u_011 2017/1/5 100
u_011 2017/1/6 100
u_013 2017/1/2 150
u_013 2017/1/2 50
u_013 2017/1/3 150
u_013 2017/1/4 550
u_014 2017/1/5 250
u_014 2017/1/6 120
解法二
step1 将2017年,每个用户在一天内减少碳排放量大于100g的日期过滤出来
hive (default)> select user_id,
date_format(regexp_replace(data_dt, '/', '-'), 'yyyy-MM-dd') data_dt,
sum(low_carbon) sum_carbon
from user_low_carbon
where substring(data_dt, 1, 4) = '2017'
group by user_id, data_dt
having sum_carbon >= 100;
输出结果:
user_id data_dt sum_carbon
u_001 2017-01-02 270
u_001 2017-01-06 135
u_002 2017-01-02 220
u_002 2017-01-03 110
u_002 2017-01-04 150
u_002 2017-01-05 101
u_003 2017-01-02 160
u_003 2017-01-03 160
u_003 2017-01-05 120
......
step2 按日期排序 并给每一条记录做标记
hive (default)> select user_id,
data_dt,
rank() over (partition by user_id order by data_dt) rk
from (
select user_id,
date_format(regexp_replace(data_dt, '/', '-'), 'yyyy-MM-dd') data_dt,
sum(low_carbon) sum_carbon
from user_low_carbon
where substring(data_dt, 1, 4) = '2017'
group by user_id, data_dt
having sum_carbon >= 100
) t1;
输出结果:
user_id data_dt rk
u_001 2017-01-02 1
u_001 2017-01-06 2
u_002 2017-01-02 1
u_002 2017-01-03 2
u_002 2017-01-04 3
u_002 2017-01-05 4
u_003 2017-01-02 1
u_003 2017-01-03 2
u_003 2017-01-05 3
u_003 2017-01-07 4
u_004 2017-01-01 1
u_004 2017-01-03 2
......
step3 将日期减去当前的rank值
hive (default)> select user_id,
data_dt,
date_sub(data_dt, rk) sub_rk
from (
select user_id,
data_dt,
rank() over (partition by user_id order by data_dt) rk
from (
select user_id,
date_format(regexp_replace(data_dt, '/', '-'), 'yyyy-MM-dd') data_dt,
sum(low_carbon) sum_carbon
from user_low_carbon
where substring(data_dt, 1, 4) = '2017'
group by user_id, data_dt
having sum_carbon >= 100
) t1
) t2;
输出结果:
user_id data_dt sub_rk
u_001 2017-01-02 2017-01-01
u_001 2017-01-06 2017-01-04
u_002 2017-01-02 2017-01-01
u_002 2017-01-03 2017-01-01
u_002 2017-01-04 2017-01-01
u_002 2017-01-05 2017-01-01
u_003 2017-01-02 2017-01-01
u_003 2017-01-03 2017-01-01
u_003 2017-01-05 2017-01-02
u_003 2017-01-07 2017-01-03
......
step4 过滤出连续3天超过100g的用户
hive (default)> select user_id
from (
select user_id,
data_dt,
date_sub(data_dt, rk) sub_rk
from (
select user_id,
data_dt,
rank() over (partition by user_id order by data_dt) rk
from (
select user_id,
date_format(regexp_replace(data_dt, '/', '-'), 'yyyy-MM-dd') data_dt,
sum(low_carbon) sum_carbon
from user_low_carbon
where substring(data_dt, 1, 4) = '2017'
group by user_id, data_dt
having sum_carbon >= 100
) t1
) t2
) t3
group by user_id, sub_rk
having count(*) >= 3;
输出结果:
user_id
u_002
u_005
u_008
u_009
u_010
u_011
u_013
u_014