HIVE学习(一)
一、基础语法
1. select from where
# 选择城市在北京,性别为女的10个用户名
SELECT user_name
FROM user_info
WHERE city='beijing' and sex='female'
limit 10
分区表必须对分区字段进行限制
-- 选出在2019年4月9日,购买商品品类为food的用户名、购买数量,支付金额
SELECT user_name,
piece,
pay_amount
FROM user_trade
WHERE dt='2014-04-09' AND gppds_category='food'; # 分区表必须限制分区字段,这里对dt进行限制
2. group by
-- 2019年一月至四月,每个品类有多少人购买,累计金额是多少
SELECT goods_category
count(distinct user_name),
sum(pay_amount)
FROM user_trade
WHERE dt BETWEEN '2019-01-01' AND '2019-04-01'
GROUP BY goods_category
group by -- having
--2019年4月,支付金额超过5万元的用户
SELECT user_name,
sum(pay_amount) as total_amount
FROM user_trade
WHERE dt BETWEEN '2019-04-01' AND '2019-04-30'
GROUP BY user_name HAVING sum(pay_amount)>50000;
3. ORDER BY
--2019年4月,支付金额最多的top5用户
SELECT user_name,
sum(pay_amount) as 'per_amount'
FROM user_trade
WHERE dt BETWEEN '2019-04-01' AND '2019-04-30'
GROUP BY user_name
ORDER BY per_amount DESC LIMIT 5;
-- ORDER BY 执行顺序在select后面
二、常用函数
1. 时间戳转化为日期
SELECT pay_time,
from_unixtime(pay_time, 'yyyy-MM-dd hh:mm:ss')
FROM user_trade
WHERE dt ='2019-04-09'
2. 如何计算日期间隔
datediff(string enddate, string startdate):结束日期机器拿去开始日期的天数
date_add
date_sub
-- 用户的首次激活时间,与2019年5月1日的日期间隔
SELECT user_name,
datediff('2019-05-01', to_date(firstactivetime))
FROM user_info
limit 10
3. CASE WHEN
--统计以下四个年龄段20岁以下,20-30岁,30-40岁,40岁以上的用户数
SELECT CASE WHEN age<20 THEN '20岁以下'
WHEN age>20 AND age<30 THEN '20-30岁'
WHEN age>=30 and age<40 THEN '30-40岁'
ELSE '40岁及以上'
END,
COUNT(DISTINCT user_id) user_num
FROM user_info
GROUP BY CASE WHEN age<20 THEN '20岁以下'
WHEN age>20 and age<30 THEN '20-30岁'
WHEN age>=30 and age<40 THEN '30-40岁'
ELSE '40岁及以上'
END
4. 统计每个性别用户等级高低的分布情况
-- 统计每个性别用户等级高低的分布情况,level>5为高级
SELECT sex
if(level>5, '高', '低'),
count(distinct user_id) user_num
FROM user_info
GROUP BY sex,
if(level>5, '高', '低')
5. 字符串函数
substr(stringA, int start, int len)
备注:如果不指定截取的长度,则从开始一直截取到最后一位
-- 每个月新激活的用户数
SELECT substr(firstactivetime, 1, 7) as month
count(distinct user_id) user_num
FROM user_info
GROUP BY substr(firstactivetime, 1, 7)
get_json_object(stringjson_string, string path)
param1: 需要解析的json字段
param2:用key取出想要获取的value
--不同品牌的用户数
# 第一种情况
SELECT get_json_object(extra1, '$.phonebrand') as phone_brand,
count(distinct user_id) user_num
FROM user_info
GROUP BY get_json_object(extra1,'$.phonebrand') as phone_brand;
-- 第二种情况
SELECT extra2['phonebrand'] as phone_brand,
count(distinct user_id) user_num
FROM user_info
GROUP BY extra2['phonebrand'];
6. 聚合统计函数
-- ELLA用户的2018年的平均支付金额,以及2018年最大的支付日期与最小的支付日期的间隔
SELECT AVG(pay_amount) as avg_amount,
datediff(max(from_unixtime(pay_time, 'yyyy-MM--dd')),min(from_unixtime(pay_time,'yyyy-MM-dd')))
FROM user_trade
WHERE year(dt) = '2018'
and user_name = 'ELLA';
三、练习
- 2018年购买的商品品类在两个以上的用户数
SELECT sum(t.user_name)
FROM (SELECT user_name, count(distinct goods_category) count_category
FROM user_trade
WHERE year(dt) = '2018'
group by user_name having count(distinct goods_category) >2) t
-
激活时间在2018年,年龄段在20-30岁和30-40岁的婚姻状况分布
SELECT A.age_type, if(a.marriage_status=1, '已婚', '未婚') count(distinct A.user_id) FROM( SELECT CASE WHEN age<20 THEN '20岁及以下' WHEN age>=20 and age<30 THEN '20-30岁' WHEN age>=30 and age <40 THEN '30-40岁' ELSE '40岁'及以上 END AS age_type, get_json_object(extral, '$.marriage_status') as marriage_status, user_id FROM user_info WHERE to_date(firstactivetime) BETWEEN '2018-01-01' AND '2018-12-31' ) A WHERE a.age_type in ('20-30岁', '30-40岁') GROUP BY A.age_type, if(a.marriage_status=1, '已婚', '未婚');