• HIVE学习


    HIVE学习(一)

    一、基础语法

    1. select from where

    # 选择城市在北京,性别为女的10个用户名
    SELECT user_name
    FROM user_info
    WHERE city='beijing' and sex='female'
    limit 10
    
    
    

    分区表必须对分区字段进行限制

    -- 选出在2019年4月9日,购买商品品类为food的用户名、购买数量,支付金额
    SELECT user_name,
    		piece,
    		pay_amount	
    FROM user_trade
    WHERE dt='2014-04-09' AND gppds_category='food'; # 分区表必须限制分区字段,这里对dt进行限制
    

    2. group by

    -- 2019年一月至四月,每个品类有多少人购买,累计金额是多少
    SELECT   goods_category
             count(distinct user_name),
    		sum(pay_amount)
    FROM user_trade
    WHERE dt BETWEEN '2019-01-01' AND '2019-04-01'
    GROUP BY goods_category
    

    group by -- having

    --2019年4月,支付金额超过5万元的用户
    SELECT user_name,
         sum(pay_amount) as total_amount
    FROM user_trade
    WHERE dt BETWEEN '2019-04-01' AND '2019-04-30'
    GROUP BY user_name HAVING sum(pay_amount)>50000;
    
    

    3. ORDER BY

    --2019年4月,支付金额最多的top5用户
    SELECT user_name,
    		sum(pay_amount) as 'per_amount'
    FROM user_trade
    WHERE dt BETWEEN '2019-04-01' AND '2019-04-30'
    GROUP BY user_name
    ORDER BY per_amount DESC LIMIT 5;
    -- ORDER BY 执行顺序在select后面
    
    

    二、常用函数

    1. 时间戳转化为日期

    SELECT pay_time,
           from_unixtime(pay_time, 'yyyy-MM-dd hh:mm:ss')
    FROM user_trade
    WHERE dt ='2019-04-09'
    

    2. 如何计算日期间隔

    datediff(string enddate, string startdate):结束日期机器拿去开始日期的天数

    date_add

    date_sub

    -- 用户的首次激活时间,与2019年5月1日的日期间隔
    SELECT user_name, 
           datediff('2019-05-01', to_date(firstactivetime))
    FROM user_info
    limit 10
    

    3. CASE WHEN

    --统计以下四个年龄段20岁以下,20-30岁,30-40岁,40岁以上的用户数
    SELECT CASE WHEN age<20 THEN '20岁以下'
    		WHEN age>20 AND age<30 THEN '20-30岁'
    		WHEN age>=30 and age<40 THEN '30-40岁'
    		ELSE '40岁及以上'
    		END,
    		COUNT(DISTINCT user_id) user_num
    FROM user_info
    GROUP BY CASE WHEN age<20 THEN '20岁以下'
    			WHEN age>20 and age<30 THEN '20-30岁'
    			WHEN age>=30 and age<40 THEN '30-40岁'
    			ELSE '40岁及以上' 
    			END
    			
    

    4. 统计每个性别用户等级高低的分布情况

    -- 统计每个性别用户等级高低的分布情况,level>5为高级
    SELECT sex
    		if(level>5, '高', '低'),
    		count(distinct user_id) user_num
    FROM user_info
    GROUP BY sex,
    		if(level>5, '高', '低')
    
    

    5. 字符串函数

    substr(stringA, int start, int len)

    备注:如果不指定截取的长度,则从开始一直截取到最后一位

    -- 每个月新激活的用户数
    SELECT substr(firstactivetime, 1, 7) as month
    		count(distinct user_id) user_num
    FROM user_info
    GROUP BY substr(firstactivetime, 1, 7)
    

    get_json_object(stringjson_string, string path)

    param1: 需要解析的json字段

    param2:用key取出想要获取的value

    --不同品牌的用户数
    # 第一种情况
    SELECT get_json_object(extra1, '$.phonebrand') as phone_brand,
    		count(distinct user_id) user_num
    FROM user_info
    GROUP BY get_json_object(extra1,'$.phonebrand') as phone_brand;
    
    -- 第二种情况
    SELECT extra2['phonebrand'] as phone_brand,
    		count(distinct user_id) user_num
    FROM user_info
    GROUP BY extra2['phonebrand'];
    

    6. 聚合统计函数

    -- ELLA用户的2018年的平均支付金额,以及2018年最大的支付日期与最小的支付日期的间隔
    SELECT AVG(pay_amount) as avg_amount,
    		datediff(max(from_unixtime(pay_time, 'yyyy-MM--dd')),min(from_unixtime(pay_time,'yyyy-MM-dd')))
    FROM user_trade
    WHERE year(dt) = '2018'
    		and user_name = 'ELLA';
    		
    

    三、练习

    1. 2018年购买的商品品类在两个以上的用户数
    SELECT sum(t.user_name)
    FROM (SELECT user_name, count(distinct goods_category) count_category
    	  FROM  user_trade
          WHERE year(dt) = '2018'
          group by user_name having count(distinct goods_category) >2) t
       
    
    1. 激活时间在2018年,年龄段在20-30岁和30-40岁的婚姻状况分布

      SELECT A.age_type,
      		if(a.marriage_status=1, '已婚', '未婚')
      		count(distinct A.user_id)
      FROM(
      	SELECT CASE WHEN age<20 THEN '20岁及以下'
      				WHEN  age>=20 and age<30 THEN '20-30岁'
      				WHEN  age>=30 and age <40 THEN '30-40岁'
                      ELSE '40岁'及以上
                      END AS age_type,
                   get_json_object(extral, '$.marriage_status') as 														marriage_status,
                   user_id
             FROM user_info
             WHERE to_date(firstactivetime) BETWEEN '2018-01-01'  AND '2018-12-31'
          ) A
      WHERE a.age_type in ('20-30岁', '30-40岁')
      GROUP BY A.age_type,
      		if(a.marriage_status=1, '已婚', '未婚');
  • 相关阅读:
    工科物理实验()中国大学MOOC答案(已更新)
    类似jar文件使用java无法打开问题
    python9、10章
    nmap的理解与利用(初级)
    常见端口
    配置优化
    删除表操作
    万能的map
    测试
    Mapper.xml
  • 原文地址:https://www.cnblogs.com/zz-yy/p/14473094.html
Copyright © 2020-2023  润新知