• 项目实战从0到1之hive(45)大数据项目之电商数仓(用户行为数据)(十三)


    第20章 需求九:每个用户累计访问次数

    结果如下

    用户    日期                小计    总计
    mid1    2019-12-14        10        10
    mid1    2019-02-11        12        22
    mid2    2019-12-14        15        15
    mid2    2019-02-11        12        27

    20.1 DWS层

    20.1.1 建表语句

    drop table if exists dws_user_total_count_day;
    create external table dws_user_total_count_day(
      `mid_id` string COMMENT '设备id',
    `subtotal` bigint COMMENT '每日登录小计'
    )
    partitioned by(`dt` string)
    row format delimited fields terminated by ' '
    location '/warehouse/gmall/dws/dws_user_total_count_day';

    20.1.2 导入数据

    1)导入数据

    insert overwrite table dws_user_total_count_day 
    partition(dt='2019-12-14')
    select
      mid_id,
       count(mid_id) cm
    from
    dwd_start_log
    where
      dt='2019-12-14'
    group by
      mid_id;

    2)查询结果

    select * from dws_user_total_count_day;

    20.1.3 数据导入脚本

    1)创建脚本dws_user_total_count_day.sh

    [kgg@hadoop102 bin]$ vim dws_user_total_count_day.sh
    在脚本中填写如下内容
    #!/bin/bash

    # 定义变量方便修改
    APP=gmall
    hive=/opt/module/hive/bin/hive
    hadoop=/opt/module/hadoop-2.7.2/bin/hadoop

    # 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
    if [ -n "$1" ] ;then
      do_date=$1
    else
      do_date=`date -d "-1 day" +%F`
    fi

    echo "===日志日期为 $do_date==="
    sql="
    insert overwrite table "$APP".dws_user_total_count_day partition(dt='$do_date')
    select
      mid_id,
      count(mid_id) cm
    from
       "$APP".dwd_start_log
    where
       dt='$do_date'
    group by
      mid_id,dt;
    "

    $hive -e "$sql"

    2)增加脚本执行权限

    chmod 777 ads_user_total_count.sh

    3)脚本使用

     ads_user_total_count.sh 2019-02-20

    4)查询结果

    select * from ads_user_total_count;

    5)脚本执行时间

    企业开发中一般在每天凌晨30分~1点

    20.2 ADS层

    20.2.1 建表语句

    drop table if exists ads_user_total_count;
    create external table ads_user_total_count(
      `mid_id` string COMMENT '设备id',
      `subtotal` bigint COMMENT '每日登录小计',
      `total` bigint COMMENT '登录次数总计'
    )
    partitioned by(`dt` string)
    row format delimited fields terminated by ' '
    location '/warehouse/gmall/ads/ads_user_total_count';

    20.2.2 导入数据

    insert overwrite table ads_user_total_count partition(dt='2019-10-03')
    select
    if(today.mid_id is null, yesterday.mid_id, today.mid_id) mid_id,
    today.subtotal,
    if(today.subtotal is null, 0, today.subtotal) + if(yesterday.total is null, 0, yesterday.total) total
    from (
     select
      *
     from dws_user_total_count_day
     where dt='2019-10-03'
    ) today
    full join (
     select
      *
     from ads_user_total_count
     where dt=date_add('2019-10-03', -1)
    ) yesterday
    on today.mid_id=yesterday.mid_id

    20.2.3 数据导入脚本

    1)创建脚本

    [kgg@hadoop102 bin]$ vim ads_user_total_count.sh
    在脚本中编写如下内容
    #!/bin/bash

    db=gmall
    hive=/opt/module/hive-1.2.1/bin/hive
    hadoop=/opt/module/hadoop-2.7.2/bin/hadoop

    if [[ -n $1 ]]; then
       do_date=$1
    else
       do_date=`date -d '-1 day' +%F`
    fi

    sql="
    use gmall;
    insert overwrite table ads_user_total_count partition(dt='$do_date')
    select
     if(today.mid_id is null, yesterday.mid_id, today.mid_id) mid_id,
    today.subtotal,
     if(today.subtotal is null, 0, today.subtotal) + if(yesterday.total is null, 0, yesterday.total) total
    from (
    select
      *
    from dws_user_total_count_day
    where dt='$do_date'
    ) today
    full join (
    select
      *
    from ads_user_total_count
    where dt=date_add('$do_date', -1)
    ) yesterday
    on today.mid_id=yesterday.mid_id
    "

    $hive -e "$sql"

    2)增加脚本执行权限

    chmod 777 ads_user_total_count.sh

    3)脚本使用

    ads_user_total_count.sh 2019-02-20

    4)查询结果

    select * from ads_user_total_count;

    5)脚本执行时间

    企业开发中一般在每天凌晨30分~1点

    第21章 需求十:新收藏用户数

    新收藏用户:指的是在某天首次添加收藏的用户

    21.1 DWS层建立用户日志行为宽表

    考虑到后面的多个需求会同时用到多张表中的数据, 如果每次都join操作, 则影响查询的效率. 可以先提前做一张宽表, 提高其他查询的执行效率.

    每个用户对每个商品的点击次数, 点赞次数, 收藏次数

    21.1.1 建表语句

    drop table if exists dws_user_action_wide_log;
    CREATE EXTERNAL TABLE dws_user_action_wide_log(
      `mid_id` string COMMENT '设备id',
      `goodsid` string COMMENT '商品id',
      `display_count` string COMMENT '点击次数',
      `praise_count` string COMMENT '点赞次数',
      `favorite_count` string COMMENT '收藏次数')
    PARTITIONED BY (`dt` string)
    stored as parquet
    location '/warehouse/gmall/dws/dws_user_action_wide_log/'
    TBLPROPERTIES('parquet.compression'='lzo');

    21.1.2 导入数据

    insert overwrite table dws_user_action_wide_log partition(dt='2019-12-14')
    select
      mid_id,
      goodsid,
      sum(display_count) display_count,
      sum(praise_count) praise_count,
      sum(favorite_count) favorite_count
    from
    ( select
          mid_id,
          goodsid,
           count(*) display_count,
           0 praise_count,
           0 favorite_count
       from
          dwd_display_log
       where
          dt='2019-12-14' and action=2
       group by
          mid_id,goodsid

       union all

       select
          mid_id,
          target_id goodsid,
           0,
           count(*) praise_count,
           0
       from
          dwd_praise_log
       where
          dt='2019-12-14'
       group by
          mid_id,target_id

       union all

       select
          mid_id,
          course_id goodsid,
           0,
           0,
           count(*) favorite_count
       from
          dwd_favorites_log
       where
          dt='2019-12-14'
       group by
          mid_id,course_id
    )user_action
    group by
    mid_id,goodsid;

    21.1.3 数据导入脚本

    [kgg@hadoop102 bin]$ vi dws_user_action_wide_log.sh
    [kgg@hadoop102 bin]$ chmod 777 dws_user_action_wide_log.sh

    #!/bin/bash
    db=gmall
    hive=/opt/module/hive-1.2.1/bin/hive
    hadoop=/opt/module/hadoop-2.7.2/bin/hadoop

    if [[ -n $1 ]]; then
       do_date=$1
    else
       do_date=`date -d '-1 day' +%F`
    fi

    sql="
    use gmall;
    insert overwrite table dws_user_action_wide_log partition(dt='$do_date')
    select
      mid_id,
      goodsid,
      sum(display_count) display_count,
      sum(praise_count) praise_count,
      sum(favorite_count) favorite_count
    from
    ( select
          mid_id,
          goodsid,
          count(*) display_count,
           0 praise_count,
           0 favorite_count
      from
          dwd_display_log
      where
           dt='$do_date' and action=2
      group by
          mid_id,goodsid

      union all

      select
          mid_id,
          target_id goodsid,
           0,
          count(*) praise_count,
           0
      from
          dwd_praise_log
      where
           dt='$do_date'
      group by
          mid_id,target_id

      union all

      select
          mid_id,
          course_id goodsid,
           0,
           0,
          count(*) favorite_count
      from
          dwd_favorites_log
      where
           dt='$do_date'
      group by
          mid_id,course_id
    )user_action
    group by
    mid_id,goodsid;
    "

    $hive -e "$sql"

    21.2 DWS层

    使用日志数据用户行为宽表作为DWS层表

    21.3 ADS层

    21.3.1 建表语句

    drop table if exists ads_new_favorites_mid_day;
    create external table ads_new_favorites_mid_day(
      `dt` string COMMENT '日期',
      `favorites_users` bigint COMMENT '新收藏用户数'
    )
    row format delimited fields terminated by ' '
    location '/warehouse/gmall/ads/ads_new_favorites_mid_day';

    21.3.2 导入数据

    insert into table ads_new_favorites_mid_day
    select
       '2019-12-14' dt,
       count(*) favorites_users
    from
    (
       select
          mid_id
       from
          dws_user_action_wide_log
       where
          favorite_count>0
       group by
          mid_id
       having
          min(dt)='2019-12-14'
    )user_favorite;

    21.3.3 数据导入脚本

    1)创建脚本ads_new_favorites_mid_day.sh

    [kgg@hadoop102 bin]$ vim ads_new_favorites_mid_day.sh
    在脚本中填写如下内容
    #!/bin/bash

    # 定义变量方便修改
    APP=gmall
    hive=/opt/module/hive/bin/hive
    hadoop=/opt/module/hadoop-2.7.2/bin/hadoop

    # 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
    if [ -n "$1" ] ;then
      do_date=$1
    else
      do_date=`date -d "-1 day" +%F`
    fi

    echo "===日志日期为 $do_date==="
    sql="
    insert into table "$APP".ads_new_favorites_mid_day
    select
       '$do_date' dt,
      count(*) favorites_users
    from
    (
      select
          mid_id
      from
           "$APP".dws_user_action_wide_log
      where
          favorite_count>0
      group by
          mid_id
      having
          min(dt)='$do_date'
    )user_favorite;
    "

    $hive -e "$sql"

    2)增加脚本执行权限

    chmod 777 ads_new_favorites_mid_day.sh

    3)脚本使用

    ads_new_favorites_mid_day.sh 2019-02-20

    4)查询结果

    select * from ads_new_favorites_mid_day;

    5)脚本执行时间

    企业开发中一般在每天凌晨30分~1点

    第22章 需求十一:各个商品点击次数top3的用户

    22.1 DWS层

    使用日志数据用户行为宽表作为DWS层表

    22.2 ADS层

    22.2.1 建表语句

    drop table if exists ads_goods_count;
    create external table ads_goods_count(
      `dt` string COMMENT '统计日期',
      `goodsid` string COMMENT '商品',
      `user_id` string COMMENT '用户',
      `goodsid_user_count` bigint COMMENT '商品用户点击次数'
    )
    row format delimited fields terminated by ' '
    location '/warehouse/gmall/ads/ads_goods_count';

    22.2.2 导入数据

    insert into table ads_goods_count
    select
       '2019-10-03',
      goodsid,
      mid_id,
      sum_display_count
    from(
       select
        goodsid,
        mid_id,
        sum_display_count,
        row_number() over(partition by goodsid order by sum_display_count desc) rk
       from(
         select
          goodsid,
          mid_id,
          sum(display_count) sum_display_count
         from dws_user_action_wide_log
         where display_count>0
         group by goodsid, mid_id
      ) t1
    ) t2
    where rk <= 3

    22.2.3 数据导入脚本

    1)创建脚本ads_goods_count.sh

    [kgg@hadoop102 bin]$ vim ads_goods_count.sh
    在脚本中填写如下内容
    #!/bin/bash

    db=gmall
    hive=/opt/module/hive/bin/hive
    hadoop=/opt/module/hadoop/bin/hadoop

    if [[ -n $1 ]]; then
       do_date=$1
    else
       do_date=`date -d '-1 day' +%F`
    fi

    sql="
    use gmall;
    insert into table ads_goods_count
    select
       '$do_date',
      goodsid,
      mid_id,
      sum_display_count
    from(
      select
        goodsid,
        mid_id,
        sum_display_count,
        row_number() over(partition by goodsid order by sum_display_count desc) rk
      from(
        select
          goodsid,
          mid_id,
          sum(display_count) sum_display_count
        from dws_user_action_wide_log
        where display_count>0
        group by goodsid, mid_id
      ) t1
    ) t2
    where rk <= 3
    "
    $hive -e "$sql"

    2)增加脚本执行权限

    chmod 777 ads_goods_count.sh

    3)脚本使用

    ads_goods_count.sh 2019-02-20

    4)查询结果

    select * from ads_goods_count;

    5)脚本执行时间

    企业开发中一般在每天凌晨30分~1点

    作者:大码王

    -------------------------------------------

    个性签名:独学而无友,则孤陋而寡闻。做一个灵魂有趣的人!

    如果觉得这篇文章对你有小小的帮助的话,记得在右下角点个“推荐”哦,博主在此感谢!

    万水千山总是情,打赏一分行不行,所以如果你心情还比较高兴,也是可以扫码打赏博主,哈哈哈(っ•?ω•?)っ???!

  • 相关阅读:
    springCloud你要了解的都在这(方向性)
    十分钟了解 spring cloud
    JDK 自带压缩解压流
    JAVA自带API的压缩与解压
    Java实现多文件压缩打包的方法
    芯片超Intel,盈利比肩Apple,三星成科技界"全民公敌"
    一个不知道体谅难处,一个说话伤人自尊,矛盾的种子已悄悄埋下
    一天杀了两千宦官(袁绍也曾仗义过,为东汉彻底除去了宦官祸害。何进却心软了,任凭袁绍怎么劝说,何进就是不下手)
    WMWaire使用FreeNAS硬盘挂载、Raid0
    net share列出了Windows的默认共享(包括C盘)
  • 原文地址:https://www.cnblogs.com/huanghanyu/p/14308745.html
Copyright © 2020-2023  润新知