• 项目实战从0到1之hive(29)企业级数据仓库构建(十):搭建 ADS 层


    一、数仓搭建 - ADS 层
    1.1 设备主题
    1.1.1 活跃设备数(日、周、月)

    需求定义:
    日活:当日活跃的设备数
    周活:当周活跃的设备数
    月活:当月活跃的设备数
    1)建表语句

    drop table if exists ads_uv_count;
    create external table ads_uv_count(
    `dt` string COMMENT '统计日期',
    `day_count` bigint COMMENT '当日用户数量',
    `wk_count` bigint COMMENT '当周用户数量',
    `mn_count` bigint COMMENT '当月用户数量',
    `is_weekend` string COMMENT 'Y,N 是否是周末,用于得到本周最终结果',
    `is_monthend` string COMMENT 'Y,N 是否是月末,用于得到本月最终结果'
    ) COMMENT '活跃设备数'
    row format delimited fields terminated by '	'
    location '/warehouse/gmall/ads/ads_uv_count/';

    2)导入数据

    insert into table ads_uv_count
    select
    '2020-03-10' dt,
    daycount.ct,
    wkcount.ct,
    mncount.ct,
    if(date_add(next_day('2020-03-10','MO'),-1)='2020-03-10','Y','N') ,
    if(last_day('2020-03-10')='2020-03-10','Y','N')
    from
    (
    select
    '2020-03-10' dt,
    count(*) ct
    from dwt_uv_topic
    where login_date_last='2020-03-10'
    )daycount join
    (
    select
    '2020-03-10' dt,
    count (*) ct
    from dwt_uv_topic
    where login_date_last>=date_add(next_day('2020-03-10','MO'),-7)
    and login_date_last<= date_add(next_day('2020-03-10','MO'),-1)
    ) wkcount on daycount.dt=wkcount.dt
    join
    (
    select
    '2020-03-10' dt,
    count (*) ct
    from dwt_uv_topic
    where
    date_format(login_date_last,'yyyy-MM')=date_format('2020-03-10','yyyy-MM')
    )mncount on daycount.dt=mncount.dt;

    3)查询导入结果

    1.1.2 每日新增设备
    1)建表语句

    drop table if exists ads_new_mid_count;
    create external table ads_new_mid_count
    (
    `create_date` string comment '创建时间' ,
    `new_mid_count` BIGINT comment '新增设备数量'
    ) COMMENT '每日新增设备信息数量'
    row format delimited fields terminated by '	'
    location '/warehouse/gmall/ads/ads_new_mid_count/';

    2)导入数据

    insert into table ads_new_mid_count
    select
    login_date_first,
    count(*)
    from dwt_uv_topic
    where login_date_first='2020-03-10'
    group by login_date_first;

    3)查询导入数据

    select * from ads_new_mid_count;

    1.1.3 沉默用户数
    需求定义:
    沉默用户:只在安装当天启动过,且启动时间是在 7 天前
    1)建表语句

    drop table if exists ads_silent_count;
    create external table ads_silent_count(
    `dt` string COMMENT '统计日期',
    `silent_count` bigint COMMENT '沉默设备数'
    )
    row format delimited fields terminated by '	'
    location '/warehouse/gmall/ads/ads_silent_count';

    2)导入 2020-03-20 数据

    insert into table ads_silent_count
    select
    '2020-03-15',
    count(*)
    from dwt_uv_topic
    where login_date_first=login_date_last
    and login_date_last<=date_add('2020-03-15',-7);

    3)查询导入数据

    select * from ads_silent_count;

    1.1.4 本周回流用户数

    需求定义:
    本周回流用户:上周未活跃,本周活跃的设备,且不是本周新增设备
    1)建表语句

    drop table if exists ads_back_count;
    create external table ads_back_count(
    `dt` string COMMENT '统计日期',
    `wk_dt` string COMMENT '统计日期所在周',
    `wastage_count` bigint COMMENT '回流设备数'
    )
    row format delimited fields terminated by '	'
    location '/warehouse/gmall/ads/ads_back_count';

    2)导入数据:

    insert into table ads_back_count
    select
    '2020-03-15',
    count(*)
    from
    (
    select
    mid_id
    from dwt_uv_topic
    where login_date_last>=date_add(next_day('2020-03-15','MO'),-7)
    and login_date_last<= date_add(next_day('2020-03-15','MO'),-1)
    and login_date_first<date_add(next_day('2020-03-15','MO'),-7)
    )current_wk
    left join
    (
    select
    mid_id
    from dws_uv_detail_daycount
    where dt>=date_add(next_day('2020-03-15','MO'),-7*2)
    and dt<= date_add(next_day('2020-03-15','MO'),-7-1)
    group by mid_id
    )last_wk
    on current_wk.mid_id=last_wk.mid_id
    where last_wk.mid_id is null;

    3)查询结果

    select * from ads_back_count;

    1.1.5 流失用户数
    需求定义:
    流失用户:最近 7 天未活跃的设备
    1)建表语句

    drop table if exists ads_wastage_count;
    create external table ads_wastage_count(
    `dt` string COMMENT '统计日期',
    `wastage_count` bigint COMMENT '流失设备数'
    )
    row format delimited fields terminated by '	'
    location '/warehouse/gmall/ads/ads_wastage_count';

    2)导入 2020-03-20 数据

    insert into table ads_wastage_count
    select
    '2020-03-20',
    count(*)
    from
    (
    select
    mid_id
    from dwt_uv_topic
    where login_date_last<=date_add('2020-03-20',-7)
    group by mid_id
    )t1;

    3)查询结果

    select * from ads_wastage_count;

    1.1.6 留存率



    1)建表语句

    drop table if exists ads_user_retention_day_rate;
    create external table ads_user_retention_day_rate
    (
    `stat_date` string comment '统计日期',
    `create_date` string comment '设备新增日期',
    `retention_day` int comment '截止当前日期留存天数',
    `retention_count` bigint comment '留存数量',
    `new_mid_count` bigint comment '设备新增数量',
    `retention_ratio` decimal(10,2) comment '留存率'
    ) COMMENT '每日用户留存情况'
    row format delimited fields terminated by '	'
    location '/warehouse/gmall/ads/ads_user_retention_day_rate/';

    2)导入数据

    insert into table ads_user_retention_day_rate
    select
    '2020-03-10',--统计日期
    date_add('2020-03-10',-1),--新增日期
    1,--留存天数
    sum(if(login_date_first=date_add('2020-03-10',-1) and
    login_date_last='2020-03-10',1,0)),--2020-03-09 的 1 日留存数
    sum(if(login_date_first=date_add('2020-03-10',-1),1,0)),--2020-03-09 新增
    sum(if(login_date_first=date_add('2020-03-10',-1) and
    login_date_last='2020-03-10',1,0))/sum(if(login_date_first=date_add('2020-03-10',-
    1),1,0))*100
    from dwt_uv_topic
    union all
    select
    '2020-03-10',--统计日期
    date_add('2020-03-10',-2),--新增日期
    2,--留存天数
    sum(if(login_date_first=date_add('2020-03-10',-2) and
    login_date_last='2020-03-10',1,0)),--2020-03-08 的 2 日留存数
    sum(if(login_date_first=date_add('2020-03-10',-2),1,0)),--2020-03-08 新增
    sum(if(login_date_first=date_add('2020-03-10',-2) and
    login_date_last='2020-03-10',1,0))/sum(if(login_date_first=date_add('2020-03-10',-
    2),1,0))*100
    from dwt_uv_topic
    union all
    select
    '2020-03-10',--统计日期
    date_add('2020-03-10',-3),--新增日期
    3,--留存天数
    sum(if(login_date_first=date_add('2020-03-10',-3) and
    login_date_last='2020-03-10',1,0)),--2020-03-07 的 3 日留存数
    sum(if(login_date_first=date_add('2020-03-10',-3),1,0)),--2020-03-07 新增
    sum(if(login_date_first=date_add('2020-03-10',-3) and
    login_date_last='2020-03-10',1,0))/sum(if(login_date_first=date_add('2020-03-10',-
    3),1,0))*100
    from dwt_uv_topic;

    3)查询导入数据

    select * from ads_user_retention_day_rate;

    1.1.7 最近连续三周活跃用户数

    1)建表语句

    drop table if exists ads_continuity_wk_count;
    create external table ads_continuity_wk_count(
    `dt` string COMMENT '统计日期,一般用结束周周日日期,如果每天计算一次,可用当天日
    期',
    `wk_dt` string COMMENT '持续时间',
    `continuity_count` bigint COMMENT '活跃次数'
    )
    row format delimited fields terminated by '	'
    location '/warehouse/gmall/ads/ads_continuity_wk_count';

    2)导入 2020-03-20 所在周的数据

    insert into table ads_continuity_wk_count
    select
    '2020-03-15',
    concat(date_add(next_day('2020-03-15','MO'),-7*3),'_',date_add(next_day('
    2020-03-15','MO'),-1)),
    count(*)
    from
    (
    select
    mid_id
    from
    (
    select
    mid_id
    from dws_uv_detail_daycount
    where dt>=date_add(next_day('2020-03-10','monday'),-7)
    and dt<=date_add(next_day('2020-03-10','monday'),-1)
    group by mid_id
    union all
    select
    mid_id
    from dws_uv_detail_daycount
    where dt>=date_add(next_day('2020-03-10','monday'),-7*2)
    and dt<=date_add(next_day('2020-03-10','monday'),-7-1)
    group by mid_id
    union all
    select
    mid_id
    from dws_uv_detail_daycount
    where dt>=date_add(next_day('2020-03-10','monday'),-7*3)
    and dt<=date_add(next_day('2020-03-10','monday'),-7*2-1)
    group by mid_id
    )t1
    group by mid_id
    having count(*)=3
    )t2

    3)查询

    select * from ads_continuity_wk_count;

    1.1.8 最近七天内连续三天活跃用户数

    1)建表语句

    drop table if exists ads_continuity_uv_count;
    create external table ads_continuity_uv_count(
    `dt` string COMMENT '统计日期',
    `wk_dt` string COMMENT '最近 7 天日期',
    `continuity_count` bigint
    ) COMMENT '连续活跃设备数'
    row format delimited fields terminated by '	'
    location '/warehouse/gmall/ads/ads_continuity_uv_count';

    2)写出导入数据的 SQL 语句

    insert into table ads_continuity_uv_count
    select
    '2020-03-12',
    concat(date_add('2020-03-12',-6),'_','2020-03-12'),
    count(*)
    from
    (
    select mid_id
    from
    (
    select mid_id
    from
    (
    select
    mid_id,
    date_sub(dt,rank) date_dif
    from
    (
    select
    mid_id,
    dt,
    rank() over(partition by mid_id order by dt) rank
    from dws_uv_detail_daycount
    where dt>=date_add('2020-03-12',-6) and
    dt<='2020-03-12'
    )t1
    )t2
    group by mid_id,date_dif
    having count(*)>=3
    )t3
    group by mid_id
    )t4;

    7.2 会员主题
    7.2.1 会员主题信息

    1)建表

    drop table if exists ads_user_topic;
    create external table ads_user_topic(
    `dt` string COMMENT '统计日期',
    `day_users` string COMMENT '活跃会员数',
    `day_new_users` string COMMENT '新增会员数',
    `day_new_payment_users` string COMMENT '新增消费会员数',
    `payment_users` string COMMENT '总付费会员数',
    `users` string COMMENT '总会员数',
    `day_users2users` decimal(10,2) COMMENT '会员活跃率',
    `payment_users2users` decimal(10,2) COMMENT '会员付费率',
    `day_new_users2users` decimal(10,2) COMMENT '会员新鲜度'
    ) COMMENT '会员主题信息表'
    row format delimited fields terminated by '	'
    location '/warehouse/gmall/ads/ads_user_topic';

    2)导入数据

    insert into table ads_user_topic
    select
    '2020-03-10',
    sum(if(login_date_last='2020-03-10',1,0)),
    sum(if(login_date_first='2020-03-10',1,0)),
    sum(if(payment_date_first='2020-03-10',1,0)),
    sum(if(payment_count>0,1,0)),
    count(*),
    sum(if(login_date_last='2020-03-10',1,0))/count(*),
    sum(if(payment_count>0,1,0))/count(*),
    sum(if(login_date_first='2020-03-10',1,0))/sum(if(login_date_last='2020-03-10',1,0))
    from dwt_user_topic

    3)查询数据

    hive (gmall)> select * from ads_user_topic;

    4)vim ads_user_topic.sh
    添加如下内容:

    #!/bin/bash
    APP=gmall
    hive=/opt/modules/hive/bin/hive
    # 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
    if [ -n "$1" ] ;then
    do_date=$1
    else
    do_date=`date -d "-1 day" +%F`
    fi
    sql="
    with
    tmp_day_users as
    (
    select
    '$do_date' dt,
    count(*) day_users
    from
    ${APP}.dwt_user_topic
    where
    login_date_last='$do_date'
    ),
    tmp_day_new_users as
    (
    select
    '$do_date' dt,
    count(*) day_new_users
    from
    ${APP}.dwt_user_topic
    where
    login_date_last='$do_date' and login_date_first='$do_date'
    ),
    tmp_day_new_payment_users as
    (
    select
    '$do_date' dt,
    count(*) day_new_payment_users
    from
    ${APP}.dwt_user_topic
    where
    payment_date_first='$do_date'
    ),
    tmp_payment_users as
    (
    select
    '$do_date' dt,
    count(*) payment_users
    from
    ${APP}.dwt_user_topic
    where
    payment_date_first is not null
    ),
    tmp_users as
    (
    select
    '$do_date' dt,
    count(*) users
    from
    ${APP}.dwt_user_topic
    tmp_users
    )
    insert into table ${APP}.ads_user_topic
    select
    '$do_date' dt,
    day_users,
    day_new_users,
    day_new_payment_users,
    payment_users,
    users,
    day_users/users,
    payment_users/users,
    day_new_users/users
    from
    tmp_day_users
    join
    tmp_day_new_users
    on
    tmp_day_users.dt=tmp_day_new_users.dt
    join
    tmp_day_new_payment_users
    on
    tmp_day_users.dt=tmp_day_new_payment_users.dt
    join
    tmp_payment_users
    on
    tmp_day_users.dt=tmp_payment_users.dt
    join
    tmp_users
    on
    tmp_day_users.dt=tmp_users.dt;
    "
    $hive -e "$sql"

    5)增加脚本执行权限

    chmod 770 ads_user_topic.sh

    6)执行脚本导入数据

    ads_user_topic.sh 2020-03-11

    7)查看导入数据

    select * from ads_user_topic;

    7.2.2 漏斗分析

    统计“浏览->购物车->下单->支付”的转化率
    思路:统计各个行为的人数,然后计算比值
    1)建表语句

    drop table if exists ads_user_action_convert_day;
    create external table ads_user_action_convert_day(
    `dt` string COMMENT '统计日期',
    `total_visitor_m_count` bigint COMMENT '总访问人数',
    `cart_u_count` bigint COMMENT '加入购物车的人数',
    `visitor2cart_convert_ratio` decimal(10,2) COMMENT '访问到加入购物车转化率',
    `order_u_count` bigint COMMENT '下单人数',
    `cart2order_convert_ratio` decimal(10,2) COMMENT '加入购物车到下单转化率',
    `payment_u_count` bigint COMMENT '支付人数',
    `order2payment_convert_ratio` decimal(10,2) COMMENT '下单到支付的转化率'
    ) COMMENT '用户行为漏斗分析'
    row format delimited fields terminated by '	'
    location '/warehouse/gmall/ads/ads_user_action_convert_day/';

    2)数据装载

    insert into table ads_user_action_convert_day
    select
    '2020-03-10',
    uv.day_count,
    ua.cart_count,
    cast(ua.cart_count/uv.day_count as decimal(10,2)) visitor2cart_convert_ratio,
    ua.order_count,
    cast(ua.order_count/ua.cart_count as decimal(10,2)) visitor2order_convert_ratio,
    ua.payment_count,
    cast(ua.payment_count/ua.order_count as decimal(10,2)) order2payment_convert_ratio
    from
    (
    select
    dt,
    sum(if(cart_count>0,1,0)) cart_count,
    sum(if(order_count>0,1,0)) order_count,
    sum(if(payment_count>0,1,0)) payment_count
    from dws_user_action_daycount
    where dt='2020-03-10'
    group by dt
    )ua join ads_uv_count uv on uv.dt=ua.dt; 

    3)查询加载数据

    select * from ads_user_action_convert_day;

    7.3 商品主题
    7.3.1 商品个数信息
    1)建表语句

    drop table if exists ads_product_info;
    create external table ads_product_info(
    `dt` string COMMENT '统计日期',
    `sku_num` string COMMENT 'sku 个数',
    `spu_num` string COMMENT 'spu 个数'
    ) COMMENT '商品个数信息'
    row format delimited fields terminated by '	'
    location '/warehouse/gmall/ads/ads_product_info';

    2)导入数据

    insert into table ads_product_info
    select
    '2020-03-10' dt,
    sku_num,
    spu_num
    from
    (
    select
    '2020-03-10' dt,
    count(*) sku_num
    from
    dwt_sku_topic
    ) tmp_sku_num
    join
    (
    select
    '2020-03-10' dt,
    count(*) spu_num
    from
    (
    select
    spu_id
    from
    dwt_sku_topic
    group by
    spu_id
    ) tmp_spu_id
    ) tmp_spu_num
    on
    tmp_sku_num.dt=tmp_spu_num.dt;

    3)查询结果数据

    select * from ads_product_info;

    7.3.2 商品销量排名

    1)建表语句

    drop table if exists ads_product_sale_topN;
    create external table ads_product_sale_topN(
    `dt` string COMMENT '统计日期',
    `sku_id` string COMMENT '商品 ID',
    `payment_amount` bigint COMMENT '销量'
    ) COMMENT '商品个数信息'
    row format delimited fields terminated by '	'
    location '/warehouse/gmall/ads/ads_product_sale_topN';

    2)导入数据

    insert into table ads_product_sale_topN
    select
    '2020-03-10' dt,
    sku_id,
    payment_amount
    from
    dws_sku_action_daycount
    where
    dt='2020-03-10'
    order by payment_amount desc
    limit 10;

    3)查询结果数据

    select * from ads_product_sale_topN;

    7.3.3 商品收藏排名

    1)建表语句

    drop table if exists ads_product_favor_topN;
    create external table ads_product_favor_topN(
    `dt` string COMMENT '统计日期',
    `sku_id` string COMMENT '商品 ID',
    `favor_count` bigint COMMENT '收藏量'
    ) COMMENT '商品收藏 TopN'
    row format delimited fields terminated by '	'
    location '/warehouse/gmall/ads/ads_product_favor_topN';

    2)导入数据

    insert into table ads_product_favor_topN
    select
    '2020-03-10' dt,
    sku_id,
    favor_count
    from
    dws_sku_action_daycount
    where
    dt='2020-03-10'
    order by favor_count desc
    limit 10;

    3)查询数据

    select * from ads_product_favor_topN;

    7.3.4 商品加入购物车排名

    1)建表语句

    drop table if exists ads_product_cart_topN;
    create external table ads_product_cart_topN(
    `dt` string COMMENT '统计日期',
    `sku_id` string COMMENT '商品 ID',
    `cart_num` bigint COMMENT '加入购物车数量'
    ) COMMENT '商品加入购物车 TopN'
    row format delimited fields terminated by '	'
    location '/warehouse/gmall/ads/ads_product_cart_topN';

    2)导入数据

    insert into table ads_product_cart_topN
    select
    '2020-03-10' dt,
    sku_id,
    cart_num
    from
    dws_sku_action_daycount
    where
    dt='2020-03-10'
    order by cart_num desc
    limit 10;

    3)查询数据

    select * from ads_product_cart_topN;

    7.3.5 商品退款率排名(最近 30 天)

    1)建表语句

    drop table if exists ads_product_refund_topN;
    create external table ads_product_refund_topN(
    `dt` string COMMENT '统计日期',
    `sku_id` string COMMENT '商品 ID',
    `refund_ratio` decimal(10,2) COMMENT '退款率'
    ) COMMENT '商品退款率 TopN'
    row format delimited fields terminated by '	'
    location '/warehouse/gmall/ads/ads_product_refund_topN';

    2)导入数据

    insert into table ads_product_refund_topN
    select
    '2020-03-10',
    sku_id,
    refund_last_30d_count/payment_last_30d_count*100 refund_ratio
    from dwt_sku_topic
    order by refund_ratio desc
    limit 10;

    3)查询数据

    select * from ads_product_refund_topN;

    7.3.6 商品差评率
    1)建表语句

    drop table if exists ads_appraise_bad_topN;
    create external table ads_appraise_bad_topN(
    `dt` string COMMENT '统计日期',
    `sku_id` string COMMENT '商品 ID',
    `appraise_bad_ratio` decimal(10,2) COMMENT '差评率'
    ) COMMENT '商品差评率 TopN'
    row format delimited fields terminated by '	'
    location '/warehouse/gmall/ads/ads_appraise_bad_topN';

    2)导入数据

    insert into table ads_appraise_bad_topN
    select
    '2020-03-10' dt,
    sku_id,
    appraise_bad_count/(appraise_good_count+appraise_mid_count+appraise_bad_coun
    t+appraise_default_count) appraise_bad_ratio
    from
    dws_sku_action_daycount
    where
    dt='2020-03-10'
    order by appraise_bad_ratio desc
    limit 10;

    3)查询数据

    select * from ads_appraise_bad_topN;

    7.4 营销主题(用户+商品+购买行为)
    7.4.1 下单数目统计
    需求分析:统计每日下单数,下单金额及下单用户数
    1)建表语句

    drop table if exists ads_order_daycount;
    create external table ads_order_daycount(
    dt string comment '统计日期',
    order_count bigint comment '单日下单笔数',
    order_amount decimal(10,2) comment '单日下单金额',
    order_users bigint comment '单日下单用户数'
    ) comment '每日订单总计表'
    row format delimited fields terminated by '	'
    location '/warehouse/gmall/ads/ads_order_daycount';

    2)导入数据

    insert into table ads_order_daycount
    select
    '2020-03-10',
    sum(order_count),
    sum(order_amount),
    sum(if(order_count>0,1,0))
    from dws_user_action_daycount
    where dt='2020-03-10';

    3)查询数据

    select * from ads_order_daycount;

    7.4.2 支付信息统计
    每日支付金额、支付人数、支付商品数、支付笔数以及下单到支付的平均时长(取自 DWD)
    1)建表

    drop table if exists ads_payment_daycount;
    create external table ads_payment_daycount(
    dt string comment '统计日期',
    payment_count bigint comment '单日支付笔数',
    payment_amount decimal(10,2) comment '单日支付金额',
    payment_user_count bigint comment '单日支付人数',
    payment_sku_count bigint comment '单日支付商品数',
    payment_avg_time double comment '下单到支付的平均时长,取分钟数'
    ) comment '每日订单总计表'
    row format delimited fields terminated by '	'
    location '/warehouse/gmall/ads/ads_payment_daycount';

    2)导入数据

    insert into table ads_payment_daycount
    select
    tmp_payment.dt,
    tmp_payment.payment_count,
    tmp_payment.payment_amount,
    tmp_payment.payment_user_count,
    tmp_skucount.payment_sku_count,
    tmp_time.payment_avg_time
    from
    (
    select
    '2020-03-15' dt,
    sum(payment_count) payment_count,
    sum(payment_amount) payment_amount,
    sum(if(payment_count>0,1,0)) payment_user_count
    from dws_user_action_daycount
    where dt='2020-03-15'
    )tmp_payment
    join
    (
    select
    '2020-03-15' dt,
    sum(if(payment_count>0,1,0)) payment_sku_count
    from dws_sku_action_daycount
    where dt='2020-03-15'
    )tmp_skucount on tmp_payment.dt=tmp_skucount.dt
    join
    (
    select
    '2020-03-15' dt,
    sum(unix_timestamp(payment_time)-unix_timestamp(create_time))/count(*)/60
    payment_avg_time
    from dwd_fact_order_info
    where dt='2020-03-15'
    and payment_time is not null
    )tmp_time on tmp_payment.dt=tmp_time.dt

    3)查询数据

    select * from ads_payment_daycount;

    7.4.3 复购率
    1)建表语句

    drop table ads_sale_tm_category1_stat_mn;
    create external table ads_sale_tm_category1_stat_mn
    (
    tm_id string comment '品牌 id',
    category1_id string comment '1 级品类 id ',
    category1_name string comment '1 级品类名称 ',
    buycount bigint comment '购买人数',
    buy_twice_last bigint comment '两次以上购买人数',
    buy_twice_last_ratio decimal(10,2) comment '单次复购率',
    buy_3times_last bigint comment '三次以上购买人数',
    buy_3times_last_ratio decimal(10,2) comment '多次复购率',
    stat_mn string comment '统计月份',
    stat_date string comment '统计日期'
    ) COMMENT '复购率统计'
    row format delimited fields terminated by '	'
    location '/warehouse/gmall/ads/ads_sale_tm_category1_stat_mn/';

    2)数据导入

    insert into table ads_sale_tm_category1_stat_mn
    select
    mn.sku_tm_id,
    mn.sku_category1_id,
    mn.sku_category1_name,
    sum(if(mn.order_count>=1,1,0)) buycount,
    sum(if(mn.order_count>=2,1,0)) buyTwiceLast,
    sum(if(mn.order_count>=2,1,0))/sum( if(mn.order_count>=1,1,0))
    buyTwiceLastRatio,
    sum(if(mn.order_count>=3,1,0)) buy3timeLast ,
    sum(if(mn.order_count>=3,1,0))/sum( if(mn.order_count>=1,1,0))
    buy3timeLastRatio ,
    date_format('2020-03-10' ,'yyyy-MM') stat_mn,
    '2020-03-10' stat_date
    from
    (
            select
            user_id,
            sd.sku_tm_id,
            sd.sku_category1_id,
            sd.sku_category1_name,
            sum(order_count) order_count
            from dws_sale_detail_daycount sd
            where date_format(dt,'yyyy-MM')=date_format('2020-03-10' ,'yyyy-MM')
            group by user_id, sd.sku_tm_id, sd.sku_category1_id, sd.sku_category1_name
    ) mn
    group by mn.sku_tm_id, mn.sku_category1_id, mn.sku_category1_name;

    7.5 ADS 层导入脚本

    1)vim dwt_to_ads.sh
    在脚本中填写如下内容

    #!/bin/bash
    hive=/opt/modules/hive/bin/hive
    # 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
    if [ -n "$1" ] ;then
    do_date=$1
    else
    do_date=`date -d "-1 day" +%F`
    fi
    sql="use gmall;
    insert into table ads_uv_count
    select
    '$do_date',
    sum(if(login_date_last='$do_date',1,0)),
    sum(if(login_date_last>=date_add(next_day('$do_date','monday'),-7) and
    login_date_last<=date_add(next_day('$do_date','monday'),-1) ,1,0)),
    sum(if(date_format(login_date_last,'yyyy-MM')=date_format('$do_date','yyyy-M
    M'),1,0)),
    if('$do_date'=date_add(next_day('$do_date','monday'),-1),'Y','N'),
    if('$do_date'=last_day('$do_date'),'Y','N')
    from dwt_uv_topic;
    
    
    insert into table ads_new_mid_count
    select
    '$do_date',
    count(*)
    from dwt_uv_topic
    where login_date_first='$do_date';
    
    
    insert into table ads_silent_count
    select
    '$do_date',
    count(*)
    from dwt_uv_topic
    where login_date_first=login_date_last
    and login_date_last<=date_add('$do_date',-7);
    
    
    insert into table ads_back_count
    select
    '$do_date',
    concat(date_add(next_day('2020-03-10','MO'),-7),'_',date_add(next_day('2020-
    03-10','MO'),-1)),
    count(*)
    from
    (
    select
    mid_id
    from dwt_uv_topic
    where login_date_last>=date_add(next_day('$do_date','MO'),-7)
    and login_date_last<= date_add(next_day('$do_date','MO'),-1)
    and login_date_first<date_add(next_day('$do_date','MO'),-7)
    )current_wk
    left join
    (
    select
    mid_id
    from dws_uv_detail_daycount
    where dt>=date_add(next_day('$do_date','MO'),-7*2)
    and dt<= date_add(next_day('$do_date','MO'),-7-1)
    group by mid_id
    )last_wk
    on current_wk.mid_id=last_wk.mid_id
    where last_wk.mid_id is null;
    
    
    insert into table ads_wastage_count
    select
    '$do_date',
    count(*)
    from dwt_uv_topic
    where login_date_last<=date_add('$do_date',-7);
    
    
    insert into table ads_user_retention_day_rate
    select
    '$do_date',
    date_add('$do_date',-3),
    3,
    sum(if(login_date_first=date_add('$do_date',-3) and
    login_date_last='$do_date',1,0)),
    sum(if(login_date_first=date_add('$do_date',-3),1,0)),
    sum(if(login_date_first=date_add('$do_date',-3) and
    login_date_last='$do_date',1,0))/sum(if(login_date_first=date_add('$do_date',
    -3),1,0))*100
    from dwt_uv_topic
    union all
    select
    '$do_date',
    date_add('$do_date',-2),
    2,
    sum(if(login_date_first=date_add('$do_date',-2) and
    login_date_last='$do_date',1,0)),
    sum(if(login_date_first=date_add('$do_date',-2),1,0)),
    sum(if(login_date_first=date_add('$do_date',-2) and
    login_date_last='$do_date',1,0))/sum(if(login_date_first=date_add('$do_date',
    -2),1,0))*100
    from dwt_uv_topic
    union all
    select
    '$do_date',
    date_add('$do_date',-1),
    1,
    sum(if(login_date_first=date_add('$do_date',-1) and
    login_date_last='$do_date',1,0)),
    sum(if(login_date_first=date_add('$do_date',-1),1,0)),
    sum(if(login_date_first=date_add('$do_date',-1) and
    login_date_last='$do_date',1,0))/sum(if(login_date_first=date_add('$do_date',
    -1),1,0))*100
    from dwt_uv_topic;
    
    
    insert into table ads_continuity_wk_count
    select
    '$do_date',
    concat(date_add(next_day('$do_date','MO'),-7*3),'_',date_add(next_day('$do_d
    ate','MO'),-1)),
    count(*)
    from
    (
    select
    mid_id
    from
    (
    select
    mid_id
    from dws_uv_detail_daycount
    where dt>=date_add(next_day('$do_date','monday'),-7)
    and dt<=date_add(next_day('$do_date','monday'),-1)
    group by mid_id
    union all
    select
    mid_id
    from dws_uv_detail_daycount
    where dt>=date_add(next_day('$do_date','monday'),-7*2)
    and dt<=date_add(next_day('$do_date','monday'),-7-1)
    group by mid_id
    union all
    select
    mid_id
    from dws_uv_detail_daycount
    where dt>=date_add(next_day('$do_date','monday'),-7*3)
    and dt<=date_add(next_day('$do_date','monday'),-7*2-1)
    group by mid_id
    )t1
    group by mid_id
    having count(*)=3
    )t2;
    
    
    insert into table ads_continuity_uv_count
    select
    '$do_date',
    concat(date_add('$do_date',-6),'_','$do_date'),
    count(*)
    from
    (
    select mid_id
    from
    (
    select mid_id
    from
    (
    select
    mid_id,
    date_sub(dt,rank) date_dif
    from
    (
    select
    mid_id,
    dt,
    rank() over(partition by mid_id order by dt) rank
    from dws_uv_detail_daycount
    where dt>=date_add('$do_date',-6) and dt<='$do_date'
    )t1
    )t2
    group by mid_id,date_dif
    having count(*)>=3
    )t3
    group by mid_id
    )t4;
    
    
    insert into table ads_user_topic
    select
    '$do_date',
    sum(if(login_date_last='$do_date',1,0)),
    sum(if(login_date_first='$do_date',1,0)),
    sum(if(payment_date_first='$do_date',1,0)),
    sum(if(payment_count>0,1,0)),
    count(*),
    sum(if(login_date_last='$do_date',1,0))/count(*),
    sum(if(payment_count>0,1,0))/count(*),
    sum(if(login_date_first='$do_date',1,0))/sum(if(login_date_last='$do_date',1,
    0))
    from dwt_user_topic;
    
    
    insert into table ads_user_action_convert_day
    select
    '$do_date',
    uv.day_count,
    ua.cart_count,
    ua.cart_count/uv.day_count*100 visitor2cart_convert_ratio,
    ua.order_count,
    ua.order_count/ua.cart_count*100 visitor2order_convert_ratio,
    ua.payment_count,
    ua.payment_count/ua.order_count*100 order2payment_convert_ratio
    from
    (
    select
    '$do_date' dt,
    sum(if(cart_count>0,1,0)) cart_count,
    sum(if(order_count>0,1,0)) order_count,
    sum(if(payment_count>0,1,0)) payment_count
    from dws_user_action_daycount
    where dt='$do_date'
    )ua join ads_uv_count uv on uv.dt=ua.dt;
    
    
    insert into table ads_product_info
    select
    '$do_date' dt,
    sku_num,
    spu_num
    from
    (
    select
    '$do_date' dt,
    count(*) sku_num
    from
    dwt_sku_topic
    ) tmp_sku_num
    join
    (
    select
    '$do_date' dt,
    count(*) spu_num
    from
    (
    select
    spu_id
    from
    dwt_sku_topic
    group by
    spu_id
    ) tmp_spu_id
    ) tmp_spu_num
    on tmp_sku_num.dt=tmp_spu_num.dt;
    
    
    insert into table ads_product_sale_topN
    select
    '$do_date',
    sku_id,
    payment_amount
    from dws_sku_action_daycount
    where dt='$do_date'
    order by payment_amount desc
    limit 10;
    
    
    insert into table ads_product_favor_topN
    select
    '$do_date',
    sku_id,
    favor_count
    from dws_sku_action_daycount
    where dt='$do_date'
    order by favor_count
    limit 10;
    
    
    insert into table ads_product_cart_topN
    select
    '$do_date' dt,
    sku_id,
    cart_num
    from dws_sku_action_daycount
    where dt='$do_date'
    order by cart_num
    limit 10;
    
    
    insert into table ads_product_refund_topN
    select
    '$do_date',
    sku_id,
    refund_last_30d_count/payment_last_30d_count*100 refund_ratio
    from dwt_sku_topic
    order by refund_ratio desc
    limit 10;
    
    
    insert into table ads_appraise_bad_topN
    select
    '$do_date' dt,
    sku_id,
    appraise_bad_count/(appraise_bad_count+appraise_good_count+appraise_mid_coun
    t+appraise_default_count)*100 appraise_bad_ratio
    from dws_sku_action_daycount
    where dt='$do_date'
    order by appraise_bad_ratio desc
    limit 10;
    
    
    insert into table ads_order_daycount
    select
    '$do_date',
    sum(order_count),
    sum(order_amount),
    sum(if(order_count>0,1,0))
    from dws_user_action_daycount
    where dt='$do_date';
    
    
    insert into table ads_payment_daycount
    select
    tmp_payment.dt,
    tmp_payment.payment_count,
    tmp_payment.payment_amount,
    tmp_payment.payment_user_count,
    tmp_skucount.payment_sku_count,
    tmp_time.payment_avg_time
    from
    (
    select
    '$do_date' dt,
    sum(payment_count) payment_count,
    sum(payment_amount) payment_amount,
    sum(if(payment_count>0,1,0)) payment_user_count
    from dws_user_action_daycount
    where dt='$do_date'
    )tmp_payment
    join
    (
    select
    '$do_date' dt,
    sum(if(payment_count>0,1,0)) payment_sku_count
    from dws_sku_action_daycount
    where dt='$do_date'
    )tmp_skucount on tmp_payment.dt=tmp_skucount.dt
    join
    (
    select
    '$do_date' dt,
    sum(unix_timestamp(payment_time)-unix_timestamp(create_time))/count(*)/60
    payment_avg_time
    from dwd_fact_order_info
    where dt='$do_date'
    and payment_time is not null
    )tmp_time on tmp_payment.dt=tmp_time.dt;
    
    
    insert into table ads_sale_tm_category1_stat_mn
    select
    mn.sku_tm_id,
    mn.sku_category1_id,
    mn.sku_category1_name,
    sum(if(mn.order_count>=1,1,0)) buycount,
    sum(if(mn.order_count>=2,1,0)) buyTwiceLast,
    sum(if(mn.order_count>=2,1,0))/sum( if(mn.order_count>=1,1,0))
    buyTwiceLastRatio,
    sum(if(mn.order_count>=3,1,0)) buy3timeLast ,
    sum(if(mn.order_count>=3,1,0))/sum( if(mn.order_count>=1,1,0))
    buy3timeLastRatio ,
    date_format('$do_date' ,'yyyy-MM') stat_mn,
    '$do_date' stat_date
    from
    (
    select
    user_id,
    sd.sku_tm_id,
    sd.sku_category1_id,
    sd.sku_category1_name,
    sum(order_count) order_count
    from dws_sale_detail_daycount sd
    where date_format(dt,'yyyy-MM')=date_format('$do_date' ,'yyyy-MM')
    group by user_id, sd.sku_tm_id, sd.sku_category1_id, sd.sku_category1_name
    ) mn
    group by mn.sku_tm_id, mn.sku_category1_id, mn.sku_category1_name;
    "
    $hive -e "$sql"

    2)增加脚本执行权限

    chmod 770 dwt_to_ads.sh

    3)执行脚本导入数据

    dwt_to_ads.sh  2020-03-10
    作者:大码王

    -------------------------------------------

    个性签名:独学而无友,则孤陋而寡闻。做一个灵魂有趣的人!

    如果觉得这篇文章对你有小小的帮助的话,记得在右下角点个“推荐”哦,博主在此感谢!

    万水千山总是情,打赏一分行不行,所以如果你心情还比较高兴,也是可以扫码打赏博主,哈哈哈(っ•?ω•?)っ???!

  • 相关阅读:
    Flink命令行提交job (源码分析)
    Flink 中LatencyMarks延迟监控(源码分析)
    Flink中的CEP复杂事件处理 (源码分析)
    Flink中异步AsyncIO的实现 (源码分析)
    Flink中发送端反压以及Credit机制(源码分析)
    Flink中接收端反压以及Credit机制 (源码分析)
    Flink整合oozie shell Action 提交任务 带kerberos认证
    Flink中TaskManager端执行用户逻辑过程(源码分析)
    Flink的Job启动TaskManager端(源码分析)
    Flink中Idle停滞流机制(源码分析)
  • 原文地址:https://www.cnblogs.com/huanghanyu/p/13739378.html
Copyright © 2020-2023  润新知