• hive拉链表以及退链例子笔记


     拉链表设计:

      在企业中,由于有些流水表每日有几千万条记录,数据仓库保存5年数据的话很容易不堪重负,因此可以使用拉链表的算法来节省存储空间。

     例子:

    -- 用户信息表;  采集当日全量数据存储到 (当日) 表中
    CREATE TABLE dwd.user_info(
        id string,      
        name string,
        sex string,
        biz_date string  -- 业务日期
    )
    
    
    -- 用户信息整合表
    CREATE TABLE dws.user_merge_info(
        id string,
        name string,
        sex string,
        start_date string,
        end_date string
    )
    
    -- 测试插入用户信息
    INSERT INTO dwd.user_info 
    SELECT 
    '1','YaoMing','boy','20190701'
    UNION ALL 
    SELECT 
    '2','YaoLinlin','girl','20190701'
    UNION ALL 
    SELECT 
    '3','CaiLili','girl','20190701'
    UNION ALL 
    SELECT 
    '4','ZhangSan','girl','20190702'
    UNION ALL 
    SELECT 
    '5','LiSi','girl','20190702'
    
    
    -- 查看数据
    SELECT * FROM dwd.user_info 

    -- 初始化用户信息整合表
    INSERT overwrite TABLE dws.user_merge_info
    SELECT
        id,
        name,
        sex,
        '20190701' AS start_date,
        '99991231' AS end_date
    FROM (
        SELECT 
            id,
            name,
            sex,
            row_number() over(PARTITION BY id ORDER BY biz_date) AS row_num  -- 初始化时候根据主键id分组,取最新修改的数据
        FROM dwd.user_info 
        ) t
    WHERE t.row_num = 1 
    
    -- 查看数据
    SELECT * FROM dws.user_merge_info

    -- 现在biz_date='20190702'这天,新跑了一条全新数据id=6,以及修改了一条id=2的数据
    INSERT INTO dwd.user_info 
    SELECT 
    '6','WangWu','boy','20190702'
    UNION ALL 
    SELECT 
    '2','YaoLinlin','boy','20190702'
    
    -- 查看数据
    SELECT * FROM dwd.user_info  ORDER BY id,biz_date

     

    -- 新增修改以及完全新增
    INSERT overwrite TABLE tmp.user_merge_info_new
    -- 修改的数据
    SELECT
    b.id,
    b.name,
    b.sex,
    '20190702' AS start_date,        -- ${bizdate} 业务日期
    '99991231' AS end_date            -- 99991231代表有效数据
    FROM dws.user_merge_info     a,
         dwd.user_info              b
    WHERE a.id = b.id 
    AND a.end_date = '99991231' 
    AND b.biz_date = '20190702'     -- ${bizdate}只取当天数据
    AND (
           a.name!= b.name
        OR a.sex != b.sex
    )
    
    UNION ALL 
    -- 全新的数据
    SELECT                                                    
        b.id,
        b.name,
        b.sex,
        '20190702'AS start_date,
        '99991231'AS end_date    
    FROM         dws.user_merge_info     a
    RIGHT JOIN  dwd.user_info         b
    ON a.id = b.id
    WHERE b.biz_date='20190702'
        AND a.id IS NULL;

    -- 闭链
    INSERT overwrite TABLE tmp.user_merge_info_upt
    SELECT
        a.id,
        a.name,
        a.sex,
        a.start_date,
        '20190702'            -- 闭链,${biz_date}业务时间
    FROM dws.user_merge_info a
    LEFT JOIN dwd.user_info b
    ON a.id=b.id 
    WHERE     a.end_date='99991231' 
        AND b.biz_date='20190702' 
        AND (
            a.name != b.name
            OR a.sex != b.sex
            )

    -- 历史数据
    INSERT overwrite TABLE tmp.user_merge_info_new
    SELECT
        a.id,
        a.name,
        a.sex,
        a.start_date,
        a.end_date
    FROM dws.user_merge_info       a,
         tmp.user_merge_info_upt  b
    WHERE a.id != b.id;

    -- 整合数据 
    INSERT OVERWRITE TABLE dws.user_merge_info
    SELECT
        id,
        name,
        sex,
        start_date,
        end_date
    FROM tmp.user_merge_info_new
    UNION ALL 
    SELECT
        id,
        name,
        sex,
        start_date,
        end_date
    FROM tmp.user_merge_info_upt
    UNION ALL 
    SELECT
        id,
        name,
        sex,
        start_date,
        end_date
    FROM tmp.user_merge_info_his
    
    
    -- 查看下数据
    SELECT * FROM dws.user_merge_info ORDER BY id,start_date

    以上拉链表就实现好了


    以下是退链操作模板

    #!/bin/bash
    
    # 使用说明提示
    if [ $# -ne 1 ]; then
        echo "Usage : `basename $0` biz_date"
        exit 1
    fi
    
    #业务时间
    biz_date=$1
    
    # 判断是数据整合还是回退拉链表
    isGoBack=`execHQL "select count(1) from dws.user_merge_info where (end_date>='$biz_date' or start_date>='$biz_date') and biz_date<>'99991231';"`
    
    if [ $isGoBack -ne 0 ];then
        # 回退模式
        Log "
    ## 【user_merge_info表回退】 执行开始 ##" 
        execHQL "
            INSERT overwrite TABLE dws.user_merge_info 
            --  完全不变的数据
            SELECT 
                 id
                ,name
                ,sex     
                ,start_date
                ,end_date 
            FROM dws.user_merge_info 
            WHERE (start_date<'$biz_date' AND end_date='99991231') OR end_date<'$biz_date'
                
            UNION ALL 
            
            -- 重跑 重新开链的数据
            SELECT 
                 id
                ,name
                ,sex     
                ,start_date
                ,'99991231' AS end_date
            FROM dws.user_merge_info 
            WHERE start_date<'$biz_date' AND end_date>='$biz_date' AND end_date<>'99991231';
        "
        if [ $? -ne 0 ];then
            Log "
    ## 【user_merge_info表回退】 执行失败 ##"
            exit 1
        fi
        Log "
    ## 【user_merge_info表回退】 执行成功 ##"
    
    fi
  • 相关阅读:
    (转)大型分布式网站架构技术总结
    VC中怎么输入特殊符号(如平方、立方等下标符号)
    【转】VC++的Unicode编程
    【转】VC下的Unicode编程
    MFC多国语言——资源DLL
    MFC多国语言——资源副本
    【转】SetThreadLocale解决越南文乱码问题
    【转】msxml 操作xml
    关于形如--error LNK2005: xxx 已经在 msvcrtd.lib ( MSVCR90D.dll ) 中定义--的问题分析解决
    vc6项目-vc8项目 转换日志
  • 原文地址:https://www.cnblogs.com/linkmust/p/11289759.html
Copyright © 2020-2023  润新知