拉链表设计:
在企业中,由于有些流水表每日有几千万条记录,数据仓库保存5年数据的话很容易不堪重负,因此可以使用拉链表的算法来节省存储空间。
例子:
-- 用户信息表; 采集当日全量数据存储到 (当日) 表中 CREATE TABLE dwd.user_info( id string, name string, sex string, biz_date string -- 业务日期 ) -- 用户信息整合表 CREATE TABLE dws.user_merge_info( id string, name string, sex string, start_date string, end_date string ) -- 测试插入用户信息 INSERT INTO dwd.user_info SELECT '1','YaoMing','boy','20190701' UNION ALL SELECT '2','YaoLinlin','girl','20190701' UNION ALL SELECT '3','CaiLili','girl','20190701' UNION ALL SELECT '4','ZhangSan','girl','20190702' UNION ALL SELECT '5','LiSi','girl','20190702' -- 查看数据 SELECT * FROM dwd.user_info
-- 初始化用户信息整合表 INSERT overwrite TABLE dws.user_merge_info SELECT id, name, sex, '20190701' AS start_date, '99991231' AS end_date FROM ( SELECT id, name, sex, row_number() over(PARTITION BY id ORDER BY biz_date) AS row_num -- 初始化时候根据主键id分组,取最新修改的数据 FROM dwd.user_info ) t WHERE t.row_num = 1 -- 查看数据 SELECT * FROM dws.user_merge_info
-- 现在biz_date='20190702'这天,新跑了一条全新数据id=6,以及修改了一条id=2的数据 INSERT INTO dwd.user_info SELECT '6','WangWu','boy','20190702' UNION ALL SELECT '2','YaoLinlin','boy','20190702' -- 查看数据 SELECT * FROM dwd.user_info ORDER BY id,biz_date
-- 新增修改以及完全新增 INSERT overwrite TABLE tmp.user_merge_info_new -- 修改的数据 SELECT b.id, b.name, b.sex, '20190702' AS start_date, -- ${bizdate} 业务日期 '99991231' AS end_date -- 99991231代表有效数据 FROM dws.user_merge_info a, dwd.user_info b WHERE a.id = b.id AND a.end_date = '99991231' AND b.biz_date = '20190702' -- ${bizdate}只取当天数据 AND ( a.name!= b.name OR a.sex != b.sex ) UNION ALL -- 全新的数据 SELECT b.id, b.name, b.sex, '20190702'AS start_date, '99991231'AS end_date FROM dws.user_merge_info a RIGHT JOIN dwd.user_info b ON a.id = b.id WHERE b.biz_date='20190702' AND a.id IS NULL;
-- 闭链 INSERT overwrite TABLE tmp.user_merge_info_upt SELECT a.id, a.name, a.sex, a.start_date, '20190702' -- 闭链,${biz_date}业务时间 FROM dws.user_merge_info a LEFT JOIN dwd.user_info b ON a.id=b.id WHERE a.end_date='99991231' AND b.biz_date='20190702' AND ( a.name != b.name OR a.sex != b.sex )
-- 历史数据 INSERT overwrite TABLE tmp.user_merge_info_new SELECT a.id, a.name, a.sex, a.start_date, a.end_date FROM dws.user_merge_info a, tmp.user_merge_info_upt b WHERE a.id != b.id;
-- 整合数据 INSERT OVERWRITE TABLE dws.user_merge_info SELECT id, name, sex, start_date, end_date FROM tmp.user_merge_info_new UNION ALL SELECT id, name, sex, start_date, end_date FROM tmp.user_merge_info_upt UNION ALL SELECT id, name, sex, start_date, end_date FROM tmp.user_merge_info_his -- 查看下数据 SELECT * FROM dws.user_merge_info ORDER BY id,start_date
以上拉链表就实现好了
以下是退链操作模板
#!/bin/bash # 使用说明提示 if [ $# -ne 1 ]; then echo "Usage : `basename $0` biz_date" exit 1 fi #业务时间 biz_date=$1 # 判断是数据整合还是回退拉链表 isGoBack=`execHQL "select count(1) from dws.user_merge_info where (end_date>='$biz_date' or start_date>='$biz_date') and biz_date<>'99991231';"` if [ $isGoBack -ne 0 ];then # 回退模式 Log " ## 【user_merge_info表回退】 执行开始 ##" execHQL " INSERT overwrite TABLE dws.user_merge_info -- 完全不变的数据 SELECT id ,name ,sex ,start_date ,end_date FROM dws.user_merge_info WHERE (start_date<'$biz_date' AND end_date='99991231') OR end_date<'$biz_date' UNION ALL -- 重跑 重新开链的数据 SELECT id ,name ,sex ,start_date ,'99991231' AS end_date FROM dws.user_merge_info WHERE start_date<'$biz_date' AND end_date>='$biz_date' AND end_date<>'99991231'; " if [ $? -ne 0 ];then Log " ## 【user_merge_info表回退】 执行失败 ##" exit 1 fi Log " ## 【user_merge_info表回退】 执行成功 ##" fi