一:di表、ds表
di表多用于事实表,例如:从数据库抽取的交易记录表,购买详单表等
实现逻辑较为简单,一般直接ETL即可。
ds表一般就是简单汇聚,例如:用户每天使用的pkg的数量表,一般直接group by 就行。
二:dd表,全量表
dd表多用于记录每日的全量状态,例如:用户账号密码表、用户通讯录表
实现逻辑一般为:取今天上报的全量的数据覆盖昨天表中的数据,完成更新
SQL例子:
insert overwrite table dwd.dwd_user_info_dd(dt='20220503')
select coalesce(a.user_id,b.user_id) as user_id,coalesce(a.username,b.username) as username,coalesce(a.password,b.password) as password from
(select user_id,username,password from ods.ods_user_info_di where dt='20220503')a full join
(select user_id,username,password from dwd.dwd_user_info_dd where dt='20220502')b on a.user_id = b.user_id;
三:trace_dd表,轨迹表
trace_dd表多用于记录活跃轨迹:用户活跃轨迹表(一张表记录用户从历史到现在所有的活跃轨迹)
实现逻辑一般为:一个字段记录01串,表示用户是否活跃,一般附加两个字段,初始活跃日期和最近活跃日期
SQL例子:
insert overwrite table dws.dws_user_active_trace_dd(dt='20220504')
select coalesce(a.user_id,b.user_id) as user_id,if(b.user_id is not null,init_date,'20220504') as init_date,
if(a.user_id is not null,'20220504',active_date) as active_date,
if(a.user_id is not null and b.user_id is not null,concat(active_trace,'1'),
if(a.user_id is not null and b.user_id is null,'1',concat(active_trace,'0'))) as active_trace from
(select user_id from dwd.dwd_user_active_di where dt='20220504' group by user_id)a full join
(select user_id,init_date,active_date,active_trace from dws.dws_user_active_trace_dd where dt='20220503')b on a.user_id = b.user_id;
四:周表、双周表、月表
周表,双周表,月表:一般有最近一周两周月表、累计一周两周月表两种形式。
实现逻辑一般为:周表一般为直接计算7天的数据,双周表一般为单周表+计算7天的数据,月表一般为三周表+最近7天的数据(具体实现为一个公共函数,对外提供计算口径及方法)
例子:用户pkg的使用时长表
/**
 * Date-window planner for the 3-day / weekly / bi-weekly / monthly roll-up jobs.
 *
 * Every public method takes a `yyyyMMdd` string and returns a '|'-separated
 * string. Apart from `dealastweek` (see its note), the first field is
 * "true"/"false" and the remaining fields name the partitions (3-day blocks,
 * 7/14-day blocks and single days) that together cover the requested window.
 *
 * The 3-day grid is derived from the calendar's epoch milliseconds
 * (`millis / 1000 / 60 / 60 / 24 % 3 == 0` marks the LAST day of a block), and
 * the calendar is created with `Calendar.getInstance`, so the grid depends on
 * the JVM default time zone.
 */
object parseDate {

  import java.text.SimpleDateFormat
  import java.util.Calendar

  // SimpleDateFormat is not thread-safe, so every call builds its own instance.
  private def newDayFormat: SimpleDateFormat = new SimpleDateFormat("yyyyMMdd")

  // Calendar (default time zone) positioned on the parsed dealdate.
  private def calendarFor(fmt: SimpleDateFormat, dealdate: String): Calendar = {
    val cal = Calendar.getInstance
    cal.setTime(fmt.parse(dealdate))
    cal
  }

  // Formats the calendar's current day as yyyyMMdd.
  private def day(fmt: SimpleDateFormat, cal: Calendar): String = fmt.format(cal.getTime)

  // Moves the calendar `days` days into the past.
  private def back(cal: Calendar, days: Int): Unit = cal.add(Calendar.DAY_OF_YEAR, -days)

  // True when the calendar's day is the last day of a 3-day block.
  private def closesThreeDayBlock(cal: Calendar): Boolean =
    cal.getTimeInMillis / 1000 / 60 / 60 / 24 % 3 == 0

  /**
   * Splits the 7 days ending on the calendar's current day into 3-day blocks
   * plus single days, and returns the fields that follow the flag:
   *
   *  - `|1|start|end|d0|d1|d2|d3` : one 3-day block, two trailing single days
   *    (d0, d1) and two leading single days (d2, d3);
   *  - `|2|start1|end1|start2|end2|d0` : two 3-day blocks (the later one first)
   *    and one single day, which is the window's last day when one trailing
   *    day was recorded and its first day when none was.
   *
   * The number of leading days is counted rather than found by walking back to
   * a Sunday, so the result is a 7-day window for any weekday.
   */
  private def sevenDayFields(fmt: SimpleDateFormat, cal: Calendar): String = {
    val singles = Array.fill(4)("")
    var used = 0
    // Trailing days that come after the most recent complete 3-day block.
    while (!closesThreeDayBlock(cal)) {
      singles(used) = day(fmt, cal)
      used += 1
      back(cal, 1)
    }
    val firstEnd = day(fmt, cal)
    back(cal, 2)
    val firstStart = day(fmt, cal)
    back(cal, 1)
    if (used == 2) {
      // 2 trailing days + 3 block days = 5, so exactly 2 leading days remain.
      (0 until 2).foreach { _ =>
        singles(used) = day(fmt, cal)
        used += 1
        back(cal, 1)
      }
      s"|1|$firstStart|$firstEnd|${singles(0)}|${singles(1)}|${singles(2)}|${singles(3)}"
    } else {
      val secondEnd = day(fmt, cal)
      back(cal, 2)
      val secondStart = day(fmt, cal)
      // With no trailing day the two blocks cover only 6 days: add one leading day.
      if (used == 0) {
        back(cal, 1)
        singles(0) = day(fmt, cal)
      }
      s"|2|$firstStart|$firstEnd|$secondStart|$secondEnd|${singles(0)}"
    }
  }

  /**
   * Emits `|firstStart|firstEnd|secondStart|secondEnd` for two consecutive
   * blocks of `blockDays` days ending on the calendar's current day, and
   * leaves the calendar on `firstStart`.
   */
  private def twoBlockFields(fmt: SimpleDateFormat, cal: Calendar, blockDays: Int): String = {
    val secondEnd = day(fmt, cal)
    back(cal, blockDays - 1)
    val secondStart = day(fmt, cal)
    back(cal, 1)
    val firstEnd = day(fmt, cal)
    back(cal, blockDays - 1)
    val firstStart = day(fmt, cal)
    s"|$firstStart|$firstEnd|$secondStart|$secondEnd"
  }

  /**
   * 3-day table trigger.
   *
   * @param dealdate day to test, formatted yyyyMMdd
   * @return `true|<start>|<dealdate>` when dealdate closes a 3-day block,
   *         otherwise `false||`
   */
  def deal3days(dealdate: String): String = {
    val fmt = newDayFormat
    val cal = calendarFor(fmt, dealdate)
    if (closesThreeDayBlock(cal)) {
      back(cal, 2)
      "true" + "|" + day(fmt, cal) + "|" + dealdate
    } else {
      "false" + "||"
    }
  }

  /**
   * Calendar-week plan, produced only when dealdate is a Sunday.
   *
   * @param dealdate day to test, formatted yyyyMMdd
   * @return `true` followed by the 7-day fields (second field: number of
   *         3-day blocks), or `false||||||` on any other weekday
   */
  def dealweek(dealdate: String): String = {
    val fmt = newDayFormat
    val cal = calendarFor(fmt, dealdate)
    if (cal.get(Calendar.DAY_OF_WEEK) == Calendar.SUNDAY) "true" + sevenDayFields(fmt, cal)
    else "false" + "||||||"
  }

  /**
   * Rolling plan for the 7 days ending on dealdate, for any weekday.
   *
   * NOTE(review): the first field is dealdate glued to "true" with no
   * separator (e.g. "20220105true"). Kept unchanged because callers are not
   * visible here; confirm whether the prefix is intended.
   *
   * @param dealdate last day of the window, formatted yyyyMMdd
   * @return `<dealdate>true` followed by the 7-day fields
   */
  def dealastweek(dealdate: String): String = {
    val fmt = newDayFormat
    val cal = calendarFor(fmt, dealdate)
    dealdate + "true" + sevenDayFields(fmt, cal)
  }

  /**
   * Bi-weekly plan, produced only when dealdate is a Sunday whose
   * DAY_OF_WEEK_IN_MONTH is even.
   *
   * @param dealdate day to test, formatted yyyyMMdd
   * @return `true|start1|end1|start2|end2` (two consecutive 7-day blocks,
   *         earlier block first) or `false||||`
   */
  def dealtwoweek(dealdate: String): String = {
    val fmt = newDayFormat
    val cal = calendarFor(fmt, dealdate)
    val evenSunday =
      cal.get(Calendar.DAY_OF_WEEK_IN_MONTH) % 2 == 0 &&
        cal.get(Calendar.DAY_OF_WEEK) == Calendar.SUNDAY
    if (evenSunday) "true" + twoBlockFields(fmt, cal, 7)
    else "false" + "||||"
  }

  /**
   * Rolling plan for the 14 days ending on dealdate.
   *
   * @param dealdate last day of the window, formatted yyyyMMdd
   * @return `true|start1|end1|start2|end2` (two consecutive 7-day blocks,
   *         earlier block first)
   */
  def dealastwoweek(dealdate: String): String = {
    val fmt = newDayFormat
    val cal = calendarFor(fmt, dealdate)
    "true" + twoBlockFields(fmt, cal, 7)
  }

  /**
   * Calendar-month plan, produced only when dealdate is the last day of its
   * month (the following day has DAY_OF_MONTH == 1).
   *
   * @param dealdate day to test, formatted yyyyMMdd
   * @return `true|n|start1|end1|start2|end2|d0|d1|d2` where the two blocks are
   *         consecutive 14-day blocks, n is the number of single days and the
   *         d fields list the trailing single days followed by the leading
   *         ones; `false||||||||` otherwise
   */
  def dealmonth(dealdate: String): String = {
    val fmt = newDayFormat
    val cal = calendarFor(fmt, dealdate)
    val nextDay = calendarFor(fmt, dealdate)
    nextDay.add(Calendar.DAY_OF_YEAR, 1)
    if (nextDay.get(Calendar.DAY_OF_MONTH) == 1) {
      val singles = Array.fill(3)("")
      var used = 0
      // Trailing days: walk back while the weekday ordinal is odd and the day is not a Sunday.
      while (cal.get(Calendar.DAY_OF_WEEK_IN_MONTH) % 2 != 0 &&
        cal.get(Calendar.DAY_OF_WEEK) != Calendar.SUNDAY) {
        singles(used) = day(fmt, cal)
        used += 1
        back(cal, 1)
      }
      val blocks = twoBlockFields(fmt, cal, 14)
      // Leading days between the 1st of the month and the first 14-day block.
      while (cal.get(Calendar.DAY_OF_MONTH) != 1) {
        back(cal, 1)
        singles(used) = day(fmt, cal)
        used += 1
      }
      "true" + "|" + used + blocks + "|" + singles(0) + "|" + singles(1) + "|" + singles(2)
    } else {
      "false" + "||||||||"
    }
  }

  /**
   * Rolling plan for the 30 days ending on dealdate.
   *
   * @param dealdate last day of the window, formatted yyyyMMdd
   * @return `true|2|start1|end1|start2|end2|d0|d1` : two consecutive 14-day
   *         blocks (earlier block first) plus the two single days preceding them
   */
  def dealastmonth(dealdate: String): String = {
    val fmt = newDayFormat
    val cal = calendarFor(fmt, dealdate)
    val blocks = twoBlockFields(fmt, cal, 14)
    val singles = Array("", "")
    var used = 0
    while (used < 2) {
      back(cal, 1)
      singles(used) = day(fmt, cal)
      used += 1
    }
    "true" + "|" + used + blocks + "|" + singles(0) + "|" + singles(1)
  }

  /** Manual smoke run: prints the plan selected by `dealtype` for a few sample dates. */
  def main(args: Array[String]): Unit = {
    val dealtype = 7
    // Sunday samples for the weekly variants: "20220109", "20220116", "20220123", "20220130"
    val dealdatelist = Array("20220227", "20220228", "20220330", "20220331")
    for (dealdate <- dealdatelist) {
      dealtype match {
        case 1 => println(deal3days(dealdate))
        case 2 => println(dealweek(dealdate))
        case 3 => println(dealastweek(dealdate))
        case 4 => println(dealtwoweek(dealdate))
        case 5 => println(dealastwoweek(dealdate))
        case 6 => println(dealmonth(dealdate))
        case 7 => println(dealastmonth(dealdate))
      }
    }
  }
}
SQL例子:
天表:(每天运行)
insert overwrite table dws.dws_user_usedur_ds(dt='20220109')
select user_id,sum(usedur) as usedur from dwd.dwd_user_usedur_di where dt='20220109' group by user_id;
三天表:(每隔三天运行)
System.currentTimeMillis/1000/60/60/24%3 ==0
insert overwrite table dws.dws_user_usedur_4ds(dt='20220103_20220105')
select user_id,sum(usedur) as usedur from dws.dws_user_usedur_ds where dt>='20220103' and dt<='20220105' group by user_id;
insert overwrite table dws.dws_user_usedur_4ds(dt='20220106_20220108')
select user_id,sum(usedur) as usedur from dws.dws_user_usedur_ds where dt>='20220106' and dt<='20220108' group by user_id;
周表:(每周日运行,依赖三天表天表):
insert overwrite table dws.dws_user_usedur_ws(dt='20220103_20220109')
select user_id,sum(usedur) as usedur from (
select user_id,usedur from dws.dws_user_usedur_4ds where dt='20220103_20220105'
union all
select user_id,usedur from dws.dws_user_usedur_4ds where dt='20220106_20220108'
union all
select user_id,usedur from dwd.dwd_user_usedur_di where dt='20220109'
)t group by user_id;
双周表:(每双周日运行,依赖周表):
insert overwrite table dws.dws_user_usedur_2ws(dt='20220103_20220116')
select user_id,sum(usedur) as usedur from (
select user_id,usedur from dws.dws_user_usedur_ws where dt='20220103_20220109'
union all
select user_id,usedur from dws.dws_user_usedur_ws where dt='20220110_20220116'
)t group by user_id;
月表:(每月底运行,依赖双周表天表):
insert overwrite table dws.dws_user_usedur_ms(dt='20220101_20220131')
select user_id,sum(usedur) as usedur from (
select user_id,usedur from dws.dws_user_usedur_2ws where dt='20220103_20220116'
union all
select user_id,usedur from dws.dws_user_usedur_2ws where dt='20220117_20220130'
union all
select user_id,usedur from dws.dws_user_usedur_ds where dt='20220101'
union all
select user_id,usedur from dws.dws_user_usedur_ds where dt='20220102'
union all
select user_id,usedur from dws.dws_user_usedur_ds where dt='20220131'
)t group by user_id;
周表:(每天运行,最近七天数据,依赖三天表天表):
insert overwrite table dws.dws_user_usedur_ws(dt='20220102_20220108')
select user_id,sum(usedur) as usedur from (
select user_id,usedur from dws.dws_user_usedur_4ds where dt='20220103_20220105'
union all
select user_id,usedur from dws.dws_user_usedur_4ds where dt='20220106_20220108'
union all
select user_id,usedur from dwd.dwd_user_usedur_di where dt='20220102'
)t group by user_id;
双周表:(每天运行,最近十四天数据,依赖周表):
insert overwrite table dws.dws_user_usedur_2ws(dt='20220104_20220117')
select user_id,sum(usedur) as usedur from (
select user_id,usedur from dws.dws_user_usedur_ws where dt='20220104_20220110'
union all
select user_id,usedur from dws.dws_user_usedur_ws where dt='20220111_20220117'
)t group by user_id;
月表:(每天运行,最近三十天数据,依赖双周表天表):
insert overwrite table dws.dws_user_usedur_ms(dt='20220103_20220201')
select user_id,sum(usedur) as usedur from (
select user_id,usedur from dws.dws_user_usedur_2ws where dt='20220103_20220116'
union all
select user_id,usedur from dws.dws_user_usedur_2ws where dt='20220117_20220130'
union all
select user_id,usedur from dws.dws_user_usedur_ds where dt='20220131'
union all
select user_id,usedur from dws.dws_user_usedur_ds where dt='20220201'
)t group by user_id;
五:复杂类型操作:相加表
一般的表会有复杂类型的数据,例如map、list;部分表需要将map合并,或者对list做相加减。
实现逻辑一般为:建立UDAF
package com.transsion.bigdata.aggregate

import org.apache.hadoop.hive.ql.exec.UDF
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types.{DataType, MapType, StringType, StructField, StructType,ArrayType}
import scala.collection.mutable

/**
 * UDAF that unions `array<string>` values across rows into a single list of
 * distinct, non-empty strings.
 *
 * Elements present in an incoming list are moved to the end of the buffer, so
 * the relative order of the result follows the order in which rows and
 * partial buffers are merged.
 */
class AggList extends UserDefinedAggregateFunction {

  /** One input column: the list to fold into the aggregate. */
  override def inputSchema: StructType = StructType(StructField("input", ArrayType(StringType)) :: Nil)

  /** Aggregation buffer layout: a single list column. */
  override def bufferSchema: StructType = StructType(StructField("buffer", ArrayType(StringType)) :: Nil)

  /** Result type of the aggregate. */
  override def dataType: DataType = ArrayType(StringType)

  /** Declared deterministic: the same input yields the same output. */
  override def deterministic: Boolean = true

  /** Starts every group with an empty list. */
  override def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer(0) = new java.util.ArrayList[String]()
  }

  /** Folds one input row into the buffer. */
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    agg(buffer, input)
  }

  /** Folds another partial buffer into `buffer1`. */
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    agg(buffer1, buffer2)
  }

  /** Final result: the accumulated list. */
  override def evaluate(buffer: Row): java.util.List[String] = buffer.getList[String](0)

  /**
   * Merges the list in column 0 of `buffer2` into column 0 of `buffer1`.
   *
   * A `LinkedHashSet` is used instead of a plain list so that duplicates
   * inside a single incoming list collapse too, and so that removing "" drops
   * every empty string (`List.remove(Object)` removes only the first
   * occurrence, which could leave an empty string in the final result).
   *
   * @param buffer1 buffer that receives the merged list
   * @param buffer2 input row or partial buffer; a null column 0 is ignored
   */
  def agg(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    val merged = new java.util.LinkedHashSet[String](buffer1.getList[String](0))
    if (buffer2.get(0) != null) {
      val incoming = buffer2.getList[String](0)
      // Remove first so that shared elements are re-appended at the end.
      merged.removeAll(incoming)
      merged.addAll(incoming)
    }
    merged.remove("")
    buffer1.update(0, new java.util.ArrayList[String](merged))
  }
}
package com.transsion.bigdata.aggregate

import org.apache.hadoop.hive.ql.exec.UDF
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._
import org.apache.spark.sql.types.MapType
import scala.collection.mutable

/**
 * UDAF that merges `map<string,string>` values across rows by adding, key by
 * key, the values parsed as doubles. Entries whose key is the empty string
 * are dropped; sums are written back as strings.
 */
class AggMap extends UserDefinedAggregateFunction {

  /** One input column: the map to fold into the aggregate. */
  override def inputSchema: StructType =
    StructType(List(StructField("input", MapType(StringType, StringType))))

  /** Aggregation buffer layout: a single map column. */
  override def bufferSchema: StructType =
    StructType(List(StructField("buffer", MapType(StringType, StringType))))

  /** Result type of the aggregate. */
  override def dataType: DataType = MapType(StringType, StringType)

  /** Declared deterministic: the same input yields the same output. */
  override def deterministic: Boolean = true

  /** Starts every group with an empty map. */
  override def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer(0) = mutable.Map()
  }

  /** Folds one input row into the buffer. */
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit =
    evaluate(buffer, input)

  /** Folds another partial buffer into `buffer1`. */
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit =
    evaluate(buffer1, buffer2)

  /** Final result: the accumulated map. */
  override def evaluate(buffer: Row): scala.collection.Map[String, String] =
    buffer.getMap[String, String](0)

  /**
   * Adds the map in column 0 of `buffer2` to the map in column 0 of `buffer1`.
   * A null incoming map leaves `buffer1` untouched.
   *
   * @param buffer1 buffer that receives the summed map
   * @param buffer2 input row or partial buffer
   */
  def evaluate(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    val current = buffer1.getMap[String, String](0)
    val incoming = buffer2.getMap[String, String](0)
    if (incoming != null) {
      // Incoming keys first, then buffer keys, without repeats.
      val allKeys = (incoming.keySet.toList ++ current.keySet.toList).distinct
      val summed = mutable.Map[String, String]()
      for (key <- allKeys if !key.equals("")) {
        // Every key comes from one of the two maps, so at least one value is present.
        val total = List(current.get(key), incoming.get(key)).flatten
          .map(_.toString.toDouble)
          .reduce(_ + _)
        summed.put(key, total.toString)
      }
      buffer1.update(0, summed.toMap)
    }
  }
}