使用大表 join 小表对性能的提升
案例一:
原sql:
select
t3.repay_dt,
count(1),
count(case when t3.his_had_overdue=1 then a.loan_id end),
count(case when t3.his_had_overdue=1 then a.loan_id end)*1.0000/count(1) ,
try(count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>1 and t3.his_overdue_days>1 then a.loan_id end)*1.0000 / count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>1 then a.loan_id end) ),
try(count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>3 and t3.his_overdue_days>3 then a.loan_id end)*1.0000 / count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>3 then a.loan_id end) ),
try(count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>5 and t3.his_overdue_days>5 then a.loan_id end)*1.0000 / count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>5 then a.loan_id end) ) ,
try(count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>10 and t3.his_overdue_days>10 then a.loan_id end)*1.0000 / count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>10 then a.loan_id end) )
from riskn.test a
join riskn.test t2 on a.loan_id=t2.loan_id
and t2.etl_dt='2021-06-06'
join riskn.test2 t3 on t3.loan_id =t2.loan_id
and t3.etl_dt='2021-06-06' and t3.repay_per_num=1
and t3.repay_plan_type!='prepay'
and(( t3.repay_stat not in (30) and t3.source_system ='1' )or t3.source_system='1')
where a.etl_dt='2021-06-06'
and a.biz_line='1'
and a.bank_id=1
and a.loan_dt>=date'2019-12-31'
and t3.repay_dt>=date'2021-05-31'
and t3.repay_dt<=date'2021-06-06'
and a.app_name='1'
group by 1
order by 1
资源使用情况:
- 峰值内存:834MB
- 用户内存:2.89G
执行计划:
优化建议:
使用大表去 join 小表
更改操作:
- from riskn.dws_loan_order_wide_001 a
- join riskn.dwd_order_cash_result_001 t2 on a.loan_id=t2.loan_id
+ from riskn.dwd_order_cash_result_001 t2
+ join riskn.dws_loan_order_wide_001 a on a.loan_id=t2.loan_id
优化后的sql:
select
t3.repay_dt ,
count(1) ,
count(case when t3.his_had_overdue=1 then a.loan_id end),
count(case when t3.his_had_overdue=1 then a.loan_id end)*1.0000/count(1),
try(count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>1 and t3.his_overdue_days>1 then a.loan_id end)*1.0000 / count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>1 then a.loan_id end) ),
try(count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>3 and t3.his_overdue_days>3 then a.loan_id end)*1.0000 / count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>3 then a.loan_id end) ),
try(count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>5 and t3.his_overdue_days>5 then a.loan_id end)*1.0000 / count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>5 then a.loan_id end) ),
try(count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>10 and t3.his_overdue_days>10 then a.loan_id end)*1.0000 / count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>10 then a.loan_id end) ),
from riskn.test t2
join riskn.testa a on a.loan_id=t2.loan_id
and t2.etl_dt='2021-06-06'
join riskn.testb t3 on t3.loan_id =t2.loan_id
and t3.etl_dt='2021-06-06' and t3.repay_per_num=1
and t3.repay_plan_type!='1'
and(( t3.repay_stat not in (30) and t3.source_system ='1' )or t3.source_system='1')
where a.etl_dt='2021-06-06'
and a.biz_line='1'
and a.bank_id=1
and a.loan_dt>=date'2019-12-31'
and t3.repay_dt>=date'2021-05-31'
and t3.repay_dt<=date'2021-06-06'
and a.app_name='1'
group by 1
order by 1
执行计划:
资源使用情况:
案例二:
内存使用了减少了近3倍
原sql:
select
json_extract_scalar(apply.extend_field, '$.1') as customer_type,
try(count(distinct case when t1.decision='1' then t1.apply_id end )*1.0000/count(distinct t1.apply_id)) innerrule_rate,
try(count(distinct case when apply.credit_result ='pass' then apply.apply_id end )*1.0000/(count(distinct case when t2.decision is not null then t2.apply_id end )-count(distinct case when t2.decision='2' then t2.apply_id end ) ))model_rate,
1-try(count(distinct case when t2.decision='2' then t2.apply_id end )*1.0000/count(distinct t2.apply_id)) outrule_rate,
try(count(distinct case when apply.step ='1' and apply.credit_result ='pass' then apply.apply_id end )*1.0000/
(count(distinct case when t2.decision is not null then t2.apply_id end )-count(distinct case when t2.decision='2' then t2.apply_id end ) ))model_rate,
try(count(distinct case when apply.credit_result='pass' then apply.apply_id end )*1.0000/count(distinct apply.apply_id )) approve_rate,
count(distinct apply.apply_id ) apply_cnt,count(distinct case when apply.credit_result='pass' then apply.apply_id end )approve_cnt--, count(apply.apply_id)
from rsk.rreal_flink_bl_sgfq_risk_control_sgfq_credit_result_presto apply
join rsk.rreal_flink_decision_engine_ruleset_run_record_presto t1
on t1.apply_id=apply.apply_id
and regexp_like(t1.ruleset_name,'rule' ) and t1.ruleset_name not like '%outer%'
and t1.etl_dt>='2020-01-17' and t1.step in ('1','2')
and t1.ruleset_name not like '%3%'
left join rsk.test t2
on apply.apply_id= t2.apply_id
and regexp_like(t2.ruleset_name,'rule')
and regexp_like(t2.ruleset_name,'outer')
and t2.etl_dt>='2020-01-17' and t2.step in ('3','2')
and t2.ruleset_name not like '%1%'
where
apply.step in ('3','4')
and apply.app_name='jsd'
--and apply.etl_dt='2019-10-11'
and apply.etl_dt='2021-06-07'
and json_extract_scalar(apply.extend_field, '$.apply_type')!= '257'
and format_datetime(from_unixtime(cast (substring(apply.create_time,1,10) as bigint)),'yyyy-MM-dd HH:mm:ss')>='2021-06-07 00:00:00'
group by 1
结论
大表 jion 小表性能较之小表 join 大表要提升三倍。所以大表join优化项比较关键。
使用WITH
原SQL语句:
select distinct a.etl_dt,a.cost,de.hightype,de.type,a.desc as description,
case when a.cost>0 then 'test'
else '非投放渠道' end as pay_type
from
(select cost_dt as etl_dt,
sum(cost) as cost,
desc
from rdw.test
where cost_dt>=date('2021-06-06')
and cost_dt<=date('2021-06-06')
and platform in ('4')
group by cost_dt,desc)a
left join
(select distinct hightype,
type,
description,
etl_dt
from rmk.hha
where date(etl_dt) between date('2021-06-06') and date('2021-06-06')
and platform='APP'
and app_subname in ('null','')
) as de
on trim(lower(cast(a.desc as varchar)))=trim(lower(cast(de.description as varchar)))
and date(de.etl_dt)=date(a.etl_dt)
group by a.etl_dt,a.cost,de.hightype,de.type,a.desc,case when a.cost>0 then '投放'
else '非投放' end
优化方向:
- 使用with替换
- join 字段避免出现函数
- group by后面跟数字
- 大表join小表
优化后:
with a as (select distinct date(cost_dt) as etl_dt,
sum(cost) as cost,
trim(lower(cast(desc as varchar))) as desc
from rdw.test
where cost_dt>=date('2021-06-06')
and cost_dt<=date('2021-06-06')
and platform in ('4')
group by cost_dt,desc),
de as (select distinct hightype,
type,
trim(lower(cast(description as varchar))) as description,
date(etl_dt) as etl_dt
from rmk.hha
where date(etl_dt) between date('2021-06-06') and date('2021-06-06')
and platform='APP'
and app_subname in ('null','')
)
select a.etl_dt,a.cost,de.hightype,de.type,a.desc as description,
case when a.cost>0 then '投放渠道'
else '非投放渠道' end as pay_type
from de right join a on a.desc = de.description
and de.etl_dt = a.etl_dt
group by 1,2,3,4,5,6
结论:
经过测试,对性能提升不大。