【presto】测试各种优化建议对presto性能的提升

使用大表 join 小表对性能的提升

案例一：

原sql：

select
t3.repay_dt,
count(1),
count(case when t3.his_had_overdue=1 then a.loan_id end),
count(case when t3.his_had_overdue=1 then a.loan_id end)*1.0000/count(1) ,
try(count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>1 and t3.his_overdue_days>1 then a.loan_id end)*1.0000 / count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>1 then a.loan_id end) ),
try(count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>3 and t3.his_overdue_days>3 then a.loan_id end)*1.0000 / count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>3 then a.loan_id end) ),
try(count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>5 and t3.his_overdue_days>5 then a.loan_id end)*1.0000 / count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>5 then a.loan_id end) ) ,
try(count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>10 and t3.his_overdue_days>10 then a.loan_id end)*1.0000 / count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>10 then a.loan_id end) )
from riskn.test a
join riskn.test t2 on a.loan_id=t2.loan_id
and t2.etl_dt='2021-06-06'
join riskn.test2 t3 on t3.loan_id =t2.loan_id
and t3.etl_dt='2021-06-06' and t3.repay_per_num=1
and t3.repay_plan_type!='prepay'
and(( t3.repay_stat not in (30) and t3.source_system ='1' )or  t3.source_system='1')
where a.etl_dt='2021-06-06'
and a.biz_line='1'
and a.bank_id=1
and a.loan_dt>=date'2019-12-31'
and t3.repay_dt>=date'2021-05-31'
and t3.repay_dt<=date'2021-06-06'
and a.app_name='1'
group by 1
order by 1

资源使用情况：

在这里插入图片描述

峰值内存：834MB
用户内存：2.89G

执行计划：

在这里插入图片描述

优化建议：

使用大表去 join 小表
在这里插入图片描述

更改操作：

- from riskn.dws_loan_order_wide_001 a
- join riskn.dwd_order_cash_result_001 t2 on a.loan_id=t2.loan_id
 
 
+ from riskn.dwd_order_cash_result_001 t2
+ join riskn.dws_loan_order_wide_001 a on a.loan_id=t2.loan_id

优化后的sql：

select
t3.repay_dt ,
count(1) ,
count(case when t3.his_had_overdue=1 then a.loan_id end),
count(case when t3.his_had_overdue=1 then a.loan_id end)*1.0000/count(1),
try(count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>1 and t3.his_overdue_days>1 then a.loan_id end)*1.0000 / count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>1 then a.loan_id end) ),
try(count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>3 and t3.his_overdue_days>3 then a.loan_id end)*1.0000 / count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>3 then a.loan_id end) ),
try(count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>5 and t3.his_overdue_days>5 then a.loan_id end)*1.0000 / count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>5 then a.loan_id end) ),
try(count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>10 and t3.his_overdue_days>10 then a.loan_id end)*1.0000 / count(case when DATE_DIFF('day',t3.repay_dt,cast(t3.etl_dt as date))>10 then a.loan_id end) ),
from riskn.test t2
join riskn.testa a on a.loan_id=t2.loan_id
and t2.etl_dt='2021-06-06'
join riskn.testb t3 on t3.loan_id =t2.loan_id
and t3.etl_dt='2021-06-06' and t3.repay_per_num=1
and t3.repay_plan_type!='1'
and(( t3.repay_stat not in (30) and t3.source_system ='1' )or  t3.source_system='1')
where a.etl_dt='2021-06-06'
and a.biz_line='1'
and a.bank_id=1
and a.loan_dt>=date'2019-12-31'
and t3.repay_dt>=date'2021-05-31'
and t3.repay_dt<=date'2021-06-06'
and a.app_name='1'
group by 1
order by 1

执行计划:

在这里插入图片描述

资源使用情况：

在这里插入图片描述

案例二：

在这里插入图片描述

内存使用了减少了近3倍

原sql：

select
json_extract_scalar(apply.extend_field, '$.1') as customer_type,
        try(count(distinct case when  t1.decision='1' then t1.apply_id end )*1.0000/count(distinct t1.apply_id)) innerrule_rate,
        try(count(distinct case when apply.credit_result ='pass' then apply.apply_id end )*1.0000/(count(distinct case when t2.decision is not null then t2.apply_id end )-count(distinct case when t2.decision='2' then t2.apply_id end ) ))model_rate,
        1-try(count(distinct case when t2.decision='2' then t2.apply_id end )*1.0000/count(distinct t2.apply_id)) outrule_rate,
try(count(distinct case when apply.step ='1' and apply.credit_result ='pass' then apply.apply_id end )*1.0000/
(count(distinct case when t2.decision is not null then t2.apply_id end )-count(distinct case when t2.decision='2' then t2.apply_id end ) ))model_rate,       
try(count(distinct case when apply.credit_result='pass' then apply.apply_id end )*1.0000/count(distinct apply.apply_id )) approve_rate,
        count(distinct apply.apply_id ) apply_cnt,count(distinct case when apply.credit_result='pass' then apply.apply_id end )approve_cnt--, count(apply.apply_id)
 
 
        from rsk.rreal_flink_bl_sgfq_risk_control_sgfq_credit_result_presto apply
 
         join rsk.rreal_flink_decision_engine_ruleset_run_record_presto t1
        on t1.apply_id=apply.apply_id
        and regexp_like(t1.ruleset_name,'rule' ) and t1.ruleset_name not like '%outer%'
        and t1.etl_dt>='2020-01-17' and  t1.step in ('1','2')
        and t1.ruleset_name not  like '%3%'
 
 
 
        left  join rsk.test t2
        on  apply.apply_id= t2.apply_id
        and regexp_like(t2.ruleset_name,'rule')
        and regexp_like(t2.ruleset_name,'outer')
        and t2.etl_dt>='2020-01-17' and t2.step  in ('3','2')
        and t2.ruleset_name  not  like '%1%'
        where
        apply.step in ('3','4')
        and apply.app_name='jsd'
        --and apply.etl_dt='2019-10-11'
        and apply.etl_dt='2021-06-07'
        and json_extract_scalar(apply.extend_field, '$.apply_type')!= '257'       
and format_datetime(from_unixtime(cast (substring(apply.create_time,1,10) as bigint)),'yyyy-MM-dd HH:mm:ss')>='2021-06-07 00:00:00'
 
        group by 1

结论

大表 jion 小表性能较之小表 join 大表要提升三倍。所以大表join优化项比较关键。

使用WITH

原SQL语句：

select distinct a.etl_dt,a.cost,de.hightype,de.type,a.desc as description,
 
case when a.cost>0 then 'test'
else '非投放渠道' end as pay_type
from
(select cost_dt  as etl_dt,
        sum(cost) as cost,
 
        desc 
from rdw.test
where  cost_dt>=date('2021-06-06')
and cost_dt<=date('2021-06-06')
and platform in ('4')
group by cost_dt,desc)a
left join
   (select distinct hightype,
            type,
            description,
            etl_dt
     from  rmk.hha
     where date(etl_dt)  between date('2021-06-06') and date('2021-06-06')
 
     and platform='APP'
     and app_subname in ('null','')
 
    ) as de
on trim(lower(cast(a.desc as varchar)))=trim(lower(cast(de.description as varchar)))
and date(de.etl_dt)=date(a.etl_dt)
group by a.etl_dt,a.cost,de.hightype,de.type,a.desc,case when a.cost>0 then '投放'
else '非投放' end

优化方向：

使用with替换
join 字段避免出现函数
group by后面跟数字
大表join小表

优化后：

with a as (select distinct date(cost_dt) as etl_dt,
        sum(cost) as cost,
        trim(lower(cast(desc as varchar))) as desc
from rdw.test
where  cost_dt>=date('2021-06-06')
and cost_dt<=date('2021-06-06')
and platform in ('4')
group by cost_dt,desc),
 
de as (select distinct hightype,
            type,
            trim(lower(cast(description as varchar))) as description,
            date(etl_dt) as etl_dt
     from  rmk.hha
     where date(etl_dt)  between date('2021-06-06') and date('2021-06-06')
     and platform='APP'
     and app_subname in ('null','')
)
 
select a.etl_dt,a.cost,de.hightype,de.type,a.desc as description,
case when a.cost>0 then '投放渠道'
else '非投放渠道' end as pay_type
from de right join a on a.desc = de.description
and de.etl_dt = a.etl_dt
group by 1,2,3,4,5,6

结论：

经过测试，对性能提升不大。

相关阅读:
016.CI4框架CodeIgniter数据库操作之：Insert插入一条数据
 015.CI4框架CodeIgniter数据库操作之：Query带参数查询数
 014.CI4框架CodeIgniter数据库操作之：查询数据库，并让数据以对象的方式返回查询结果
 013.CI4框架CodeIgniter数据库操作之：查询数据库，并让数据以数组的方式返回查询结果
 012.CI4框架CodeIgniter, 加载并调用自己的Libraries库
 033.SAP上查看IDOC接口，PI接口查不到的日志记录，可能在IDOC接口日志里面
 032.SAP上用户无法打开PPE模块，查看并开通用户的PPE权限
 011.CI4框架CodeIgniter, 获取查看用户的IP地址和浏览器信息
 010.CI4框架CodeIgniter, autoload自动加载自己的helper函数类
 009.CI4框架CodeIgniter, 网页访问GET的URL参数获取，分段输出URL参数
原文地址：https://www.cnblogs.com/erlou96/p/16878332.html