• 利用kettle中的JS来完成ETL数据校验


    最近参与了一个信托行业的BI项目,由于信托业务系统设计的问题,很多都是用户手工录入的数据,也有一些是需要分析的但是用户没有录入的数据,针对这样的数据质量,我们就要在ETL抽取的过程中来对数据流进行校验,今天我们就说一下如何利用ETL开源工具kettle来完成对数据的基础性校验

    1:非空校验

    if(INTRUST_FLAG1==null){intrust_flag1_isnull=1;}

    2:唯一校验

    var uniquesql_item_id = "SELECT count(*)  FROM titem where item_id="+ITEM_ID;
    
    if(fireToDB(strConn,uniquesql_item_id)[0[0]==1){item_id_isunique=1;}

    3:标准化校验

    var normalsql_intrust_type= "select  count(*) from  trust_tdictparam where type_id=1104 and type_value="+INTRUST_TYPE;
    if(fireToDB(strConn, normalsql_intrust_type)[0][0]>0){intrust_type_isnormal=1;}

    完整的JS处理逻辑展示如下

    var check_status=0;//如果检测到有任何一种校验错误,则check_status=1
    var check_table="T_ITEM_TITEM";//校验有关的表名字,可能存在多个
    var source_table="titem";//数据源表名称,如果数据来自多个数据表,则需要声明多个
    var source_table_id="item_id";//数据源表主键,如果多个表联合主键,则需要声明多个主键
    var check_detail="";//校验到的错误详细情况
    var check_name="";//主键名称
    var check_type="";//校验到的错误类型
    var check_date=new Date();//校验时间
    var strConn = "bjitic_og";
    
    //////////////////////////////////定义所有校验枚举值////////////////////
    //1:唯一性枚举值
    var item_id_isunique=0;
    var item_code_isunique=0;
    var item_name_isunique=0;
    //2:非空枚举值
    var intrust_flag1_isnull=0;
    var intrust_type_isnull=0;
    var intrust_type_subitem_isnull=0;
    var intrust_type1_isnull=0;
    var intrust_type2_isnull=0;
    var item_num_isnull=0;
    var item_money_isnull=0;
    var exp_rate1_isnull=0;
    var exp_rate2_isnull=0;
    var intrust_flag3_isnull=0;
    var intrust_flag4_isnull=0;
    var entity_type_isnull=0;
    var deal_type_isnull=0;
    var managertype_isnull=0;
    var natrust_type_isnull=0;
    var custodian_bank_isnull=0;
    var with_bank_flag_isnull=0;
    var with_security_flag_isnull=0;
    var with_private_flag_isnull=0;
    var with_gov_flag_isnull=0;
    var ben_period_isnull=0;  
    
    //3:标准化枚举值
    var intrust_type_isnormal=0;
    var intrust_type_subitem_isnormal=0;
    var intrust_type1_isnormal=0;
    var intrust_type2_isnormal=0;
    var entity_type_isnormal=0;
    var deal_type_isnormal=0;
    var natrust_type_isnormal=0;
    var custodian_bank_isnormal=0;
    var innovat_type_isnormal=0;
    var bank_id_isnormal=0;
    var gov_regional_isnormal=0;
    //4:类型校验枚举值
    var ben_period_isnumber=0;  
    ////////////////////////////////唯一性校验枚举值赋值
    var uniquesql_item_id = "SELECT count(1)  FROM titem where item_id="+ITEM_ID;
    var uniquesql_item_code = "SELECT count(1)  FROM titem where item_code="+ITEM_CODE;
    var uniquesql_item_name = "SELECT count(1)  FROM titem where item_name="+"'"+ITEM_NAME+"'";
    if(fireToDB(strConn,uniquesql_item_id)[0][0]==1){item_id_isunique=1;}
    if(fireToDB(strConn,uniquesql_item_code)[0][0]==1){item_code_isunique=1;}
    if(fireToDB(strConn,uniquesql_item_name)[0][0]==1){item_name_isunique=1;}
    ////////////////////////////////非空校验枚举值赋值
    if(INTRUST_FLAG1==null){intrust_flag1_isnull=1;}
    if(INTRUST_TYPE==null){intrust_type_isnull=1;}
    if(INTRUST_TYPE_SUBITEM==null){intrust_type_subitem_isnull=1;}
    if(INTRUST_TYPE1==null){intrust_type1_isnull=1;}
    if(INTRUST_TYPE2==null){intrust_type2_isnull=1;}
    if(ITEM_NUM==null){item_num_isnull=1;}
    if(ITEM_MONEY==null){item_money_isnull=1;}
    if(EXP_RATE1==null){exp_rate1_isnull=1;}
    if(EXP_RATE2==null){exp_rate2_isnull=1;}
    if(INTRUST_FLAG3==null){intrust_flag3_isnull=1;}
    if(INTRUST_FLAG4==null){intrust_flag4_isnull=1;}
    if(ENTITY_TYPE==null){entity_type_isnull=1;}
    if(DEAL_TYPE==null){deal_type_isnull=1;}
    if(MANAGERTYPE==null){managertype_isnull=1;}
    if(NATRUST_TYPE==null){natrust_type_isnull=1;}
    if(CUSTODIAN_BANK==null){custodian_bank_isnull=1;}
    if(WITH_BANK_FLAG==null){with_bank_flag_isnull=1;}
    if(WITH_SECURITY_FLAG==null){with_security_flag_isnull=1;}
    if(WITH_PRIVATE_FLAG==null){with_private_flag_isnull=1;}
    if(WITH_GOV_FLAG==null){with_gov_flag_isnull=1;}
    if(BEN_PERIOD==null){ben_period_isnull=1;}
    ////////////////////////////////标准化校验枚举值赋值
    var normalsql_intrust_type= "select  count(1) from  trust_tdictparam where type_id=1104 and type_value="+INTRUST_TYPE;
    var normalsql_intrust_type_subitem= "select  count(1) from  trust_tdictparam where type_id=1151 and type_value="+INTRUST_TYPE_SUBITEM;
    var normalsql_intrust_type1= "select  count(1) from  trust_tdictparam where type_id=1138 and type_value="+INTRUST_TYPE1;
    var normalsql_intrust_type2= "select  count(1) from  trust_tdictparam where type_id=1139 and type_value="+INTRUST_TYPE2;
    var normalsql_entity_type= "select  count(1) from  trust_tdictparam where type_id=1140 and type_value="+ENTITY_TYPE;
    var normalsql_deal_type= "select  count(1) from  trust_tdictparam where type_id=1141 and type_value="+DEAL_TYPE;
    var normalsql_natrust_type= "select  count(1) from  trust_tdictparam where type_id=1143 and type_value="+NATRUST_TYPE;
    var normalsql_custodian_bank= "select  count(1) from  trust_tdictparam where type_id=1103 and type_value="+"'"+CUSTODIAN_BANK+"'";;
    var normalsql_innovat_type= "select  count(1) from  trust_tdictparam where type_id=1104 and type_value="+INNOVAT_TYPE;
    var normalsql_gov_regional= "select  count(1) from  trust_tdictparam where type_id=9999 and type_value="+GOV_PROV_REGIONAL;
    if(fireToDB(strConn, normalsql_intrust_type)[0][0]>0){intrust_type_isnormal=1;}
    if(fireToDB(strConn, normalsql_intrust_type_subitem)[0][0]>0){intrust_type_subitem_isnormal=1;}
    if(fireToDB(strConn, normalsql_intrust_type1)[0][0]>0){intrust_type1_isnormal=1;}
    if(fireToDB(strConn, normalsql_intrust_type2)[0][0]>0){intrust_type2_isnormal=1;}
    if(fireToDB(strConn, normalsql_entity_type)[0][0]>0){entity_type_isnormal=1;}
    if(fireToDB(strConn, normalsql_deal_type)[0][0]>0){deal_type_isnormal=1;}
    if(fireToDB(strConn, normalsql_natrust_type)[0][0]>0){natrust_type_isnormal=1;}
    if(fireToDB(strConn, normalsql_custodian_bank)[0][0]>0){custodian_bank_isnormal=1;}
    if(fireToDB(strConn, normalsql_innovat_type)[0][0]>0){innovat_type_isnormal=1;}
    if(fireToDB(strConn, normalsql_gov_regional)[0][0]>0){gov_regional_isnormal=1;}
    ////////////////////////////////数据类型校验枚举值赋值
    if(isNum(BEN_PERIOD))
    {
    ben_period_isnumber=1; 
    }
    //校验所有表需要校验的字段,如果有一个校验失败,则校验状态为1
    if
    (
    item_id_isunique==0 || item_code_isunique==0 ||item_name_isunique==0|| //is unique
    intrust_flag1_isnull==1 || intrust_type_isnull==1 || intrust_type_subitem_isnull==1 ||//is null
    intrust_type1_isnull==1 || intrust_type2_isnull==1 || item_num_isnull==1 ||
    item_money_isnull==1 || exp_rate1_isnull==1 || exp_rate2_isnull==1 || 
    intrust_flag3_isnull==1 || intrust_flag4_isnull==1 || entity_type_isnull==1 || 
    deal_type_isnull==1 || managertype_isnull==1 || natrust_type_isnull==1 || 
    custodian_bank_isnull==1 || with_bank_flag_isnull==1 || with_security_flag_isnull==1 || 
    with_private_flag_isnull==1 || with_gov_flag_isnull==1 || ben_period_isnull==1 ||
    intrust_type_isnormal==0 || intrust_type_subitem_isnormal==0 || intrust_type1_isnormal==0 ||//is normal
    intrust_type2_isnormal==0 || entity_type_isnormal==0 || deal_type_isnormal==0 ||
    natrust_type_isnormal==0 || custodian_bank_isnormal==0 || innovat_type_isnormal==0 ||
    bank_id_isnormal==0 || gov_regional_isnormal==0 ||
    ben_period_isnumber==0//is number
    )
    {
    check_status=1;
    }
    //check is unique? return not unique column
    var isunique_column="";
    if(item_id_isunique==0){isunique_column="item_id";}
    if(item_code_isunique==0){if(isunique_column==""){isunique_column="item_code";}else{isunique_column=isunique_column+"、"+"item_code";}}
    if(item_name_isunique==0){if(isunique_column==""){isunique_column="item_name";}else{isunique_column=isunique_column+"、"+"item_name";}}
    //check is null? return null column
    var isnull_column="";
    if(intrust_flag1_isnull==1){isnull_column="intrust_flag1";}
    if(intrust_type_isnull==1){if(isnull_column==""){isnull_column="intrust_type";}else{isnull_column=isnull_column+"、"+"intrust_type";}}
    if(intrust_type_subitem_isnull==1){if(isnull_column==""){isnull_column="intrust_type_subitem";}else{isnull_column=isnull_column+"、"+"intrust_type_subitem";}}
    if(intrust_type1_isnull==1){if(isnull_column==""){isnull_column="intrust_type1";}else{isnull_column=isnull_column+"、"+"intrust_type1";}}
    if(intrust_type2_isnull==1){if(isnull_column==""){isnull_column="intrust_type2";}else{isnull_column=isnull_column+"、"+"intrust_type2";}}
    if(item_num_isnull==1){if(isnull_column==""){isnull_column="item_num";}else{isnull_column=isnull_column+"、"+"item_num";}}
    if(item_money_isnull==1){if(isnull_column==""){isnull_column="item_money";}else{isnull_column=isnull_column+"、"+"item_money";}}
    if(exp_rate1_isnull==1){if(isnull_column==""){isnull_column="exp_rate1";}else{isnull_column=isnull_column+"、"+"exp_rate1";}}
    if(exp_rate2_isnull==1){if(isnull_column==""){isnull_column="exp_rate2";}else{isnull_column=isnull_column+"、"+"exp_rate2";}}
    if(intrust_flag3_isnull==1){if(isnull_column==""){isnull_column="intrust_flag3";}else{isnull_column=isnull_column+"、"+"intrust_flag3";}}
    if(intrust_flag4_isnull==1){if(isnull_column==""){isnull_column="intrust_flag4";}else{isnull_column=isnull_column+"、"+"intrust_flag4";}}
    if(entity_type_isnull==1){if(isnull_column==""){isnull_column="entity_type";}else{isnull_column=isnull_column+"、"+"entity_type";}}
    if(deal_type_isnull==1){if(isnull_column==""){isnull_column="deal_type";}else{isnull_column=isnull_column+"、"+"deal_type";}}
    if(managertype_isnull==1){if(isnull_column==""){isnull_column="managertype";}else{isnull_column=isnull_column+"、"+"managertype";}}
    if(natrust_type_isnull==1){if(isnull_column==""){isnull_column="natrust_type";}else{isnull_column=isnull_column+"、"+"natrust_type";}}
    if(custodian_bank_isnull==1){if(isnull_column==""){isnull_column="custodian_bank";}else{isnull_column=isnull_column+"、"+"custodian_bank";}}
    if(with_bank_flag_isnull==1){if(isnull_column==""){isnull_column="with_bank_flag";}else{isnull_column=isnull_column+"、"+"with_bank_flag";}}
    if(with_security_flag_isnull==1){if(isnull_column==""){isnull_column="with_security_flag";}else{isnull_column=isnull_column+"、"+"with_security_flag";}}
    if(with_private_flag_isnull==1){if(isnull_column==""){isnull_column="with_private_flag";}else{isnull_column=isnull_column+"、"+"with_private_flag";}}
    if(with_gov_flag_isnull==1){if(isnull_column==""){isnull_column="with_gov_flag";}else{isnull_column=isnull_column+"、"+"with_gov_flag";}}
    if(ben_period_isnull==1){if(isnull_column==""){isnull_column="ben_period";}else{isnull_column=isnull_column+"、"+"ben_period";}}
    //check is normal? return not normal column
    var isnormal_column="";
    if(intrust_type_isnormal==0){isnormal_column="intrust_type";}
    if(intrust_type_subitem_isnormal==0){if(isnormal_column==""){isnormal_column="intrust_type_subitem";}else{isnormal_column=isnormal_column+"、"+"intrust_type_subitem";}}
    if(intrust_type1_isnormal==0){if(isnormal_column==""){isnormal_column="intrust_type1";}else{isnormal_column=isnormal_column+"、"+"intrust_type1";}}
    if(intrust_type2_isnormal==0){if(isnormal_column==""){isnormal_column="intrust_type2";}else{isnormal_column=isnormal_column+"、"+"intrust_type2";}}
    if(entity_type_isnormal==0){if(isnormal_column==""){isnormal_column="entity_type";}else{isnormal_column=isnormal_column+"、"+"entity_type";}}
    if(deal_type_isnormal==0){if(isnormal_column==""){isnormal_column="deal_type";}else{isnormal_column=isnormal_column+"、"+"deal_type";}}
    if(natrust_type_isnormal==0){if(isnormal_column==""){isnormal_column="natrust_type";}else{isnormal_column=isnormal_column+"、"+"natrust_type";}}
    if(custodian_bank_isnormal==0){if(isnormal_column==""){isnormal_column="custodian_bank";}else{isnormal_column=isnormal_column+"、"+"custodian_bank";}}
    if(innovat_type_isnormal==0){if(isnormal_column==""){isnormal_column="iinnovat_type";}else{isnormal_column=isnormal_column+"、"+"innovat_type";}}
    if(bank_id_isnormal==0){if(isnormal_column==""){isnormal_column="bank_id";}else{isnormal_column=isnormal_column+"、"+"bank_id";}}
    if(gov_regional_isnormal==0){if(isnormal_column==""){isnormal_column="gov_regional";}else{isnormal_column=isnormal_column+"、"+"gov_regional";}}
    //check is number? return not number column
    var isnumber_column="";
    if(ben_period_isnumber==0)
    {
    isnumber_column="ben_period";
    }
    //最终输出的错误详细情况
    if(item_id_isunique==0 || item_code_isunique==0 ||item_name_isunique==0)
    {
    check_type="违反唯一规则";
    check_detail="表"+source_table+"中,字段"+isunique_column+"违反了唯一规则";
    }
    if
    (
    intrust_flag1_isnull==1 || intrust_type_isnull==1 || intrust_type_subitem_isnull==1 ||
    intrust_type1_isnull==1 || intrust_type2_isnull==1 || item_num_isnull==1 ||
    item_money_isnull==1 || exp_rate1_isnull==1 || exp_rate2_isnull==1|| 
    intrust_flag3_isnull==1 || intrust_flag4_isnull==1 || entity_type_isnull==1 || 
    deal_type_isnull==1 || managertype_isnull==1 || natrust_type_isnull==1 || 
    custodian_bank_isnull==1 || with_bank_flag_isnull==1 || with_security_flag_isnull==1 || 
    with_private_flag_isnull==1 || with_gov_flag_isnull==1 || ben_period_isnull==1 
    )
    {
      if(check_detail=="")
      {
       check_type="违反非空规则";
       check_detail="字段"+isnull_column+"违反了非空规则";
     
      }
      else
      {
      check_type=check_type+","+"违反非空规则";
      check_detail=check_detail+",字段"+isnull_column+"违反了非空规则";
      }
    }
    if
    (
    intrust_type_isnormal==0 || intrust_type_subitem_isnormal==0 || intrust_type1_isnormal==0 ||
    intrust_type2_isnormal==0 || entity_type_isnormal==0 || deal_type_isnormal==0 ||
    natrust_type_isnormal==0 || custodian_bank_isnormal==0 || innovat_type_isnormal==0 ||
    bank_id_isnormal==0 || gov_regional_isnormal==0
    )
    {
      if(check_detail=="")
      {
       check_type="违反标准化规则";
       check_detail="字段"+isnormal_column+"违反了标准化规则";
      }
      else
      {
      check_type=check_type+","+"违反标准化规则";
      check_detail=check_detail+",字段"+isnormal_column+"违反了标准化规则";
      }
    }
    
    
    
    if(ben_period_isnumber==0)
    {
      if(check_detail=="")
      {
       check_type="违反数据类型为数字规则";
       check_detail="字段"+isnumber_column+"违反了数据类型为数字规则";
      }
      else
      {
      check_type=check_type+","+"违反数据类型为数字规则";
      check_detail=check_detail+",字段"+isnumber_column+"违反了数据类型为数字规则";
      }
    }
    if(check_detail!="")
    {
    check_detail=check_detail+","+source_table_id+"="+ITEM_ID;
    }
    check_name=ITEM_NAME;
    View Code

    校验日志表结果:

    输出关键性指标-表,那些字段违反了那些规则,时间

  • 相关阅读:
    2020.7.11
    2020.7.13
    2020.7.9
    2020.7.10
    Java入门——day13
    Java入门——day12
    Java入门——day11
    虚拟交换机
    KVM虚拟化 ProxmoxVE
    【路径】python环境错误调试【执行路径】【操作系统,版本】
  • 原文地址:https://www.cnblogs.com/wxjnew/p/4320344.html
Copyright © 2020-2023  润新知