一:DQC核心流程
Define:数据质检规则(指标)的定义。
你要告警给谁,你要使用什么方式告警(邮件,即时消息),你的规则是什么(空值,波动)等
Measure:数据质检任务的执行
数据在哪存储:hive、mysql是基本的数据库、CK、kylin等
Analyze:数据质检结果量化及可视化展示。
分为两种情况:(一)不需要图形化界面,直接在调度里面进行bash配置,使用自定义代码解析(二)有图形化界面,可以操作和查看历史结果
二:DQC标准
Accuracy:准确性。如是否符合表的加工逻辑。
Completeness:完备性。如数据是否存在丢失。
Timeliness:及时性。如表数据是否按时产生。
Uniqueness:唯一性。如主键字段是否唯一。
Validity:合规性。如字段长度是否合规、枚举值集合是否合规。
Consistency:一致性。如表与表之间在某些字段上是否存在矛盾。
三:DQC规则
2.1有效性
字段长度有效、字段内容有效、字段数值范围有效、枚举值个数有效、枚举值集合有效
2.2 唯一性
对主键是否存在重复数据的监控指标。
2.3 完整性
字段是否为空或NULL、记录数是否丢失、记录数环比波动、录数波动范围、记录数方差检验、
2.4 准确性
数值同比、数值环比、数值方差检验、表逻辑检查
2.5 一致性
表级别一致性检查,外键检查
2.6 时效性
表级别质量监控指标,数据是否按时产出
2.7数据剖析
最大值检查、最小值检查、平均值检查、汇总值检查
2.8 自定义规则检查
用户写自定义SQL实现的监控规则
从有效性、唯一性、完整性、准确性、一致性、时效性、数据剖析和自定义规则检查等几个维度对数据质量进行测量,但对于现在超级大的数据量级监控所有的数据是不符合成本效率的。
因此,知道哪些数据为最关键的,对这些关键数据进行全链路的数据质量,这样有助于防止错误或揭示改进的机会。
总结:指定值、空值、外键规范、外键最大最小、行数统计、最大、最小、平均、用户自定义
四:样例代码
String[] argsa={"-alter_user=xiaolong.wu","-alter_type=3","-counttype=A0001","-count_column=tag","-databases=hive.tranadm.adm_fin_paytrigger_revenue_ds","-dt=20220509","-filter_column=easysolar|tag","-filter_column_value=1000|上一节点","-assert_type=eq","-assert_Stringue=200","-assert_rate=0.2"};
MyArgs myArgs = MyArgs.build(argsa);
import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.List; class MyArgsItemInfo{ private String optionName =""; private boolean necessary = false; private String datatype = ""; private String dataverfiy = ""; // String:正则表达式 enum:|分割的枚举类型 float:最大值|最小值 date:日期格式 private String desc =""; public MyArgsItemInfo(String optionName, boolean necessary,String datetype,String dateverfiy, String desc) { this.optionName = optionName; this.necessary = necessary; this.datatype = datetype; this.dataverfiy = dateverfiy; this.desc = desc; } public String getOptionName() { return optionName; } public String getDatatype() { return datatype; } public String getDataverfiy() { return dataverfiy; } } class MyArgsInfo { public static MyArgsInfoBuildFactory init(){ return new MyArgsInfoBuildFactory(); } public static class MyArgsInfoBuildFactory { private List<MyArgsItemInfo> buildFactoryerMyArgsItemInfo = new ArrayList<MyArgsItemInfo>(); public List<MyArgsItemInfo> build(){ return buildFactoryerMyArgsItemInfo; } public MyArgsInfoBuildFactory addMyArgsItemInfo(String optionName, boolean necessary,String datetype,String dateverfiy, String desc) { buildFactoryerMyArgsItemInfo.add(new MyArgsItemInfo(optionName,necessary,datetype,dateverfiy,desc)); return this; } } } public class MyArgs{ private String alter_user = ""; //告警接收人 private String alter_type = ""; //邮件,企业微信 private String counttype = ""; //A0003 private String count_column = ""; // private String databases = ""; // hive.ods.ods_user_active_di 限定数据库类型、库名、表明 private String dt = ""; private String filter_column = ""; //,分割 接受多个字段 private String filter_column_value = ""; //,分割 接受多个字段 private String sql = ""; private String assert_type = ""; //eq(==), lt(<), gt(>), le(<=), ge(>=), ne(!=) private String assert_Stringue = ""; //, private String assert_rate = ""; //, public static MyArgs build(String[] args) { return new MyArgsBuildFactory().build(args); } public MyArgs(MyArgsBuildFactory builder) { this.alter_user = builder.alter_user; this.alter_type = builder.alter_type ; this.counttype = builder.counttype; this.count_column = builder.count_column; this.databases = builder.databases; this.dt = builder.dt; this.filter_column = builder.filter_column; this.filter_column_value = builder.filter_column_value; this.sql = builder.sql; this.assert_type = builder.assert_type; this.assert_Stringue = builder.assert_Stringue; this.assert_rate = builder.assert_rate; } public static class MyArgsBuildFactory{ private boolean checkFalse = false; private String alter_user = ""; //告警接收人 private String alter_type = ""; //邮件,企业微信 private String counttype = ""; //count(1) count(distinct ) private String count_column = ""; // private String databases = ""; // hive.ods.ods_user_active_di 限定数据库类型、库名、表明 private String dt = ""; // 默认比较今天和前一天的结果 private String filter_column = ""; //,分割 接受多个字段 private String filter_column_value = ""; //,分割 接受多个字段 private String sql = ""; private String assert_type = ""; //eq(==), lt(<), gt(>), le(<=), ge(>=), ne(!=) private String assert_Stringue = ""; //, private String assert_rate = ""; //, public void checkArgs(List<MyArgsItemInfo> myArgsInfo){ for (int i = 0; i < myArgsInfo.size(); i++) { MyArgsItemInfo myArgsItemInfo = myArgsInfo.get(i); String datatype = myArgsItemInfo.getDatatype(); String datavalue = getValueByName(myArgsItemInfo.getOptionName()); // String:正则表达式 enum:|分割的枚举类型 float:最大值|最小值 date:日期格式 if(!datavalue.equals("")){ //空值的去掉 switch (datatype){ case "String" : { if(!datavalue.matches(myArgsItemInfo.getDataverfiy())){ System.out.println(myArgsItemInfo.getOptionName()+"参数值存在问题,非"+myArgsItemInfo.getDataverfiy()+"正则表达式"); checkFalse = true; } };break; case "enum" : { if(!Arrays.asList(myArgsItemInfo.getDataverfiy().split("\\|")).contains(datavalue)){ System.out.println(myArgsItemInfo.getOptionName()+"参数值存在问题,非"+myArgsItemInfo.getDataverfiy()+"枚举值"); checkFalse = true; } }break; case "float" : { float minvalue = Float.valueOf(myArgsItemInfo.getDataverfiy().split("\\|")[0]); float maxvalue = Float.valueOf(myArgsItemInfo.getDataverfiy().split("\\|")[0]); if(Float.valueOf(datavalue)>maxvalue || Float.valueOf(datavalue)<minvalue){ checkFalse = true; } };break; case "date" : { SimpleDateFormat format = new SimpleDateFormat("yyyyMMdd"); try { format.parse(datavalue); } catch (ParseException e) { System.out.println(myArgsItemInfo.getOptionName()+"参数值存在问题,非"+myArgsItemInfo.getDataverfiy()+"格式"); checkFalse = true; } };break; } } } //检测 filter_column 长度和 filter_column_value长度相同 String []filter_column_array = filter_column.split("\\|"); String []filter_column_value_array = filter_column_value.split("\\|"); if(filter_column_array.length!=filter_column_value_array.length){ System.out.println("filter_column列长度和filter_column_value长度不一致"); checkFalse = true; } if(checkFalse){ new Exception(); } } public String getValueByName(String getOptionName){ String data = ""; switch (getOptionName){ case "alter_user" : data = this.alter_user;break; case "alter_type" : data = this.alter_type;break; case "counttype" : data = this.counttype;break; case "count_column" : data = this.count_column;break; case "databases" : data = this.databases;break; case "dt" : data = this.dt;break; case "filter_column" : data = this.filter_column;break; case "filter_column_value" : data = this.filter_column_value;break; case "sql" : data = this.sql;break; case "assert_type" : data = this.assert_type;break; case "assert_Stringue" : data = this.assert_Stringue;break; case "assert_rate" : data = this.assert_rate;break; } return data; } public List<MyArgsItemInfo> myArgsItemInfoInit(){ MyArgsInfo.MyArgsInfoBuildFactory myArgsInfoBuildFactory = MyArgsInfo.init(); myArgsInfoBuildFactory.addMyArgsItemInfo("alter_user", true,"String","[a-z]*.[a-z]*", "告警接收人") .addMyArgsItemInfo("alter_type", true,"enum","1|2|3", "1:邮件,2:企业微信,3邮件+企业微信") .addMyArgsItemInfo("counttype", true,"enum","A0001|A0002|A0003|A0004|A0005", "计算类型") .addMyArgsItemInfo("count_column", true,"String","[a-zA-Z0-9_]*", "列名") .addMyArgsItemInfo("databases", true,"String","(hive|mysql|kylin|clickhouse|hbase).[a-z_]*.[a-z_]*", "hive.ods.ods_user_active_di 限定数据库类型、库名、表明") .addMyArgsItemInfo("dt", false,"date","yyyyMMdd", "时间") .addMyArgsItemInfo("filter_column", true,"String","[a-zA-Z0-9_\\|]*", "|分割 接受多个字段") .addMyArgsItemInfo("filter_column_value", true,"String","[a-zA-Z0-9_\\|]*", "|分割 接受多个字段") .addMyArgsItemInfo("sql", false,"String","[a-zA-Z\\*\\s]*", "自定义SQL") .addMyArgsItemInfo("assert_type", true,"enum","eq|lt|gt|le|ge|ne", "eq(==), lt(<), gt(>), le(<=), ge(>=), ne(!=)") .addMyArgsItemInfo("assert_Stringue", true,"float","-2099999999|2099999999", "限定数值") .addMyArgsItemInfo("assert_rate", true,"float","0|1", "限定比率"); return myArgsInfoBuildFactory.build(); } public void myArgsValueInit(String[] args){ for (int i = 0; i < args.length; i++) { String argString = args[i].substring(1,args[i].length()); String argName = argString.split("=")[0]; String argValue = argString.split("=")[1]; switch (argName){ case "alter_user" : this.alter_user(argValue);break; case "alter_type" : this.alter_type(argValue);break; case "counttype" : this.counttype(argValue);break; case "count_column" : this.count_column(argValue);break; case "databases" : this.databases(argValue);break; case "dt" : this.dt(argValue);break; case "filter_column" : this.filter_column(argValue);break; case "filter_column_value" : this.filter_column_value(argValue);break; case "sql" : this.sql(argValue);break; case "assert_type" : this.assert_type(argValue);break; case "assert_Stringue" : this.assert_Stringue(argValue);break; case "assert_rate" : this.assert_rate(argValue);break; } } } //主要方法 public MyArgs build(String[] args){ List<MyArgsItemInfo> myArgsInfo = myArgsItemInfoInit(); //将列信息初始化 myArgsValueInit(args); //将参数命令行过来的值初始化 checkArgs(myArgsInfo); //检查命令行过来的值是否符合 列表达式 return new MyArgs(this); } public MyArgsBuildFactory alter_user(String alter_user) { this.alter_user = alter_user; return this; } public MyArgsBuildFactory alter_type(String alter_type) { this.alter_type = alter_type; return this; } public MyArgsBuildFactory counttype(String counttype) { this.counttype = counttype; return this; } public MyArgsBuildFactory count_column(String count_column) { this.count_column = count_column; return this; } public MyArgsBuildFactory databases(String databases) { this.databases = databases; return this; } public MyArgsBuildFactory dt(String dt) { this.dt = dt; return this; } public MyArgsBuildFactory filter_column(String filter_column) { this.filter_column = filter_column; return this; } public MyArgsBuildFactory filter_column_value(String filter_column_value) { this.filter_column_value = filter_column_value; return this; } public MyArgsBuildFactory sql(String sql) { this.sql = sql; return this; } public MyArgsBuildFactory assert_type(String assert_type) { this.assert_type = assert_type; return this; } public MyArgsBuildFactory assert_Stringue(String assert_Stringue) { this.assert_Stringue = assert_Stringue; return this; } public MyArgsBuildFactory assert_rate(String assert_rate) { this.assert_rate = assert_rate; return this; } } public String getAlter_user() { return alter_user; } public String getAlter_type() { return alter_type; } public String getCounttype() { return counttype; } public String getCount_column() { return count_column; } public String getDatabases() { return databases; } public String getDt() { return dt; } public String getFilter_column() { return filter_column; } public String getFilter_column_value() { return filter_column_value; } public String getSql() { return sql; } public String getAssert_type() { return assert_type; } public String getAssert_Stringue() { return assert_Stringue; } public String getAssert_rate() { return assert_rate; } @Override public String toString() { return "MyArgs{" + "alter_user='" + alter_user + '\'' + ", alter_type=" + alter_type + ", counttype='" + counttype + '\'' + ", count_column='" + count_column + '\'' + ", databases='" + databases + '\'' + ", dt='" + dt + '\'' + ", filter_column='" + filter_column + '\'' + ", filter_column_value='" + filter_column_value + '\'' + ", sql='" + sql + '\'' + ", assert_type='" + assert_type + '\'' + ", assert_Stringue=" + assert_Stringue + ", assert_rate=" + assert_rate + '}'; } }