Notes on custom output in Hadoop

"OutputFormat describes the output-specification for a Map-Reduce job." (from the OutputFormat javadoc)

To send job output to a custom sink, first extend the abstract class OutputFormat<K, V>, which is the output specification of a Map-Reduce job, and implement its method:

RecordWriter<KeyBaseDimension, BaseStatsValueWritable> getRecordWriter(TaskAttemptContext context)

Inside this method you can open the database connection; it must return a RecordWriter. Extend RecordWriter and implement its write method to store the records via JDBC.

As for the write method that is called when the reduce side emits output, the implementing class is TaskInputOutputContextImpl:

private RecordWriter<KEYOUT, VALUEOUT> output;

public void write(KEYOUT key, VALUEOUT value) throws IOException, InterruptedException {
    output.write(key, value);
}

So context.write(key, value) ultimately ends up in the RecordWriter's write method.
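A minimal sketch of this pattern, assuming a MySQL sink. The JDBC URL, credentials, the stats table and the Text/IntWritable key and value types are illustrative stand-ins; in the real job the text's KeyBaseDimension / BaseStatsValueWritable types would take their place.

import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;

// Sketch: a custom OutputFormat whose RecordWriter writes each reduce
// output record into a relational table over JDBC (names are illustrative).
public class JdbcOutputFormat extends OutputFormat<Text, IntWritable> {

    @Override
    public RecordWriter<Text, IntWritable> getRecordWriter(TaskAttemptContext context)
            throws IOException, InterruptedException {
        try {
            // Open the database connection here, once per task.
            Connection conn = DriverManager.getConnection(
                    "jdbc:mysql://localhost:3306/report", "user", "password");
            return new JdbcRecordWriter(conn);
        } catch (SQLException e) {
            throw new IOException("Failed to open JDBC connection", e);
        }
    }

    @Override
    public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {
        // Nothing to verify for a JDBC sink in this sketch.
    }

    @Override
    public OutputCommitter getOutputCommitter(TaskAttemptContext context)
            throws IOException, InterruptedException {
        // No output files are produced, so a no-op committer is enough.
        return new NullOutputFormat<Text, IntWritable>().getOutputCommitter(context);
    }

    // RecordWriter that receives every (key, value) pair the reducer emits.
    public static class JdbcRecordWriter extends RecordWriter<Text, IntWritable> {
        private final Connection conn;
        private final PreparedStatement stmt;

        public JdbcRecordWriter(Connection conn) throws IOException {
            this.conn = conn;
            try {
                this.stmt = conn.prepareStatement("INSERT INTO stats(dim, cnt) VALUES (?, ?)");
            } catch (SQLException e) {
                throw new IOException(e);
            }
        }

        @Override
        public void write(Text key, IntWritable value) throws IOException, InterruptedException {
            try {
                stmt.setString(1, key.toString());
                stmt.setInt(2, value.get());
                stmt.executeUpdate();
            } catch (SQLException e) {
                throw new IOException(e);
            }
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            try {
                stmt.close();
                conn.close();
            } catch (SQLException e) {
                throw new IOException(e);
            }
        }
    }
}

It would be wired into the driver with job.setOutputFormatClass(JdbcOutputFormat.class); the reducer's context.write(key, value) then reaches JdbcRecordWriter.write through TaskInputOutputContextImpl as shown above.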
TableMapReduceUtil

The javadoc of initTableMapperJob reads: "Use this before submitting a TableMap job. It will appropriately set up the job."

The TableMapReduceUtil class is important here: before submitting a job that reads HBase tables, you can attach a whole series of filter operations to the scans you hand it.

public static void initTableMapperJob(List<Scan> scans,
        Class<? extends TableMapper> mapper,
        Class<?> outputKeyClass,
        Class<?> outputValueClass, Job job,
        boolean addDependencyJars,
        boolean initCredentials) throws IOException {
    job.setInputFormatClass(MultiTableInputFormat.class);
    if (outputValueClass != null) {
        job.setMapOutputValueClass(outputValueClass);
    }
    if (outputKeyClass != null) {
        job.setMapOutputKeyClass(outputKeyClass);
    }
    job.setMapperClass(mapper);
    Configuration conf = job.getConfiguration();
    HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
    List<String> scanStrings = new ArrayList<String>();
    for (Scan scan : scans) {
        scanStrings.add(convertScanToString(scan));
    }
    job.getConfiguration().setStrings(MultiTableInputFormat.SCANS,
            scanStrings.toArray(new String[scanStrings.size()]));
    if (addDependencyJars) {
        addDependencyJars(job);
    }
    if (initCredentials) {
        initCredentials(job);
    }
}

HBase filtering before the map

FilterList filterList = new FilterList(FilterList.Operator.MUST_PASS_ALL);
filterList.addFilter(new SingleColumnValueFilter(
        EventLogConstants.EVENT_LOGS_FAMILY_NAME_BYTES,
        Bytes.toBytes(EventLogConstants.LOG_COLUMN_NAME_EVENT_NAME),
        CompareOp.EQUAL,
        Bytes.toBytes(EventLogConstants.EventEnum.BC_SX.alias)));
filterList.addFilter(new SingleColumnValueFilter(
        EventLogConstants.EVENT_LOGS_FAMILY_NAME_BYTES,
        Bytes.toBytes(EventLogConstants.LOG_COLUMN_NAME_BC_STATUS),
        CompareOp.NOT_EQUAL,
        Bytes.toBytes("0")));
filterList.addFilter(new SingleColumnValueFilter(
        EventLogConstants.EVENT_LOGS_FAMILY_NAME_BYTES,
        Bytes.toBytes(EventLogConstants.LOG_COLUMN_NAME_CR_CP_ID),
        CompareOp.NOT_EQUAL,
        Bytes.toBytes("699004")));
filterList.addFilter(new SingleColumnValueFilter(
        EventLogConstants.EVENT_LOGS_FAMILY_NAME_BYTES,
        Bytes.toBytes(EventLogConstants.LOG_COLUMN_NAME_IS_DEL),
        CompareOp.EQUAL,
        Bytes.toBytes("0")));

String[] columns = new String[] {
        // These columns must be listed here even if the mapper never reads the event value.
        EventLogConstants.LOG_COLUMN_NAME_EVENT_NAME,
        EventLogConstants.LOG_COLUMN_NAME_BC_STATUS,
        EventLogConstants.LOG_COLUMN_NAME_CR_CP_ID,
        EventLogConstants.LOG_COLUMN_NAME_C_ID,
        EventLogConstants.LOG_COLUMN_NAME_BC_PERSON,
        EventLogConstants.LOG_COLUMN_NAME_IS_BC_RE };
filterList.addFilter(this.getColumnFilter(columns));

String statDate = conf.get(GlobalConstants.RUNNING_DATE_PARAMES);
Connection conn;
Admin admin = null;
List<Scan> scanList = new ArrayList<Scan>();
try {
    conn = ConnectionFactory.createConnection(conf);
    admin = conn.getAdmin();
    String tableName = EventLogConstants.HBASE_NAME_AUDIT_SX
            + GlobalConstants.UNDERLINE
            + statDate.replaceAll(GlobalConstants.KEY_SEPARATOR, "");
    if (admin.tableExists(TableName.valueOf(tableName))) {
        Scan scan = new Scan();
        // If an application wants to use multiple scans over different tables, each scan must
        // define this attribute with the appropriate table name by calling
        // scan.setAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME, Bytes.toBytes(tableName));
        // static public final String SCAN_ATTRIBUTES_TABLE_NAME = "scan.attributes.table.name";
        scan.setAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME, Bytes.toBytes(tableName));
        scan.setFilter(filterList);
        scanList.add(scan);
    }
} catch (Exception e) {
    e.printStackTrace();
    throw new RuntimeException("Failed to create the HBase Admin", e);
} finally {
    if (admin != null) {
        try {
            admin.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
if (scanList.isEmpty()) {
    throw new IOException("No table exists, so the scan list cannot be built");
}
TableMapReduceUtil.initTableMapperJob(scanList, AuditorSXMapper.class,
        AuditorDimensionKey.class, Text.class, job, false);

Storm Trident

each(new Fields(...), function, new Fields(...)) applies an operation to a stream.
Filter: implement the Filter interface's isKeep method.
partitionAggregate: aggregation inside one partition; implement Aggregator<T>, where T is the class that holds the aggregation state, put the aggregation logic in aggregate, and in complete emit the result through the TridentCollector with collector.emit(new Values(aggregatedValue)).
The usual key-concatenation function: implement the Function interface's execute method.
Minimal sketches of these interfaces follow after the code below.

HBaseMapState.Options optsWait = new HBaseMapState.Options();
TridentState amtOfWaitState = partStream
        .project(new Fields("waitingTotalOfPartDay", "dayAndContType"))
        .groupBy(new Fields("dayAndContType"))
        .persistentAggregate(factoryWait,
                new Fields("waitingTotalOfPartDay"),
                new Sum(),
                new Fields("waitingGlobalOfDay"));

persistentAggregate is the persistent aggregation call: here it runs a global Sum across all partitions, taking each partition's subtotal as input and writing the overall total to the state backed by factoryWait.
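A minimal sketch of the Filter and Function interfaces mentioned above, assuming the Storm 1.x package layout (org.apache.storm.trident.*); the field names "day" and "contType" and the concatenation logic are illustrative, producing the "dayAndContType" grouping key used in the topology code.

import org.apache.storm.trident.operation.BaseFilter;
import org.apache.storm.trident.operation.BaseFunction;
import org.apache.storm.trident.operation.TridentCollector;
import org.apache.storm.trident.tuple.TridentTuple;
import org.apache.storm.tuple.Values;

// Filter: isKeep decides whether a tuple stays in the stream.
class NonZeroFilter extends BaseFilter {
    @Override
    public boolean isKeep(TridentTuple tuple) {
        return tuple.getLong(0) != 0L;
    }
}

// Function: execute emits new fields; here it concatenates day and
// content type into a single grouping key.
class KeyConcatFunction extends BaseFunction {
    @Override
    public void execute(TridentTuple tuple, TridentCollector collector) {
        String day = tuple.getStringByField("day");
        String contType = tuple.getStringByField("contType");
        collector.emit(new Values(day + "_" + contType));
    }
}

They would be attached with stream.each(new Fields("day", "contType"), new KeyConcatFunction(), new Fields("dayAndContType")) and stream.each(new Fields("waitingTotalOfPartDay"), new NonZeroFilter()).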
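And a sketch of a custom Aggregator for partitionAggregate, again under the same package assumption; PartitionSum and its State inner class (the "class that holds the aggregation state" from the notes) are hypothetical names.

import org.apache.storm.trident.operation.BaseAggregator;
import org.apache.storm.trident.operation.TridentCollector;
import org.apache.storm.trident.tuple.TridentTuple;
import org.apache.storm.tuple.Values;

// Aggregator that sums one numeric field within a partition:
// init creates the per-batch state, aggregate folds each tuple into it,
// complete emits the partition's result once the batch is done.
class PartitionSum extends BaseAggregator<PartitionSum.State> {

    static class State {
        long sum = 0L;   // aggregation state kept between tuples
    }

    @Override
    public State init(Object batchId, TridentCollector collector) {
        return new State();
    }

    @Override
    public void aggregate(State state, TridentTuple tuple, TridentCollector collector) {
        state.sum += tuple.getLong(0);
    }

    @Override
    public void complete(State state, TridentCollector collector) {
        collector.emit(new Values(state.sum));   // emit the aggregated value
    }
}

It would be used as partStream.partitionAggregate(new Fields("waitingTotalOfPartDay"), new PartitionSum(), new Fields("partTotal")), with "partTotal" an illustrative output field name.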