• 在Hadoop中重写FileInputFormat类以处理二进制格式存储的整数


    近期開始使用MapReduce,发现网上大部分样例都是对文本数据进行处理的,也就是说在读取输入数据时直接使用默认的TextInputFormat进行处理就可以。对于文本数据处理,这个类还是能满足一部分应用场景。可是假设要处理以二进制形式结构化记录存储的文件时,这些类就不再适合了。

    本文以一个简单的应用场景为例:对依照二进制格式存储的整数做频数统计。当然,也能够在此基础上实现排序之类的其它应用。实现该应用的主要难点就是怎样处理输入数据。參考《权威指南·第三版》得知须要继承FileInputFormat这个类,并实现下面三个方法:

    class MyInputFormat extends FileInputFormat<Type1, Type2> {
    	/*
    	 * 查询推断当前文件能否够分块?"true"为能够分块,"false"表示不进行分块
    	 */
    	protected boolean isSplitable(Configuration conf, Path path) {
    		
    	}
    	
    	/*
    	 * MapReduce的client调用此方法得到全部的分块,然后将分块发送给MapReduce服务端。
    	 * 注意,分块中不包括实际的信息,而仅仅是对实际信息的分块信息。详细的说,每一个分块中
    	 * 包括当前分块相应的文件路径,当前分块在该文件里起始位置,当前分块的长度以及相应的
    	 * 实际数据所在的机器列表。在实现这个函数时,将这些信息填上就可以。
    	 * */
    	public List<InputSplit> getSplits(Configuration conf) throws IOException {
    	}
    
    	/*
    	 * 类RecordReader是用来创建传给map函数的Key-Value序列,传给此类的參数有两个:一个分块(split)和作业的配置信息(context).
    	 * 在Mapper的run函数中能够看到MapReduce框架运行Map的逻辑:
    	 * public void run(Context context) throws IOException, InterruptedException {
    	 * 		setup(context);
    	 * 		调用RecordReader方法的nextKeyValue,生成新的键值对。假设当前分块(Split)中已经处理完成了,则nextKeyValue会返回false.退出run函数
    	 *		while (context.nextKeyValue()) {	
    	 *			map(context.getCurrentKey(), context.getCurrentValue(), context);
    	 *		}
    	 *		cleanup(context);
    	 * }
    	 **/
    	public RecordReader<LongWritable, IntWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
    			throws IOException, InterruptedException {
    	}
    }

    在RecordReader函数中实现下面几个接口:

    public class BinRecordReader extends RecordReader<LongWritable, IntWritable> {
    	/*关闭文件流
    	 * */
    	public void close() {}
    
    	/*
    	 * 获取处理进度
    	 **/
    	public float getProgress() {}
    
    	/*
    	 * 获取当前的Key
    	 * */
    	public LongWritable getCurrentKey() throws IOException,
    	InterruptedException {}
    
    	/* 获取当前的Value
    	 * */
    	public IntWritable getCurrentValue() throws IOException,InterruptedException {}
    
    	/*
    	 * 进行初始化工作,打开文件流,依据分块信息设置起始位置和长度等等
    	 * */
    	public void initialize(InputSplit inputSplit, TaskAttemptContext context)
    			throws IOException, InterruptedException {}
    
    	/*生成下一个键值对
    	 **/
    	public boolean nextKeyValue() throws IOException, InterruptedException {
    	}
    }

    下面为是三个文件的代码,首先是BinInputFormat.java的代码:

    package org.apache.hadoop.examples;
    
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;
    
    import org.apache.hadoop.fs.FSDataInputStream;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.fs.BlockLocation;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.mapreduce.TaskAttemptContext;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.FileSplit;
    import org.apache.hadoop.mapreduce.InputSplit;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.mapreduce.RecordReader;
    import org.apache.hadoop.examples.BinRecordReader;
    
    class BinInputFormat extends FileInputFormat<LongWritable, IntWritable> {
    	
    	private static final double SPLIT_SLOP=1.1;
    	
    	/*
    	 * 查询推断当前文件能否够分块?"true"为能够分块,"false"表示不进行分块
    	 */
    	protected boolean isSplitable(Configuration conf, Path path) {
    		return true;
    	}
    	
    	/*
    	 * MapReduce的client调用此方法得到全部的分块,然后将分块发送给MapReduce服务端。
    	 * 注意,分块中不包括实际的信息,而仅仅是对实际信息的分块信息。详细的说,每一个分块中
    	 * 包括当前分块相应的文件路径,当前分块在该文件里起始位置,当前分块的长度以及相应的
    	 * 实际数据所在的机器列表。在实现这个函数时,将这些信息填上就可以。
    	 * */
    	public List<InputSplit> getSplits(Configuration conf) throws IOException {
    		List<InputSplit> splits = new ArrayList<InputSplit>();
    		long minSplitSize = conf.getLong("mapred.min.split.size",1);
    		long maxSplitSize = conf.getLong("mapred.max.split.size", 1);
    		long blockSize = conf.getLong("dfs.block.size",1);
    		long splitSize = Math.max(minSplitSize, Math.min(maxSplitSize, blockSize));
    		FileSystem fs = FileSystem.get(conf);
    		String path = conf.get(INPUT_DIR);
    		FileStatus[] files = fs.listStatus(new Path(path));
    
    		for (int fileIndex = 0; fileIndex < files.length; fileIndex++) {
    			FileStatus file = files[fileIndex];
    			System.out.println("input file: " + file.getPath().toString());
    			long length = file.getLen();
    			FileSystem fsin = file.getPath().getFileSystem(conf);
    		    BlockLocation[] blkLocations = fsin.getFileBlockLocations(file, 0, length);
    		    if ((length != 0) && isSplitable(conf, file.getPath())) {
    		        long bytesRemaining = length;
    		        while (((double) bytesRemaining)/splitSize > SPLIT_SLOP) {
    		          int blkIndex = getBlockIndex(blkLocations, length-bytesRemaining);
    		          splits.add(new FileSplit(file.getPath(), length-bytesRemaining, splitSize,
    		                                   blkLocations[blkIndex].getHosts()));
    		          bytesRemaining -= splitSize;
    		        }
    		        
    		        if (bytesRemaining != 0) {
    		          splits.add(new FileSplit(file.getPath(), length-bytesRemaining, bytesRemaining,
    		                     blkLocations[blkLocations.length-1].getHosts()));
    		        }
    		      } else if (length != 0) {
    		        splits.add(new FileSplit(file.getPath(), 0, length, blkLocations[0].getHosts()));
    		      } else {
    		        //Create empty hosts array for zero length files
    		        splits.add(new FileSplit(file.getPath(), 0, length, new String[0]));
    		      }
    		}
    		return splits;
    	}
    	
    	/*
    	 * 类RecordReader是用来创建传给map函数的Key-Value序列,传给此类的參数有两个:一个分块(split)和作业的配置信息(context).
    	 * 在Mapper的run函数中能够看到MapReduce框架运行Map的逻辑:
    	 * public void run(Context context) throws IOException, InterruptedException {
    	 * 		setup(context);
    	 * 		调用RecordReader方法的nextKeyValue,生成新的键值对。假设当前分块(Split)中已经处理完成了,则nextKeyValue会返回false.退出run函数
    	 *		while (context.nextKeyValue()) {	
    	 *			map(context.getCurrentKey(), context.getCurrentValue(), context);
    	 *		}
    	 *		cleanup(context);
    	 * }
    	 **/
    	public RecordReader<LongWritable, IntWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
    			throws IOException, InterruptedException {
    		// TODO Auto-generated method stub
    		BinRecordReader reader = new BinRecordReader();
    		reader.initialize(split,context);
    		return reader;
    	}
    }

    下面为BinRecordReader.java的代码:

    package org.apache.hadoop.examples;
    
    import java.io.IOException;
    
    import org.apache.hadoop.fs.FSDataInputStream;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.mapreduce.TaskAttemptContext;
    import org.apache.hadoop.mapreduce.lib.input.FileSplit;
    import org.apache.hadoop.mapreduce.InputSplit;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.mapreduce.RecordReader;
    
    /**
     * Return a single record (filename, "") where the filename is taken from
     * the file split.
     */
    public class BinRecordReader extends RecordReader<LongWritable, IntWritable> {
    	private FSDataInputStream inputStream = null;
    	private long start,end,pos;
    	private Configuration conf = null;
    	private FileSplit fileSplit = null;
    	private LongWritable key = new LongWritable();
    	private IntWritable value = new IntWritable();
    	private boolean processed = false;
    	public BinRecordReader() throws IOException {
    	}
    
    	/*关闭文件流
    	 * */
    	public void close() {
    		try {
    			if(inputStream != null)
    				inputStream.close();
    		} catch (IOException e) {
    			// TODO Auto-generated catch block
    			e.printStackTrace();
    		}
    	}
    
    	/*
    	 * 获取处理进度
    	 **/
    	public float getProgress() {
    		return ((processed == true)? 1.0f : 0.0f);
    	}
    
    	/*
    	 * 获取当前的Key
    	 * */
    	public LongWritable getCurrentKey() throws IOException,
    	InterruptedException {
    		// TODO Auto-generated method stub
    		return key;
    	}
    
    	/* 获取当前的Value
    	 * */
    	public IntWritable getCurrentValue() throws IOException,InterruptedException {
    		// TODO Auto-generated method stub
    		return value;
    	}
    
    	/*
    	 * 进行初始化工作,打开文件流,依据分块信息设置起始位置和长度等等
    	 * */
    	public void initialize(InputSplit inputSplit, TaskAttemptContext context)
    			throws IOException, InterruptedException {
    		// TODO Auto-generated method stub
    		fileSplit = (FileSplit)inputSplit;
    		conf = context.getConfiguration();
    
    		this.start = fileSplit.getStart();
    		this.end = this.start + fileSplit.getLength();
    
    		try{
    			Path path = fileSplit.getPath();
    			FileSystem fs = path.getFileSystem(conf);
    			this.inputStream = fs.open(path);
    			inputStream.seek(this.start);
    			this.pos = this.start;
    		}	catch(IOException e)	{
    			e.printStackTrace();
    		}
    	}
    
    	/*生成下一个键值对
    	 **/
    	public boolean nextKeyValue() throws IOException, InterruptedException {
    		// TODO Auto-generated method stub
    		if(this.pos < this.end) {
    			key.set(this.pos);
    			value.set(Integer.reverseBytes(inputStream.readInt()));
    			this.pos = inputStream.getPos();
    			return true;
    		} else {
    			processed = true;
    			return false;
    		}
    	}
    }
    

    下面是主文件BinCount.java的代码

    package org.apache.hadoop.examples;
    
    import java.io.IOException;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.GenericOptionsParser;
    import org.apache.hadoop.examples.BinInputFormat;
    
    public class IntCount {
    	public static class TokenizerMapper extends Mapper<LongWritable, IntWritable, Text, IntWritable>{
    
    		private final static IntWritable one = new IntWritable(1); 
    		private Text intNum = new Text();                             
    
    		public void map(LongWritable key, IntWritable value, Context context
    				) throws IOException, InterruptedException {
    			intNum.set(Integer.toString(value.get()));                              
    			context.write(intNum, one);                            
    		}
    	}
    
    	public static class IntSumReducer 
    	extends Reducer<Text,IntWritable,Text,IntWritable> { 
    		private IntWritable result = new IntWritable();              
    
    		public void reduce(Text key, Iterable<IntWritable> values, 
    				Context context
    				) throws IOException, InterruptedException {
    			int sum = 0;                                                 
    			for (IntWritable val : values) {
    				sum += val.get();                                         
    
    			}
    			result.set(sum);                                                                              
    			context.write(key, result);                                
    		}
    	}
    
    	public static void main(String[] args) throws Exception {
    		System.out.println("testing1");
    		Configuration conf = new Configuration();
    		String[] newArgs = new String[]{"hdfs://localhost:9000/read","hdfs://localhost:9000/data/wc_output21"};
    		String[] otherArgs = new GenericOptionsParser(conf, newArgs).getRemainingArgs();
    		if (otherArgs.length != 2) {
    			System.err.println("Usage: wordcount <in> <out>");
    			System.exit(2);
    		}
    		Job job = new Job(conf, "IntCount");                
    		job.setJarByClass(IntCount.class);
    		job.setMapperClass(TokenizerMapper.class);  
    		job.setCombinerClass(IntSumReducer.class);  
    		job.setReducerClass(IntSumReducer.class); 
    		//设置自己定义的输入类
    		job.setInputFormatClass(BinInputFormat.class);
    		job.setOutputKeyClass(Text.class);     
    		job.setOutputValueClass(IntWritable.class);   
    		FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    		FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    		System.exit(job.waitForCompletion(true) ? 0 : 1);
    	}
    }
    接着我们用一段C语言生成二进制格式存储的文件,C语言代码例如以下:

    #include<stdio.h>
    int main(){
    	FILE * fp = fopen("tmpfile","wb");
    	int i,j;
    	for(i=0;i<10;i++) {
    		for(j=0;j<10;j++)
    			fwrite(&j,sizeof(int),1,fp);
    	}
    	fclose(fp);
    	return 0;
    }

    将生成的文件复制到/read/下,接着启动IntCount这个MapReduce程序,打开执行结果:




  • 相关阅读:
    【大数据】Hadoop常用启动命令
    Codeforces Round #695 (Div. 2)
    Codeforces Round #668 (Div. 2)
    Codeforces Round #666 (Div. 2)
    Educational Codeforces Round 94 (Rated for Div. 2)
    Sum Queries? CodeForces
    New Year and Old Subsequence CodeForces
    Gym
    huntian oy HDU
    The Boss on Mars HDU
  • 原文地址:https://www.cnblogs.com/bhlsheji/p/3783970.html
Copyright © 2020-2023  润新知