• 一个 CombineFileInputFormat 示例


    mark

    import java.io.DataInput;
    import java.io.DataOutput;
    import java.io.IOException;
    
    import org.apache.commons.logging.Log;
    import org.apache.commons.logging.LogFactory;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FSDataInputStream;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.WritableComparable;
    import org.apache.hadoop.io.compress.CompressionCodec;
    import org.apache.hadoop.io.compress.CompressionCodecFactory;
    import org.apache.hadoop.mapreduce.InputSplit;
    import org.apache.hadoop.mapreduce.RecordReader;
    import org.apache.hadoop.mapreduce.TaskAttemptContext;
    import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
    import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
    import org.apache.hadoop.util.LineReader;
    
    /**
     * A {@link CombineFileInputFormat} whose records are keyed by {@link InputSplitFile}
     * and carry one line of text as the value.
     *
     * <p>Each file chunk inside a {@link CombineFileSplit} is read by its own
     * {@link MyCombineFileRecordReader}.
     */
    public class MyInputFormat extends CombineFileInputFormat<InputSplitFile, Text> {

        /**
         * Builds the reader for a combined split.
         *
         * @param split   the split to read; cast to {@link CombineFileSplit}
         * @param context the task attempt context handed on to the per-chunk readers
         * @return a {@link CombineFileRecordReader} that delegates every chunk of the
         *         split to {@link MyCombineFileRecordReader}
         * @throws IOException if the combined reader cannot be created
         */
        @Override
        public RecordReader<InputSplitFile, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
                throws IOException {
            final CombineFileSplit combinedSplit = (CombineFileSplit) split;
            final Class<MyCombineFileRecordReader> chunkReaderType = MyCombineFileRecordReader.class;
            return new CombineFileRecordReader<InputSplitFile, Text>(combinedSplit, context, chunkReaderType);
        }

    }
    
    /**
     * Reads one file chunk of a {@link CombineFileSplit} line by line.
     *
     * <p>The key of every record is an {@link InputSplitFile} holding the file name and
     * the byte position at which the reader stood when it started reading the record;
     * the value is the line's text. All setup happens in the constructor, which has the
     * {@code (CombineFileSplit, TaskAttemptContext, Integer)} shape so that
     * {@code CombineFileRecordReader} can be given this class as its per-chunk reader.
     */
    class MyCombineFileRecordReader extends RecordReader<InputSplitFile, Text> {
        private static final Log LOG = LogFactory.getLog(MyCombineFileRecordReader.class);

        private CompressionCodecFactory compressionCodecs = null;
        /** Position of the first byte this reader is responsible for. */
        private long start;
        /** Current read position. */
        private long pos;
        /** Position at which this reader stops ({@code Long.MAX_VALUE} for compressed input). */
        private long end;
        private Path path;
        private LineReader in;
        /** Upper bound on the bytes kept per line, read from the job configuration. */
        private int maxLineLength;
        private InputSplitFile key = null;
        private Text value = null;

        /**
         * Opens the chunk at position {@code index} of {@code split} and positions the
         * reader on the first complete line of that chunk.
         *
         * @param split   the combined split containing the chunk
         * @param context supplies the job configuration
         * @param index   index of the chunk inside {@code split}
         * @throws IOException if the file cannot be opened or positioned; the underlying
         *                     stream is closed before the exception propagates
         */
        public MyCombineFileRecordReader(CombineFileSplit split, TaskAttemptContext context, Integer index) throws IOException {
            Configuration job = context.getConfiguration();
            this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
            this.path = split.getPath(index);
            this.start = split.getOffset(index);
            this.end = start + split.getLength(index);
            compressionCodecs = new CompressionCodecFactory(job);
            final CompressionCodec codec = compressionCodecs.getCodec(this.path);
            boolean skipFirstLine = false;

            FileSystem fs = path.getFileSystem(job);
            FSDataInputStream fileIn = fs.open(this.path);
            boolean ready = false;
            try {
                if (codec != null) {
                    // Compressed input is read through the codec stream to its end.
                    in = new LineReader(codec.createInputStream(fileIn), job);
                    end = Long.MAX_VALUE;
                } else {
                    if (start != 0) {
                        // Step back one byte so that the skip below discards only the tail
                        // of a line that began before this chunk.
                        skipFirstLine = true;
                        --start;
                        fileIn.seek(start);
                    }
                    in = new LineReader(fileIn, job);
                }
                if (skipFirstLine) {  // skip first line and re-establish "start".
                    start += in.readLine(new Text(), 0,
                                         (int) Math.min((long) Integer.MAX_VALUE, end - start));
                }
                this.pos = start;
                ready = true;
            } finally {
                if (!ready) {
                    // Setup failed part-way: release the open file instead of leaking it.
                    try {
                        if (in != null) {
                            in.close();
                        } else {
                            fileIn.close();
                        }
                    } catch (IOException ignored) {
                        // Keep the original failure; a close error here adds nothing.
                    }
                }
            }
        }

        /**
         * Does nothing: the reader is fully set up by its constructor.
         */
        @Override
        public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
            // Intentionally empty.
        }

        /**
         * Advances to the next line of the chunk.
         *
         * @return {@code true} if a record was read; {@code false} once the chunk is
         *         exhausted, after which the current key and value are {@code null}
         * @throws IOException if reading from the underlying stream fails
         */
        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            if (key == null) {
                key = new InputSplitFile();
                key.setFileName(path.getName());
            }
            // Record where this read begins so that keys from the same file differ.
            key.setOffset(pos);
            if (value == null) {
                value = new Text();
            }
            int newSize = 0;
            while (pos < end) {
                newSize = in.readLine(value, maxLineLength,
                        Math.max((int) Math.min(Integer.MAX_VALUE, end - pos), maxLineLength));

                if (newSize == 0) {
                    break;
                }

                pos += newSize;
                if (newSize < maxLineLength) {
                    break;
                }
                // line too long. try again
                LOG.info("Skipped line of size " + newSize + " at pos " +
                         (pos - newSize));
            }
            if (newSize == 0) {
                key = null;
                value = null;
                return false;
            } else {
                return true;
            }
        }

        /**
         * @return the key of the current record, or {@code null} when no record is available
         */
        @Override
        public InputSplitFile getCurrentKey() throws IOException, InterruptedException {
            return key;
        }

        /**
         * @return the line text of the current record, or {@code null} when no record is available
         */
        @Override
        public Text getCurrentValue() throws IOException, InterruptedException {
            return value;
        }

        /**
         * @return the fraction of the chunk consumed so far, capped at {@code 1.0};
         *         {@code 0.0} when the chunk is empty
         */
        @Override
        public float getProgress() throws IOException, InterruptedException {
            if (start == end) {
                return 0.0f;
            } else {
                return Math.min(1.0f, (pos - start) / (float) (end - start));
            }
        }

        /**
         * Closes the line reader if one was opened.
         */
        @Override
        public void close() throws IOException {
            if (in != null) {
                in.close();
            }
        }

    }
    
    
    class InputSplitFile implements WritableComparable<InputSplitFile> {
        private long offset;
        private String fileName;
        
        public long getOffset(){
            return offset;
        }
        
        public void setOffset(long offset){
            this.offset = offset;
        }
        
        public String getFileName(){
            return fileName;
        }
        
        public void setFileName(String fileName){
            this.fileName = fileName;
        }
        
        public void readFields(DataInput in) throws IOException {
            this.offset = in.readLong();
            this.fileName = Text.readString(in);
        }
        
        public void write(DataOutput out) throws IOException{
            out.writeLong(offset);
            Text.writeString(out, fileName);
        }
        
        public int compareTo(InputSplitFile o){
            InputSplitFile that = (InputSplitFile) o;
            
            int f = this.fileName.compareTo(that.fileName);
            if(f == 0){
                return (int)Math.signum((double)(this.offset - that.offset));
            }
            return f;
        }
        
        
        public boolean equals(InputSplitFile obj){
            if(obj instanceof InputSplitFile){
                return this.compareTo(obj) == 0;
            }
            return false;
        }
        
        public int hashCode(){
            assert false : "hashCode not designed";
        return 42;//an arbitrary constant
        }
    
    
    }
    View Code
  • 相关阅读:
    python变量及简单数据类型
    python函数小案例
    python字符串和列表小案例
    python循环输出
    Jmeter测试工具
    操作系统-进程管理~
    操作系统~
    组成原理第一章笔记~
    分时间段(年份或月份)统计,没有数字补0 Java(替代 MYSQL) 做法
    组成原理复习概略
  • 原文地址:https://www.cnblogs.com/luolizhi/p/4945938.html
Copyright © 2020-2023  润新知