Although Hadoop ships with quite a few built-in InputFormat implementations, they do not always meet our needs, so sometimes we have to write our own class to control how input data is read.
public abstract class InputFormat<K, V> {
    public InputFormat() {
    }

    public abstract List<InputSplit> getSplits(JobContext var1) throws IOException, InterruptedException;

    public abstract RecordReader<K, V> createRecordReader(InputSplit var1, TaskAttemptContext var2) throws IOException, InterruptedException;
}
getSplits slices the input files into a List<InputSplit>, and the RecordReader turns each InputSplit into key/value pairs. In practice we usually extend its subclass FileInputFormat; the default input format, TextInputFormat, extends it as well.
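As a quick orientation before reading the source, here is a minimal sketch of what a custom format usually looks like: extend FileInputFormat, optionally override isSplitable, and return a RecordReader. The class name NonSplittableTextInputFormat is made up for illustration; it simply reuses the stock LineRecordReader but forces one split per file.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

// Illustrative custom format: same <offset, line> records as TextInputFormat,
// but every file becomes a single split regardless of size.
public class NonSplittableTextInputFormat extends FileInputFormat<LongWritable, Text> {

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        return false;                  // one split per file
    }

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(
            InputSplit split, TaskAttemptContext context) {
        return new LineRecordReader(); // reuse the stock line-by-line reader
    }
}

A driver would enable it with job.setInputFormatClass(NonSplittableTextInputFormat.class).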
Partial source of FileInputFormat and some of its implementations
Let's look at how FileInputFormat implements getSplits and createRecordReader. Lines 1-7 read the maxSize and minSize for splits from the configuration, and turn the files under the input directory into an iterator so they can be read and split one by one.
 1 public List<InputSplit> getSplits(JobContext job) throws IOException {
 2     StopWatch sw = (new StopWatch()).start();
 3     long minSize = Math.max(this.getFormatMinSplitSize(), getMinSplitSize(job));
 4     long maxSize = getMaxSplitSize(job);
 5     List<InputSplit> splits = new ArrayList();
 6     List<FileStatus> files = this.listStatus(job);
 7     Iterator i$ = files.iterator();
 8
 9     while(true) {
10         while(true) {
11             while(i$.hasNext()) {
12                 FileStatus file = (FileStatus)i$.next();
13                 Path path = file.getPath();
14                 long length = file.getLen();
15                 if (length != 0L) {
16                     BlockLocation[] blkLocations;
17                     if (file instanceof LocatedFileStatus) {
18                         blkLocations = ((LocatedFileStatus)file).getBlockLocations();
19                     } else {
20                         FileSystem fs = path.getFileSystem(job.getConfiguration());
21                         blkLocations = fs.getFileBlockLocations(file, 0L, length);
22                     }
23
24                     if (this.isSplitable(job, path)) {
25                         long blockSize = file.getBlockSize();
26                         long splitSize = this.computeSplitSize(blockSize, minSize, maxSize);
27
28                         long bytesRemaining;
29                         int blkIndex;
30                         for(bytesRemaining = length; (double)bytesRemaining / (double)splitSize > 1.1D; bytesRemaining -= splitSize) {
31                             blkIndex = this.getBlockIndex(blkLocations, length - bytesRemaining);
32                             splits.add(this.makeSplit(path, length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
33                         }
34
35                         if (bytesRemaining != 0L) {
36                             blkIndex = this.getBlockIndex(blkLocations, length - bytesRemaining);
37                             splits.add(this.makeSplit(path, length - bytesRemaining, bytesRemaining, blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
38                         }
39                     } else {
40                         splits.add(this.makeSplit(path, 0L, length, blkLocations[0].getHosts(), blkLocations[0].getCachedHosts()));
41                     }
42                 } else {
43                     splits.add(this.makeSplit(path, 0L, length, new String[0]));
44                 }
45             }
46
47             job.getConfiguration().setLong("mapreduce.input.fileinputformat.numinputfiles", (long)files.size());
48             sw.stop();
49             if (LOG.isDebugEnabled()) {
50                 LOG.debug("Total # of splits generated by getSplits: " + splits.size() + ", TimeTaken: " + sw.now(TimeUnit.MILLISECONDS));
51             }
52
53             return splits;
54         }
55     }
56 }
protected long computeSplitSize(long blockSize, long minSize, long maxSize) {
    return Math.max(minSize, Math.min(maxSize, blockSize));
}
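With the default bounds (minSize = 1, maxSize = Long.MAX_VALUE), computeSplitSize therefore returns the HDFS block size. A small sketch of how a job can nudge the result, assuming the standard FileInputFormat helper methods (the 64 MB value is an arbitrary example):

import java.io.IOException;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class SplitSizeConfig {
    public static void main(String[] args) throws IOException {
        Job job = Job.getInstance();
        // splitSize = max(minSize, min(maxSize, blockSize)); by default this is
        // just the block size. Capping maxSize below the block size yields more,
        // smaller splits; raising minSize above it yields fewer, larger splits.
        FileInputFormat.setMaxInputSplitSize(job, 64L * 1024 * 1024); // 64 MB cap
        // FileInputFormat.setMinInputSplitSize(job, 256L * 1024 * 1024);
    }
}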
Lines 16-22 fetch the current file's block locations. Lines 24-41: if the file is not splittable (some compressed formats, for example, cannot be split), the whole file becomes a single split regardless of its size. If it is splittable, a chunk of splitSize is cut off whenever the remaining bytes exceed 1.1 times the split size (line 30); once the remainder drops below 1.1x, no further cut is made and the remainder becomes the last split, which avoids producing a wastefully small tail split. FileInputFormat's own isSplitable simply returns true, whereas TextInputFormat decides splittability from the file's compression codec.
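To make the 1.1 slack factor concrete, here is a small standalone sketch that mimics the loop on lines 30-38; the file and split sizes are arbitrary illustrative numbers, not taken from the source.

public class SplitMath {
    public static void main(String[] args) {
        long splitSize = 128L * 1024 * 1024;   // assume a 128 MB split size
        long length = 260L * 1024 * 1024;      // a 260 MB file
        long bytesRemaining = length;
        int splits = 0;
        // Same condition as line 30: only cut while more than 1.1 * splitSize remains.
        while ((double) bytesRemaining / (double) splitSize > 1.1D) {
            bytesRemaining -= splitSize;
            splits++;
        }
        if (bytesRemaining != 0L) {
            splits++;                           // the tail becomes one (slightly larger) split
        }
        // Prints 2: one 128 MB split plus a 132 MB tail, instead of 128 + 128 + 4.
        System.out.println(splits);
    }
}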
As for getSplits, NLineInputFormat, for example, turns every N lines into one split: in getSplitsForFile it reads line by line, adding lines to the current split until the configured number of lines per split is reached. The source also shows that TextInputFormat does not override getSplits at all.
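If you want that N-lines-per-split behaviour, the driver just swaps the input format class and sets the line count; a minimal sketch using NLineInputFormat's setter (1000 lines is an arbitrary example):

import java.io.IOException;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;

public class NLineDriver {
    public static void main(String[] args) throws IOException {
        Job job = Job.getInstance();
        job.setInputFormatClass(NLineInputFormat.class);
        // Each split covers (up to) 1000 input lines; keys and values are still
        // produced by LineRecordReader, exactly as with TextInputFormat.
        NLineInputFormat.setNumLinesPerSplit(job, 1000);
    }
}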
Next, look at createRecordReader (TextInputFormat):
public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
    String delimiter = context.getConfiguration().get("textinputformat.record.delimiter");
    byte[] recordDelimiterBytes = null;
    if (null != delimiter) {
        recordDelimiterBytes = delimiter.getBytes(Charsets.UTF_8);
    }

    return new LineRecordReader(recordDelimiterBytes);
}
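As the first lines above show, the record delimiter comes from the textinputformat.record.delimiter property, so a job can switch from newline-terminated records to, say, blank-line-separated paragraphs. A minimal sketch of setting it (the "\n\n" delimiter is just an example):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class CustomDelimiterDriver {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Records now end at a blank line instead of a single '\n';
        // createRecordReader above passes these bytes to LineRecordReader.
        conf.set("textinputformat.record.delimiter", "\n\n");
        Job job = Job.getInstance(conf);
    }
}

The LineRecordReader it returns is reproduced in full below: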
  1 public class LineRecordReader extends RecordReader<LongWritable, Text> {
  2     private static final Log LOG = LogFactory.getLog(LineRecordReader.class);
  3     public static final String MAX_LINE_LENGTH = "mapreduce.input.linerecordreader.line.maxlength";
  4     private long start;
  5     private long pos;
  6     private long end;
  7     private SplitLineReader in;
  8     private FSDataInputStream fileIn;
  9     private Seekable filePosition;
 10     private int maxLineLength;
 11     private LongWritable key;
 12     private Text value;
 13     private boolean isCompressedInput;
 14     private Decompressor decompressor;
 15     private byte[] recordDelimiterBytes;
 16
 17     public LineRecordReader() {
 18     }
 19
 20     public LineRecordReader(byte[] recordDelimiter) {
 21         this.recordDelimiterBytes = recordDelimiter;
 22     }
 23
 24     public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
 25         FileSplit split = (FileSplit)genericSplit;
 26         Configuration job = context.getConfiguration();
 27         this.maxLineLength = job.getInt("mapreduce.input.linerecordreader.line.maxlength", 2147483647);
 28         this.start = split.getStart();
 29         this.end = this.start + split.getLength();
 30         Path file = split.getPath();
 31         FileSystem fs = file.getFileSystem(job);
 32         this.fileIn = fs.open(file);
 33         CompressionCodec codec = (new CompressionCodecFactory(job)).getCodec(file);
 34         if (null != codec) {
 35             this.isCompressedInput = true;
 36             this.decompressor = CodecPool.getDecompressor(codec);
 37             if (codec instanceof SplittableCompressionCodec) {
 38                 SplitCompressionInputStream cIn = ((SplittableCompressionCodec)codec).createInputStream(this.fileIn, this.decompressor, this.start, this.end, READ_MODE.BYBLOCK);
 39                 this.in = new CompressedSplitLineReader(cIn, job, this.recordDelimiterBytes);
 40                 this.start = cIn.getAdjustedStart();
 41                 this.end = cIn.getAdjustedEnd();
 42                 this.filePosition = cIn;
 43             } else {
 44                 if (this.start != 0L) {
 45                     throw new IOException("Cannot seek in " + codec.getClass().getSimpleName() + " compressed stream");
 46                 }
 47
 48                 this.in = new SplitLineReader(codec.createInputStream(this.fileIn, this.decompressor), job, this.recordDelimiterBytes);
 49                 this.filePosition = this.fileIn;
 50             }
 51         } else {
 52             this.fileIn.seek(this.start);
 53             this.in = new UncompressedSplitLineReader(this.fileIn, job, this.recordDelimiterBytes, split.getLength());
 54             this.filePosition = this.fileIn;
 55         }
 56
 57         if (this.start != 0L) {
 58             this.start += (long)this.in.readLine(new Text(), 0, this.maxBytesToConsume(this.start));
 59         }
 60
 61         this.pos = this.start;
 62     }
 63
 64     private int maxBytesToConsume(long pos) {
 65         return this.isCompressedInput ? 2147483647 : (int)Math.max(Math.min(2147483647L, this.end - pos), (long)this.maxLineLength);
 66     }
 67
 68     private long getFilePosition() throws IOException {
 69         long retVal;
 70         if (this.isCompressedInput && null != this.filePosition) {
 71             retVal = this.filePosition.getPos();
 72         } else {
 73             retVal = this.pos;
 74         }
 75
 76         return retVal;
 77     }
 78
 79     private int skipUtfByteOrderMark() throws IOException {
 80         int newMaxLineLength = (int)Math.min(3L + (long)this.maxLineLength, 2147483647L);
 81         int newSize = this.in.readLine(this.value, newMaxLineLength, this.maxBytesToConsume(this.pos));
 82         this.pos += (long)newSize;
 83         int textLength = this.value.getLength();
 84         byte[] textBytes = this.value.getBytes();
 85         if (textLength >= 3 && textBytes[0] == -17 && textBytes[1] == -69 && textBytes[2] == -65) {
 86             LOG.info("Found UTF-8 BOM and skipped it");
 87             textLength -= 3;
 88             newSize -= 3;
 89             if (textLength > 0) {
 90                 textBytes = this.value.copyBytes();
 91                 this.value.set(textBytes, 3, textLength);
 92             } else {
 93                 this.value.clear();
 94             }
 95         }
 96
 97         return newSize;
 98     }
 99
100     public boolean nextKeyValue() throws IOException {
101         if (this.key == null) {
102             this.key = new LongWritable();
103         }
104
105         this.key.set(this.pos);
106         if (this.value == null) {
107             this.value = new Text();
108         }
109
110         int newSize = 0;
111
112         while(this.getFilePosition() <= this.end || this.in.needAdditionalRecordAfterSplit()) {
113             if (this.pos == 0L) {
114                 newSize = this.skipUtfByteOrderMark();
115             } else {
116                 newSize = this.in.readLine(this.value, this.maxLineLength, this.maxBytesToConsume(this.pos));
117                 this.pos += (long)newSize;
118             }
119
120             if (newSize == 0 || newSize < this.maxLineLength) {
121                 break;
122             }
123
124             LOG.info("Skipped line of size " + newSize + " at pos " + (this.pos - (long)newSize));
125         }
126
127         if (newSize == 0) {
128             this.key = null;
129             this.value = null;
130             return false;
131         } else {
132             return true;
133         }
134     }
135
136     public LongWritable getCurrentKey() {
137         return this.key;
138     }
139
140     public Text getCurrentValue() {
141         return this.value;
142     }
143
144     public float getProgress() throws IOException {
145         return this.start == this.end ? 0.0F : Math.min(1.0F, (float)(this.getFilePosition() - this.start) / (float)(this.end - this.start));
146     }
147
148     public synchronized void close() throws IOException {
149         try {
150             if (this.in != null) {
151                 this.in.close();
152             }
153         } finally {
154             if (this.decompressor != null) {
155                 CodecPool.returnDecompressor(this.decompressor);
156                 this.decompressor = null;
157             }
158         }
159     }
160 }
So TextInputFormat builds its RecordReader simply by returning a LineRecordReader. Look at nextKeyValue at line 100 of LineRecordReader: this is where each <Key, Value> pair is produced. Every call reads with this.in.readLine (SplitLineReader extends LineReader, whose readLine reads one line up to the delimiter); if the line read is longer than maxLineLength, the line is skipped and a message is written to the log.
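From the mapper's point of view, the key produced here is the byte offset of the line (this.pos) and the value is the line itself. A minimal sketch of a consumer, assuming a word-count-style job; the class name LineOffsetMapper is illustrative:

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class LineOffsetMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private final IntWritable one = new IntWritable(1);
    private final Text word = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // key   = byte offset of this line within the file (LineRecordReader's pos)
        // value = the line contents, without the record delimiter
        for (String token : value.toString().split("\\s+")) {
            if (!token.isEmpty()) {
                word.set(token);
                context.write(word, one);
            }
        }
    }
}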
Likewise, NLineInputFormat only differs in getSplits (N lines per split); for turning a split into key/value pairs it uses the very same LineRecordReader as TextInputFormat.
FixedLengthInputFormat, on the other hand (it reads input files that contain fixed-length records; the content of a record need not be text and can be arbitrary binary data), uses FixedLengthRecordReader, which overrides all of these methods itself.
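A minimal sketch of using it: the record length has to be set on the configuration, and the reader then emits <offset, fixed-size BytesWritable> pairs. The 100-byte length is an arbitrary illustrative value.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FixedLengthInputFormat;

public class FixedLengthDriver {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Every record is exactly 100 bytes; FixedLengthRecordReader then emits
        // <LongWritable offset, BytesWritable record> pairs, text or binary alike.
        FixedLengthInputFormat.setRecordLength(conf, 100);
        Job job = Job.getInstance(conf);
        job.setInputFormatClass(FixedLengthInputFormat.class);
    }
}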