• 自定义 InputFormat：按文件划分输入（每个非空文件对应一个 split）

      1 package seven.ili;
      3 import org.apache.hadoop.conf.Configuration;
      4 import org.apache.hadoop.fs.BlockLocation;
      5 import org.apache.hadoop.fs.FileStatus;
      6 import org.apache.hadoop.fs.FileSystem;
      7 import org.apache.hadoop.fs.Path;
      8 import org.apache.hadoop.io.IntWritable;
      9 import org.apache.hadoop.io.LongWritable;
     10 import org.apache.hadoop.io.Text;
     11 import org.apache.hadoop.mapreduce.*;
     12 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
     13 import org.apache.hadoop.mapreduce.lib.input.FileSplit;
     14 import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
     15 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
     16 import org.apache.hadoop.util.GenericOptionsParser;
     17 import org.eclipse.core.internal.resources.FileState;
     19 import java.io.IOException;
     20 import java.util.ArrayList;
     21 import java.util.List;
     22 import java.util.StringTokenizer;
     24 /**
     25  * Created with IntelliJ IDEA.
     26  * User: Isaac Li
     27  * Date: 12/17/12
     28  * Time: 4:00 PM
     29  * To change this template use File | Settings | File Templates.
     30  */
     31 public class Test {
     33     public static class TokenizerMapper
     34             extends Mapper<Object, Text, Text, IntWritable> {
     35         private final static IntWritable one = new IntWritable(1);
     36         private Text word = new Text();
     37         public void map(Object key, Text value, Context context
     38         ) throws IOException, InterruptedException {
     39             FileSplit fileSplit = (FileSplit)context.getInputSplit();
     40             String filename = fileSplit.getPath().getName();
     41             System.out.println("File name "+filename);
     42             System.out.println("Directory and File name"+fileSplit.getPath().toString());
     44             StringTokenizer itr = new StringTokenizer(value.toString());
     45             while (itr.hasMoreTokens()) {
     46                 word.set(itr.nextToken());
     47                 context.write(word, one);
     48             }
     49         }
     50     }
     52     public static class IntSumReducer
     53             extends Reducer<Text,IntWritable,Text,IntWritable> {
     54         private IntWritable result = new IntWritable();
     56         public void reduce(Text key, Iterable<IntWritable> values,
     57                            Context context
     58         ) throws IOException, InterruptedException {
     59             int sum = 0;
     60             for (IntWritable val : values) {
     61                 sum += val.get();
     62             }
     63             result.set(sum);
     64             context.write(key, result);
     65         }
     66     }
     68     public static class SimpleTextFileInputFormat extends FileInputFormat<LongWritable, Text>{
     69         public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context){
     70             return new LineRecordReader();
     71         }
     72         public List<InputSplit> getSplits(JobContext job) throws IOException {
     73             List<InputSplit> splits = new ArrayList<InputSplit>();
     74             for (FileStatus file: listStatus(job)){
     75                 Path path = file.getPath();
     76                 FileSystem fs = path.getFileSystem(job.getConfiguration());
     77                 long length = file.getLen();
     78                 BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
     79                 if (length != 0){
     80                     splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
     81                 }
     82             }
     83             return splits;
     84         }
     85     }
     87     public static void main(String[] args) throws Exception {
     88         Configuration conf = new Configuration();
     89         String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
     90         if (otherArgs.length < 2) {
     91             System.err.println("Usage: wordcount <in> <out>");
     92             System.exit(2);
     93         }
     94         Job job = new Job(conf, "word count3");
     95         job.setJarByClass(Test.class);
     96         job.setInputFormatClass(SimpleTextFileInputFormat.class);
     97         job.setMapperClass(TokenizerMapper.class);
     98         //job.setCombinerClass(IntSumReducer.class);
     99         job.setReducerClass(IntSumReducer.class);
    100         job.setOutputKeyClass(Text.class);
    101         job.setOutputValueClass(IntWritable.class);
    102         job.setNumReduceTasks(0);
    103         FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    104         FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    105         System.exit(job.waitForCompletion(true) ? 0 : 1);
    106     }
    107 }

    主要实现在 SimpleTextFileInputFormat 这个类中。

