• An InputFormat that splits input by file (one split per whole file)


    package seven.ili;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.BlockLocation;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.*;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.FileSplit;
    import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.GenericOptionsParser;

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.StringTokenizer;

    /**
     * Created with IntelliJ IDEA.
     * User: Isaac Li
     * Date: 12/17/12
     * Word count example whose custom InputFormat creates exactly one split per input file.
     */
    public class Test {

        public static class TokenizerMapper
                extends Mapper<Object, Text, Text, IntWritable> {
            private final static IntWritable one = new IntWritable(1);
            private Text word = new Text();

            public void map(Object key, Text value, Context context
            ) throws IOException, InterruptedException {
                // Each split covers exactly one file, so the split tells us which file this record came from.
                FileSplit fileSplit = (FileSplit) context.getInputSplit();
                String filename = fileSplit.getPath().getName();
                System.out.println("File name: " + filename);
                System.out.println("Directory and file name: " + fileSplit.getPath().toString());

                StringTokenizer itr = new StringTokenizer(value.toString());
                while (itr.hasMoreTokens()) {
                    word.set(itr.nextToken());
                    context.write(word, one);
                }
            }
        }

        public static class IntSumReducer
                extends Reducer<Text, IntWritable, Text, IntWritable> {
            private IntWritable result = new IntWritable();

            public void reduce(Text key, Iterable<IntWritable> values,
                               Context context
            ) throws IOException, InterruptedException {
                int sum = 0;
                for (IntWritable val : values) {
                    sum += val.get();
                }
                result.set(sum);
                context.write(key, result);
            }
        }

        // InputFormat that never splits a file: each input file becomes exactly one FileSplit.
        public static class SimpleTextFileInputFormat extends FileInputFormat<LongWritable, Text> {
            @Override
            public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
                return new LineRecordReader();
            }

            @Override
            public List<InputSplit> getSplits(JobContext job) throws IOException {
                List<InputSplit> splits = new ArrayList<InputSplit>();
                for (FileStatus file : listStatus(job)) {
                    Path path = file.getPath();
                    FileSystem fs = path.getFileSystem(job.getConfiguration());
                    long length = file.getLen();
                    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
                    if (length != 0) {
                        // One split spanning the whole file, placed on the hosts of its first block.
                        splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
                    }
                }
                return splits;
            }
        }

        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
            if (otherArgs.length < 2) {
                System.err.println("Usage: wordcount <in> <out>");
                System.exit(2);
            }
            Job job = new Job(conf, "word count3");
            job.setJarByClass(Test.class);
            job.setInputFormatClass(SimpleTextFileInputFormat.class);
            job.setMapperClass(TokenizerMapper.class);
            //job.setCombinerClass(IntSumReducer.class);
            job.setReducerClass(IntSumReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            // With zero reduce tasks the job runs map-only, so IntSumReducer is effectively skipped here.
            job.setNumReduceTasks(0);
            FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
            FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }

    The key logic lives in the SimpleTextFileInputFormat class: its getSplits() returns one FileSplit covering each whole input file, instead of letting FileInputFormat cut files into block-sized splits, so every mapper processes exactly one file.
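
    As a side note, a similar one-split-per-file behavior can usually be obtained without rewriting getSplits(), by overriding isSplitable() so FileInputFormat keeps each file intact. The sketch below is not from the original post; the class name WholeFilePerSplitTextInputFormat is made up for illustration, and it assumes the same new-API Hadoop mapreduce classes used above:

        import org.apache.hadoop.fs.Path;
        import org.apache.hadoop.mapreduce.JobContext;
        import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

        // Hypothetical alternative: reuse TextInputFormat's file listing and LineRecordReader,
        // but declare every file non-splittable so each file yields a single split.
        public class WholeFilePerSplitTextInputFormat extends TextInputFormat {
            @Override
            protected boolean isSplitable(JobContext context, Path file) {
                return false;   // never split a file across multiple mappers
            }
        }

    Wired into the driver with job.setInputFormatClass(WholeFilePerSplitTextInputFormat.class), this should give the same one-mapper-per-file layout, while the hand-rolled getSplits() above makes the per-file split construction explicit.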

  • Original post: https://www.cnblogs.com/hengli/p/2826000.html