• 对于两个输入文件,即文件A和文件B,请编写MapReduce程序,对两个文件进行合并,并剔除其中重复的内容,得到一个新的输出文件C。


    package org.apache.hadoop.examples;
    import java.io.IOException;
    import java.util.HashMap;
    import java.util.HashSet;
    import java.util.Iterator;
    import java.util.Map;
    import java.util.Set;
    import java.util.StringTokenizer;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.GenericOptionsParser;
     
    public class A_formatSameString {
        public A_formatSameString() {
        }
     
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            conf.set("fs.defaultFS", "hdfs://localhost:9000");
            String[] otherArgs = new String[]{"input","output"};
            if(otherArgs.length < 2) {
                System.err.println("Usage: wordcount <in> [<in>...] <out>");
                System.exit(2);
            }
     
            Job job = Job.getInstance(conf, "clearSame");
            job.setJarByClass(A_formatSameString.class);
            job.setMapperClass(A_formatSameString.TokenizerMapper.class);
            job.setCombinerClass(A_formatSameString.IntSumReducer.class);
            job.setReducerClass(A_formatSameString.IntSumReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
     
            for(int i = 0; i < otherArgs.length - 1; ++i) {
                FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
            }
     
            FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
            System.exit(job.waitForCompletion(true)?0:1);
        }
     
        public static class IntSumReducer extends Reducer<Text, Text, Text, Text> {
        	private Text word2 = new Text();
     
            public IntSumReducer() {
            }
            
            public void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context) throws IOException, InterruptedException {
            	Map dict = new HashMap();
                for(Iterator i$ = values.iterator(); i$.hasNext(); ) {
                	Text value = (Text)i$.next();
                	if(!dict.containsKey(value)){
                    	dict.put(value,1);
                    	this.word2.set(value);
                    	context.write(key, this.word2);
                	}
                    
                }
                //System.out.println(key.toString()+"
    "+result.toString());
            }
        }
     
        public static class TokenizerMapper extends Mapper<Object, Text, Text, Text> {
            private static final IntWritable one = new IntWritable(1);
            private Text word = new Text();
            private Text word2 = new Text();
            public TokenizerMapper() {
            }
     
            public void map(Object key, Text value, Mapper<Object, Text, Text, Text>.Context context) throws IOException, InterruptedException {
                StringTokenizer itr = new StringTokenizer(value.toString());
                //System.out.println(itr.toString());
                while(itr.hasMoreTokens()) {
                	String tmpstr = itr.nextToken();
                	String tmpstr2 = itr.nextToken();
                    this.word.set(tmpstr);
                    this.word2.set(tmpstr2);
                    //System.out.println(tmpstr);
                    context.write(this.word, this.word2);
                }
     
            }
        }
    }
    
  • 相关阅读:
    Python动态生成方法
    aid learning安装python
    Pic Go使用阿里云OSS搭建图床
    QSqlQuery、QSqlQueryModel、QSqlTableModel的区别
    python文件上传错误“Required request part 'xxx' is not present”
    【已解决】执行yum命令失败:error: rpmdb: BDB0113 Thread/process 16978/139878363277376 failed: BDB1507 Thread died in Berkeley DB library
    C# DataTable Select用法
    Error in event handler: SyntaxError: Unexpected token '<'
    Lodash 两个数组合并-排重
    forEach,map,filter,find,some,every区别
  • 原文地址:https://www.cnblogs.com/MiraculousB/p/14106843.html
Copyright © 2020-2023  润新知