• map/reduce实现数据去重


      1 import java.io.IOException;
      2 
      3 import org.apache.hadoop.conf.Configuration;
      4 import org.apache.hadoop.conf.Configured;
      5 import org.apache.hadoop.fs.Path;
      6 import org.apache.hadoop.io.LongWritable;
      7 import org.apache.hadoop.io.Text;
      8 import org.apache.hadoop.mapreduce.Job;
      9 import org.apache.hadoop.mapreduce.Mapper;
     10 import org.apache.hadoop.mapreduce.Reducer;
     11 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
     12 import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
     13 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
     14 import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
     15 import org.apache.hadoop.util.Tool;
     16 import org.apache.hadoop.util.ToolRunner;
     17 public class Dedpu extends Configured implements Tool {
     18 /**
     19  * 数据去重
     20  * 数据样例:
     21  * 输入数据
     22  * 2006-6-9 a
     23  * 2006-6-10 b
     24  * 2006-6-9 a
     25  * 结果数据
     26  * 2006-6-9 a
     27  * 2006-6-10 b
     28  * 设计思路:
     29  * Map阶段 <时间,字符>
     30  * Reduce阶段输入<时间,list<字符>>,去除重复的字符,输出
     31  * 
     32  * **/
     33     public static class Map extends Mapper<LongWritable,Text,Text,Text>{
     34         public void map(LongWritable key,Text value,Context context)throws IOException, InterruptedException{
     35             String line=value.toString();
     36             Text myvalue=new Text("");
     37             context.write(new Text(line), myvalue);
     38 //            StringTokenizer tokenizer=new StringTokenizer(line);
     39 //            String datestr="",datastr="";
     40 //            while(tokenizer.hasMoreTokens())
     41 //            {
     42 //                datestr=tokenizer.nextToken();
     43 //                datastr=tokenizer.nextToken();
     44 //                context.write(new Text(datestr), new Text(datastr));
     45 //                
     46 //            }
     47         }
     48         
     49     }
     50     
     51     public static class Reduce extends Reducer<Text,Text,Text,Text>{
     52         public void reduce(Text key,Iterable<Text>values,Context context)throws IOException,InterruptedException{
     53 
     54             context.write(key, new Text(""));
     55 //            ArrayList  arr=new ArrayList();
     56 //            Text mykey=key;
     57 //            for(Text txt:values)
     58 //            {
     59 //                
     60 //                if(!arr.contains(txt.toString())){
     61 //                    arr.add(txt.toString());
     62 //                }
     63 //                    
     64 //                
     65 //            }
     66 //            for(int i=0;i<arr.size();i++){
     67 //                context.write(mykey, new Text(arr.get(i).toString()));
     68 //                
     69 //            }
     70             
     71         
     72             
     73         }
     74         
     75     }
     76     
     77     public int run(String[] args)throws Exception
     78     {
     79         Configuration conf=new Configuration();
     80         Job job=new Job(conf,"Data Depution");
     81         job.setJarByClass(Dedpu.class);
     82         
     83         job.setMapperClass(Map.class);
     84         job.setCombinerClass(Reduce.class);
     85         job.setReducerClass(Reduce.class);
     86         
     87         job.setOutputKeyClass(Text.class);
     88         job.setOutputValueClass(Text.class);
     89         
     90         job.setInputFormatClass(TextInputFormat.class);
     91         job.setOutputFormatClass(TextOutputFormat.class);
     92         
     93         FileInputFormat.setInputPaths(job, new Path(args[0]));
     94         FileOutputFormat.setOutputPath(job, new Path(args[1]));
     95         
     96         boolean success=job.waitForCompletion(true);
     97         return success?0:1;
     98         
     99     }
    100     
    101     public static void main(String[] args) throws Exception{
    102         int ret=ToolRunner.run(new Dedpu(), args);
    103         System.exit(ret);
    104     }
    105 }
  • 相关阅读:
    js rsa sign使用笔记(加密,解密,签名,验签)
    金额的计算
    常用js方法集合
    sourceTree 的使用
    node-- express()模块
    详细讲解vue.js里的父子组件通信(props和$emit)
    Vue -- vue-cli webpack打包开启Gzip 报错
    es6函数的rest参数和拓展运算符(...)的解析
    js中判断对象数据类型的方法
    vue学习之vue基本功能初探
  • 原文地址:https://www.cnblogs.com/6tian/p/3829178.html
Copyright © 2020-2023  润新知