• 在map端使用关联数组实现wordcount


      今天读《Data-Intensive Text Processing with MapReduce》这本书的第三章时,看到书中介绍了如何在map端优化wordcount。

      在对海量数据进行分布式处理时,影响处理速度的一个非常重要的因素是map输出的中间结果:这些中间数据在传送到reduce的过程中需要进行交换以及一些相应的处理,然后才交给对应的reduce。中间数据不仅要通过网络传输,在发送到网络之前还要先写到本地磁盘上;而与其他操作相比,网络传输和磁盘I/O都非常耗时。因此,通过使用combiner函数或者其他方式减少key-value对的个数,从而减少中间数据的传输量,可以提高算法的执行效率。下面是一个改进的wordcount算法。

      基本的思想是:

      在map端定义一个关联数组,处理文档时把<word,次数>记录到关联数组中:如果word已经存在,就将相应的次数加1;如果不存在,就直接加入到关联数组中(次数为1)。当前map任务处理完全部输入记录之后(即super.run返回之后),再在run函数中统一输出处理结果。

    伪代码(这里的关联数组H定义在Map方法内部;后面的Java代码则把关联数组定义为Mapper的成员变量,在多次map调用之间累加):

    class Mapper

      method Map(docid a,doc d)

                H =new AssociativeArray

         for all term t 属于doc  d  do

                         H{t}=H{t}+1;

                     for all term t 属于 H do

                    EMIT(term t,count H{t})

    class REDUCER

         method REDUCE(term t,counts[c1,c2,...])

                    sum=0

                   for  all count c 属于 counts[c1,c2,...]  do

                       sum+=c

                 EMIT(term t,count sum)

    代码如下:

    import java.io.IOException;
    import java.io.InputStream;
    import java.net.URI;
    import java.util.HashMap;
    import java.util.Iterator;
    import java.util.Map;
    import java.util.StringTokenizer;
    import java.util.Map.Entry;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FSDataInputStream;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper.Context;
    import org.apache.hadoop.mapreduce.lib.input.FileSplit;
    import org.apache.hadoop.util.LineReader;


    public class Mapper extends
    org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, Text, IntWritable> {

    int c;
    HashMap<String,IntWritable> map=new HashMap<String,IntWritable>();
    @Override
    protected void map(LongWritable key, Text value,
    Context context)
    throws IOException, InterruptedException {
    String str=value.toString();
    StringTokenizer token=new StringTokenizer(str);
    while(token.hasMoreTokens()){
    String value1=token.nextToken();
    if(map.containsKey(value1)){
    //System.out.println("ni");
    int p=map.get(value1).get()+1;
    map.remove(value1);
    map.put(value1, new IntWritable(p));
    }
    else{
    //System.out.println("ni");
    map.put(value1, new IntWritable(1));
    }
    }
    // TODO Auto-generated method stub

    c++;
    System.out.println(c);



    }
    @Override
    protected void cleanup(org.apache.hadoop.mapreduce.Mapper.Context context)
    throws IOException, InterruptedException {
    // TODO Auto-generated method stub
    System.out.println("cleanup");
    super.cleanup(context);
    }

    @Override
    public void run(Context context) throws IOException, InterruptedException {
    // TODO Auto-generated method stub
    super.run(context);
    System.out.println("run");
    Iterator it=map.entrySet().iterator();
    while(it.hasNext()){
    //System.out.println("nihe");
    Map.Entry<String, IntWritable> entry=(Map.Entry<String, IntWritable>) it.next();
    //System.out.println("nihe");
    context.write(new Text(entry.getKey()), entry.getValue());

    }

    }

    @Override
    protected void setup(org.apache.hadoop.mapreduce.Mapper.Context context)
    throws IOException, InterruptedException {
    // TODO Auto-generated method stub
    // System.out.println(context.getInputSplit().toString());
            // System.out.println(context.getJobID());
      // FileSplit input=(FileSplit)context.getInputSplit();
    // String path=input.getPath().toString();
    // Configuration conf=new Configuration();
      // System.out.println(input.getPath().toString());
       // FileSystem fs=FileSystem.get(URI.create(path), conf);
    // FSDataInputStream filein=fs.open(input.getPath());
       //  LineReader in=new LineReader(filein,conf);
    // Text line=new Text();
    //  int cd=in.readLine(line);
    //   System.out.println(line);
         }
     }



    import java.io.IOException;

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;


    /**
     * Reducer that adds up all partial counts received for a word and writes
     * a single {@code (word, total)} pair.
     */
    public class Reducer extends
            org.apache.hadoop.mapreduce.Reducer<Text, IntWritable, Text, IntWritable> {

        /**
         * Sums the counts for one key and emits the total.
         *
         * @param key     the word being reduced
         * @param values  partial counts collected for {@code key}
         * @param context job context that receives the {@code (key, total)} pair
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int total = 0;
            for (IntWritable partial : values) {
                total = total + partial.get();
            }
            IntWritable result = new IntWritable(total);
            context.write(key, result);
        }
    }



    import java.io.IOException;
    import java.net.URI;



    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


    /**
     * Driver for the word-count job: wires the {@code Mapper} and
     * {@code Reducer} classes of this program into a Hadoop job and runs it.
     */
    public class Word {

        /**
         * Configures and runs the word-count job.
         *
         * <p>Any existing output path is deleted recursively before the job
         * starts. The process exits with status 2 when fewer than two arguments
         * are given and with status 1 when the job does not complete
         * successfully.
         *
         * @param args {@code args[0]} is the input path, {@code args[1]} is the
         *             output path
         * @throws IOException            if file-system access or job submission fails
         * @throws InterruptedException   if the thread is interrupted while waiting
         *                                for the job
         * @throws ClassNotFoundException if a job class cannot be resolved
         */
        public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
            if (args.length < 2) {
                System.err.println("Usage: Word <input path> <output path>");
                System.exit(2);
            }

            // Build the job from the same Configuration used for file-system
            // access. (On Hadoop 2+ Job.getInstance(conf) is the preferred form;
            // the constructor is kept for compatibility with older releases.)
            Configuration conf = new Configuration();
            Job job = new Job(conf);
            // Lets Hadoop locate the jar containing these classes on a cluster.
            job.setJarByClass(Word.class);

            Path in = new Path(args[0]);
            Path out = new Path(args[1]);

            // Remove any previous output so the job can write to the path again.
            FileSystem fs = FileSystem.get(URI.create(args[1]), conf);
            fs.delete(out, true);

            FileInputFormat.addInputPath(job, in);
            FileOutputFormat.setOutputPath(job, out);

            job.setMapperClass(Mapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);

            // Register the summing reducer; without it the partial counts
            // emitted by different mappers are never added together.
            job.setReducerClass(Reducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);

            if (!job.waitForCompletion(false)) {
                System.exit(1);
            }
        }

    }




  • 相关阅读:
    ValueError: max() arg is an empty sequence
    链接到镜像
    SparkStreaming+Kafka
    软件质量六大属性—
    架构之美3
    架构之美2
    架构之美-读书笔记之一
    机器学习四正则化(Regularization)
    机器学习三--各种差
    机器学习系列(二)——回归模型
  • 原文地址:https://www.cnblogs.com/dlutxm/p/2223055.html
Copyright © 2020-2023  润新知