• hadoop的wordcount的改动版



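    // Modified version of the stock Hadoop WordCount: the delimiter set used for tokenizing was replaced (punctuation as well as whitespace), and word-frequency counting now ignores letter case.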

     

    package org.apache.hadoop.mapred;

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.Iterator;
    import java.util.List;
    import java.util.Locale;
    import java.util.StringTokenizer;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.FileInputFormat;
    import org.apache.hadoop.mapred.FileOutputFormat;
    import org.apache.hadoop.mapred.JobClient;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.mapred.MapReduceBase;
    import org.apache.hadoop.mapred.Mapper;
    import org.apache.hadoop.mapred.OutputCollector;
    import org.apache.hadoop.mapred.Reducer;
    import org.apache.hadoop.mapred.Reporter;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;

     

     

    public classWordCount extends Configured implements Tool {

     

      /*

    这个类实现mapper接口的map方法,输入的是文本总的每一行。

    利用StringTokenizer将字符串拆分成单词。然后将输出结果(word, 1)写入到OutputCollector中去

    OutputCollectorhadoop框架提供,负责收集mapperreducer的输出数据,实现map函数和reduce函数时。仅仅须要将输出的<key,value>对向OutputCollector一丢就可以,其余的事情框架会自己处理。

       */

      public static class MapClass extendsMapReduceBase

        implements Mapper<LongWritable, Text,Text, IntWritable> {

       

        private final static IntWritable one = newIntWritable(1);

        private Text word = new Text();

    /*类中的LongWritable,  Text, IntWritablehadoop中实现的用于封装Java数据类型的类,这些类都可以被串行化从而便于在分布式系统中进行数据交换。可以将它们等同的视为long,string,int的替代品

    */

        public void map(LongWritable key, Textvalue,

                        OutputCollector<Text,IntWritable> output,

                        Reporter reporter) throwsIOException {

          String line = value.toString();

          StringTokenizer itr = new StringTokenizer(line,” f,. : ; ? ! [] ‘ ”);

                //原来仅仅是用空格来分词。如今利用标点和空格等进行分词

          while (itr.hasMoreTokens()) {

            word.set(itr.nextToken().toLowerCase());//单词统计的时候忽略大写和小写

            output.collect(word, one);//输出结果(word1

          }

        }

      }

     

      /*

           此类实现的是Reducer接口中的reduce方法。函数中的參数key.value是由mapper输出的中间结果。values是一个iterator(迭代器)

      */

      public static class Reduce extendsMapReduceBase

        implements Reducer<Text, IntWritable,Text, IntWritable> {

       

        public void reduce(Text key,Iterator<IntWritable> values,

                           OutputCollector<Text,IntWritable> output,

                           Reporter reporter)throws IOException {

          int sum = 0;

    /*

    遍历这个迭代器,就行得到有同样的key的全部的value值。

    此处的key是一个单词,而value则是词频

    */

          while (values.hasNext()) {

            sum += values.next().get();

          }

             //遍历后得到这个单词出现的总次数

          output.collect(key, newIntWritable(sum));

        }

      }

     

      static int printUsage() {

        System.out.println("wordcount [-m<maps>] [-r <reduces>] <input> <output>");//输入输入路径

       ToolRunner.printGenericCommandUsage(System.out);

        return -1;

      }

     

      /*

                Wordcountmap/reduce项目的主要驱动程序,调用此方法提交的map / reduce任务。

    hadoop中一次计算任务成为一个job,能够通过以一个JobConf对象设置怎样执行这个job,此处定义了输出的key 类型是text,value的类型是IntWritable

      */

      public int run(String[] args) throwsException {

        JobConf conf = new JobConf(getConf(),WordCount.class);

        conf.setJobName("wordcount");

     

        // key是text(words)

        conf.setOutputKeyClass(Text.class);

        // value是IntWritable (ints)

       conf.setOutputValueClass(IntWritable.class);

       

        conf.setMapperClass(MapClass.class);       

        conf.setCombinerClass(Reduce.class);

        conf.setReducerClass(Reduce.class);

       

        List<String> other_args = newArrayList<String>();

        for(int i=0; i < args.length; ++i) {

          try {

            if ("-m".equals(args[i])) {

              conf.setNumMapTasks(Integer.parseInt(args[++i]));

            } else if("-r".equals(args[i])) {

             conf.setNumReduceTasks(Integer.parseInt(args[++i]));

            } else {

              other_args.add(args[i]);

            }

          } catch (NumberFormatException except) {

            System.out.println("ERROR: Integerexpected instead of " + args[i]);

            return printUsage();

          } catch (ArrayIndexOutOfBoundsExceptionexcept) {

            System.out.println("ERROR:Required parameter missing from " +

                               args[i-1]);

            return printUsage();

          }

        }

        // Make sure there are exactly 2 parametersleft.

        if (other_args.size() != 2) {

          System.out.println("ERROR: Wrongnumber of parameters: " +

                             other_args.size() +" instead of 2.");

          return printUsage();

        }

        FileInputFormat.setInputPaths(conf,other_args.get(0));

        FileOutputFormat.setOutputPath(conf, newPath(other_args.get(1)));

           

        JobClient.runJob(conf);

        return 0;

      }

     

     

      public static void main(String[] args) throwsException {

                /* ToolRunnerrun方法開始,run方法有三个參数。第一个是Configuration类的实例,第二个是wordcount的实例,args则是从控制台接收到的命令行数组

                */

        int res = ToolRunner.run(newConfiguration(), new WordCount(), args);

        System.exit(res);

      }

     

    }

     

  • 相关阅读:
    javascript 学习笔记
    vim折叠设置(转载)
    描述符
    python运算符优先级
    python repr()和str()
    python super()
    [深入Python]__new__和__init__
    python中,类方法和静态方法区别。
    python中 os._exit() 和 sys.exit(), exit(0)和exit(1) 的用法和区别
    关于字符集和字符编码自己汇总记录
  • 原文地址:https://www.cnblogs.com/lcchuguo/p/5312844.html
Copyright © 2020-2023  润新知