• WordCount in MapReduce


    package com.mengyao.hadoop.mapreduce;

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;
    /**
     * The input is the HDFS file /mapreduces/word.txt, whose contents are:
     *         hadoop    zookeeper    hbase    hive
     *         flume    sqoop    pig    mahout
     *         hadoop    spark    mllib    hive    zookeeper
     *         hadoop    storm    kafka    redis    zookeeper
     *
     * The output directory is /mapreduces/wordcount/ on HDFS:
     *     an empty _SUCCESS file marks a successful job (it is absent when the job fails);
     *     part-r-00000 holds the job's result, in this case:
     *      flume    1
     *      hadoop    3
     *      hbase    1
     *      hive    2
     *      kafka    1
     *      mahout    1
     *      mllib    1
     *      pig    1
     *      redis    1
     *      spark    1
     *      sqoop    1
     *      storm    1
     *      zookeeper    3
     *
     * @author mengyao
     *
     */
    public class WordCount extends Configured implements Tool {

        static class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

            private Text outputKey;
            private LongWritable outputValue;

            @Override
            protected void setup(Context context)
                    throws IOException, InterruptedException {
                // Reuse one Text/LongWritable pair across all map() calls instead of
                // allocating new Writable objects for every record.
                this.outputKey = new Text();
                this.outputValue = new LongWritable(1L);
            }

            @Override
            protected void map(LongWritable key, Text value, Context context)
                    throws IOException, InterruptedException {
                // Each input line holds tab-separated words; emit (word, 1) for each.
                final String[] words = value.toString().split("\t");
                for (String word : words) {
                    if (word.isEmpty()) {
                        continue; // guard against empty tokens from stray tabs
                    }
                    this.outputKey.set(word);
                    context.write(this.outputKey, this.outputValue);
                }
            }
        }

        static class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

            private Text outputKey;
            private LongWritable outputValue;

            @Override
            protected void setup(Context context)
                    throws IOException, InterruptedException {
                this.outputKey = new Text();
                this.outputValue = new LongWritable();
            }

            @Override
            protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                    throws IOException, InterruptedException {
                // Sum the partial counts for this word and emit the total.
                long count = 0L;
                for (LongWritable item : values) {
                    count += item.get();
                }
                this.outputKey.set(key);
                this.outputValue.set(count);
                context.write(this.outputKey, this.outputValue);
            }
        }

        @Override
        public int run(String[] args) throws Exception {
            Job job = Job.getInstance(getConf(), WordCount.class.getSimpleName());
            job.setJarByClass(WordCount.class);

            job.setInputFormatClass(TextInputFormat.class);
            FileInputFormat.addInputPath(job, new Path(args[0]));
            // The output path must not already exist, or the job fails immediately.
            FileOutputFormat.setOutputPath(job, new Path(args[1]));

            job.setMapperClass(WordCountMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(LongWritable.class);

            // Reuse the reducer as a combiner: summation is associative and commutative,
            // so map-side partial sums are safe and reduce shuffle traffic.
            job.setCombinerClass(WordCountReducer.class);

            job.setReducerClass(WordCountReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(LongWritable.class);

            return job.waitForCompletion(true) ? 0 : 1;
        }

        public static int createJob(String[] args) {
            Configuration conf = new Configuration();
            // Raise the DataNode socket write timeout to 2 hours (value in ms).
            conf.set("dfs.datanode.socket.write.timeout", "7200000");
            // Constrain input splits to between 256 MB and 512 MB.
            conf.set("mapreduce.input.fileinputformat.split.minsize", "268435456");
            conf.set("mapreduce.input.fileinputformat.split.maxsize", "536870912");
            int status = 0;

            try {
                status = ToolRunner.run(conf, new WordCount(), args);
            } catch (Exception e) {
                e.printStackTrace();
            }

            return status;
        }

        public static void main(String[] args) {
            if (args.length != 2) {
                // Fall back to the demo paths from the header comment when no
                // arguments are supplied.
                System.out.println("Usage: " + WordCount.class.getName() + " <INPUT_PATH> <OUTPUT_PATH>");
                args = new String[]{"/mapreduces/word.txt", "/mapreduces/wordcount"};
            }
            System.exit(createJob(args));
        }

    }
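
For quick testing, the job can also run entirely in-process using Hadoop's local mode, with no cluster or HDFS required. The sketch below is illustrative and not part of the original post: it assumes the Hadoop client libraries are on the classpath, and the local paths /tmp/word.txt and /tmp/wordcount-out are hypothetical placeholders.

    package com.mengyao.hadoop.mapreduce;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.util.ToolRunner;

    /** Illustrative only: runs WordCount in-process against the local filesystem. */
    public class WordCountLocalRunner {

        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            // Run the MapReduce job inside this JVM rather than submitting to YARN.
            conf.set("mapreduce.framework.name", "local");
            // Use the local filesystem instead of HDFS.
            conf.set("fs.defaultFS", "file:///");
            // Hypothetical paths; the output directory must not exist yet.
            int status = ToolRunner.run(conf, new WordCount(),
                    new String[]{"/tmp/word.txt", "/tmp/wordcount-out"});
            System.exit(status);
        }
    }

On a real cluster the class is instead packaged into a jar and submitted with hadoop jar, passing the HDFS input and output paths as the two arguments.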
• Original post: https://www.cnblogs.com/mengyao/p/4865561.html