• Your first Hadoop program: WordCount


The classic first Hadoop program: a MapReduce job that counts how many times each word appears in text files stored on HDFS.

    package test;
    
    import java.io.IOException;
    import java.util.StringTokenizer;

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    /*
     * Author: 翟超科
     * Date:   2019-09-03
     * Task:   word count with MapReduce
     */
    public class WordCount {
        // The map class extends Mapper and implements the map step.
        public static class doMapper extends Mapper<Object, Text, Text, IntWritable>{
            // Constant 1, emitted as the count for every word occurrence.
            private static final IntWritable one = new IntWritable(1);
            // Reused writable holding the current word.
            private final Text word = new Text();
            @Override
            protected void map(Object key, Text value, Context context)
                    throws IOException, InterruptedException {
                // Split the line into words; the default delimiters cover
                // spaces, tabs, and newlines.
                StringTokenizer tokenizer = new StringTokenizer(value.toString());
                // Emit a (word, 1) pair for every word on the line.
                while (tokenizer.hasMoreTokens()) {
                    word.set(tokenizer.nextToken());
                    context.write(word, one);
                }
            }
        }
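        // Example: given the input line "hello world hello", this mapper emits
        // ("hello", 1), ("world", 1), ("hello", 1). The framework then groups
        // the pairs by key, so the reducer receives ("hello", [1, 1]) and
        // ("world", [1]).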
        
        
        // The reduce step sums the counts collected for each distinct word.
        public static class doReduce extends Reducer<Text, IntWritable, Text, IntWritable>{
            // Reused writable holding the final count for the current key.
            private IntWritable result = new IntWritable();
            @Override
            protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                    throws IOException, InterruptedException {
                int sum = 0; // running total of occurrences of this word
                for (IntWritable value : values) {
                    sum += value.get();
                }
                result.set(sum);
                context.write(key, result);
            }
        }
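        // Note: integer addition is associative and commutative, so this reducer
        // can also serve as a combiner, pre-aggregating (word, 1) pairs on the
        // map side to shrink shuffle traffic. To enable it, add
        //     job.setCombinerClass(doReduce.class);
        // to the job setup in main() below.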
        public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
            Job job = Job.getInstance();
            job.setJobName("WordCount");
            job.setJarByClass(WordCount.class);
            job.setMapperClass(doMapper.class);
            job.setReducerClass(doReduce.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            Path in = new Path("hdfs://192.168.13.101:9000/data");   // input directory on HDFS
            Path out = new Path("hdfs://192.168.13.101:9000/output"); // output directory; must not exist yet
            FileInputFormat.addInputPath(job, in);
            FileOutputFormat.setOutputPath(job, out);
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    
    }
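
    Hard-coding the HDFS URI makes the job awkward to move between clusters. A common refinement, sketched below (this driver is not part of the original post, and the class name WordCountDriver is an assumption), is to run the job through ToolRunner so the input and output paths arrive as command-line arguments, register the reducer as a combiner, and delete a stale output directory before submission, since Hadoop refuses to overwrite one:

    package test;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;

    public class WordCountDriver extends Configured implements Tool {
        @Override
        public int run(String[] args) throws Exception {
            if (args.length != 2) {
                System.err.println("Usage: WordCountDriver <input path> <output path>");
                return 2;
            }
            Job job = Job.getInstance(getConf(), "WordCount");
            job.setJarByClass(WordCountDriver.class);
            job.setMapperClass(WordCount.doMapper.class);
            job.setCombinerClass(WordCount.doReduce.class);
            job.setReducerClass(WordCount.doReduce.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            Path out = new Path(args[1]);
            // Hadoop refuses to overwrite an existing output directory,
            // so remove a leftover one before submitting.
            FileSystem fs = FileSystem.get(getConf());
            if (fs.exists(out)) {
                fs.delete(out, true);
            }
            FileInputFormat.addInputPath(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, out);
            return job.waitForCompletion(true) ? 0 : 1;
        }

        public static void main(String[] args) throws Exception {
            System.exit(ToolRunner.run(new Configuration(), new WordCountDriver(), args));
        }
    }

    Package both classes into a jar and submit it with, for example, hadoop jar wordcount.jar test.WordCountDriver /data /output (the jar name is an assumption), then inspect the result with hdfs dfs -cat /output/part-r-00000.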


  • Original post: https://www.cnblogs.com/2016-zck/p/11452487.html