• 原生MapReduce开发样例


    一、需求

    data:  将相同名字的记录合并为一条,并计算出该名字对应分数的平均值(整数,向下取整)
    
    tom 12
    小明  23
    jerry  45
    2哈 34
    tom  45
    tom   65
    小明  34
    

    二、编码

    1.导入jar包

    2.编码

    2.1Map编写

    package com.wzy.studentscore;
    
    import java.io.IOException;
    import java.util.StringTokenizer;
    
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    
    /**
    * @author:吴兆跃
    * @version 创建时间:2018年6月5日 下午5:58:55
    * 类说明:
    */
    public class ScoreMap extends Mapper<LongWritable, Text, Text, IntWritable>{
        @Override
        public void map(LongWritable key, Text value, Context context) 
                throws IOException,InterruptedException{
            
            String line = value.toString(); //一行的数据
            StringTokenizer tokenizerArticle = new StringTokenizer(line, "
    ");
            
            System.out.println("key: "+key);
            System.out.println("value-line: "+line);
            System.out.println("count: "+tokenizerArticle.countTokens());
            
            while(tokenizerArticle.hasMoreTokens()){
                String token = tokenizerArticle.nextToken();
                System.out.println("token: "+token);
                
                StringTokenizer tokenizerLine = new StringTokenizer(token);
                String strName = tokenizerLine.nextToken(); // 得到name
                String strScore = tokenizerLine.nextToken(); // 得到分数
                
                Text name = new Text(strName);
                int scoreInt = Integer.parseInt(strScore);
                context.write(name, new IntWritable(scoreInt));
                
            }
            System.out.println("context: "+context.toString());
        }
    
    }

    2.2Reduce编写

    package com.wzy.studentscore;
    
    import java.io.IOException;
    import java.util.Iterator;
    
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    
    /**
     * Reducer that collapses all scores recorded for one name into their
     * integer average.
     *
     * <p>The result is only correct when a single call sees every score for the
     * key: an average of partial averages is generally not the overall average,
     * so this class must not be applied to partial groups.
     *
     * @author 吴兆跃
     * @version created 2018-06-05 18:50:28
     */
    public class ScoreReduce extends Reducer<Text, IntWritable, Text, IntWritable>{
    
        /**
         * Sums the scores for {@code key}, divides by their count and emits the
         * truncated (integer-division) average.
         *
         * @param key     the name shared by all {@code values}
         * @param values  the scores collected for that name
         * @param context sink for the {@code (name, average)} pair
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException{
            
            // Accumulate in long so that many large scores cannot overflow int.
            long sum = 0L;
            long count = 0L;
            for(IntWritable score : values){
                sum += score.get();
                count++;
            }
            if(count == 0L){
                // Nothing to average; avoid dividing by zero.
                return;
            }
            // Integer division truncates toward zero. The quotient lies between the
            // smallest and largest int input, so narrowing back to int is safe.
            int average = (int)(sum / count);
            context.write(key, new IntWritable(average));
        }
        
        
    }

    2.3运行类编写

    package com.wzy.studentscore;
    
    
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;
    
    /**
     * Driver that configures and submits the score-averaging job
     * ({@link ScoreMap} followed by {@link ScoreReduce}).
     *
     * <p>Usage: {@code ScoreProcess [inputPath [outputPath]]}. Missing paths
     * fall back to {@code "input"} and {@code "output"}.
     *
     * @author 吴兆跃
     * @version created 2018-06-05 18:59:29
     */
    public class ScoreProcess extends Configured implements Tool{
    
        /** Input path used when none is given on the command line. */
        private static final String DEFAULT_INPUT = "input";
        /** Output path used when none is given on the command line. */
        private static final String DEFAULT_OUTPUT = "output";
    
        /**
         * Entry point: runs the job through {@link ToolRunner} and exits with the
         * job's status code.
         *
         * @param args optional input and output paths
         * @throws Exception if the job cannot be configured or submitted
         */
        public static void main(String[] args) throws Exception {
            // Forward the real arguments; run() supplies the defaults when they are absent.
            int ret = ToolRunner.run(new ScoreProcess(), args);
            System.exit(ret);
        }
        
        /**
         * Builds the job, waits for it to finish and reports the outcome.
         *
         * @param args {@code args[0]} is the input path and {@code args[1]} the
         *             output path; either may be omitted
         * @return 0 if the job succeeded, 1 otherwise
         * @throws Exception if the job cannot be configured or submitted
         */
        @Override
        public int run(String[] args) throws Exception {
            String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT;
            String outputPath = args.length > 1 ? args[1] : DEFAULT_OUTPUT;
            
            Job job = new Job(getConf());
            job.setJarByClass(ScoreProcess.class);
            job.setJobName("score_process");
            
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            
            job.setMapperClass(ScoreMap.class);
            // No combiner on purpose: ScoreReduce emits an average, and averaging
            // partial averages does not yield the overall average, so running it
            // on partial map output would corrupt the result.
            job.setReducerClass(ScoreReduce.class);
            
            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);
            
            FileInputFormat.setInputPaths(job, new Path(inputPath));
            FileOutputFormat.setOutputPath(job, new Path(outputPath));
            
            boolean success = job.waitForCompletion(true);
            return success ? 0 : 1;
        }
    
    }

    3.打包

     三、调试

    1. java本地运行

    root@master:/home/wzy/software/hadoop-0.20.2/testfile/ScoreProcess# ls
    input  part  scoreProcess.jar
    root@master:/home/wzy/software/hadoop-0.20.2/testfile/ScoreProcess# java -jar scoreProcess.jar 
    Jun 06, 2018 5:28:26 AM org.apache.hadoop.util.NativeCodeLoader <clinit>
    WARNING: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
    Jun 06, 2018 5:28:26 AM org.apache.hadoop.mapreduce.lib.input.FileInputFormat listStatus
    INFO: Total input paths to process : 1
    Jun 06, 2018 5:28:26 AM org.apache.hadoop.io.compress.snappy.LoadSnappy <clinit>
    WARNING: Snappy native library not loaded
    Jun 06, 2018 5:28:27 AM org.apache.hadoop.mapred.JobClient monitorAndPrintJob
    INFO: Running job: job_local1903623691_0001
    Jun 06, 2018 5:28:27 AM org.apache.hadoop.mapred.LocalJobRunner$Job$MapTaskRunnable run
    INFO: Starting task: attempt_local1903623691_0001_m_000000_0
    Jun 06, 2018 5:28:27 AM org.apache.hadoop.mapred.LocalJobRunner$Job run
    INFO: Waiting for map tasks
    Jun 06, 2018 5:28:27 AM org.apache.hadoop.util.ProcessTree isSetsidSupported
    INFO: setsid exited with exit code 0
    Jun 06, 2018 5:28:27 AM org.apache.hadoop.mapred.Task initialize
    INFO:  Using ResourceCalculatorPlugin : org.apache.hadoop.util.LinuxResourceCalculatorPlugin@5ddf714a
    Jun 06, 2018 5:28:27 AM org.apache.hadoop.mapred.MapTask runNewMapper
    INFO: Processing split: file:/home/wzy/software/hadoop-0.20.2/testfile/ScoreProcess/input/data:0+72
    Jun 06, 2018 5:28:27 AM org.apache.hadoop.mapred.MapTask$MapOutputBuffer <init>
    INFO: io.sort.mb = 100
    Jun 06, 2018 5:28:27 AM org.apache.hadoop.mapred.MapTask$MapOutputBuffer <init>
    INFO: data buffer = 79691776/99614720
    Jun 06, 2018 5:28:27 AM org.apache.hadoop.mapred.MapTask$MapOutputBuffer <init>
    INFO: record buffer = 262144/327680
    key: 0
    value-line: tom 12
    count: 1
    token: tom 12
    context: org.apache.hadoop.mapreduce.Mapper$Context@41b9bff9
    key: 8
    value-line: 小明  23
    count: 1
    token: 小明  23
    context: org.apache.hadoop.mapreduce.Mapper$Context@41b9bff9
    key: 20
    value-line: jerry  45
    count: 1
    token: jerry  45
    context: org.apache.hadoop.mapreduce.Mapper$Context@41b9bff9
    key: 31
    value-line: 哈2  34
    count: 1
    token: 哈2  34
    context: org.apache.hadoop.mapreduce.Mapper$Context@41b9bff9
    key: 41
    value-line: tom  45
    count: 1
    token: tom  45
    context: org.apache.hadoop.mapreduce.Mapper$Context@41b9bff9
    key: 50
    value-line: tom   65
    count: 1
    token: tom   65
    context: org.apache.hadoop.mapreduce.Mapper$Context@41b9bff9
    key: 60
    value-line: 小明  34
    count: 1
    token: 小明  34
    context: org.apache.hadoop.mapreduce.Mapper$Context@41b9bff9
    root@master:/home/wzy/software/hadoop-0.20.2/testfile/ScoreProcess# ls
    input  output  part  scoreProcess.jar
    root@master:/home/wzy/software/hadoop-0.20.2/testfile/ScoreProcess# cd output/
    root@master:/home/wzy/software/hadoop-0.20.2/testfile/ScoreProcess/output# ls
    part-r-00000  _SUCCESS
    root@master:/home/wzy/software/hadoop-0.20.2/testfile/ScoreProcess/output# cat part-r-00000 
    jerry    45
    tom    40
    哈2    34
    小明    28

    2. 在hadoop hdfs上运行

    2.1 data文件上传到hdfs

    root@master:/home/wzy/software/hadoop-0.20.2/testfile/ScoreProcess# ../../bin/hadoop fs -mkdir /user
    root@master:/home/wzy/software/hadoop-0.20.2/testfile/ScoreProcess# ../../bin/hadoop fs -mkdir /user/root
    root@master:/home/wzy/software/hadoop-0.20.2/testfile/ScoreProcess# ../../bin/hadoop fs -mkdir /user/root/input
    root@master:/home/wzy/software/hadoop-0.20.2/testfile/ScoreProcess# ../../bin/hadoop fs -put input/data /user/root/input
    root@master:/home/wzy/software/hadoop-0.20.2/testfile/ScoreProcess# ../../bin/hadoop fs -ls /user/root/input
    Found 1 items
    -rw-r--r--   1 root supergroup         72 2018-06-06 04:00 /user/root/input/data

    2.2 运行

    root@master:/home/wzy/software/hadoop-0.20.2/testfile/ScoreProcess# ../../bin/hadoop jar scoreProcess.jar 
    18/06/06 04:00:52 INFO input.FileInputFormat: Total input paths to process : 1
    18/06/06 04:00:53 INFO mapred.JobClient: Running job: job_201806060358_0002
    18/06/06 04:00:54 INFO mapred.JobClient:  map 0% reduce 0%
    18/06/06 04:01:02 INFO mapred.JobClient:  map 100% reduce 0%
    18/06/06 04:01:14 INFO mapred.JobClient:  map 100% reduce 100%
    18/06/06 04:01:16 INFO mapred.JobClient: Job complete: job_201806060358_0002
    18/06/06 04:01:16 INFO mapred.JobClient: Counters: 17
    18/06/06 04:01:16 INFO mapred.JobClient:   Map-Reduce Framework
    18/06/06 04:01:16 INFO mapred.JobClient:     Combine output records=4
    18/06/06 04:01:16 INFO mapred.JobClient:     Spilled Records=8
    18/06/06 04:01:16 INFO mapred.JobClient:     Reduce input records=4
    18/06/06 04:01:16 INFO mapred.JobClient:     Reduce output records=4
    18/06/06 04:01:16 INFO mapred.JobClient:     Map input records=7
    18/06/06 04:01:16 INFO mapred.JobClient:     Map output records=7
    18/06/06 04:01:16 INFO mapred.JobClient:     Map output bytes=65
    18/06/06 04:01:16 INFO mapred.JobClient:     Reduce shuffle bytes=52
    18/06/06 04:01:16 INFO mapred.JobClient:     Combine input records=7
    18/06/06 04:01:16 INFO mapred.JobClient:     Reduce input groups=4
    18/06/06 04:01:16 INFO mapred.JobClient:   FileSystemCounters
    18/06/06 04:01:16 INFO mapred.JobClient:     HDFS_BYTES_READ=72
    18/06/06 04:01:16 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=136
    18/06/06 04:01:16 INFO mapred.JobClient:     FILE_BYTES_READ=52
    18/06/06 04:01:16 INFO mapred.JobClient:     HDFS_BYTES_WRITTEN=34
    18/06/06 04:01:16 INFO mapred.JobClient:   Job Counters 
    18/06/06 04:01:16 INFO mapred.JobClient:     Launched map tasks=1
    18/06/06 04:01:16 INFO mapred.JobClient:     Launched reduce tasks=1
    18/06/06 04:01:16 INFO mapred.JobClient:     Data-local map tasks=1

    2.3 查看结果

    root@master:/home/wzy/software/hadoop-0.20.2/testfile/ScoreProcess# ../../bin/hadoop fs -ls /user/root/output/
    Found 2 items
    drwxr-xr-x   - root supergroup          0 2018-06-06 04:00 /user/root/output/_logs
    -rw-r--r--   1 root supergroup         34 2018-06-06 04:01 /user/root/output/part-r-00000
    root@master:/home/wzy/software/hadoop-0.20.2/testfile/ScoreProcess# ../../bin/hadoop fs -get /user/root/output/part-r-00000 part
    root@master:/home/wzy/software/hadoop-0.20.2/testfile/ScoreProcess# ls
    input  output  part  scoreProcess.jar
    root@master:/home/wzy/software/hadoop-0.20.2/testfile/ScoreProcess# cat part
    jerry    45
    tom    40
    2哈    34
    小明    28
  • 相关阅读:
    堆排序
    剑指 Offer 59
    面试题:happen-before原则和as-if-serial语义
    面试题:Redis的持久化机制是什么?各自的优缺点?
    面试题:单线程redis还这么快
    面试题:微服务理论
    wait和notify
    线程八锁
    面试题:在静态方法和非静态方法上加 Synchronized的区别
    面试题:3种线程阻塞唤醒的对比
  • 原文地址:https://www.cnblogs.com/wwzyy/p/9144633.html
Copyright © 2020-2023  润新知