【Hadoop学习之八】MapReduce开发

环境
　　虚拟机：VMware 10
　　Linux版本：CentOS-6.5-x86_64
　　客户端：Xshell4
　　FTP：Xftp4
　　jdk8
　　hadoop-3.1.1

伪分布式：HDFS和YARN 伪分布式搭建，事先启动HDFS和YARN

第一步：开发WordCount示例

package test.mr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


public class MyWC {
    
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        try {
            Job job = Job.getInstance(conf,"word count");
            job.setJarByClass(MyWC.class);
            
            job.setMapperClass(WordMapper.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            
            job.setReducerClass(WordReducer.class);
            job.setNumReduceTasks(1);
            
//            FileInputFormat.addInputPath(job, new Path("hdfs://node1:9820/wjy/input/text.txt"));
//            Path output = new Path("hdfs://node1:9820/wjy/output/");
            
            //注意这里设置的目录是从 HDFS根目录开始的
            FileInputFormat.addInputPath(job, new Path("/wjy/input/text.txt"));
            Path output = new Path("/wjy/output/");
            if (output.getFileSystem(conf).exists(output))
            {
                output.getFileSystem(conf).delete(output,true);
            }
            FileOutputFormat.setOutputPath(job, output);
            
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}

package test.mr;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    // 写在外面 map循环创建会造成内存溢出
    private final static IntWritable one = new IntWritable(1);
    // map写出的数据放到buffer字节数组里 这样word可以继续使用 没有影响
    private Text word = new Text();
    
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
            throws IOException, InterruptedException {
        //StringTokenizer 默认按照空格 制表符 回车等空白符作为分隔符来切分传入的数据
        StringTokenizer st = new StringTokenizer(value.toString());
        while (st.hasMoreTokens()) {
            word.set(st.nextToken());
            context.write(word, one);
        }
    }
}

package test.mr;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    private IntWritable result = new IntWritable();
    
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values,
            Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
        //key:hello
        //values:(1,1,1,1,1,1)
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);
    }
    
    
}

第二步：程序打jar包：MyWC.jar,上传jar和测试文件

[root@node1 ~]# ls
MyWC.jar text.txt
[root@node1 ~]# hdfs dfs -mkdir /wjy/input
[root@node1 ~]# hdfs dfs -mkdir /wjy/output
[root@node1 ~]# hdfs dfs -put /root/text.txt /wjy/input

text.txt文件里面是测试数据：

hello sxt 1

hello sxt 2

hello sxt 3

...

hello sxt 1000000

第三步：运行jar：MyWC.jar

[root@node1 ~]# hadoop jar MyWC.jar test.mr.MyWC
2019-01-15 19:06:04,326 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2019-01-15 19:06:07,698 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
2019-01-15 19:06:09,247 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
2019-01-15 19:06:09,294 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/root/.staging/job_1547546637762_0003
2019-01-15 19:06:10,518 INFO input.FileInputFormat: Total input files to process : 1
2019-01-15 19:06:11,078 INFO mapreduce.JobSubmitter: number of splits:1
2019-01-15 19:06:11,490 INFO Configuration.deprecation: yarn.resourcemanager.system-metrics-publisher.enabled is deprecated. Instead, use yarn.system-metrics-publisher.enabled
2019-01-15 19:06:14,280 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1547546637762_0003
2019-01-15 19:06:14,287 INFO mapreduce.JobSubmitter: Executing with tokens: []
2019-01-15 19:06:15,163 INFO conf.Configuration: resource-types.xml not found
2019-01-15 19:06:15,163 INFO resource.ResourceUtils: Unable to find 'resource-types.xml'.
2019-01-15 19:06:15,934 INFO impl.YarnClientImpl: Submitted application application_1547546637762_0003
2019-01-15 19:06:16,436 INFO mapreduce.Job: The url to track the job: http://node1:8088/proxy/application_1547546637762_0003/
2019-01-15 19:06:16,438 INFO mapreduce.Job: Running job: job_1547546637762_0003
2019-01-15 19:07:48,824 INFO mapreduce.Job: Job job_1547546637762_0003 running in uber mode : false
2019-01-15 19:07:49,614 INFO mapreduce.Job:  map 0% reduce 0%
2019-01-15 19:09:10,176 INFO mapreduce.Job:  map 67% reduce 0%
2019-01-15 19:09:21,123 INFO mapreduce.Job:  map 100% reduce 0%
2019-01-15 19:13:43,544 INFO mapreduce.Job:  map 100% reduce 73%
2019-01-15 19:13:49,599 INFO mapreduce.Job:  map 100% reduce 100%
2019-01-15 19:14:04,717 INFO mapreduce.Job: Job job_1547546637762_0003 completed successfully
2019-01-15 19:14:08,754 INFO mapreduce.Job: Counters: 53
    File System Counters
        FILE: Number of bytes read=34888902
        FILE: Number of bytes written=70205331
        FILE: Number of read operations=0
        FILE: Number of large read operations=0
        FILE: Number of write operations=0
        HDFS: Number of bytes read=17888997
        HDFS: Number of bytes written=8888922
        HDFS: Number of read operations=8
        HDFS: Number of large read operations=0
        HDFS: Number of write operations=2
    Job Counters 
        Launched map tasks=1
        Launched reduce tasks=1
        Data-local map tasks=1
        Total time spent by all maps in occupied slots (ms)=73564
        Total time spent by all reduces in occupied slots (ms)=167987
        Total time spent by all map tasks (ms)=73564
        Total time spent by all reduce tasks (ms)=167987
        Total vcore-milliseconds taken by all map tasks=73564
        Total vcore-milliseconds taken by all reduce tasks=167987
        Total megabyte-milliseconds taken by all map tasks=75329536
        Total megabyte-milliseconds taken by all reduce tasks=172018688
    Map-Reduce Framework
        Map input records=1000000
        Map output records=3000000
        Map output bytes=28888896
        Map output materialized bytes=34888902
        Input split bytes=101
        Combine input records=0
        Combine output records=0
        Reduce input groups=1000002
        Reduce shuffle bytes=34888902
        Reduce input records=3000000
        Reduce output records=1000002
        Spilled Records=6000000
        Shuffled Maps =1
        Failed Shuffles=0
        Merged Map outputs=1
        GC time elapsed (ms)=1134
        CPU time spent (ms)=23710
        Physical memory (bytes) snapshot=381153280
        Virtual memory (bytes) snapshot=5039456256
        Total committed heap usage (bytes)=189894656
        Peak Map Physical memory (bytes)=229081088
        Peak Map Virtual memory (bytes)=2516492288
        Peak Reduce Physical memory (bytes)=152334336
        Peak Reduce Virtual memory (bytes)=2522963968
    Shuffle Errors
        BAD_ID=0
        CONNECTION=0
        IO_ERROR=0
        WRONG_LENGTH=0
        WRONG_MAP=0
        WRONG_REDUCE=0
    File Input Format Counters 
        Bytes Read=17888896
    File Output Format Counters 
        Bytes Written=8888922

第四步：查看下载处理结果

[root@node1 sbin]# hdfs dfs -ls /wjy/output
2019-01-16 00:32:54,137 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 2 items
-rw-r--r--   1 root supergroup          0 2019-01-15 19:13 /wjy/output/_SUCCESS
-rw-r--r--   1 root supergroup    8888922 2019-01-15 19:13 /wjy/output/part-r-00000

[root@node1 ~]# hdfs dfs -get /wjy/output/part-r-00000 ./
[root@node1 ~]# vi part-r-00000
999980  1
999981  1
999982  1
999983  1
999984  1
999985  1
999986  1
999987  1
999988  1
999989  1
99999   1
999990  1
999991  1
999992  1
999993  1
999994  1
999995  1
999996  1
999997  1
999998  1
999999  1
hello   1000000
sxt     1000000

问题1：
[2019-01-15 17:08:05.159]Container killed on request. Exit code is 143
[2019-01-15 17:08:05.182]Container exited with a non-zero exit code 143.
2019-01-15 17:08:20,957 INFO mapreduce.Job: Task Id : attempt_1547542193692_0003_m_000000_2, Status : FAILED
[2019-01-15 17:08:18.963]Container [pid=4064,containerID=container_1547542193692_0003_01_000004] is running 210352640B beyond the 'VIRTUAL' memory limit. Current usage: 26.0 MB of 1 GB physical memory used; 2.3 GB of 2.1 GB virtual memory used. Killing container.

原因：申请内存过大而被终止
解决措施：取消内存检查
配置：yarn-site.xml

<property>
<name>yarn.nodemanager.vmem-check-enabled</name>
<value>false</value>
<description>Whether virtual memory limits will be enforced for containers</description>
</property>

问题2：
2019-01-15 18:51:11,229 INFO mapred.ClientServiceDelegate: Application state is completed. FinalApplicationStatus=SUCCEEDED. Redirecting to job history server
2019-01-15 18:51:12,237 INFO ipc.Client: Retrying connect to server: 0.0.0.0/0.0.0.0:10020. Already tried 0 time(s); retry policy is RetryUpToMaximumCountWithFixedSleep(maxRetries=10, sleepTime=1000 MILLISECONDS)
java.io.IOException: java.net.ConnectException: Your endpoint configuration is wrong; For more details see: http://wiki.apache.org/hadoop/UnsetHostnameOrPort
原因：由于没有启动historyserver引起的
解决办法：
在mapred-site.xml配置文件中添加

<property>  
    <name>mapreduce.jobhistory.address</name>  
    <value>node1:10020</value>  
</property>

在namenode上执行命令：mr-jobhistory-daemon.sh start historyserver
这样在，namenode上会启动JobHistoryServer服务，可以在historyserver的日志中查看运行情况

问题3：

2019-01-21 12:33:59,450 WARN hdfs.DataStreamer: Caught exception
java.lang.InterruptedException
    at java.lang.Object.wait(Native Method)
    at java.lang.Thread.join(Thread.java:1245)
    at java.lang.Thread.join(Thread.java:1319)
    at org.apache.hadoop.hdfs.DataStreamer.closeResponder(DataStreamer.java:986)
    at org.apache.hadoop.hdfs.DataStreamer.endBlock(DataStreamer.java:640)
    at org.apache.hadoop.hdfs.DataStreamer.run(DataStreamer.java:810)

这个网上有说是BUG，也有说是没有按照hadoop约定的规则创建HDFS目录，

对于上传块目录：

格式：hdfs dfs -mkdir -p /user/input

比如使用root用户登录，则创建目录应为：hdfs dfs -mkdir -p /root/input

相关阅读:
表的创建与管理
 以传值和传引用的方式传递参数 IN OUT NOCOPY
PLSQL中的三种参数模式IN、OUT、IN OUT
用python写GPU上的并行计算程序，有什么库或者编译器？
cupy中tensor数据类型与numpy以及pytorch中相互转换
 c++ string split
Java 读取大文件
 Linux 使用系列
 安装以太坊环境
 服务器排查问题相关命令
原文地址：https://www.cnblogs.com/cac2020/p/10274979.html