hadoop 把mapreduce任务从本地提交到hadoop集群上运行

MapReduce任务有三种运行方式：

1、windows（linux）本地调试运行，需要本地hadoop环境支持

2、本地编译成jar包，手动发送到hadoop集群上用hadoop jar或者yarn jar方式运行。

3、本地编译环境在IDE里直接提交到集群上运行，实际上这种方式就是第二种方式的变种。

本例说的就是第三种方式

1）核心的部分就是Configuration的配置
2）本地需要编译成jar包
3）运行参数在本地配置，包括输入输出参数
4）如果出现windows下的环境配置问题，可以参照https://www.cnblogs.com/asker009/p/10348188.html

关键运行代码如下：mapper和reducer就不贴出来了，可以看之前的https://www.cnblogs.com/asker009/p/10337598.html

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.net.URI;

public class WordCount {

    private static String HDFSUri = "hdfs://bigdata-senior01.home.com:9000";

    public static void main(String[] args) throws Exception {
        if(args.length!=2)
        {
            System.err.println("使用格式：WordCount <input path> <output path>");
            System.exit(-1);
        }

        long startTime = System.currentTimeMillis();
        //Configuration类代表作业的配置，该类会加载mapred-site.xml、hdfs-site.xml、core-site.xml等配置文件。
        Configuration conf =new Configuration();


        //本地模式运行mr程序时，输入输出的数据可以在本地，也可以在hdfs上
        //到底在哪里，就看以下两行配置你用哪行，默认就是file:///
        conf.set("fs.defaultFS","hdfs://bigdata-senior01.home.com:9000");
//        conf.set("fs.defaultFS", "file:///");

        //本地提交到集群上运行
        //运行集群模式，就是把程序提交到yarn中去运行
        //要想运行为集群模式，以下5个参数要指定为集群上的值（实际上就是hadoop集群上的配置）
        //还需要把hadoop集群上core-site.xml,yarn-site.xml,mapred-site.xml拷贝到resources目录下或者把这几个文件的核心配置写入conf变量
        //如果是把程序打包成jar,hadoop jar运行，不需要写下面，因为hadoop jar脚本自动把集群中配置好的配置文件加载给该程序
        conf.set("mapreduce.framework.name", "yarn");
        conf.set("yarn.nodemanager.aux-services","mapreduce_shuffle");
        conf.set("yarn.resourcemanager.hostname", "bigdata-senior01.home.com");
        conf.set("hadoop.tmp.dir","/opt/data/tmp");
        conf.set("mapreduce.application.classpath","/opt/modules/hadoop-3.1.0/share/hadoop/mapreduce/*, /opt/modules/hadoop-3.1.0/share/hadoop/mapreduce/lib-examples/*");

        //跨平台提交
        conf.set("mapreduce.app-submission.cross-platform", "true");
        //设置mapred.jar的路径,不然会报找不到，设置的内容就是本例中输出的jar包
        conf.set("mapred.jar","E:\myProgram\Java\wordcount\out\artifacts\wordcount_jar\wordcount.jar");


        //如果实在非hadoop用户环境下提交任务
        System.setProperty("HADOOP_USER_NAME","hadoop");
        System.out.println("HADOOP_USER_NAME: "+System.getProperty("HADOOP_USER_NAME"));

        

        Path outPath = new Path(args[1]);
        //FileSystem里面包括很多系统，不局限于hdfs
        FileSystem fileSystem = FileSystem.get(URI.create(HDFSUri),conf);
        //删除输出路径
        if(fileSystem.exists(outPath))
        {
            fileSystem.delete(outPath,true);
        }

        Job job = Job.getInstance(conf,"word count"); // new Job(conf, "word count");
        job.setJarByClass(WordCount.class);

        job.setMapperClass(WordCountMapper.class);
        //Combiner最终不能影响reduce输出的结果
//        job.setCombinerClass(WordCountReducer.class);
        job.setReducerClass(WordCountReducer.class);

        //一般情况下mapper和reducer的输出的数据类型是一样的，所以我们用上面两条命令就行，如果不一样，我们就可以用下面两条命令单独指定mapper的输出key、value的数据类型
        //job.setMapOutputKeyClass(Text.class);
        //job.setMapOutputValueClass(IntWritable.class);
        //hadoop默认的是TextInputFormat和TextOutputFormat,所以说我们这里可以不用配置。
//        job.setInputFormatClass(TextInputFormat.class);
//        job.setOutputFormatClass(TextOutputFormat.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);


        //指定的这个路径可以是单个文件、一个目录或符合特定文件模式的一系列文件。
        //从方法名称可以看出，可以通过多次调用这个方法来实现多路径的输入。
        FileInputFormat.addInputPath(job,new Path(args[0]));
        FileOutputFormat.setOutputPath(job,new Path(args[1]));


        boolean result = job.waitForCompletion(true);

        long endTime = System.currentTimeMillis();
        long timeSpan = endTime - startTime;
        System.out.println("运行耗时："+timeSpan+"毫秒。");

        System.exit( result ? 0 : 1);

    }
}

相关阅读:
使用SWFUpload进行多文件上传
 TSQL递归
 Silverlight之视频录制
 Silverlight之摄像头麦克风使用
 Silverlight之文件上传组件
 SQL FOR XML
XAML开发入门之XAML核心语法
 Ajax技术三种实现方式之xmlhttp+httphandler篇（三）
Ext中超时设定 ext.ajax.timeout
后台执行js先执行前端的JS函数,再执行后台函数的按钮实
原文地址：https://www.cnblogs.com/asker009/p/10349162.html