MapReduce编程
有三个文件 file1、file2、file3,文件中每一行都是一个数字,如下所示。
file1.txt:
2
32
654
32
15
756
65223
file2.txt:
5956
11
650
92
file3.txt:
26
54
6
请编写 MapReduce 程序实现如下需求:
MapReduce 程序读取这三个文件,对三个文件中的数字进行整体升序排序,并输出到一个结果文件中,结果文件中的每一行有两个数字(两个数字之间使用制表符分隔),第一个数字代表排名,第二个数字代表原始数据
期望输出:
1 2
2 6
3 11
4 15
5 26
6 32
7 32
8 54
9 92
10 650
11 654
12 756
13 5956
14 65223
homeworkMapper.java
package com.lagou.mr.homework01;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Map side of the global-sort job.
 *
 * <p>Each input line is expected to hold one integer. The number is emitted as the
 * map output <em>key</em> so that the shuffle phase, which orders records by key,
 * delivers the numbers to the reducer in ascending order. The value is a constant
 * placeholder; one record is emitted per input line, so duplicates survive as
 * multiple values under the same key.
 */
public class homeworkMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable> {
    /** Reused output key holding the number parsed from the current line. */
    private final IntWritable mapperValue = new IntWritable();

    /** Reused placeholder value; only the number of occurrences of a key matters downstream. */
    private final IntWritable one = new IntWritable(1);

    /**
     * Parses one line as an integer and emits {@code (number, 1)}.
     *
     * @param key     byte offset of the line within the input split (unused)
     * @param value   the raw text of the line
     * @param context sink for the map output
     * @throws NumberFormatException if a non-blank line is not a valid {@code int}
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        final String num = value.toString().trim();
        // A blank line (typically a trailing newline at the end of a file) carries no
        // number; skip it instead of letting Integer.parseInt fail the whole task.
        if (num.isEmpty()) {
            return;
        }
        mapperValue.set(Integer.parseInt(num));
        context.write(mapperValue, one);
    }
}
homeworkReducer.java
package com.lagou.mr.homework01;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reduce side of the global-sort job: assigns a running rank to every number.
 *
 * <p>Output records are {@code (rank, number)}. The rank is kept in an instance
 * field because the same reducer instance has {@code reduce} invoked once per
 * distinct key, and the counter must carry over from one call to the next.
 */
public class homeworkReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
    /** Rank to assign to the next emitted record; starts at 1 and grows by one per record. */
    private final IntWritable position = new IntWritable(1);

    /**
     * Emits one {@code (rank, key)} record for every value grouped under {@code key}.
     *
     * @param key     the number being ranked
     * @param values  one placeholder per occurrence of {@code key} in the input
     * @param context sink for the reduce output
     */
    @Override
    protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // The values themselves are irrelevant; iterating them yields one output
        // line per occurrence so duplicate numbers each receive their own rank.
        for (IntWritable ignored : values) {
            context.write(position, key);
            // Mutate the reusable Writable rather than allocating a new one per record.
            position.set(position.get() + 1);
        }
    }
}
homeworkDriver.java
package com.lagou.mr.homework01;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class homeworkDriver {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
/*
1. 获取配置文件对象,获取job对象实例
2. 指定程序jar的本地路径
3. 指定Mapper/Reducer类
4. 指定Mapper输出的kv数据类型
5. 指定最终输出的kv数据类型
6. 指定job处理的原始数据路径
7. 指定job输出结果路径
8. 提交作业
*/
final Configuration configuration = new Configuration();
final Job job = Job.getInstance(configuration, "homeworkDriver");
job.setJarByClass(com.lagou.mr.homework01.homeworkDriver.class);
job.setMapperClass(homeworkMapper.class);
job.setReducerClass(homeworkReducer.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(IntWritable.class);
job.setNumReduceTasks(1);
FileInputFormat.setInputPaths(job, new Path("H:\hadoop\learningCode\mapreduce\wordcount\input\homework01"));
FileOutputFormat.setOutputPath(job, new Path("H:\hadoop\learningCode\mapreduce\wordcount\output\homework01\output"));
final boolean flag = job.waitForCompletion(true);
System.exit(flag ? 0 : 1);
}
}