基于MapReduce的数据排序

 1 import java.io.IOException;
 2 import org.apache.hadoop.conf.Configuration;
 3 import org.apache.hadoop.fs.Path;
 4 import org.apache.hadoop.io.IntWritable;
 5 import org.apache.hadoop.io.Text;
 6 import org.apache.hadoop.mapreduce.Job;
 7 import org.apache.hadoop.mapreduce.Mapper;
 8 import org.apache.hadoop.mapreduce.Reducer;
 9 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
10 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
11 import org.apache.hadoop.util.GenericOptionsParser;
12 
13 public class Sort {
14     // map将输入中的value化成IntWritable类型，作为输出的key
15     public static class Map extends
16             Mapper<Object, Text, IntWritable, IntWritable> {
17         private static IntWritable data = new IntWritable();
18 
19         // 实现map函数
20         public void map(Object key, Text value, Context context)
21                 throws IOException, InterruptedException {
22 
23             String line = value.toString();
24 
25             data.set(Integer.parseInt(line));
26 
27             context.write(data, new IntWritable(1));
28         }
29     }
30 
31     // reduce将输入中的key复制到输出数据的key上，
32 
33     // 然后根据输入的value-list中元素的个数决定key的输出次数
34 
35     // 用全局linenum来代表key的位次
36 
37     public static class Reduce extends
38 
39     Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
40         private static IntWritable linenum = new IntWritable(1);
41 
42         // 实现reduce函数
43         public void reduce(IntWritable key, Iterable<IntWritable> values,
44                 Context context)
45 
46         throws IOException, InterruptedException {
47 
48             for (IntWritable val : values) {
49 
50                 context.write(linenum, key);
51 
52                 linenum = new IntWritable(linenum.get() + 1);
53 
54             }
55         }
56     }
57 
58     public static void main(String[] args) throws Exception {
59 
60         Configuration conf = new Configuration();
61         conf.set("mapred.job.tracker", "localhost:9000");
62         String[] ioArgs = new String[] { "hdfs://localhost:9000/input/sort",
63                 "hdfs://localhost:9000/output/sortout" };
64 
65         String[] otherArgs = new GenericOptionsParser(conf, ioArgs)
66                 .getRemainingArgs();
67 
68         if (otherArgs.length != 2) {
69 
70             System.err.println("Usage: Data Sort <in> <out>");
71 
72             System.exit(2);
73 
74         }
75         // Job job = new Job(conf, "Data Sort");
76         Job job = Job.getInstance(conf, "Data Sort");
77 
78         job.setJarByClass(Sort.class);
79         // 设置Map和Reduce处理类
80         job.setMapperClass(Map.class);
81 
82         job.setReducerClass(Reduce.class);
83         // 设置输出类型
84         job.setOutputKeyClass(IntWritable.class);
85 
86         job.setOutputValueClass(IntWritable.class);
87         // 设置输入和输出目录
88         FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
89 
90         FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
91 
92         System.exit(job.waitForCompletion(true) ? 0 : 1);
93     }
94 }

个人学习记录

相关阅读:
NumPy数组基本的索引和切片
 赫夫曼树编码解码实例(C)
深度优先迷宫求解实例(C)
创建ndarray的方法
 【学习笔记】计算机网络-利用TELNET进行SMTP的邮件发送
 【学习笔记】非递归实现先后根遍历二叉树
 【学习笔记】计算机网络-DNS层次查询
 【学习笔记】计算机网络-网络常用命令(一)
【学习笔记】计算机网络-Ping命令(一)
Win10下Wireshark找不到接口的解决办法
原文地址：https://www.cnblogs.com/jeshy/p/15244746.html