1.pom.xml
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-hdfs</artifactId>
    <version>2.9.0</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-yarn-api</artifactId>
    <version>2.9.0</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.9.0</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>2.9.0</version>
</dependency>
2. Java source files
// MySalaryComparator.java
package ex.topn;

import java.util.Comparator;

public class MySalaryComparator implements Comparator<Salary> {
    @Override
    public int compare(Salary e1, Salary e2) {
        // Never return 0: records with equal salaries are treated as distinct
        // keys in the TreeMap, so duplicate salary values are kept.
        if (e1.getSum() > e2.getSum()) {
            return 1;
        } else {
            return -1;
        }
    }
}


// Salary.java
package ex.topn;

public class Salary {
    private int sum;

    public Salary(int sum) {
        this.sum = sum;
    }

    public int getSum() {
        return sum;
    }

    public void setSum(int sum) {
        this.sum = sum;
    }
}


// Top10Mapper.java
package ex.topn;

import java.io.IOException;
import java.util.Iterator;
import java.util.Map.Entry;
import java.util.TreeMap;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class Top10Mapper extends Mapper<LongWritable, Text, NullWritable, Text> {
    public static TreeMap<Salary, Text> TopRecordMap = new TreeMap<Salary, Text>(new MySalaryComparator());

    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] tokens = line.split(",");

        // the salary is the fourth field
        int salary = Integer.parseInt(tokens[3]);
        System.out.println("=> salary=" + salary);
        TopRecordMap.put(new Salary(salary), new Text(value));

        // drop the smallest entries until only the top 10 remain
        Iterator<Entry<Salary, Text>> iter = TopRecordMap.entrySet().iterator();
        System.out.println("TopRecordMap.size()=" + TopRecordMap.size());
        while (TopRecordMap.size() > 10) {
            iter.next();
            iter.remove();
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        // output our ten records to the reducers with a null key
        for (Text t : TopRecordMap.values()) {
            context.write(NullWritable.get(), t);
            System.out.println(" => " + t);
        }
    }
}


// Top10Reducer.java
package ex.topn;

import java.io.IOException;
import java.util.Iterator;
import java.util.Map.Entry;
import java.util.TreeMap;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class Top10Reducer extends Reducer<NullWritable, Text, NullWritable, Text> {
    public static TreeMap<Salary, Text> TopRecordMap = new TreeMap<Salary, Text>(new MySalaryComparator());

    @Override
    public void reduce(NullWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        for (Text value : values) {
            String line = value.toString();
            if (line.length() > 0) {
                String[] tokens = line.split(",");
                int salary = Integer.parseInt(tokens[3]);
                System.out.println("=> salary=" + salary);
                TopRecordMap.put(new Salary(salary), new Text(value));

                // keep only the 10 records with the highest salary
                Iterator<Entry<Salary, Text>> iter = TopRecordMap.entrySet().iterator();
                System.out.println("TopRecordMap.size()=" + TopRecordMap.size());
                while (TopRecordMap.size() > 10) {
                    iter.next();
                    iter.remove();
                }
            }
        }

        // write the top 10 records from the highest salary to the lowest
        for (Text t : TopRecordMap.descendingMap().values()) {
            context.write(NullWritable.get(), t);
        }
    }
}


// Topn.java
package ex.topn;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Topn {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf, "top n");
        job.setJarByClass(Topn.class);
        job.setMapperClass(Top10Mapper.class);
        job.setCombinerClass(Top10Reducer.class);
        job.setReducerClass(Top10Reducer.class);
        // a single reducer sees every mapper's candidates and picks the global top 10
        job.setNumReduceTasks(1);

        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
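How the pieces fit together: each mapper keeps its own top 10 in a static TreeMap keyed by salary and only emits those records in cleanup(), so very little data crosses the network. The job runs a single reducer (and reuses Top10Reducer as the combiner), which merges the candidates from all mappers and writes the global top 10 in descending order via descendingMap(). Because MySalaryComparator never returns 0, records with the same salary stay as separate entries, which is why duplicates can appear in the result.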
3. Export a jar from Eclipse (topn.jar) and upload it to the Hadoop machine under /opt/hadoop-2.9.0/files.
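The upload itself is not shown in the tutorial; one possibility, assuming SSH access to the Hadoop machine (the host name below is only a placeholder), is:

# scp topn.jar root@<hadoop-host>:/opt/hadoop-2.9.0/files/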
4. Prepare three files and put them into HDFS under /user/root/topn/input (see the commands after the sample data below). Each line contains four comma-separated fields; in the original example the fourth field represents a salary. The job selects the 10 lines with the highest salaries from this data, and duplicate salary values are allowed.
Simple numbers are used as placeholders here; the first three fields are never used, so most of them are just values like 1,2,3.
file03:
1985,1,2,11
1985,1,2,12
1985,1,2,13
1985,1,2,8
1985,1,2,7
1985,1,2,16
1985,1,2,7
1985,1,2,20
195,1,2,25
1,2,3,22
1,2,3,27
12,2,3,18
1,2,3,17
1,2,3,15

file04:
1,2,3,2
12,3,3,3
1,2,3,4
12,3,4,5
1,2,3,36
1,2,3,34
12,2,3,27
1,2,3,48
1,2,3,35
12,3,3,28
1,2,3,19
12,2,2,31
1,2,3,29

file05:
1,2,3,55
1,2,3,39
1,2,3,2
a,a,a,35
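A minimal sketch of the upload step, assuming the three files were created locally under /opt/hadoop-2.9.0/files (the local location is only an assumption; any path works as long as the -put arguments match):

# cd /opt/hadoop-2.9.0
# bin/hdfs dfs -mkdir -p topn/input
# bin/hdfs dfs -put files/file03 files/file04 files/file05 topn/input

Relative HDFS paths resolve against the user's home directory, so for root these land in /user/root/topn/input, which is what the job reads.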
5. Run the job:
The example is run as the root user.
# cd /opt/hadoop-2.9.0
# bin/yarn jar files/topn.jar ex.topn.Topn topn/input topn/output
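Once the job completes, the result can be read back from HDFS. With a single reducer the output normally ends up in one part file; the file name below is the usual default, not something the tutorial states:

# bin/hdfs dfs -cat topn/output/part-r-00000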
6. Result:
1,2,3,55
1,2,3,48
1,2,3,39
1,2,3,36
a,a,a,35
1,2,3,35
1,2,3,34
12,2,2,31
1,2,3,29
12,3,3,28