• In-Class Test: Data Cleaning


    Today I finished the second part, data cleaning. Having looked up related material online and gained a better understanding of how MapReduce works, I found this part much easier than before. The cleaning is done in two MapReduce jobs: the first counts how many times each value of the sixth field appears, and the second sorts those counts in descending order. The first program is as follows:

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    // Job 1: count how many times each value of the sixth field appears.
    public class text_2_1 {

        // Mapper: extract the sixth space-separated field of each record
        // and emit it with a placeholder count of "1".
        public static class Map extends Mapper<Object, Text, Text, Text> {
            private static final Text newKey = new Text();
            private static final Text newValue = new Text("1");

            @Override
            public void map(Object key, Text value, Context context)
                    throws IOException, InterruptedException {
                String line = value.toString();
                String[] arr = line.split(" ");
                newKey.set(arr[5]);
                context.write(newKey, newValue);
            }
        }

        // Reducer: count the occurrences of each key and emit the pair as
        // (count, key), so the next job can sort on the count.
        public static class Reduce extends Reducer<Text, Text, Text, Text> {
            private static final Text newKey = new Text();
            private static final Text newValue = new Text();

            @Override
            protected void reduce(Text key, Iterable<Text> values, Context context)
                    throws IOException, InterruptedException {
                int num = 0;
                for (Text ignored : values) {
                    num++;
                }
                newKey.set(String.valueOf(num));
                newValue.set(key);
                context.write(newKey, newValue);
            }
        }

        public static void main(String[] args)
                throws IOException, ClassNotFoundException, InterruptedException {
            Configuration conf = new Configuration();
            // Separate key and value with a space in the text output
            // (this is the Hadoop 2+ key; "mapred.textoutputformat.separator"
            // is the deprecated name).
            conf.set("mapreduce.output.textoutputformat.separator", " ");
            System.out.println("start");
            Job job = Job.getInstance(conf);
            job.setJarByClass(text_2_1.class);
            job.setMapperClass(Map.class);
            job.setReducerClass(Reduce.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            Path in = new Path("hdfs://localhost:9000/text/in/data");
            Path out = new Path("hdfs://localhost:9000/text/out1");
            FileInputFormat.addInputPath(job, in);
            FileOutputFormat.setOutputPath(job, out);
            boolean flag = job.waitForCompletion(true);
            System.out.println(flag);
            System.exit(flag ? 0 : 1);
        }
    }
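    To sanity-check the logic before running on HDFS, the same computation can be expressed in a few lines of plain Java. This is only an illustrative sketch: the class name and the sample records below are hypothetical, standing in for the real space-separated data under /text/in/data, where the mapper reads the field at index 5.

    import java.util.HashMap;
    import java.util.Map;

    public class LocalCountSketch {
        public static void main(String[] args) {
            // Hypothetical records in the same shape as the real input:
            // six space-separated fields, with the value to count at index 5.
            String[] lines = {
                "f0 f1 f2 f3 f4 apple",
                "f0 f1 f2 f3 f4 banana",
                "f0 f1 f2 f3 f4 apple",
            };
            Map<String, Integer> counts = new HashMap<>();
            for (String line : lines) {
                String field = line.split(" ")[5];    // what the mapper extracts
                counts.merge(field, 1, Integer::sum); // what the reducer tallies
            }
            // Print as (count, value), matching job 1's output layout.
            counts.forEach((field, n) -> System.out.println(n + " " + field));
        }
    }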
    The second program takes job 1's output (count/value pairs) as its input and sorts it by count in descending order, by plugging a reversed comparator into the shuffle's sort:

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    // Job 2: read the (count, value) pairs produced by job 1 and
    // sort them by count in descending order.
    public class m {

        // Mapper: parse "count value" lines; the count becomes the key
        // so the framework sorts the records by it.
        public static class Map extends Mapper<Object, Text, IntWritable, Text> {
            private static final IntWritable newKey = new IntWritable();
            private static final Text newValue = new Text();

            @Override
            public void map(Object key, Text value, Context context)
                    throws IOException, InterruptedException {
                String line = value.toString();
                String[] arr = line.split(" ");
                newKey.set(Integer.parseInt(arr[0]));
                newValue.set(arr[1]);
                context.write(newKey, newValue);
            }
        }

        // Reducer: identity pass-through; the sorting happens in the
        // shuffle phase, so the reducer just writes each pair back out.
        public static class Reduce extends Reducer<IntWritable, Text, IntWritable, Text> {
            @Override
            protected void reduce(IntWritable key, Iterable<Text> values, Context context)
                    throws IOException, InterruptedException {
                for (Text text : values) {
                    context.write(key, text);
                }
            }
        }

        // By default IntWritable keys sort in ascending order; negating the
        // byte-level comparison reverses that to descending.
        public static class IntWritableDecreasingComparator extends IntWritable.Comparator {
            @Override
            public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
                return -super.compare(b1, s1, l1, b2, s2, l2);
            }
        }

        public static void main(String[] args)
                throws IOException, ClassNotFoundException, InterruptedException {
            Configuration conf = new Configuration();
            // Same space separator as in job 1 (Hadoop 2+ key name).
            conf.set("mapreduce.output.textoutputformat.separator", " ");
            System.out.println("start");
            Job job = Job.getInstance(conf);
            job.setJarByClass(m.class);
            job.setMapperClass(Map.class);
            job.setReducerClass(Reduce.class);
            job.setOutputKeyClass(IntWritable.class);
            job.setOutputValueClass(Text.class);
            // Use the reversed comparator so larger counts come first.
            job.setSortComparatorClass(IntWritableDecreasingComparator.class);
            Path in = new Path("hdfs://localhost:9000/text/out1/part-r-00000");
            Path out = new Path("hdfs://localhost:9000/text/out2");
            FileInputFormat.addInputPath(job, in);
            FileOutputFormat.setOutputPath(job, out);
            boolean flag = job.waitForCompletion(true);
            System.out.println(flag);
            System.exit(flag ? 0 : 1);
        }
    }
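    Since the second job can only start once the first has finished, the two main methods have to be run one after the other. They could also be chained in a single driver; the sketch below is only an illustration, assuming the two classes above are on the classpath (the driver class name ChainDriver is made up), and it reads everything under /text/out1 rather than naming a single part file.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    // Hypothetical combined driver: runs the counting job and, only if it
    // succeeds, runs the descending-sort job on its output.
    public class ChainDriver {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            conf.set("mapreduce.output.textoutputformat.separator", " ");

            // Job 1: count occurrences of the sixth field (classes from text_2_1).
            Job count = Job.getInstance(conf, "count");
            count.setJarByClass(text_2_1.class);
            count.setMapperClass(text_2_1.Map.class);
            count.setReducerClass(text_2_1.Reduce.class);
            count.setOutputKeyClass(Text.class);
            count.setOutputValueClass(Text.class);
            FileInputFormat.addInputPath(count, new Path("hdfs://localhost:9000/text/in/data"));
            FileOutputFormat.setOutputPath(count, new Path("hdfs://localhost:9000/text/out1"));
            if (!count.waitForCompletion(true)) {
                System.exit(1); // stop the pipeline if counting failed
            }

            // Job 2: sort the counts in descending order (classes from m).
            Job sort = Job.getInstance(conf, "sort");
            sort.setJarByClass(m.class);
            sort.setMapperClass(m.Map.class);
            sort.setReducerClass(m.Reduce.class);
            sort.setOutputKeyClass(IntWritable.class);
            sort.setOutputValueClass(Text.class);
            sort.setSortComparatorClass(m.IntWritableDecreasingComparator.class);
            FileInputFormat.addInputPath(sort, new Path("hdfs://localhost:9000/text/out1"));
            FileOutputFormat.setOutputPath(sort, new Path("hdfs://localhost:9000/text/out2"));
            System.exit(sort.waitForCompletion(true) ? 0 : 1);
        }
    }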

    The run results are as follows:

    [screenshot of the run output]
