• In-Class Test: Data Cleaning


    Today I finished the second part, data cleaning. Having looked up related material online and gained a better understanding of how MapReduce works, I found this part much easier than before. The cleaning is done in two MapReduce jobs: the first counts how many times each value of the sixth field appears, and the second sorts those counts in descending order. The first program is as follows:

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    // Job 1: count how many times each value of the sixth field appears.
    public class text_2_1 {

        // Mapper: extract the sixth space-separated field of each record
        // and emit it with a placeholder count of "1".
        public static class Map extends Mapper<Object, Text, Text, Text> {
            private static final Text newKey = new Text();
            private static final Text newValue = new Text("1");

            @Override
            public void map(Object key, Text value, Context context)
                    throws IOException, InterruptedException {
                String line = value.toString();
                String[] arr = line.split(" ");
                newKey.set(arr[5]);
                context.write(newKey, newValue);
            }
        }

        // Reducer: count the occurrences of each key and emit the pair as
        // (count, key), so the next job can sort on the count.
        public static class Reduce extends Reducer<Text, Text, Text, Text> {
            private static final Text newKey = new Text();
            private static final Text newValue = new Text();

            @Override
            protected void reduce(Text key, Iterable<Text> values, Context context)
                    throws IOException, InterruptedException {
                int num = 0;
                for (Text ignored : values) {
                    num++;
                }
                newKey.set(String.valueOf(num));
                newValue.set(key);
                context.write(newKey, newValue);
            }
        }

        public static void main(String[] args)
                throws IOException, ClassNotFoundException, InterruptedException {
            Configuration conf = new Configuration();
            // Separate key and value with a space in the text output
            // (this is the Hadoop 2+ key; "mapred.textoutputformat.separator"
            // is the deprecated name).
            conf.set("mapreduce.output.textoutputformat.separator", " ");
            System.out.println("start");
            Job job = Job.getInstance(conf);
            job.setJarByClass(text_2_1.class);
            job.setMapperClass(Map.class);
            job.setReducerClass(Reduce.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            Path in = new Path("hdfs://localhost:9000/text/in/data");
            Path out = new Path("hdfs://localhost:9000/text/out1");
            FileInputFormat.addInputPath(job, in);
            FileOutputFormat.setOutputPath(job, out);
            boolean flag = job.waitForCompletion(true);
            System.out.println(flag);
            System.exit(flag ? 0 : 1);
        }
    }
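    To sanity-check the logic before running on HDFS, the same computation can be expressed in a few lines of plain Java. This is only an illustrative sketch: the class name and the sample records below are hypothetical, standing in for the real space-separated data under /text/in/data, where the mapper reads the field at index 5.

    import java.util.HashMap;
    import java.util.Map;

    public class LocalCountSketch {
        public static void main(String[] args) {
            // Hypothetical records in the same shape as the real input:
            // six space-separated fields, with the value to count at index 5.
            String[] lines = {
                "f0 f1 f2 f3 f4 apple",
                "f0 f1 f2 f3 f4 banana",
                "f0 f1 f2 f3 f4 apple",
            };
            Map<String, Integer> counts = new HashMap<>();
            for (String line : lines) {
                String field = line.split(" ")[5];    // what the mapper extracts
                counts.merge(field, 1, Integer::sum); // what the reducer tallies
            }
            // Print as (count, value), matching job 1's output layout.
            counts.forEach((field, n) -> System.out.println(n + " " + field));
        }
    }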
    The second program takes job 1's output (count/value pairs) as its input and sorts it by count in descending order, by plugging a reversed comparator into the shuffle's sort:

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    // Job 2: read the (count, value) pairs produced by job 1 and
    // sort them by count in descending order.
    public class m {

        // Mapper: parse "count value" lines; the count becomes the key
        // so the framework sorts the records by it.
        public static class Map extends Mapper<Object, Text, IntWritable, Text> {
            private static final IntWritable newKey = new IntWritable();
            private static final Text newValue = new Text();

            @Override
            public void map(Object key, Text value, Context context)
                    throws IOException, InterruptedException {
                String line = value.toString();
                String[] arr = line.split(" ");
                newKey.set(Integer.parseInt(arr[0]));
                newValue.set(arr[1]);
                context.write(newKey, newValue);
            }
        }

        // Reducer: identity pass-through; the sorting happens in the
        // shuffle phase, so the reducer just writes each pair back out.
        public static class Reduce extends Reducer<IntWritable, Text, IntWritable, Text> {
            @Override
            protected void reduce(IntWritable key, Iterable<Text> values, Context context)
                    throws IOException, InterruptedException {
                for (Text text : values) {
                    context.write(key, text);
                }
            }
        }

        // By default IntWritable keys sort in ascending order; negating the
        // byte-level comparison reverses that to descending.
        public static class IntWritableDecreasingComparator extends IntWritable.Comparator {
            @Override
            public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
                return -super.compare(b1, s1, l1, b2, s2, l2);
            }
        }

        public static void main(String[] args)
                throws IOException, ClassNotFoundException, InterruptedException {
            Configuration conf = new Configuration();
            // Same space separator as in job 1 (Hadoop 2+ key name).
            conf.set("mapreduce.output.textoutputformat.separator", " ");
            System.out.println("start");
            Job job = Job.getInstance(conf);
            job.setJarByClass(m.class);
            job.setMapperClass(Map.class);
            job.setReducerClass(Reduce.class);
            job.setOutputKeyClass(IntWritable.class);
            job.setOutputValueClass(Text.class);
            // Use the reversed comparator so larger counts come first.
            job.setSortComparatorClass(IntWritableDecreasingComparator.class);
            Path in = new Path("hdfs://localhost:9000/text/out1/part-r-00000");
            Path out = new Path("hdfs://localhost:9000/text/out2");
            FileInputFormat.addInputPath(job, in);
            FileOutputFormat.setOutputPath(job, out);
            boolean flag = job.waitForCompletion(true);
            System.out.println(flag);
            System.exit(flag ? 0 : 1);
        }
    }
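    Since the second job can only start once the first has finished, the two main methods have to be run one after the other. They could also be chained in a single driver; the sketch below is only an illustration, assuming the two classes above are on the classpath (the driver class name ChainDriver is made up), and it reads everything under /text/out1 rather than naming a single part file.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    // Hypothetical combined driver: runs the counting job and, only if it
    // succeeds, runs the descending-sort job on its output.
    public class ChainDriver {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            conf.set("mapreduce.output.textoutputformat.separator", " ");

            // Job 1: count occurrences of the sixth field (classes from text_2_1).
            Job count = Job.getInstance(conf, "count");
            count.setJarByClass(text_2_1.class);
            count.setMapperClass(text_2_1.Map.class);
            count.setReducerClass(text_2_1.Reduce.class);
            count.setOutputKeyClass(Text.class);
            count.setOutputValueClass(Text.class);
            FileInputFormat.addInputPath(count, new Path("hdfs://localhost:9000/text/in/data"));
            FileOutputFormat.setOutputPath(count, new Path("hdfs://localhost:9000/text/out1"));
            if (!count.waitForCompletion(true)) {
                System.exit(1); // stop the pipeline if counting failed
            }

            // Job 2: sort the counts in descending order (classes from m).
            Job sort = Job.getInstance(conf, "sort");
            sort.setJarByClass(m.class);
            sort.setMapperClass(m.Map.class);
            sort.setReducerClass(m.Reduce.class);
            sort.setOutputKeyClass(IntWritable.class);
            sort.setOutputValueClass(Text.class);
            sort.setSortComparatorClass(m.IntWritableDecreasingComparator.class);
            FileInputFormat.addInputPath(sort, new Path("hdfs://localhost:9000/text/out1"));
            FileOutputFormat.setOutputPath(sort, new Path("hdfs://localhost:9000/text/out2"));
            System.exit(sort.waitForCompletion(true) ? 0 : 1);
        }
    }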

    The run results are as follows:

    [screenshot of the run output]
