• MapReduce testing on a YARN cluster (Part 3)


    Join the user, group, and order tables (similar to a multi-table join query in SQL).

    Test preparation:

    First synchronize the system time, then start the HDFS cluster and the YARN cluster; create the files for the user, group, and order tables in the local directory "/home/hadoop/test/".
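    For reference, on a typical Hadoop 2.x installation these preparation steps roughly correspond to commands like the following (the NTP server and local path are assumptions; adjust them to your own cluster):

    ntpdate ntp1.aliyun.com        # synchronize the system clock on every node
    start-dfs.sh                   # start the HDFS cluster
    start-yarn.sh                  # start the YARN cluster
    mkdir -p /home/hadoop/test     # local directory that will hold the user, group, and order files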

    user file:

    group file:

    order file:
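    The three input files are plain whitespace-separated text. A minimal hypothetical sample, with the column layout inferred from how the mappers index the fields, might look like this (the names and values are illustrative only):

    user  (user_id  user_name  group_id):
        1001  Tom    1
        1002  Jerry  2

    group (group_id  group_name):
        1     admin
        2     normal

    order (order_id  goods  user_id  amount):
        5001  book   1001  2
        5002  pen    1002  5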

     

    Test goal:

    Obtain the result of joining the three tables.
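    Tracing the field concatenation through the two jobs, each line of the final result should be tab-separated in the shape user_id, group_id, group_name, user_name, followed by the remaining order fields. With the hypothetical sample above, one such line would be:

    1001    1    admin    Tom    5001    book    2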

    Test code:

    Be sure to get the output key/value types right; otherwise the job may produce an output directory whose part files contain no data.

    package com.mmzs.bigdata.yarn.mapreduce;
    
    import java.io.IOException;
    
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.lib.input.FileSplit;
    
    public class UserGroupMapper01 extends Mapper<LongWritable, Text, Text, Text> {
        
        private Text outKey;
        private Text outValue;
        
        @Override
        protected void setup(Mapper<LongWritable, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            outKey = new Text();
            outValue = new Text();
        }
    
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            FileSplit fp = (FileSplit) context.getInputSplit();
            String fileName = fp.getPath().getName();
            
            String line = value.toString();
            String[] fields = line.split("\\s+");//split on whitespace; the regex must be written as \\s+ inside a Java string literal
            
            String keyStr = null;
            String valueStr = null;
            if ("group".equalsIgnoreCase(fileName)) {
                //group file: the key is the group id and the value is the group name
                keyStr = fields[0];
                valueStr = new StringBuilder(fields[1]).append("-->").append(fileName).toString();
            } else {
                //user file: the key is the group id (3rd field) and the value is the user id and user name
                keyStr = fields[2];
                //the "-->" marker plus the file name is later used as a delimiter to tell which file a record came from
                valueStr = new StringBuilder(fields[0]).append("\t").append(fields[1]).append("-->").append(fileName).toString();
            }
            
            outKey.set(keyStr);
            outValue.set(valueStr);
            context.write(outKey, outValue);
            
            
        }
        
        @Override
        protected void cleanup(Mapper<LongWritable, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            outKey = null;
            outValue = null;
        }
        
    }
    UserGroupMapper01
    package com.mmzs.bigdata.yarn.mapreduce;
    
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.Iterator;
    import java.util.List;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    
    public class UserGroupReducer01 extends Reducer<Text, Text, Text, Text> {
        
        private Text outValue;
        
        @Override
        protected void setup(Reducer<Text, Text, Text, Text>.Context context) throws IOException, InterruptedException {
            outValue = new Text();
        }
    
        @Override
        protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            Iterator<Text> its = values.iterator();
            
            String masterRecord = null;
            List<String> slaveRecords = new ArrayList<String>();
            
            //separate the master-table (group) record from the slave-table (user) records
            while (its.hasNext()) {
                String[] rowAndFileName = its.next().toString().split("-->");
                if (rowAndFileName[1].equalsIgnoreCase("group")) {
                    masterRecord = rowAndFileName[0];
                    continue;
                }
                slaveRecords.add(rowAndFileName[0]);
            }
            
            //if a key has no matching group record, skip it to avoid a NullPointerException
            if (null == masterRecord) return;
            
            //emit one joined line per user record under this group id
            for (String slaveRecord : slaveRecords) {
                String valueStr = new StringBuilder(masterRecord).append("\t").append(slaveRecord).toString();
                outValue.set(valueStr);
                context.write(key, outValue);
            }
            
        }
        
        @Override
        protected void cleanup(Reducer<Text, Text, Text, Text>.Context context) throws IOException, InterruptedException {
            outValue = null;
        }
    
    }
    UserGroupReducer01
    package com.mmzs.bigdata.yarn.mapreduce;
    
    import java.io.IOException;
    import java.net.URI;
    import java.net.URISyntaxException;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    /**
     * @author hadoop
     *
     */
    public class UserGroupDriver01 {
    
        private static FileSystem fs;
        private static Configuration conf;
        static {
            String uri = "hdfs://master01:9000/";
            conf = new Configuration();
            try {
                fs = FileSystem.get(new URI(uri), conf, "hadoop");
            } catch (IOException e) {
                e.printStackTrace();
            } catch (InterruptedException e) {
                e.printStackTrace();
            } catch (URISyntaxException e) {
                e.printStackTrace();
            }
        }
        
        public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    
            Job ugJob01 = getJob(args);
            if (null == ugJob01) {
                return;
            }
            //Submit the Job to the cluster and wait for it to finish; true means the runtime status is reported back to the client
            boolean flag = false;
            flag = ugJob01.waitForCompletion(true);
            System.exit(flag?0:1);
        }
        
        /**
         * Build the Job instance
         * @param args
         * @return
         * @throws IOException
         */
        public static Job getJob(String[] args) throws IOException {
            if (null==args || args.length<2) return null;
            //HDFS path holding the data to be processed
            Path inputPath = new Path(args[0]);
            //Output path where the Job writes its result after it finishes
            Path outputPath = new Path(args[1]);
            //Local file paths on the host
            Path userPath = new Path("/home/hadoop/test/user");
            Path groupPath = new Path("/home/hadoop/test/group");
    
            //If the cluster paths already exist, delete them first
            if (fs.exists(inputPath)) {
                fs.delete(inputPath, true);//true means recursive delete
            }
            if (fs.exists(outputPath)) {
                fs.delete(outputPath, true);//true means recursive delete
            }
            
            //Create the input path on the cluster and copy the data files into it
            fs.mkdirs(inputPath);
            fs.copyFromLocalFile(false, false, new Path[]{userPath, groupPath}, inputPath);
            
            
            //Get a Job instance
            Job ugJob01 = Job.getInstance(conf, "UserGroupJob01");
            //Set the entry class of the jar
            //the entry class of ugJob01 is UserGroupDriver01
            ugJob01.setJarByClass(UserGroupDriver01.class);
            //Set the Mapper class used by the Job
            ugJob01.setMapperClass(UserGroupMapper01.class);
            //Set the Reducer class used by the Job (can be omitted if the Job has no Reducer)
            ugJob01.setReducerClass(UserGroupReducer01.class);
            
            //Set the output key type of the MapTask
            ugJob01.setMapOutputKeyClass(Text.class);
            //Set the output value type of the MapTask
            ugJob01.setMapOutputValueClass(Text.class);
            
            //Set the output key type of the whole Job (can be omitted if the Job has no Reducer)
            ugJob01.setOutputKeyClass(Text.class);
            //Set the output value type of the whole Job (can be omitted if the Job has no Reducer)
            ugJob01.setOutputValueClass(Text.class);
            
            //Set the input path of the data the Job will process
            FileInputFormat.setInputPaths(ugJob01, inputPath);
            //Set the output path for the Job's results
            FileOutputFormat.setOutputPath(ugJob01, outputPath);
            
            return ugJob01;
        }
        
    }
    UserGroupDriver01
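    Each line that Job 1 writes to the intermediate directory has the tab-separated shape group_id, group_name, user_id, user_name (the reduce key followed by the joined value), e.g. hypothetically:

    1    admin    1001    Tom

    That is why the second mapper below can treat Job 1's output and the order file uniformly: in both files fields[2] is the user id, which becomes the join key of the second job.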
    package com.mmzs.bigdata.yarn.mapreduce;
    
    import java.io.IOException;
    
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.lib.input.FileSplit;
    
    public class UserGroupMapper02 extends Mapper<LongWritable, Text, Text, Text> {
        
        private Text outKey;
        private Text outValue;
        
        @Override
        protected void setup(Mapper<LongWritable, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            outKey = new Text();
            outValue = new Text();
        }
    
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            FileSplit fp = (FileSplit) context.getInputSplit();
            String fileName = fp.getPath().getName();
            
            String line = value.toString();
            String[] fields = line.split("\\s+");
            
            //in both the order file and Job 1's output, the 3rd field is the user id, which is the join key here
            String keyStr = fields[2];
            String valueStr = null;
            //the "-->" marker plus the file name is later used to tell which file a record came from
            valueStr = new StringBuilder(fields[0]).append("\t").append(fields[1]).append("\t").append(fields[3]).append("-->").append(fileName).toString();
            
            outKey.set(keyStr);
            outValue.set(valueStr);
            context.write(outKey, outValue);
            
            
        }
        
        @Override
        protected void cleanup(Mapper<LongWritable, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            outKey = null;
            outValue = null;
        }
        
    }
    UserGroupMapper02
    package com.mmzs.bigdata.yarn.mapreduce;
    
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.Iterator;
    import java.util.List;
    
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    
    public class UserGroupReducer02 extends Reducer<Text, Text, Text, Text> {
        
        private Text outValue;
        
        @Override
        protected void setup(Reducer<Text, Text, Text, Text>.Context context) throws IOException, InterruptedException {
            outValue = new Text();
        }
    
        @Override
        protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            String masterRecord = null;
            List<String> slaveRecords = new ArrayList<String>();
            
            //separate the master record (Job 1's user-group result) from the slave records (order rows)
            Iterator<Text> its = values.iterator();
            while (its.hasNext()) {
                String[] rowAndFileName = its.next().toString().split("-->");
                if (!rowAndFileName[1].equalsIgnoreCase("order")) {
                    masterRecord = rowAndFileName[0];
                    continue;
                }
                slaveRecords.add(rowAndFileName[0]);
            }
            
            //if a user id has no matching user-group record, skip it to avoid a NullPointerException
            if (null == masterRecord) return;
            
            for (String slaveRecord : slaveRecords) {
                String valueStr = new StringBuilder(masterRecord).append("\t").append(slaveRecord).toString();
                outValue.set(valueStr);
                context.write(key, outValue);
            }
            
        }
        
        @Override
        protected void cleanup(Reducer<Text, Text, Text, Text>.Context context) throws IOException, InterruptedException {
            outValue = null;
        }
    
    }
    UserGroupReducer02
    package com.mmzs.bigdata.yarn.mapreduce;
    
    import java.io.IOException;
    import java.net.URI;
    import java.net.URISyntaxException;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    /**
     * @author hadoop
     *
     */
    public class UserGroupDriver02 {
    
        private static FileSystem fs;
        private static Configuration conf;
        static {
            String uri = "hdfs://master01:9000/";
            conf = new Configuration();
            try {
                fs = FileSystem.get(new URI(uri), conf, "hadoop");
            } catch (IOException e) {
                e.printStackTrace();
            } catch (InterruptedException e) {
                e.printStackTrace();
            } catch (URISyntaxException e) {
                e.printStackTrace();
            }
        }
        
        public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    
            //getJob expects {original input, intermediate path, final output}, the same layout as the params array built by UserGroupDriver
            Job ugJob02 = getJob(args);
            if (null == ugJob02) {
                return;
            }
            //Submit the Job to the cluster and wait for it to finish; true means the runtime status is reported back to the client
            boolean flag = false;
            flag = ugJob02.waitForCompletion(true);
            System.exit(flag?0:1);
        }
        
        /**
         * Build the Job instance
         * @param args
         * @return
         * @throws IOException
         */
        public static Job getJob(String[] args) throws IOException {
            if (null==args || args.length<3) return null;
            //HDFS path holding the data to be processed (the intermediate output of Job 1)
            Path inputPath = new Path(args[1]);
            //Output path where the Job writes its result after it finishes
            Path outputPath = new Path(args[2]);
            //Local file path on the host
            Path orderPath = new Path("/home/hadoop/test/order");
    
            //The input path must already exist on the cluster; it was created by the first Job
            if (!fs.exists(inputPath)) return null;
            if (fs.exists(outputPath)) {
                fs.delete(outputPath, true);//true means recursive delete
            }
            
            //Copy the order data file into the existing cluster input path
            fs.copyFromLocalFile(false, false, orderPath, inputPath);
            
            
            //Get a Job instance
            Job ugJob02 = Job.getInstance(conf, "UserGroupJob02");
            //Set the entry class of the jar
            //the entry class of ugJob02 is UserGroupDriver02
            ugJob02.setJarByClass(UserGroupDriver02.class);
            //Set the Mapper class used by the Job
            ugJob02.setMapperClass(UserGroupMapper02.class);
            //Set the Reducer class used by the Job (can be omitted if the Job has no Reducer)
            ugJob02.setReducerClass(UserGroupReducer02.class);
            
            //Set the output key type of the MapTask
            ugJob02.setMapOutputKeyClass(Text.class);
            //Set the output value type of the MapTask
            ugJob02.setMapOutputValueClass(Text.class);
            
            //Set the output key type of the whole Job (can be omitted if the Job has no Reducer)
            ugJob02.setOutputKeyClass(Text.class);
            //Set the output value type of the whole Job (can be omitted if the Job has no Reducer)
            ugJob02.setOutputValueClass(Text.class);
            
            //Set the input path of the data the Job will process
            FileInputFormat.setInputPaths(ugJob02, inputPath);
            //Set the output path for the Job's results
            FileOutputFormat.setOutputPath(ugJob02, outputPath);
            
            return ugJob02;
        }
        
    }
    UserGroupDriver02
    package com.mmzs.bigdata.yarn.mapreduce;
    
    import java.io.IOException;
    import java.net.URI;
    import java.net.URISyntaxException;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapreduce.Job;
    
    public class UserGroupDriver {
    
        private static FileSystem fs;
        private static Configuration conf;
        private static final String TEMP= "hdfs://master01:9000/data/usergrouporder/tmp";
        static {
            String uri = "hdfs://master01:9000/";
            conf = new Configuration();
            try {
                fs = FileSystem.get(new URI(uri), conf, "hadoop");
            } catch (IOException e) {
                e.printStackTrace();
            } catch (InterruptedException e) {
                e.printStackTrace();
            } catch (URISyntaxException e) {
                e.printStackTrace();
            }
        }
        
        public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
            
            //args[0] is the original input path, TEMP holds Job 1's output, args[1] is the final output path
            String[] params = {args[0], TEMP, args[1]};
            
            //Run the 1st Job
            Job ugJob01 = UserGroupDriver01.getJob(params);
            //Submit the Job to the cluster and wait for it to finish; true means the runtime status is reported back to the client
            boolean flag01 = ugJob01.waitForCompletion(true);
            if (!flag01) {
                System.out.println("job1 running failure......");
                System.exit(1);
            }
            
            //Run the 2nd Job
            Job ugJob02 = UserGroupDriver02.getJob(params);
            //Submit the Job to the cluster and wait for it to finish; true means the runtime status is reported back to the client
            boolean flag02 = ugJob02.waitForCompletion(true);
            if (flag02) {//after Job02 finishes, delete the intermediate directory and exit (deletion is commented out here for easier testing)
    //            fs.delete(new Path(TEMP), true);
                System.out.println("job2 running success......");
                System.exit(0);
            }
            System.out.println("job2 running failure......");
            System.exit(1);
        }
    
    }
    UserGroupDriver

    To make testing easier, first comment out the statement that deletes the intermediate output; the way the path arguments flow between the three drivers is summarized below:

    //Overall driver (UserGroupDriver)
    String[] params = {args[0], TEMP, args[1]};
    //Run the 1st Job
    Job ugJob01 = UserGroupDriver01.getJob(params);
    //Run the 2nd Job
    Job ugJob02 = UserGroupDriver02.getJob(params);
    
    //Inside UserGroupDriver01.getJob:
    //HDFS path holding the data to be processed:   Path inputPath  = new Path(args[0]);   //args[0] of params (the original input)
    //Output path for the Job's results:            Path outputPath = new Path(args[1]);   //TEMP of params
    
    //Inside UserGroupDriver02.getJob:
    //HDFS path holding the data to be processed:   Path inputPath  = new Path(args[1]);   //TEMP of params
    //Output path for the Job's results:            Path outputPath = new Path(args[2]);   //args[1] of the command line (the final output)
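    Once the final output has been verified, the intermediate directory can be cleaned up again by restoring the commented-out deletion in UserGroupDriver; that last branch would then look roughly like this:

    if (flag02) {
        //Job02 finished successfully, so Job01's intermediate output is no longer needed
        fs.delete(new Path(TEMP), true);//true means recursive delete
        System.out.println("job2 running success......");
        System.exit(0);
    }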

    Test results:

    The parameters passed at run time are:

    If you run from Eclipse, each path argument must be prefixed with the cluster master's URI, i.e. hdfs://master01:9000.

    Input path argument:  /data/usergrouporder/src

    Output path argument:  /data/usergrouporder/dst
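    As a hypothetical example (the jar file name is an assumption), submitting the chained driver from the command line might look like this; when launching from Eclipse instead, prefix both paths with hdfs://master01:9000 as noted above:

    hadoop jar usergrouporder.jar com.mmzs.bigdata.yarn.mapreduce.UserGroupDriver \
        /data/usergrouporder/src /data/usergrouporder/dst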
