• Reduce join example


    I. Requirement analysis

    1. Requirement

    order.txt

      id   pid amount   
    1001    01    1
    1002    02    2
    1003    03    3
    1004    01    4
    1005    02    5
    1006    03    6

    pd.txt

    pid    pname
    01    小米
    02    华为
    03    格力

    Join the two tables so that the pid in each order.txt record is replaced by the corresponding pname from pd.txt.
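
    With the sample data above, the joined output (each order's pid replaced by the matching pname; row order may vary) would look like:

    1001    小米    1
    1004    小米    4
    1002    华为    2
    1005    华为    5
    1003    格力    3
    1006    格力    6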

    2. Analysis

    a. In the map phase, read both order.txt and pd.txt; handle each file differently and tag every record with a flag marking its source. The output key is pid and the output value is a custom Hadoop-serializable bean.

    b. Records are automatically sorted and grouped by key during the shuffle, so no custom sorting is needed.

    c. In the reduce phase, merge the map output that arrives under the same pid (see the worked example below).
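
    For example, for pid 01 the map phase emits two order records (orderId 1001 with amount 1, orderId 1004 with amount 4, flag "order") and one product record (pname 小米, flag "pd"), all under key 01. After the shuffle, the reducer receives these three values together, caches the order records, takes pname from the "pd" record, and outputs the joined rows 1001/小米/1 and 1004/小米/4.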

    II. Code

    1. Custom Hadoop serialization class

    package com.wt.reducejoin;
    
    import org.apache.hadoop.io.Writable;
    
    import java.io.DataInput;
    import java.io.DataOutput;
    import java.io.IOException;
    
    public class TableBean implements Writable {
        private String orderId;
        private String pId;
        private int amount;
        private String pName;
        private String flag;
    
        public TableBean() {
        }
    
        public TableBean(String orderId, String pId, int amount, String pName, String flag) {
            this.orderId = orderId;
            this.pId = pId;
            this.amount = amount;
            this.pName = pName;
            this.flag = flag;
        }
    
        @Override
        public void write(DataOutput out) throws IOException {
            // Serialize the fields in a fixed order
            out.writeUTF(orderId);
            out.writeUTF(pId);
            out.writeInt(amount);
            out.writeUTF(pName);
            out.writeUTF(flag);
        }
    
        @Override
        public void readFields(DataInput in) throws IOException {
            // Deserialize the fields in the same order they were written
            this.orderId = in.readUTF();
            this.pId = in.readUTF();
            this.amount = in.readInt();
            this.pName = in.readUTF();
            this.flag = in.readUTF();
        }
    
        public String getOrderId() {
            return orderId;
        }
    
        public void setOrderId(String orderId) {
            this.orderId = orderId;
        }
    
        public String getpId() {
            return pId;
        }
    
        public void setpId(String pId) {
            this.pId = pId;
        }
    
        public int getAmount() {
            return amount;
        }
    
        public void setAmount(int amount) {
            this.amount = amount;
        }
    
        public String getpName() {
            return pName;
        }
    
        public void setpName(String pName) {
            this.pName = pName;
        }
    
        public String getFlag() {
            return flag;
        }
    
        public void setFlag(String flag) {
            this.flag = flag;
        }
    
        @Override
        public String toString() {
            return orderId + "\t" + pName + "\t" + amount;
        }
    }
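
    Note that readFields() must read the fields in exactly the order write() wrote them, and the no-argument constructor is required because the framework instantiates the bean reflectively. A quick local round trip through Java's data streams (a minimal sketch, not part of the original job; the sample values are arbitrary) can confirm the two methods stay in sync:

    package com.wt.reducejoin;

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.DataInputStream;
    import java.io.DataOutputStream;
    import java.io.IOException;

    public class TableBeanRoundTrip {
        public static void main(String[] args) throws IOException {
            TableBean in = new TableBean("1001", "01", 1, "", "order");
            // Serialize into an in-memory buffer
            ByteArrayOutputStream buf = new ByteArrayOutputStream();
            in.write(new DataOutputStream(buf));
            // Deserialize into a fresh bean, as the framework would
            TableBean out = new TableBean();
            out.readFields(new DataInputStream(new ByteArrayInputStream(buf.toByteArray())));
            // Expected output: 1001 01 1 order
            System.out.println(out.getOrderId() + " " + out.getpId() + " " + out.getAmount() + " " + out.getFlag());
        }
    }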

    2. Mapper

    package com.wt.reducejoin;
    
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.lib.input.FileSplit;
    
    import java.io.IOException;
    
    public class TableMapper extends Mapper<LongWritable, Text, Text, TableBean> {
        String name;
        TableBean bean = new TableBean();
        Text k = new Text();
        // Get the name of the file this input split comes from
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            FileSplit split = (FileSplit) context.getInputSplit();
            name = split.getPath().getName();
        }
    
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // 1. Read one line
            String line = value.toString();
            // Branch on the source file name
            if (name.startsWith("order")){
                String[] fields = line.split("\t");
                // 2.1 Populate the value with the order fields
                bean.setOrderId(fields[0]);
                bean.setpId(fields[1]);
                bean.setAmount(Integer.parseInt(fields[2]));
                bean.setpName("");
                bean.setFlag("order");
                // 2.2 Set the key to pid
                k.set(fields[1]);
            }else {
                String[] fields = line.split("\t");
                // 3.1 Populate the value with the product fields
                bean.setpId(fields[0]);
                bean.setpName(fields[1]);
                bean.setOrderId("");
                bean.setAmount(0);
                bean.setFlag("pd");
                // 3.2 Set the key to pid
                k.set(fields[0]);
            }
            // 4. Emit the key/value pair
            context.write(k, bean);
        }
    }
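
    Reusing the single bean and k objects across map() calls is safe here because context.write() serializes the key and value immediately, so the next record can overwrite them. The file name is looked up once in setup() rather than on every map() call, and the code assumes both input files are tab-delimited, matching the sample data above.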

    3. Reducer

    package com.wt.reducejoin;
    
    import org.apache.commons.beanutils.BeanUtils;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    
    import java.io.IOException;
    import java.lang.reflect.InvocationTargetException;
    import java.util.ArrayList;
    
    public class TableReducer extends Reducer<Text, TableBean, TableBean, NullWritable> {
        @Override
        protected void reduce(Text key, Iterable<TableBean> values, Context context) throws IOException, InterruptedException {
            // List that accumulates all order records for the current pid
            ArrayList<TableBean> orderBeans = new ArrayList<TableBean>();
            // Product (pd) record for the current pid
            TableBean pdBean = new TableBean();
    
            for (TableBean value : values) {
                // Record from the order table
                if ("order".equals(value.getFlag())){
                    // Copy each incoming order record into the list
                    // (the framework reuses the value object while iterating)
                    TableBean orderBean = new TableBean();
                    try {
                        BeanUtils.copyProperties(orderBean, value);
                    } catch (IllegalAccessException e) {
                        e.printStackTrace();
                    } catch (InvocationTargetException e) {
                        e.printStackTrace();
                    }
                    orderBeans.add(orderBean);
                }else {
                    // Copy the product record into pdBean
                    try {
                        BeanUtils.copyProperties(pdBean, value);
                    } catch (IllegalAccessException e) {
                        e.printStackTrace();
                    } catch (InvocationTargetException e) {
                        e.printStackTrace();
                    }
                }
            }
    
            // Join: fill in pname on every cached order record, then emit
            for (TableBean bean : orderBeans) {
                bean.setpName(pdBean.getpName());
                // Write the joined record
                context.write(bean, NullWritable.get());
            }
        }
    }
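
    The copy via BeanUtils.copyProperties is necessary because the framework reuses one TableBean instance for every element of values; storing the reference directly would leave orderBeans holding multiple references to the same object. If the commons-beanutils dependency is unwanted, a plain field-by-field copy does the same job (a sketch of a hypothetical helper that could be added inside TableReducer):

    // Hypothetical helper, equivalent in effect to BeanUtils.copyProperties(dst, src)
    private static TableBean copyOf(TableBean src) {
        TableBean dst = new TableBean();
        dst.setOrderId(src.getOrderId());
        dst.setpId(src.getpId());
        dst.setAmount(src.getAmount());
        dst.setpName(src.getpName());
        dst.setFlag(src.getFlag());
        return dst;
    }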

    4. Driver

    package com.wt.reducejoin;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    import java.io.IOException;
    
    public class TableDriver {
        public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
            args = new String[]{"E:\\a\\inputjoin", "E:\\a\\output1"};
            // 1. Create the job
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf);
            // 2. Set the jar by the driver class
            job.setJarByClass(TableDriver.class);
            // 3. Associate the Mapper and Reducer classes
            job.setMapperClass(TableMapper.class);
            job.setReducerClass(TableReducer.class);
            // 4. Set the mapper output key/value types
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(TableBean.class);
            // 5. Set the final output key/value types
            job.setOutputKeyClass(TableBean.class);
            job.setOutputValueClass(NullWritable.class);
            // 6. Set the input and output paths
            FileInputFormat.setInputPaths(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            // 7. Submit the job and wait for completion
            boolean wait = job.waitForCompletion(true);
            System.exit(wait? 0: 1);
        }
    }
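
    Because args is overwritten at the top of main(), the hard-coded local paths always take effect; to run on a cluster, remove that line, package the classes into a jar, and submit it with the standard hadoop jar command, e.g. hadoop jar reducejoin.jar com.wt.reducejoin.TableDriver <input dir> <output dir> (the jar name and paths here are placeholders). Note also that the output directory must not already exist, otherwise FileOutputFormat fails the job before it starts.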

    Approach (Reduce join):

    1. Define a custom Hadoop-serializable class.

    2. In the Mapper, distinguish the two input files and key the output by pid so records are grouped by pid.

    3. In the Reducer, combine the values that arrive under each key and build the desired output record.

    4. Write the driver class.

  • Original article: https://www.cnblogs.com/wt7018/p/13636474.html