• Implementing a Hive join with a hand-written MapReduce program


    Hive provides join syntax out of the box, but writing the join ourselves as a MapReduce program is a good way to consolidate our understanding of MapReduce.

    For example:

    There is a customer table with three fields: userId, userName, address

    There is an order table with four fields: userId, orderId, categoryId, price

    Customer data:

    100000001 a addressa
    100000002 b addressb
    100000003 c addressc

    Order data:

    100000001 101 g1 100
    100000001 102 g2 200
    100000003 103 g3 300
    100000002 201 g1 100
    100000002 202 g2 200
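
    Joined on userId, the desired result is one row per order, with the customer fields followed by the matching order fields (this is the layout the program below produces; row order may vary):

    100000001,a,addressa,100000001,101,g1,100
    100000001,a,addressa,100000001,102,g2,200
    100000002,b,addressb,100000002,201,g1,100
    100000002,b,addressb,100000002,202,g2,200
    100000003,c,addressc,100000003,103,g3,300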

    Implementing the join of the customer table and the order table:

    // Every line the MR program reads is just a string, so on its own it cannot be told apart as a customer record or an order record. We define a class that tags each line with the table it came from.

    package com.rabbit.hadoop.hive.join;

    import java.io.DataInput;
    import java.io.DataOutput;
    import java.io.IOException;

    import org.apache.hadoop.io.WritableComparable;

    public class DataJoinWritable implements WritableComparable<DataJoinWritable> {

        private String tag;  // which table the record comes from: "customer" or "order"
        private String info; // the customer or order fields, joined by commas

        public DataJoinWritable() {
            // empty constructor, required by Hadoop so it can instantiate the class during deserialization
        }

        public DataJoinWritable(String tag, String info) {
            set(tag, info);
        }

        public void set(String tag, String info) {
            this.tag = tag;
            this.info = info;
        }

        public String getTag() {
            return tag;
        }

        public void setTag(String tag) {
            this.tag = tag;
        }

        public String getInfo() {
            return info;
        }

        public void setInfo(String info) {
            this.info = info;
        }

        public void write(DataOutput out) throws IOException {
            out.writeUTF(tag);
            out.writeUTF(info);
        }

        public void readFields(DataInput in) throws IOException {
            tag = in.readUTF();
            info = in.readUTF();
        }

        public int compareTo(DataJoinWritable o) {
            // this class is only used as a map output value, so ordering never matters for this job;
            // compare by tag and then by info anyway so the method is not a stub
            int cmp = tag.compareTo(o.tag);
            return cmp != 0 ? cmp : info.compareTo(o.info);
        }

        @Override
        public String toString() {
            return tag + "," + info;
        }

    }
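
    As a quick sanity check, separate from the job itself, the custom Writable can be round-tripped through Hadoop's in-memory DataOutputBuffer/DataInputBuffer helpers. The class name DataJoinWritableCheck and the sample values are only illustrative:

    package com.rabbit.hadoop.hive.join;

    import org.apache.hadoop.io.DataInputBuffer;
    import org.apache.hadoop.io.DataOutputBuffer;

    public class DataJoinWritableCheck {

        public static void main(String[] args) throws Exception {
            DataJoinWritable original = new DataJoinWritable("customer", "100000001,a,addressa");

            // serialize into an in-memory buffer
            DataOutputBuffer out = new DataOutputBuffer();
            original.write(out);

            // read the same bytes back into a fresh object
            DataInputBuffer in = new DataInputBuffer();
            in.reset(out.getData(), out.getLength());
            DataJoinWritable copy = new DataJoinWritable();
            copy.readFields(in);

            System.out.println(copy); // expected: customer,100000001,a,addressa
        }
    }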

    //map

    package com.rabbit.hadoop.hive.join;

    import java.io.IOException;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;


    public class DataJoinMapper extends Mapper<LongWritable, Text, LongWritable, DataJoinWritable> {

        /**
         * customer line: userId userName address
         *
         * order line:    userId orderId categoryId price
         */
        private LongWritable outputKey = new LongWritable();
        private DataJoinWritable outputValue = new DataJoinWritable();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

            String lineValue = value.toString();
            String[] fields = lineValue.split(" ");
            // Tell the two tables apart simply by field count: customer records have three fields,
            // order records have four. Any other length is treated as bad data and the line is dropped.
            if (fields.length != 3 && fields.length != 4) {
                return;
            }
            // In both tables the first field is the userId, which becomes the join key.
            long userId = Long.parseLong(fields[0]);
            outputKey.set(userId);
            // Three fields: a customer record, tagged "customer".
            if (fields.length == 3) {
                outputValue.set("customer", fields[0] + "," + fields[1] + "," + fields[2]);
            }
            // Four fields: an order record, tagged "order".
            if (fields.length == 4) {
                outputValue.set("order", fields[0] + "," + fields[1] + "," + fields[2] + "," + fields[3]);
            }
            // Emit one record per line: the key is the userId, the value is the tagged customer or order info.
            context.write(outputKey, outputValue);
        }

    }
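
    For example, the customer line 100000001 a addressa is emitted as key 100000001 with value customer,100000001,a,addressa, and the order line 100000001 101 g1 100 as key 100000001 with value order,100000001,101,g1,100; the shuffle then groups both under the same key for the reducer.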

    //reduce

    package com.rabbit.hadoop.hive.join;

    import java.io.IOException;
    import java.util.ArrayList;

    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;

    public class DataJoinReducer extends Reducer<LongWritable, DataJoinWritable, NullWritable, Text> {

        private Text outputValue = new Text();

        @Override
        protected void reduce(LongWritable key, Iterable<DataJoinWritable> values, Context context) throws IOException, InterruptedException {

            String customerInfo = null;                         // the customer record for this userId
            ArrayList<String> orders = new ArrayList<String>(); // one customer can have several orders, so collect them in a list
            // The values for one key mix customer and order records, because both were emitted with the same userId as the key.
            // Hadoop reuses the value object between iterations, but only the String fields are kept here, which is safe.
            for (DataJoinWritable value : values) {
                if (value.getTag().equals("customer")) {
                    customerInfo = value.getInfo();
                } else if (value.getTag().equals("order")) {
                    orders.add(value.getInfo());
                }
            }
            // This block effectively gives left-join behaviour: a customer with no orders is still output on their own.
            if (orders.isEmpty()) {
                outputValue.set(customerInfo);
                context.write(NullWritable.get(), outputValue);
            }
            // The inner-join part of the result: one output row per order, prefixed with the customer info.
            for (String order : orders) {
                outputValue.set(customerInfo + "," + order);
                context.write(NullWritable.get(), outputValue);
            }
        }

    }
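
    One caveat of this sketch: if an order arrives whose userId has no matching customer record, customerInfo stays null and the joined row would begin with the literal text "null". A production job would either skip such keys or substitute a placeholder, depending on whether inner- or outer-join semantics are wanted.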

    //Driver

    package com.rabbit.hadoop.hive.join;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;


    public class DataJoinDriver extends Configured implements Tool {

        public int run(String[] args) throws Exception {

            Configuration configuration = getConf();

            Job job = Job.getInstance(configuration, this.getClass().getSimpleName());

            job.setJarByClass(DataJoinDriver.class);

            // args[0] and args[1] are the customer and order input files, args[2] is the output directory
            FileInputFormat.setInputPaths(job, new Path(args[0]), new Path(args[1]));
            FileOutputFormat.setOutputPath(job, new Path(args[2]));

            job.setMapperClass(DataJoinMapper.class);
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(DataJoinWritable.class);

            job.setReducerClass(DataJoinReducer.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(Text.class);

            boolean isSuccess = job.waitForCompletion(true);
            return isSuccess ? 0 : 1;
        }

        public static void main(String[] args) throws Exception {

            Configuration configuration = new Configuration();

            // local test paths; note that Windows backslashes must be escaped in Java string literals
            args = new String[] {"D:\\input-join\\customer.txt", "D:\\input-join\\order.txt", "D:\\outputjoin"};

            int status = ToolRunner.run(configuration, new DataJoinDriver(), args);

            System.exit(status);
        }

    }
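
    To run against real input on a cluster rather than the hard-coded local paths, remove (or guard) the args override in main, package the classes into a jar, and submit it with hadoop jar; the jar name and HDFS paths below are only placeholders:

    hadoop jar data-join.jar com.rabbit.hadoop.hive.join.DataJoinDriver /input-join/customer.txt /input-join/order.txt /output-join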

  • Original post: https://www.cnblogs.com/rabbit624/p/10553559.html