Hive has JOIN built in, but implementing the join ourselves as a MapReduce program is a good way to consolidate our understanding of MR.
For example:
A user table with three fields: userId, userName, address
An order table with four fields: userId, orderId, categoryId, price
User data:
100000001 a addressa
100000002 b addressb
100000003 c addressc
Order data:
100000001 101 g1 100
100000001 102 g2 200
100000003 103 g3 300
100000002 201 g1 100
100000002 202 g2 200
Now implement the join of the user table and the order table:
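For the sample data above, the joined result should look like this (the relative order of one user's orders may differ from run to run, because the reduce-side iteration order of the values is not guaranteed):
100000001,a,addressa,100000001,101,g1,100
100000001,a,addressa,100000001,102,g2,200
100000002,b,addressb,100000002,201,g1,100
100000002,b,addressb,100000002,202,g2,200
100000003,c,addressc,100000003,103,g3,300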
// Each line the MR job reads is just a string; on its own it does not tell us whether it is a user record or an order record. We define a wrapper class that tags every line with the kind of record it carries.
package com.rabbit.hadoop.hive.join;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
public class DataJoinWritable implements WritableComparable<DataJoinWritable>{
private String tag; //标签
private String info; //用户或者订单信息
// Hadoop instantiates Writables through reflection, so a public no-arg constructor is required.
public DataJoinWritable() {
}
public DataJoinWritable(String tag, String info) {
set(tag, info);
}
public void set(String tag,String info) {
this.tag = tag;
this.info = info;
}
public String getTag() {
return tag;
}
public void setTag(String tag) {
this.tag = tag;
}
public String getInfo() {
return info;
}
public void setInfo(String info) {
this.info = info;
}
public void write(DataOutput out) throws IOException {
out.writeUTF(tag);
out.writeUTF(info);
}
public void readFields(DataInput in) throws IOException {
tag = in.readUTF();
info = in.readUTF();
}
public int compareTo(DataJoinWritable o) {
// This class is only used as a map output value, so the comparison never drives the sort;
// compare by tag and then by info for completeness.
int cmp = this.tag.compareTo(o.tag);
return cmp != 0 ? cmp : this.info.compareTo(o.info);
}
@Override
public String toString() {
return tag + "," + info;
}
}
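As a quick sanity check of the write()/readFields() pair, the value can be serialized and deserialized by hand, which is what the framework does during the shuffle. The helper class below (DataJoinWritableRoundTrip is just an illustrative name, not part of the job) relies only on the constructors defined above:
package com.rabbit.hadoop.hive.join;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
public class DataJoinWritableRoundTrip {
    public static void main(String[] args) throws Exception {
        DataJoinWritable original = new DataJoinWritable("customer", "100000001,a,addressa");
        // Serialize the value the same way the framework does.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));
        // Deserialize into a fresh instance created through the no-arg constructor.
        DataJoinWritable copy = new DataJoinWritable();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(copy); // prints: customer,100000001,a,addressa
    }
}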
//map
package com.rabbit.hadoop.hive.join;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class DataJoinMapper extends Mapper<LongWritable, Text, LongWritable, DataJoinWritable> {
/**
 * Customer line: userId userName address
 *
 * Order line: userId orderId categoryId price
 */
private LongWritable outputKey = new LongWritable();
private DataJoinWritable outputValue = new DataJoinWritable();
@Override
protected void map(LongWritable key, Text value,Context context) throws IOException, InterruptedException {
String lineValue = value.toString();
String[] fields = lineValue.split(" ");
// Distinguish the two record types simply by field count: a customer line has 3 fields, an order line has 4. Anything else is treated as bad data and the line is dropped.
if (fields.length != 3 && fields.length != 4) {
return;
}
// Whichever kind of line it is, the first field is always the userId.
Long cid = Long.valueOf(fields[0]);
outputKey.set(cid);
// 3 fields: a customer line. Tag it "customer" and keep the whole record.
if (fields.length == 3) {
outputValue.set("customer",fields[0]+","+fields[1]+","+fields[2]);
}
// 4 fields: an order line. Tag it "order" and keep the whole record.
if (fields.length == 4) {
outputValue.set("order", fields[0]+","+fields[1]+","+fields[2]+","+fields[3]);
}
// Emit the record: the key is the userId, the value is the tagged customer or order record.
context.write(outputKey, outputValue);
}
}
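For the sample files, the map phase emits (key, value) pairs like the ones below; after the shuffle, every value that shares a userId arrives in the same reduce() call:
(100000001, customer,100000001,a,addressa)
(100000001, order,100000001,101,g1,100)
(100000001, order,100000001,102,g2,200)
(100000002, customer,100000002,b,addressb)
(100000002, order,100000002,201,g1,100)
(100000002, order,100000002,202,g2,200)
(100000003, customer,100000003,c,addressc)
(100000003, order,100000003,103,g3,300)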
//reduce
package com.rabbit.hadoop.hive.join;
import java.io.IOException;
import java.util.ArrayList;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class DataJoinReducer extends Reducer<LongWritable, DataJoinWritable, NullWritable, Text> {
private Text outputValue = new Text();
@Override
protected void reduce(LongWritable key, Iterable<DataJoinWritable> values,Context context) throws IOException, InterruptedException {
String customerInfo = null; // the customer record for this userId
ArrayList<String> orders = new ArrayList<String>(); // the order records; one customer can place several orders, so keep them in a list
// The values for this key mix the customer record with the order records, because they all share the same userId as the key.
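// Note: Hadoop reuses the same DataJoinWritable instance while iterating over the values, so copy the String fields out immediately instead of holding on to the value object itself.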
for (DataJoinWritable value : values) {
if(value.getTag().equals("customer")) {
customerInfo = value.getInfo();
}else if(value.getTag().equals("order")) {
orders.add(value.getInfo());
}
}
// This branch gives left-join semantics: a customer who never placed an order is still output, with just the customer fields.
if(orders.isEmpty()) {
outputValue.set(customerInfo);
context.write(NullWritable.get(), outputValue);
}
// Inner-join part: one output row per order. Skip orders whose userId has no matching
// customer record, so we never emit rows with a null customer part.
if (customerInfo != null) {
for (String order : orders) {
outputValue.set(customerInfo + "," + order);
context.write(NullWritable.get(), outputValue);
}
}
}
}
//Driver
package com.rabbit.hadoop.hive.join;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class DataJoinDriver extends Configured implements Tool {
public int run(String[] args) throws Exception {
Configuration configuration = getConf();
Job job = Job.getInstance(configuration,this.getClass().getSimpleName());
job.setJarByClass(DataJoinDriver.class);
FileInputFormat.setInputPaths(job, new Path(args[0]),new Path(args[1]));
FileOutputFormat.setOutputPath(job, new Path(args[2]));
job.setMapperClass(DataJoinMapper.class);
job.setMapOutputKeyClass(LongWritable.class);
job.setMapOutputValueClass(DataJoinWritable.class);
job.setReducerClass(DataJoinReducer.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(Text.class);
boolean isSuccess = job.waitForCompletion(true);
return isSuccess ? 0 : 1;
}
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
// Local test paths (note the escaped backslashes); when submitting to a cluster, pass the real paths as program arguments instead.
if (args.length == 0) {
args = new String[] {"D:\\input-join\\customer.txt", "D:\\input-join\\order.txt", "D:\\outputjoin"};
}
int status = ToolRunner.run(configuration, new DataJoinDriver(), args);
System.exit(status);
}
}
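To run the job on a cluster, package the three classes into a jar and submit it with hadoop jar, passing the customer file, the order file and the output directory as the three program arguments, e.g. hadoop jar data-join.jar com.rabbit.hadoop.hive.join.DataJoinDriver <customer path> <order path> <output dir> (the jar name here is only a placeholder). Because the driver runs through ToolRunner, generic options such as -D property=value can also be supplied on the command line.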