• mapjoin and reducejoin


    I. mapjoin

    The idea: load the small product table (pd.txt) into memory in setup() and join each order record with its product name on the map side, so the job needs no reduce phase.

    1. Mapper class

    package com.css.mapjoin;
    
    import java.io.BufferedReader;
    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.util.HashMap;
    
    import org.apache.commons.lang.StringUtils;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    
    // Idea: load the product table into memory, then substitute the product name on the map side before the record is written out
    public class CacheMapper extends Mapper<LongWritable, Text, Text, NullWritable>{
    
        HashMap<String, String> pdMap = new HashMap<>();
        
        // 1. Load the product table into memory
        @Override
        protected void setup(Context context) throws IOException {
            // Load the cached product file
            BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream("pd.txt"), "UTF-8"));
            String line;
            while (StringUtils.isNotEmpty(line = br.readLine())) {
                // Split on tabs
                String[] fields = line.split("\t");
                // Cache: product id -> product name
                pdMap.put(fields[0], fields[1]);
            }
            br.close();
        }
        
        // 2. Join and emit in map()
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Get the current line
            String line = value.toString();
            // Split on tabs
            String[] fields = line.split("\t");
            // Product id of the order record
            String pid = fields[1];
            // Look up the product name by that id
            String pName = pdMap.get(pid);
            // Append the product name
            line = line + "\t" + pName;
            // Emit the joined record
            context.write(new Text(line), NullWritable.get());
        }
    }
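
    Note on the lookup above: if an order references a pid that is missing from pd.txt, pdMap.get(pid) returns null and the literal string "null" is appended to the output line. A defensive variant (a sketch only, assuming Java 8+ for HashMap.getOrDefault) would be:

    // Sketch: fall back to an empty name for product ids missing from pd.txt
    String pName = pdMap.getOrDefault(pid, "");
    line = line + "\t" + pName;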

    2. Driver class

    package com.css.mapjoin;
    
    import java.io.IOException;
    import java.net.URI;
    import java.net.URISyntaxException;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    public class CacheDriver {
        public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
            // 1. Get the job instance
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf);
            // 2. Set the jar by class
            job.setJarByClass(CacheDriver.class);
            // 3. Set the custom mapper class (this is a map-only job, so no reducer)
            job.setMapperClass(CacheMapper.class);
            // 4. Set the final output key/value types
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(NullWritable.class);
            // 5. Set the input path and the output path
            FileInputFormat.setInputPaths(job, new Path("c:/table1029/in"));
            FileOutputFormat.setOutputPath(job, new Path("c:/table1029/out"));
            // 6. Add the product table to the distributed cache
            job.addCacheFile(new URI("file:///c:/inputcache/pd.txt"));
            // 7. Map-only job: set the number of reduce tasks to 0
            job.setNumReduceTasks(0);
            // 8. Submit the job and wait for completion
            boolean rs = job.waitForCompletion(true);
            System.out.println(rs ? 0 : 1);
        }
    }
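
    The mapper opens the cache file simply as "pd.txt" because addCacheFile makes the file available (typically via a symlink) in each task's working directory. If you would rather not hard-code the name, setup() can resolve it from the job context. The following is only a sketch under that assumption; besides the imports already shown for CacheMapper it needs java.net.URI and org.apache.hadoop.fs.Path:

    // Sketch: resolve the cached file through the context instead of a literal file name
    @Override
    protected void setup(Context context) throws IOException {
        URI[] cacheFiles = context.getCacheFiles();                      // URIs registered via job.addCacheFile(...)
        String cacheName = new Path(cacheFiles[0].getPath()).getName();  // e.g. "pd.txt"
        BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(cacheName), "UTF-8"));
        String line;
        while (StringUtils.isNotEmpty(line = br.readLine())) {
            String[] fields = line.split("\t");
            pdMap.put(fields[0], fields[1]);
        }
        br.close();
    }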

    3. Input files

    (1)order.txt
    201801    01    1
    201802    02    2
    201803    03    3
    201804    01    4
    201805    02    5
    201806    03    6
    
    (2)pd.txt
    01    苹果
    02    华为
    03    小米

    4. Output file part-m-00000

    201801    01    1    苹果
    201802    02    2    华为
    201803    03    3    小米
    201804    01    4    苹果
    201805    02    5    华为
    201806    03    6    小米

    II. reducejoin

    The idea: the mapper tags every record with the table it came from and emits it keyed by product id; the reducer then receives, for each product id, the product record plus all order records, and fills the product name into each order before writing it out.

    1. Mapper class

    package com.css.reducejoin;
    
    import java.io.IOException;
    
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.lib.input.FileSplit;
    
    public class TableMapper extends Mapper<LongWritable, Text, Text, TableBean>{
    
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            
            TableBean v = new TableBean();
            Text k = new Text();
            
            // Tell the two tables apart by the name of the input file
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            String name = inputSplit.getPath().getName();
            
            // Get the current line
            String line = value.toString();
            
            // Order table record
            if (name.contains("order.txt")) {
                // Split fields
                String[] fields = line.split("\t");
                // Fill the bean
                v.setOrder_id(fields[0]);
                v.setPid(fields[1]);
                v.setAmount(Integer.parseInt(fields[2]));
                v.setpName("");
                v.setFlag("0");
                // Key on the product id
                k.set(fields[1]);
            } else { // Product table record
                // Split fields
                String[] fields = line.split("\t");
                // Fill the bean
                v.setOrder_id("");
                v.setPid(fields[0]);
                v.setAmount(0);
                v.setpName(fields[1]);
                v.setFlag("1");
                // Key on the product id
                k.set(fields[0]);
            }
            context.write(k, v);
        }
    }
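
    Since every record in a split comes from the same file, the file name does not have to be looked up for each record. A small optimization sketch (assuming the default FileSplit-based input, as used above): resolve the name once in setup() and keep it in a field that map() reads, removing the two corresponding lines from map().

    // Sketch: resolve the source file name once per split
    private String name;

    @Override
    protected void setup(Context context) {
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        name = inputSplit.getPath().getName();
    }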

    2. Reducer class

    package com.css.reducejoin;
    
    import java.io.IOException;
    import java.lang.reflect.InvocationTargetException;
    import java.util.ArrayList;
    
    import org.apache.commons.beanutils.BeanUtils;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    
    public class TableReducer extends Reducer<Text, TableBean, TableBean, NullWritable>{
    
        @Override
        protected void reduce(Text key, Iterable<TableBean> values,
                Context context) throws IOException, InterruptedException {
            // Collection that holds all order records for this key
            ArrayList<TableBean> orderBean = new ArrayList<TableBean>();
            
            // Holds the product record for this key
            TableBean pdBean = new TableBean(); // its product name is copied into every order bean below
            
            for (TableBean v : values) {
                if ("0".equals(v.getFlag())) { // 订单表
                    // 1.创建一个临时变量  拷贝数据
                    TableBean tableBean = new TableBean();
                    // 2.拷贝
                    try {
                        BeanUtils.copyProperties(tableBean, v);
                    } catch (IllegalAccessException | InvocationTargetException e) {
                        e.printStackTrace();
                    }
                    orderBean.add(tableBean);
                }else {
                    try {
                        BeanUtils.copyProperties(pdBean, v);
                    } catch (IllegalAccessException | InvocationTargetException e) {
                        e.printStackTrace();
                    }
                }
            }
            
            // Join: emit every order record with its product name attached
            for (TableBean tableBean : orderBean) {
                // Attach the product name
                tableBean.setpName(pdBean.getpName());
                context.write(tableBean, NullWritable.get());
            }
        }
    }
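
    BeanUtils.copyProperties is needed because Hadoop reuses the same TableBean instance while iterating over values; without copying, every element of orderBean would end up referring to the last record read. If you prefer to avoid the commons-beanutils dependency, a hand-written copy works just as well. This is a sketch that mirrors the TableBean fields above; in reduce(), orderBean.add(copyOf(v)) would then replace the try/catch block:

    // Sketch: manual deep copy of a TableBean, equivalent to BeanUtils.copyProperties here
    private static TableBean copyOf(TableBean src) {
        TableBean dst = new TableBean();
        dst.setOrder_id(src.getOrder_id());
        dst.setPid(src.getPid());
        dst.setAmount(src.getAmount());
        dst.setpName(src.getpName());
        dst.setFlag(src.getFlag());
        return dst;
    }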

    3. Bean class (TableBean)

    package com.css.reducejoin;
    
    import java.io.DataInput;
    import java.io.DataOutput;
    import java.io.IOException;
    
    import org.apache.hadoop.io.Writable;
    
    public class TableBean implements Writable{
        
        // Fields covering both tables
        private String order_id; // order id
        private String pid; // product id
        private int amount; // quantity
        private String pName; // product name
        private String flag; // "0" = order table record, "1" = product table record
        
        public TableBean() {
            super();
        }
        
        public String getOrder_id() {
            return order_id;
        }
        public void setOrder_id(String order_id) {
            this.order_id = order_id;
        }
        public String getPid() {
            return pid;
        }
        public void setPid(String pid) {
            this.pid = pid;
        }
        public int getAmount() {
            return amount;
        }
        public void setAmount(int amount) {
            this.amount = amount;
        }
        public String getpName() {
            return pName;
        }
        public void setpName(String pName) {
            this.pName = pName;
        }
        public String getFlag() {
            return flag;
        }
        public void setFlag(String flag) {
            this.flag = flag;
        }
        
        @Override
        public void write(DataOutput out) throws IOException {
            out.writeUTF(order_id);
            out.writeUTF(pid);
            out.writeInt(amount);
            out.writeUTF(pName);
            out.writeUTF(flag);
        }
        
        @Override
        public void readFields(DataInput in) throws IOException {
            order_id = in.readUTF();
            pid = in.readUTF();
            amount = in.readInt();
            pName = in.readUTF();
            flag = in.readUTF();
        }
    
        @Override
        public String toString() {
            return order_id + "\t" + pName + "\t" + amount;
        }
    }
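
    write() and readFields() must handle the fields in exactly the same order, otherwise records arrive scrambled on the reduce side. A quick round-trip check (a standalone sketch, not part of the original post) can confirm this:

    package com.css.reducejoin;

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.DataInputStream;
    import java.io.DataOutputStream;
    import java.io.IOException;

    // Sketch: serialize a TableBean to memory and read it back to confirm the field order matches
    public class TableBeanRoundTrip {
        public static void main(String[] args) throws IOException {
            TableBean in = new TableBean();
            in.setOrder_id("201801");
            in.setPid("01");
            in.setAmount(1);
            in.setpName("苹果");
            in.setFlag("0");

            // Write to an in-memory buffer ...
            ByteArrayOutputStream bos = new ByteArrayOutputStream();
            in.write(new DataOutputStream(bos));

            // ... and read it back into a fresh bean
            TableBean out = new TableBean();
            out.readFields(new DataInputStream(new ByteArrayInputStream(bos.toByteArray())));

            System.out.println(out); // expected (tab-separated): 201801  苹果  1
        }
    }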

    4. Driver class

    package com.css.reducejoin;
    
    import java.io.IOException;
    import java.net.URISyntaxException;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    public class TableDriver {
        public static void main(String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf);
            
            job.setJarByClass(TableDriver.class);
            
            job.setMapperClass(TableMapper.class);
            job.setReducerClass(TableReducer.class);
            
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(TableBean.class);
            
            job.setOutputKeyClass(TableBean.class);
            job.setOutputValueClass(NullWritable.class);
            
            FileInputFormat.setInputPaths(job, new Path("c:/reduce1029/in"));
            FileOutputFormat.setOutputPath(job, new Path("c:/reduce1029/out"));
            
            boolean rs = job.waitForCompletion(true);
            System.out.println(rs ? 0 : 1);
        }
    }

    5. Input files

    (1)order.txt
    201801    01    1
    201802    02    2
    201803    03    3
    201804    01    4
    201805    02    5
    201806    03    6

    (2)pd.txt
    01    苹果
    02    华为
    03    小米

    6. Output file part-r-00000

    201804    苹果    4
    201801    苹果    1
    201805    华为    5
    201802    华为    2
    201806    小米    6
    201803    小米    3