MapReduce (4): MapReduce Examples


    1. Inverted Index Example (Multi-Job Chaining)

    1.1 Requirements and Analysis

    Given several text files, build an inverted index: for each word, list every file it appears in together with the number of occurrences in that file. Counting needs the map key to be the word---file pair, while the final index needs the key to be the word alone, so the problem is solved by chaining two MapReduce jobs. (figure: requirement analysis)
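
    For illustration (the original input files are not shown in the post; these hypothetical contents are consistent with the first-pass counts listed in section 1.2.1):

    a.txt:
    atguigu pingping
    atguigu ss
    atguigu ss

    b.txt:
    atguigu pingping
    atguigu pingping
    pingping ss

    c.txt:
    atguigu ss
    atguigu pingping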

    1.2 Code

    1.2.1 First Pass

    1) First pass: the OneIndexMapper class

    package com.dianchou.mr.index;
    
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.lib.input.FileSplit;
    
    import java.io.IOException;
    
    public class OneIndexMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    
        Text k = new Text();
        IntWritable v = new IntWritable(1);
        String name;
    
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Runs once per split: cache the name of the input file being read
            FileSplit split = (FileSplit) context.getInputSplit();
            name = split.getPath().getName();
        }
    
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Emit <word---filename, 1> for every word in the line
            String line = value.toString();
            String[] words = line.split(" ");
            for (String word : words) {
                k.set(word + "---" + name);
                context.write(k, v);
            }
        }
    }
    

    2) First pass: the OneIndexReducer class

    package com.dianchou.mr.index;
    
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    
    import java.io.IOException;
    
    
    public class OneIndexReducer extends Reducer<Text, IntWritable,Text,IntWritable> {
        // Output lines look like: atguigu---a.txt	3
    
        IntWritable v = new IntWritable();
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // Sum the 1s emitted for this word---file key
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            v.set(sum);
            context.write(key, v);
        }
    }
    

    3) First pass: the OneIndexDriver class

    package com.dianchou.mr.index;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    public class OneIndexDriver {
    
        public static void main(String[] args) throws Exception {
    
            // Adjust the input/output paths to match your local environment
            args = new String[] { "D:\\hadoop\\index-input", "D:\\hadoop\\index-output" };
    
            Configuration conf = new Configuration();
    
            Job job = Job.getInstance(conf);
            job.setJarByClass(OneIndexDriver.class);
    
            job.setMapperClass(OneIndexMapper.class);
            job.setReducerClass(OneIndexReducer.class);
    
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);
    
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
    
            FileInputFormat.setInputPaths(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
    
            job.waitForCompletion(true);
        }
    }
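
    A side note (not in the original code): because OneIndexReducer only sums integers, the same class could plausibly be registered as a combiner to pre-aggregate counts on the map side and shrink the shuffle. The standard hook in the driver would be:

    // Hypothetical addition to OneIndexDriver, after setReducerClass(...):
    job.setCombinerClass(OneIndexReducer.class);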
    

    4) First-pass output

    atguigu---a.txt	3
    atguigu---b.txt	2
    atguigu---c.txt	2
    pingping---a.txt	1
    pingping---b.txt	3
    pingping---c.txt	1
    ss---a.txt	2
    ss---b.txt	1
    ss---c.txt	1

    1.2.2 Second Pass

    The first job's output keys combine the word and the file name ("word---file"), so a second job splits each line on "---" and regroups the counts by word.

    1) Second pass: the TwoIndexMapper class

    package com.dianchou.mr.index;
    
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    
    import java.io.IOException;
    
    public class TwoIndexMapper extends Mapper<LongWritable, Text,Text,Text> {
    
        Text k = new Text();
        Text v = new Text();
    
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Input line: atguigu---a.txt	3  ==>  key: atguigu, value: a.txt	3
            String line = value.toString();
            String[] fields = line.split("---");
            k.set(fields[0]);
            v.set(fields[1]);
            context.write(k, v);
    
        }
    }
    

    2) Second pass: the TwoIndexReducer class

    package com.dianchou.mr.index;
    
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    
    import java.io.IOException;
    
    public class TwoIndexReducer extends Reducer<Text,Text,Text,Text> {
        Text v = new Text();
    
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // atguigu a.txt 3
            // atguigu b.txt 2
            // atguigu c.txt 2
            // atguigu c.txt-->2 b.txt-->2 a.txt-->3
    
            StringBuilder sb = new StringBuilder();
            for (Text value : values) {
                sb.append(value.toString().replace("\t", "-->") + "\t");
            }
            v.set(sb.toString());
            context.write(key, v);
        }
    }
    

    3) Second pass: the TwoIndexDriver class


    package com.dianchou.mr.index;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    public class TwoIndexDriver {
    
        public static void main(String[] args) throws Exception {
    
            // Adjust the input/output paths to match your local environment
            args = new String[] { "D:\\hadoop\\index-output", "D:\\hadoop\\index-output2" };
    
            Configuration config = new Configuration();
            Job job = Job.getInstance(config);
    
            job.setJarByClass(TwoIndexDriver.class);
            job.setMapperClass(TwoIndexMapper.class);
            job.setReducerClass(TwoIndexReducer.class);
    
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
    
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
    
            FileInputFormat.setInputPaths(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
    
            boolean result = job.waitForCompletion(true);
            System.exit(result?0:1);
        }
    }
    

    4) Final output

    atguigu	c.txt-->2	b.txt-->2	a.txt-->3
    pingping	c.txt-->1	b.txt-->3	a.txt-->1
    ss	c.txt-->1	b.txt-->1	a.txt-->2
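
    In this walkthrough the two drivers are simply run one after the other by hand. As a minimal sketch of chaining them inside one driver (my assumption, not part of the original post; it uses only the standard Job API, with the per-job configuration elided to comments), the second job starts only if the first succeeds:

    // Hypothetical combined driver: run job 1, then job 2 on its output.
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.mapreduce.Job;

    public class IndexChainDriver {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();

            Job job1 = Job.getInstance(conf, "one-index");
            // ... configure job1 exactly as in OneIndexDriver ...
            if (!job1.waitForCompletion(true)) {
                System.exit(1); // stop the chain if the first job fails
            }

            Job job2 = Job.getInstance(conf, "two-index");
            // ... configure job2 exactly as in TwoIndexDriver, reading job1's output ...
            System.exit(job2.waitForCompletion(true) ? 0 : 1);
        }
    }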

    2. TopN Example

    2.1 Requirements and Analysis

    Given phone-traffic records with tab-separated fields phone, upFlow, downFlow, sumFlow, output the 10 records with the largest total flow. (figure: requirement analysis)
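
    For illustration only (the original input file is not shown; these rows are hypothetical but follow the tab-separated format TopNMapper parses):

    13509468723	7335	110349	117684
    13975057813	11058	48243	59301
    13568436656	3597	25635	29232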

    2.2 Code

    1) The FlowBean class

    package com.dianchou.mr.top;
    
    import java.io.DataInput;
    import java.io.DataOutput;
    import java.io.IOException;
    import org.apache.hadoop.io.WritableComparable;
    
    public class FlowBean implements WritableComparable<FlowBean>{
    
        private long upFlow;
        private long downFlow;
        private long sumFlow;
    
        public FlowBean() {
            super();
        }
    
        public FlowBean(long upFlow, long downFlow) {
            super();
            this.upFlow = upFlow;
            this.downFlow = downFlow;
        }
    
        @Override
        public void write(DataOutput out) throws IOException {
            out.writeLong(upFlow);
            out.writeLong(downFlow);
            out.writeLong(sumFlow);
        }
    
        @Override
        public void readFields(DataInput in) throws IOException {
            upFlow = in.readLong();
            downFlow = in.readLong();
            sumFlow = in.readLong();
        }
    
        public long getUpFlow() {
            return upFlow;
        }
    
        public void setUpFlow(long upFlow) {
            this.upFlow = upFlow;
        }
    
        public long getDownFlow() {
            return downFlow;
        }
    
        public void setDownFlow(long downFlow) {
            this.downFlow = downFlow;
        }
    
        public long getSumFlow() {
            return sumFlow;
        }
    
        public void setSumFlow(long sumFlow) {
            this.sumFlow = sumFlow;
        }
    
        @Override
        public String toString() {
            return upFlow + "\t" + downFlow + "\t" + sumFlow;
        }
    
        public void set(long downFlow2, long upFlow2) { // note the (downFlow, upFlow) order
            downFlow = downFlow2;
            upFlow = upFlow2;
            sumFlow = downFlow2 + upFlow2;
        }
    
        @Override
        public int compareTo(FlowBean bean) {
            // Descending by total flow: a larger sumFlow sorts first
            int result;
            if (this.sumFlow > bean.getSumFlow()) {
                result = -1;
            }else if (this.sumFlow < bean.getSumFlow()) {
                result = 1;
            }else {
                result = 0;
            }
            return result;
        }
    }
    

    2) The TopNMapper class

    package com.dianchou.mr.top;
    
    import java.io.IOException;
    import java.util.Iterator;
    import java.util.TreeMap;
    
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    
    public class TopNMapper extends Mapper<LongWritable, Text, FlowBean, Text> {
    
        // A TreeMap holds the candidate records; keys stay sorted by FlowBean.compareTo
        private TreeMap<FlowBean, Text> flowMap = new TreeMap<FlowBean, Text>();
        private FlowBean kBean;
    
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    
            kBean = new FlowBean();
            Text v = new Text();
    
            // 1. Read one line
            String line = value.toString();

            // 2. Split on tabs: phone, upFlow, downFlow, sumFlow
            String[] fields = line.split("\t");

            // 3. Populate the bean and the value
            String phoneNum = fields[0];
            long upFlow = Long.parseLong(fields[1]);
            long downFlow = Long.parseLong(fields[2]);
            long sumFlow = Long.parseLong(fields[3]);

            kBean.setDownFlow(downFlow);
            kBean.setUpFlow(upFlow);
            kBean.setSumFlow(sumFlow);

            v.set(phoneNum);

            // 4. Add the record to the TreeMap
            flowMap.put(kBean, v);

            // 5. Cap the TreeMap at 10 entries: evict the record with the
            //    smallest total flow (lastKey, because the order is descending)
            if (flowMap.size() > 10) {
                flowMap.remove(flowMap.lastKey());
            }
        }
    
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            // 6. Emit the surviving top-10 records
            Iterator<FlowBean> bean = flowMap.keySet().iterator();
            while (bean.hasNext()) {
                FlowBean k = bean.next();
                context.write(k, flowMap.get(k));
            }
        }
    }
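
    One caveat worth noting (my observation, not from the original post): TreeMap treats keys whose compareTo returns 0 as duplicates, so two phones with exactly the same sumFlow cannot coexist in the map and one record is silently overwritten. Breaking ties in compareTo (for example, by phone number) would avoid this. A minimal standalone demonstration of the TreeMap behavior:

    import java.util.TreeMap;

    public class TreeMapTieDemo {
        public static void main(String[] args) {
            // Comparator orders descending, like FlowBean.compareTo
            TreeMap<Long, String> map = new TreeMap<>((a, b) -> Long.compare(b, a));
            map.put(100L, "13500000001");
            map.put(100L, "13500000002"); // same "sumFlow": overwrites the first entry
            System.out.println(map);      // {100=13500000002} -- only one survives
        }
    }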
    

    3) The TopNReducer class

    package com.dianchou.mr.top;
    
    import java.io.IOException;
    import java.util.Iterator;
    import java.util.TreeMap;
    
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    
    public class TopNReducer extends Reducer<FlowBean, Text, Text, FlowBean> {
    
        // A TreeMap holds candidates across reduce calls; keys stay sorted
        TreeMap<FlowBean, Text> flowMap = new TreeMap<FlowBean, Text>();
    
        @Override
        protected void reduce(FlowBean key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
    
            for (Text value : values) {
                FlowBean bean = new FlowBean();
                bean.set(key.getDownFlow(), key.getUpFlow());
    
                // 1. Add the record to the TreeMap
                flowMap.put(bean, new Text(value));

                // 2. Cap the TreeMap at 10 entries: evict the smallest total flow
                if (flowMap.size() > 10) {
                    flowMap.remove(flowMap.lastKey());
                }
            }
        }
    
        @Override
        protected void cleanup(Reducer<FlowBean, Text, Text, FlowBean>.Context context) throws IOException, InterruptedException {
            // 3. Emit the final top-10: phone number as key, flow bean as value
            Iterator<FlowBean> it = flowMap.keySet().iterator();
            while (it.hasNext()) {
                FlowBean v = it.next();
                context.write(new Text(flowMap.get(v)), v);
            }
        }
    }
    

    4) The TopNDriver class

    package com.dianchou.mr.top;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    public class TopNDriver {
    
        public static void main(String[] args) throws Exception {
    
            // Adjust the input/output paths to match your local environment
            args = new String[] { "D:\\hadoop\\top-input", "D:\\hadoop\\top-output" };

            // 1. Get the configuration and a job instance
            Configuration configuration = new Configuration();
            Job job = Job.getInstance(configuration);

            // 2. Specify the jar that contains this driver
            job.setJarByClass(TopNDriver.class);

            // 3. Set the Mapper and Reducer classes
            job.setMapperClass(TopNMapper.class);
            job.setReducerClass(TopNReducer.class);

            // 4. Set the mapper output key/value types
            job.setMapOutputKeyClass(FlowBean.class);
            job.setMapOutputValueClass(Text.class);

            // 5. Set the final output key/value types
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(FlowBean.class);

            // 6. Set the input and output directories
            FileInputFormat.setInputPaths(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));

            // 7. Submit the job and wait for completion
            boolean result = job.waitForCompletion(true);
            System.exit(result ? 0 : 1);
        }
    }
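
    A hedged note on correctness (my addition, not from the original post): every mapper keeps only its local top 10, so the global top 10 is valid only if all those candidates reach a single reducer. One reduce task is the Hadoop default, but the driver could make it explicit:

    // Hypothetical addition to TopNDriver: pin the job to one reducer so
    // TopNReducer sees every mapper's local top-10 candidates.
    job.setNumReduceTasks(1);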
    

    3. Common Friends Example

    3.1 Requirements and Analysis

    In the friend-list data below, the user before the colon lists all of their friends after the colon (the friendship in this data is one-way). Find every pair of users who have common friends, and list who those common friends are.

    Input data:

    A:B,C,D,F,E,O
    B:A,C,E,K
    C:F,A,D,I
    D:A,E,F,L
    E:B,C,D,M,L
    F:A,B,C,D,E,O,M
    G:A,C,D,E,F
    H:A,C,D,E,O
    I:A,O
    J:B,O
    K:A,C,D
    L:D,E,F
    M:E,F,G
    O:A,H,I,J

    Analysis:

    First work out, for each user A, B, C, ..., who has that user in their friend list: the first job emits <friend, person> for every entry and groups by friend. The second job then takes each friend's list of people, sorts it, and emits every pair as <person1-person2, friend>; grouping by the pair key collects all common friends of that pair, as sketched in the code below.
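
    A small standalone sketch (my illustration, plain Java rather than MapReduce) of that pair-generation step, using friend C's first-pass line from the output below:

    import java.util.Arrays;

    public class PairDemo {
        public static void main(String[] args) {
            // People column of the first-pass line "C	A,E,B,H,F,G,K,"
            String[] persons = {"A", "E", "B", "H", "F", "G", "K"};
            Arrays.sort(persons); // canonical order: emit A-B, never B-A

            for (int i = 0; i < persons.length - 1; i++) {
                for (int j = i + 1; j < persons.length; j++) {
                    // Every pair printed here has C as a common friend
                    System.out.println(persons[i] + "-" + persons[j] + "\tC");
                }
            }
        }
    }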

    First-pass output:

    A	I,K,C,B,G,F,H,O,D,
    B	A,F,J,E,
    C	A,E,B,H,F,G,K,
    D	G,C,K,A,L,F,E,H,
    E	G,M,L,H,A,F,B,D,
    F	L,M,D,C,G,A,
    G	M,
    H	O,
    I	O,C,
    J	O,
    K	B,
    L	D,E,
    M	E,F,
    O	A,H,I,J,F,

    Second-pass output:

    A-B	E C
    A-C	D F
    A-D	E F
    A-E	D B C
    A-F	O B C D E
    A-G	F E C D
    A-H	E C D O
    A-I	O
    A-J	O B
    A-K	D C
    A-L	F E D
    A-M	E F
    B-C	A
    B-D	A E
    B-E	C
    B-F	E A C
    B-G	C E A
    B-H	A E C
    B-I	A
    B-K	C A
    B-L	E
    B-M	E
    B-O	A
    C-D	A F
    C-E	D
    C-F	D A
    C-G	D F A
    C-H	D A
    C-I	A
    C-K	A D
    C-L	D F
    C-M	F
    C-O	I A
    D-E	L
    D-F	A E
    D-G	E A F
    D-H	A E
    D-I	A
    D-K	A
    D-L	E F
    D-M	F E
    D-O	A
    E-F	D M C B
    E-G	C D
    E-H	C D
    E-J	B
    E-K	C D
    E-L	D
    F-G	D C A E
    F-H	A D O E C
    F-I	O A
    F-J	B O
    F-K	D C A
    F-L	E D
    F-M	E
    F-O	A
    G-H	D C E A
    G-I	A
    G-K	D A C
    G-L	D F E
    G-M	E F
    G-O	A
    H-I	O A
    H-J	O
    H-K	A C D
    H-L	D E
    H-M	E
    H-O	A
    I-J	O
    I-K	A
    I-O	A
    K-L	D
    K-O	A
    L-M	E F

    3.2 Code

    1) First-pass Mapper class

    package com.dianchou.mr.friends;
    
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    
    import java.io.IOException;
    
    public class OneShareFriendsMapper extends Mapper<LongWritable, Text, Text, Text> {
    
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
    
            // 1. Read one line, e.g. A:B,C,D,F,E,O
            String line = value.toString();
            // Skip blank lines (e.g. at the end of the file)
            if ("".equals(line)) {
                return;
            }

            // 2. Split the person from the friend list
            String[] fields = line.split(":");

            // 3. Extract the person and their friends
            String person = fields[0];
            String[] friends = fields[1].split(",");

            // 4. Emit <friend, person>: grouping by friend tells us who
            //    counts each user as a friend
            for (String friend : friends) {
                context.write(new Text(friend), new Text(person));
            }
        }
    }
    

    2) First-pass Reducer class

    package com.dianchou.mr.friends;
    
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    
    import java.io.IOException;
    
    public class OneShareFriendsReducer extends Reducer<Text, Text, Text, Text> {
    
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)throws IOException, InterruptedException {
            StringBuffer sb = new StringBuffer();
            // 1. Concatenate everyone whose friend list contains this key
            for(Text person: values){
                sb.append(person).append(",");
            }
            // 2. Emit <friend, person,person,...>
            context.write(key, new Text(sb.toString()));
        }
    }
    

    3) First-pass Driver class

    package com.dianchou.mr.friends;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    public class OneShareFriendsDriver {
    
        public static void main(String[] args) throws Exception {
            // Adjust the input/output paths to match your local environment
            args = new String[] { "D:\\hadoop\\friends-input", "D:\\hadoop\\friends-output" };
    
            // 1. Get a job instance
            Configuration configuration = new Configuration();
            Job job = Job.getInstance(configuration);

            // 2. Specify the jar that contains this driver
            job.setJarByClass(OneShareFriendsDriver.class);

            // 3. Set the Mapper and Reducer classes
            job.setMapperClass(OneShareFriendsMapper.class);
            job.setReducerClass(OneShareFriendsReducer.class);

            // 4. Set the mapper output key/value types
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);

            // 5. Set the final output key/value types
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);

            // 6. Set the input and output directories
            FileInputFormat.setInputPaths(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));

            // 7. Submit and wait for completion
            boolean result = job.waitForCompletion(true);
    
            System.exit(result?0:1);
        }
    }
    

    4) Second-pass Mapper class

    package com.dianchou.mr.friends;
    
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    
    import java.io.IOException;
    import java.util.Arrays;
    
    public class TwoShareFriendsMapper extends Mapper<LongWritable, Text, Text, Text> {
    
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
    
            // Input line: A	I,K,C,B,G,F,H,O,D,
            // i.e. friend <TAB> person,person,person,...
            String line = value.toString();
            String[] friend_persons = line.split("\t");
    
            String friend = friend_persons[0];
            String[] persons = friend_persons[1].split(",");
    
            // Sort so each pair key is canonical (emit A-B, never B-A)
            Arrays.sort(persons);

            for (int i = 0; i < persons.length - 1; i++) {
                for (int j = i + 1; j < persons.length; j++) {
                    // Emit <person1-person2, friend>: all common friends of the
                    // same pair meet in the same reduce call
                    context.write(new Text(persons[i] + "-" + persons[j]), new Text(friend));
                }
            }
        }
    }
    

    5) Second-pass Reducer class

    package com.dianchou.mr.friends;
    
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    
    import java.io.IOException;
    
    public class TwoShareFriendsReducer extends Reducer<Text, Text, Text, Text> {
    
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)	throws IOException, InterruptedException {
    
            StringBuffer sb = new StringBuffer();

            // Concatenate every common friend of this pair
            for (Text friend : values) {
                sb.append(friend).append(" ");
            }
    
            context.write(key, new Text(sb.toString()));
        }
    }
    

    6) Second-pass Driver class

    package com.dianchou.mr.friends;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    public class TwoShareFriendsDriver {
    
        public static void main(String[] args) throws Exception {
    
            // Adjust the input/output paths to match your local environment
            args = new String[] { "D:\\hadoop\\friends-output", "D:\\hadoop\\friends-output2" };
    
            // 1. Get a job instance
            Configuration configuration = new Configuration();
            Job job = Job.getInstance(configuration);

            // 2. Specify the jar that contains this driver
            job.setJarByClass(TwoShareFriendsDriver.class);

            // 3. Set the Mapper and Reducer classes
            job.setMapperClass(TwoShareFriendsMapper.class);
            job.setReducerClass(TwoShareFriendsReducer.class);

            // 4. Set the mapper output key/value types
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);

            // 5. Set the final output key/value types
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);

            // 6. Set the input and output directories
            FileInputFormat.setInputPaths(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));

            // 7. Submit and wait for completion
            boolean result = job.waitForCompletion(true);
            System.exit(result?0:1);
        }
    }
    