• Hadoop HDFS: Practicing MapReduce in IDEA (Part 8)


    HDFS basic commands

    • Create a directory
      • hdfs dfs -mkdir -p /data/dist1
    • Upload a file
      • hdfs dfs -put dist.txt /data/dist1
    • List the files under a directory recursively
      • hdfs dfs -ls -R /data/topn
    • View a specific file
      • hdfs dfs -cat /data/topn/output3/part-r-00000

    Exercise: for each month, select the highest and the second-highest temperature; the two temperatures must not come from the same day.

    Sample data (the whitespace between fields is a tab character '\t').
    Record format: date, city code, temperature. One record per line:

    2019-6-1 22:22:22    1    39
    2019-5-21 22:22:22    3    33
    2019-6-1 22:22:22    1    38
    2019-6-2 22:22:22    2    31
    2018-3-11 22:22:22    3    18
    2018-4-23 22:22:22    1    22
    1970-8-23 22:22:22    2    23
    1970-8-8 22:22:22    1    32
    2019-6-1 22:22:22    1    39
    2019-5-21 22:22:22    3    33
    2019-6-1 22:22:22    1    44
    2019-6-2 22:22:22    2    50
    2018-3-11 22:22:22    3    18
    2018-4-23 22:22:22    1    65
    1970-8-23 22:22:22    2    66
    1970-8-8 22:22:22    1    77

    Create the MyTopN class:

    
    
    package com.xiaoke.mapreduce.topn;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

    public class MyTopN {

        public static void main(String[] args) throws Exception {
            Configuration configuration = new Configuration(true);

            // Tell the framework the job is submitted from a Windows (cross-platform) client
            configuration.set("mapreduce.app-submission.cross-platform", "true");

            configuration.set("mapreduce.framework.name", "local");

            // The job parameters below follow the example in the Job javadoc
            Job job = Job.getInstance(configuration);

            job.setJarByClass(MyTopN.class);
            //job.setJar("D:\\code\\mayun_hadoop\\test\\hadoop\\target\\hadoop-hdfs-1.0-SNAPSHOT.jar");

            // Specify various job-specific parameters
            job.setJobName("topN1");

            Path inputPath = new Path("/data/topn/input1");
            TextInputFormat.setInputPaths(job, inputPath);

            Path outputPath = new Path("/data/topn/output2");
            if (outputPath.getFileSystem(configuration).exists(outputPath))
                outputPath.getFileSystem(configuration).delete(outputPath, true);
            TextOutputFormat.setOutputPath(job, outputPath);

            // Map settings
            job.setMapperClass(TMapper.class);
            job.setMapOutputKeyClass(TKey.class);
            job.setMapOutputValueClass(IntWritable.class);

            // Partitioning: we group by year + month, and a partition must be at least as coarse
            // as a group, so partition by year. The partitioner's only contract is that equal
            // keys receive the same partition number.
            job.setPartitionerClass(TPartitioner.class);
            // Sort by year, month, temperature, with temperature descending
            job.setSortComparatorClass(TSortComparator.class);

            // Reduce settings
            job.setGroupingComparatorClass(TGroupingComparator.class); // group by year + month
            job.setReducerClass(TReducer.class);

            job.waitForCompletion(true);
        }
    }
     

    Create TKey.class

    package com.xiaoke.mapreduce.topn;
    
    import org.apache.hadoop.io.WritableComparable;
    
    import java.io.DataInput;
    import java.io.DataOutput;
    import java.io.IOException;
    
    // A custom key type must implement WritableComparable: serialization/deserialization plus a comparator
    public class TKey implements WritableComparable<TKey> {
    
        private int year;
        private int month;
        private int day;
        // temperature
        private int wd;
    
    
        @Override
        public int compareTo(TKey that) {
            // Optional here: the job registers TSortComparator as the sort comparator, so this compareTo is not used for the shuffle sort
            int year = Integer.compare(this.year, that.getYear());
            if (year == 0) {
                int month = Integer.compare(this.month, that.getMonth());
                if (month == 0) {
                    return Integer.compare(this.day, that.getDay());
                }
                return month;
            }
            return year;
        }
    
        @Override
        public void write(DataOutput out) throws IOException {
            out.writeInt(year);
            out.writeInt(month);
            out.writeInt(day);
            out.writeInt(wd);
    
        }
    
        @Override
        public void readFields(DataInput in) throws IOException {
            this.year = in.readInt();
            this.month = in.readInt();
            this.day = in.readInt();
            this.wd = in.readInt();
        }
    
    
        public int getYear() {
            return year;
        }
    
        public void setYear(int year) {
            this.year = year;
        }
    
        public int getMonth() {
            return month;
        }
    
        public void setMonth(int month) {
            this.month = month;
        }
    
        public int getDay() {
            return day;
        }
    
        public void setDay(int day) {
            this.day = day;
        }
    
        public int getWd() {
            return wd;
        }
    
        public void setWd(int wd) {
            this.wd = wd;
        }
    }
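
    TReducer further down prints key.toString() for debugging, but TKey does not override toString(), so only the default Object representation would appear. A minimal override you could add (an addition, not part of the original code):

    @Override
    public String toString() {
        // e.g. "2019-6-1 39"
        return year + "-" + month + "-" + day + " " + wd;
    }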

    Create TMapper.class

    package com.xiaoke.mapreduce.topn;
    
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.util.StringUtils;
    
    import java.io.IOException;
    import java.text.ParseException;
    import java.text.SimpleDateFormat;
    import java.util.Calendar;
    import java.util.Date;
    
    public class TMapper extends Mapper<LongWritable, Text, TKey, IntWritable> {
    
        TKey mkey = new TKey();
        IntWritable mval = new IntWritable();
    
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
            // Good habit: never be overconfident about the input format
            // value:  2019-6-1 22:22:22    1    31
            String[] split = StringUtils.split(value.toString(), '\t');
            // 2019-6-1 22:22:22
            // 1
            // 31
    
            try {
                Date data = sdf.parse(split[0]);
                Calendar calendar = Calendar.getInstance();
                calendar.setTime(data);
                mkey.setYear(calendar.get(Calendar.YEAR));
                mkey.setMonth(calendar.get(Calendar.MONTH) + 1);
                mkey.setDay(calendar.get(Calendar.DAY_OF_MONTH));
    
                int wd = Integer.parseInt(split[2]);
                mkey.setWd(wd);
                mval.set(wd);
    
            // TKey's compareTo orders by year/month/day
            // The map output is key = (year, month, day, temperature) and value = temperature
                context.write(mkey, mval);
            } catch (ParseException e) {
                e.printStackTrace();
            }
    
    
        }
    }
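
    For example, for the first record above, 2019-6-1 22:22:22 (tab) 1 (tab) 39, map() emits:

    // input : "2019-6-1 22:22:22" \t "1" \t "39"
    // output: key = TKey(year=2019, month=6, day=1, wd=39), value = 39
    // the city code ("1") is not used yet; it comes into play in the location-replacement step later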

    Create TPartitioner.class

    package com.xiaoke.mapreduce.topn;
    
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.mapreduce.Partitioner;
    
    public class TPartitioner extends Partitioner<TKey, IntWritable> {
    
        // 1. Keep the partitioner simple.
        // We group by year + month; a partition must be at least as coarse as a group, so partition by year.
        // The partitioner's only contract: equal keys must receive the same partition number.
    
        @Override
        public int getPartition(TKey tKey, IntWritable intWritable, int numPartitions) {
            // numPartitions is the number of reduce tasks configured for the job
            return tKey.getYear() % numPartitions;
        }
    }
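
    Note that MyTopN above never calls job.setNumReduceTasks(), so the job runs with Hadoop's default of a single reducer and every key ends up in partition 0. To actually spread the year groups across reducers you would set the reduce count yourself; a minimal sketch (the value 3 is only an illustrative assumption):

    // in MyTopN, before job.waitForCompletion(true)
    job.setNumReduceTasks(3); // keys are then routed by year % 3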

    Create TSortComparator.class

    package com.xiaoke.mapreduce.topn;
    
    import org.apache.hadoop.io.RawComparator;
    import org.apache.hadoop.io.WritableComparable;
    import org.apache.hadoop.io.WritableComparator;
    
    public class TSortComparator extends WritableComparator {
    
        // The no-arg constructor must register the key class; passing 'true' asks the parent to instantiate keys for comparison
        public TSortComparator() {
            super(TKey.class, true);
        }
    
        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            TKey k1 = (TKey) a;
            TKey k2 = (TKey) b;
            // Sort by year, month, temperature, with temperature in descending order:
            int c1 = Integer.compare(k1.getYear(), k2.getYear());
            if (c1 == 0) {
                int c2 = Integer.compare(k1.getMonth(), k2.getMonth());
                if (c2 == 0) {
                    return -Integer.compare(k1.getWd(), k2.getWd());
                }
                return c2;
            }
            return c1;
        }
    
    
    }

    Create TGroupingComparator.class

    package com.xiaoke.mapreduce.topn;
    
    import org.apache.hadoop.io.RawComparator;
    import org.apache.hadoop.io.WritableComparable;
    import org.apache.hadoop.io.WritableComparator;
    
    public class TGroupingComparator extends WritableComparator {
    
        public TGroupingComparator() {
            super(TKey.class, true);
        }
    
        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            TKey k1 = (TKey) a;
            TKey k2 = (TKey) b;
            // Group by year and month
            int c1 = Integer.compare(k1.getYear(), k2.getYear());
            if (c1 == 0) {
                return Integer.compare(k1.getMonth(), k2.getMonth());
            }
            return c1;
        }
    }

    Create TReducer.class

    package com.xiaoke.mapreduce.topn;
    
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    
    import java.io.IOException;
    import java.util.Iterator;
    
    public class TReducer extends Reducer<TKey, IntWritable, Text, IntWritable> {
    
        Text rkey = new Text();
        IntWritable rval = new IntWritable();
    
        @Override
        protected void reduce(TKey key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // 1970-6-4 33   33
            // 1970-6-4 32   32
            // 1970-6-22 31   31
            // 1970-6-4 22   22
            System.out.println("- - - --- reduce - - TKey - -  -" + key.toString());
    
            // Question: while iterating over the values, does the key change? Yes, it does.
            // The framework reuses the same key object and refills its fields for each value (a reused reference, not a copy).
            Iterator<IntWritable> iterator = values.iterator();
            int flg = 0;
            int day = 0;
            while (iterator.hasNext()) {
                IntWritable val = iterator.next();
                System.out.println("TReducer values key - - - -" + val);
    
                if (flg == 0) {
                    rkey.set(key.getYear() + "-" + key.getMonth() + "-" + key.getDay());
                    rval.set(key.getWd());
                    context.write(rkey, rval);
                    flg++;
                    day = key.getDay();
                }
    
                if (flg != 0 && day != key.getDay()) {
                    rkey.set(key.getYear() + "-" + key.getMonth() + "-" + key.getDay());
                    rval.set(key.getWd());
                    context.write(rkey, rval);
                    break;
                }
    
            }
    
        }
    }
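
    A short trace for the 2019-6 group in the sample data: after the sort (temperature descending within a year/month) and grouping by year/month, the reducer receives a single call whose value iterator walks these records:

    2019-6-2 50
    2019-6-1 44
    2019-6-1 39
    2019-6-1 39
    2019-6-1 38
    2019-6-2 31

    The first value writes "2019-6-2 50". As the iterator advances, the key object's fields are refilled with the current record, so the first record from a different day ("2019-6-1 44") is written as the second-highest and the loop breaks. This matches the final result shown below.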

    Notes

    • Local standalone mode:
      configuration.set("mapreduce.framework.name", "local");
    • Cluster mode (submitting the jar from the IDE):
      // comment out configuration.set("mapreduce.framework.name", "local");
      job.setJar("D:\\code\\mayun_hadoop\\test\\hadoop\\target\\hadoop-hdfs-1.0-SNAPSHOT.jar");

    Final result

    1970-8-8    77
    1970-8-23    66
    2018-3-11    18
    2018-4-23    65
    2019-5-21    33
    2019-6-2    50
    2019-6-1    44

    Replacing the city codes with city names

    Approach: decide whether the replacement should happen in the map or in the reduce.

    • If the lookup data is small, do it in the map; if it is large, do it in the reduce.

    Original city-code data

    [root@ke01 bigdata]# hdfs dfs -cat /data/dist1/dist.txt
    1 beijing
    2 shanghai
    3 Guangzhou

    Add to the MyTopN class (before job.waitForCompletion(true)):

    job.addCacheFile(new Path("/data/dist1/dist.txt").toUri());

    Add to the TKey class, and include it in the write/readFields streams:

    private String location;
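
    The post shows only the new field; a minimal sketch of the matching serialization changes in TKey (the getter/setter names are assumptions, and writeUTF/readUTF is just one symmetric way to serialize the string):

    // in write(DataOutput out), after the existing writeInt calls:
    out.writeUTF(location);

    // in readFields(DataInput in), after the existing readInt calls:
    this.location = in.readUTF();

    public String getLocation() { return location; }
    public void setLocation(String location) { this.location = location; }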

    Override setup() in TMapper; it runs once before any map() call:

        public HashMap<String, String> dict = new HashMap<String, String>();

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // dist.txt content:
            //   1    beijing
            //   2    shanghai
            //   3    Guangzhou
            URI[] cacheFiles = context.getCacheFiles();
            Path path = new Path(cacheFiles[0].getPath());
            // The distributed cache localizes the file into the task's working directory,
            // so it can be opened by its plain file name.
            BufferedReader bufferedReader = new BufferedReader(new FileReader(new File(path.getName())));
            String line = bufferedReader.readLine();
            while (line != null) {
                String[] split = line.split(" ");
                dict.put(split[0], split[1]);
                line = bufferedReader.readLine();
            }
            bufferedReader.close();
        }
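
    The setup() snippet above additionally assumes these imports in TMapper (not shown in the original):

    import org.apache.hadoop.fs.Path;

    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileReader;
    import java.net.URI;
    import java.util.HashMap;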
    
    
    Finally, inside map(), set the location on the key before writing it out:
                mkey.setLocation(dict.get(split[1]));
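
    Judging from the output format below (date_city), TReducer also has to include the location when it builds the output key; a sketch of that change, assuming a getLocation() getter on TKey:

    rkey.set(key.getYear() + "-" + key.getMonth() + "-" + key.getDay() + "_" + key.getLocation());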

    Final output:

    [root@ke01 bigdata]# hdfs dfs -cat /data/topn/output3/part-r-00000
    1970-8-8_beijing    77
    1970-8-23_shanghai    66
    2018-3-11_Guangzhou    18
    2018-4-23_beijing    65
    2019-5-21_Guangzhou    33
    2019-6-2_shanghai    50
    2019-6-1_beijing    44

    Code: https://gitee.com/Xiaokeworksveryhard/big-data.git