• HDFS小文件处理——Mapper处理


    处理小文件的时候,可以通过org.apache.hadoop.io.SequenceFile.Writer类将所有文件写出到一个seq文件中。

    大致流程如下:

    实现代码:

    package study.smallfile.sequence_one;
    
    import java.io.IOException;
    
    
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FSDataInputStream;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.BytesWritable;
    import org.apache.hadoop.io.IOUtils;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.SequenceFile;
    import org.apache.hadoop.io.SequenceFile.CompressionType;
    import org.apache.hadoop.io.SequenceFile.Writer;
    import org.apache.hadoop.io.SequenceFile.Writer.Option;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    /**
     * Packs the small files found under {@code LOGS_PATH} into a single
     * block-compressed SequenceFile ({@code OUT_PATH/mapper.seq}) by running a
     * map-only MapReduce job.
     *
     * <p>Each record of the SequenceFile has the source file's full path as its
     * key ({@link Text}) and the file's raw bytes as its value
     * ({@link BytesWritable}).
     *
     * <p>NOTE(review): every map task writes to the same fixed output path, so
     * the job input should be arranged so that only one map task runs.
     */
    public class MapperDemo {

        private static final String INPUT_PATH = "hdfs://cluster1/smallfile/blankfile";
        private static final String OUT_PATH = "hdfs://cluster1/smallfile/combined/map";
        /** Directory whose files are packed into the SequenceFile. */
        private static final String LOGS_PATH = "hdfs://cluster1/smallfile/logs";
        /** File system handle used by the driver while preparing the job. */
        static FileSystem fileSystem;

        /**
         * Configures and runs the map-only job, blocking until it finishes.
         *
         * <p>Any existing output directory is deleted recursively before the job
         * is submitted.
         *
         * @throws IOException if the job completes unsuccessfully
         * @throws Exception   if the job cannot be configured or submitted
         */
        public void CombinedFile() throws Exception {
            Job job = Job.getInstance();

            job.setJarByClass(MapperDemo.class);
            job.setJobName(MapperDemo.class.getSimpleName());

            // Mapper class
            job.setMapperClass(MapperDemo.CombinedMapper.class);
            // Output key/value types
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(BytesWritable.class);
            // Map-only job: no reduce tasks
            job.setNumReduceTasks(0);
            // Input path
            FileInputFormat.setInputPaths(job, new Path(INPUT_PATH));
            // Remove the output directory if it already exists
            Path outdir = new Path(OUT_PATH);
            fileSystem = FileSystem.get(job.getConfiguration());
            if (fileSystem.exists(outdir)) {
                fileSystem.delete(outdir, true);
            }

            // Output path
            FileOutputFormat.setOutputPath(job, outdir);

            // Surface a failed job to the caller instead of silently returning.
            if (!job.waitForCompletion(true)) {
                throw new IOException("Job " + job.getJobName() + " failed");
            }

        }

        /**
         * Mapper that ignores its input records and, once the task has consumed
         * them all, appends every regular file under {@code LOGS_PATH} to a
         * SequenceFile.
         */
        static class CombinedMapper extends
                Mapper<LongWritable, Text, Text, BytesWritable> {
            Writer writer = null;
            FileStatus[] files;

            Text outKey = new Text();
            BytesWritable outValue = new BytesWritable();

            /**
             * File system obtained inside the task. The driver's static
             * {@code fileSystem} field is deliberately not used here.
             */
            private FileSystem taskFileSystem;

            /**
             * Intentionally empty: input records are ignored and nothing is
             * emitted through the context. All work is done in
             * {@link #cleanup}.
             */
            @Override
            protected void map(LongWritable key, Text value,
                    Mapper<LongWritable, Text, Text, BytesWritable>.Context context)
                    throws IOException, InterruptedException {
            }

            /**
             * Appends every listed regular file to the SequenceFile and closes
             * the writer.
             *
             * @throws IOException if a file cannot be read or appended, or if
             *                     closing the writer fails
             */
            @Override
            protected void cleanup(
                    Mapper<LongWritable, Text, Text, BytesWritable>.Context context)
                    throws IOException, InterruptedException {
                boolean closedCleanly = false;
                try {
                    for (FileStatus file : files) {
                        if (file.isDirectory()) {
                            // Only regular files have content to pack.
                            continue;
                        }
                        appendFile(file);
                    }
                    // Close explicitly so that flush errors are reported.
                    writer.close();
                    closedCleanly = true;
                } finally {
                    if (!closedCleanly) {
                        // Best-effort release; the original failure propagates.
                        IOUtils.closeStream(writer);
                    }
                }
            }

            /**
             * Lists the files to pack and opens the SequenceFile writer, both
             * using the task's own job configuration.
             *
             * @throws IOException if the directory cannot be listed or the
             *                     writer cannot be created
             */
            @Override
            protected void setup(
                    Mapper<LongWritable, Text, Text, BytesWritable>.Context context)
                    throws IOException, InterruptedException {
                Configuration conf = context.getConfiguration();
                taskFileSystem = FileSystem.get(conf);
                // List first so that a listing failure cannot leak an open writer.
                files = taskFileSystem.listStatus(new Path(LOGS_PATH));

                // Output file
                Option fileOption = SequenceFile.Writer.file(new Path(OUT_PATH
                        + "/mapper.seq"));
                // Compression
                Option compressionOption = SequenceFile.Writer
                        .compression(CompressionType.BLOCK);
                // SequenceFile key type
                Option keyClassOption = SequenceFile.Writer.keyClass(Text.class);
                // SequenceFile value type
                Option valueClassOption = SequenceFile.Writer
                        .valueClass(BytesWritable.class);
                // Open the output writer
                writer = SequenceFile.createWriter(conf, fileOption,
                        compressionOption, keyClassOption, valueClassOption);
            }

            /**
             * Reads one file fully into memory and appends it as a single record.
             *
             * @param file status of the file to append
             * @throws IOException if the file is too large for one byte array or
             *                     cannot be read or appended
             */
            private void appendFile(FileStatus file) throws IOException {
                long length = file.getLen();
                if (length > Integer.MAX_VALUE) {
                    // A narrowing cast would silently truncate the length.
                    throw new IOException("File too large to pack: "
                            + file.getPath() + " (" + length + " bytes)");
                }
                byte[] buffer = new byte[(int) length];
                FSDataInputStream in = taskFileSystem.open(file.getPath());
                try {
                    IOUtils.readFully(in, buffer, 0, buffer.length);
                } finally {
                    IOUtils.closeStream(in);
                }
                outKey.set(file.getPath().toString());
                outValue.set(buffer, 0, buffer.length);
                writer.append(outKey, outValue);
            }
        }
    }
    View Code

    注意事项:

      我原本的逻辑是放在map函数中,通过Writer将所有文件写到HDFS中;但是map函数在整个MR执行过程中被调用的次数取决于输入文件的情况(每条输入记录调用一次),所以这里改为在cleanup中一次性写出。如果能够控制输入文件,使map函数只被调用一次,也可以直接在map函数中实现。

    发现问题:

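    原本在实现之前,定义了一个FileSystem类型的静态字段,并且在提交job前已经赋值了;但是,在mapper类中访问到的fileSystem字段却是空值,有知道的大虾,多多指导小弟。可能的原因:在集群上运行时,Mapper任务运行在独立的JVM中,客户端JVM里对静态字段的赋值不会随job传递过去,所以需要在setup中通过Configuration重新获取FileSystem。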

    SequenceFile介绍:

    http://wiki.apache.org/hadoop/SequenceFile

    http://www.cnblogs.com/zhenjing/archive/2012/11/02/File-Format.html

     
  • 相关阅读:
    git常用命令
    IDEA设置
    redis基础
    SQL 基础
    springboot 配置日志 打印不出来sql
    阿里巴巴开发规范最新版
    rabbitmq用户权限
    rabbitMQ配置文件
    RabbitMQ配置文件(rabbitmq.conf)
    C++模板编程:如何使非通用的模板函数实现声明和定义分离
  • 原文地址:https://www.cnblogs.com/hpuCode/p/5300667.html
Copyright © 2020-2023  润新知