MapReduce-自定义 InputFormat 生成 SequenceFile

Hadoop 框架自带的 InputFormat 类型不能满足所有应用场景，需要自定义 InputFormat 来解决实际问题。

无论 HDFS 还是 MapReduce，在处理小文件时效率都非常低，但又难免面临处理大量小文件的场景，此时，就需要有相应解决方案。可以自定义 InputFormat 实现小文件的合并。

将多个小文件合并成一个 SequenceFile 文件（SequenceFile 文件是 Hadoop 用来存储二进制形式的 key-value 对的文件格式），SequenceFile 里面存储着多个文件，存储的形式为文件路径+名称为key，文件内容为 value。

自定义 InputFormat 步骤：

（1）自定义一个类继承 FileInputFormat。
（1.1）重写 isSplitable() 方法，返回 false 不可切割
（1.2）重写createRecordReader()，创建自定义的 RecordReader 对象，并初始化

（2）改写 RecordReader，实现一次读取一个完整文件封装为K-V。
（2.1）采用IO流一次读取一个文件输出到 value 中，因为设置了不可切片，最终把所有文件都封装到了 value 中
（2.2）获取文件路径信息 + 名称，并设置 key

（3）输入时使用自定义的 InputFormat，在输出时使用 SequenceFileOutputFormat 输出合并文件。

自定义一个 InputFormat，将小文件合并为一个文件（SequenceFile）

1.测试数据

2.切片数，与 TextInputFormat 一样，按照文件大小进行切片

3.读取数据方式，查看 k-v 值，按照自定义的方式读取

4.结果，大概可以看出是由文件路径加上文件内容组成

5.测试代码

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.log4j.BasicConfigurator;

import java.io.IOException;

public class SequenceFileDriver {

    static {
        try {
            // 设置 HADOOP_HOME 环境变量
            System.setProperty("hadoop.home.dir", "D://DevelopTools/hadoop-2.9.2/");
            // 日志初始化
            BasicConfigurator.configure();
            // 加载库文件
            System.load("D://DevelopTools/hadoop-2.9.2/bin/hadoop.dll");
        } catch (UnsatisfiedLinkError e) {
            System.err.println("Native code library failed to load.
" + e);
            System.exit(1);
        }
    }

    public static void main(String[] args) throws Exception, IOException {
        args = new String[]{"D:\tmp\input2", "D:\tmp\456"};

        // 1 获取job对象
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // 2 设置jar包存储位置、关联自定义的mapper和reducer
        job.setJarByClass(SequenceFileDriver.class);
        job.setMapperClass(SequenceFileMapper.class);
        job.setReducerClass(SequenceFileReducer.class);

        // 3 设置map输出端的kv类型
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BytesWritable.class);

        // 4 设置最终输出端的kv类型
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);

        // 5 设置输入输出路径
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 6 设置输入的inputFormat
        job.setInputFormatClass(WholeFileInputFormat.class);
        // 7 设置输出的outputFormat
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        // 8 提交job
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

/**
 * Pass-through mapper: prints each (path, content) record for inspection and forwards it unchanged.
 */
class SequenceFileMapper extends Mapper<Text, BytesWritable, Text, BytesWritable> {

    /**
     * Prints the key and the value's content to standard output, then emits the pair unchanged.
     *
     * @param key     record key supplied by the input format
     * @param value   record value supplied by the input format
     * @param context task context used to emit the output pair
     * @throws IOException          if writing the output pair fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(Text key, BytesWritable value, Context context) throws IOException, InterruptedException {
        // getBytes() exposes the backing array, which may be longer than the valid data;
        // only the first getLength() bytes belong to the record.
        System.out.println(key + "\t" + new String(value.getBytes(), 0, value.getLength()));
        context.write(key, value);
    }
}

/**
 * Identity reducer: every value received for a key is written out again under that same key.
 */
class SequenceFileReducer extends Reducer<Text, BytesWritable, Text, BytesWritable> {

    /**
     * Emits each incoming value unchanged, paired with its key.
     *
     * @param key     the grouped key
     * @param values  all values grouped under {@code key}
     * @param context task context used to emit the output pairs
     * @throws IOException          if writing an output pair fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<BytesWritable> values, Context context)
            throws IOException, InterruptedException {
        for (final BytesWritable content : values) {
            context.write(key, content);
        }
    }
}

自定义的 InputFormat

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

/**
 * Input format that presents each input file as a single record:
 * key = file path, value = the file's bytes (see {@code WholeRecordReader}).
 */
public class WholeFileInputFormat extends FileInputFormat<Text, BytesWritable> {

    /**
     * Disables splitting so that every file is handed to exactly one record reader.
     * The reader always reads from the beginning of the file, so a split file would
     * otherwise yield wrong or duplicated content.
     *
     * @param context  the job context (unused)
     * @param filename the file being considered (unused)
     * @return always {@code false}
     */
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false;
    }

    /**
     * Creates and initializes a {@code WholeRecordReader} for the given split.
     *
     * @param split   the split to read
     * @param context the task attempt context providing the configuration
     * @return an initialized record reader
     * @throws IOException          if initialization fails
     * @throws InterruptedException if initialization is interrupted
     */
    @Override
    public RecordReader<Text, BytesWritable> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        WholeRecordReader recordReader = new WholeRecordReader();
        recordReader.initialize(split, context);
        return recordReader;
    }
}

/**
 * Record reader that produces exactly one record per split:
 * the key is the file path and the value is the bytes read from that file.
 */
class WholeRecordReader extends RecordReader<Text, BytesWritable> {

    FileSplit split;
    Configuration configuration;
    Text k = new Text();
    BytesWritable v = new BytesWritable();
    // true until the single record has been produced
    boolean isProgress = true;

    /**
     * Stores the split and the job configuration for later use by {@link #nextKeyValue()}.
     *
     * @param split   the split to read; cast to {@link FileSplit}
     * @param context the task attempt context providing the configuration
     */
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        this.split = (FileSplit) split;
        configuration = context.getConfiguration();
    }

    /**
     * On the first call, reads {@code split.getLength()} bytes from the start of the file
     * into the value and sets the key to the file path; later calls return {@code false}.
     *
     * @return {@code true} if a record was produced, {@code false} once it has been consumed
     * @throws IOException if the file is too large for a single byte array or cannot be read
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (!isProgress) {
            return false;
        }

        Path path = split.getPath();
        long length = split.getLength();
        // A Java array cannot hold more than Integer.MAX_VALUE bytes; fail loudly instead of
        // letting the narrowing cast silently truncate or go negative.
        if (length > Integer.MAX_VALUE) {
            throw new IOException("File too large to load as a single record (" + length + " bytes): " + path);
        }
        byte[] buf = new byte[(int) length];

        // 1. Obtain the file system for the path.
        FileSystem fs = path.getFileSystem(configuration);
        // 2. Open the input stream; 3. copy the content; always release the stream, even on failure.
        FSDataInputStream fis = null;
        try {
            fis = fs.open(path);
            IOUtils.readFully(fis, buf, 0, buf.length);
        } finally {
            IOUtils.closeStream(fis);
        }

        // 4. Fill the value.
        v.set(buf, 0, buf.length);
        // 5. Fill the key.
        k.set(path.toString());

        isProgress = false;
        return true;
    }

    /** @return the current key (the file path once a record has been read) */
    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return k;
    }

    /** @return the current value (the file bytes once a record has been read) */
    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return v;
    }

    /** @return 0 before the single record has been produced, 1 afterwards */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        return isProgress ? 0f : 1f;
    }

    /** Nothing to release here: the input stream is closed inside {@link #nextKeyValue()}. */
    @Override
    public void close() throws IOException {
    }
}

生成的 part-r-00000 文件就是合并后的 SequenceFile 文件

https://hadoop.apache.org/docs/current/api/org/apache/hadoop/io/SequenceFile.html

https://wiki.apache.org/hadoop/SequenceFile

相关阅读:
Oracle 11g 新特性安全性增强说明
 Oracle 11g 新特性安全性增强说明
 Oracle 11g 新特性 Invisible Indexes(不可见的索引) 说明
 Oracle 只读表空间说明
 Openfiler 配置 NFS 示例
 Oracle v$session 中sql_id 为 null 说明
 Oracle v$session 中sql_id 为 null 说明
 Oracle 11g 新特性管理 SPFILE 说明
 Oracle 11.2.0.1 RAC GRID 无法启动： Oracle High Availability Services startup failed
Oracle 11g 新特性管理 SPFILE 说明
原文地址：https://www.cnblogs.com/jhxxb/p/10792827.html