• mapreduce (四) MapReduce实现Grep+sort


    1.txt
    dong xi cheng
    xi dong cheng
    wo ai beijing
    tian an men
    qiche
    dong
    dong
    dong
    2.txt
    dong xi cheng
    xi dong cheng
    wo ai beijing
    tian an men
    qiche
    dong
    dong
    dong
    
    import java.io.IOException;
    import java.util.Random;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.map.InverseMapper;
    import org.apache.hadoop.mapreduce.lib.map.RegexMapper;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
    import org.apache.hadoop.mapreduce.lib.reduce.LongSumReducer;
    
    public class IGrep {
    
        public static void main(String[] args) throws IOException,
                ClassNotFoundException, InterruptedException {
            Configuration conf = new Configuration();
    
            String dir_in = "hdfs://localhost:9000/input_grep";
            String dir_out = "hdfs://localhost:9000/output_grep";
            String reg = ".ng";//匹配三个字符的字符串,且以ng结尾。
    
            conf.set(RegexMapper.PATTERN, reg);
            conf.setInt(RegexMapper.GROUP, 0);
    
            Path in = new Path(dir_in);
            Path tmp = new Path("grep-temp-"
                    + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
            Path out = new Path(dir_out);
    
            try {
                Job grepJob = new Job(conf, "grep-search");
    
                grepJob.setJarByClass(IGrep.class);
    
                grepJob.setInputFormatClass(TextInputFormat.class);
                grepJob.setMapperClass(RegexMapper.class);
                grepJob.setCombinerClass(LongSumReducer.class);
                grepJob.setPartitionerClass(HashPartitioner.class);
    
                grepJob.setMapOutputKeyClass(Text.class);
                grepJob.setMapOutputValueClass(LongWritable.class);
                FileInputFormat.addInputPath(grepJob, in);
    
                grepJob.setReducerClass(LongSumReducer.class);
                // job.setNumReduceTasks(1);
                grepJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    
                grepJob.setOutputKeyClass(Text.class);
                grepJob.setOutputValueClass(LongWritable.class);
                FileOutputFormat.setOutputPath(grepJob, tmp);
    
                grepJob.waitForCompletion(true);
    
                Job sortJob = new Job(conf, "grep-sort");
    
                sortJob.setJarByClass(IGrep.class);
    
                sortJob.setInputFormatClass(SequenceFileInputFormat.class);
                sortJob.setMapperClass(InverseMapper.class);
                FileInputFormat.addInputPath(sortJob, tmp);
    
                sortJob.setNumReduceTasks(1);【全局排序】
                sortJob.setSortComparatorClass(LongWritable.DecreasingComparator.class);//逆序
    
                FileOutputFormat.setOutputPath(sortJob, out);
    
                sortJob.waitForCompletion(true);
                
            } finally {
                FileSystem.get(conf).delete(tmp, true);
            }
        }
    }


    输出结果:
    10    ong
    4    eng
    2    ing


  • 相关阅读:
    【转载】使用IntelliJ IDEA创建Maven聚合工程、创建resources文件夹、ssm框架整合、项目运行一体化
    【转载】使用IntelliJ IDEA 配置Maven(入门)
    谈谈JS中的高级函数
    js中typeof和instanceof用法区别
    javascript “||”、“&&”的灵活运用
    前端资源教程合集
    使用Flexible实现手淘H5页面的终端适配
    H5实现的手机摇一摇
    html5移动端页面分辨率设置及相应字体大小设置的靠谱使用方式
    优化RequireJS项目(合并与压缩)
  • 原文地址:https://www.cnblogs.com/i80386/p/3598864.html
Copyright © 2020-2023  润新知