• MapReduce: writing output files to multiple folders (Part 2)


    Driver class

    package mrtest.multipleout;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.counters.Limits;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
    import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
    import org.apache.orc.mapred.OrcStruct;
    import util.MOrcOutputFormat;
    
    import java.io.IOException;
    
    
    /**
     * Writes word counts to multiple named ORC outputs, one subdirectory per word.
     * @author Administrator
     * Mind the ORC version when writing ORC files during development: 1.6.2
     *
     */
    public class MultipleOrcOutDriver {
        public static String a = "a";
        public static String b = "b";
        public static String counterInPath = "/txj/a.txt";
        public static String counterOutPath = "file:///F:/txj/test";
        /**
         * Program entry point
         * @param args
         * @throws IOException
         * @throws ClassNotFoundException
         * @throws InterruptedException
         */
        public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
            //1 Get the job instance
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf);
    
            //2 Set the jar location by class
            job.setJarByClass(MultipleOrcOutDriver.class);
    
            //3 Set the mapper and reducer classes
            job.setMapperClass(MultipleOrcOutMapper.class);
            job.setReducerClass(MultipleOrcOutReduce.class);
    
            //4 Set the map output key/value types
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);
    
            //5 Set the final output key/value types
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(OrcStruct.class);
    
            //6 Set the input and output paths
            FileInputFormat.setInputPaths(job, new Path(counterInPath));
    //        FileOutputFormat.setOutputPath(job, new Path(counterOutPath));
            MOrcOutputFormat.setOutputPath(job, new Path(counterOutPath));
    
            job.setInputFormatClass(TextInputFormat.class);
    //        job.setOutputFormatClass(TextOutputFormat.class);
            job.setOutputFormatClass(MOrcOutputFormat.class);
    
            // Store each named output's base path in the configuration so the reducer can read it in setup()
            job.getConfiguration().set(a,counterOutPath +"/a/a");
            job.getConfiguration().set(b,counterOutPath +"/b/b");
            MultipleOutputs.addNamedOutput(job, MultipleOrcOutDriver.a, MOrcOutputFormat.class, NullWritable.class, OrcStruct.class);
            MultipleOutputs.addNamedOutput(job, MultipleOrcOutDriver.b, MOrcOutputFormat.class, NullWritable.class, OrcStruct.class);
            // Enable counters for the named outputs
            MultipleOutputs.setCountersEnabled(job, true);
            // Lazy output: with multiple named outputs, avoids creating directories and empty files for outputs that receive no records
            LazyOutputFormat.setOutputFormatClass(job, MOrcOutputFormat.class);
            // Raise the job counter limits (each named output adds counters); the default limit is 120
            job.getConfiguration().setInt("mapreduce.job.counters.max", 1000000000);
            job.getConfiguration().setInt("mapreduce.job.counters.limit", 1000000000);
            //  Limits.init(job.getConfiguration());

            //7 Submit the job
            boolean result = job.waitForCompletion(true);
            System.exit(result ? 0 : 1);
        }
    }
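
    The driver imports util.MOrcOutputFormat, a custom class this post never shows. Below is a minimal sketch of what it plausibly looks like, assuming it is a thin subclass of the stock ORC mapreduce output format whose only job is to inject the output schema (the hardcoded schema string and the override itself are assumptions, not the author's confirmed code):

    package util;

    import java.io.IOException;

    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.mapreduce.RecordWriter;
    import org.apache.hadoop.mapreduce.TaskAttemptContext;
    import org.apache.orc.OrcConf;
    import org.apache.orc.mapred.OrcStruct;
    import org.apache.orc.mapreduce.OrcOutputFormat;

    /**
     * Minimal sketch of the custom output format referenced by the driver.
     * Assumption: it extends the stock ORC output format and supplies the
     * schema the reducer writes, so callers need not set
     * orc.mapred.output.schema themselves.
     */
    public class MOrcOutputFormat extends OrcOutputFormat<OrcStruct> {
        @Override
        public RecordWriter<NullWritable, OrcStruct> getRecordWriter(TaskAttemptContext context)
                throws IOException {
            // Hardcoded to match the reducer's TypeDescription (an assumption;
            // the original class may obtain the schema differently)
            OrcConf.MAPRED_OUTPUT_SCHEMA.setString(context.getConfiguration(),
                    "struct<count:bigint,val:string>");
            return super.getRecordWriter(context);
        }
    }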

    Mapper class

    package mrtest.multipleout;
    
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    
    /**
     * Mapper: parses "count word" lines and emits (word, count) pairs
     * @author Administrator
     */
    public class MultipleOrcOutMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private Text outKey = new Text();
        private IntWritable outVal = new IntWritable();
        /**
         * Split each line into a count and a word
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws java.io.IOException, InterruptedException {
            // Get the content of this line
            String lineStr = value.toString();
            // Split into fields; the expected line format is "count word"
            String[] words = lineStr.split(" ");
            if (words.length==2){
                outKey.set(words[1]);
                outVal.set(Integer.parseInt(words[0]));
                context.write(outKey, outVal);
            }
        }
    }
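
    For reference, the mapper expects the count first and the word second on each line. Given a /txj/a.txt with contents such as the following (the data is illustrative, not from the original post):

    3 a
    5 b
    2 a

    the map phase emits (a,3), (b,5), (a,2), and the reducer below sums them to a=5 and b=5.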

    Reducer class

    package mrtest.multipleout;
    
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
    import org.apache.orc.TypeDescription;
    import org.apache.orc.mapred.OrcStruct;
    
    import java.io.IOException;
    import java.util.Iterator;
    
    /**
     * Reducer: sums the counts for each word and writes words "a" and "b"
     * to their own named outputs (and thus their own subdirectories);
     * all other words are dropped
     * @author Administrator
     */
    public class MultipleOrcOutReduce extends Reducer<Text, IntWritable,  NullWritable, OrcStruct>{
        // Field types of the ORC records to be written
        private TypeDescription schema = TypeDescription.fromString("struct<count:bigint,val:string>");
        private OrcStruct pair = (OrcStruct) OrcStruct.createValue(schema);
        private MultipleOutputs<NullWritable, OrcStruct> mo;
        // Base output paths for the two named outputs, read from the configuration in setup()
        private String a = "";
        private String b = "";
    
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            Configuration conf = context.getConfiguration();
            mo = new MultipleOutputs<>(context);
            a = conf.get(MultipleOrcOutDriver.a);
            b = conf.get(MultipleOrcOutDriver.b);
        }
    
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            Iterator<IntWritable> it = values.iterator();
            int count = 0;
            while (it.hasNext()){
                IntWritable val = it.next();
                count += val.get();
            }
            switch (key.toString()){
                case "a":
                    pair.setFieldValue("val",key);
                    pair.setFieldValue("count",new LongWritable(count));
                    mo.write(MultipleOrcOutDriver.a,NullWritable.get(),pair,a);
                    break;
                case "b":
                    pair.setFieldValue("val",key);
                    pair.setFieldValue("count",new LongWritable(count));
                    mo.write(MultipleOrcOutDriver.b,NullWritable.get(),pair,b);
                    break;
            }
        }
    
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            // MultipleOutputs must be closed, or the ORC writers will not be flushed
            mo.close();
        }
    }
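
    To sanity-check the result, one option is to read a generated ORC file back with the ORC core reader. A minimal sketch follows; the class name OrcOutputCheck is made up for illustration, and the part-file path is a placeholder (actual names depend on the named output and the task attempt, so list the output directory first):

    package mrtest.multipleout;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
    import org.apache.orc.OrcFile;
    import org.apache.orc.Reader;
    import org.apache.orc.RecordReader;

    public class OrcOutputCheck {
        public static void main(String[] args) throws Exception {
            // Placeholder path; adjust to the actual part file under the output directory
            Path path = new Path("file:///F:/txj/test/a/a-r-00000.orc");
            Reader reader = OrcFile.createReader(path,
                    OrcFile.readerOptions(new Configuration()));
            RecordReader rows = reader.rows();
            VectorizedRowBatch batch = reader.getSchema().createRowBatch();
            while (rows.nextBatch(batch)) {
                // Schema is struct<count:bigint,val:string>: col 0 = count, col 1 = val
                LongColumnVector count = (LongColumnVector) batch.cols[0];
                BytesColumnVector val = (BytesColumnVector) batch.cols[1];
                for (int r = 0; r < batch.size; r++) {
                    System.out.println(val.toString(r) + " -> " + count.vector[r]);
                }
            }
            rows.close();
        }
    }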
    
    
  • Original post: https://www.cnblogs.com/zyanrong/p/16363842.html