• MapReduce 读写 Parquet 格式数据 Demo


    import java.io.IOException;
    import java.util.regex.Pattern;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.GenericOptionsParser;
    import org.apache.parquet.example.data.Group;
    import org.apache.parquet.example.data.simple.SimpleGroupFactory;
    import org.apache.parquet.hadoop.ParquetInputFormat;
    import org.apache.parquet.hadoop.ParquetOutputFormat;
    import org.apache.parquet.hadoop.example.GroupReadSupport;
    import org.apache.parquet.hadoop.example.GroupWriteSupport;
    import org.apache.parquet.schema.MessageType;
    import org.apache.parquet.schema.OriginalType;
    import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
    import org.apache.parquet.schema.Types;
    
    /**
     * MapReduce demo that reads and writes Parquet data.
     *
     * <p>Records have two string columns, {@code city} and {@code ip}. The job takes
     * three arguments, {@code <input> <output> <mode>}, where the mode selects the
     * pipeline:
     * <ul>
     *   <li>{@code 1} - plain input, plain output (mapper plus pass-through reducer)</li>
     *   <li>{@code 2} - Parquet input, plain output (map-only)</li>
     *   <li>{@code 3} - plain input, Parquet output (map-only)</li>
     *   <li>{@code 4} - Parquet input, Parquet output (not implemented)</li>
     * </ul>
     *
     * <p>The process exits with status 0 when the job succeeds, 1 when the job
     * fails, and 2 when the arguments are invalid or the mode is unsupported.
     */
    public class ParquetReaderAndWriteMRDemo {

        /** Delimiter for plain-text input lines: one or more whitespace characters. */
        private static final Pattern WHITESPACE = Pattern.compile("\\s+");

        /** Name of the first column in the Parquet schema. */
        private static final String CITY_FIELD = "city";

        /** Name of the second column in the Parquet schema. */
        private static final String IP_FIELD = "ip";

        /** Counter group used to report input lines that could not be parsed. */
        private static final String COUNTER_GROUP = "ParquetReaderAndWriteMRDemo";

        /** Counter name for input lines with fewer than two columns. */
        private static final String MALFORMED_LINES = "MALFORMED_LINES";

        /**
         * Configures and runs the job selected by the third argument.
         *
         * @param args generic Hadoop options followed by {@code <input> <output> <mode>}
         * @throws Exception if job setup or execution fails
         */
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            String[] otherargs = new GenericOptionsParser(conf, args).getRemainingArgs();
            if (otherargs.length != 3) {
                printUsage();
                System.exit(2);
            }

            // The demo data has two columns: city and ip.
            MessageType schema = Types.buildMessage()
                    .required(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(CITY_FIELD)
                    .required(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(IP_FIELD)
                    .named("pair");
            System.out.println("[schema]==" + schema.toString());
            // Stored in the configuration so ParquetWriteMapper can rebuild it in setup().
            GroupWriteSupport.setSchema(schema, conf);

            Job job = Job.getInstance(conf, "ParquetReadMR");
            job.setJarByClass(ParquetReaderAndWriteMRDemo.class);

            String mode = otherargs[2];
            if ("1".equals(mode)) {
                job.setMapperClass(NormalMapper.class);
                job.setReducerClass(NormalReducer.class);
                job.setOutputKeyClass(Text.class);
                job.setOutputValueClass(Text.class);
            } else if ("2".equals(mode)) {
                // Parquet input
                job.setMapperClass(ParquetReadMapper.class);
                job.setNumReduceTasks(0);
                job.setInputFormatClass(ParquetInputFormat.class);
                ParquetInputFormat.setReadSupportClass(job, GroupReadSupport.class);
                job.setOutputKeyClass(Text.class);
                job.setOutputValueClass(Text.class);
            } else if ("3".equals(mode)) {
                // Parquet output
                job.setMapperClass(ParquetWriteMapper.class);
                job.setNumReduceTasks(0);
                job.setOutputFormatClass(ParquetOutputFormat.class);
                ParquetOutputFormat.setWriteSupportClass(job, GroupWriteSupport.class);
            } else if ("4".equals(mode)) {
                // Fail loudly instead of exiting 0 without having run anything.
                System.err.println("Mode 4 (<parquet-in> <parquet-out>) is not implemented");
                System.exit(2);
            } else {
                System.err.println("Unknown mode: " + mode);
                printUsage();
                System.exit(2);
            }

            FileInputFormat.setInputPaths(job, otherargs[0]);
            FileOutputFormat.setOutputPath(job, new Path(otherargs[1]));

            // Propagate job failure to the caller through the process exit status.
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }

        /** Prints the supported argument combinations to standard output. */
        private static void printUsage() {
            System.out.println("<in> <out> 1");
            System.out.println("<parquet-in> <out> 2");
            System.out.println("<in> <parquet-out> 3");
            System.out.println("<parquet-in> <parquet-out> 4");
        }

        /**
         * Splits a text line into whitespace-separated columns.
         *
         * @param line raw input line
         * @return the columns of the trimmed line; a blank line yields a single empty string
         */
        private static String[] splitColumns(Text line) {
            return WHITESPACE.split(line.toString().trim());
        }

        /**
         * Converts whitespace-separated {@code city ip} lines into Parquet groups.
         * Lines with fewer than two columns are skipped and counted.
         */
        public static class ParquetWriteMapper extends Mapper<LongWritable, Text, Void, Group> {
            SimpleGroupFactory factory = null;

            /** Builds the group factory from the schema stored in the job configuration. */
            @Override
            protected void setup(Context context) throws IOException, InterruptedException {
                factory = new SimpleGroupFactory(GroupWriteSupport.getSchema(context.getConfiguration()));
            }

            /**
             * Emits one group per well-formed input line.
             *
             * @param _key input key supplied by the framework (unused)
             * @param ivalue one line of text holding at least two columns
             * @param context task context used to emit the group
             */
            @Override
            public void map(LongWritable _key, Text ivalue, Context context) throws IOException, InterruptedException {
                String[] strs = splitColumns(ivalue);
                if (strs.length < 2) {
                    // Skip blank or truncated lines rather than failing the whole task.
                    context.getCounter(COUNTER_GROUP, MALFORMED_LINES).increment(1);
                    return;
                }
                Group pair = factory.newGroup();
                pair.append(CITY_FIELD, strs[0]);
                pair.append(IP_FIELD, strs[1]);
                // The key type is Void, so the key is always null.
                context.write(null, pair);
            }
        }

        /**
         * Reads Parquet groups and emits their first two columns as text.
         * Columns are read by position: field 0 becomes the key, field 1 the value.
         */
        public static class ParquetReadMapper extends Mapper<Void, Group, Text, Text> {
            private final Text outKey = new Text();
            private final Text outValue = new Text();

            /**
             * Emits the first value of field 0 and of field 1 as a key/value pair.
             *
             * @param _key always unused; the key type is Void
             * @param group record read from the Parquet input
             * @param context task context used to emit the pair
             */
            @Override
            public void map(Void _key, Group group, Context context) throws IOException, InterruptedException {
                outKey.set(group.getString(0, 0));
                outValue.set(group.getString(1, 0));
                context.write(outKey, outValue);
            }
        }

        /**
         * Splits whitespace-separated lines and emits the first column as key and
         * the second as value. Lines with fewer than two columns are skipped and counted.
         */
        public static class NormalMapper extends Mapper<LongWritable, Text, Text, Text> {
            private final Text outKey = new Text();
            private final Text outValue = new Text();

            /**
             * Emits one key/value pair per well-formed input line.
             *
             * @param ikey input key supplied by the framework (unused)
             * @param ivalue one line of text holding at least two columns
             * @param context task context used to emit the pair
             */
            @Override
            public void map(LongWritable ikey, Text ivalue, Context context) throws IOException, InterruptedException {
                String[] strs = splitColumns(ivalue);
                if (strs.length < 2) {
                    // Skip blank or truncated lines rather than failing the whole task.
                    context.getCounter(COUNTER_GROUP, MALFORMED_LINES).increment(1);
                    return;
                }
                outKey.set(strs[0]);
                outValue.set(strs[1]);
                context.write(outKey, outValue);
            }
        }

        /** Pass-through reducer: writes every value it receives under its original key. */
        public static class NormalReducer extends Reducer<Text, Text, Text, Text> {

            /**
             * Writes each value unchanged.
             *
             * @param _key key shared by all values in this call
             * @param values values grouped under the key
             * @param context task context used to emit the pairs
             */
            @Override
            public void reduce(Text _key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
                for (Text text : values) {
                    context.write(_key, text);
                }
            }
        }

    }
  • 相关阅读:
    .NET 6.0 —— 网络监视器 (TODO)
    Google adwords api —— report & AWQL
    Linux 镜像更新 为国内镜像源 for debian
    优化代码 —— 二八法则 & 编完代码,再优化
    鳥哥的 Linux 私房菜 ——— 第十八章、 服务的防火墙管理 xinetd, TCP Wrappers(3)
    端口号port 是什么
    aptget的install、update、upgrade的区别(转发)
    Google ads api —— github
    .net 6.00 —— record 类型 (TODO)
    Compiled models —— .NET Core 6.0
  • 原文地址:https://www.cnblogs.com/yanghaolie/p/7389543.html
Copyright © 2020-2023  润新知