• ORC format files


    1. Hive support
    Simply specify the ORC format when creating the table:

    CREATE TABLE tmp.orc_test(id BIGINT, name STRING, age INT) STORED AS ORC TBLPROPERTIES('orc.compress'='SNAPPY');

    Two compression codecs are available, "SNAPPY" and "ZLIB"; specify whichever one you need.
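
    As a minimal sketch, the same DDL can also be issued programmatically over HiveServer2 JDBC. The connection URL, database, and credentials below are placeholder assumptions, and this variant uses ZLIB instead of SNAPPY:

    import java.sql.Connection;
    import java.sql.DriverManager;
    import java.sql.Statement;

    public class CreateOrcTable {
        public static void main(String[] args) throws Exception {
            // HiveServer2 JDBC driver (hive-jdbc must be on the classpath)
            Class.forName("org.apache.hive.jdbc.HiveDriver");
            // Placeholder host/port/credentials -- adjust to your cluster
            try (Connection conn = DriverManager.getConnection(
                    "jdbc:hive2://localhost:10000/tmp", "hive", "");
                 Statement stmt = conn.createStatement()) {
                // Same DDL as above, with ZLIB compression instead of SNAPPY
                stmt.execute("CREATE TABLE tmp.orc_test_zlib(id BIGINT, name STRING, age INT) "
                        + "STORED AS ORC TBLPROPERTIES('orc.compress'='ZLIB')");
            }
        }
    }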

    2. Spark support

    Reading with Spark:

    df = spark.read.orc("/tmp/test/orc_data")  # the result is a DataFrame

    Writing with Spark:

    df.write.format("orc").save("/tmp/test/orc_data2")
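
    For completeness, here is a minimal sketch of the same round trip using Spark's Java API, assuming a Spark 2.x SparkSession and the same placeholder paths. The "compression" write option plays the role of Hive's 'orc.compress' table property:

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;

    public class OrcSparkRoundTrip {
        public static void main(String[] args) {
            SparkSession spark = SparkSession.builder()
                    .appName("orc-round-trip")
                    .getOrCreate();

            // Read an ORC directory into a DataFrame (Dataset<Row> in Java)
            Dataset<Row> df = spark.read().orc("/tmp/test/orc_data");
            df.printSchema();

            // Write it back out as ORC with Snappy compression
            df.write()
              .format("orc")
              .option("compression", "snappy")
              .save("/tmp/test/orc_data2");

            spark.stop();
        }
    }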

    3. Hadoop Streaming support

    3.1 Reading an ORC file and emitting text (commonly used to inspect ORC files)

    hadoop jar /usr/local/hadoop-2.7.0/share/hadoop/tools/lib/hadoop-streaming-2.7.0.jar \
        -libjars /usr/local/hive-1.2.0/lib/hive-exec-1.2.0-SNAPSHOT.jar \
        -mapper /bin/cat -reducer /bin/cat \
        -input /tmp/test/orc_test1 \
        -output /tmp/test/orc_streaming_test3 \
        -inputformat org.apache.hadoop.hive.ql.io.orc.OrcInputFormat

    3.2 Reading an ORC file and writing an ORC file

    hadoop jar /usr/local/hadoop-2.7.0/share/hadoop/tools/lib/hadoop-streaming-2.7.0.jar \
        -libjars orc_maprd_test.jar \
        -D orc.mapred.output.schema="struct<id:string,name:string,sex:string,age:string>" \
        -input /tmp/test/orc_streaming_test \
        -output /tmp/test/orc_streaming_test2 \
        -inputformat org.apache.orc.mapred.OrcInputFormat \
        -outputformat org.apache.orc.mapred.OrcOutputFormat \
        -mapper is.orc.MyMapper -reducer is.orc.MyReducer
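
    The is.orc.MyMapper and is.orc.MyReducer classes live in orc_maprd_test.jar and are not shown in the original post, so the following is only a hedged sketch of what an identity-style pair might look like (each public class would go in its own source file). Streaming runs Java mapper/reducer classes through the old mapred API, so both implement org.apache.hadoop.mapred interfaces:

    package is.orc; // hypothetical package, matching the class names in the command above

    import java.io.IOException;
    import java.util.Iterator;

    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.MapReduceBase;
    import org.apache.hadoop.mapred.Mapper;
    import org.apache.hadoop.mapred.OutputCollector;
    import org.apache.hadoop.mapred.Reducer;
    import org.apache.hadoop.mapred.Reporter;
    import org.apache.orc.mapred.OrcStruct;

    // Keys each ORC row by its first column so the shuffle can group rows
    public class MyMapper extends MapReduceBase
            implements Mapper<NullWritable, OrcStruct, Text, OrcStruct> {
        @Override
        public void map(NullWritable key, OrcStruct value,
                        OutputCollector<Text, OrcStruct> output, Reporter reporter)
                throws IOException {
            output.collect(new Text(value.getFieldValue(0).toString()), value);
        }
    }

    // Passes every grouped row through unchanged; OrcOutputFormat writes the
    // values using the schema given by orc.mapred.output.schema
    public class MyReducer extends MapReduceBase
            implements Reducer<Text, OrcStruct, NullWritable, OrcStruct> {
        @Override
        public void reduce(Text key, Iterator<OrcStruct> values,
                           OutputCollector<NullWritable, OrcStruct> output, Reporter reporter)
                throws IOException {
            while (values.hasNext()) {
                output.collect(NullWritable.get(), values.next());
            }
        }
    }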

    Examples:

    Maven dependencies:

            <!-- ORC files -->
            <dependency>
                <groupId>org.apache.orc</groupId>
                <artifactId>orc-core</artifactId>
                <version>1.2.3</version>
            </dependency>
            <dependency>
                <groupId>org.apache.orc</groupId>
                <artifactId>orc-mapreduce</artifactId>
                <version>1.1.0</version>
            </dependency>
            <dependency>
                <groupId>com.yammer.metrics</groupId>
                <artifactId>metrics-core</artifactId>
                <version>2.2.0</version>
            </dependency>

     Writing an ORC file

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.orc.TypeDescription;
    import org.apache.orc.mapred.OrcStruct;
    import org.apache.orc.mapreduce.OrcOutputFormat;

    /**
     * Writes an ORC file from comma-separated text input.
     * https://orc.apache.org/docs/mapreduce.html
     */
    public class OrcWriterMR {
        public static class OrcWriterMapper extends Mapper<LongWritable,Text,NullWritable,OrcStruct> {
            // Field types of the ORC file to be created
            private TypeDescription schema = TypeDescription.fromString(
                    //"struct<str:string>"
                    "struct<datano:bigint,datatime:bigint,type:int,val:int>"
            );
            private OrcStruct pair = (OrcStruct) OrcStruct.createValue(schema);
            private final NullWritable outKey = NullWritable.get();
            @Override
            public void map(LongWritable key, Text value, Context output) throws IOException, InterruptedException {
                if(!"".equals(value.toString())){
                    //String lineStr = value.toString().trim();
                    //pair.setFieldValue("str",new Text(lineStr));
                    String[] lineStrs = value.toString().split(",");
                    pair.setFieldValue("datano",new LongWritable(Long.parseLong(lineStrs[0])));
                    pair.setFieldValue("datatime",new LongWritable(Long.parseLong(lineStrs[1])));
                    pair.setFieldValue("type",new IntWritable(Integer.parseInt(lineStrs[2])));
                    pair.setFieldValue("val",new IntWritable(Integer.parseInt(lineStrs[3])));
                    output.write(outKey, pair);
                }
            }
        }
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            // The output schema must match the TypeDescription used in the mapper
            //conf.set("orc.mapred.output.schema","struct<str:string>");
            conf.set("orc.mapred.output.schema","struct<datano:bigint,datatime:bigint,type:int,val:int>");
            Job job = Job.getInstance(conf);
            job.setJarByClass(OrcWriterMR.class);
            job.setJobName("Writer");
            String in = "file:///C:/Users/Administrator/Desktop/CAN.txt";
            String out = "file:///C:/Users/Administrator/Desktop/CAN1.orc";
            job.setMapperClass(OrcWriterMapper.class);
            job.setInputFormatClass(TextInputFormat.class);
            job.setNumReduceTasks(0); // map-only job
            job.setOutputFormatClass(OrcOutputFormat.class);
            FileInputFormat.addInputPath(job, new Path(in));
            OrcOutputFormat.setOutputPath(job, new Path(out));
            job.waitForCompletion(true);
        }
    }
    Reading an ORC file and writing it out as a text file
    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    import org.apache.orc.mapred.OrcStruct;
    import org.apache.orc.mapreduce.OrcInputFormat;

    /**
     * Reads an ORC file and writes it out as a text file.
     */
    public class OrcReaderMR {

        public static class OrcMap extends Mapper<NullWritable,OrcStruct,NullWritable,Text> {
            Text text = new Text();
            @Override
            public void map(NullWritable key, OrcStruct value, Context output) throws IOException, InterruptedException {
                // Only the first column is emitted here; extend as needed
                StringBuilder sb = new StringBuilder();
                if (!"".equals(value.getFieldValue(0).toString())){
                    sb.append(value.getFieldValue(0).toString()).append("\t");
                }
                text.set(sb.toString());
                output.write(NullWritable.get(),text);
            }
        }

        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf);
            job.setJarByClass(OrcReaderMR.class);
            job.setJobName("OrcReaderMR");
            String in = "file:///C:/Users/Administrator/Desktop/gps1/gps1.orc";
            String out = "file:///C:/Users/Administrator/Desktop/CAN信息";
            job.setMapperClass(OrcMap.class);
            OrcInputFormat.addInputPath(job, new Path(in));
            FileOutputFormat.setOutputPath(job, new Path(out));
            job.setInputFormatClass(OrcInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);
            job.setNumReduceTasks(0); // map-only job
            job.waitForCompletion(true);
        }
    }
