• pig加载两个不同字段个数的文件?load file with different items(f1有42列,f2有43列读到一个对象中)


    我之前的文章提到,只加载一个文件的部分列是可行的。比如文件有两列,你只读其中一列,没问题。

    但是,两个文件,f1和f2,f1有42列,f2有43列,同时加载到一个流对象,如何?

    答:成功加载。但是无结构(schema unknown),DESCRIBE后看到:Schema for origin_cleaned_data unknown。

    这种情况类似union,合并两个不同列的对象,会生成一个未知模式对象。


    背景:老日志有42列,新日志在第20列处多加了一列,导致第20列之后的各列位置错开,无法用同一个模式来命名;但又需要统计全部日志的用户点击数。所以一起加载,统一统计。

    (如果知道不同日期日志的类型,则可以分别读入,指定明确模式,然后用onschema进行union,再分别统计。可惜是接手的项目,不确定线上是哪天改的)

    采样:老日志log_without.txt,新日志log_with_android_ad_id.txt

    代码如下

    REGISTER piggybank.jar;
    DEFINE SequenceFileLoader org.apache.pig.piggybank.storage.SequenceFileLoader();

    %default cleanedLog /user/wizad/tmp/log_*

    --%default cleanedLog1 /home/wizad/lmj/log_without.txt
    --%default cleanedLog2 /home/wizad/lmj/log_with_android_ad_id.txt

    origin_cleaned_data = LOAD '$cleanedLog' USING PigStorage(','); 

    DUMP origin_cleaned_data;

    DESCRIBE origin_cleaned_data;


    显示结果:

    ((null) 5,74,48809e40-b8d7-41a4-bf68-d0f8e28140ad,575356365101899146,2014-07-30 10:33:56,2014-07-30 10:33:56,1,57074,2,,,,,,,,151.87.202.1,1,-1,-1,lmj,-1,1ac2c73e-d93a-4801-a7ee-da05473d0585,48809e40-b8d7-41a4-bf68-d0f8e28140ad,02:00:00:00:00:00,1940064625594046032,,,,d70cc494,25100,206,,0,2,2,7.1,,,,42.833298,12.833298,120232210032202)
    ((null) 5,74,357633052513139,1033882907630785616,2014-07-30 11:15:05,2014-07-30 11:15:05,1,57074,2,,,,,,,,155.128.32.119,1,357633052513139,270f213575a4eda7,lmj,270f213575a4eda7,,,40:0e:85:40:0e:1a,-7537294162085162169,,,,7626e397,62713,206,,2,1,3,4.3,,,,37.774902,-122.4194,023010203333003)
    ((null) 5,74,e7a4afce-ffd9-4ecd-b916-39f9d793c218,207640323432175503,2014-07-30 10:29:22,2014-07-30 10:29:22,1,57074,2,,,,,,,,111.200.142.163,1,-1,-1,lmj,-1,14ea5e95237f34e278d7ac210173d6b8ad9d5026,e7a4afce-ffd9-4ecd-b916-39f9d793c218,02:00:00:00:00:00,1179719885610920154,,,,d4eeab6e,66104,101,,0,2,2,7.1,1,7,7,39.928894,116.388306,132100103322203)
    ((null) 5,74,48809e40-b8d7-41a4-bf68-d0f8e28140ad,575356365101899146,2014-07-30 10:33:56,2014-07-30 10:33:56,1,57074,2,,,,,,,,151.87.202.1,1,-1,-1,-1,1ac2c73e-d93a-4801-a7ee-da05473d0585,48809e40-b8d7-41a4-bf68-d0f8e28140ad,02:00:00:00:00:00,1940064625594046032,,,,d70cc494,25100,206,,0,2,2,7.1,,,,42.833298,12.833298,120232210032202)
    ((null) 5,74,302bd8f1-b974-4af5-8183-1f67d27410d6,367366268601246781,2014-07-30 10:07:57,2014-07-30 10:07:57,1,57074,2,,,,,,,,56.2.255.220,1,-1,-1,-1,c165376f9f76cf68862a505328b7ba7cd0cfa0b0,302bd8f1-b974-4af5-8183-1f67d27410d6,02:00:00:00:00:00,-488564527359896578,,,,103b14d3,25100,206,,0,2,2,7.1,,,,37.774902,-122.4194,023010203333003)
    ((null) 5,74,e7a4afce-ffd9-4ecd-b916-39f9d793c218,207640323432175503,2014-07-30 10:29:22,2014-07-30 10:29:22,1,57074,2,,,,,,,,111.200.142.163,1,-1,-1,-1,14ea5e95237f34e278d7ac210173d6b8ad9d5026,e7a4afce-ffd9-4ecd-b916-39f9d793c218,02:00:00:00:00:00,1179719885610920154,,,,d4eeab6e,66104,101,,0,2,2,7.1,1,7,7,39.928894,116.388306,132100103322203)
    Schema for origin_cleaned_data unknown.


    多了一列值为lmj的列。可以看到无结构。


    union:合并不同格式的列

    union不去重复行

    A = load 'input1' as (x:int, y:float);
    B = load 'input2' as (x:int, y:chararray);
    C = union A, B;
    describe C;
    显示结果:
    Schema for C unknown

    两个不同列名的变量union用onschema

    需要注意:使用onschema时,所有的输入都必须有明确的schema,否则会报错。因为onschema合并时,是按列名和列类型进行匹配的(类型可以自动从低级向高级转换)。

    合并后,空出的列会补null。

    A = load 'input1' as (w: chararray, x:int, y:float);
    B = load 'input2' as (x:int, y:double, z:chararray);
    C = union onschema A, B;
    describe C;
    结果:
    C: {w: chararray,x: int,y: double,z: chararray}

    给出一个不能union的代码例子

    %default cleanedLog1 /home/wizad/lmj/log_without.txt
    %default cleanedLog2 /home/wizad/lmj/log_with_android_ad_id.txt

    origin1 = LOAD '$cleanedLog1' USING PigStorage(','); 
    origin2 = LOAD '$cleanedLog2' USING PigStorage(','); 

    DESCRIBE origin1
    DESCRIBE origin2

    origin = union origin1,origin2

    结果:

    对origin1和origin2分别DESCRIBE,都显示模式未知(Schema for origin1 unknown. 和 Schema for origin2 unknown.)。

    所以origin不能生成

  • 相关阅读:
    在 Windows 上测试 Redis Cluster的集群填坑笔记
    vmware安装黑苹果教程
    微信支付v3发布到iis时的证书问题
    Linux下安装SQL Server 2016(连接篇SQL Server on linux)
    Linux下安装SQL Server 2016(连接篇SQL Server on linux)
    Linux下安装SQL Server 2016(安装篇SQL Server on linux)
    Linux下安装SQL Server 2016(准备篇SQL Server on linux)
    客服端与服务端APP支付宝支付接口联调的那些坑
    [ASP.NET MVC]WebAPI应用支持HTTPS的经验总结
    .net平台下C#socket通信(中)
  • 原文地址:https://www.cnblogs.com/cl1024cl/p/6205420.html
Copyright © 2020-2023  润新知