• Pandas合并数据集之merge、join方法


    合并数据集

    • pandas.merge 可根据一个或多个键将不同DataFrame中的行连接起来。
    • pandas.concat 可以沿着一条轴将多个对象堆叠到一起。
    • combine_first

    merge

    默认情况下,merge做的是'inner'连接;结果中的键是交集

    和数据库中的left、right以及outer连接这些外连全部是形成笛卡尔积

    merge合并的数据如果是多对多,则是笛卡尔积的形式合并

    
    import pandas as pd
    import numpy as np
    df1 = pd.DataFrame({'key1':['b','b','a','c','a','a','b'],
                        'data1':range(7)
                       })
    df2 = pd.DataFrame({'key1':['a','b','d'],
                        'data2':range(3)
                       })
    
    df1
    
        key1	data1
    0	b	0
    1	b	1
    2	a	2
    3	c	3
    4	a	4
    5	a	5
    6	b	6
    
    df2
    
        key1	data2
    0	a	0
    1	b	1
    2	d	2
    
    # merge默认会合并相同的列名,但是最好显示指定一下,on用于连接左右都存在的列名,如果只有一侧有,那不能使用on,使用left_on或者right_on
    pd.merge(df1, df2, on='key1',how='left')
    
        key1	data1	data2
    0	b	0	1.0
    1	b	1	1.0
    2	a	2	0.0
    3	c	3	NaN
    4	a	4	0.0
    5	a	5	0.0
    6	b	6	1.0
    
    # 如果两个对象的列名不同,可以详细显示合并的各列情况
    df3 = pd.DataFrame({'key1':['b','b','a','c','a','a','b'],
                        'data1':range(7)
                       })
    df4 = pd.DataFrame({'key2':['a','b','d'],
                        'data2':range(3)
                       })
    
    # left_on,right_on用于连接存在于一方的列
    pd.merge(df3,df4,left_on='key1',right_on='key2')
    
        key1	data1	key2	data2
    0	b	0	b	1
    1	b	1	b	1
    2	b	6	b	1
    3	a	2	a	0
    4	a	4	a	0
    5	a	5	a	0
    
    # how参数里面填写连接的类别,有left、right、outer分别对应左连接,右连接,全连接
    pd.merge(df3, df4,left_on='key1',right_on='key2',how='outer')
    
        key1	data1	key2	data2
    0	b	0.0	b	1.0
    1	b	1.0	b	1.0
    2	b	6.0	b	1.0
    3	a	2.0	a	0.0
    4	a	4.0	a	0.0
    5	a	5.0	a	0.0
    6	c	3.0	NaN	NaN
    7	NaN	NaN	d	2.0
    
    # 示例数据源
    df5 = pd.DataFrame({'key':['b','b','a','c','a','b'],
                        'data1':range(6)
                       })
    df6 = pd.DataFrame({'key':['a','b','a','b','d'],
                        'data2':range(5)
                       })
    df5
        key	data1
    0	b	0
    1	b	1
    2	a	2
    3	c	3
    4	a	4
    5	b	5
    
    df6
        key	data2
    0	a	0
    1	b	1
    2	a	2
    3	b	3
    4	d	4
    
    # 左外连接的笛卡尔积,左右3个b,右有2个b,2*3=6个b,又会形成并集
    pd.merge(df5,df6,how='left')
    
        key	data1	data2
    0	b	0	1.0
    1	b	0	3.0
    2	b	1	1.0
    3	b	1	3.0
    4	a	2	0.0
    5	a	2	2.0
    6	c	3	NaN
    7	a	4	0.0
    8	a	4	2.0
    9	b	5	1.0
    10	b	5	3.0
    
    left = pd.DataFrame({'key1':['foo','foo','bar'],
                        'key2':['one','two','one'],
                         'lval':[1,2,3]})
    right = pd.DataFrame({'key1':['foo','foo','bar','bar'],
                        'key2':['one','one','one','two'],
                         'rval':[4,5,6,7]})
    
    left
    
        key1	key2	lval
    0	foo	one	1
    1	foo	two	2
    2	bar	one	3
    
    right
    
        key1	key2	rval
    0	foo	one	4
    1	foo	one	5
    2	bar	one	6
    3	bar	two	7
    
    # 合并多个列名,需要把他们当做一个整体看,然后做笛卡尔积
    pd.merge(left,right,on=['key1','key2'],how='outer')
    
        key1	key2	lval	rval
    0	foo	one	1.0	4.0
    1	foo	one	1.0	5.0
    2	foo	two	2.0	NaN
    3	bar	one	3.0	6.0
    4	bar	two	NaN	7.0
    
    # suffixes用于指定附加到左右两个dataframe对象的列标签的名,下面是是原始的命名,默认加_x,_y
    pd.merge(left,right,on='key1')
    
        key1	key2_x	lval	key2_y	rval
    0	foo	one	1	one	4
    1	foo	one	1	one	5
    2	foo	two	2	one	4
    3	foo	two	2	one	5
    4	bar	one	3	one	6
    5	bar	one	3	two	7
    
    # suffixes就是指定一下名字,会形成下面的效果
    pd.merge(left,right,on='key1',suffixes=('_left','_right'))
    
        key1	key2_left	lval	key2_right	rval
    0	foo	one	1	one	4
    1	foo	one	1	one	5
    2	foo	two	2	one	4
    3	foo	two	2	one	5
    4	bar	one	3	one	6
    5	bar	one	3	two	7
    
    索引上的合并
    
    left1 = pd.DataFrame({'key':['a','b','a','a','b','c'],
                         'value':range(6)})
    right1 = pd.DataFrame({'group_val':[3.5,7]},index=['a','b'])
    
    left1
        
        key	value
    0	a	0
    1	b	1
    2	a	2
    3	a	3
    4	b	4
    5	c	5
    
    right1
        
        group_val
    a	3.5
    b	7.0
    
    # 上面是用key的列值去合并右侧的行索引,right_index开启将行索引用作连接的键,如果是左侧的表,同理有left_index,默认交集
    
    pd.merge(left1, right1, left_on='key',right_index=True)
    
        key	value	group_val
    0	a	0	3.5
    2	a	2	3.5
    3	a	3	3.5
    1	b	1	7.0
    4	b	4	7.0
    
    # 指定并集
    pd.merge(left1, right1, left_on='key',right_index=True,how='outer')
    
        key	value	group_val
    0	a	0	3.5
    2	a	2	3.5
    3	a	3	3.5
    1	b	1	7.0
    4	b	4	7.0
    5	c	5	NaN
    
    层次化索引的合并
    lefth = pd.DataFrame({'key1':['Ohio','Ohio','Ohio','Nevada','Nevada'],
                          'key2':[2000,2001,2002,2001,2002],
                           'data':np.arange(5.)})
    
    # lefth.set_index(['key1','key2']).T 将列转为层次化行索引
    lefth
    
        key1	key2	data
    0	Ohio	2000	0.0
    1	Ohio	2001	1.0
    2	Ohio	2002	2.0
    3	Nevada	2001	3.0
    4	Nevada	2002	4.0
    
    
    right1 = pd.DataFrame(np.arange(12).reshape(6,2),index=[
        ['Nevada','Nevada','Ohio','Ohio','Ohio','Ohio'],
        [2001,2000,2000,2000,2001,2002]
    ],columns=['data1','data2'])
    
    right1
    
                data1	data2
    Nevada	2001	0	1
            2000	2	3
    Ohio	2000	4	5
            2000	6	7
            2001	8	9
            2002	10	11
    
    # 对于层次化索引的合并,左侧的列名是右侧的行索引,故开启right_index
    
    pd.merge(lefth,right1,left_on=['key1','key2'],right_index=True)
    
        key1	key2	data	data1	data2
    0	Ohio	2000	0.0	4	5
    0	Ohio	2000	0.0	6	7
    1	Ohio	2001	1.0	8	9
    2	Ohio	2002	2.0	10	11
    3	Nevada	2001	3.0	0	1
    
    # 合并双方的索引
    left2 = pd.DataFrame([[1.,2.],[3.,4.],[5.,6.]],index=['a','c','e'],columns=['Ohio','Nevada'])
    right2 = pd.DataFrame([[7.,8.],[9.,10.],[11.,12.],[13,14]],index=['b','c','d','e'],columns=['Missouri','Alabama'])
    
    pd.merge(left2,right2,how='outer',left_index=True,right_index=True)
    
        Ohio	Nevada	Missouri	Alabama
    a	1.0	2.0	NaN	NaN
    b	NaN	NaN	7.0	8.0
    c	3.0	4.0	9.0	10.0
    d	NaN	NaN	11.0	12.0
    e	5.0	6.0	13.0	14.0
    
    # join方法,更方便实现按索引合并
    left2.join(right2,how='outer')
    
        Ohio	Nevada	Missouri	Alabama
    a	1.0	2.0	NaN	NaN
    b	NaN	NaN	7.0	8.0
    c	3.0	4.0	9.0	10.0
    d	NaN	NaN	11.0	12.0
    e	5.0	6.0	13.0	14.0
    
    left3 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3','A4'],
                         'B': ['B0', 'B1', 'B2', 'B3','b4'],
                         'key': ['K0', 'K1', 'K0', 'K1','C1']})
    
    right3 = pd.DataFrame({'C': ['C0', 'C1'],
                          'D': ['D0', 'D1']},
                        index=['K0', 'K1'])
    
    # 如果列的值是另一个dataframe的行索引
    left3.join(right3,on='key')
    
        A	B	key	C	D
    0	A0	B0	K0	C0	D0
    1	A1	B1	K1	C1	D1
    2	A2	B2	K0	C0	D0
    3	A3	B3	K1	C1	D1
    4	A4	b4	C1	NaN	NaN
    
    # 对于简单的索引合并,你可以向join传入一组DataFrame
    another = pd.DataFrame([[7.,8.],[9.,10.],[11.,12.],[16.,17.]],index=['a','c','e','f'],columns=['New York','Oregon'])
    
    left2.join([right2,another])
    
        Ohio	Nevada	Missouri	Alabama	New York	Oregon
    a	1.0	2.0	NaN	NaN	7.0	8.0
    c	3.0	4.0	9.0	10.0	9.0	10.0
    e	5.0	6.0	13.0	14.0	11.0	12.0
    
  • 相关阅读:
    java代码---------实现布尔型的功能,是否执行下一步的关键
    java代码--------实现随机输出100个随机数,10行,0--到9的数字
    java代码--------打印三角形
    java代码------------条件运算符 ?:
    java代码-----------逻辑运算符、 &&逻辑与 ||或
    java代码------计算器
    C/C++ ===复习==函数返回值问题(集合体==网络)
    Centos6.5添加163软件yum源
    python2.7.5 +eric4.4.2+PyQt4-4.10.3
    python yield初探 (转)
  • 原文地址:https://www.cnblogs.com/lishi-jie/p/10009169.html
Copyright © 2020-2023  润新知