• pandas的合并、连接、去重、替换


      1 import pandas as pd
      2 import numpy as np
      3 
      4 # merge合并 ,类似于Excel中的vlookup
      5 
      6 df1 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
      7                     'A': ['A0', 'A1', 'A2', 'A3'],
      8                     'B': ['B0', 'B1', 'B2', 'B3']})
      9 df2 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
     10                     'C': ['C0', 'C1', 'C2', 'C3'],
     11                     'D': ['D0', 'D1', 'D2', 'D3']})
     12 df3 = pd.DataFrame({'key1': ['K0', 'K0', 'K2', 'K3'],
     13                     'key2': ['K0', 'K1', 'K0', 'K1'],
     14                     'A': ['A0', 'A1', 'A2', 'A3'],
     15                     'B': ['B0', 'B1', 'B2', 'B3']})
     16 df4 = pd.DataFrame({'key1': ['K0', 'K0', 'K2', 'K3'],
     17                     'key2': ['K0', 'K0', 'K0', 'K0'],
     18                     'C': ['C0', 'C1', 'C2', 'C3'],
     19                     'D': ['D0', 'D1', 'D2', 'D3']})
     20 print(pd.merge(df1,df2,on='key'))
     21 # 第一个DataFrame为拼接后左边的
     22 # 第二个DataFrame为拼接后右边的
     23 # on 为参考键
     24 '''
     25   key   A   B   C   D
     26 0  K0  A0  B0  C0  D0
     27 1  K1  A1  B1  C1  D1
     28 2  K2  A2  B2  C2  D2
     29 3  K3  A3  B3  C3  D3
     30 '''
     31 # 多个键连接
     32 print(pd.merge(df3, df4, on=['key1', 'key2']))
     33 # 当两个DataFrame中的key1和key2都相同时,才会连,否则不连
     34 '''
     35   key1 key2   A   B   C   D
     36 0   K0   K0  A0  B0  C0  D0
     37 1   K0   K0  A0  B0  C1  D1
     38 2   K2   K0  A2  B2  C2  D2
     39 '''
     40 # 参数how  , 合并方式
     41 # 默认,取交集
     42 print(pd.merge(df3, df4, on=['key1', 'key2'], how='inner'))
     43 print('-' * 8)
     44 '''
     45   key1 key2   A   B   C   D
     46 0   K0   K0  A0  B0  C0  D0
     47 1   K0   K0  A0  B0  C1  D1
     48 2   K2   K0  A2  B2  C2  D2
     49 --------
     50 '''
     51 # 取并集,outer,数据缺失范围NaN
     52 print(pd.merge(df3, df4, on=['key1', 'key2'], how='outer'))
     53 print('-' * 8)
     54 '''
     55   key1 key2    A    B    C    D
     56 0   K0   K0   A0   B0   C0   D0
     57 1   K0   K0   A0   B0   C1   D1
     58 2   K0   K1   A1   B1  NaN  NaN
     59 3   K2   K0   A2   B2   C2   D2
     60 4   K3   K1   A3   B3  NaN  NaN
     61 5   K3   K0  NaN  NaN   C3   D3
     62 --------
     63 '''
     64 # 参照df3为参考合并,数据缺失范围NaN
     65 print(pd.merge(df3, df4, on=['key1', 'key2'], how='left'))
     66 print('-' * 8)
     67 '''
     68   key1 key2   A   B    C    D
     69 0   K0   K0  A0  B0   C0   D0
     70 1   K0   K0  A0  B0   C1   D1
     71 2   K0   K1  A1  B1  NaN  NaN
     72 3   K2   K0  A2  B2   C2   D2
     73 4   K3   K1  A3  B3  NaN  NaN
     74 --------
     75 '''
     76 # 参照df4为参考合并,数据缺失范围NaN
     77 print(pd.merge(df3, df4, on=['key1', 'key2'], how='right'))
     78 print('-' * 8)
     79 '''
     80   key1 key2    A    B   C   D
     81 0   K0   K0   A0   B0  C0  D0
     82 1   K0   K0   A0   B0  C1  D1
     83 2   K2   K0   A2   B2  C2  D2
     84 3   K3   K0  NaN  NaN  C3  D3
     85 --------
     86 '''
     87 # 参数left_on,right_on,left_index, right_index  ,当键不为一个列时,可以单独设置左键与右键
     88 df5 = pd.DataFrame({'lkey': list('bbacaab'),
     89                     'data1': range(7)})
     90 df6 = pd.DataFrame({'rkey': list('abd'),
     91                     'date2': range(3)})
     92 print(df5)
     93 print(df6)
     94 print(pd.merge(df5,df6,left_on='lkey',right_on='rkey'))
     95 '''
     96   lkey  data1
     97 0    b      0
     98 1    b      1
     99 2    a      2
    100 3    c      3
    101 4    a      4
    102 5    a      5
    103 6    b      6
    104   rkey  date2
    105 0    a      0
    106 1    b      1
    107 2    d      2
    108   lkey  data1 rkey  date2
    109 0    b      0    b      1
    110 1    b      1    b      1
    111 2    b      6    b      1
    112 3    a      2    a      0
    113 4    a      4    a      0
    114 5    a      5    a      0
    115 '''
    116 
    117 # concat() 连接,默认axis=0  行+行,当axis=1时,列+列  成为Dataframe
    118 s1 = pd.Series([2, 3, 4])
    119 s2 = pd.Series([1, 2, 3])
    120 print(pd.concat([s1, s2]))
    121 '''
    122 0    2
    123 1    3
    124 2    4
    125 0    1
    126 1    2
    127 2    3
    128 dtype: int64
    129 '''
    130 print(pd.concat([s1,s2],axis=1))
    131 '''
    132    0  1
    133 0  2  1
    134 1  3  2
    135 2  4  3
    136 '''
    137 snew = pd.concat([s1, s2], axis=1)
    138 snew.reset_index(inplace=True)
    139 print(snew)
    140 '''
    141    index  0  1
    142 0      0  2  1
    143 1      1  3  2
    144 2      2  4  3
    145 '''
    146 snew2 = pd.concat([s1, s2], axis=1)
    147 snew2.reset_index(inplace=True, drop=True)
    148 print(snew2)
    149 '''
    150    0  1
    151 0  2  1
    152 1  3  2
    153 2  4  3
    154 '''
    155 
    156 # 去重  .duplicated()
    157 s3 = pd.Series([1, 2, 2, 4, 4, 6, 7, 6, 87])
    158 # 判断是否重复
    159 print(s3.duplicated())
    160 '''
    161 0    False
    162 1    False
    163 2     True
    164 3    False
    165 4     True
    166 5    False
    167 6    False
    168 7     True
    169 8    False
    170 dtype: bool
    171 '''
    172 # 取出重复的值
    173 s4 = s3[s3.duplicated()]
    174 print(s4)
    175 # 取出唯一的元素
    176 s5 = s3[s3.duplicated() == False]
    177 print(s5)
    178 '''
    179 0     1
    180 1     2
    181 3     4
    182 5     6
    183 6     7
    184 8    87
    185 dtype: int64
    186 '''
    187 s5 = s3.drop_duplicates()
    188 # 可以通过设置参数:inplace控制是否替换原先的值
    189 print(s5)
    190 '''
    191 0     1
    192 1     2
    193 3     4
    194 5     6
    195 6     7
    196 8    87
    197 dtype: int64
    198 '''
    199 df7 = pd.DataFrame({'key1':['a','a',3,4,3],
    200                     'key2':['a','a','b','b',5]})
    201 print(df7.duplicated())
    202 # 按行检测,第二次出现时,返回True
    203 '''
    204 0     1
    205 1     2
    206 3     4
    207 5     6
    208 6     7
    209 8    87
    210 dtype: int64
    211 '''
    212 # 今查看key2列
    213 print(df7['key2'].duplicated())
    214 '''
    215 0    False
    216 1     True
    217 2    False
    218 3     True
    219 4    False
    220 Name: key2, dtype: bool
    221 '''
    222 # 直接去重
    223 print(df7.drop_duplicates())
    224 '''
    225   key1 key2
    226 0    a    a
    227 2    3    b
    228 3    4    b
    229 4    3    5
    230 '''
    231 print(df7['key2'].drop_duplicates())
    232 '''
    233 0    a
    234 2    b
    235 4    5
    236 Name: key2, dtype: object
    237 '''
    238 
    239 # 替换  .replace()
    240 s6 = pd.Series(list('askjdghs'))
    241 # 一次性替换一个值
    242 # print(s6.replace('s','dsd'))
    243 '''
    244 0      a
    245 1    dsd
    246 2      k
    247 3      j
    248 4      d
    249 5      g
    250 6      h
    251 7    dsd
    252 dtype: object
    253 '''
    254 # 一次性替换多个值
    255 print(s6.replace(['a','s'],np.nan))
    256 '''
    257 0    NaN
    258 1    NaN
    259 2      k
    260 3      j
    261 4      d
    262 5      g
    263 6      h
    264 7    NaN
    265 dtype: object
    266 '''
    267 # 通过字典的形式替换值
    268 print(s6.replace({'a':np.nan}))
    269 '''
    270 0    NaN
    271 1      s
    272 2      k
    273 3      j
    274 4      d
    275 5      g
    276 6      h
    277 7      s
    278 dtype: object
    279 
    280 '''
  • 相关阅读:
    【JS】【8】拼接字符串
    【JS】【7】获取粘贴板的内容
    【Eclipse】【3】checkstyle插件
    【Eclipse】【2】遇到的问题
    android手机短信监听器,监听到的短信发送到web服务器,完整源码分享
    android手机号归属地查下案例源码
    免费web api接口大全
    android读取远程图片案例
    android向web提交参数的4种方式总结,附带网站案例源码
    csu 10月 月赛 B 题 Scoop water
  • 原文地址:https://www.cnblogs.com/xshan/p/10816567.html
Copyright © 2020-2023  润新知