• 数据规整化:清理、转换、合并、重塑


    你也可以来看我的Github上的原文,欢迎交流:
    https://github.com/AsuraDong/Blog/blob/master/Articles/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0/%E6%95%B0%E6%8D%AE%E8%A7%84%E6%95%B4%E5%8C%96%EF%BC%9A%E6%B8%85%E7%90%86%E3%80%81%E8%BD%AC%E6%8D%A2%E3%80%81%E5%90%88%E5%B9%B6%E3%80%81%E9%87%8D%E5%A1%91.md
    import pandas as pd
    import numpy as np
    from pandas import DataFrame
    from pandas import Series

    1.合并数据集

    • pd.merge():各种参数的使用
    df1 = DataFrame({'key':['b','b','a','c','a','a','b'],
                       'data1':[i for i in range(7)]})
    df2 = DataFrame({'key':['a','b','d'],
                        'data2':[i for i in range(3)]})
    print(df1)
       data1 key
    0      0   b
    1      1   b
    2      2   a
    3      3   c
    4      4   a
    5      5   a
    6      6   b
    
    print(df2)
       data2 key
    0      0   a
    1      1   b
    2      2   d
    
    print(pd.merge(df1,df2,on='key') )
    #pd1 和 pd2 进行inner联结
    #on :指明将列当做键。默认是重叠的。
       data1 key  data2
    0      0   b      1
    1      1   b      1
    2      6   b      1
    3      2   a      0
    4      4   a      0
    5      5   a      0
    
    df3 = DataFrame({'key1':['b','b','a','c','a','a','b'],
                       'data1':[i for i in range(7)]})
    df4 = DataFrame({'key2':['a','b','d'],
                        'data2':[i for i in range(3)]})
    # 如果没有重叠的列名
    # 分别指定
    print(pd.merge(df3,df4,left_on="key1",right_on='key2'))
       data1 key1  data2 key2
    0      0    b      1    b
    1      1    b      1    b
    2      6    b      1    b
    3      2    a      0    a
    4      4    a      0    a
    5      5    a      0    a
    
    print(pd.merge(df1,df2,how="outer")) # 类似数据库的操作:inner、outer、left、right
       data1 key  data2
    0    0.0   b    1.0
    1    1.0   b    1.0
    2    6.0   b    1.0
    3    2.0   a    0.0
    4    4.0   a    0.0
    5    5.0   a    0.0
    6    3.0   c    NaN
    7    NaN   d    2.0
    

    1.1索引上的合并

    left1 = DataFrame({'key':['a','b','a','a','b','c'],
                       'data1':[i for i in range(6)]})
    right1 = DataFrame({'group_val':[3.5,7]},index=['a','b'])
    
    print(right1)
    print(left1)
       group_val
    a        3.5
    b        7.0
       data1 key
    0      0   a
    1      1   b
    2      2   a
    3      3   a
    4      4   b
    5      5   c
    
    print(pd.merge(left1,right1,left_on='key',right_index=True))
       data1 key  group_val
    0      0   a        3.5
    2      2   a        3.5
    3      3   a        3.5
    1      1   b        7.0
    4      4   b        7.0
    

    1.2轴向连接

    • np.concatenate
    • pd.concat([s1,s2,s3])
    arr = np.arange(12).reshape((3,4))
    arr
    array([[ 0,  1,  2,  3],
           [ 4,  5,  6,  7],
           [ 8,  9, 10, 11]])
    
    np.concatenate([arr,arr],axis=1) #numpy对象的方法
    array([[ 0,  1,  2,  3,  0,  1,  2,  3],
           [ 4,  5,  6,  7,  4,  5,  6,  7],
           [ 8,  9, 10, 11,  8,  9, 10, 11]])
    
    s1 = Series([0,1],index=['a','b'])
    s2 = Series([2,3,4],index=['c','d','e'])
    s3 = Series([5,6],index=['f','g'])
    pd.concat([s1,s2,s3])
    a    0
    b    1
    c    2
    d    3
    e    4
    f    5
    g    6
    dtype: int64
    
    print(pd.concat([s1,s2,s3],axis=1)) # 按照列
         0    1    2
    a  0.0  NaN  NaN
    b  1.0  NaN  NaN
    c  NaN  2.0  NaN
    d  NaN  3.0  NaN
    e  NaN  4.0  NaN
    f  NaN  NaN  5.0
    g  NaN  NaN  6.0
    
    s4 = pd.concat([s1*5,s3])
    print(pd.concat([s1,s4],axis=1))
    print(pd.concat([s1,s4],axis=1,join="inner")) #取交集
    print(pd.concat([s1,s1,s1],keys=['one','two','three']))# 创建层次化索引
    print(pd.concat([s1,s1,s1],axis=1,keys=['one','two','three']))# 此时,层次化keys会变成列头
         0  1
    a  0.0  0
    b  1.0  5
    f  NaN  5
    g  NaN  6
       0  1
    a  0  0
    b  1  5
    one    a    0
           b    1
    two    a    0
           b    1
    three  a    0
           b    1
    dtype: int64
       one  two  three
    a    0    0      0
    b    1    1      1
    
    # 下面展示对于DataFrame的效果
    df1 = DataFrame(np.arange(6).reshape(3,2),index=['a','b','c'],
                   columns=['one','two'])
    df2 = DataFrame(np.arange(4).reshape(2,2),index=['a','c'],
                   columns=['three','four'])
    print(df1);print(df2)
       one  two
    a    0    1
    b    2    3
    c    4    5
       three  four
    a      0     1
    c      2     3
    
    print(pd.concat([df1,df2],axis=1,keys=['level1','level2']))
    print(pd.concat([df1,df2],axis=1,keys=['level1','level2'],names=['upper','lower']))
      level1     level2     
         one two  three four
    a      0   1    0.0  1.0
    b      2   3    NaN  NaN
    c      4   5    2.0  3.0
    upper level1     level2     
    lower    one two  three four
    a          0   1    0.0  1.0
    b          2   3    NaN  NaN
    c          4   5    2.0  3.0
    
    df1 = DataFrame(np.random.randn(3,4),columns=['a','b','c','d'])
    df2 = DataFrame(np.random.randn(2,3),columns=['b','d','a'])
    print(df1)
    print(df2)
              a         b         c         d
    0 -0.447944  0.147116  0.342810 -0.618590
    1  0.930223  0.379391 -0.994802  0.008333
    2 -0.239952  0.150497 -0.603801 -1.153930
              b         d         a
    0 -0.332100  0.245658  1.448707
    1  1.137389 -0.170095 -1.552497
    
    print(pd.concat([df1,df2]))
    
    print("应该用这种发法")
    print(pd.concat([df1,df2],ignore_index=True)) #不保留原来的行索引,产生新的行索引
              a         b         c         d
    0 -0.447944  0.147116  0.342810 -0.618590
    1  0.930223  0.379391 -0.994802  0.008333
    2 -0.239952  0.150497 -0.603801 -1.153930
    0  1.448707 -0.332100       NaN  0.245658
    1 -1.552497  1.137389       NaN -0.170095
    应该用这种发法
              a         b         c         d
    0 -0.447944  0.147116  0.342810 -0.618590
    1  0.930223  0.379391 -0.994802  0.008333
    2 -0.239952  0.150497 -0.603801 -1.153930
    3  1.448707 -0.332100       NaN  0.245658
    4 -1.552497  1.137389       NaN -0.170095
    

    1.4合并重叠数据

    • np.where
    • pandas:combine_first
    # Array或者Series合并
    a = Series([1,2,3,np.nan,4])
    b= Series([1,3,3,4,np.nan])
    print(np.where(pd.isnull(a),b,a))
    [ 1.  2.  3.  4.  4.]
    
    # Pandas提供了combine_first方法
    df1 = DataFrame({'a':[1,2,np.nan],
                    'b':[2,3,5],
                    'c':[8,np.nan,0]})
    df2 = DataFrame({'a':[5.,4],
                    'b':[np.nan,3.]})
    print(df1.combine_first(df2))
         a  b    c
    0  1.0  2  8.0
    1  2.0  3  NaN
    2  NaN  5  0.0
    

    2.重塑和轴向旋转

    2.1重塑层次化索引

    • stack:将列旋转为行
    • unstack:将行旋转为列
    • 默认情况下都是最内层
    data = DataFrame(np.arange(6).reshape((2,3)),
                    index=pd.Index(['Ohio','Colorade'],name='state'),
                    columns=pd.Index(['onw','two','three'],name='number'))
    print(data)
    number    onw  two  three
    state                    
    Ohio        0    1      2
    Colorade    3    4      5
    
    result = data.stack()
    print(result)
    state     number
    Ohio      onw       0
              two       1
              three     2
    Colorade  onw       3
              two       4
              three     5
    dtype: int32
    
    print(result.unstack()) #我们再重排回去
    number    onw  two  three
    state                    
    Ohio        0    1      2
    Colorade    3    4      5
    
    print(result.unstack(1))
    print(result.unstack(0))
    print(result.unstack('state'))
    number    onw  two  three
    state                    
    Ohio        0    1      2
    Colorade    3    4      5
    state   Ohio  Colorade
    number                
    onw        0         3
    two        1         4
    three      2         5
    state   Ohio  Colorade
    number                
    onw        0         3
    two        1         4
    three      2         5
    

    2.2长格式=>宽格式

    数据的格式如下:

    处理成如下形式:

    3.过滤、清洗和其他转换工作

    Part:1-2 讲的是过滤工作

    3.1移除重复数据

    • duplicated():返回是否重复列
    • drop_duplicates():去除重复列
    data = DataFrame({'k1':['one']*3+['two']*4,
                     'k2':[1,1,2,3,3,4,4]})
    print(data)
        k1  k2
    0  one   1
    1  one   1
    2  one   2
    3  two   3
    4  two   3
    5  two   4
    6  two   4
    
    print(data.duplicated())
    0    False
    1     True
    2    False
    3    False
    4     True
    5    False
    6     True
    dtype: bool
    
    print(data.drop_duplicates())
        k1  k2
    0  one   1
    2  one   2
    3  two   3
    5  two   4
    
    # 若,只希望根据k1列过滤重复项
    data['v1'] = range(7)
    print(data.drop_duplicates(['k1']))
        k1  k2  v1
    0  one   1   0
    3  two   3   3
    

    3.2利用函数映射进行数据转换

    就是调用map函数:可以一个函数或者有映射关系的字典

    data = DataFrame({'food':['a','b','c']})
    print(data)
      food
    0    a
    1    b
    2    c
    
    animal_to_food = {'a':'pig','b':'dog','c':'cat'}
    data['animal'] = data['food'].map(str.lower).map(animal_to_food)
    print(data)
      food animal
    0    a    pig
    1    b    dog
    2    c    cat
    
    del data['animal']
    # 通过函数来完成
    data['food'].map(lambda x: animal_to_food[x])
    0    pig
    1    dog
    2    cat
    Name: food, dtype: object
    

    3.3替换值

    fillnamap都可以,但更好的是:replace

    data = Series([1,-999,-10000000,5])
    print(data)
    data = data.replace(-999,np.nan).copy()
    print(data)
    # 一次性替换多个值
    # data.replace([-999,-1000],np.nan)
    # 对应替换
    # data.replace([-999,-1000],[np.nan,0])
    # 对应替换:字典
    # data.replace({-999:np.nan})
    0           1
    1        -999
    2   -10000000
    3           5
    dtype: int64
    0           1.0
    1           NaN
    2   -10000000.0
    3           5.0
    dtype: float64
    

    3.4重命名轴/列索引

    • 方法一:data.index/columns:都有map方法
    • 方法二(推荐):data.rename(index = func,columns = func):func也可以是映射关系的字典
    data = DataFrame(np.arange(12).reshape((3,4)),
                    index=['Ohio','Colorda','NesYord'],
                    columns = ['One','two','three','four'])
    print(data)
             One  two  three  four
    Ohio       0    1      2     3
    Colorda    4    5      6     7
    NesYord    8    9     10    11
    
    data.index = data.index.map(str.upper)
    print(data)
             One  two  three  four
    OHIO       0    1      2     3
    COLORDA    4    5      6     7
    NESYORD    8    9     10    11
    
    print(data.rename(index=str.title,columns=str.upper))
             ONE  TWO  THREE  FOUR
    Ohio       0    1      2     3
    Colorda    4    5      6     7
    Nesyord    8    9     10    11
    
    # 如果希望就地修改某个数据集,inplace=True
    data.rename(index={'OHIO':'oooo'},columns=str.lower,inplace = True)
    print(data)
             one  two  three  four
    oooo       0    1      2     3
    COLORDA    4    5      6     7
    NESYORD    8    9     10    11
    

    3.5分组:离散化和面元划分

    pd.cut()函数 
    pd.qcut():会获得数量相等的面元

    ages = [20,22,25,27,21,23,37,31,61,45]
    bins = [18,25,35,60,100]
    cats = pd.cut(ages,bins) # bins可以是整数,自动计算。参数labels:每一组的名字。right可以修改左右开闭
    print(cats)
    [(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], (18, 25], (35, 60], (25, 35], (60, 100], (35, 60]]
    Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
    
    print(cats.codes) #对应的第几个区间
    [0 0 0 1 0 0 2 1 3 2]
    
    print(pd.value_counts(cats)) # 查找
    (18, 25]     5
    (35, 60]     2
    (25, 35]     2
    (60, 100]    1
    dtype: int64
    

    3.6检测过滤异常值

    • 注意describe()的应用
    np.random.seed(12345)
    data = DataFrame(np.random.randn(1000,4))
    print(data.describe())
                     0            1            2            3
    count  1000.000000  1000.000000  1000.000000  1000.000000
    mean     -0.067684     0.067924     0.025598    -0.002298
    std       0.998035     0.992106     1.006835     0.996794
    min      -3.428254    -3.548824    -3.184377    -3.745356
    25%      -0.774890    -0.591841    -0.641675    -0.644144
    50%      -0.116401     0.101143     0.002073    -0.013611
    75%       0.616366     0.780282     0.680391     0.654328
    max       3.366626     2.653656     3.260383     3.927528
    
    col = data[3]
    print(col[np.abs(col)>3])
    97     3.927528
    305   -3.399312
    400   -3.745356
    Name: 3, dtype: float64
    
    print(data[(np.abs(data)>3).any(1)]) # 出现绝对值大于3的行
                0         1         2         3
    5   -0.539741  0.476985  3.248944 -1.021228
    97  -0.774363  0.552936  0.106061  3.927528
    102 -0.655054 -0.565230  3.176873  0.959533
    305 -2.315555  0.457246 -0.025907 -3.399312
    324  0.050188  1.951312  3.260383  0.963301
    400  0.146326  0.508391 -0.196713 -3.745356
    499 -0.293333 -0.242459 -3.056990  1.918403
    523 -3.428254 -0.296336 -0.439938 -0.867165
    586  0.275144  1.179227 -3.184377  1.369891
    808 -0.362528 -3.548824  1.553205 -2.186301
    900  3.366626 -2.372214  0.851010  1.332846
    

    3.7排列和随机采样

    • np.random.permutation():返回随机重排列的index
    • DataFrame.take(上面的index):重新排列
    df = DataFrame(np.arange(5*4).reshape(5,4))
    sampler = np.random.permutation(len(df))
    print(sampler)
    [1 0 4 3 2]
    
    print(df)
    print(df.take(sampler))
    
        0   1   2   3
    0   0   1   2   3
    1   4   5   6   7
    2   8   9  10  11
    3  12  13  14  15
    4  16  17  18  19
        0   1   2   3
    1   4   5   6   7
    0   0   1   2   3
    4  16  17  18  19
    3  12  13  14  15
    2   8   9  10  11
  • 相关阅读:
    信令基本概念
    CMMI
    关于OpenDataSource, OpenRowSet
    冒泡排序
    使用Sqlldr向oracle导入数据
    PowerDesigner生成sql和反向工程生成ER图的问题
    2021.1.4 学习总结
    12天 —— 关于生活与目标的思考【2020.8.5~2020.8.17】
    大一暑假学习总结(七)【2020.7.28~2020.8.4】
    学习:用javascript增加、删除行(转)
  • 原文地址:https://www.cnblogs.com/AsuraDong/p/7413113.html
Copyright © 2020-2023  润新知