• pandas入门:基本功能


    重新索引

    from pandas import Series,DataFrame
    
    # Reindexing a Series: build a Series and conform it to new sets of labels.
    obj = Series([4.5,7.2,-5.3,3.6],index=['d','b','a','c'])
    print(obj)
    '''
    d    4.5
    b    7.2
    a   -5.3
    c    3.6
    dtype: float64
    '''
    # reindex rearranges the data to follow the new index; a label that is not
    # present in the current index is introduced with a missing value (NaN)
    obj2 = obj.reindex(['a','b','c','d','e'])
    print(obj2)
    '''
    a   -5.3
    b    7.2
    c    3.6
    d    4.5
    e    NaN
    dtype: float64
    '''
    obj3 = obj.reindex(['a','b','c','d','e'],fill_value=0)
    print(obj3)
    # fill_value supplies the value used for labels missing from the original index
    '''
    a   -5.3
    b    7.2
    c    3.6
    d    4.5
    e    0.0
    dtype: float64
    '''
    # method='ffill' fills forward from the previous label; method='bfill' fills
    # backward from the next label
    obj4 = Series(['blue','purpul','yellow'],index=[0,1,4])
    obj5 = obj4.reindex(range(6),method='ffill')
    print(obj5)
    '''
    0      blue
    1    purpul
    2    purpul
    3    purpul
    4    yellow
    5    yellow
    dtype: object
    '''
    # With bfill the last label (5) has no later value to copy, so it stays NaN
    obj6 = obj4.reindex(range(6),method='bfill')
    print(obj6)
    '''
    0      blue
    1    purpul
    2    yellow
    3    yellow
    4    yellow
    5       NaN
    dtype: object
    '''
    
    from pandas import Series,DataFrame
    import numpy as np
    
    # Reindexing a DataFrame: rows, columns, or both can be conformed to new labels.
    frame = DataFrame(np.arange(9).reshape(3,3),index=['a','c','d'],columns=['Ohio','Texas','California'])
    print(frame)
    '''
       Ohio  Texas  California
    a     0      1           2
    c     3      4           5
    d     6      7           8
    '''
    # A single positional argument reindexes the rows; the new row 'b' is all NaN,
    # which turns the integer columns into floats
    frame2 = frame.reindex(['a','b','c','d'])
    print(frame2)
    '''
       Ohio  Texas  California
    a   0.0    1.0         2.0
    b   NaN    NaN         NaN
    c   3.0    4.0         5.0
    d   6.0    7.0         8.0
    '''
    # The columns keyword reindexes the columns instead
    states = ['Texas','Utah','California']
    frame3 = frame.reindex(columns=states)
    print(frame3)
    '''
       Texas  Utah  California
    a      1   NaN           2
    c      4   NaN           5
    d      7   NaN           8
    '''
    # Rows and columns can be reindexed in one call
    frame4 = frame.reindex(index=['a','b','c','d'],columns=['Ohio','Texas','California','Utah'])
    print(frame4)
    '''
       Ohio  Texas  California  Utah
    a   0.0    1.0         2.0   NaN
    b   NaN    NaN         NaN   NaN
    c   3.0    4.0         5.0   NaN
    d   6.0    7.0         8.0   NaN
    '''
    
    # Label-based indexing selects rows and columns more concisely. The .ix
    # indexer was removed from pandas, so .loc is used here; unlike reindex,
    # .loc requires every requested label to already exist.
    frame5 = frame.loc[['a','c','d'],['Ohio','Texas','California']]
    print(frame5)
    '''
       Ohio  Texas  California
    a     0      1           2
    c     3      4           5
    d     6      7           8
    '''
    

    丢弃指定轴上的项

    from pandas import Series,DataFrame
    import numpy as np
    
    # drop returns a new object with the given labels removed; the original is unchanged
    obj = Series(np.arange(5),index=['a','b','c','d','e'])
    new_obj = obj.drop('c')
    print(new_obj)
    '''
    a    0
    b    1
    d    3
    e    4
    dtype: int32
    '''
    # A list of labels drops several entries at once
    new_obj = obj.drop(['d','c'])
    print(new_obj)
    '''
    a    0
    b    1
    e    4
    dtype: int32
    '''
    
    # For a DataFrame, labels can be dropped from either axis
    data = DataFrame(np.arange(16).reshape((4,4)),
                     index=[1,2,3,4],
                     columns=['one','two','three','four'])
    # Without an axis argument the labels are looked up in the row index
    new_data = data.drop([1,3])
    print(new_data)
    '''
       one  two  three  four
    2    4    5      6     7
    4   12   13     14    15
    '''
    # axis=1 drops column labels
    new_data = data.drop('two',axis=1)
    print(new_data)
    '''
       one  three  four
    1    0      2     3
    2    4      6     7
    3    8     10    11
    4   12     14    15
    '''
    new_data = data.drop(['two','four'],axis=1)
    print(new_data)
    '''
       one  three
    1    0      2
    2    4      6
    3    8     10
    4   12     14
    '''
    

    索引、选取和过滤

    from pandas import Series,DataFrame
    
    # Selecting from a Series by label, by position, and by boolean mask.
    obj = Series([9,5,7,3],index=['a','b','c','d'])
    print(obj['b']) # 5
    # Positional access goes through .iloc: using integer keys as positions with
    # plain [] on a label-indexed Series is deprecated in recent pandas
    print(obj.iloc[2]) # 7
    print(obj.iloc[2:4])
    '''
    c    7
    d    3
    dtype: int64
    '''
    print(obj[['b','a','d']])
    '''
    b    5
    a    9
    d    3
    dtype: int64
    '''
    print(obj.iloc[[1,3]])
    '''
    b    5
    d    3
    dtype: int64
    '''
    # A boolean Series keeps only the entries where the mask is True
    print(obj[obj<5])
    '''
    d    3
    dtype: int64
    '''
    # Slicing with labels differs from ordinary Python slicing: the end label is included
    print(obj['b':'c'])
    '''
    b    5
    c    7
    dtype: int64
    '''
    # Assigning to a label slice sets every selected entry
    obj['b':'c'] = 5
    print(obj)
    '''
    a    9
    b    5
    c    5
    d    3
    dtype: int64
    '''
    
    from pandas import Series,DataFrame
    import numpy as np
    
    # Selecting from a DataFrame: columns with [], rows with slices or masks,
    # and combined row/column selection with .loc / .iloc.
    data = DataFrame(np.arange(16).reshape((4,4)),
                     index=['Ohio','Colorado','Utah','New York'],
                     columns=['one','two','three','four'])
    print(data)
    '''
              one  two  three  four
    Ohio        0    1      2     3
    Colorado    4    5      6     7
    Utah        8    9     10    11
    New York   12   13     14    15
    '''
    print(data['two'])
    '''
    Ohio         1
    Colorado     5
    Utah         9
    New York    13
    Name: two, dtype: int32
    '''
    print(data[['three','one']])
    '''
              three  one
    Ohio          2    0
    Colorado      6    4
    Utah         10    8
    New York     14   12
    '''
    # Special case: a slice or a boolean array inside [] selects rows
    print(data[:2])
    '''
              one  two  three  four
    Ohio        0    1      2     3
    Colorado    4    5      6     7
    '''
    print(data['three'] >5)
    '''
    Ohio        False
    Colorado     True
    Utah         True
    New York     True
    Name: three, dtype: bool
    '''
    print(data[data['three'] >5]) # same as data[[False,True,True,True]]
    '''
              one  two  three  four
    Colorado    4    5      6     7
    Utah        8    9     10    11
    New York   12   13     14    15
    '''
    # Combined row/column selection. The .ix indexer was removed from pandas:
    # use .loc for labels and .iloc for integer positions.
    print(data.loc['Colorado',['two','three']])
    '''
    two      5
    three    6
    Name: Colorado, dtype: int32
    '''
    # Rows by label, columns by position: translate the positions to labels first
    print(data.loc[['Colorado','Utah'],data.columns[[3,0,1]]])
    '''
              four  one  two
    Colorado     7    4    5
    Utah        11    8    9
    '''
    # A single integer position selects one row
    print(data.iloc[2])
    '''
    one       8
    two       9
    three    10
    four     11
    Name: Utah, dtype: int32
    '''
    # Label slices include the end label
    print(data.loc[:'Utah','two'])
    '''
    Ohio        1
    Colorado    5
    Utah        9
    Name: two, dtype: int32
    '''
    # Boolean row mask combined with the first three columns by position
    print(data.loc[data.three>5,data.columns[:3]])
    '''
              one  two  three
    Colorado    4    5      6
    Utah        8    9     10
    New York   12   13     14
    '''
    

    算术运算和数据对齐

    from pandas import Series,DataFrame
    import numpy as np
    
    # Arithmetic between objects with different indexes aligns on the labels.
    s1 = Series([7.3,-2.5,3.4,1.5],index=['a','c','d','e'])
    s2 = Series([-2.1,3.6,-1.5,4,3.1],index=['a','c','e','f','g'])
    
    print(s1+s2)
    # Automatic alignment introduces NA values where the labels do not overlap
    '''
    a    5.2
    c    1.1
    d    NaN
    e    0.0
    f    NaN
    g    NaN
    dtype: float64
    '''
    
    df1 = DataFrame(np.arange(9).reshape((3,3)),
                        columns=list('ABC'),
                    index=['one','two','three'])
    df2 = DataFrame(np.arange(12).reshape((4,3)),
                        columns=list('ABC'),
                    index=['one','three','four','five'])
    print(df1+df2)
    # The resulting index is the union of the two original DataFrames' indexes
    '''
             A     B     C
    five   NaN   NaN   NaN
    four   NaN   NaN   NaN
    one    0.0   2.0   4.0
    three  9.0  11.0  13.0
    two    NaN   NaN   NaN
    '''
    
    from pandas import Series,DataFrame
    import numpy as np
    
    # Supplying a fill value in arithmetic methods
    df1 = DataFrame(np.arange(12).reshape(3,4),columns=list('abcd'))
    df2 = DataFrame(np.arange(20).reshape(4,5),columns=list('abcde'))
    # The + operator leaves NaN wherever a cell exists in only one operand
    print(df1+df2)
    '''
          a     b     c     d   e
    0   0.0   2.0   4.0   6.0 NaN
    1   9.0  11.0  13.0  15.0 NaN
    2  18.0  20.0  22.0  24.0 NaN
    3   NaN   NaN   NaN   NaN NaN
    '''
    # add(..., fill_value=0) treats a cell missing from one operand as 0
    print(df1.add(df2,fill_value=0))
    '''
          a     b     c     d     e
    0   0.0   2.0   4.0   6.0   4.0
    1   9.0  11.0  13.0  15.0   9.0
    2  18.0  20.0  22.0  24.0  14.0
    3  15.0  16.0  17.0  18.0  19.0
    '''
    
    from pandas import Series,DataFrame
    import numpy as np
    
    # Operations between a DataFrame and a Series, introduced via NumPy broadcasting.
    arr = np.arange(12.).reshape((3,4))
    print(arr)
    '''
    [[ 0.  1.  2.  3.]
     [ 4.  5.  6.  7.]
     [ 8.  9. 10. 11.]]
    '''
    print(arr[0])
    '''
    [0. 1. 2. 3.]
    '''
    # Subtracting the first row is broadcast across every row of the array
    print(arr-arr[0])
    '''
    [[0. 0. 0. 0.]
     [4. 4. 4. 4.]
     [8. 8. 8. 8.]]
    '''
    frame = DataFrame(np.arange(12.).reshape((4,3)),
                      columns=list('bde'),
                      index=['one','three','four','five'])
    print(frame)
    '''
             b     d     e
    one    0.0   1.0   2.0
    three  3.0   4.0   5.0
    four   6.0   7.0   8.0
    five   9.0  10.0  11.0
    '''
    # First row by integer position (.iloc replaces the removed .ix indexer)
    series = frame.iloc[0]
    print(series)
    '''
    b    0.0
    d    1.0
    e    2.0
    Name: one, dtype: float64
    '''
    # By default, arithmetic between a DataFrame and a Series matches the Series
    # index against the DataFrame's columns and broadcasts down the rows
    print(frame-series)
    '''
             b    d    e
    one    0.0  0.0  0.0
    three  3.0  3.0  3.0
    four   6.0  6.0  6.0
    five   9.0  9.0  9.0
    '''
    # If a label is not found in either the DataFrame's columns or the Series
    # index, both objects are reindexed to the union of the labels
    series2 = Series(range(3),index=['b','e','f'])
    print(series2)
    '''
    b    0
    e    1
    f    2
    dtype: int64
    '''
    print(frame+series2)
    '''
             b   d     e   f
    one    0.0 NaN   3.0 NaN
    three  3.0 NaN   6.0 NaN
    four   6.0 NaN   9.0 NaN
    five   9.0 NaN  12.0 NaN
    '''
    # To match on the rows and broadcast across the columns instead, use one of
    # the arithmetic methods, for example:
    series3 = frame['d']
    print(series3)
    '''
    one       1.0
    three     4.0
    four      7.0
    five     10.0
    Name: d, dtype: float64
    '''
    print(frame.sub(series3,axis=0))
    # The axis passed in is the axis to match on
    '''
             b    d    e
    one   -1.0  0.0  1.0
    three -1.0  0.0  1.0
    four  -1.0  0.0  1.0
    five  -1.0  0.0  1.0
    '''
    

    函数应用和映射

    from pandas import Series,DataFrame
    import numpy as np
    
    # NumPy ufuncs (element-wise array functions) also work on pandas objects
    frame = DataFrame(np.random.randn(4,3),
                      columns=list('bde'),
                      index=['one','two','three','four'])
    
    # The values are random, so the recorded output below is only one sample run
    print(frame)
    '''
                  b         d         e
    one   -1.415255 -1.084419  0.724132
    two   -0.468757  0.493345  0.318408
    three  0.913162 -0.513506  0.149354
    four  -2.219956  1.166779 -0.359199
    '''
    print(np.abs(frame))
    '''
                  b         d         e
    one    1.415255  1.084419  0.724132
    two    0.468757  0.493345  0.318408
    three  0.913162  0.513506  0.149354
    four   2.219956  1.166779  0.359199
    '''
    # To apply a function to the 1-D arrays formed by each column or row, use apply
    data = [[1,2,3],
            [5,2,3],
            [6,6,6],
            [9,7,1]]
    frame2 = DataFrame(data,
                      columns=list('bde'),
                      index=['one','two','three','four'])
    print(frame2)
    '''
           b  d  e
    one    1  2  3
    two    5  2  3
    three  6  6  6
    four   9  7  1
    '''
    # f returns the range (max minus min) of the values it receives
    f = lambda x:x.max()-x.min()
    print(frame2.apply(f))
    '''
    b    8
    d    5
    e    5
    dtype: int64
    '''
    # axis=1 computes across each row; axis=0 (the default) computes down each column
    print(frame2.apply(f,axis=1))
    '''
    one      2
    two      3
    three    0
    four     8
    dtype: int64
    '''
    # For element-wise functions on a DataFrame use applymap
    # NOTE(review): newer pandas releases deprecate applymap in favour of
    # DataFrame.map -- confirm against the installed pandas version
    f = lambda x:x+1
    print(frame2.applymap(f))
    '''
            b  d  e
    one     2  3  4
    two     6  3  4
    three   7  7  7
    four   10  8  2
    '''
    
    # Series.map applies an element-wise function to a single column
    print(frame2['e'].map(f))
    '''
    one      4
    two      4
    three    7
    four     2
    Name: e, dtype: int64
    '''
    

    排序和排名

    from pandas import Series,DataFrame
    import numpy as np
    
    # Sorting by index labels (sort_index) and by values (sort_values).
    obj = Series([1,4,2,3],index=['d','a','c','b'])
    print(obj)
    '''
    d    1
    a    4
    c    2
    b    3
    dtype: int64
    '''
    print(obj.sort_index())
    '''
    a    4
    b    3
    c    2
    d    1
    dtype: int64
    '''
    print(obj.sort_values())
    '''
    d    1
    c    2
    b    3
    a    4
    dtype: int64
    '''
    frame = DataFrame(np.arange(8).reshape((2,4)),
                      index=['n','c'],
                      columns=[0,4,1,6])
    print(frame)
    '''
       0  4  1  6
    n  0  1  2  3
    c  4  5  6  7
    '''
    print(frame.sort_index())
    '''
       0  4  1  6
    c  4  5  6  7
    n  0  1  2  3
    '''
    # axis=1 sorts the column labels; axis=0 (the default) sorts the row labels
    print(frame.sort_index(axis=1))
    '''
       0  1  4  6
    n  0  2  1  3
    c  4  6  5  7
    '''
    # Sorting is ascending by default; pass ascending=False for descending order
    print(frame.sort_index(axis=1,ascending=False))
    '''
       6  4  1  0
    n  3  1  2  0
    c  7  5  6  4
    '''
    # Sorting a Series by value: the order method is no longer available, use
    # sort_values instead.
    # NOTE(review): the original note ties this to "Python 3.6"; it presumably
    # depends on the pandas version rather than the Python version -- confirm
    obj = Series([4,7,-3,2])
    print(obj.sort_values())
    '''
    2   -3
    3    2
    0    4
    1    7
    dtype: int64
    '''
    # When sorting, missing values are placed at the end of the Series
    obj = Series([4,np.nan,7,np.nan,-3,2])
    print(obj.sort_values())
    '''
    4   -3.0
    5    2.0
    0    4.0
    2    7.0
    1    NaN
    3    NaN
    dtype: float64
    '''
    # Sorting a DataFrame by the values in its columns.
    # NOTE(review): the recorded output lists the columns as a, b; newer pandas
    # may keep the dict's insertion order (b, a) -- confirm on the target version
    frame = DataFrame({'b':[4,7,-3,2],'a':[0,1,0,1]})
    print(frame)
    '''
       a  b
    0  0  4
    1  1  7
    2  0 -3
    3  1  2
    '''
    # Sort by the values in one or more columns by passing the column name(s) to the by option
    print(frame.sort_values(by='b'))
    '''
       a  b
    2  0 -3
    3  1  2
    0  0  4
    1  1  7
    '''
    # With several columns, rows are ordered by 'a' first and ties are broken by 'b'
    print(frame.sort_values(by=['a','b']))
    '''
       a  b
    2  0 -3
    0  0  4
    3  1  2
    1  1  7
    '''
    # Ranking: each value is numbered by its position in sorted order; by default
    # tied values share the average of the ranks they span
    obj = Series([7,-5,7,4,2,0,4])
    print(obj.rank())
    '''
    0    6.5
    1    1.0
    2    6.5
    3    4.5
    4    3.0
    5    2.0
    6    4.5
    dtype: float64
    '''
    # method='first' breaks ties by order of appearance in the data
    print(obj.rank(method='first'))
    '''
    0    6.0
    1    1.0
    2    7.0
    3    4.0
    4    3.0
    5    2.0
    6    5.0
    dtype: float64
    '''
    # Descending ranks; method='max' gives every tied value the group's largest rank
    print(obj.rank(ascending=False,method='max'))
    '''
    0    2.0
    1    7.0
    2    2.0
    3    4.0
    4    5.0
    5    6.0
    6    4.0
    dtype: float64
    '''
    frame = DataFrame({'b':[4.3,7,-3,2],
                       'a':[0,1,0,1],
                       'c':[-2,5,8,-2.5]})
    print(frame)
    '''
       a    b    c
    0  0  4.3 -2.0
    1  1  7.0  5.0
    2  0 -3.0  8.0
    3  1  2.0 -2.5
    '''
    # axis=1 ranks the values within each row
    print(frame.rank(axis=1))
    '''
         a    b    c
    0  2.0  3.0  1.0
    1  1.0  3.0  2.0
    2  2.0  1.0  3.0
    3  2.0  3.0  1.0
    '''
    

    rank 方法的 method 参数选项

    • average:默认,在相等的分组中,为各个值分配平均排名
    • min:使用整个组的最小排名
    • max:使用整个组的最大排名
    • first:按值在原始数据中出现的顺序排名

    带有重复值的轴索引

    from pandas import Series
    
    # A Series whose index contains repeated labels
    obj= Series(range(5),index=['a','a','b','b','c'])
    print(obj)
    '''
    a    0
    a    1
    b    2
    b    3
    c    4
    dtype: int64
    '''
    # index.is_unique reports whether all index labels are unique
    print(obj.index.is_unique) # False
    # Selecting a duplicated label returns a Series of all matching entries
    print(obj['a'])
    '''
    a    0
    a    1
    dtype: int64
    '''
    # Selecting a label that occurs once returns a scalar
    print(obj['c']) # 4
    
  • 相关阅读:
    C语言数据结构链表
    Servlet中对上传的图片进行大小变换
    网页中有几个框架,在其中一个框架中点击超链接刷新整个页面
    来园子开博了
    学习《java编程思想》导入作者的net.mindview包
    git常用命令汇总
    安装lessloader后,编译项目报错TypeError: this.getOptions is not a function
    数组学习二
    常见文件管理命令
    (转载)Shell语法
  • 原文地址:https://www.cnblogs.com/nicole-zhang/p/12955099.html
Copyright © 2020-2023  润新知