• Pandas


    如果用 Python 的列表和字典来作比较,那么可以说 Numpy 是列表形式的,没有数值标签,而 Pandas 就是字典形式。Pandas 是基于 Numpy 构建的,让以 Numpy 为中心的应用变得更加简单。

    1、要使用 pandas,首先需要了解它的两个主要数据结构:Series 和 DataFrame

    1) Series 的字符串表现形式为:索引在左边,值在右边。如果没有为数据指定索引,将会自动创建一个 0 到 N-1(N 为长度)的整数型索引

    import pandas as pd
    import numpy as np
    s = pd.Series([1, 3, 6, np.nan, 44, 1])
    # 0     1.0
    # 1     3.0
    # 2     6.0
    # 3     NaN
    # 4    44.0
    # 5     1.0
    # dtype: float64
    print(s)

    2) DataFrame 是一个表格型的数据结构,它包含一组有序的列,每列可以是不同的值类型(数值,字符串,布尔值等)。DataFrame 既有行索引也有列索引, 它可以被看做由 Series 组成的大字典,可以根据每一个不同的索引来挑选数据

    import pandas as pd
    import numpy as np
    dates = pd.date_range('20160101', periods = 6)
    df = pd.DataFrame(np.random.randn(6, 4), index = dates, columns = ['a', 'b', 'c', 'd'])
    #                    a         b         c         d
    # 2016-01-01 -0.209289 -1.245807  2.025371  1.259524
    # 2016-01-02 -0.568688 -1.200045  2.624338 -0.546785
    # 2016-01-03 -0.935191  1.962250  0.397714 -1.317615
    # 2016-01-04 -0.496206  0.513763  0.826336  0.666354
    # 2016-01-05  0.648660 -0.252978  0.396752 -0.611633
    # 2016-01-06  0.690797 -1.122976  0.823072  1.003473
    print(df)

    DataFrame 的创建方法

    # 创建一组没有给定行标签和列标签的数据
    df1 = pd.DataFrame(np.arange(12).reshape((3, 4)))
    #    0  1   2   3
    # 0  0  1   2   3
    # 1  4  5   6   7
    # 2  8  9  10  11
    print(df1)
    
    # 另一种创建DataFrame的方法
    df2 = pd.DataFrame({'A' : 1.,
                        'B' : pd.Timestamp('20130102'),
                        'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                        'D' : np.array([3] * 4,dtype='int32'),
                        'E' : pd.Categorical(["test","train","test","train"]),
                        'F' : 'foo'})
    #      A          B    C  D      E    F
    # 0  1.0 2013-01-02  1.0  3   test  foo
    # 1  1.0 2013-01-02  1.0  3  train  foo
    # 2  1.0 2013-01-02  1.0  3   test  foo
    # 3  1.0 2013-01-02  1.0  3  train  foo
    print(df2)

    DataFrame 的基本操作

    # 0    1.0
    # 1    1.0
    # 2    1.0
    # 3    1.0
    print(df2['A']) # 或 print(df2.A)
    
    # 查看数据中的类型
    # A           float64
    # B    datetime64[ns]
    # C           float32
    # D             int32
    # E          category
    # F            object
    # dtype: object
    print(df2.dtypes)
    
    # 查看行索引(index)
    # Int64Index([0, 1, 2, 3], dtype='int64')
    print(df2.index)
    
    # 查看列标签(columns)
    # Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
    print(df2.columns)
    
    # 查看所有的值
    # [[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
    #  [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']
    #  [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
    #  [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']]
    print(df2.values)
    
    # 查看数据的总结
    #           A    C    D
    #  count  4.0  4.0  4.0
    #  mean   1.0  1.0  3.0
    #  std    0.0  0.0  0.0
    #  min    1.0  1.0  3.0
    #  25%    1.0  1.0  3.0
    #  50%    1.0  1.0  3.0
    #  75%    1.0  1.0  3.0
    #  max    1.0  1.0  3.0
    df2.describe()
    
    # 对行列进行排序
    #       F      E  D    C          B    A
    #  0  foo   test  3  1.0 2013-01-02  1.0
    #  1  foo  train  3  1.0 2013-01-02  1.0
    #  2  foo   test  3  1.0 2013-01-02  1.0
    #  3  foo  train  3  1.0 2013-01-02  1.0
    print(df2.sort_index(axis = 1, ascending = False)) # axis = 0 时对行进行排序
    
    # 对数据的值进行排序
    #       A          B    C  D      E    F
    #  0  1.0 2013-01-02  1.0  3   test  foo
    #  2  1.0 2013-01-02  1.0  3   test  foo
    #  1  1.0 2013-01-02  1.0  3  train  foo
    #  3  1.0 2013-01-02  1.0  3  train  foo
    print(df2.sort_values(by = 'E'))

    2、选择数据

    1) 基本方式选择数据

    # 创建一个 6x4 的矩阵
    import pandas as pd
    import numpy as np
    dates = pd.date_range('20160101', periods = 6)
    df = pd.DataFrame(np.arange(24).reshape(6, 4), index = dates, columns = ['a', 'b', 'c', 'd'])
    #              a   b   c   d
    # 2016-01-01   0   1   2   3
    # 2016-01-02   4   5   6   7
    # 2016-01-03   8   9  10  11
    # 2016-01-04  12  13  14  15
    # 2016-01-05  16  17  18  19
    # 2016-01-06  20  21  22  23
    print(df)
    
    # 2016-01-01     0
    # 2016-01-02     4
    # 2016-01-03     8
    # 2016-01-04    12
    # 2016-01-05    16
    # 2016-01-06    20
    # Freq: D, Name: a, dtype: int64
    print(df['a']) # 或 print(df.a)
    
    # 选择多行或多列
    #             a  b   c   d
    # 2016-01-01  0  1   2   3
    # 2016-01-02  4  5   6   7
    # 2016-01-03  8  9  10  11
    print(df[0: 3])
    #              a   b   c   d
    # 2016-01-02   4   5   6   7
    # 2016-01-03   8   9  10  11
    # 2016-01-04  12  13  14  15
    print(df['20160102': '20160104'])

    2) 根据标签选择数据 loc

    # 通过标签名字选择一行数据
    # a    4
    # b    5
    # c    6
    # d    7
    # Name: 2016-01-02 00:00:00, dtype: int64
    print(df.loc['20160102'])
    
    # 通过标签选择所有行,以及 a、b 两列数据
    #              a   b
    # 2016-01-01   0   1
    # 2016-01-02   4   5
    # 2016-01-03   8   9
    # 2016-01-04  12  13
    # 2016-01-05  16  17
    # 2016-01-06  20  21
    print(df.loc[:, ['a', 'b']])
    
    # 通过标签选择其中一行,以及 a、b 两列数据
    # a    4
    # b    5
    # Name: 2016-01-02 00:00:00, dtype: int64
    print(df.loc['20160102', ['a', 'b']])

    3) 根据位置选择数据 iloc

    # 13
    print(df.iloc[3, 1])
    
    #              b   c
    # 2016-01-04  13  14
    # 2016-01-05  17  18
    print(df.iloc[3: 5, 1: 3])
    
    #              b   c
    # 2016-01-02   5   6
    # 2016-01-04  13  14
    # 2016-01-06  21  22
    print(df.iloc[[1, 3, 5], 1: 3])

    4) 混合方式选择位置(注意:.ix 已在 pandas 1.0 中被移除,应改用 .loc 与 .iloc 的组合)

    #             a   c
    # 2016-01-01  0   2
    # 2016-01-02  4   6
    # 2016-01-03  8  10
    print(df.iloc[:3].loc[:, ['a', 'c']]) # 旧写法 df.ix[:3, ['a', 'c']] 已不可用

    5) 根据判断指令选择数据

    #              a   b   c   d
    # 2016-01-04  12  13  14  15
    # 2016-01-05  16  17  18  19
    # 2016-01-06  20  21  22  23
    print(df[df.a > 8])

     3、设置数据

    # 根据索引设置数据
    df.iloc[2, 2] = 1111
    # 根据标签设置数据
    df.loc['20160101', 'b'] = 2222
    # 根据判断指令设置数据(使用 .loc 一步完成,避免链式赋值触发 SettingWithCopyWarning)
    df.loc[df.a > 4, 'b'] = 0
    # 新增一列 'e',并将 'e' 列全部设置为 nan
    df['e'] = np.nan
    # 新增一列 'f'
    df['f'] = pd.Series([1, 2, 3, 4, 5, 6], index = pd.date_range('20160101', periods = 6))
    
    #              a     b     c   d   e  f
    # 2016-01-01   0  2222     2   3 NaN  1
    # 2016-01-02   4     5     6   7 NaN  2
    # 2016-01-03   8     0  1111  11 NaN  3
    # 2016-01-04  12     0    14  15 NaN  4
    # 2016-01-05  16     0    18  19 NaN  5
    # 2016-01-06  20     0    22  23 NaN  6

    4、处理 NaN 数据

    # 创建一个 6x4的矩阵,并将其中两个值设置为 nan
    dates = pd.date_range('20160101', periods = 6)
    df = pd.DataFrame(np.arange(24).reshape(6, 4), index = dates, columns = ['a', 'b', 'c', 'd'])
    df.iloc[0, 1] = np.nan
    df.iloc[1, 2] = np.nan
    #              a     b     c   d
    # 2016-01-01   0   NaN   2.0   3
    # 2016-01-02   4   5.0   NaN   7
    # 2016-01-03   8   9.0  10.0  11
    # 2016-01-04  12  13.0  14.0  15
    # 2016-01-05  16  17.0  18.0  19
    # 2016-01-06  20  21.0  22.0  23

    1) 使用 dropna 直接去掉有 NaN 的行或列

    #              a     b     c   d
    # 2016-01-03   8   9.0  10.0  11
    # 2016-01-04  12  13.0  14.0  15
    # 2016-01-05  16  17.0  18.0  19
    # 2016-01-06  20  21.0  22.0  23
    # axis 0-对行进行操作,1-对列进行操作
    # how 'any'-只要有 NaN 就进行处理,'all'-所有元素为 NaN 才进行处理
    df.dropna(axis = 0, how = 'any')

    2) 使用 fillna 将 NaN 的值用其他值代替

    #              a     b     c   d
    # 2016-01-01   0   0.0   2.0   3
    # 2016-01-02   4   5.0   0.0   7
    # 2016-01-03   8   9.0  10.0  11
    # 2016-01-04  12  13.0  14.0  15
    # 2016-01-05  16  17.0  18.0  19
    # 2016-01-06  20  21.0  22.0  23
    df.fillna(value = 0)

    3) 判断是否有缺失数据

    #                 a      b      c      d
    # 2016-01-01  False   True  False  False
    # 2016-01-02  False  False   True  False
    # 2016-01-03  False  False  False  False
    # 2016-01-04  False  False  False  False
    # 2016-01-05  False  False  False  False
    # 2016-01-06  False  False  False  False
    df.isnull()
    
    # 检查数据中是否存在 NaN
    # True
    np.any(df.isnull())

    5、pandas 处理多组数据的时候往往会要用到数据的合并处理,使用 concat 是一种基本的合并方式。而且 concat 中有很多参数可以调整,合并成想要的数据形式

    1) axis 合并方向 (默认 axis = 0)

    df1 = pd.DataFrame(np.ones((3, 4)) * 1, columns = ['a', 'b', 'c', 'd'])
    df2 = pd.DataFrame(np.ones((3, 4)) * 2, columns = ['a', 'b', 'c', 'd'])
    # axis 0-对行进行合并,1-对列进行合并
    res = pd.concat([df1, df2], axis = 0)
    #      a    b    c    d
    # 0  1.0  1.0  1.0  1.0
    # 1  1.0  1.0  1.0  1.0
    # 2  1.0  1.0  1.0  1.0
    # 0  2.0  2.0  2.0  2.0
    # 1  2.0  2.0  2.0  2.0
    # 2  2.0  2.0  2.0  2.0
    print(res)

    2) ignore_index 重绘 index

    res = pd.concat([df1, df2], axis = 0, ignore_index = True)
    #      a    b    c    d
    # 0  1.0  1.0  1.0  1.0
    # 1  1.0  1.0  1.0  1.0
    # 2  1.0  1.0  1.0  1.0
    # 3  2.0  2.0  2.0  2.0
    # 4  2.0  2.0  2.0  2.0
    # 5  2.0  2.0  2.0  2.0
    print(res)

    3) join (合并方式)

    join = 'outer' 为预设值,此方式是依照 column 来做纵向合并,有相同的 column 上下合并在一起,其他独自的 column 各自成列,原本没有值的位置皆以 NaN 填充

    join = 'inner' 时,只有相同的 column 合并在一起,其他的会被抛弃

    df1 = pd.DataFrame(np.ones((3, 4)) * 1, columns = ['a', 'b', 'c', 'd'], index = [1, 2, 3])
    df2 = pd.DataFrame(np.ones((3, 4)) * 2, columns = ['b', 'c', 'd', 'e'], index = [2, 3, 4])
    res = pd.concat([df1, df2], axis = 0, join = 'outer')
    #      a    b    c    d    e
    # 1  1.0  1.0  1.0  1.0  NaN
    # 2  1.0  1.0  1.0  1.0  NaN
    # 3  1.0  1.0  1.0  1.0  NaN
    # 2  NaN  2.0  2.0  2.0  2.0
    # 3  NaN  2.0  2.0  2.0  2.0
    # 4  NaN  2.0  2.0  2.0  2.0
    print(res)
    
    res = pd.concat([df1, df2], axis = 0, join = 'inner')
    #      b    c    d
    # 1  1.0  1.0  1.0
    # 2  1.0  1.0  1.0
    # 3  1.0  1.0  1.0
    # 2  2.0  2.0  2.0
    # 3  2.0  2.0  2.0
    # 4  2.0  2.0  2.0
    print(res)

    4) 依照指定 index 合并(注意:join_axes 参数已在 pandas 1.0 中被移除,应改用 reindex)

    df1 = pd.DataFrame(np.ones((3, 4)) * 1, columns = ['a', 'b', 'c', 'd'], index = [1, 2, 3])
    df2 = pd.DataFrame(np.ones((3, 4)) * 2, columns = ['b', 'c', 'd', 'e'], index = [2, 3, 4])
    # 依照 df1.index 对列进行合并(旧写法:pd.concat([df1, df2], axis = 1, join_axes = [df1.index]))
    res = pd.concat([df1, df2.reindex(df1.index)], axis = 1)
    #      a    b    c    d    b    c    d    e
    # 1  1.0  1.0  1.0  1.0  NaN  NaN  NaN  NaN
    # 2  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
    # 3  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
    print(res)
    
    # 移除 join_axes = [df1.index]
    res = pd.concat([df1, df2], axis = 1)
    #      a    b    c    d    b    c    d    e
    # 1  1.0  1.0  1.0  1.0  NaN  NaN  NaN  NaN
    # 2  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
    # 3  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
    # 4  NaN  NaN  NaN  NaN  2.0  2.0  2.0  2.0
    print(res)

    5) append 添加数据

    注意:append 只有纵向合并(即对行进行合并),没有横向合并;且 DataFrame.append 已在 pandas 2.0 中被移除,应改用 pd.concat

    df1 = pd.DataFrame(np.ones((3, 4)) * 1, columns = ['a', 'b', 'c', 'd'])
    df2 = pd.DataFrame(np.ones((3, 4)) * 2, columns = ['a', 'b', 'c', 'd'])
    res = pd.concat([df1, df2], ignore_index = True) # 旧写法:df1.append(df2, ignore_index = True)
    #      a    b    c    d
    # 0  1.0  1.0  1.0  1.0
    # 1  1.0  1.0  1.0  1.0
    # 2  1.0  1.0  1.0  1.0
    # 3  2.0  2.0  2.0  2.0
    # 4  2.0  2.0  2.0  2.0
    # 5  2.0  2.0  2.0  2.0
    print(res)

     6、merge

    merge 和 concat 类似,但主要是用于两组有一列以上相同列标签(key column)的数据,统一索引的数据,通常也被用在 Database 的处理当中

    1) 根据一组 key 合并

    #     a   b  key
    # 0  a0  b0    0
    # 1  a1  b1    1
    # 2  a2  b2    2
    # 3  a3  b3    3
    left = pd.DataFrame({'key': [0, 1, 2, 3], 'a': ['a0', 'a1', 'a2', 'a3'], 'b': ['b0', 'b1', 'b2', 'b3']})
    #     c   d  key
    # 0  c0  d0    0
    # 1  c1  d1    1
    # 2  c2  d2    2
    # 3  c3  d3    3
    right = pd.DataFrame({'key': [0, 1, 2, 3], 'c': ['c0', 'c1', 'c2', 'c3'], 'd': ['d0', 'd1', 'd2', 'd3']})
    res = pd.merge(left, right, on = 'key')
    #     a   b  key   c   d
    # 0  a0  b0    0  c0  d0
    # 1  a1  b1    1  c1  d1
    # 2  a2  b2    2  c2  d2
    # 3  a3  b3    3  c3  d3
    print(res)

    2) 根据两组 key 合并

    #     a   b key1 key2
    # 0  a0  b0   k0   k0
    # 1  a1  b1   k0   k1
    # 2  a2  b2   k1   k0
    # 3  a3  b3   k2   k1
    left = pd.DataFrame({'key1': ['k0', 'k0', 'k1', 'k2'], 'key2': ['k0', 'k1', 'k0', 'k1'], 'a': ['a0', 'a1', 'a2', 'a3'], 'b': ['b0', 'b1', 'b2', 'b3']})
    #     c   d key1 key2
    # 0  c0  d0   k0   k0
    # 1  c1  d1   k1   k0
    # 2  c2  d2   k1   k0
    # 3  c3  d3   k2   k0
    right = pd.DataFrame({'key1': ['k0', 'k1', 'k1', 'k2'], 'key2': ['k0', 'k0', 'k0', 'k0'], 'c': ['c0', 'c1', 'c2', 'c3'], 'd': ['d0', 'd1', 'd2', 'd3']})
    
    ret = pd.merge(left, right, on = ['key1', 'key2'], how = 'inner')
    #     a   b key1 key2   c   d
    # 0  a0  b0   k0   k0  c0  d0
    # 1  a2  b2   k1   k0  c1  d1
    # 2  a2  b2   k1   k0  c2  d2
    print(ret)
    
    ret = pd.merge(left, right, on = ['key1', 'key2'], how = 'outer')
    #      a    b key1 key2    c    d
    # 0   a0   b0   k0   k0   c0   d0
    # 1   a1   b1   k0   k1  NaN  NaN
    # 2   a2   b2   k1   k0   c1   d1
    # 3   a2   b2   k1   k0   c2   d2
    # 4   a3   b3   k2   k1  NaN  NaN
    # 5  NaN  NaN   k2   k0   c3   d3
    print(ret)
    
    ret = pd.merge(left, right, on = ['key1', 'key2'], how = 'left')
    #     a   b key1 key2    c    d
    # 0  a0  b0   k0   k0   c0   d0
    # 1  a1  b1   k0   k1  NaN  NaN
    # 2  a2  b2   k1   k0   c1   d1
    # 3  a2  b2   k1   k0   c2   d2
    # 4  a3  b3   k2   k1  NaN  NaN
    print(ret)
    
    ret = pd.merge(left, right, on = ['key1', 'key2'], how = 'right')
    #      a    b key1 key2   c   d
    # 0   a0   b0   k0   k0  c0  d0
    # 1   a2   b2   k1   k0  c1  d1
    # 2   a2   b2   k1   k0  c2  d2
    # 3  NaN  NaN   k2   k0  c3  d3
    print(ret)

     3) Indicator

    indicator = True 会将合并的记录放在新的一列

    #    col1 col_left
    # 0     0        a
    # 1     1        b
    df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']})
    #    col1  col_right
    # 0     1          2
    # 1     2          2
    # 2     2          2
    df2 = pd.DataFrame({'col1': [1, 2, 2], 'col_right': [2, 2, 2]})
    ret = pd.merge(df1, df2, on = 'col1', how = 'outer', indicator = True)
    #    col1 col_left  col_right      _merge
    # 0     0        a        NaN   left_only
    # 1     1        b        2.0        both
    # 2     2      NaN        2.0  right_only
    # 3     2      NaN        2.0  right_only
    print(ret)
    
    # 指定 indicator_column 的名称
    ret = pd.merge(df1, df2, on = 'col1', how = 'outer', indicator = 'indicator_column')
    #    col1 col_left  col_right indicator_column
    # 0     0        a        NaN        left_only
    # 1     1        b        2.0             both
    # 2     2      NaN        2.0       right_only
    # 3     2      NaN        2.0       right_only
    print(ret)

    4) 根据 index 合并

    #     a   b
    # 0  a1  b1
    # 1  a2  b2
    # 2  a3  b3
    left = pd.DataFrame({'a': ['a1', 'a2', 'a3'], 'b': ['b1', 'b2', 'b3']}, index = [0, 1, 2])
    #     c   d
    # 1  c1  d1
    # 2  c2  d2
    # 3  c3  d3
    right = pd.DataFrame({'c': ['c1', 'c2', 'c3'], 'd': ['d1', 'd2', 'd3']}, index = [1, 2, 3])
    
    ret = pd.merge(left, right, left_index = True, right_index = True, how = 'inner')
    #     a   b   c   d
    # 1  a2  b2  c1  d1
    # 2  a3  b3  c2  d2
    print(ret)
    
    ret = pd.merge(left, right, left_index = True, right_index = True, how = 'outer')
    #      a    b    c    d
    # 0   a1   b1  NaN  NaN
    # 1   a2   b2   c1   d1
    # 2   a3   b3   c2   d2
    # 3  NaN  NaN   c3   d3
    print(ret)

    5) suffixes

    如果两个 DataFrame 的标签一样,可以使用 suffixes 来解决 overlapping 的问题

    #    age   k
    # 0    1  K0
    # 1    2  K1
    # 2    3  K2
    left = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
    #    age   k
    # 0    4  K0
    # 1    5  K0
    # 2    6  K3
    right = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
    ret = pd.merge(left, right, on = 'k', suffixes = ['_left', '_right'], how = 'inner')
    #    age_left   k  age_right
    # 0         1  K0          4
    # 1         1  K0          5
    print(ret)
  • 相关阅读:
    优先级队列
    内存拷贝
    快速排序
    ubuntu上编译Thrift
    Hadoop 学习总结之一:HDFS简介(zz写的很不错)
    hbase 详细解析(转载)
    HBase技术介绍简介 NOSQL数据库 入门(转载)
    SQL到NOSQL的思维转变
    Hadoop分布式文件系统:架构和设计(zz)
    zz我对技术方向的一些反思
  • 原文地址:https://www.cnblogs.com/zerotoinfinity/p/12928977.html
Copyright © 2020-2023  润新知