• numpy&pandas基础


    numpy基础

    import numpy as np

    定义array

    In [156]: np.ones(3)
    Out[156]: array([1., 1., 1.])
    
    In [157]: np.ones((3,5))
    Out[157]: 
    array([[1., 1., 1., 1., 1.],
           [1., 1., 1., 1., 1.],
           [1., 1., 1., 1., 1.]])
    
    In [158]: 
    
    In [158]: np.zeros(4)
    Out[158]: array([0., 0., 0., 0.])
    
    In [159]: np.zeros((2,5))
    Out[159]: 
    array([[0., 0., 0., 0., 0.],
           [0., 0., 0., 0., 0.]])
    
    In [160]: 
    In [146]: a = np.array([[1,3,5,2],[4,2,6,1]])
    
    In [147]: print(a)
    [[1 3 5 2]
     [4 2 6 1]]
    
    In [148]:
    In [161]: np.arange(10)
    Out[161]: array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    
    In [162]: np.arange(3,13)
    Out[162]: array([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12])
    
    In [163]: np.arange(3,13).reshape((2,5))
    Out[163]: 
    array([[ 3,  4,  5,  6,  7],
           [ 8,  9, 10, 11, 12]])
    
    In [164]: 
    In [169]: np.arange(2,25,2)
    Out[169]: array([ 2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24])
    
    In [170]: np.arange(2,25,2).reshape(3,4)
    Out[170]: 
    array([[ 2,  4,  6,  8],
           [10, 12, 14, 16],
           [18, 20, 22, 24]])
    
    In [171]: 
    
    In [176]:  np.linspace(1,10,4)
    Out[176]: array([ 1.,  4.,  7., 10.])
    
    In [177]: 

    array基本运算

    In [7]: a = np.array([[1,2],[3,4]])
    
    In [8]: b = np.arange(5,9).reshape((2,2))
    In [10]: print(a)
    [[1 2]
     [3 4]]
    
    In [11]: print(b)
    [[5 6]
     [7 8]]
    
    In [12]:
    
    In [12]: a+b
    Out[12]: 
    array([[ 6,  8],
           [10, 12]])
    
    In [13]: a-b
    Out[13]: 
    array([[-4, -4],
           [-4, -4]])
    
    In [14]: a*b     # 对应元素相乘
    Out[14]: 
    array([[ 5, 12],
           [21, 32]])
    
    In [17]: a/b     # Python 2 下整数数组相除为整数除法(向下取整);在 Python 3 中结果为浮点数
    Out[17]: 
    array([[0, 0],
           [0, 0]])
    
    In [18]: 
    
    In [18]: a**2
    Out[18]: 
    array([[ 1,  4],
           [ 9, 16]])
    
    In [19]:
    
    
    
    In [15]: np.dot(a,b)   # 矩阵乘法
    Out[15]: 
    array([[19, 22],
           [43, 50]])
    
    In [16]: a.dot(b)
    Out[16]: 
    array([[19, 22],
           [43, 50]])
    
    In [17]:
    
    
    
    In [54]: print(a)
    [[ 2  3  4  5]
     [ 6  7  8  9]
     [10 11 12 13]]
    
    In [55]: np.sum(a)
    Out[55]: 90
    
    In [56]: np.min(a)
    Out[56]: 2
    
    In [57]: np.max(a)
    Out[57]: 13
    
    In [58]: 
    
    In [58]: np.sum(a,axis=1)
    Out[58]: array([14, 30, 46])
    
    In [59]: np.sum(a,axis=0)
    Out[59]: array([18, 21, 24, 27])
    
    In [60]:
    
    
    
    
    
    # 三角函数结合random生成一组随机数据
    In [74]: N = 10
    
    In [75]: t = np.linspace(0, 2*np.pi, N)
    
    In [76]: print(t)
    [0.         0.6981317  1.3962634  2.0943951  2.7925268  3.4906585
     4.1887902  4.88692191 5.58505361 6.28318531]
    
    In [77]: y = np.sin(t) + 0.02*np.random.randn(N)
    
    In [78]: print(y)
    [-0.00947902  0.64196198  0.96567468  0.89394571  0.33830193 -0.3015316
     -0.86943758 -0.95954123 -0.62526393  0.02872202]
    
    In [79]: M = 3 
    
    In [80]: for ii, vv in zip(np.random.rand(M)*N, np.random.randn(M)):
        ...:     y[int(ii):] += vv
        ...:     
    
    In [81]: print(y)
    [-0.00947902  0.64196198  1.47685437  1.55309848  0.99745469  0.35762117
     -0.21028481 -0.30038846 -0.29746375  0.35652221]
    
    In [82]: 
    
    
    
    
    
    In [101]: a = np.arange(2,14).reshape((3,4)) 
    
    In [102]: print(a)
    [[ 2  3  4  5]
     [ 6  7  8  9]
     [10 11 12 13]]
    
    In [103]: print(np.argmin(a))  # 最小值的索引
    0
    
    In [104]: print(np.argmax(a))  # 最大值的索引
    11
    
    In [105]: np.cumsum(a)         # 从0元素开始的累计和
    Out[105]: array([ 2,  5,  9, 14, 20, 27, 35, 44, 54, 65, 77, 90])
    
    In [106]: np.cumprod(a)        # 从1元素开始的累计乘
    Out[106]: 
    array([         2,          6,         24,        120,        720,
                 5040,      40320,     362880,    3628800,   39916800,
            479001600, 6227020800])
    
    In [107]: 
    In [129]: a
    Out[129]: 
    array([[ 2,  3,  4,  5],
           [ 6,  7,  8,  9],
           [10, 11, 12, 13]])
    
    In [130]: np.cumsum(a,axis=1)
    Out[130]: 
    array([[ 2,  5,  9, 14],
           [ 6, 13, 21, 30],
           [10, 21, 33, 46]])
    
    In [131]: np.cumsum(a,axis=0)
    Out[131]: 
    array([[ 2,  3,  4,  5],
           [ 8, 10, 12, 14],
           [18, 21, 24, 27]])
    
    In [132]:
    In [133]: np.cumprod(a,axis=1)
    Out[133]: 
    array([[    2,     6,    24,   120],
           [    6,    42,   336,  3024],
           [   10,   110,  1320, 17160]])
    
    In [134]: np.cumprod(a,axis=0)
    Out[134]: 
    array([[  2,   3,   4,   5],
           [ 12,  21,  32,  45],
           [120, 231, 384, 585]])
    
    In [135]: 
    
    
    
    
    In [146]: a = np.array([[1,3,5,2],[4,2,6,1]])
    
    In [147]: print(a)
    [[1 3 5 2]
     [4 2 6 1]]
    
    In [148]: a.shape
    Out[148]: (2, 4)
    
    In [149]: a.ndim
    Out[149]: 2
    
    In [150]: a.size
    Out[150]: 8
    
    In [151]: np.diff(a)      # 累差运算
    Out[151]: 
    array([[ 2,  2, -3],
           [-2,  4, -5]])
    
    In [152]: np.diff(a,axis=1)
    Out[152]: 
    array([[ 2,  2, -3],
           [-2,  4, -5]])
    
    In [153]: np.diff(a,axis=0)
    Out[153]: array([[ 3, -1,  1, -1]])
    
    In [154]: 
    
    
    
    
    
    In [108]: a = np.array([10,7,11,9,8,13,12,9])
    
    In [109]: a.ndim
    Out[109]: 1
    
    In [110]: a.shape
    Out[110]: (8,)
    
    In [111]: a.size
    Out[111]: 8
    
    In [112]: a.mean()      # 均值
    Out[112]: 9.875
    
    In [113]: a.var()       # 方差
    Out[113]: 3.609375
    
    In [114]: a.std()       # 标准差
    Out[114]: 1.899835519196333
    
    In [115]:
    In [117]: np.median(a)  # 中位数
    Out[117]: 9.5
    
    In [118]: 
    In [138]: z = (a-a.mean())/a.std()   # z-score
    
    In [139]: print(z)
    [ 0.06579517 -1.5132889   0.59215653 -0.46056619 -0.98692754  1.64487924
      1.11851788 -0.46056619]
    
    In [140]: 
    
    
    
    
    In [198]: a = np.arange(-3,3).reshape((2,3))
    
    In [199]: a
    Out[199]: 
    array([[-3, -2, -1],
           [ 0,  1,  2]])
    
    In [200]: np.nonzero(a)  # 查找非0元素
    Out[200]: (array([0, 0, 0, 1, 1]), array([0, 1, 2, 1, 2]))
    
    In [201]: print(np.nonzero(a))
    (array([0, 0, 0, 1, 1]), array([0, 1, 2, 1, 2]))
    
    In [202]: 
    
    
    
    In [207]: a = np.arange(14,2,-1).reshape((3,4))
    
    In [208]: print(a)
    [[14 13 12 11]
     [10  9  8  7]
     [ 6  5  4  3]]
    
    In [209]: np.sort(a)     # 排序
    Out[209]: 
    array([[11, 12, 13, 14],
           [ 7,  8,  9, 10],
           [ 3,  4,  5,  6]])
    
    In [210]: 
    
    In [210]: np.sort(a,axis=1)
    Out[210]: 
    array([[11, 12, 13, 14],
           [ 7,  8,  9, 10],
           [ 3,  4,  5,  6]])
    
    In [211]: np.sort(a,axis=0)
    Out[211]: 
    array([[ 6,  5,  4,  3],
           [10,  9,  8,  7],
           [14, 13, 12, 11]])
    
    In [212]: 
    
    
    
    
    # 矩阵的转置
    In [212]: a = np.arange(14,2,-1).reshape((3,4))
    
    In [213]: print(a)
    [[14 13 12 11]
     [10  9  8  7]
     [ 6  5  4  3]]
    
    In [214]: 
    
    In [215]: print(np.transpose(a))
    [[14 10  6]
     [13  9  5]
     [12  8  4]
     [11  7  3]]
    
    In [216]: a.T
    Out[216]: 
    array([[14, 10,  6],
           [13,  9,  5],
           [12,  8,  4],
           [11,  7,  3]])
    
    In [217]: 
    
    In [220]: a.T.dot(a)  # 先转置,再进行矩阵乘法
    Out[220]: 
    array([[332, 302, 272, 242],
           [302, 275, 248, 221],
           [272, 248, 224, 200],
           [242, 221, 200, 179]])
    
    In [221]: 
    
    
    
    # 矩阵的clip,处理最大值和最小值
    In [221]: print(a)
    [[14 13 12 11]
     [10  9  8  7]
     [ 6  5  4  3]]
    
    In [222]: np.clip(a,5,11)
    Out[222]: 
    array([[11, 11, 11, 11],
           [10,  9,  8,  7],
           [ 6,  5,  5,  5]])
    
    In [223]: 
    

    卷积运算

    numpy.convolve(weights,array)
     
    weight = [a,b,c]
    array = [i,j,k,m,n]
     
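    Result:[ai, bi+aj, ci+bj+ak, cj+bk+am, ck+bm+an, cm+bn, cn][N-1:-N+1]
    (其中 N = len(weight);切片 [N-1:-N+1] 只保留权重窗口与数据完全重叠的部分,即 [ci+bj+ak, cj+bk+am, ck+bm+an],等价于 np.convolve 的 mode='valid')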
    
    
    针对移动平均算法来预测下一个数据,越接近待预测点的数据权重越大,
    那么就需要让 i, j, k, m, n 的系数逐渐增大即可;即让 a > b > c ,并且 a+b+c=1 。
    
    
    
    示例:
    In [223]: weight = np.ones(3)/3
    
    In [224]: print(weight)
    [0.33333333 0.33333333 0.33333333]
    
    In [225]: arr = np.array([8,11,9,7,10])
    
    In [226]: np.convolve(weight,arr)
    Out[226]: 
    array([2.66666667, 6.33333333, 9.33333333, 9.        , 8.66666667,
           5.66666667, 3.33333333])
    
    In [227]: 
    
    In [227]: weight = np.array([0.8,0.1,0.1])
    
    In [228]: np.convolve(weight,arr)
    Out[228]: array([6.4, 9.6, 9.1, 7.6, 9.6, 1.7, 1. ])
    
    In [229]: 

    random常用操作

    # 生成随机浮点数,范围是在0.0~1.0之间
    In [19]: a = np.random.random((2,3))
    
    In [20]: print(a)
    [[0.02185901 0.69585563 0.04555439]
     [0.37331857 0.32903986 0.62448246]]
    
    In [21]:
    
    # 生成随机整数,可指定起止范围
    In [48]: np.random.randint(3)
    Out[48]: 2
    
    In [49]: np.random.randint(low=3,high=9)
    Out[49]: 6
    
    In [50]: np.random.randint(low=3,high=9,size=(3,4))
    Out[50]: 
    array([[5, 6, 7, 8],
           [8, 7, 3, 8],
           [5, 4, 5, 5]])
    
    In [51]: 
    In [68]: np.random.randint(low=-5,high=2,size=(3,4))
    Out[68]: 
    array([[-4, -4, -2,  1],
           [ 1,  0,  0,  1],
           [-4, -3,  1, -5]])
    
    In [69]: 
    
    # 生成正态分布,又名高斯分布(Gaussian distribution)随机数
    In [64]: np.random.normal()
    Out[64]: -0.5399414561419419
    
    In [65]: np.random.normal(loc=0,scale=1,size=(2,3))
    Out[65]: 
    array([[-0.50318082, -0.38614219,  0.30450427],
           [ 0.41711087,  0.29990928, -0.7843322 ]])
    
    In [66]:
    In [66]: np.random.normal(loc=2,scale=3,size=(2,3))
    Out[66]: 
    array([[ 3.37067379,  6.23517315,  2.3267659 ],
           [ 6.46832646, -2.76363304,  5.77883853]])
    
    In [67]:
    
    # 生成标准正态分布("standard normal" distribution)随机数,标准正态分布的平均值为0,方差为1,即服从N(0,1)分布。
    In [83]: np.random.randn()
    Out[83]: 0.502482341264108
    
    In [84]: np.random.randn(3,4)
    Out[84]: 
    array([[ 0.34507555, -0.26868132, -0.56103417,  0.86176617],
           [-0.16535555, -0.38045904,  0.48176385, -1.09005206],
           [-0.60780266,  1.74113117, -0.72427329, -0.51232408]])
    
    In [85]:
    
    # 生成[0, 1)间随机数
    In [99]: np.random.rand()
    Out[99]: 0.607701127768974
    
    In [100]: np.random.rand(3,4)
    Out[100]: 
    array([[0.73020695, 0.53993878, 0.46693879, 0.82611629],
           [0.76117076, 0.16522599, 0.85129611, 0.74448772],
           [0.6450236 , 0.49994053, 0.04115063, 0.30081311]])
    
    In [101]:

    array索引

    # 一维数组的索引和list类似
    略
    
    # 二维数组的索引
    In [13]: import numpy as np
    
    In [14]: a = np.arange(3,15).reshape((3,4))
    
    In [15]: print(a)
    [[ 3  4  5  6]
     [ 7  8  9 10]
     [11 12 13 14]]
    
    In [16]: a[1]
    Out[16]: array([ 7,  8,  9, 10])
    
    In [17]: a[1,2]
    Out[17]: 9
    
    In [18]: a[1][2]              # 等价于 a[1,2]
    Out[18]: 9
    
    In [19]: 
    
    In [19]: a[1,1:-1]            # 获取第二行,除去首尾元素
    Out[19]: array([8, 9])
    
    In [20]: a[1,1:2]             # 获取第二行第二个元素
    Out[20]: array([8])
    
    In [21]:
    In [24]: a[1:-1,2]            # 获取第三列(列索引为2),除去首尾元素
    Out[24]: array([9])
    
    In [26]: a[:,2]               # 获取第三列(列索引为2)元素
    Out[26]: array([ 5,  9, 13])
    
    In [27]:

    迭代array

    # 迭代行
    In [27]: print(a)
    [[ 3  4  5  6]
     [ 7  8  9 10]
     [11 12 13 14]]
    
    In [28]: for row in a:
        ...:     print(row)
        ...:     
    [3 4 5 6]
    [ 7  8  9 10]
    [11 12 13 14]
    
    In [29]:     
    
    # 迭代列
    In [29]: print(a.T)
    [[ 3  7 11]
     [ 4  8 12]
     [ 5  9 13]
     [ 6 10 14]]
    
    In [30]: for column in a.T:
        ...:     print(column)
        ...:     
    [ 3  7 11]
    [ 4  8 12]
    [ 5  9 13]
    [ 6 10 14]
    
    In [31]: 
    
    
    
    
    # 二维矩阵,多行转换成一行,迭代每一个item
    In [31]: print(a)
    [[ 3  4  5  6]
     [ 7  8  9 10]
     [11 12 13 14]]
    
    In [32]: print(a.flat)
    <numpy.flatiter object at 0x7f392e3545c0>
    
    In [33]: print(a.flatten())
    [ 3  4  5  6  7  8  9 10 11 12 13 14]
    
    In [34]: for item in a.flat:
        ...:     print(item)
        ...:     
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    
    In [35]: 

    合并array

    In [39]: a = np.array([1,2,3])
    
    In [40]: b = np.array([2,2,2])
    
    In [41]: c = np.vstack((a,b))     # vertical stack,上下合并
    
    In [42]: print(c)
    [[1 2 3]
     [2 2 2]]
    
    In [43]: c.shape
    Out[43]: (2, 3)
    
    In [44]: c.ndim
    Out[44]: 2
    
    In [45]: c.size
    Out[45]: 6
    
    In [46]: 
    
    
    
    In [47]: d = np.hstack((a,b))     # horizontal stack,左右合并
    
    In [48]: print(d)
    [1 2 3 2 2 2]
    
    In [49]: d.shape
    Out[49]: (6,)
    
    In [50]: d.ndim
    Out[50]: 1
    
    In [51]: d.size
    Out[51]: 6
    
    In [52]: 
    
    
    
    
    # newaxis改变数组维度
    In [54]: print(a)
    [1 2 3]
    
    In [55]: e = a[np.newaxis,:]
    
    In [56]: print(e)
    [[1 2 3]]
    
    In [57]: f = a[:,np.newaxis]
    
    In [58]: print(f)
    [[1]
     [2]
     [3]]
    
    In [59]: 
    
    
    
    
    In [59]: a = np.array([1,2,3])[:,np.newaxis]
    
    In [60]: b = np.array([2,2,2])[:,np.newaxis]
    
    In [61]: print(a)
    [[1]
     [2]
     [3]]
    
    In [62]: print(b)
    [[2]
     [2]
     [2]]
    
    In [63]: c = np.vstack((a,b))
    
    In [64]: print(c)
    [[1]
     [2]
     [3]
     [2]
     [2]
     [2]]
    
    In [65]: d = np.hstack((a,b))        # 合并两个array
    
    In [66]: print(d)
    [[1 2]
     [2 2]
     [3 2]]
    
    In [67]: 
    In [74]: d = np.hstack((a,b,b,a))    # 合并多个array
    
    In [75]: print(d)
    [[1 2 2 1]
     [2 2 2 2]
     [3 2 2 3]]
    
    In [76]: 
    
    
    
    
    # concatenate 常用来合并多个矩阵或序列,axis可以方便的指定维度
    In [76]: a = np.array([1,2,3])
    
    In [77]: b = np.array([2,2,2])
    
    In [78]: a = a[:,np.newaxis]
    
    In [79]: b = b[:,np.newaxis]
    
    In [80]: c = np.concatenate((a,b,b,a),axis=0)
    
    In [81]: print(c)
    [[1]
     [2]
     [3]
     [2]
     [2]
     [2]
     [2]
     [2]
     [2]
     [1]
     [2]
     [3]]
    
    In [82]: c = np.concatenate((a,b,b,a),axis=1)
    
    In [83]: print(c)
    [[1 2 2 1]
     [2 2 2 2]
     [3 2 2 3]]
    
    In [84]:   

    分割array

    In [92]: a = np.arange(12).reshape((3,4))
    
    In [93]: print(a)
    [[ 0  1  2  3]
     [ 4  5  6  7]
     [ 8  9 10 11]]
    
    In [94]: c = np.split(a,2,axis=1)               # 等项分割
    
    In [95]: len(c)
    Out[95]: 2
    
    In [96]: c[0]
    Out[96]: 
    array([[0, 1],
           [4, 5],
           [8, 9]])
    
    In [97]: c[1]
    Out[97]: 
    array([[ 2,  3],
           [ 6,  7],
           [10, 11]])
    
    In [98]: 
    
    In [98]: print(c)
    [array([[0, 1],
           [4, 5],
           [8, 9]]), array([[ 2,  3],
           [ 6,  7],
           [10, 11]])]
    
    In [99]: 
    
    
    In [99]: d = np.array_split(a,3,axis=1)         # 不等项分割
    
    In [100]: len(d)
    Out[100]: 3
    
    In [101]: print(d)
    [array([[0, 1],
           [4, 5],
           [8, 9]]), array([[ 2],
           [ 6],
           [10]]), array([[ 3],
           [ 7],
           [11]])]
    
    In [102]: d[0]
    Out[102]: 
    array([[0, 1],
           [4, 5],
           [8, 9]])
    
    In [103]: d[1]
    Out[103]: 
    array([[ 2],
           [ 6],
           [10]])
    
    In [104]: d[2]
    Out[104]: 
    array([[ 3],
           [ 7],
           [11]])
    
    In [105]: 
    
    
    
    
    
    In [111]: print(a)
    [[ 0  1  2  3]
     [ 4  5  6  7]
     [ 8  9 10 11]]
    
    In [112]: b = np.hsplit(a,2)          # horizontal split,水平分割
    
    In [113]: print(b)
    [array([[0, 1],
           [4, 5],
           [8, 9]]), array([[ 2,  3],
           [ 6,  7],
           [10, 11]])]
    
    In [114]: b[0]
    Out[114]: 
    array([[0, 1],
           [4, 5],
           [8, 9]])
    
    In [115]: b[1]
    Out[115]: 
    array([[ 2,  3],
           [ 6,  7],
           [10, 11]])
    
    In [116]: 
    
    In [116]: c = np.vsplit(a,3)          # vertical split,垂直分割
    
    In [117]: len(c)
    Out[117]: 3
    
    In [118]: print(c)
    [array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[ 8,  9, 10, 11]])]
    
    In [119]: c[0]
    Out[119]: array([[0, 1, 2, 3]])
    
    In [120]: c[1]
    Out[120]: array([[4, 5, 6, 7]])
    
    In [121]: c[2]
    Out[121]: array([[ 8,  9, 10, 11]])
    
    In [122]:

    Numpy.copy()

    In [150]: a = np.arange(4)
    
    In [151]: print(a)
    [0 1 2 3]
    
    In [152]: b = a
    
    In [153]: b is a
    Out[153]: True
    
    In [154]: a[0] = 99
    
    In [155]: print(b)
    [99  1  2  3]
    
    In [156]: 
    
    In [156]: c = a.copy()      # deep copy
    
    In [157]: c is a
    Out[157]: False
    
    In [159]: print(a)
    [99  1  2  3]
    
    In [160]: a[1:3] = [7,8]
    
    In [161]: print(a)
    [99  7  8  3]
    
    In [163]: print(b)
    [99  7  8  3]
    
    In [164]: print(c)
    [99  1  2  3]
    
    In [165]: 

    Numpy其他

    In [169]: a = np.array([-9,7,12,-4,-3,6,2])
    
    In [170]: print(a)
    [-9  7 12 -4 -3  6  2]
    
    In [171]: np.abs(a)
    Out[171]: array([ 9,  7, 12,  4,  3,  6,  2])
    
    In [172]: np.where(np.abs(a)>6)
    Out[172]: (array([0, 1, 2]),)
    
    In [173]: 
    

    numpy参考:http://pda.readthedocs.io/en/latest/chp4.html

    Pandas基础

    import pandas as pd

    Series

    In [173]: import pandas as pd
    
    In [174]: import numpy as np
    
    In [175]: s = pd.Series([1,3,6,np.nan,44,1])                  # 定义pandas.Series
    
    In [176]: print(s)
    0     1.0
    1     3.0
    2     6.0
    3     NaN
    4    44.0
    5     1.0
    dtype: float64
    
    In [177]:

    Base Time Series Frequencies

    Aggregate for duplicate Indices

    In [157]: dates = pd.DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000', '1/2/2000','1/3/2000','1/3/2000'])
    
    In [158]: dates
    Out[158]: 
    DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-02', '2000-01-02',
                   '2000-01-03', '2000-01-03'],
                  dtype='datetime64[ns]', freq=None)
    
    In [159]: dup_ts = pd.Series(np.arange(6), index=dates)
    
    In [160]: dup_ts
    Out[160]: 
    2000-01-01    0
    2000-01-02    1
    2000-01-02    2
    2000-01-02    3
    2000-01-03    4
    2000-01-03    5
    dtype: int64
    
    In [161]: dup_ts.index.is_unique
    Out[161]: False
    
    In [162]: dup_ts['2000-01-01']
    Out[162]: 0
    
    In [163]: dup_ts['2000-01-02']
    Out[163]: 
    2000-01-02    1
    2000-01-02    2
    2000-01-02    3
    dtype: int64
    
    In [164]: dup_ts['2000-01-03']
    Out[164]: 
    2000-01-03    4
    2000-01-03    5
    dtype: int64
    
    In [165]: 
    
    In [165]: grouped = dup_ts.groupby(level=0)
    
    In [166]: grouped.mean()
    Out[166]: 
    2000-01-01    0.0
    2000-01-02    2.0
    2000-01-03    4.5
    dtype: float64
    
    In [167]: grouped.count()
    Out[167]: 
    2000-01-01    1
    2000-01-02    3
    2000-01-03    2
    dtype: int64
    
    In [168]: grouped.sum()
    Out[168]: 
    2000-01-01    0
    2000-01-02    6
    2000-01-03    9
    dtype: int64
    
    In [169]: 

    Group by month or weekday by passing a function that accesses those fields on the time series’s index.

    In [90]: rng = pd.date_range('1/1/2000', periods=100, freq='D')
    
    In [91]: ts = pd.Series(np.arange(100), index=rng)
    
    In [92]: ts.groupby(lambda x: x.month).mean()
    Out[92]: 
    1    15
    2    45
    3    75
    4    95
    dtype: int64
    
    In [93]: ts.groupby(lambda x: x.month).sum()
    Out[93]: 
    1     465
    2    1305
    3    2325
    4     855
    dtype: int64
    
    In [94]: ts.groupby(lambda x: x.month).max()
    Out[94]: 
    1    30
    2    59
    3    90
    4    99
    dtype: int64
    
    In [95]: ts.groupby(lambda x: x.weekday).mean()
    Out[95]: 
    0    47.5
    1    48.5
    2    49.5
    3    50.5
    4    51.5
    5    49.0
    6    50.0
    dtype: float64
    
    In [96]: ts.groupby(lambda x: x.weekday).sum()
    Out[96]: 
    0    665
    1    679
    2    693
    3    707
    4    721
    5    735
    6    750
    dtype: int64
    
    In [97]: 

    Resample method arguments

    Resampling and Frequency Conversion

    In [50]: rng = pd.date_range('1/1/2000', periods=100, freq='D')
    
    In [51]: ts = pd.Series(np.random.randn(len(rng)), index=rng)
    
    In [52]: ts
    Out[52]: 
    2000-01-01    0.030631
    2000-01-02   -2.087034
    2000-01-03    1.238687
    2000-01-04   -1.297059
    2000-01-05   -1.341296
    2000-01-06   -0.353311
    2000-01-07   -0.854693
    2000-01-08    0.426789
                    ...   
    2000-03-27    1.262705
    2000-03-28   -0.646236
    2000-03-29   -0.349658
    2000-03-30   -1.093438
    2000-03-31   -0.254758
    2000-04-01    0.146417
    2000-04-02    1.774502
    2000-04-03   -0.712635
    2000-04-04   -1.552352
    2000-04-05    0.303172
    2000-04-06   -0.023492
    2000-04-07   -1.418930
    2000-04-08    0.789877
    2000-04-09    1.767594
    Freq: D, Length: 100, dtype: float64
    
    In [53]: 
    
    In [53]: ts.resample('M').mean()
    Out[53]: 
    2000-01-31    0.003531
    2000-02-29    0.030067
    2000-03-31   -0.106783
    2000-04-30    0.119350
    Freq: M, dtype: float64
    
    In [54]: ts.resample('M',kind='period').mean()
    Out[54]: 
    2000-01    0.003531
    2000-02    0.030067
    2000-03   -0.106783
    2000-04    0.119350
    Freq: M, dtype: float64
    
    In [55]: 
    

    Aggregate this data into five-minute chunks or bars by taking the sum of each group.

    In [71]: rng = pd.date_range('1/1/2000', periods=24, freq='T')
    
    In [72]: rng
    Out[72]: 
    DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 00:01:00',
                   '2000-01-01 00:02:00', '2000-01-01 00:03:00',
                   '2000-01-01 00:04:00', '2000-01-01 00:05:00',
                   '2000-01-01 00:06:00', '2000-01-01 00:07:00',
                   '2000-01-01 00:08:00', '2000-01-01 00:09:00',
                   '2000-01-01 00:10:00', '2000-01-01 00:11:00',
                   '2000-01-01 00:12:00', '2000-01-01 00:13:00',
                   '2000-01-01 00:14:00', '2000-01-01 00:15:00',
                   '2000-01-01 00:16:00', '2000-01-01 00:17:00',
                   '2000-01-01 00:18:00', '2000-01-01 00:19:00',
                   '2000-01-01 00:20:00', '2000-01-01 00:21:00',
                   '2000-01-01 00:22:00', '2000-01-01 00:23:00'],
                  dtype='datetime64[ns]', freq='T')
    
    In [73]: ts = pd.Series(np.arange(24), index=rng)
    
    In [74]: ts
    Out[74]: 
    2000-01-01 00:00:00     0
    2000-01-01 00:01:00     1
    2000-01-01 00:02:00     2
    2000-01-01 00:03:00     3
    2000-01-01 00:04:00     4
    2000-01-01 00:05:00     5
    2000-01-01 00:06:00     6
    2000-01-01 00:07:00     7
    2000-01-01 00:08:00     8
    2000-01-01 00:09:00     9
    2000-01-01 00:10:00    10
    2000-01-01 00:11:00    11
    2000-01-01 00:12:00    12
    2000-01-01 00:13:00    13
    2000-01-01 00:14:00    14
    2000-01-01 00:15:00    15
    2000-01-01 00:16:00    16
    2000-01-01 00:17:00    17
    2000-01-01 00:18:00    18
    2000-01-01 00:19:00    19
    2000-01-01 00:20:00    20
    2000-01-01 00:21:00    21
    2000-01-01 00:22:00    22
    2000-01-01 00:23:00    23
    Freq: T, dtype: int64
    
    In [75]: ts.resample('5min').sum()
    Out[75]: 
    2000-01-01 00:00:00    10
    2000-01-01 00:05:00    35
    2000-01-01 00:10:00    60
    2000-01-01 00:15:00    85
    2000-01-01 00:20:00    86
    Freq: 5T, dtype: int64
    
    In [76]: ts.resample('5min',closed='left').sum()
    Out[76]: 
    2000-01-01 00:00:00    10
    2000-01-01 00:05:00    35
    2000-01-01 00:10:00    60
    2000-01-01 00:15:00    85
    2000-01-01 00:20:00    86
    Freq: 5T, dtype: int64
    
    In [77]: 
    
    In [77]: ts.resample('5min').max()
    Out[77]: 
    2000-01-01 00:00:00     4
    2000-01-01 00:05:00     9
    2000-01-01 00:10:00    14
    2000-01-01 00:15:00    19
    2000-01-01 00:20:00    23
    Freq: 5T, dtype: int64
    
    In [78]: 
    
    In [78]: ts.resample('5min',closed='right').sum()
    Out[78]: 
    1999-12-31 23:55:00     0
    2000-01-01 00:00:00    15
    2000-01-01 00:05:00    40
    2000-01-01 00:10:00    65
    2000-01-01 00:15:00    90
    2000-01-01 00:20:00    66
    Freq: 5T, dtype: int64
    
    In [79]: 
    
    In [79]: ts.resample('5min',loffset='-1s').sum()
    Out[79]: 
    1999-12-31 23:59:59    10
    2000-01-01 00:04:59    35
    2000-01-01 00:09:59    60
    2000-01-01 00:14:59    85
    2000-01-01 00:19:59    86
    Freq: 5T, dtype: int64
    
    In [80]:
    
    
    
    # Open-High-Low-Close (OHLC) resampling
    In [81]: ts.resample('5min').ohlc()
    Out[81]: 
                         open  high  low  close
    2000-01-01 00:00:00     0     4    0      4
    2000-01-01 00:05:00     5     9    5      9
    2000-01-01 00:10:00    10    14   10     14
    2000-01-01 00:15:00    15    19   15     19
    2000-01-01 00:20:00    20    23   20     23
    
    In [82]: 

    Resampling with Periods

    In [118]: frame = pd.DataFrame(np.random.randn(24, 4),
         ...:     index=pd.period_range('1-2000', '12-2001', freq='M'),
         ...:     columns=['Beijing', 'Luoyang', 'New York', 'Tokyo'])
    
    In [119]: frame
    Out[119]: 
              Beijing   Luoyang  New York     Tokyo
    2000-01  1.120268 -1.120345 -1.154800  0.443861
    2000-02  0.611443  0.200576 -1.163600 -1.137567
    2000-03  0.658112  2.332235 -1.718285  1.589246
    2000-04 -0.863050  1.890877  2.046202  0.410414
    2000-05  0.710052 -0.041623  0.122719 -1.141112
    2000-06  0.299393  1.227689  0.718627  1.004851
    2000-07  1.287335 -0.179045 -0.476422  0.949235
    2000-08 -2.140590  0.433699 -0.783202  1.073706
    2000-09 -0.149710 -0.580780  0.755274  0.514259
    2000-10  0.190940 -0.187451  1.710803 -1.631272
    2000-11  0.419288  0.565235  0.470381  0.599020
    2000-12  0.951111  0.464671 -0.854858 -0.009189
    2001-01 -1.383493 -0.147035 -0.379006  0.472686
    2001-02  1.803475 -1.628368 -0.896757 -0.508827
    2001-03  0.575910 -0.528299  1.182473  0.159452
    2001-04 -1.056161 -0.475357  0.861852  1.168667
    2001-05 -1.316565  0.354719  1.354205 -0.369083
    2001-06  0.497406 -1.799904 -0.512882 -0.092718
    2001-07  0.896944 -1.276022  0.137365  0.087199
    2001-08 -0.046908 -0.650024  0.958182 -0.048369
    2001-09  0.085401  1.067235  0.541318  0.853376
    2001-10  1.165047 -0.794425  1.137002  0.064595
    2001-11 -0.438006  0.706564  1.464403  0.278069
    2001-12 -0.094644  0.666789  0.220349 -0.386617
    
    In [120]: frame[:5]
    Out[120]: 
              Beijing   Luoyang  New York     Tokyo
    2000-01  1.120268 -1.120345 -1.154800  0.443861
    2000-02  0.611443  0.200576 -1.163600 -1.137567
    2000-03  0.658112  2.332235 -1.718285  1.589246
    2000-04 -0.863050  1.890877  2.046202  0.410414
    2000-05  0.710052 -0.041623  0.122719 -1.141112
    
    In [121]: annual_frame = frame.resample('A-DEC').mean()
    
    In [122]: annual_frame
    Out[122]: 
           Beijing   Luoyang  New York     Tokyo
    2000  0.257883  0.417145 -0.027263  0.222121
    2001  0.057367 -0.375344  0.505709  0.139869
    
    In [123]: 
    
    In [123]: annual_frame_max = frame.resample('A-DEC').max()
    
    In [124]: annual_frame_max
    Out[124]: 
           Beijing   Luoyang  New York     Tokyo
    2000  1.287335  2.332235  2.046202  1.589246
    2001  1.803475  1.067235  1.464403  1.168667
    
    In [125]: 

    DataFrame 

    # 第一种定义pandas.DataFrame方式:直接导入numpy的数据
    In [186]: df1 = pd.DataFrame(np.arange(12).reshape((3,4)))    # 定义pandas.DataFrame
    
    In [187]: print(df1)
       0  1   2   3
    0  0  1   2   3
    1  4  5   6   7
    2  8  9  10  11
    
    In [188]: 
    
    In [178]: dates = pd.date_range('20160101',periods=6)
    
    In [179]: print(dates)
    DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
                   '2016-01-05', '2016-01-06'],
                  dtype='datetime64[ns]', freq='D')
    
    In [180]: 
    
    # 定义pandas.DataFrame,并指定列名和行名
    In [184]: df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d'])
    
    In [185]: print(df)
                       a         b         c         d
    2016-01-01  1.193589  0.165348  1.598806 -0.478980
    2016-01-02  1.188886 -1.232185 -0.633066  0.594805
    2016-01-03  2.707996 -0.116420  1.622761  0.399708
    2016-01-04  0.416469  1.593061 -0.044390 -0.031153
    2016-01-05 -0.637080  1.680110  1.371026  0.821549
    2016-01-06 -0.079359  1.421577  0.042537  1.058749
    
    In [186]: 
    
    
    # 第二种定义pandas.DataFrame方式:把参数当做字典传入DataFrame
    In [188]: df2 = pd.DataFrame({'A' : 1.,
         ...:                     'B' : pd.Timestamp('20130102'),
         ...:                     'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
         ...:                     'D' : np.array([3] * 4,dtype='int32'),
         ...:                     'E' : pd.Categorical(["test","train","test","train"]),
         ...:                     'F' : 'foo'})
    
    In [189]: print(df2)
         A          B    C  D      E    F
    0  1.0 2013-01-02  1.0  3   test  foo
    1  1.0 2013-01-02  1.0  3  train  foo
    2  1.0 2013-01-02  1.0  3   test  foo
    3  1.0 2013-01-02  1.0  3  train  foo
    
    In [190]:
    In [190]: print(df2.dtypes)                      # 查看DataFrame内容的类型
    A           float64
    B    datetime64[ns]
    C           float32
    D             int32
    E          category
    F            object
    dtype: object
    
    In [191]:
    In [191]: print(df2.index)                       # 打印DataFrame行的名字(行索引)
    Int64Index([0, 1, 2, 3], dtype='int64')
    
    In [192]: 
    In [192]: print(df2.columns)                     # 打印DataFrame列的名字
    Index([u'A', u'B', u'C', u'D', u'E', u'F'], dtype='object')
    
    In [193]: 
     
    In [194]: print(df2.values)                      # 打印DataFrame的内容
    [[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
     [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']
     [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
     [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']]
    
    In [195]: 
    
    
    
    
    
    In [196]: print(df2)
         A          B    C  D      E    F
    0  1.0 2013-01-02  1.0  3   test  foo
    1  1.0 2013-01-02  1.0  3  train  foo
    2  1.0 2013-01-02  1.0  3   test  foo
    3  1.0 2013-01-02  1.0  3  train  foo
    
    In [197]: 
    
    In [197]: print(df2.describe())                  # 打印出DataFrame的数学运算的相关数据
             A    C    D
    count  4.0  4.0  4.0
    mean   1.0  1.0  3.0
    std    0.0  0.0  0.0
    min    1.0  1.0  3.0
    25%    1.0  1.0  3.0
    50%    1.0  1.0  3.0
    75%    1.0  1.0  3.0
    max    1.0  1.0  3.0
    
    In [198]: 
    
    
    In [200]: print(df2.T)                           # 把DataFrame进行transpose,即转置
                         0                    1                    2                    3
    A                    1                    1                    1                    1
    B  2013-01-02 00:00:00  2013-01-02 00:00:00  2013-01-02 00:00:00  2013-01-02 00:00:00
    C                    1                    1                    1                    1
    D                    3                    3                    3                    3
    E                 test                train                 test                train
    F                  foo                  foo                  foo                  foo
    
    In [201]: 
    
    
    
    
    
    
    # 对DataFrame排序
    In [203]: print(df2)
         A          B    C  D      E    F
    0  1.0 2013-01-02  1.0  3   test  foo
    1  1.0 2013-01-02  1.0  3  train  foo
    2  1.0 2013-01-02  1.0  3   test  foo
    3  1.0 2013-01-02  1.0  3  train  foo
    
    In [204]: df2.sort_index(axis=1, ascending=False)   # 按照index(列名)排序
    Out[204]: 
         F      E  D    C          B    A
    0  foo   test  3  1.0 2013-01-02  1.0
    1  foo  train  3  1.0 2013-01-02  1.0
    2  foo   test  3  1.0 2013-01-02  1.0
    3  foo  train  3  1.0 2013-01-02  1.0
    
    In [205]:
    In [205]: df2.sort_index(axis=0, ascending=False)   # 按照行名排序
    Out[205]: 
         A          B    C  D      E    F
    3  1.0 2013-01-02  1.0  3  train  foo
    2  1.0 2013-01-02  1.0  3   test  foo
    1  1.0 2013-01-02  1.0  3  train  foo
    0  1.0 2013-01-02  1.0  3   test  foo
    
    In [206]: 
    
    
    
    In [207]: df2.sort_values(by='E')                   # 指定value进行排序
    Out[207]: 
         A          B    C  D      E    F
    0  1.0 2013-01-02  1.0  3   test  foo
    2  1.0 2013-01-02  1.0  3   test  foo
    1  1.0 2013-01-02  1.0  3  train  foo
    3  1.0 2013-01-02  1.0  3  train  foo
    
    In [208]:   

    Pandas筛选数据

    In [212]: dates = pd.date_range('20160101',periods=6)
    
    In [213]: df = pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])
    
    In [214]: print(df)
                 A   B   C   D
    2016-01-01   0   1   2   3
    2016-01-02   4   5   6   7
    2016-01-03   8   9  10  11
    2016-01-04  12  13  14  15
    2016-01-05  16  17  18  19
    2016-01-06  20  21  22  23
    
    In [215]: 
    
    In [215]: print(df['A'])            # 选取指定列
    2016-01-01     0
    2016-01-02     4
    2016-01-03     8
    2016-01-04    12
    2016-01-05    16
    2016-01-06    20
    Freq: D, Name: A, dtype: int64
     
    In [216]: print(df.A)               # 等价于 df['A']
    2016-01-01     0
    2016-01-02     4
    2016-01-03     8
    2016-01-04    12
    2016-01-05    16
    2016-01-06    20
    Freq: D, Name: A, dtype: int64
    
    In [217]:
    
    In [217]: print(df[0:3])            # 切片方式选取某些行
                A  B   C   D
    2016-01-01  0  1   2   3
    2016-01-02  4  5   6   7
    2016-01-03  8  9  10  11
    
    In [218]: print(df['2016-01-01':'2016-01-03'])   # 等价于 df[0:3]
                A  B   C   D
    2016-01-01  0  1   2   3
    2016-01-02  4  5   6   7
    2016-01-03  8  9  10  11
    
    In [219]: 
    
    
    
    
    # select by label : loc
    In [220]: print(df.loc['2016-01-02'])
    A    4
    B    5
    C    6
    D    7
    Name: 2016-01-02 00:00:00, dtype: int64
    
    In [221]: 
    In [221]: print(df.loc['2016-01-02']['B'])
    5
    
    In [222]: 
    
    In [227]: print(df.loc[:,['A','B']])
                 A   B
    2016-01-01   0   1
    2016-01-02   4   5
    2016-01-03   8   9
    2016-01-04  12  13
    2016-01-05  16  17
    2016-01-06  20  21
    
    In [228]: 
    In [228]: print(df.loc['2016-01-03',['A','B']])
    A    8
    B    9
    Name: 2016-01-03 00:00:00, dtype: int64
    
    In [229]: 
    In [232]: print(df.loc['2016-01-03':'2016-01-05',['A','B']])
                 A   B
    2016-01-03   8   9
    2016-01-04  12  13
    2016-01-05  16  17
    
    In [233]: 
    
    
    
    
    # select by position : iloc
    In [235]: print(df)
                 A   B   C   D
    2016-01-01   0   1   2   3
    2016-01-02   4   5   6   7
    2016-01-03   8   9  10  11
    2016-01-04  12  13  14  15
    2016-01-05  16  17  18  19
    2016-01-06  20  21  22  23
    
    In [236]: print(df.iloc[3])
    A    12
    B    13
    C    14
    D    15
    Name: 2016-01-04 00:00:00, dtype: int64
    
    In [237]: print(df.iloc[3,1])
    13
    
    In [238]: 
    
    In [238]: print(df.iloc[3:5,1:3])
                 B   C
    2016-01-04  13  14
    2016-01-05  17  18
    
    In [239]: 
    
    In [240]: print(df.iloc[[1,3,5],1:3])
                 B   C
    2016-01-02   5   6
    2016-01-04  13  14
    2016-01-06  21  22
    
    In [241]: 
    
    
    
    
    # mixed selection : ix
    In [243]: print(df.ix[:3,['A','C']])
    /usr/local/anaconda2/bin/ipython2:1: DeprecationWarning: 
    .ix is deprecated. Please use
    .loc for label based indexing or
    .iloc for positional indexing
    
    See the documentation here:
    http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
      #!/usr/local/anaconda2/bin/python
                A   C
    2016-01-01  0   2
    2016-01-02  4   6
    2016-01-03  8  10
    
    In [244]: 
    
    
    
    
    # Boolean indexing
    In [9]: print(df[df.A>8])
                 A   B   C   D
    2016-01-04  12  13  14  15
    2016-01-05  16  17  18  19
    2016-01-06  20  21  22  23
    
    In [10]: 
    df.head(n)      # 返回DataFrame前n行
    
    df.tail(n)      # 返回DataFrame后n行

    Pandas设置值

    # 给DataFrame设置值
    In [1]: import numpy as np
    
    In [2]: import pandas as pd
    
    In [3]: dates = pd.date_range('20160101',periods=6)
    
    In [4]: df = pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])
    
    In [5]: print(df)
                 A   B   C   D
    2016-01-01   0   1   2   3
    2016-01-02   4   5   6   7
    2016-01-03   8   9  10  11
    2016-01-04  12  13  14  15
    2016-01-05  16  17  18  19
    2016-01-06  20  21  22  23
    
    In [6]:
    In [7]: df.iloc[2,2] = 99
    
    In [10]: df.loc['2016-01-02','B'] = 100
    
    In [11]: print(df)
                 A    B   C   D
    2016-01-01   0    1   2   3
    2016-01-02   4  100   6   7
    2016-01-03   8    9  99  11
    2016-01-04  12   13  14  15
    2016-01-05  16   17  18  19
    2016-01-06  20   21  22  23
    
    In [12]:
    
    
    
    
    In [17]: print(df)
                 A   B   C   D
    2016-01-01   0   1   2   3
    2016-01-02   4   5   6   7
    2016-01-03   8   9  10  11
    2016-01-04  12  13  14  15
    2016-01-05  16  17  18  19
    2016-01-06  20  21  22  23
    
    In [18]: df.A[df.A>4] = 0
    
    In [19]: print(df)
                A   B   C   D
    2016-01-01  0   1   2   3
    2016-01-02  4   5   6   7
    2016-01-03  0   9  10  11
    2016-01-04  0  13  14  15
    2016-01-05  0  17  18  19
    2016-01-06  0  21  22  23
    
    In [20]: 
    
    
    
    In [21]: print(df)
                 A   B   C   D
    2016-01-01   0   1   2   3
    2016-01-02   4   5   6   7
    2016-01-03   8   9  10  11
    2016-01-04  12  13  14  15
    2016-01-05  16  17  18  19
    2016-01-06  20  21  22  23
    
    In [22]: df[df.A>4] = 0
    
    In [23]: print(df)
                A  B  C  D
    2016-01-01  0  1  2  3
    2016-01-02  4  5  6  7
    2016-01-03  0  0  0  0
    2016-01-04  0  0  0  0
    2016-01-05  0  0  0  0
    2016-01-06  0  0  0  0
    
    In [24]: 
    
    
    
    
    In [30]: df['E'] = np.nan        # 增加一列,赋值为NaN
    
    In [31]: print(df)
                 A   B   C   D   E
    2016-01-01   0   1   2   3 NaN
    2016-01-02   4   5   6   7 NaN
    2016-01-03   8   9  10  11 NaN
    2016-01-04  12  13  14  15 NaN
    2016-01-05  16  17  18  19 NaN
    2016-01-06  20  21  22  23 NaN
    
    In [32]: 
                                     # 增加一列,需要指定行名(index)
    In [46]: df['F'] = pd.Series([1,2,3,4,5,6], index=pd.date_range('20160101',periods=6)) 
    
    In [47]: print(df)
                 A   B   C   D   E  F
    2016-01-01   0   1   2   3 NaN  1
    2016-01-02   4   5   6   7 NaN  2
    2016-01-03   8   9  10  11 NaN  3
    2016-01-04  12  13  14  15 NaN  4
    2016-01-05  16  17  18  19 NaN  5
    2016-01-06  20  21  22  23 NaN  6
    
    In [48]:

    Pandas删除DataFrame数据

    In [1]: import numpy as np
    
    In [2]: import pandas as pd
    
    In [3]: values = np.arange(12).reshape((3,4))
    
    In [4]: print(values)
    [[ 0  1  2  3]
     [ 4  5  6  7]
     [ 8  9 10 11]]
    
    In [5]:
    In [8]: df = pd.DataFrame(values,index=['row1','row2','row3'],columns=['A','B','C','D'])
    
    In [9]: print(df)
          A  B   C   D
    row1  0  1   2   3
    row2  4  5   6   7
    row3  8  9  10  11
    
    In [10]:
    In [10]: print(df.shape)
    (3, 4)
    
    In [11]:
    In [11]: df.drop(columns='A',axis=1)
    Out[11]: 
          B   C   D
    row1  1   2   3
    row2  5   6   7
    row3  9  10  11
    
    In [12]: df.drop(columns=['A','C'],axis=1)
    Out[12]: 
          B   D
    row1  1   3
    row2  5   7
    row3  9  11
    
    In [13]: 
    
    In [13]: df.drop(index='row2',axis=0)
    Out[13]: 
          A  B   C   D
    row1  0  1   2   3
    row3  8  9  10  11
    
    In [14]: df.drop(index=['row2','row3'],axis=0)
    Out[14]: 
          A  B  C  D
    row1  0  1  2  3
    
    In [15]:
    

    如果index用的是 “pd.date_range('20160101',periods=6)”:

    In [43]: print(df)
                       a         b         c         d
    2016-01-01  1.273748  0.949407 -0.446053 -0.126789
    2016-01-02 -0.770801  1.641150  0.840216 -0.991219
    2016-01-03 -0.164625 -1.459954  1.214388  0.281621
    2016-01-04  1.863281  1.163653  0.319549 -1.545655
    2016-01-05  0.452804  0.203472 -1.232536  0.681963
    2016-01-06  0.171324  0.353359  1.674004 -2.026071
    
    In [44]: print(df.index)
    DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
                   '2016-01-05', '2016-01-06'],
                  dtype='datetime64[ns]', freq='D')
    
    In [45]: 
    
    In [45]: df.drop(index=pd.datetime(2016,1,4),axis=0)
    Out[45]: 
                       a         b         c         d
    2016-01-01  1.273748  0.949407 -0.446053 -0.126789
    2016-01-02 -0.770801  1.641150  0.840216 -0.991219
    2016-01-03 -0.164625 -1.459954  1.214388  0.281621
    2016-01-05  0.452804  0.203472 -1.232536  0.681963
    2016-01-06  0.171324  0.353359  1.674004 -2.026071
    
    In [46]: df.drop(index=[pd.datetime(2016,1,2),pd.datetime(2016,1,5)],axis=0)
    Out[46]: 
                       a         b         c         d
    2016-01-01  1.273748  0.949407 -0.446053 -0.126789
    2016-01-03 -0.164625 -1.459954  1.214388  0.281621
    2016-01-04  1.863281  1.163653  0.319549 -1.545655
    2016-01-06  0.171324  0.353359  1.674004 -2.026071
    
    In [47]: 

    Pandas处理丢失的数据

    # 处理丢失数据
    
    In [7]: print(df)
                 A   B   C   D
    2016-01-01   0   1   2   3
    2016-01-02   4   5   6   7
    2016-01-03   8   9  10  11
    2016-01-04  12  13  14  15
    2016-01-05  16  17  18  19
    2016-01-06  20  21  22  23
    
    In [8]: df.iloc[0,1] = np.nan
    
    In [9]: df.iloc[1,2] = np.nan
    
    In [10]: print(df)
                 A     B     C   D
    2016-01-01   0   NaN   2.0   3
    2016-01-02   4   5.0   NaN   7
    2016-01-03   8   9.0  10.0  11
    2016-01-04  12  13.0  14.0  15
    2016-01-05  16  17.0  18.0  19
    2016-01-06  20  21.0  22.0  23
    
    In [11]: print(df.dropna(axis=1,how='any'))  # 删除NaN数据所在列,how = {'any','all'}
                 A   D
    2016-01-01   0   3
    2016-01-02   4   7
    2016-01-03   8  11
    2016-01-04  12  15
    2016-01-05  16  19
    2016-01-06  20  23
    
    In [12]: print(df.dropna(axis=0,how='any'))  # 删除NaN数据所在行,how = {'any','all'} 
                 A     B     C   D
    2016-01-03   8   9.0  10.0  11
    2016-01-04  12  13.0  14.0  15
    2016-01-05  16  17.0  18.0  19
    2016-01-06  20  21.0  22.0  23
    
    In [13]: 
    In [13]: print(df.dropna(axis=0,how='all'))
                 A     B     C   D
    2016-01-01   0   NaN   2.0   3
    2016-01-02   4   5.0   NaN   7
    2016-01-03   8   9.0  10.0  11
    2016-01-04  12  13.0  14.0  15
    2016-01-05  16  17.0  18.0  19
    2016-01-06  20  21.0  22.0  23
    
    In [14]: 
    In [14]: print(df.dropna(axis=1,how='all'))
                 A     B     C   D
    2016-01-01   0   NaN   2.0   3
    2016-01-02   4   5.0   NaN   7
    2016-01-03   8   9.0  10.0  11
    2016-01-04  12  13.0  14.0  15
    2016-01-05  16  17.0  18.0  19
    2016-01-06  20  21.0  22.0  23
    
    In [15]: 
    
    
    
    In [15]: df.fillna(value=0)                # 把NaN填充为指定数值
    Out[15]: 
                 A     B     C   D
    2016-01-01   0   0.0   2.0   3
    2016-01-02   4   5.0   0.0   7
    2016-01-03   8   9.0  10.0  11
    2016-01-04  12  13.0  14.0  15
    2016-01-05  16  17.0  18.0  19
    2016-01-06  20  21.0  22.0  23
    
    In [16]: 
    
    
    
    
    
    
    In [19]: print(df.isnull())                # 把数值为NaN的位置标识出来
                    A      B      C      D
    2016-01-01  False   True  False  False
    2016-01-02  False  False   True  False
    2016-01-03  False  False  False  False
    2016-01-04  False  False  False  False
    2016-01-05  False  False  False  False
    2016-01-06  False  False  False  False
    
    In [20]: 
    
    
    In [22]: print(np.any(df.isnull()) == True)   # 检查DataFrame是否含有NaN值
    True
    
    In [23]: 

    Pandas导入导出示例

    In [33]: import pandas as pd
    
    In [34]: data = pd.read_csv('student.csv')
    
    In [35]: print(data)
        Student ID  name   age  gender
    0         1100  Kelly   22  Female
    1         1101    Clo   21  Female
    2         1102  Tilly   22  Female
    3         1103   Tony   24    Male
    4         1104  David   20    Male
    5         1105  Catty   22  Female
    6         1106      M    3  Female
    7         1107      N   43    Male
    8         1108      A   13    Male
    9         1109      S   12    Male
    10        1110  David   33    Male
    11        1111     Dw    3  Female
    12        1112      Q   23    Male
    13        1113      W   21  Female
    
    In [36]: print(type(data))
    <class 'pandas.core.frame.DataFrame'>
    
    In [37]: data.to_pickle('student.pickle')
    
    In [38]: data.to_json('student.json')
    
    In [39]:

    更多IO Tools参考:官方介绍

    Pandas concat合并

    # pandas 合并
    
    # concatenating
    In [40]: import numpy as np
    
    In [41]: import pandas as pd
    
    In [42]: df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
    
    In [43]: df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
    
    In [44]: df3 = pd.DataFrame(np.ones((3,4))*2, columns=['a','b','c','d'])
    
    In [45]: print(df1)
         a    b    c    d
    0  0.0  0.0  0.0  0.0
    1  0.0  0.0  0.0  0.0
    2  0.0  0.0  0.0  0.0
    
    In [46]: print(df2)
         a    b    c    d
    0  1.0  1.0  1.0  1.0
    1  1.0  1.0  1.0  1.0
    2  1.0  1.0  1.0  1.0
    
    In [47]: print(df3)
         a    b    c    d
    0  2.0  2.0  2.0  2.0
    1  2.0  2.0  2.0  2.0
    2  2.0  2.0  2.0  2.0
    
    In [48]: result = pd.concat([df1,df2,df3],axis=0)   # vertical 垂直合并
    
    In [49]: print(result)
         a    b    c    d
    0  0.0  0.0  0.0  0.0
    1  0.0  0.0  0.0  0.0
    2  0.0  0.0  0.0  0.0
    0  1.0  1.0  1.0  1.0
    1  1.0  1.0  1.0  1.0
    2  1.0  1.0  1.0  1.0
    0  2.0  2.0  2.0  2.0
    1  2.0  2.0  2.0  2.0
    2  2.0  2.0  2.0  2.0
    
    In [50]: 
    In [50]: result = pd.concat([df1,df2,df3],axis=0,ignore_index=True)  # 序号重新排列
    
    In [51]: print(result)
         a    b    c    d
    0  0.0  0.0  0.0  0.0
    1  0.0  0.0  0.0  0.0
    2  0.0  0.0  0.0  0.0
    3  1.0  1.0  1.0  1.0
    4  1.0  1.0  1.0  1.0
    5  1.0  1.0  1.0  1.0
    6  2.0  2.0  2.0  2.0
    7  2.0  2.0  2.0  2.0
    8  2.0  2.0  2.0  2.0
    
    In [52]:
    
    
    
    
    
    # join合并   ['inner','outer']
    In [63]: df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'],index=[1,2,3])
    
    In [64]: df2 = pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d','e'],index=[2,3,4])
    
    In [65]: print(df1)
         a    b    c    d
    1  0.0  0.0  0.0  0.0
    2  0.0  0.0  0.0  0.0
    3  0.0  0.0  0.0  0.0
    
    In [66]: print(df2)
         b    c    d    e
    2  1.0  1.0  1.0  1.0
    3  1.0  1.0  1.0  1.0
    4  1.0  1.0  1.0  1.0
    
    In [67]: 
    In [67]: result = pd.concat([df1,df2])     # 即 pd.concat([df1,df2],join='outer') , 默认就是outer模式
    /usr/local/anaconda2/bin/ipython2:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
    of pandas will change to not sort by default.
    
    To accept the future behavior, pass 'sort=True'.
    
    To retain the current behavior and silence the warning, pass sort=False
    
      #!/usr/local/anaconda2/bin/python
    
    In [68]: 
    
    In [68]: print(result)
         a    b    c    d    e
    1  0.0  0.0  0.0  0.0  NaN
    2  0.0  0.0  0.0  0.0  NaN
    3  0.0  0.0  0.0  0.0  NaN
    2  NaN  1.0  1.0  1.0  1.0
    3  NaN  1.0  1.0  1.0  1.0
    4  NaN  1.0  1.0  1.0  1.0
    
    In [69]:
    
    In [70]: result = pd.concat([df1,df2],join='inner')  # inner模式
    
    In [71]: print(result)
         b    c    d
    1  0.0  0.0  0.0
    2  0.0  0.0  0.0
    3  0.0  0.0  0.0
    2  1.0  1.0  1.0
    3  1.0  1.0  1.0
    4  1.0  1.0  1.0
    
    In [72]: 
    In [72]: result = pd.concat([df1,df2],join='inner',ignore_index=True)
    
    In [73]: print(result)
         b    c    d
    0  0.0  0.0  0.0
    1  0.0  0.0  0.0
    2  0.0  0.0  0.0
    3  1.0  1.0  1.0
    4  1.0  1.0  1.0
    5  1.0  1.0  1.0
    
    In [74]: 
    
    
    
    
    # join_axes合并
    In [78]: res = pd.concat([df1, df2], axis=1)
    
    In [79]: print(res)
         a    b    c    d    b    c    d    e
    1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
    2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
    3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
    4  NaN  NaN  NaN  NaN  1.0  1.0  1.0  1.0
    
    In [80]: 
    In [74]: df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'],index=[1,2,3])
    
    In [75]: df2 = pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d','e'],index=[2,3,4])
    
    In [76]: res = pd.concat([df1, df2], axis=1, join_axes=[df1.index])
    
    In [77]: print(res)
         a    b    c    d    b    c    d    e
    1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
    2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
    3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
    
    In [78]: 
    In [80]: res = pd.concat([df1, df2], axis=1, join_axes=[df2.index])
    
    In [81]: print(res)
         a    b    c    d    b    c    d    e
    2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
    3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
    4  NaN  NaN  NaN  NaN  1.0  1.0  1.0  1.0
    
    In [82]: 
    
    
    
    
    # append合并
    
    In [87]: df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
    
    In [88]: df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
    
    In [89]: df1.append(df2,ignore_index=True)
    Out[89]: 
         a    b    c    d
    0  0.0  0.0  0.0  0.0
    1  0.0  0.0  0.0  0.0
    2  0.0  0.0  0.0  0.0
    3  1.0  1.0  1.0  1.0
    4  1.0  1.0  1.0  1.0
    5  1.0  1.0  1.0  1.0
    
    In [90]: df3 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
    
    In [91]: df1.append([df2,df3],ignore_index=True)
    Out[91]: 
         a    b    c    d
    0  0.0  0.0  0.0  0.0
    1  0.0  0.0  0.0  0.0
    2  0.0  0.0  0.0  0.0
    3  1.0  1.0  1.0  1.0
    4  1.0  1.0  1.0  1.0
    5  1.0  1.0  1.0  1.0
    6  1.0  1.0  1.0  1.0
    7  1.0  1.0  1.0  1.0
    8  1.0  1.0  1.0  1.0
    
    In [92]: 
    
    
    # 添加一行数据到DataFrame
    In [92]: df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
    
    In [93]: s1 = pd.Series([1,2,3,4], index=['a','b','c','d'])
    
    In [94]: res = df1.append(s1,ignore_index=True)
    
    In [95]: print(res)
         a    b    c    d
    0  0.0  0.0  0.0  0.0
    1  0.0  0.0  0.0  0.0
    2  0.0  0.0  0.0  0.0
    3  1.0  2.0  3.0  4.0
    
    In [96]: 

    Pandas merge合并

    # merge合并
    In [99]: import pandas as pd
    
    In [100]: left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
         ...:                      'A': ['A0', 'A1', 'A2', 'A3'],
         ...:                      'B': ['B0', 'B1', 'B2', 'B3']})
    
    In [101]: right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
         ...:                       'C': ['C0', 'C1', 'C2', 'C3'],
         ...:                       'D': ['D0', 'D1', 'D2', 'D3']})
    
    In [102]: 
    
    In [102]: print(left)
        A   B key
    0  A0  B0  K0
    1  A1  B1  K1
    2  A2  B2  K2
    3  A3  B3  K3
    
    In [103]: print(right)
        C   D key
    0  C0  D0  K0
    1  C1  D1  K1
    2  C2  D2  K2
    3  C3  D3  K3
    
    In [104]: 
    In [104]: res = pd.merge(left,right,on='key')
    
    In [105]: print(res)
        A   B key   C   D
    0  A0  B0  K0  C0  D0
    1  A1  B1  K1  C1  D1
    2  A2  B2  K2  C2  D2
    3  A3  B3  K3  C3  D3
    
    In [106]: 
    
    
    # consider two keys
    In [106]: left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
         ...:                       'key2': ['K0', 'K1', 'K0', 'K1'],
         ...:                       'A': ['A0', 'A1', 'A2', 'A3'],
         ...:                       'B': ['B0', 'B1', 'B2', 'B3']})
    
    In [107]: right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
         ...:                        'key2': ['K0', 'K0', 'K0', 'K0'],
         ...:                        'C': ['C0', 'C1', 'C2', 'C3'],
         ...:                        'D': ['D0', 'D1', 'D2', 'D3']})
    
    In [108]: print(left)
        A   B key1 key2
    0  A0  B0   K0   K0
    1  A1  B1   K0   K1
    2  A2  B2   K1   K0
    3  A3  B3   K2   K1
    
    In [109]: print(right)
        C   D key1 key2
    0  C0  D0   K0   K0
    1  C1  D1   K1   K0
    2  C2  D2   K1   K0
    3  C3  D3   K2   K0
    
    In [110]: res = pd.merge(left,right,on=['key1','key2'])
    
    In [111]: print(res)
        A   B key1 key2   C   D
    0  A0  B0   K0   K0  C0  D0
    1  A2  B2   K1   K0  C1  D1
    2  A2  B2   K1   K0  C2  D2
    
    
    # how={'left','right','inner','outer'}
    In [112]: res = pd.merge(left,right,on=['key1','key2'],how='inner')  # 默认就是inner模式
    
    In [113]: print(res)
        A   B key1 key2   C   D
    0  A0  B0   K0   K0  C0  D0
    1  A2  B2   K1   K0  C1  D1
    2  A2  B2   K1   K0  C2  D2
    
    In [114]: res = pd.merge(left,right,on=['key1','key2'],how='outer')
    
    In [115]: print(res)
         A    B key1 key2    C    D
    0   A0   B0   K0   K0   C0   D0
    1   A1   B1   K0   K1  NaN  NaN
    2   A2   B2   K1   K0   C1   D1
    3   A2   B2   K1   K0   C2   D2
    4   A3   B3   K2   K1  NaN  NaN
    5  NaN  NaN   K2   K0   C3   D3
    
    In [116]: 
    In [116]: res = pd.merge(left,right,on=['key1','key2'],how='left')
    
    In [117]: print(res)
        A   B key1 key2    C    D
    0  A0  B0   K0   K0   C0   D0
    1  A1  B1   K0   K1  NaN  NaN
    2  A2  B2   K1   K0   C1   D1
    3  A2  B2   K1   K0   C2   D2
    4  A3  B3   K2   K1  NaN  NaN
    
    In [118]: res = pd.merge(left,right,on=['key1','key2'],how='right')
    
    In [119]: print(res)
         A    B key1 key2   C   D
    0   A0   B0   K0   K0  C0  D0
    1   A2   B2   K1   K0  C1  D1
    2   A2   B2   K1   K0  C2  D2
    3  NaN  NaN   K2   K0  C3  D3
    
    In [120]: 
    
    
    
    # indicator
    In [121]: df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b']})
    
    In [122]: df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]})
    
    In [123]: print(df1)
       col1 col_left
    0     0        a
    1     1        b
    
    In [124]: print(df2)
       col1  col_right
    0     1          2
    1     2          2
    2     2          2
    
    In [125]: res = pd.merge(df1, df2, on='col1', how='outer', indicator=True) # 给一个提示
    
    In [126]: print(res)
       col1 col_left  col_right      _merge
    0     0        a        NaN   left_only
    1     1        b        2.0        both
    2     2      NaN        2.0  right_only
    3     2      NaN        2.0  right_only
    
    In [127]:
    In [129]: res = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column') # 指定提示的列名
    
    In [130]: print(res)
       col1 col_left  col_right indicator_column
    0     0        a        NaN        left_only
    1     1        b        2.0             both
    2     2      NaN        2.0       right_only
    3     2      NaN        2.0       right_only
    
    In [131]: 
    In [127]: res = pd.merge(df1, df2, on='col1', how='outer', indicator=False)
    
    In [128]: print(res)
       col1 col_left  col_right
    0     0        a        NaN
    1     1        b        2.0
    2     2      NaN        2.0
    3     2      NaN        2.0
    
    In [129]: 
    
    
    
    
    
    
    In [131]: left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
         ...:                      'B': ['B0', 'B1', 'B2']},
         ...:                      index=['K0', 'K1', 'K2'])
    
    In [132]: right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
         ...:                       'D': ['D0', 'D2', 'D3']},
         ...:                      index=['K0', 'K2', 'K3'])
    
    In [133]: print(left)
         A   B
    K0  A0  B0
    K1  A1  B1
    K2  A2  B2
    
    In [134]: print(right)
         C   D
    K0  C0  D0
    K2  C2  D2
    K3  C3  D3
    
    In [135]: res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
    
    In [136]: print(res)
          A    B    C    D
    K0   A0   B0   C0   D0
    K1   A1   B1  NaN  NaN
    K2   A2   B2   C2   D2
    K3  NaN  NaN   C3   D3
    
    In [137]: res = pd.merge(left, right, left_index=True, right_index=True, how='inner')
    
    In [138]: print(res)
         A   B   C   D
    K0  A0  B0  C0  D0
    K2  A2  B2  C2  D2
    
    In [139]: 
    
    
    
    
    
    
    # handle overlapping
    In [139]: boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
    
    In [140]: girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
    
    In [141]: print(boys)
       age   k
    0    1  K0
    1    2  K1
    2    3  K2
    
    In [142]: print(girls)
       age   k
    0    4  K0
    1    5  K0
    2    6  K3
    
    In [143]: res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')
    
    In [144]: print(res)
       age_boy   k  age_girl
    0        1  K0         4
    1        1  K0         5
    
    In [145]: res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='outer')
    
    In [146]: print(res)
       age_boy   k  age_girl
    0      1.0  K0       4.0
    1      1.0  K0       5.0
    2      2.0  K1       NaN
    3      3.0  K2       NaN
    4      NaN  K3       6.0
    
    In [147]:   

    关于Concat 函数、Merge 函数和 Join 函数

    Pandas Moving Window Functions

    Pandas plot可视化

    #!/usr/bin/python2.7
    
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    
    # Series: 1000 random samples indexed 0..999, accumulated with cumsum()
    samples = np.random.randn(1000)
    positions = np.arange(1000)
    data = pd.Series(samples, index=positions).cumsum()

    # Draw the accumulated series and show the figure window
    data.plot()
    plt.show()

    #!/usr/bin/python2.7
    
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    
    # DataFrame: 1000 rows x 4 columns (A-D) of random samples,
    # accumulated down each column with cumsum()
    row_labels = np.arange(1000)
    column_labels = list("ABCD")
    data = pd.DataFrame(np.random.randn(1000, 4),
                        index=row_labels,
                        columns=column_labels).cumsum()
    # print(data.head(6))   # uncomment to preview the first six rows

    # One line per column, then show the figure window
    data.plot()
    plt.show()
    

    #!/usr/bin/python2.7
    
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    
    # DataFrame: 1000 rows x 4 columns (A-D) of random samples,
    # accumulated down each column with cumsum()
    row_labels = np.arange(1000)
    column_labels = list("ABCD")
    data = pd.DataFrame(np.random.randn(1000, 4),
                        index=row_labels,
                        columns=column_labels).cumsum()
    # print(data.head(6))   # uncomment to preview the first six rows

    # Available plot methods include:
    #     'bar','hist','box','kde','area','scatter','pie','hexbin'...
    # The first scatter call returns its axes; passing them as ax= to the
    # second call draws both point sets on the same axes.
    shared_axes = data.plot.scatter(x='A', y='B', color='DarkBlue', label='Class AB')
    data.plot.scatter(x='A', y='C', color='DarkGreen', label='Class AC', ax=shared_axes)
    plt.show()
    

    补充:Matplotlib 3D图像

    #!/usr/bin/python2.7
    
    import numpy as np
    import matplotlib.pyplot as plt
    from mpl_toolkits.mplot3d import Axes3D
    
    fig = plt.figure()
    # Create the 3D axes through the figure so they are attached to it.
    # Constructing Axes3D(fig) directly no longer adds the axes to the figure
    # in recent Matplotlib releases, which leaves the window empty.
    # NOTE(review): older Matplotlib needs the existing
    # `from mpl_toolkits.mplot3d import Axes3D` import to register the '3d'
    # projection, so keep that import even though the name is no longer used.
    ax = fig.add_subplot(111, projection='3d')

    # X,Y value: a square grid over [-4, 4) with step 0.25
    X = np.arange(-4,4,0.25)
    Y = np.arange(-4,4,0.25)
    X,Y = np.meshgrid(X,Y)
    # Distance of every grid point from the origin
    R = np.sqrt(X**2+Y**2)

    # height value
    Z = np.sin(R)

    # rstride/cstride of 1 use every grid row and column for the surface
    ax.plot_surface(X,Y,Z,rstride=1,cstride=1,cmap=plt.get_cmap('rainbow'))

    plt.show()
    

    #!/usr/bin/python2.7
    
    import numpy as np
    import matplotlib.pyplot as plt
    from mpl_toolkits.mplot3d import Axes3D
    
    fig = plt.figure()
    # Create the 3D axes through the figure so they are attached to it.
    # Constructing Axes3D(fig) directly no longer adds the axes to the figure
    # in recent Matplotlib releases, which leaves the window empty.
    # NOTE(review): older Matplotlib needs the existing
    # `from mpl_toolkits.mplot3d import Axes3D` import to register the '3d'
    # projection, so keep that import even though the name is no longer used.
    ax = fig.add_subplot(111, projection='3d')

    # X,Y value: a square grid over [-4, 4) with step 0.25
    X = np.arange(-4,4,0.25)
    Y = np.arange(-4,4,0.25)
    X,Y = np.meshgrid(X,Y)
    # Distance of every grid point from the origin
    R = np.sqrt(X**2+Y**2)

    # height value
    Z = np.sin(R)

    # rstride/cstride of 1 use every grid row and column for the surface
    ax.plot_surface(X,Y,Z,rstride=1,cstride=1,cmap=plt.get_cmap('rainbow'))

    # Add filled contours of Z, drawn flat at z = -2 underneath the surface
    ax.contourf(X,Y,Z,zdir='z',offset=-2,cmap='rainbow')

    # Extend the z axis down to -2 so the contour plane is inside the view
    ax.set_zlim(-2,2)

    plt.show()
    

    参考:https://github.com/MorvanZhou

    参考:https://morvanzhou.github.io/tutorials/

  • 相关阅读:
    [BJOI2012]最多的方案(记忆化搜索)
    our happy ending(状压dp)
    [NOI2005]聪聪与可可(期望dp)
    CF983A Finite or not?(数学)
    [POI2012]STU-Well(二分答案+神仙操作)
    作诗2(玄学)
    IncDec Sequence(差分)
    [Vani有约会]雨天的尾巴(树上差分+线段树合并)
    合法括号序列(dp+组合数学)
    [SHOI2014]概率充电器(概率+换根dp)
  • 原文地址:https://www.cnblogs.com/standby/p/9462636.html
Copyright © 2020-2023  润新知