• Python 数据科学系列 の Numpy、Series 和 DataFrame介绍


    本課主題

    • Numpy 的介绍和操作实战
    • Series 的介绍和操作实战
    • DataFrame 的介绍和操作实战

    Numpy 的介绍和操作实战

    numpy 是 Python 在数据计算领域里很常用的模块 

    import numpy as np
    np.array([11,22,33]) #接受一个列表数据
    1. 创建 numpy array
      >>> import numpy as np
      >>> mylist = [1,2,3]
      >>> x = np.array(mylist)
      >>> x
      array([1, 2, 3])
      >>> y = np.array([4,5,6])
      >>> y
      array([4, 5, 6])
      >>> m = np.array([[7,8,9],[10,11,12]])
      >>> m
      array([[ 7,  8,  9],
             [10, 11, 12]])
      创建 numpy array(例子)
    2. 查看 numpy array 的形状 (shape)
      >>> m.shape #array([[ 7,  8,  9], [10, 11, 12]])
      (2, 3)
      
      >>> x.shape #array([1, 2, 3])
      (3,)
      
      >>> y.shape #array([4, 5, 6])
      (3,)
      View Code
    3. numpy.arange
      >>> n = np.arange(0,30,2)
      >>> n
      array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28])
      numpy.arange( )(例子)
    4. 改变 numpy array 的形状
      >>> n = np.arange(0,30,2)
      >>> n
      array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28])
      >>> n.shape
      (15,)
      
      >>> n = n.reshape(3,5) #从含15个元素的一维数组改成3行5列
      >>> n
      
      array([[ 0,  2,  4,  6,  8],
             [10, 12, 14, 16, 18],
             [20, 22, 24, 26, 28]])
      numpy.reshape( )(例子一)
      >>> o = np.linspace(0,4,9)
      >>> o
      array([ 0. ,  0.5,  1. ,  1.5,  2. ,  2.5,  3. ,  3.5,  4. ])
      >>> o.resize(3,3)
      >>> o
      array([[ 0. ,  0.5,  1. ],
             [ 1.5,  2. ,  2.5],
             [ 3. ,  3.5,  4. ]])
      numpy array 的 resize( )(例子二)
    5. numpy.ones( ) ,numpy.zeros( ),numpy.eye( )
      >>> r1 = np.ones((3,2))
      >>> r1
      array([[ 1.,  1.],
             [ 1.,  1.],
             [ 1.,  1.]])
      
      >>> r1 = np.zeros((2,3))
      >>> r1
      array([[ 0.,  0.,  0.],
             [ 0.,  0.,  0.]])
      
      >>> r2 = np.eye(3)
      >>> r2
      array([[ 1.,  0.,  0.],
             [ 0.,  1.,  0.],
             [ 0.,  0.,  1.]])
      numpy.ones/zeros/eye( )(例子)

      可以定义整数

      >>> r5 = np.ones([2,3], int)
      >>> r5
      array([[1, 1, 1],
             [1, 1, 1]])
      
      >>> r5 = np.ones([2,3])
      >>> r5
      array([[ 1.,  1.,  1.],
             [ 1.,  1.,  1.]])
      numpy.ones(x,int)(例子)
    6. numpy.diag( )
      >>> y = np.array([4,5,6])
      >>> y
      array([4, 5, 6])
      
      >>> np.diag(y)
      array([[4, 0, 0],
             [0, 5, 0],
             [0, 0, 6]])
      diag( )(例子)
    7. 复制 numpy array
      >>> r3 = np.array([1,2,3] * 3)
      >>> r3
      array([1, 2, 3, 1, 2, 3, 1, 2, 3])
      
      >>> r4 = np.repeat([1,2,3],3)
      >>> r4
      array([1, 1, 1, 2, 2, 2, 3, 3, 3])
      复制numpy array(例子)
    8. numpy中的 vstack和 hstack
      >>> r5 = np.ones([2,3], int)
      >>> r5
      array([[1, 1, 1],
             [1, 1, 1]])
      
      >>> r6 = np.vstack([r5,2*r5])
      >>> r6
      array([[1, 1, 1],
             [1, 1, 1],
             [2, 2, 2],
             [2, 2, 2]])
      
      >>> r7 = np.hstack([r5,2*r5])
      >>> r7
      array([[1, 1, 1, 2, 2, 2],
             [1, 1, 1, 2, 2, 2]])
      numpy.vstack( )和np.hstack( )(例子)
    9. numpy 中的加减乘除操作一 (+-*/)
      >>> mylist = [1,2,3]
      >>> x = np.array(mylist)
      >>> y = np.array([4,5,6])
      
      >>> x+y
      array([5, 7, 9])
      
      >>> x-y
      array([-3, -3, -3])
      
      >>> x*y
      array([ 4, 10, 18])
      
      >>> x**2
      array([1, 4, 9])
      
      >>> x.dot(y)
      32
      numpy中的加减乘除(例子一)
    10. numpy 中的加减乘除操作二:sum( )、max( )、min( )、mean( )、std( )
      >>> a = np.array([1,2,3,4,5])
      >>> a.sum()
      15
      
      >>> a.max()
      5
      
      >>> a.min()
      1
      
      >>> a.mean()
      3.0
      
      >>> a.std()
      1.4142135623730951
      
      >>> a.argmax()
      4
      
      >>> a.argmin()
      0
      numpy中的加减乘除(例子二)
    11. 查看numpy array 的数据类型
      >>> y = np.array([4,5,6])
      >>> z = np.array([y, y**2])
      >>> z
      array([[ 4,  5,  6],
             [16, 25, 36]])
      
      >>> z.shape
      (2, 3)
      
      >>> z.T.shape
      (3, 2)
      
      >>> z.dtype
      dtype('int64')
      
      >>> z = z.astype('f')
      
      >>> z.dtype
      dtype('float32')
      numpy array 的数据类型
    12. numpy 中的索引和切片
      >>> s = np.arange(13)
      >>> s
      array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])
      
      >>> s = np.arange(13) ** 2
      >>> s
      array([  0,   1,   4,   9,  16,  25,  36,  49,  64,  81, 100, 121, 144])
      
      >>> s[0],s[4],s[0:3]
      (0, 16, array([0, 1, 4]))
      
      >>> s[1:5]
      array([ 1,  4,  9, 16])
      
      >>> s[-4:]
      array([ 81, 100, 121, 144])
      
      >>> s[-5:-2]
      array([ 64,  81, 100])
      numpy索引和切片(例子一)
      >>> r = np.arange(36)
      >>> r.resize((6,6))
      >>> r
      array([[ 0,  1,  2,  3,  4,  5],
             [ 6,  7,  8,  9, 10, 11],
             [12, 13, 14, 15, 16, 17],
             [18, 19, 20, 21, 22, 23],
             [24, 25, 26, 27, 28, 29],
             [30, 31, 32, 33, 34, 35]])
      
      >>> r[2,2]
      14
      
      >>> r[3,3:6]
      array([21, 22, 23])
      
      >>> r[:2,:-1]
      array([[ 0,  1,  2,  3,  4],
             [ 6,  7,  8,  9, 10]])
      
      >>> r[-1,::2]
      array([30, 32, 34])
      
      >>> r[r > 30] #取r大于30的数据
      array([31, 32, 33, 34, 35])
      
      >>> re2 = r[r > 30] = 30
      >>> re2
      30
      >>> r8 = r[:3,:3]
      >>> r8
      
      array([[ 0,  1,  2],
             [ 6,  7,  8],
             [12, 13, 14]])
      
      >>> r8[:] = 0
      
      >>> r8
      array([[0, 0, 0],
             [0, 0, 0],
             [0, 0, 0]])
      
      >>> r 
      array([[ 0,  0,  0,  3,  4,  5],
             [ 0,  0,  0,  9, 10, 11],
             [ 0,  0,  0, 15, 16, 17],
             [18, 19, 20, 21, 22, 23],
             [24, 25, 26, 27, 28, 29],
             [30, 30, 30, 30, 30, 30]])
      numpy索引和切片(例子二)
    13. copy numpy array 的数组
      >>> r = np.arange(36)
      >>> r.resize((6,6))
      >>> r_copy = r.copy()
      >>> r
      array([[ 0,  1,  2,  3,  4,  5],
             [ 6,  7,  8,  9, 10, 11],
             [12, 13, 14, 15, 16, 17],
             [18, 19, 20, 21, 22, 23],
             [24, 25, 26, 27, 28, 29],
             [30, 31, 32, 33, 34, 35]])
      
      >>> r_copy
      array([[ 0,  1,  2,  3,  4,  5],
             [ 6,  7,  8,  9, 10, 11],
             [12, 13, 14, 15, 16, 17],
             [18, 19, 20, 21, 22, 23],
             [24, 25, 26, 27, 28, 29],
             [30, 31, 32, 33, 34, 35]])
      
      >>> r_copy[:] = 10
      
      >>> r_copy
      array([[10, 10, 10, 10, 10, 10],
             [10, 10, 10, 10, 10, 10],
             [10, 10, 10, 10, 10, 10],
             [10, 10, 10, 10, 10, 10],
             [10, 10, 10, 10, 10, 10],
             [10, 10, 10, 10, 10, 10]])
      copy( )例子
    14. 其他操作
      >>> test = np.random.randint(0,10,(4,3))
      >>> test
      array([[3, 5, 2],
             [7, 7, 9],
             [8, 9, 2],
             [2, 9, 1]])
      
      >>> for row in test:
      ...     print(row)
      ... 
      [3 5 2]
      [7 7 9]
      [8 9 2]
      [2 9 1]
      
      >>> for i in range(len(test)):
      ...     print(test[i])
      ... 
      [3 5 2]
      [7 7 9]
      [8 9 2]
      [2 9 1]
      
      >>> for i, row in enumerate(test):
      ...     print('row', i, 'is', row)
      ... 
      row 0 is [3 5 2]
      row 1 is [7 7 9]
      row 2 is [8 9 2]
      row 3 is [2 9 1]
      
      >>> test2 = test ** 2
      >>> test2
      array([[ 9, 25,  4],
             [49, 49, 81],
             [64, 81,  4],
             [ 4, 81,  1]])
      
      >>> for i,j, in zip(test,test2):
      ...     print(i, '+', j, '=', i + j)
      ... 
      [3 5 2] + [ 9 25  4] = [12 30  6]
      [7 7 9] + [49 49 81] = [56 56 90]
      [8 9 2] + [64 81  4] = [72 90  6]
      [2 9 1] + [ 4 81  1] = [ 6 90  2]
      >>> 
      numpy array 的其他操作例子

    Series 的介绍和操作实战

    如果是输入一个字典类型的话,字典的键会自动变成 Index,然后它的值是Value

    from pandas import Series, DataFrame
    import pandas as pd
    pd.Series(['Dog','Bear','Tiger','Moose','Giraffe','Hippopotamus','Mouse'], name='Animals') #接受一个列表类型的数据
    def __init__(self, data=None, index=None, dtype=None, name=None,
                     copy=False, fastpath=False):
    Series的__init__方法

    1. 创建 Series 类型
      第一:你可以传入一个列表或者是字典来创建 Series,如果传入的是列表,pandas 会自动把 [0,1,2] 作为 Series 的索引。
      第二:如果你传入的是字符串类型的数据,Series 返回的dtype是object;如果你传入的是数字类型的数据,Series 返回的dtype是int64
      >>> from pandas import Series, DataFrame
      >>> import pandas as pd
      >>> animals = ['Tiger','Bear','Moose']
      
      >>> s1 = pd.Series(animals) 
      >>> s1
      0    Tiger
      1     Bear
      2    Moose
      dtype: object
      
      >>> s2 = pd.Series([1,2,3])
      >>> s2
      0    1
      1    2
      2    3
      dtype: int64
      创建 Series

      Series如何处理 NaN的数据?

      >>> animals2 = ['Tiger','Bear',None]
      >>> s3 = pd.Series(animals2) 
      >>> s3
      0    Tiger
      1     Bear
      2     None
      dtype: object
      
      >>> s4 = pd.Series([1,2,None]) 
      >>> s4
      0    1.0
      1    2.0
      2    NaN
      dtype: float64
      Series NaN数据(范例)
    2. Series 中的 NaN 数据:NaN 不能用 == 来比较(np.nan == np.nan 的结果是 False),要检查数据是否为 NaN,需要调用 np.isnan( ) 方法
      >>> import numpy as np
      >>> np.nan == None
      False
      
      >>> np.nan == np.nan
      False
      
      >>> np.isnan(np.nan)
      True
      np.isnan( )
    3. Series 默认 Index 是 [0,1,2],但也可以自定义 Series 中的Index
      >>> import numpy as np
      >>> sports = {
      ...     'Archery':'Bhutan',
      ...     'Golf':'Scotland',
      ...     'Sumo':'Japan',
      ...     'Taekwondo':'South Korea'
      ... }
      
      >>> s5 = pd.Series(sports)
      >>> s5
      Archery           Bhutan
      Golf            Scotland
      Sumo               Japan
      Taekwondo    South Korea
      dtype: object
      
      >>> s5.index
      Index(['Archery', 'Golf', 'Sumo', 'Taekwondo'], dtype='object')
      自定义 Series 中的Index(例子一)
      >>> from pandas import Series, DataFrame
      >>> import pandas as pd
      >>> s6 = pd.Series(['Tiger','Bear','Moose'], index=['India','America','Canada'])
      >>> s6
      India      Tiger
      America     Bear
      Canada     Moose
      dtype: object
      自定义 Series 中的Index(例子二)
    4. 查询 Series 的数据有两种方法,第一是通过index方法 e.g. s.iloc[2];第二是通过label方法 e.g. s.loc['America']
      >>> from pandas import Series, DataFrame
      >>> import pandas as pd
      >>> s6
      India      Tiger
      America     Bear
      Canada     Moose
      dtype: object
      
      >>> s6.iloc[2] #获取 index2位置的数据
      'Moose'
      
      >>> s6.loc['America'] #获取 label: America 的值
      'Bear'
      
      >>> s6[1] #底层调用了 s6.iloc[1]
      'Bear'
      
      >>> s6['India'] #底层调用了 s6.loc['India']
      'Tiger'
      查询Series(例子)
    5. Series 的数据操作: sum( ),它底层也是调用 numpy 的方法
      >>> s7 = pd.Series([100.00,120.00,101.00,3.00])
      >>> s7
      0    100.0
      1    120.0
      2    101.0
      3      3.0
      dtype: float64
      
      >>> total = 0
      >>> for item in s7:
      ...     total +=item
      ... 
      >>> total
      324.0
      
      >>> total2 =  np.sum(s7)
      >>> total2
      324.0
      np.sum(s7)
      >>> s8 = pd.Series(np.random.randint(0,1000,10000))
      >>> s8.head()
      0     25
      1    399
      2    326
      3    479
      4    603
      dtype: int64
      >>> len(s8)
      10000
      head( )例子
    6. Series 也可以存储混合型数据
      >>> s9 = pd.Series([1,2,3])
      >>> s9.loc['Animals'] = 'Bears'
      >>> s9
      0              1
      1              2
      2              3
      Animals    Bears
      dtype: object
      混合型存储数据(例子)
    7. Series 中的 append( ) 用法(注意:Series.append 在 pandas 1.4 中被弃用、在 2.0 中被移除,新版本请改用 pd.concat([s1, s2]))
      >>> original_sports = pd.Series({'Archery':'Bhutan',
      ...                              'Golf':'Scotland',
      ...                              'Sumo':'Japan',
      ...                              'Taekwondo':'South Korea'})
      >>> cricket_loving_countries = pd.Series(['Australia', 'Barbados','Pakistan','England'],
      ...                                      index=['Cricket','Cricket','Cricket','Cricket'])
      >>> all_countries = original_sports.append(cricket_loving_countries)
      
      >>> original_sports
      Archery           Bhutan
      Golf            Scotland
      Sumo               Japan
      Taekwondo    South Korea
      dtype: object
      
      >>> cricket_loving_countries
      Cricket    Australia
      Cricket     Barbados
      Cricket     Pakistan
      Cricket      England
      dtype: object
      
      >>> all_countries
      Archery           Bhutan
      Golf            Scotland
      Sumo               Japan
      Taekwondo    South Korea
      Cricket        Australia
      Cricket         Barbados
      Cricket         Pakistan
      Cricket          England
      dtype: object
      Series类型的append( )

    DataFrame

    这是创建一个DataFrame对象的基本语句:接受字典类型的数据;字典中的Key (e.g. Animals, Owners) 对应 DataFrame中的Columns,每个 Key 对应的 Value(列表)就是该列在各行上的数据,相当于数据库表中的一列。 

    data = {
            'Animals':['Dog','Bear','Tiger','Moose','Giraffe','Hippopotamus','Mouse'],
            'Owners':['Chris','Kevyn','Bob','Vinod','Daniel','Fil','Stephanie']
    }
    df = DataFrame(data, columns=['Animals','Owners'])

     

    基础操作

    1. 创建DataFrame
      >>> from pandas import Series, DataFrame
      >>> import pandas as pd
      >>> data = {'name':['yahoo','google','facebook'],
      ...         'marks':[200,400,800],
      ...         'price':[9,3,7]}
      >>> df = DataFrame(data)
      >>> df
         marks      name  price
      0    200     yahoo      9
      1    400    google      3
      2    800  facebook      7
      创建DataFrame(例子一)
      >>> df2 = DataFrame(data, columns=['name','price','marks'])
      >>> df2
             name  price  marks
      0     yahoo      9    200
      1    google      3    400
      2  facebook      7    800
      
      >>> df3 = DataFrame(data, columns=['name','price','marks'], index=['a','b','c'])
      >>> df3
             name  price  marks
      a     yahoo      9    200
      b    google      3    400
      c  facebook      7    800
      
      >>> df4 = DataFrame(data, columns=['name','price','marks', 'debt'], index=['a','b','c'])
      >>> df4
             name  price  marks debt
      a     yahoo      9    200  NaN
      b    google      3    400  NaN
      c  facebook      7    800  NaN
      创建DataFrame(例子二)
      >>> import pandas as pd
      >>> purchase_1 = pd.Series({'Name':'Chris','Item Purchased':'Dog Food','Cost':22.50})
      >>> purchase_2 = pd.Series({'Name':'Kelvin','Item Purchased':'Kitty Litter','Cost':2.50})
      >>> purchase_3 = pd.Series({'Name':'Vinod','Item Purchased':'Bird Seed','Cost':5.00})
      >>> 
      >>> df = pd.DataFrame([purchase_1,purchase_2,purchase_3],index=['Store 1','Store 2','Store 1'])
      >>> df
               Cost Item Purchased    Name
      Store 1  22.5       Dog Food   Chris
      Store 2   2.5   Kitty Litter  Kelvin
      Store 1   5.0      Bird Seed   Vinod
      创建DataFrame(例子三)
    2. 查询 dataframe 的index:df.loc['index']
      >>> df.loc['Store 2']
      Cost                       2.5
      Item Purchased    Kitty Litter
      Name                    Kelvin
      Name: Store 2, dtype: object
      df.loc['Store 2']
      >>> df.loc['Store 1']
               Cost Item Purchased   Name
      Store 1  22.5       Dog Food  Chris
      Store 1   5.0      Bird Seed  Vinod
      df.loc['Store 1']
      >>> df['Item Purchased']
      Store 1        Dog Food
      Store 2    Kitty Litter
      Store 1       Bird Seed
      Name: Item Purchased, dtype: object
      df['Item Purchased']
    3. 查 store1 的 cost 是多少
      >>> df.loc['Store 1', 'Cost']
      Store 1    22.5
      Store 1     5.0
      Name: Cost, dtype: float64
      df.loc['Store 1', 'Cost']
    4. 查询Cost大于3的Name
      >>> df['Name'][df['Cost']>3]
      Store 1    Chris
      Store 1    Vinod
      Name: Name, dtype: object
      df['Name'][df['Cost']>3]
    5. 查询DataFrame 的类型
      >>> type(df.loc['Store 2'])
      <class 'pandas.core.series.Series'>
      type( )例子
    6. drop dataframe (但这不会把原来的 dataframe drop 掉)
      >>> df.drop('Store 1')
               Cost Item Purchased    Name
      Store 2   2.5   Kitty Litter  Kelvin
      
      >>> df
               Cost Item Purchased    Name
      Store 1  22.5       Dog Food   Chris
      Store 2   2.5   Kitty Litter  Kelvin
      Store 1   5.0      Bird Seed   Vinod
      df.drop('Store 1')
      >>> copy_df = df.copy()
      >>> copy_df
               Cost Item Purchased    Name
      Store 1  22.5       Dog Food   Chris
      Store 2   2.5   Kitty Litter  Kelvin
      Store 1   5.0      Bird Seed   Vinod
      >>> copy_df = df.drop('Store 1')
      >>> copy_df
               Cost Item Purchased    Name
      Store 2   2.5   Kitty Litter  Kelvin
      把dataframe数据drop的例子

      也可以用 del 把 Column 列删除掉

      >>> del copy_df['Name']
      >>> copy_df
               Cost Item Purchased
      Store 2   2.5   Kitty Litter
      del copy_df['Name']
    7. set_index
    8. rename column
    9. 可以修改dataframe里的数据
      >>> df = pd.DataFrame([purchase_1,purchase_2,purchase_3],index=['Store 1','Store 2','Store 1'])
      >>> df
               Cost Item Purchased    Name
      Store 1  22.5       Dog Food   Chris
      Store 2   2.5   Kitty Litter  Kelvin
      Store 1   5.0      Bird Seed   Vinod
      
      >>> df['Cost'] = df['Cost'] * 0.8
      >>> df
               Cost Item Purchased    Name
      Store 1  18.0       Dog Food   Chris
      Store 2   2.0   Kitty Litter  Kelvin
      Store 1   4.0      Bird Seed   Vinod
      df['Cost'] * 0.8
      >>> df = pd.DataFrame([purchase_1,purchase_2,purchase_3],index=['Store 1','Store 2','Store 1'])
      >>> costs = df['Cost']
      >>> costs
      Store 1    22.5
      Store 2     2.5
      Store 1     5.0
      Name: Cost, dtype: float64
      >>> costs += 2
      >>> costs
      Store 1    24.5
      Store 2     4.5
      Store 1     7.0
      Name: Cost, dtype: float64
      costs = df['Cost']

     

    进阶操作

    1. Merge
      Full Outer Join
      Inner Join
      Left Join
      Right Join
    2. apply
    3. group by
    4. agg
    5. astype
    6. cut
      s = pd.Series([168, 180, 174, 190, 170, 185, 179, 181, 175, 169, 182, 177, 180, 171])
      pd.cut(s, 3)
      pd.cut(s, 3, labels=['Small', 'Medium', 'Large'])
      cut( )
    7. pivot table 

    Date in DataFrame

    1. Timestamp
    2. period
    3. DatetimeIndex
    4. PeriodIndex
    5. to_datetime
    6. Timedelta
    7. date_range
    8. difference between date value
    9. resample
    10. asfreq - changing the frequency of the date

    读取 csv 文件

    import pandas as pd
    pd.read_csv('student.csv')
    1. 读取csv
      >>> from pandas import Series, DataFrame
      >>> import pandas as pd
      >>> df_student = pd.read_csv('student.csv')
      >>> df_student
              name   class  marks  age
      0     janice  python     80   22
      1       alex  python     95   21
      2      peter  python     85   25
      3        ken    java     75   28
      4  lawerance    java     50   22
      pd.read_csv('student.csv')(例子一)
      df_student = pd.read_csv('student.csv', index_col=0, skiprows=1)
      pd.read_csv('student.csv')(例子二)
    2. 获取分数大于70的数据
      >>> df_student['marks'] > 70
      0     True
      1     True
      2     True
      3     True
      4    False
      Name: marks, dtype: bool
      方法一: df_student['marks'] > 70
      >>> df_student.where(df_student['marks']>70)
           name   class  marks   age
      0  janice  python   80.0  22.0
      1    alex  python   95.0  21.0
      2   peter  python   85.0  25.0
      3     ken    java   75.0  28.0
      4     NaN     NaN    NaN   NaN
      方法二: df_student.where(df_student['marks']>70)
      >>> df_student[df_student['marks'] > 70]
           name   class  marks  age
      0  janice  python     80   22
      1    alex  python     95   21
      2   peter  python     85   25
      3     ken    java     75   28
      方法三: df_student[df_student['marks'] > 70]
    3. 获取class = 'python' 的数据,df.count( ) 是不会把 NaN数据计算在其中
      >>> df2 = df_student.where(df_student['class'] == 'python') 
      >>> df2
           name   class  marks   age
      0  janice  python   80.0  22.0
      1    alex  python   95.0  21.0
      2   peter  python   85.0  25.0
      3     NaN     NaN    NaN   NaN
      4     NaN     NaN    NaN   NaN
      
      >>> df2 = df_student[df_student['class'] == 'python']
      >>> df2
           name   class  marks  age
      0  janice  python     80   22
      1    alex  python     95   21
      2   peter  python     85   25
      df_student.where( )例子
    4. 计算 class 的数目 e.g. count( )
      >>> df2['class'].count() #不会把 NaN也计算
      3
      
      >>> df_student['class'].count() #count( ) 同样不计算 NaN;df_student 中没有 NaN,所以结果是全部 5 行
      5
      df.count( )例子
    5. 删除NaN数据
      >>> df3 = df2.dropna()
      >>> df3
           name   class  marks   age
      0  janice  python   80.0  22.0
      1    alex  python   95.0  21.0
      2   peter  python   85.0  25.0
      df2.dropna()
    6. 获取age大于23 学生的数据
      >>> df_student
              name   class  marks  age
      0     janice  python     80   22
      1       alex  python     95   21
      2      peter  python     85   25
      3        ken    java     75   28
      4  lawerance    java     50   22
      
      >>> df_student[df_student['age'] > 23]
          name   class  marks  age
      2  peter  python     85   25
      3    ken    java     75   28
      
      >>> df_student['age'] > 23
      0    False
      1    False
      2     True
      3     True
      4    False
      Name: age, dtype: bool
      
      >>> len(df_student[df_student['age'] > 23])
      2
      df_student[df_student['age'] > 23]
    7. 获取age大于23 且 分数大于80分学生的数据
      >>> df_student
              name   class  marks  age
      0     janice  python     80   22
      1       alex  python     95   21
      2      peter  python     85   25
      3        ken    java     75   28
      4  lawerance    java     50   22
      >>> df_and = df_student[(df_student['age'] > 23) & (df_student['marks'] > 80)]
      >>> df_and
          name   class  marks  age
      2  peter  python     85   25
      df_student[(df_student['age'] > 23) & (df_student['marks'] > 80)]
    8. 获取age大于23 或 分数大于80分学生的数据
      >>> df_student
              name   class  marks  age
      0     janice  python     80   22
      1       alex  python     95   21
      2      peter  python     85   25
      3        ken    java     75   28
      4  lawerance    java     50   22
      
      >>> df_or = df_student[(df_student['age'] > 23) | (df_student['marks'] > 80)]
      >>> df_or
          name   class  marks  age
      1   alex  python     95   21
      2  peter  python     85   25
      3    ken    java     75   28
      df_student[(df_student['age'] > 23) | (df_student['marks'] > 80)]
    9. 重新定义index的数值 df.set_index( )
      >>> df_student = pd.read_csv('student.csv')
      >>> df_student
              name   class  marks  age
      0     janice  python     80   22
      1       alex  python     95   21
      2      peter  python     85   25
      3        ken    java     75   28
      4  lawerance    java     50   22
      
      >>> df_student['order_id'] = df_student.index
      >>> df_student
              name   class  marks  age  order_id
      0     janice  python     80   22         0
      1       alex  python     95   21         1
      2      peter  python     85   25         2
      3        ken    java     75   28         3
      4  lawerance    java     50   22         4
      
      >>> df_student = df_student.set_index('class')
      >>> df_student
                   name  marks  age  order_id
      class                                  
      python     janice     80   22         0
      python       alex     95   21         1
      python      peter     85   25         2
      java          ken     75   28         3
      java    lawerance     50   22         4
      df_student.set_index( )例子
    10. 获取在 dataframe column 中唯一的数据
      >>> df_student = pd.read_csv('student.csv')
      >>> df_student['class'].unique()
      array(['python', 'java'], dtype=object)
      df.unique( )例子

    python 的可视化 matplotlib

    1. plot

       

    參考資料

    Coursera: Introduction to Data Science in Python

    Data Science (Chris Albon)

    Data Science: GoodHart's Law | Goodhart's Law

    Pandas文档、Pandas中文文档

     

  • 相关阅读:
    Spring@Profile注解
    day 32 子进程的开启 及其用法
    day 31 udp 协议SOCK_DGRAM
    day 30 客户端获取cmd 命令的步骤
    day 29 socket 理论
    day 29 socket 初级版
    有关 组合 继承
    day 27 多态 接口 类方法 静态方法 hashlib 摘要算法模块
    新式类和经典类的区别
    day 28 hasattr getattr setattr delattr 和带__内置__ 类的内置方法
  • 原文地址:https://www.cnblogs.com/jcchoiling/p/5928452.html
Copyright © 2020-2023  润新知