• pandas入门:pandas的数据结构介绍


    Series

    import numpy as np
    import pandas as pd
    from pandas import Series, DataFrame
    
    # A Series is a one-dimensional, array-like object: a sequence of values
    # (any NumPy dtype) paired with a sequence of data labels, the index.
    obj = pd.Series([4, 7, -5, 3])
    print(obj)
    '''
    0    4
    1    7
    2   -5
    3    3
    dtype: int64
    '''
    # Labels are printed on the left and values on the right; the `values` and
    # `index` attributes expose the underlying array and the index object.
    print(obj.values)  # [ 4  7 -5  3]
    print(obj.index)  # RangeIndex(start=0, stop=4, step=1)
    
    # Build a Series whose index labels each data point explicitly.
    obj2 = pd.Series([4, 7, -5, 1], index=list('dbac'))
    print(obj2)
    '''
    d    4
    b    7
    a   -5
    c    1
    dtype: int64
    '''
    print(obj2.index)  # Index(['d', 'b', 'a', 'c'], dtype='object')
    
    # Unlike a plain NumPy array, a Series lets you select a single value or a
    # group of values by label.
    print(obj2['a'])  # -5
    obj2['d'] = 6
    selected_labels = ['c', 'a', 'd']
    print(obj2[selected_labels])
    '''
    c    1
    a   -5
    d    6
    dtype: int64
    '''
    # Boolean filtering, scalar multiplication and NumPy ufuncs all keep the
    # link between each label and its value.
    positive_mask = obj2 > 0
    print(obj2[positive_mask])
    '''
    d    6
    b    7
    c    1
    dtype: int64
    '''
    print(obj2 * 2)
    '''
    d    12
    b    14
    a   -10
    c     2
    dtype: int64
    '''
    print(np.exp(obj2))
    '''
    d     403.428793
    b    1096.633158
    a       0.006738
    c       2.718282
    dtype: float64
    '''
    # Membership tests check the index labels, like keys of a dict.
    print('b' in obj2)  # True
    print('e' in obj2)  # False
    
    # Create a Series from a dict: keys become the index.
    # NOTE: the listing below shows the keys sorted alphabetically, as printed
    # by older pandas releases; newer releases keep the dict's insertion order
    # (Ohio, Texas, Oregon, Utah).
    sdata = {
        'Ohio': 35000,
        'Texas': 71000,
        'Oregon': 16000,
        'Utah': 5000,
    }
    obj3 = pd.Series(sdata)
    print(obj3)
    '''
    Ohio      35000
    Oregon    16000
    Texas     71000
    Utah       5000
    dtype: int64
    '''
    states = ['California', 'Ohio', 'Oregon', 'Texas']
    obj4 = pd.Series(sdata, index=states)
    print(obj4)
    '''
    California        NaN
    Ohio          35000.0
    Oregon        16000.0
    Texas         71000.0
    dtype: float64
    '''
    # The three sdata values whose keys match a label in `states` are placed at
    # the matching positions. 'California' has no entry in sdata, so it becomes
    # NaN (not a number), which pandas uses to mark missing / NA values.
    # pandas' isnull and notnull functions detect missing values.
    print(pd.isnull(obj4))
    '''
    California     True
    Ohio          False
    Oregon        False
    Texas         False
    dtype: bool
    '''
    print(pd.notnull(obj4))
    '''
    California    False
    Ohio           True
    Oregon         True
    Texas          True
    dtype: bool
    '''
    # The same check is also available as an instance method.
    print(obj4.isnull())
    '''
    California     True
    Ohio          False
    Oregon        False
    Texas         False
    dtype: bool
    '''
    # A key feature of Series: arithmetic automatically aligns on index labels.
    print(obj3 + obj4)
    '''
    California         NaN
    Ohio           70000.0
    Oregon         32000.0
    Texas         142000.0
    Utah               NaN
    dtype: float64
    '''
    # Both the Series itself and its index have a `name` attribute.
    obj4.name = 'population'
    obj4.index.name = 'state'
    print(obj4)
    '''
    state
    California        NaN
    Ohio          35000.0
    Oregon        16000.0
    Texas         71000.0
    Name: population, dtype: float64
    '''
    # A Series' index can be replaced in place by assignment.
    obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
    print(obj)
    '''
    Bob      4
    Steve    7
    Jeff    -5
    Ryan     3
    dtype: int64
    '''
    

    DataFrame

    from pandas import Series, DataFrame
    import pandas as pd
    import numpy as np
    
    # A DataFrame has both a row index and a column index; it can be thought of
    # as a dict of Series that all share the same index.
    # NOTE: the expected outputs quoted below were captured on an older pandas
    # release that sorted dict keys; newer releases keep dict insertion order,
    # so column/row order in dict-built frames may differ from these listings.
    data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
            'year': [2000, 2001, 2002, 2001, 2002],
            'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
    frame = DataFrame(data)
    print(frame)
    '''
       pop   state  year
    0  1.5    Ohio  2000
    1  1.7    Ohio  2001
    2  3.6    Ohio  2002
    3  2.4  Nevada  2001
    4  2.9  Nevada  2002
    '''
    
    # When a column sequence is given, the columns are arranged in that order.
    frame1 = DataFrame(data, columns=['year', 'state', 'pop'])
    print(frame1)
    '''
       year   state  pop
    0  2000    Ohio  1.5
    1  2001    Ohio  1.7
    2  2002    Ohio  3.6
    3  2001  Nevada  2.4
    4  2002  Nevada  2.9
    '''
    # A requested column that is missing from the data is filled with NA values.
    frame2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                       index=['one', 'two', 'three', 'four', 'five'])
    print(frame2)
    '''
           year   state  pop debt
    one    2000    Ohio  1.5  NaN
    two    2001    Ohio  1.7  NaN
    three  2002    Ohio  3.6  NaN
    four   2001  Nevada  2.4  NaN
    five   2002  Nevada  2.9  NaN
    '''
    print(frame2.columns)  # Index(['year', 'state', 'pop', 'debt'], dtype='object')
    # A column can be retrieved as a Series with dict-style notation or as an
    # attribute.
    print(frame2['state'])
    '''
    one        Ohio
    two        Ohio
    three      Ohio
    four     Nevada
    five     Nevada
    Name: state, dtype: object
    '''
    print(frame2.year)
    '''
    one      2000
    two      2001
    three    2002
    four     2001
    five     2002
    Name: year, dtype: int64
    '''
    # Retrieve a row by its label with the `loc` indexer.
    # (The `.ix` indexer used by older tutorials was removed in pandas 1.0.)
    print(frame2.loc['three'])
    '''
    year     2002
    state    Ohio
    pop       3.6
    debt      NaN
    Name: three, dtype: object
    '''
    # Columns can be modified by assignment, e.g. giving the 'debt' column a
    # scalar value or an array of values.
    frame2['debt'] = 16.5
    print(frame2)
    '''
           year   state  pop  debt
    one    2000    Ohio  1.5  16.5
    two    2001    Ohio  1.7  16.5
    three  2002    Ohio  3.6  16.5
    four   2001  Nevada  2.4  16.5
    five   2002  Nevada  2.9  16.5
    '''
    frame2['debt'] = np.arange(5.)
    print(frame2)
    '''
           year   state  pop  debt
    one    2000    Ohio  1.5   0.0
    two    2001    Ohio  1.7   1.0
    three  2002    Ohio  3.6   2.0
    four   2001  Nevada  2.4   3.0
    five   2002  Nevada  2.9   4.0
    '''
    # A list or array assigned to a column must match the DataFrame's length.
    # A Series is instead aligned exactly on the DataFrame's index, and every
    # label it does not cover is filled with a missing value.
    val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
    frame2['debt'] = val
    print(frame2)
    '''
           year   state  pop  debt
    one    2000    Ohio  1.5   NaN
    two    2001    Ohio  1.7  -1.2
    three  2002    Ohio  3.6   NaN
    four   2001  Nevada  2.4  -1.5
    five   2002  Nevada  2.9  -1.7
    '''
    # Assigning to a column that does not exist creates it; the `del` keyword
    # removes a column.
    frame2['eastern'] = frame2.state == 'Ohio'
    print(frame2)
    '''
           year   state  pop  debt  eastern
    one    2000    Ohio  1.5   NaN     True
    two    2001    Ohio  1.7  -1.2     True
    three  2002    Ohio  3.6   NaN     True
    four   2001  Nevada  2.4  -1.5    False
    five   2002  Nevada  2.9  -1.7    False
    '''
    del frame2['eastern']
    print(frame2)
    '''
           year   state  pop  debt
    one    2000    Ohio  1.5   NaN
    two    2001    Ohio  1.7  -1.2
    three  2002    Ohio  3.6   NaN
    four   2001  Nevada  2.4  -1.5
    five   2002  Nevada  2.9  -1.7
    '''
    
    # Nested dict of dicts.
    pop = {"Nevada": {2001: 2.4, 2002: 2.9},
           'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
    frame3 = DataFrame(pop)
    print(frame3)
    # The outer dict keys become the columns and the inner keys become the row
    # index.
    '''
          Nevada  Ohio
    2000     NaN   1.5
    2001     2.4   1.7
    2002     2.9   3.6
    '''
    # Transpose (swap rows and columns).
    print(frame3.T)
    '''
            2000  2001  2002
    Nevada   NaN   2.4   2.9
    Ohio     1.5   1.7   3.6
    '''
    # Specify the row index explicitly.
    print(DataFrame(pop, index=[2001, 2002, 2003]))
    '''
          Nevada  Ohio
    2001     2.4   1.7
    2002     2.9   3.6
    2003     NaN   NaN
    '''
    
    # A dict of Series is handled the same way. The slices are positional
    # (first two rows / all but the last row), so `iloc` is used to make that
    # explicit on this integer-labelled index.
    pdata = {"Nevada": frame3['Nevada'].iloc[:2],
             'Ohio': frame3['Ohio'].iloc[:-1]}
    print(DataFrame(pdata))
    '''
          Nevada  Ohio
    2000     NaN   1.5
    2001     2.4   1.7
    '''
    
    # If the `name` attributes of a DataFrame's index and columns are set, they
    # are displayed as well.
    frame3.index.name = 'year'
    frame3.columns.name = 'state'
    print(frame3)
    '''
    state  Nevada  Ohio
    year               
    2000      NaN   1.5
    2001      2.4   1.7
    2002      2.9   3.6
    '''
    # As with Series, the `values` attribute returns the data, here as a
    # two-dimensional ndarray.
    print(frame3.values)
    '''
    [[nan 1.5]
     [2.4 1.7]
     [2.9 3.6]]
    '''
    

    索引对象

    import numpy as np
    import pandas as pd
    from pandas import Series, DataFrame
    
    # The index of a Series is an Index object that can be inspected and sliced.
    obj = Series(range(3), index=list('abc'))
    index = obj.index
    print(index)  # Index(['a', 'b', 'c'], dtype='object')
    tail_labels = index[1:]
    print(tail_labels)  # Index(['b', 'c'], dtype='object')
    # Index objects are immutable, so they can be shared between data
    # structures: here an Index is built once and handed to a new Series.
    index = pd.Index(np.arange(3))
    obj2 = Series([1.5, -2.5, 0], index=index)
    print(obj2.index is index)  # True
    
    # Membership tests work on both the column index and the row index.
    pop = {
        "Nevada": {2001: 2.4, 2002: 2.9},
        'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
    }
    frame3 = DataFrame(pop)
    print('Ohio' in frame3.columns)  # True
    print(2003 in frame3.index)  # False
    

    Index的方法和属性

    import pandas as pd
    from pandas import Series
    
    obj1 = Series([1, 6, 5, 9], index=['a', 'b', 'c', 'd'])
    obj2 = Series([2, 10, 6], index=['a', 'a', 'b'])
    obj3 = Series([2, 10, 10])
    
    # append: concatenate two objects; the resulting index is the first index
    # followed by the second. Series.append was removed in pandas 2.0, so
    # pd.concat is used here (two Index objects can still be joined with
    # Index.append).
    obj_append = pd.concat([obj1, obj2])
    print(obj_append)
    '''
    a     1
    b     6
    c     5
    d     9
    a     2
    a    10
    b     6
    dtype: int64
    '''
    
    # diff: Series.diff is NOT a set difference -- it returns each value minus
    # the previous value (the first element has no predecessor, hence NaN).
    obj_diff = obj1.diff()
    print(obj_diff)
    '''
    a    NaN
    b    5.0
    c   -1.0
    d    4.0
    dtype: float64
    '''
    
    # difference: the set difference of two indexes, returned as a new Index.
    obj_difference = obj1.index.difference(obj2.index)
    print(obj_difference)  # Index(['c', 'd'], dtype='object')
    
    # intersection: the set intersection of two indexes.
    # Recent pandas releases drop duplicate labels from the result; older
    # releases printed Index(['a', 'a', 'b'], dtype='object').
    obj_intersection = obj1.index.intersection(obj2.index)
    print(obj_intersection)  # Index(['a', 'b'], dtype='object')
    
    # union: the set union of two indexes.
    obj_union = obj1.index.union(obj2.index)
    print(obj_union)  # Index(['a', 'a', 'b', 'c', 'd'], dtype='object')
    
    # isin: boolean array telling, for each label, whether it is contained in
    # the collection passed as the argument.
    obj_isin = obj1.index.isin(obj2.index)
    print(obj_isin)  # [ True  True False False]
    
    # delete: remove the element at the given position and return a new Index.
    obj_delete = obj1.index.delete(2)
    print(obj_delete)  # Index(['a', 'b', 'd'], dtype='object')
    
    # drop: remove the given labels and return a new object (here it is called
    # on the Series, so the row labelled 'a' is removed).
    obj_drop = obj1.drop(['a'])
    print(obj_drop)
    '''
    b    6
    c    5
    d    9
    dtype: int64
    '''
    
    # is_monotonic_increasing: True when every element is greater than or equal
    # to the previous one. Called on a Series it checks the values. (The old
    # `is_monotonic` alias was removed in pandas 2.0.)
    print(obj3.is_monotonic_increasing)  # True
    print(obj2.is_monotonic_increasing)  # False
    
    # is_unique: True when there are no duplicates. Called on a Series it
    # checks the values, not the index labels.
    print(obj3.is_unique)  # False
    print(obj2.is_unique)  # True
    
    # unique: array of the distinct values (of the Series' values here).
    print(obj3.unique())  # [ 2 10]
    
  • 相关阅读:
    findIndex() 方法用法
    Centos7安装nginx1.17.5,集成upstream和stream
    Centos7安装docker
    LeetCode(C++)刷题计划:17-电话号码的字母组合
    LeetCode(C++)刷题计划:16-最接近的三数之和
    LeetCode(C++)刷题计划:15-三数之和
    LeetCode(C++)刷题计划:14-最长公共前缀
    LeetCode(C++)刷题计划:13-罗马数字转整数
    LeetCode(C++)刷题计划:12-整数转罗马数字
    LeetCode(C++)刷题计划:11-盛最多水的容器
  • 原文地址:https://www.cnblogs.com/nicole-zhang/p/12955094.html
Copyright © 2020-2023  润新知