DataFrame的定义
1 data = {
2 'color': ['blue', 'green', 'yellow', 'red', 'white'],
3 'object': ['ball', 'pen', 'pecil', 'paper', 'mug'],
4 'price': [1.2, 1, 2.3, 5, 6]
5 }
6 frame0 = pd.DataFrame(data)
7 print(frame0)
8 frame1 = pd.DataFrame(data, columns=['object', 'price'])
9 print(frame1)
10 frame2 = pd.DataFrame(data, index=['张三','李斯','王五','陈久','小明'])
11 print(frame2)
12 Out[1]:
13 color object price
14 0 blue ball 1.2
15 1 green pen 1.0
16 2 yellow pecil 2.3
17 3 red paper 5.0
18 4 white mug 6.0
19 object price
20 0 ball 1.2
21 1 pen 1.0
22 2 pecil 2.3
23 3 paper 5.0
24 4 mug 6.0
25 color object price
26 张三 blue ball 1.2
27 李斯 green pen 1.0
28 王五 yellow pecil 2.3
29 陈久 red paper 5.0
30 小明 white mug 6.0
使用index参数可以设置index信息
选取元素
1 print(frame1.columns)
2 print(frame2.index)
3 print(frame2['price'])
4 print(frame2.price)
5 Out[2]:
6 Index(['object', 'price'], dtype='object')
7 Index(['张三', '李斯', '王五', '陈久', '小明'], dtype='object')
8 张三 1.2
9 李斯 1.0
10 王五 2.3
11 陈久 5.0
12 小明 6.0
13 Name: price, dtype: float64
14 张三 1.2
15 李斯 1.0
16 王五 2.3
17 陈久 5.0
18 小明 6.0
19 Name: price, dtype: float64
一般我们常需要按行取值,那么DataFrame提供了 loc 和 iloc 供大家选择,但是两者之间是有区别的:loc 按索引标签(label)取行,iloc 按整数位置取行。
1 print(frame2)
2 print(frame2.loc['王五']) # loc可以使用字符串类型的index,而iloc只能是int型的
3 print(frame0.iloc[2])
4 Out[3]:
5 color object price
6 张三 blue ball 1.2
7 李斯 green pen 1.0
8 王五 yellow pecil 2.3
9 陈久 red paper 5.0
10 小明 white mug 6.0
11 color yellow
12 object pecil
13 price 2.3
14 Name: 王五, dtype: object
15 color yellow
16 object pecil
17 price 2.3
18 Name: 2, dtype: object
一般取值操作
1 print(frame2[2:3]) # 取行
2 print(frame0['object']) # 取列
3 print(frame0['object'][1:3]) # 取列的元素
4 print(frame0.iloc[0:4, 1:3]) # 取一块的元素 ********************************************************************
5 Out[4]:
6 color object price
7 王五 yellow pecil 2.3
8 0 ball
9 1 pen
10 2 pecil
11 3 paper
12 4 mug
13 Name: object, dtype: object
14 1 pen
15 2 pecil
16 Name: object, dtype: object
17 object price
18 0 ball 1.2
19 1 pen 1.0
20 2 pecil 2.3
21 3 paper 5.0
元素的赋值
1 data = {
2 'color': ['blue', 'green', 'yellow', 'red', 'white'],
3 'object': ['ball', 'pen', 'pecil', 'paper', 'mug'],
4 'price': [1.2, 1, 2.3, 5, 6]
5 }
6 frame2 = pd.DataFrame(data, index=['张三', '李斯', '王五', '陈久', '小明'])
7 print("----*----\n", frame2)
8 frame2.index.name = 'usr_id' # 给index名字赋值
9 frame2.columns.name = 'item' # 给columns名字赋值
10 frame2['new'] = 12 # 给不存在的列赋值,会自动生成一列
11 print("----*----\n", frame2)
12 frame2['new'] = [3.0,1.3,2.2,0.8,1.1] # 可以指定具体不同的内容
13 print("----*----\n", frame2)
14 # 注意添加一列Series数据时,必须要注意index要一致,不一致的地方会用NaN替换
15 ser = pd.Series(np.arange(5), index=['张三', '李斯', '王五', '陈久', '小明'])
16 frame2['old'] = ser
17 print("----*----\n", frame2)
18 frame2.at['王五','price']= 22 # 改变具体一个元素的值
19 print("----*----\n", frame2)
20 Out[5]:
21 ----*----
22 color object price
23 张三 blue ball 1.2
24 李斯 green pen 1.0
25 王五 yellow pecil 2.3
26 陈久 red paper 5.0
27 小明 white mug 6.0
28 ----*----
29 item color object price new
30 usr_id
31 张三 blue ball 1.2 12
32 李斯 green pen 1.0 12
33 王五 yellow pecil 2.3 12
34 陈久 red paper 5.0 12
35 小明 white mug 6.0 12
36 ----*----
37 item color object price new
38 usr_id
39 张三 blue ball 1.2 3.0
40 李斯 green pen 1.0 1.3
41 王五 yellow pecil 2.3 2.2
42 陈久 red paper 5.0 0.8
43 小明 white mug 6.0 1.1
44 ----*----
45 item color object price new old
46 usr_id
47 张三 blue ball 1.2 3.0 0
48 李斯 green pen 1.0 1.3 1
49 王五 yellow pecil 2.3 2.2 2
50 陈久 red paper 5.0 0.8 3
51 小明 white mug 6.0 1.1 4
52 ----*----
53 item color object price new old
54 usr_id
55 张三 blue ball 1.2 3.0 0
56 李斯 green pen 1.0 1.3 1
57 王五 yellow pecil 22.0 2.2 2
58 陈久 red paper 5.0 0.8 3
59 小明 white mug 6.0 1.1 4
赋值补充
1 print(frame2.isin([1, 'paper']))
2 print("----*----\n", frame2[frame2.isin([1, 'paper'])])
3 del frame2['old'] # 删除old列
4 print(frame2)
5 d1 = {
6 'red':{2012:22,2013:33},
7 'white':{2011:13,2012:22,2013:16},
8 'blue':{2011:17,2012:27,2013:18}
9 }
10 frame3 = pd.DataFrame(d1)
11 print(frame3)
12 print(frame3.T)
13 Out[6]:
14 item color object price new old
15 usr_id
16 张三 False False False False False
17 李斯 False False True False True
18 王五 False False False False False
19 陈久 False True False False False
20 小明 False False False False False
21 ----*----
22 item color object price new old
23 usr_id
24 张三 NaN NaN NaN NaN NaN
25 李斯 NaN NaN 1.0 NaN 1.0
26 王五 NaN NaN NaN NaN NaN
27 陈久 NaN paper NaN NaN NaN
28 小明 NaN NaN NaN NaN NaN
29 item color object price new
30 usr_id
31 张三 blue ball 1.2 3.0
32 李斯 green pen 1.0 1.3
33 王五 yellow pecil 22.0 2.2
34 陈久 red paper 5.0 0.8
35 小明 white mug 6.0 1.1
36 red white blue
37 2011 NaN 13 17
38 2012 22.0 22 27
39 2013 33.0 16 18
40 2011 2012 2013
41 red NaN 22.0 33.0
42 white 13.0 22.0 16.0
43 blue 17.0 27.0 18.0
Index对象
1 ins = pd.Series([5,0,3,8,4],index=['red','blue','yellow','white','green'])
2 print(ins.index)
3 print(ins.idxmin()) # 返回一个索引,该索引对应的value最小
4 print(ins.idxmax()) # 返回一个索引,该索引对应的value最大
5 # 重复标签的Index
6 serd = pd.Series(range(6),index=['white','white','blue','green','green','yellow'])
7 print("serd['white']:\n", serd['white'])
8 print("判断index是否存在重复项:", serd.index.is_unique) # 判断index是否存在重复项
9 # 更换索引
10 ser = pd.Series([1,2,3,4,5],index=['one','two','three','four','five'])
11 # ser.reindex(['four','five','six','one', 'two']) # 按这里给定的顺序设置index
12 ser.reindex(['张三', '王五', '陈久', '小明', '李斯'])
13 print("Series:ser :\n", ser)
14 Out[7]:
15 Index(['red', 'blue', 'yellow', 'white', 'green'], dtype='object')
16 blue
17 white
18 serd['white']:
19 white 0
20 white 1
21 dtype: int64
22 判断index是否存在重复项: False
23 Series:ser :
24 one 1
25 two 2
26 three 3
27 four 4
28 five 5
29 dtype: int64
注意:reindex 不会就地修改原来的 Series,而是返回一个新的 Series。上面的代码没有接收 reindex 的返回值,所以打印出来的 ser 的 index 并没有改变;要使新索引生效,需要重新赋值,例如 ser = ser.reindex([...])。新索引中原来不存在的标签,其对应的值会被填充为 NaN。
自动编制索引
1 ser2 = pd.Series([1,5,6,3],index =[0,3,5,6])
2 print(ser2)
3 print(ser2.reindex(range(6),method='ffill')) #插值,以得到一个index完整的序列(前插),index满足range(6)
4 print(ser2.reindex(range(6),method='bfill')) #插值,以得到一个index完整的序列(后插)
5 Out[8]:
6 0 1
7 3 5
8 5 6
9 6 3
10 dtype: int64
11 0 1
12 1 1
13 2 1
14 3 5
15 4 5
16 5 6
17 dtype: int64
18 0 1
19 1 5
20 2 5
21 3 5
22 4 6
23 5 6
24 dtype: int64
删除操作
1 ser3 = pd.Series(np.arange(4.),index=['red','blue','yellow','white'])
2 print(ser3.drop('yellow')) # ser3并没有变
3 frame = pd.DataFrame(np.arange(16).reshape((4,4)),index=['blue','yellow','red','white'],columns=['ball','pen','pencil','paper'])
4 print(frame)
5 print(frame.drop(['blue','yellow'])) #默认删除行
6 print(frame.drop(['pen','pencil'],axis=1)) #删除列
7 Out[9]:
8 red 0.0
9 blue 1.0
10 white 3.0
11 dtype: float64
12 ball pen pencil paper
13 blue 0 1 2 3
14 yellow 4 5 6 7
15 red 8 9 10 11
16 white 12 13 14 15
17 ball pen pencil paper
18 red 8 9 10 11
19 white 12 13 14 15
20 ball paper
21 blue 0 3
22 yellow 4 7
23 red 8 11
24 white 12 15
DataFrame之间的运算
1 frame1 = pd.DataFrame(np.arange(16).reshape((4,4)),index=['red','blue','yellow','white'],columns=['ball','pen','pencil','paper'])
2 print(frame1)
3 frame2 = pd.DataFrame(np.arange(12).reshape((4,3)),index=['blue','green','white','yellow'],columns=['mug','pen','ball'])
4 print(frame2)
5 print(frame1 + frame2) # 等价于:frame1.add(frame2)
6 frame3 = pd.DataFrame(np.arange(16).reshape((4,4)),index=['red','blue','yellow','white'],columns=['ball','pen','pencil','paper'])
7 ser1 = pd.Series(np.arange(4),index=['ball','pen','pencil','paper'])
8 print(frame3 - ser1)
9 ser1['mug'] = 9
10 print(frame3 - ser1)
11 Out[9]:
12 ball pen pencil paper
13 red 0 1 2 3
14 blue 4 5 6 7
15 yellow 8 9 10 11
16 white 12 13 14 15
17 mug pen ball
18 blue 0 1 2
19 green 3 4 5
20 white 6 7 8
21 yellow 9 10 11
22 ball mug paper pen pencil
23 blue 6.0 NaN NaN 6.0 NaN
24 green NaN NaN NaN NaN NaN
25 red NaN NaN NaN NaN NaN
26 white 20.0 NaN NaN 20.0 NaN
27 yellow 19.0 NaN NaN 19.0 NaN
28 ball pen pencil paper
29 red 0 0 0 0
30 blue 4 4 4 4
31 yellow 8 8 8 8
32 white 12 12 12 12
33 ball mug paper pen pencil
34 red 0 NaN 0 0 0
35 blue 4 NaN 4 4 4
36 yellow 8 NaN 8 8 8
37 white 12 NaN 12 12 12
通用函数
1 frame2 = pd.DataFrame(np.arange(12).reshape((4,3)),index=['blue','green','white','yellow'],columns=['mug','pen','ball'])
2 # 通用函数,Numpy中的通用函数这里也适用
3 print(np.sqrt(frame2))
4 Out[10]:
5 mug pen ball
6 blue 0.000000 1.000000 1.414214
7 green 1.732051 2.000000 2.236068
8 white 2.449490 2.645751 2.828427
9 yellow 3.000000 3.162278 3.316625
按行按列操作的函数
1 print(frame2)
2 # 按行按列操作的函数 .apply()
3 f = lambda x: x.max() - x.min()
4 print(frame2.apply(f))
5 print(frame2.apply(f, axis=1)) # 按行执行函数f
6 def f1(x):
7 return pd.Series([x.min(),x.max()],index=['min','max'])
8 print(frame2.apply(f1))
9 Out[11]:
10 mug pen ball
11 blue 0 1 2
12 green 3 4 5
13 white 6 7 8
14 yellow 9 10 11
15 mug 9
16 pen 9
17 ball 9
18 dtype: int64
19 blue 2
20 green 2
21 white 2
22 yellow 2
23 dtype: int64
24 mug pen ball
25 min 0 1 2
26 max 9 10 11
统计函数
1 print(frame2.sum()) # 按列统计求和
2 print(frame2.describe()) # 按列做统计描述
3 Out[12]:
4 mug pen ball
5 blue 0 1 2
6 green 3 4 5
7 white 6 7 8
8 yellow 9 10 11
9 mug 18
10 pen 22
11 ball 26
12 dtype: int64
13 mug pen ball
14 count 4.000000 4.000000 4.000000
15 mean 4.500000 5.500000 6.500000
16 std 3.872983 3.872983 3.872983
17 min 0.000000 1.000000 2.000000
18 25% 2.250000 3.250000 4.250000
19 50% 4.500000 5.500000 6.500000
20 75% 6.750000 7.750000 8.750000
21 max 9.000000 10.000000 11.000000
排序
1 frame2 = pd.DataFrame(np.arange(12).reshape((4,3)),index=['blue','white','yellow','green'],columns=['mug','pen','ball'])
2 # 根据索引排序
3 ser = pd.Series([5,0,3,8,4],index=['red','blue','yellow','white','green'])
4 print(ser.sort_index())
5 print(ser.sort_index(ascending=False))
6 print(frame2.sort_index())
7 print(frame2.sort_index(axis=1))
8 # 根据对象排序
9 frame2.at['yellow','pen'] = 5.9
10 print(frame2.sort_values(by='pen'))
11 # ser.rank() 对ser中的值进行排位(不改变元素顺序),返回每个index对应数值的名次
12 print(ser.rank()) # rank(self, axis=0, method='average', numeric_only=None, na_option='keep', ascending=True, pct=False)
13 print(ser.rank(method = 'first'))
14 print(ser.rank(ascending=False)) # 降序排位
15 print(frame2.rank()) # 按列的元素排位
16 Out[13]:
17 blue 0
18 green 4
19 red 5
20 white 8
21 yellow 3
22 dtype: int64
23 yellow 3
24 white 8
25 red 5
26 green 4
27 blue 0
28 dtype: int64
29 mug pen ball
30 blue 0 1 2
31 green 9 10 11
32 white 3 4 5
33 yellow 6 7 8
34 ball mug pen
35 blue 2 0 1
36 white 5 3 4
37 yellow 8 6 7
38 green 11 9 10
39 mug pen ball
40 blue 0 1 2
41 white 3 4 5
42 yellow 6 5 8
43 green 9 10 11
44 red 4.0
45 blue 1.0
46 yellow 2.0
47 white 5.0
48 green 3.0
49 dtype: float64
50 red 4.0
51 blue 1.0
52 yellow 2.0
53 white 5.0
54 green 3.0
55 dtype: float64
56 red 2.0
57 blue 5.0
58 yellow 4.0
59 white 1.0
60 green 3.0
61 dtype: float64
62 mug pen ball
63 blue 1.0 1.0 1.0
64 white 2.0 2.0 2.0
65 yellow 3.0 3.0 3.0
66 green 4.0 4.0 4.0
相关系数与协方差
1 seq2 = pd.Series([3,4,3,4,5,4,3,2],['2006','2007','2008','2009','2010','2011','2012','2013'])
2 seq = pd.Series([1,2,3,4,4,3,2,1],['2006','2007','2008','2009','2010','2011','2012','2013'])
3 print(seq.corr(seq2)) # 计算相关系数
4 print(seq.cov(seq2)) # 计算协方差
5 frame2 = pd.DataFrame([[1,4,3,6],[4,5,6,1],[3,3,1,5],[4,1,6,4]],index=['red','blue','yellow','white'],columns = ['ball','pen','pencil','paper'])
6 print(frame2.corr()) # 列之间两两相关系数矩阵
7 print(frame2.cov())
8 # corrwith()方法可以计算DataFrame对象的列或行与Series对象或其他DataFrame对象元素"两两"之间的相关性
9 ser = pd.Series([5,0,3,8],index=['red','blue','yellow','white'])
10 print(frame2.corrwith(ser)) # corrwith(self, other, axis=0, drop=False)
11 frame = pd.DataFrame([[1, 3, 5, 6], [5, 8, 9, 1],[3,6,4,2],[4,8,7,3]],index=['red','blue','yellow','white'],columns = ['ball','pen','pencil','paper'])
12 print(frame2.corrwith(frame))
13 Out[14]:
14 0.7745966692414835
15 0.8571428571428571
16 ball pen pencil paper
17 ball 1.000000 -0.276026 0.577350 -0.763763
18 pen -0.276026 1.000000 -0.079682 -0.361403
19 pencil 0.577350 -0.079682 1.000000 -0.692935
20 paper -0.763763 -0.361403 -0.692935 1.000000
21 ball pen pencil paper
22 ball 2.000000 -0.666667 2.000000 -2.333333
23 pen -0.666667 2.916667 -0.333333 -1.333333
24 pencil 2.000000 -0.333333 6.000000 -3.666667
25 paper -2.333333 -1.333333 -3.666667 4.666667
26 ball -0.140028
27 pen -0.869657
28 pencil 0.080845
29 paper 0.595854
30 dtype: float64
31 ball 0.966092
32 pen -0.268455
33 pencil 0.920575
34 paper 0.785714
35 dtype: float64
NaN值的操作
1 frame3 = pd.DataFrame([[6,np.nan,6],[np.nan,np.nan,np.nan],[2,np.nan,5]],index = ['blue','green','red'],columns = ['ball','mug','pen'])
2 print(frame3)
3 print(frame3.notnull()) # 输出一个布尔矩阵,True表示非空
4 print(frame3.dropna()) # 行有NaN就删除
5 print(frame3.dropna(how ='all')) # 删除全是NaN的
6 print(frame3.fillna(6.6)) #指定缺失值填充
7 print(frame3.fillna({'ball':1,'mug':0,'pen':99}))
8 Out[15]:
9 ball mug pen
10 blue 6.0 NaN 6.0
11 green NaN NaN NaN
12 red 2.0 NaN 5.0
13 ball mug pen
14 blue True False True
15 green False False False
16 red True False True
17 Empty DataFrame
18 Columns: [ball, mug, pen]
19 Index: []
20 ball mug pen
21 blue 6.0 NaN 6.0
22 red 2.0 NaN 5.0
23 ball mug pen
24 blue 6.0 6.6 6.0
25 green 6.6 6.6 6.6
26 red 2.0 6.6 5.0
27 ball mug pen
28 blue 6.0 0.0 6.0
29 green 1.0 0.0 99.0
30 red 2.0 0.0 5.0
等级索引
1 mser = pd.Series(np.random.rand(8),index=[['white','white','white','blue','blue','red','red','red'],['up','down','right','up','down','up','down','left']])
2 print(mser, "\n-----*-----\n",mser.index)
3 print(mser['white'])
4 print(mser[:,'up'])
5 print(mser['white','up'])
6 frame = mser.unstack() #把等级索引Series转换成简单的DataFrame对象
7 print(frame)
8 test = frame.stack() # 变回去
9 print("----*----\n", test)
10 mframe = pd.DataFrame(np.random.randn(16).reshape(4,4),index =[['white','white','red','red'],['up','down','up','down']],columns=[['pen','pen','paper','paper'],[1,2,1,2]])
11 print("mframe:\n", mframe)
12 mframe.columns.names =['objects','id']
13 mframe.index.names = ['colors','status']
14 print("mframe:\n", mframe)
15 mframe.swaplevel('colors','status') #互换位置(返回新对象,不会就地修改mframe;此处未接收返回值,所以下面打印的mframe不变)
16 print("mframe:\n", mframe)
17 print("----*----\n", mframe.sort_index(level='colors')) #根据层级排序, ascending=False
18 print("----*----\n", mframe.sum(level='colors')) #按照层级统计
19 print("----*----\n", mframe.sum(level='id',axis=1)) #按照层级统计
20 Out[15]:
21 white up 0.510320
22 down 0.564982
23 right 0.253983
24 blue up 0.308429
25 down 0.895921
26 red up 0.555668
27 down 0.312702
28 left 0.680157
29 dtype: float64
30 -----*-----
31 MultiIndex(levels=[['blue', 'red', 'white'], ['down', 'left', 'right', 'up']],
32 labels=[[2, 2, 2, 0, 0, 1, 1, 1], [3, 0, 2, 3, 0, 3, 0, 1]])
33 up 0.510320
34 down 0.564982
35 right 0.253983
36 dtype: float64
37 white 0.510320
38 blue 0.308429
39 red 0.555668
40 dtype: float64
41 0.5103202702540969
42 down left right up
43 blue 0.895921 NaN NaN 0.308429
44 red 0.312702 0.680157 NaN 0.555668
45 white 0.564982 NaN 0.253983 0.510320
46 ----*----
47 blue down 0.895921
48 up 0.308429
49 red down 0.312702
50 left 0.680157
51 up 0.555668
52 white down 0.564982
53 right 0.253983
54 up 0.510320
55 dtype: float64
56 mframe:
57 pen paper
58 1 2 1 2
59 white up 0.145684 -1.665620 1.511783 -1.128178
60 down 0.364897 0.334767 0.488259 1.555273
61 red up 2.005307 0.071610 -0.778413 1.109162
62 down 1.376714 -0.478544 0.209413 -1.361551
63 mframe:
64 objects pen paper
65 id 1 2 1 2
66 colors status
67 white up 0.145684 -1.665620 1.511783 -1.128178
68 down 0.364897 0.334767 0.488259 1.555273
69 red up 2.005307 0.071610 -0.778413 1.109162
70 down 1.376714 -0.478544 0.209413 -1.361551
71 mframe:
72 objects pen paper
73 id 1 2 1 2
74 colors status
75 white up 0.145684 -1.665620 1.511783 -1.128178
76 down 0.364897 0.334767 0.488259 1.555273
77 red up 2.005307 0.071610 -0.778413 1.109162
78 down 1.376714 -0.478544 0.209413 -1.361551
79 ----*----
80 objects pen paper
81 id 1 2 1 2
82 colors status
83 red down 1.376714 -0.478544 0.209413 -1.361551
84 up 2.005307 0.071610 -0.778413 1.109162
85 white down 0.364897 0.334767 0.488259 1.555273
86 up 0.145684 -1.665620 1.511783 -1.128178
87 ----*----
88 objects pen paper
89 id 1 2 1 2
90 colors
91 white 0.510581 -1.330853 2.000042 0.427095
92 red 3.382021 -0.406933 -0.569000 -0.252389
93 ----*----
94 id 1 2
95 colors status
96 white up 1.657467 -2.793798
97 down 0.853157 1.890040
98 red up 1.226894 1.180773
99 down 1.586127 -1.840095