如果用 python 的列表和字典来作比较, 那么可以说 Numpy 是列表形式的,没有数值标签,而 Pandas 就是字典形式。Pandas 是基于 Numpy 构建的,让以 Numpy 为中心的应用变得更加简单
1、要使用 pandas,首先需要了解它的两个主要数据结构:Series 和 DataFrame
1) Series 的字符串表现形式为:索引在左边,值在右边。如果没有为数据指定索引,将会自动创建一个 0 到 N-1(N 为长度)的整数型索引
import pandas as pd
import numpy as np

# A Series prints with the index on the left and the values on the right.
# No index was supplied, so a default integer index 0..N-1 is created.
s = pd.Series([1, 3, 6, np.nan, 44, 1])
# 0     1.0
# 1     3.0
# 2     6.0
# 3     NaN
# 4    44.0
# 5     1.0
# dtype: float64
print(s)
2) DataFrame 是一个表格型的数据结构,它包含一组有序的列,每列可以是不同的值类型(数值,字符串,布尔值等)。DataFrame 既有行索引也有列索引, 它可以被看做由 Series 组成的大字典,可以根据每一个不同的索引来挑选数据
import pandas as pd
import numpy as np

# Six consecutive calendar days, used below as the row labels.
dates = pd.date_range('20160101', periods=6)

# 6x4 frame of standard-normal random values; rows labelled by `dates`,
# columns labelled 'a'..'d'.  (Sample output — values differ every run.)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
#                    a         b         c         d
# 2016-01-01 -0.209289 -1.245807  2.025371  1.259524
# ...
# 2016-01-06  0.690797 -1.122976  0.823072  1.003473
print(df)
DataFrame 的创建方法
# Without explicit labels a DataFrame gets default integer row and
# column indexes.
df1 = pd.DataFrame(np.arange(12).reshape((3, 4)))
#    0  1   2   3
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11
print(df1)

# Alternative construction: a dict mapping column name -> column content.
# Scalars broadcast; Series/arrays supply one value per row.
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(['test', 'train', 'test', 'train']),
    'F': 'foo',
})
#      A          B    C  D      E    F
# 0  1.0 2013-01-02  1.0  3   test  foo
# 1  1.0 2013-01-02  1.0  3  train  foo
# 2  1.0 2013-01-02  1.0  3   test  foo
# 3  1.0 2013-01-02  1.0  3  train  foo
print(df2)
DataFrame 的基本操作
# Select a single column — bracket and attribute access are equivalent.
# 0    1.0
# 1    1.0
# 2    1.0
# 3    1.0
print(df2['A'])  # or: print(df2.A)

# Per-column dtypes.
# A           float64
# B    datetime64[ns]
# C           float32
# D             int32
# E          category
# F            object
# dtype: object
print(df2.dtypes)

# Row index labels.
# Int64Index([0, 1, 2, 3], dtype='int64')
print(df2.index)

# Column labels.
# Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
print(df2.columns)

# Underlying values as a 2-D array.
print(df2.values)

# Summary statistics for the numeric columns only.
# Fix: the original computed describe() but discarded the result —
# print it like every other inspection in this walkthrough.
#          A    C    D
# count  4.0  4.0  4.0
# mean   1.0  1.0  3.0
# std    0.0  0.0  0.0
# min    1.0  1.0  3.0
# max    1.0  1.0  3.0
print(df2.describe())

# Sort by labels along an axis: axis=1 sorts the columns,
# axis=0 would sort the rows.
#      F      E  D    C          B    A
# 0  foo   test  3  1.0 2013-01-02  1.0
# ...
print(df2.sort_index(axis=1, ascending=False))

# Sort the rows by the values in column 'E'.
#      A          B    C  D      E    F
# 0  1.0 2013-01-02  1.0  3   test  foo
# 2  1.0 2013-01-02  1.0  3   test  foo
# 1  1.0 2013-01-02  1.0  3  train  foo
# 3  1.0 2013-01-02  1.0  3  train  foo
print(df2.sort_values(by='E'))
2、选择数据
1) 基本方式选择数据
import pandas as pd
import numpy as np

# A 6x4 frame of consecutive integers, rows labelled by date.
dates = pd.date_range('20160101', periods=6)
df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
#              a   b   c   d
# 2016-01-01   0   1   2   3
# 2016-01-02   4   5   6   7
# 2016-01-03   8   9  10  11
# 2016-01-04  12  13  14  15
# 2016-01-05  16  17  18  19
# 2016-01-06  20  21  22  23
print(df)

# One column by name (equivalently: df.a).
# 2016-01-01     0
# ...
# 2016-01-06    20
# Freq: D, Name: a, dtype: int64
print(df['a'])

# Row slices: by position (end-exclusive) ...
#             a  b   c   d
# 2016-01-01  0  1   2   3
# 2016-01-02  4  5   6   7
# 2016-01-03  8  9  10  11
print(df[0:3])

# ... or by index label (label slices include BOTH endpoints).
#              a   b   c   d
# 2016-01-02   4   5   6   7
# 2016-01-03   8   9  10  11
# 2016-01-04  12  13  14  15
print(df['20160102':'20160104'])
2) 根据标签选择数据 loc
# A single row selected by its index label.
# a    4
# b    5
# c    6
# d    7
# Name: 2016-01-02 00:00:00, dtype: int64
print(df.loc['20160102'])

# Every row, restricted to columns 'a' and 'b'.
#              a   b
# 2016-01-01   0   1
# 2016-01-02   4   5
# 2016-01-03   8   9
# 2016-01-04  12  13
# 2016-01-05  16  17
# 2016-01-06  20  21
print(df.loc[:, ['a', 'b']])

# One row and two columns in a single lookup.
# a    4
# b    5
# Name: 2016-01-02 00:00:00, dtype: int64
print(df.loc['20160102', ['a', 'b']])
3) 根据位置选择数据 iloc
# Single cell at row 3, column 1 (positions count from 0).
# 13
print(df.iloc[3, 1])

# Contiguous slice: rows 3-4, columns 1-2 (end positions excluded).
#              b   c
# 2016-01-04  13  14
# 2016-01-05  17  18
print(df.iloc[3:5, 1:3])

# Non-contiguous rows via an explicit list of positions.
#              b   c
# 2016-01-02   5   6
# 2016-01-04  13  14
# 2016-01-06  21  22
print(df.iloc[[1, 3, 5], 1:3])
4) 混合方式选择位置 ix(注意:.ix 已在 pandas 0.20 中弃用、1.0 中移除,新代码请改用 .loc / .iloc)
# `.ix` was deprecated in pandas 0.20 and removed in 1.0 because it mixed
# positional and label indexing ambiguously.  Be explicit instead:
# first three rows by POSITION, then columns 'a' and 'c' by LABEL.
#             a   c
# 2016-01-01  0   2
# 2016-01-02  4   6
# 2016-01-03  8  10
print(df.iloc[:3].loc[:, ['a', 'c']])
5) 根据判断指令选择数据
# Boolean mask: keep only the rows where column 'a' exceeds 8.
#              a   b   c   d
# 2016-01-04  12  13  14  15
# 2016-01-05  16  17  18  19
# 2016-01-06  20  21  22  23
print(df[df['a'] > 8])
3、设置数据
# Set a single cell by position ...
df.iloc[2, 2] = 1111
# ... or by label.
df.loc['20160101', 'b'] = 2222

# Conditional update.  The original used chained indexing
# (df.b[df.a > 4] = 0), which triggers SettingWithCopyWarning and may
# silently fail to modify df; a single .loc assignment is the correct form.
df.loc[df.a > 4, 'b'] = 0

# A new column from a scalar broadcasts down every row.
df['e'] = np.nan
# A new column from a Series aligns on the index.
df['f'] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20160101', periods=6))
#              a     b     c   d   e  f
# 2016-01-01   0  2222     2   3 NaN  1
# 2016-01-02   4     5     6   7 NaN  2
# 2016-01-03   8     0  1111  11 NaN  3
# 2016-01-04  12     0    14  15 NaN  4
# 2016-01-05  16     0    18  19 NaN  5
# 2016-01-06  20     0    22  23 NaN  6
4、处理 NaN 数据
# Rebuild the 6x4 integer frame, then knock out two cells to get NaNs.
dates = pd.date_range('20160101', periods=6)
df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
#              a     b     c   d
# 2016-01-01   0   NaN   2.0   3
# 2016-01-02   4   5.0   NaN   7
# 2016-01-03   8   9.0  10.0  11
# 2016-01-04  12  13.0  14.0  15
# 2016-01-05  16  17.0  18.0  19
# 2016-01-06  20  21.0  22.0  23
1) 使用 dropna 直接去掉有 NaN 的行或列
# a b c d # 2016-01-03 8 9.0 10.0 11 # 2016-01-04 12 13.0 14.0 15 # 2016-01-05 16 17.0 18.0 19 # 2016-01-06 20 21.0 22.0 23 # axis 0-对行进行操作,1-对列进行操作 # how 'any'-只要有 NaN 就进行处理,'all'-所有元素为 NaN 才进行处理 df.dropna(axis = 0, how = 'any')
2) 使用 fillna 将 NaN 的值用其他值代替
# a b c d # 2016-01-01 0 0.0 2.0 3 # 2016-01-02 4 5.0 0.0 7 # 2016-01-03 8 9.0 10.0 11 # 2016-01-04 12 13.0 14.0 15 # 2016-01-05 16 17.0 18.0 19 # 2016-01-06 20 21.0 22.0 23 df.fillna(value = 0)
3) 判断是否有缺失数据
# a b c d # 2016-01-01 False True False False # 2016-01-02 False False True False # 2016-01-03 False False False False # 2016-01-04 False False False False # 2016-01-05 False False False False # 2016-01-06 False False False False df.isnull() # 检查数据中是否存在 NaN # True np.any(df.isnull())
5、pandas 处理多组数据的时候往往会要用到数据的合并处理,使用 concat 是一种基本的合并方式。而且 concat 中有很多参数可以调整,合并成想要的数据形式
1) axis 合并方向 (默认 axis = 0)
# Two 3x4 frames with identical columns, filled with 1s and 2s.
df1 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])

# axis=0 stacks the frames vertically (row-wise); axis=1 would place
# them side by side (column-wise).
res = pd.concat([df1, df2], axis=0)
#      a    b    c    d
# 0  1.0  1.0  1.0  1.0
# 1  1.0  1.0  1.0  1.0
# 2  1.0  1.0  1.0  1.0
# 0  2.0  2.0  2.0  2.0
# 1  2.0  2.0  2.0  2.0
# 2  2.0  2.0  2.0  2.0
print(res)
2) ignore_index 重绘 index
# ignore_index=True discards the original row labels and renumbers 0..N-1.
res = pd.concat([df1, df2], axis=0, ignore_index=True)
#      a    b    c    d
# 0  1.0  1.0  1.0  1.0
# 1  1.0  1.0  1.0  1.0
# 2  1.0  1.0  1.0  1.0
# 3  2.0  2.0  2.0  2.0
# 4  2.0  2.0  2.0  2.0
# 5  2.0  2.0  2.0  2.0
print(res)
3) join (合并方式)
join = 'outer' 为预设值,此方式是依照 column 来做纵向合并,有相同的 column 上下合并在一起,其他独有的 column 各自成列,原本没有值的位置皆以 NaN 填充
join = 'inner' 时,只有相同的 column 合并在一起,其他的会被抛弃
# Frames with partially overlapping columns (b/c/d shared) and
# partially overlapping row labels.
df1 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])

# join='outer' (the default): union of the columns, missing cells -> NaN.
res = pd.concat([df1, df2], axis=0, join='outer')
#      a    b    c    d    e
# 1  1.0  1.0  1.0  1.0  NaN
# 2  1.0  1.0  1.0  1.0  NaN
# 3  1.0  1.0  1.0  1.0  NaN
# 2  NaN  2.0  2.0  2.0  2.0
# 3  NaN  2.0  2.0  2.0  2.0
# 4  NaN  2.0  2.0  2.0  2.0
print(res)

# join='inner': only the shared columns survive.
res = pd.concat([df1, df2], axis=0, join='inner')
#      b    c    d
# 1  1.0  1.0  1.0
# 2  1.0  1.0  1.0
# 3  1.0  1.0  1.0
# 2  2.0  2.0  2.0
# 3  2.0  2.0  2.0
# 4  2.0  2.0  2.0
print(res)
4) join_axes (依照 axes 合并)
df1 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])

# Restrict the result to df1's row labels.  The original passed
# join_axes=[df1.index] to concat, but that parameter was removed in
# pandas 1.0; concatenating and then reindexing is the supported equivalent.
res = pd.concat([df1, df2], axis=1).reindex(df1.index)
#      a    b    c    d    b    c    d    e
# 1  1.0  1.0  1.0  1.0  NaN  NaN  NaN  NaN
# 2  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
# 3  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
print(res)

# Without the reindex, the union of both row indexes is kept.
res = pd.concat([df1, df2], axis=1)
#      a    b    c    d    b    c    d    e
# 1  1.0  1.0  1.0  1.0  NaN  NaN  NaN  NaN
# 2  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
# 3  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
# 4  NaN  NaN  NaN  NaN  2.0  2.0  2.0  2.0
print(res)
5) append 添加数据
注意:append 只能纵向合并(即对行进行合并),没有横向合并。另外 DataFrame.append 已在 pandas 2.0 中移除,新代码请使用 pd.concat
df1 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])
# The original used df1.append(df2, ignore_index=True), but
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the replacement (ignore_index renumbers the rows 0..N-1).
res = pd.concat([df1, df2], ignore_index=True)
#      a    b    c    d
# 0  1.0  1.0  1.0  1.0
# 1  1.0  1.0  1.0  1.0
# 2  1.0  1.0  1.0  1.0
# 3  2.0  2.0  2.0  2.0
# 4  2.0  2.0  2.0  2.0
# 5  2.0  2.0  2.0  2.0
print(res)
6、merge
merge 和 concat 类似,但主要是用于两组有一列以上相同列标签(key column)的数据,统一索引的数据,通常也被用在 Database 的处理当中
1) 根据一组 key 合并
# Left table: columns a, b plus the join key.
#    key   a   b
# 0    0  a0  b0
# 1    1  a1  b1
# 2    2  a2  b2
# 3    3  a3  b3
left = pd.DataFrame({'key': [0, 1, 2, 3],
                     'a': ['a0', 'a1', 'a2', 'a3'],
                     'b': ['b0', 'b1', 'b2', 'b3']})

# Right table: columns c, d plus the same join key.
#    key   c   d
# 0    0  c0  d0
# 1    1  c1  d1
# 2    2  c2  d2
# 3    3  c3  d3
right = pd.DataFrame({'key': [0, 1, 2, 3],
                      'c': ['c0', 'c1', 'c2', 'c3'],
                      'd': ['d0', 'd1', 'd2', 'd3']})

# Join the two tables on the shared 'key' column.
res = pd.merge(left, right, on='key')
#    key   a   b   c   d
# 0    0  a0  b0  c0  d0
# 1    1  a1  b1  c1  d1
# 2    2  a2  b2  c2  d2
# 3    3  a3  b3  c3  d3
print(res)
2) 根据两组 key 合并
# Left table keyed on (key1, key2).
left = pd.DataFrame({'key1': ['k0', 'k0', 'k1', 'k2'],
                     'key2': ['k0', 'k1', 'k0', 'k1'],
                     'a': ['a0', 'a1', 'a2', 'a3'],
                     'b': ['b0', 'b1', 'b2', 'b3']})

# Right table keyed on the same pair of columns.
right = pd.DataFrame({'key1': ['k0', 'k1', 'k1', 'k2'],
                      'key2': ['k0', 'k0', 'k0', 'k0'],
                      'c': ['c0', 'c1', 'c2', 'c3'],
                      'd': ['d0', 'd1', 'd2', 'd3']})

# how='inner' keeps only key pairs present in BOTH tables.
ret = pd.merge(left, right, on=['key1', 'key2'], how='inner')
#     a   b key1 key2   c   d
# 0  a0  b0   k0   k0  c0  d0
# 1  a2  b2   k1   k0  c1  d1
# 2  a2  b2   k1   k0  c2  d2
print(ret)

# how='outer' keeps every key pair from either side; gaps become NaN.
ret = pd.merge(left, right, on=['key1', 'key2'], how='outer')
#      a    b key1 key2    c    d
# 0   a0   b0   k0   k0   c0   d0
# 1   a1   b1   k0   k1  NaN  NaN
# 2   a2   b2   k1   k0   c1   d1
# 3   a2   b2   k1   k0   c2   d2
# 4   a3   b3   k2   k1  NaN  NaN
# 5  NaN  NaN   k2   k0   c3   d3
print(ret)

# how='left' keeps every row of the left table.
ret = pd.merge(left, right, on=['key1', 'key2'], how='left')
#     a   b key1 key2    c    d
# 0  a0  b0   k0   k0   c0   d0
# 1  a1  b1   k0   k1  NaN  NaN
# 2  a2  b2   k1   k0   c1   d1
# 3  a2  b2   k1   k0   c2   d2
# 4  a3  b3   k2   k1  NaN  NaN
print(ret)

# how='right' keeps every row of the right table.
ret = pd.merge(left, right, on=['key1', 'key2'], how='right')
#      a    b key1 key2   c   d
# 0   a0   b0   k0   k0  c0  d0
# 1   a2   b2   k1   k0  c1  d1
# 2   a2   b2   k1   k0  c2  d2
# 3  NaN  NaN   k2   k0  c3  d3
print(ret)
3) Indicator
indicator = True 会将合并的记录放在新的一列
#    col1 col_left
# 0     0        a
# 1     1        b
df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']})

#    col1  col_right
# 0     1          2
# 1     2          2
# 2     2          2
df2 = pd.DataFrame({'col1': [1, 2, 2], 'col_right': [2, 2, 2]})

# indicator=True adds a '_merge' column that records the origin of
# each row: left_only / right_only / both.
ret = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
#    col1 col_left  col_right      _merge
# 0     0        a        NaN   left_only
# 1     1        b        2.0        both
# 2     2      NaN        2.0  right_only
# 3     2      NaN        2.0  right_only
print(ret)

# Passing a string instead of True names the indicator column.
ret = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
#    col1 col_left  col_right indicator_column
# 0     0        a        NaN        left_only
# 1     1        b        2.0             both
# 2     2      NaN        2.0       right_only
# 3     2      NaN        2.0       right_only
print(ret)
4) 根据 index 合并
# Two tables whose row labels overlap on 1 and 2.
#     a   b
# 0  a1  b1
# 1  a2  b2
# 2  a3  b3
left = pd.DataFrame({'a': ['a1', 'a2', 'a3'], 'b': ['b1', 'b2', 'b3']}, index=[0, 1, 2])
#     c   d
# 1  c1  d1
# 2  c2  d2
# 3  c3  d3
right = pd.DataFrame({'c': ['c1', 'c2', 'c3'], 'd': ['d1', 'd2', 'd3']}, index=[1, 2, 3])

# Join on the row index instead of a key column; inner keeps only
# the labels present in both.
ret = pd.merge(left, right, left_index=True, right_index=True, how='inner')
#     a   b   c   d
# 1  a2  b2  c1  d1
# 2  a3  b3  c2  d2
print(ret)

# outer keeps the union of the labels, padding with NaN.
ret = pd.merge(left, right, left_index=True, right_index=True, how='outer')
#      a    b    c    d
# 0   a1   b1  NaN  NaN
# 1   a2   b2   c1   d1
# 2   a3   b3   c2   d2
# 3  NaN  NaN   c3   d3
print(ret)
5) suffixes
如果两个 DataFrame 的标签一样,可以使用 suffixes 来解决 overlapping 的问题
#    k  age
# 0  K0    1
# 1  K1    2
# 2  K2    3
left = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
#    k  age
# 0  K0    4
# 1  K0    5
# 2  K3    6
right = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})

# Both tables have an 'age' column; the suffixes disambiguate the
# overlapping names in the merged result.
ret = pd.merge(left, right, on='k', suffixes=['_left', '_right'], how='inner')
#     k  age_left  age_right
# 0  K0         1          4
# 1  K0         1          5
print(ret)