一、Series的创建:
pd.Series([ 数据 ])
In [17]: import pandas as pd In [18]: import numpy as np In [19]: s = pd.Series([1,1,1,1,np.nan]) In [20]: s Out[20]: 0 1.0 1 1.0 2 1.0 3 1.0 4 NaN dtype: float64
二、生成DataFrame
1,Numpy 产生随机数组
In [17]: np.random.rand(5,5) # 生成5 x 5 的数组 Out[17]: array([[ 0.67935636, 0.75784959, 0.85428253, 0.73356 , 0.60555467], [ 0.93576592, 0.81445114, 0.18213442, 0.4784346 , 0.14720462], [ 0.57083505, 0.62618339, 0.13534874, 0.19513107, 0.7190744 ], [ 0.66931535, 0.50888897, 0.00685189, 0.16140523, 0.68407209], [ 0.91081342, 0.67702016, 0.32823171, 0.43670926, 0.98735408]])
2,Pandas 生成连续日期
In [18]: pd.date_range('20180101',periods=6) Out[18]: DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', '2018-01-05', '2018-01-06'], dtype='datetime64[ns]', freq='D')
3,生成带index和columns的DataFrame
In [19]: df = pd.DataFrame(np.random.rand(6,4),index=pd.date_range('20180101',periods=6),columns=['a','b','c','d']) # 第一个参数为数据,第二个参数index为索引,第三个参数columns为列名 In [20]: df Out[20]: a b c d 2018-01-01 0.202113 0.205094 0.456936 0.535537 2018-01-02 0.912747 0.812827 0.856495 0.872259 2018-01-03 0.303067 0.832261 0.279915 0.297952 2018-01-04 0.480393 0.315161 0.333675 0.072642 2018-01-05 0.965324 0.561682 0.565189 0.503561 2018-01-06 0.959792 0.227326 0.970319 0.757595
4,Pandas 生成二维数组和一维数组
In [6]: arr = np.arange(12) # 一维数组 In [7]: arr Out[7]: array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
In [9]: arr = np.arange(12).reshape(3,4) # 二维数组
In [10]: arr
Out[10]:
array([[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]])
5,生成一个没有定义index和column的DataFrame (如果没有定义,index和column则为数字)
In [11]: df = pd.DataFrame(arr) # 直接将二维数组传入即可 In [12]: df Out[12]: 0 1 2 3 0 0 1 2 3 1 4 5 6 7 2 8 9 10 11
6,利用字典生成DataFrame
In [13]: arr = {"a":[1]*3,"b":[2]*3,"c":[3]*3} # 定义字典 In [14]: arr Out[14]: {'a': [1, 1, 1], 'b': [2, 2, 2], 'c': [3, 3, 3]}
#生成DataFrame
In [16]: df = pd.DataFrame(arr) In [17]: In [17]: df Out[17]: a b c 0 1 2 3 1 1 2 3 2 1 2 3
三、DataFrame的基本操作
1,取某一列
In [20]: df['a'] Out[20]: 0 1 1 1 2 1 Name: a, dtype: int64
2,查看数组类型dtypes
In [21]: df.dtypes Out[21]: a int64 b int64 c int64 dtype: object
3,查看索引index
In [23]: df.index
Out[23]: RangeIndex(start=0, stop=3, step=1)
4,查看列columns
In [24]: df.columns Out[24]: Index([u'a', u'b', u'c'], dtype='object')
5,查看值values
In [25]: df.values Out[25]: array([[1, 2, 3], [1, 2, 3], [1, 2, 3]], dtype=int64)
6,查看数据的总结describe
In [32]: df.describe() Out[32]: a b c count 3.0 3.0 3.0 mean 1.0 2.0 3.0 std 0.0 0.0 0.0 min 1.0 2.0 3.0 25% 1.0 2.0 3.0 50% 1.0 2.0 3.0 75% 1.0 2.0 3.0 max 1.0 2.0 3.0
7,翻转数据 transpose、T
In [37]: df.transpose() Out[37]: 0 1 2 a 1 1 1 b 2 2 2 c 3 3 3 In [38]: df.T Out[38]: 0 1 2 a 1 1 1 b 2 2 2 c 3 3 3
四,对DataFrame的索引(index)进行排序
df.sort_index(axis=0,ascending=True) #axis为0代表对行排序1代表对列。ascending为True代表正序,False代表反序
# 行,正序
In [43]: df.sort_index(axis=0,ascending=True) Out[43]: a b c 0 1 2 3 1 1 2 3 2 1 2 3
# 行,反序
In [44]: df.sort_index(axis=0,ascending=False) Out[44]: a b c 2 1 2 3 1 1 2 3 0 1 2 3
#列,正序
In [46]: df.sort_index(axis=1,ascending=True) Out[46]: a b c 0 1 2 3 1 1 2 3 2 1 2 3
#列,反序
In [45]: df.sort_index(axis=1,ascending=False) Out[45]: c b a 0 3 2 1 1 3 2 1 2 3 2 1
五,对DataFrame的值(values)进行排序
# F列,正序
In [205]: df.sort_values(by='F',ascending=True) # by 是根据某列排序,ascending代表的是正序或者反序 Out[205]: A B C D E F 2013-01-01 0 1 2 3 NaN 11 2013-01-02 0 0 0 0 NaN 22 2013-01-03 0 0 0 0 NaN 33 2013-01-04 0 0 0 0 NaN 44 2013-01-05 0 0 0 0 NaN 55 2013-01-06 0 0 0 0 NaN 66
# F列,反序 In [206]: df.sort_values(by='F',ascending=False) Out[206]: A B C D E F 2013-01-06 0 0 0 0 NaN 66 2013-01-05 0 0 0 0 NaN 55 2013-01-04 0 0 0 0 NaN 44 2013-01-03 0 0 0 0 NaN 33 2013-01-02 0 0 0 0 NaN 22 2013-01-01 0 1 2 3 NaN 11
六,对DataFrame数据进行选择
1,简单筛选 df.A = df['A']
# 生成一个DataFrame
In [58]: dates = pd.date_range('20130101', periods=6) ...: df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates, columns=['A','B','C','D']) ...:
In [63]: df.A Out[63]: 2013-01-01 0 2013-01-02 4 2013-01-03 8 2013-01-04 12 2013-01-05 16 2013-01-06 20 Freq: D, Name: A, dtype: int32 In [64]: df['A'] Out[64]: 2013-01-01 0 2013-01-02 4 2013-01-03 8 2013-01-04 12 2013-01-05 16 2013-01-06 20 Freq: D, Name: A, dtype: int32
2,按索引或者索引下标进行筛选
In [60]: df[0:3] # 按照索引进行筛选 Out[60]: A B C D 2013-01-01 0 1 2 3 2013-01-02 4 5 6 7 2013-01-03 8 9 10 11 In [61]: df['20130101':'20130102'] # 按照索引下标值进行筛选 Out[61]: A B C D 2013-01-01 0 1 2 3 2013-01-02 4 5 6 7
3,使用loc进行标签筛选 df.loc['行',[列]]
列的格式为list,其中 : 代表全部。行只能筛选一行,列能筛选多列
# 只有行
In [66]: df.loc['20130102'] # 只有一个参数,默认是对行 Out[66]: A 4 B 5 C 6 D 7 Name: 2013-01-02 00:00:00, dtype: int32
# 只有列
In [71]: df.loc[:,['A']] Out[71]: A 2013-01-01 0 2013-01-02 4 2013-01-03 8 2013-01-04 12 2013-01-05 16 2013-01-06 20
# 全部行,特定列
In [72]: df.loc[:,['A','B','C']] Out[72]: A B C 2013-01-01 0 1 2 2013-01-02 4 5 6 2013-01-03 8 9 10 2013-01-04 12 13 14 2013-01-05 16 17 18 2013-01-06 20 21 22
# 全部列,特定行(只能筛选一行)
In [81]: df.loc['20130101',:] Out[81]: A 0 B 1 C 2 D 3 Name: 2013-01-01 00:00:00, dtype: int32
# 特定列,特定行
In [82]: df.loc['20130101',['A','B','C']] Out[82]: A 0 B 1 C 2 Name: 2013-01-01 00:00:00, dtype: int32
4,使用iloc进行位置筛选 df.iloc[[行],[列]]
行和列的格式为list,其中 : 代表全部。这里的取值并不是list中的范围,例如 [0,5] 并不代表第0条到第5条之间的所有行,而是只代表第0条和第5条
In [90]: df.iloc[[0,1],[0,3]] Out[90]: A D 2013-01-01 0 3 2013-01-02 4 7 In [91]: df.iloc[[1],[0,3]] Out[91]: A D 2013-01-02 4 7 In [92]: df.iloc[[2,4],[0,3]] Out[92]: A D 2013-01-03 8 11 2013-01-05 16 19
5,使用ix进行混合选择
# 前三行,A,B列
In [111]: df.ix[:3,['A','B']] Out[111]: A B 2013-01-01 0 1 2013-01-02 4 5 2013-01-03 8 9
# 前三行,前两列
In [112]: df.ix[:3,:2] Out[112]: A B 2013-01-01 0 1 2013-01-02 4 5 2013-01-03 8 9
# 20130101 行,A,B列
In [113]: df.ix['20130101',['A','B']] Out[113]: A 0 B 1 Name: 2013-01-01 00:00:00, dtype: int32
总结 : 使用loc以及,ix对DataFrame的行和列的操作的时候,如果对行的筛选条件为名称筛选,那么只能筛选一行
例如:
df.loc['20130101',['A','B','C']]
df.ix['20130101',['A','B']]
6,使用判断进行筛选
In [125]: df[df['A']>8] # A列的值大于8 Out[125]: A B C D 2013-01-04 12 13 14 15 2013-01-05 16 17 18 19 2013-01-06 20 21 22 23 In [126]: df[df.iloc[:]>8] # 所有的值大于8 Out[126]: A B C D 2013-01-01 NaN NaN NaN NaN 2013-01-02 NaN NaN NaN NaN 2013-01-03 NaN 9.0 10.0 11.0 2013-01-04 12.0 13.0 14.0 15.0 2013-01-05 16.0 17.0 18.0 19.0 2013-01-06 20.0 21.0 22.0 23.0
七,为DataFrame设置新的值
1,iloc定位赋值
In [136]: df # 先查看df Out[136]: A B C D 2013-01-01 0 1 2 3 2013-01-02 4 5 6 7 2013-01-03 8 9 10 11 2013-01-04 12 13 14 15 2013-01-05 16 17 18 19 2013-01-06 20 21 22 23 In [137]: df.iloc[0,0] # 定位第0行第0个元素 Out[137]: 0 In [138]: df.iloc[0,0] = 10 # 将第0行第0个元素的值赋为10 In [139]: df # 查看结果 Out[139]: A B C D 2013-01-01 10 1 2 3 2013-01-02 4 5 6 7 2013-01-03 8 9 10 11 2013-01-04 12 13 14 15 2013-01-05 16 17 18 19 2013-01-06 20 21 22 23
2,loc定位赋值
In [140]: df.loc['20130101','A'] # 定位行为'20130101',列为'A'的元素 Out[140]: 10 In [141]: df.loc['20130101','A'] = 0 # 将值赋为0 In [142]: df # 查看结果 Out[142]: A B C D 2013-01-01 0 1 2 3 2013-01-02 4 5 6 7 2013-01-03 8 9 10 11 2013-01-04 12 13 14 15 2013-01-05 16 17 18 19 2013-01-06 20 21 22 23
3,使用判断赋值
In [156]: df[df.B>4]=0 # B列的值大于4,就将值赋为0 In [157]: df Out[157]: A B C D 2013-01-01 0 1 2 3 2013-01-02 0 0 0 0 2013-01-03 0 0 0 0 2013-01-04 0 0 0 0 2013-01-05 0 0 0 0 2013-01-06 0 0 0 0
4,新增一列,并赋值为NaN
In [159]: df['E']=np.nan # 新增一列 E ,并且赋值 In [160]: df Out[160]: A B C D E 2013-01-01 0 1 2 3 NaN 2013-01-02 0 0 0 0 NaN 2013-01-03 0 0 0 0 NaN 2013-01-04 0 0 0 0 NaN 2013-01-05 0 0 0 0 NaN 2013-01-06 0 0 0 0 NaN
5,新增一列,并赋值
In [163]: df['F'] = pd.Series([11,22,33,44,55,66,], index=pd.date_range('20130101',periods=6)) In [164]: df Out[164]: A B C D E F 2013-01-01 0 1 2 3 NaN 11 2013-01-02 0 0 0 0 NaN 22 2013-01-03 0 0 0 0 NaN 33 2013-01-04 0 0 0 0 NaN 44 2013-01-05 0 0 0 0 NaN 55 2013-01-06 0 0 0 0 NaN 66
八,DataFrame处理NaN
1,直接删除含有NaN的行或列
In [181]: df Out[181]: A B C D E F 2013-01-01 0 1 2 3 NaN 11 2013-01-02 0 0 0 0 NaN 22 2013-01-03 0 0 0 0 NaN 33 2013-01-04 0 0 0 0 NaN 44 2013-01-05 0 0 0 0 NaN 55 2013-01-06 0 0 0 0 NaN 66 In [182]: df.dropna(axis=0,how='any') #0: 对行进行操作; 1: 对列进行操作
how='any' ,'any': 只要存在 NaN 就 drop 掉; 'all': 必须全部是 NaN 才 drop 。不会对原DataFrame操作,会返回一个新的
DataFrame
Out[182]: Empty DataFrame Columns: [A, B, C, D, E, F] Index: [] # 返回的结果
2,将值为NaN的值替换为指定的值
In [186]: df.fillna(value=10) # 当值为NaN时,将值设置为10 Out[186]: A B C D E F 2013-01-01 0 1 2 3 10.0 11 2013-01-02 0 0 0 0 10.0 22 2013-01-03 0 0 0 0 10.0 33 2013-01-04 0 0 0 0 10.0 44 2013-01-05 0 0 0 0 10.0 55 2013-01-06 0 0 0 0 10.0 66
3,返回布尔值(为NaN则返回True,反之返回False)
In [187]: df.isnull() Out[187]: A B C D E F 2013-01-01 False False False False True False 2013-01-02 False False False False True False 2013-01-03 False False False False True False 2013-01-04 False False False False True False 2013-01-05 False False False False True False 2013-01-06 False False False False True False
4,检测数据中是否存在NaN
In [193]: np.any(df.isnull()) == True
Out[193]: True
九,文件的读取和操作
1,读取Excel或者csv
pd.read_csv('test.csv',encoding='utf-8') pd.read_excel('test.xlsx',encoding='utf-8')
2,将结果写入pickle
data.to_pickle('test.pickle')
十,读取文件拓展(写为函数)
1,读取文件
def get_dataframe_by_file(abs_filename, skiprows=0, nrows=None):
    """Read a CSV or Excel file into a pandas DataFrame.

    Args:
        abs_filename: path to a ``.csv``, ``.xlsx`` or ``.xls`` file.
        skiprows: number of leading rows to skip (default 0).
        nrows: number of rows to read; ``None`` reads the whole file.

    Returns:
        pandas.DataFrame with the file's contents.

    Raises:
        ValueError: if the file extension is not one of the supported types.
    """
    ext = os.path.splitext(abs_filename)[1].lower()
    if ext == '.csv':
        try:
            df = pd.read_csv(abs_filename, encoding='utf8',
                             skiprows=skiprows, nrows=nrows)
        except UnicodeDecodeError:
            # Fall back to gb18030 for legacy Chinese-encoded CSV files.
            df = pd.read_csv(abs_filename, encoding='gb18030',
                             skiprows=skiprows, nrows=nrows)
    elif ext in ('.xlsx', '.xls'):
        df = pd.read_excel(abs_filename, skiprows=skiprows, nrows=nrows)
    else:
        # ValueError is more specific than a bare Exception and is still a
        # subclass of Exception, so existing callers keep working.
        raise ValueError('unsupported file extension: %s' % ext)
    return df
2,将数据转为utf-8
def convert_csv_to_utf8(csv_file):
    """Re-encode a gb18030 CSV file to UTF-8.

    The converted copy is written to ``<DATA_FILE_PATH>/csvdata_utf8/`` with
    the same base name as the source file.

    Args:
        csv_file: path to the source CSV (assumed to be gb18030-encoded).

    Returns:
        Path of the newly written UTF-8 copy.
    """
    # NOTE(review): DATA_FILE_PATH is a module-level constant defined
    # elsewhere in this file/project.
    out_dir = os.path.join(DATA_FILE_PATH, 'csvdata_utf8')
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    new_name = os.path.join(out_dir, os.path.basename(csv_file))
    # The original used the Python-2-only `file()` builtin and never closed
    # the handles; use context-managed open() so this works on Python 3 and
    # the files are always closed.
    with open(csv_file, 'rb') as src:
        text = src.read().decode('gb18030')
    with open(new_name, 'wb') as dst:
        dst.write(text.encode('utf-8'))
    return new_name
十一,Pandas DataFrame合并
1,concat合并行
In [207]: df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
...: df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
...: df3 = pd.DataFrame(np.ones((3,4))*2, columns=['a','b','c','d'])
...:
In [208]:
In [208]:
In [208]: df1
Out[208]:
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
In [209]: df2
Out[209]:
a b c d
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
In [210]: df3
Out[210]:
a b c d
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
In [211]: re = pd.concat([df1,df2,df3],axis=0) # pd.concat([DataFrame列表],axis=0) axis=0代表行合并,1代表列合并。合并后index是重复的,可使用ignore_index = True让索引不重复pd.concat([df1, df2, df3], axis=0, ignore_index=True)
In [212]: re Out[212]: a b c d 0 0.0 0.0 0.0 0.0 1 0.0 0.0 0.0 0.0 2 0.0 0.0 0.0 0.0 0 1.0 1.0 1.0 1.0 1 1.0 1.0 1.0 1.0 2 1.0 1.0 1.0 1.0 0 2.0 2.0 2.0 2.0 1 2.0 2.0 2.0 2.0 2 2.0 2.0 2.0 2.0 In [213]:
2,concat合并列
In [213]: re = pd.concat([df1,df2,df3],axis=1) In [214]: re Out[214]: a b c d a b c d a b c d 0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0 1 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0 2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
3,concat join='outer' 合并(相同的columns或者index进行合并,独自的index或者columns自成行或列。没有值的为NaN)
# 先准备两个df
In [227]: df1 Out[227]: a b c d 1 0.0 0.0 0.0 0.0 2 0.0 0.0 0.0 0.0 3 0.0 0.0 0.0 0.0 In [228]: df2 Out[228]: b c d e 2 1.0 1.0 1.0 1.0 3 1.0 1.0 1.0 1.0 4 1.0 1.0 1.0 1.0
# join = 'outer',axis = 0 合并 (可以理解为按照行取并集合并)
In [231]: re = pd.concat([df1,df2],axis=0,join='outer',ignore_index=True) In [232]: re Out[232]: a b c d e 0 0.0 0.0 0.0 0.0 NaN 1 0.0 0.0 0.0 0.0 NaN 2 0.0 0.0 0.0 0.0 NaN 3 NaN 1.0 1.0 1.0 1.0 4 NaN 1.0 1.0 1.0 1.0 5 NaN 1.0 1.0 1.0 1.0
# join = 'outer',axis = 1 合并 (可以理解为按照行索引取并集进行列合并)
In [233]: re = pd.concat([df1,df2],axis=1,join='outer',ignore_index=True) In [234]: re Out[234]: 0 1 2 3 4 5 6 7 1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN 2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 4 NaN NaN NaN NaN 1.0 1.0 1.0 1.0
4,concat join='inner' 合并(只保留相同的columns或者index,各自独有的index或者columns会被丢弃)
# 准备两个df
In [240]: df1 Out[240]: a b c d 1 0.0 0.0 0.0 0.0 2 0.0 0.0 0.0 0.0 3 0.0 0.0 0.0 0.0 In [241]: df2 Out[241]: b c d e 2 1.0 1.0 1.0 1.0 3 1.0 1.0 1.0 1.0 4 1.0 1.0 1.0 1.0
# join = 'inner', axis = 0 合并(相当于行取交集进行合并)
In [235]: re = pd.concat([df1,df2],axis=0,join='inner',ignore_index=True) In [236]: re Out[236]: b c d 0 0.0 0.0 0.0 1 0.0 0.0 0.0 2 0.0 0.0 0.0 3 1.0 1.0 1.0 4 1.0 1.0 1.0 5 1.0 1.0 1.0
# join = 'inner', axis = 1 合并(相当于列取交集进行合并)
In [237]: re = pd.concat([df1,df2],axis=1,join='inner',ignore_index=True) In [238]: re Out[238]: 0 1 2 3 4 5 6 7 2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
5,concat join_axes合并(可以按照某个DataFrame的index或者columns进行合并)
# 准备两个df
In [253]: df1 Out[253]: a b c d 1 0.0 0.0 0.0 0.0 2 0.0 0.0 0.0 0.0 3 0.0 0.0 0.0 0.0 In [254]: df2 Out[254]: b c d e 2 1.0 1.0 1.0 1.0 3 1.0 1.0 1.0 1.0 4 1.0 1.0 1.0 1.0
# 按照df1的index进行合并
In [255]: res = pd.concat([df1, df2], axis=1, join_axes=[df1.index]) # 这里需要注意,join_axes是要进行合并的某个df的index或者columns,axis=1 代表行。这里需要同步,既,后面为index,axis为1,后面为columns,axis为0 In [256]: res Out[256]: a b c d b c d e 1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN 2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
# 按照df2的columns进行合并
In [257]: res = pd.concat([df1, df2], axis=0, join_axes=[df1.columns]) In [258]: res Out[258]: a b c d 1 0.0 0.0 0.0 0.0 2 0.0 0.0 0.0 0.0 3 0.0 0.0 0.0 0.0 2 NaN 1.0 1.0 1.0 3 NaN 1.0 1.0 1.0 4 NaN 1.0 1.0 1.0
6,append追加合并(追加只能行合并,不能列合并)
# 创建两个df
In [263]: df1 Out[263]: a b c d 0 0.0 0.0 0.0 0.0 1 0.0 0.0 0.0 0.0 2 0.0 0.0 0.0 0.0 In [264]: df2 Out[264]: a b c d 0 1.0 1.0 1.0 1.0 1 1.0 1.0 1.0 1.0 2 1.0 1.0 1.0 1.0
# 进行append操作
In [265]: res = df1.append(df2, ignore_index=True) In [266]: res Out[266]: a b c d 0 0.0 0.0 0.0 0.0 1 0.0 0.0 0.0 0.0 2 0.0 0.0 0.0 0.0 3 1.0 1.0 1.0 1.0 4 1.0 1.0 1.0 1.0 5 1.0 1.0 1.0 1.0