pandas
a.基本使用
#结构化的数据分析 pip3 install jupyter pip3 install pandas #import pandas as pd #import numpy as np #事例一: #s = pd.Series([1,3,5,np.NaN,8,4]) #s Out[5]: 0 1.0 1 3.0 2 5.0 3 NaN 4 8.0 5 4.0 dtype: float64 #事例二: #创建日期序列,里面有6个日期 #datas = pd.date_range('20170821',periods=6) #datas DatetimeIndex(['2017-08-21', '2017-08-22', '2017-08-23', '2017-08-24','2017-08-25', '2017-08-26'], dtype='datetime64[ns]', freq='D') #创建二维数组 #创建6行4列的随机数,索引datas,列ABCD #data = pd.DataFrame(np.random.randn(6,4),index=datas,columns=list('ABCD')) #data A B C D 2017-08-21 -0.245344 0.260401 -2.003621 0.427142 2017-08-22 -2.773848 1.604729 -0.711769 -0.677211 2017-08-23 0.396086 -0.731911 -0.100242 0.966344 2017-08-24 0.761821 -0.159621 -1.172729 -1.317056 2017-08-25 0.186582 0.739702 -1.688458 0.480121 2017-08-26 -0.519489 -0.002741 0.875164 -0.783657 #data.values array([[ -2.45343824e-01, 2.60401419e-01, -2.00362100e+00, 4.27142120e-01], [ -2.77384841e+00, 1.60472878e+00, -7.11768546e-01, -6.77211441e-01], [ 3.96086166e-01, -7.31910686e-01, -1.00241967e-01, 9.66344486e-01], [ 7.61820910e-01, -1.59621471e-01, -1.17272904e+00, -1.31705593e+00], [ 1.86582291e-01, 7.39702155e-01, -1.68845777e+00, 4.80121151e-01], [ -5.19488872e-01, -2.74128435e-03, 8.75164076e-01, -7.83656946e-01]]) #data.head(2) #看前两行的数据 A B C D 2017-08-21 -0.245344 0.260401 -2.003621 0.427142 2017-08-22 -2.773848 1.604729 -0.711769 -0.677211 #data.T #行列转置 2017-08-21 2017-08-22 2017-08-23 2017-08-24 2017-08-25 2017-08-26 A -0.245344 -2.773848 0.396086 0.761821 0.186582 -0.519489 B 0.260401 1.604729 -0.731911 -0.159621 0.739702 -0.002741 C -2.003621 -0.711769 -0.100242 -1.172729 -1.688458 0.875164 D 0.427142 -0.677211 0.966344 -1.317056 0.480121 -0.783657 #data.sort_index(axis=1) #列正序 A B C D 2017-08-21 -0.245344 0.260401 -2.003621 0.427142 2017-08-22 -2.773848 1.604729 -0.711769 -0.677211 2017-08-23 0.396086 -0.731911 -0.100242 0.966344 2017-08-24 0.761821 -0.159621 -1.172729 -1.317056 2017-08-25 0.186582 0.739702 -1.688458 0.480121 2017-08-26 -0.519489 -0.002741 0.875164 -0.783657 #data.sort_index(axis=1,ascending=False) #列倒序 D C B A 2017-08-21 0.427142 -2.003621 0.260401 -0.245344 2017-08-22 -0.677211 -0.711769 1.604729 -2.773848 2017-08-23 0.966344 -0.100242 -0.731911 0.396086 2017-08-24 -1.317056 -1.172729 -0.159621 0.761821 2017-08-25 0.480121 -1.688458 0.739702 0.186582 2017-08-26 -0.783657 0.875164 -0.002741 -0.519489 #data.sort_index(axis=0,ascending=False) #行倒序 A B C D 2017-08-26 -0.519489 -0.002741 0.875164 -0.783657 2017-08-25 0.186582 0.739702 -1.688458 0.480121 2017-08-24 0.761821 -0.159621 -1.172729 -1.317056 2017-08-23 0.396086 -0.731911 -0.100242 0.966344 2017-08-22 -2.773848 1.604729 -0.711769 -0.677211 2017-08-21 -0.245344 0.260401 -2.003621 0.427142 #data.sort_values(by='A') # A列排序 A B C D 2017-08-22 -2.773848 1.604729 -0.711769 -0.677211 2017-08-26 -0.519489 -0.002741 0.875164 -0.783657 2017-08-21 -0.245344 0.260401 -2.003621 0.427142 2017-08-25 0.186582 0.739702 -1.688458 0.480121 2017-08-23 0.396086 -0.731911 -0.100242 0.966344 2017-08-24 0.761821 -0.159621 -1.172729 -1.317056 #data['A'] 或 data.A #数据选择 2017-08-21 -0.245344 2017-08-22 -2.773848 2017-08-23 0.396086 2017-08-24 0.761821 2017-08-25 0.186582 2017-08-26 -0.519489 data.loc['2017-08-21':'2017-08-23'] #选择哪几行 A B C D 2017-08-21 -0.245344 0.260401 -2.003621 0.427142 2017-08-22 -2.773848 1.604729 -0.711769 -0.677211 2017-08-23 0.396086 -0.731911 -0.100242 0.966344 #data.iloc[2:4] #行 A B C D 2017-08-23 0.396086 -0.731911 -0.100242 0.966344 2017-08-24 0.761821 -0.159621 -1.172729 -1.317056 #data.loc[:,['B','C']] #取出列 B C 2017-08-21 0.260401 -2.003621 2017-08-22 1.604729 -0.711769 2017-08-23 -0.731911 -0.100242 2017-08-24 -0.159621 -1.172729 2017-08-25 0.739702 -1.688458 2017-08-26 -0.002741 0.875164 #data.loc['2017-08-21':'2017-08-23',['B','C']] #指定行和列 B C 2017-08-21 0.260401 -2.003621 2017-08-22 1.604729 -0.711769 2017-08-23 -0.731911 -0.100242 #data.loc['2017-08-21','B'] #访问特定的值 0.26040141861580018 #data.at[pd.Timestamp('2017-08-21'),'B'] #比上面效率高 #0.26040141861580018 #data.A = range(6) #修改列数据 #data A B C D 2017-08-21 0 0.260401 -2.003621 0.427142 2017-08-22 1 1.604729 -0.711769 -0.677211 2017-08-23 2 -0.731911 -0.100242 0.966344 2017-08-24 3 -0.159621 -1.172729 -1.317056 2017-08-25 4 0.739702 -1.688458 0.480121 2017-08-26 5 -0.002741 0.875164 -0.783657 #data.iloc[:,2:5] = 1000 #修改某几列 #data A B C D 2017-08-21 0 0.260401 1000 1000 2017-08-22 1 1.604729 1000 1000 2017-08-23 2 -0.731911 1000 1000 2017-08-24 3 -0.159621 1000 1000 2017-08-25 4 0.739702 1000 1000 2017-08-26 5 -0.002741 1000 1000 #事例三: #d = {'A':1,'B':pd.Timestamp("20160821"),'C':list(range(4)),'D':np.arange(4)} #d {'A': 1, 'B': Timestamp('2016-08-21 00:00:00'), 'C': [0, 1, 2, 3], 'D': array([0, 1, 2, 3])} df = pd.DataFrame(d) #df A B C D 0 1 2016-08-21 0 0 1 1 2016-08-21 1 1 2 1 2016-08-21 2 2 3 1 2016-08-21 3 3 #df.dtypes A int64 B datetime64[ns] C int64 D int64
b. DataFrame 创建
#创建方式 #pd.DataFrame({'one':[1,2,3,4],'two':[4,3,2,1]}) one two 0 1 4 1 2 3 2 3 2 3 4 1 #pd.DataFrame({'one':[1,2,3],'two':[3,2,1]},index=list('abc')) one two a 1 3 b 2 2 c 3 1 #pd.DataFrame({'one': pd.Series([1,2,3],index=['a','b','c']),'two' :pd.Series([1,2,3,4],index=['b','a','c','d'])}) one two a 1.0 2 b 2.0 1 c 3.0 3 d NaN 4
c. xx
#obj = pd.read_csv('601318.csv') #从文件中导入 #obj.to_csv('new_obj.csv') #导出 #obj.index #获取行索引 #obj.columns #获取列索引 #obj.values #返回所有的值 #obj.rename(columns={'close':'new_close'}) #给列改名称 # loc #obj.loc[:,['close','open']] #所有行的两列 #obj.loc[0:10,['close','open']] #前10行的两列 0可以省略 # iloc 只能位置(下标) #obj.iloc[0:10,0:3] #前10行的前3列 #obj[obj['open']<20] #open小于20的
import pandas as pd import numpy as np import matplotlib.pyplot as plt #资本显示 #dates = pd.date_range('20160821',periods=6) #df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD')) #df A B C D 2016-08-21 0.395062 -0.713272 0.560768 1.175623 2016-08-22 0.487270 0.541010 -1.451927 -0.374084 2016-08-23 1.190535 -0.017564 1.065334 -0.716946 2016-08-24 -0.231604 0.890002 -2.085083 -0.421781 2016-08-25 -0.282863 0.039613 0.900477 -1.048180 2016-08-26 0.238394 -0.086092 0.002821 1.893919 # 增加一列 E #df1 = df.reindex(index=dates[0:4],columns=list(df.columns) + ['E']) #df1 A B C D E 2016-08-21 0.395062 -0.713272 0.560768 1.175623 NaN 2016-08-22 0.487270 0.541010 -1.451927 -0.374084 NaN 2016-08-23 1.190535 -0.017564 1.065334 -0.716946 NaN 2016-08-24 -0.231604 0.890002 -2.085083 -0.421781 NaN # 修改列值 #df1.loc[dates[1:3],'E'] = 2 #df1 A B C D E 2016-08-21 0.395062 -0.713272 0.560768 1.175623 NaN 2016-08-22 0.487270 0.541010 -1.451927 -0.374084 2.0 2016-08-23 1.190535 -0.017564 1.065334 -0.716946 2.0 2016-08-24 -0.231604 0.890002 -2.085083 -0.421781 NaN #有空数据行的丢掉 #df1.dropna() A B C D E 2016-08-22 0.487270 0.541010 -1.451927 -0.374084 2.0 2016-08-23 1.190535 -0.017564 1.065334 -0.716946 2.0 # 有空的值用默认值替换 #df1.fillna(value=5) A B C D E 2016-08-21 0.395062 -0.713272 0.560768 1.175623 5.0 2016-08-22 0.487270 0.541010 -1.451927 -0.374084 2.0 2016-08-23 1.190535 -0.017564 1.065334 -0.716946 2.0 2016-08-24 -0.231604 0.890002 -2.085083 -0.421781 5.0 # 判断是否有空数据 #pd.isnull(df1) A B C D E 2016-08-21 False False False False True 2016-08-22 False False False False False 2016-08-23 False False False False False 2016-08-24 False False False False True # 求行平均值,空数据不参与计算 #df1.mean() A 0.460316 B 0.175044 C -0.477727 D -0.084297 E 2.000000 dtype: float64 # 求列平均值 #df1.mean(axis=1) A 0.460316 B 0.175044 C -0.477727 D -0.084297 E 2.000000 dtype: float64 # 累加 (把列传给参数处理) #df.apply(np.cumsum) A B C D 2016-08-21 -2.251855 0.967517 0.045508 1.011237 2016-08-22 -0.159759 -1.011472 0.642207 1.451304 2016-08-23 0.595429 -1.164885 -1.323172 2.214757 2016-08-24 0.582950 0.299636 1.421000 2.018076 2016-08-25 0.624217 0.838486 1.840071 1.241466 2016-08-26 0.102015 -0.251384 0.422820 0.166844 #每列的最大值 减去最小值 #df.apply(lambda x: x.max() - x.min()) A 4.343951 B 3.443510 C 4.709552 D 2.085859 dtype: float64 #追加一行 #df.append(s,ignore_index=True) ####-------------- concat()--------------- #df = pd.DataFrame(np.random.randn(10,4),columns=list('ABCD')) #df A B C D 0 0.448883 0.667777 1.974642 -0.598013 1 -0.384818 -1.616549 1.551960 0.201467 2 -1.428748 0.122964 0.837514 1.102314 3 0.634624 0.553064 -0.248767 0.019210 4 2.518095 0.240933 -0.722795 -0.815324 5 0.517742 -2.444094 1.270380 -0.160528 6 -0.800297 -1.242557 -1.118255 0.616456 7 0.953936 0.318387 0.151009 0.340203 8 0.558308 -1.344539 0.846960 1.264978 9 1.549733 1.496383 0.208102 1.265871 #df.iloc[:3] # 取前3行 #df.iloc[3:7] # 3-7行 #df.iloc[7:] # 7-最后一行 #df1 = pd.concat([df.iloc[:3],df.iloc[3:7],df.iloc[7:]]) #等于上面3行 #df1 A B C D 0 -0.151648 -0.077165 1.130773 1.587466 1 -0.854935 0.490470 -0.438102 -0.066202 2 -0.622838 -1.524100 -0.995986 -1.378272 3 -0.718499 0.184935 -1.164265 1.127623 4 2.180069 0.799651 1.630780 -1.592882 5 -0.590875 -0.824192 -0.109713 -1.502221 6 0.437851 2.268239 0.305557 -0.515982 7 -1.084059 -0.325458 1.279105 -0.404447 8 0.134603 0.012700 0.935611 1.017555 9 0.521873 0.630876 -1.714364 -1.243454 #判断 df1 和 df 是否相等 #df == df1 A B C D 0 True True True True 1 True True True True 2 True True True True 3 True True True True 4 True True True True 5 True True True True 6 True True True True 7 True True True True 8 True True True True 9 True True True True #(df == df1).all() A True B True C True D True #(df == df1).all().all() True ###------------- merge() ---------- #left = pd.DataFrame({'key':['foo','foo'],'rval':[1,2]}) #right = pd.DataFrame({'key':['foo','foo'],'rval':[4,5]}) #left key rval 0 foo 1 1 foo 2 #right key rval 0 foo 4 1 foo 5 #按照key关联 #pd.merge(left,right,on='key') key rval_x rval_y 0 foo 1 4 1 foo 1 5 2 foo 2 4 3 foo 2 5 ###------------- groupby() ----------- #df = pd.DataFrame({'A' : ['foo','bar','foo','bar'], 'B' : ['one','two','one','two'], 'C': np.random.randn(4), 'D': np.random.randn(4) }) #df A B C D 0 foo one 1.261140 1.109300 1 bar two -2.072375 -0.533420 2 foo one 1.470090 0.243500 3 bar two -0.243230 -0.721201 #分组 #df.groupby('A').sum() A C D bar -2.315605 -1.254621 foo 2.731229 1.352800 #df.groupby(['A','B']).sum() A B C D bar two -1.039401 -0.338988 foo one -1.004454 1.169985 -------------------------序列--------------------- #s = pd.Series(np.random.randint(10,20,size=20)) #s 0 13 1 16 2 11 3 13 4 17 5 18 6 12 7 11 8 13 9 12 10 18 11 15 12 19 13 10 14 16 15 10 16 16 17 17 18 10 19 13 # 每个数字产生的次数 #s.value_counts() 15 5 18 3 13 3 17 2 12 2 11 2 19 1 16 1 10 1 # 产生最多的数 #s.mode() 0 15 ---------------------------------------------------------- # 创建序列 包含空值 #s = pd.Series([1,3,5,np.nan,6,8],index=dates) #s 2016-08-21 1.0 2016-08-22 3.0 2016-08-23 5.0 2016-08-24 NaN 2016-08-25 6.0 2016-08-26 8.0 Freq: D, dtype: float64 In [12]: #s = pd.Series([1,3,5,np.nan,6,8],index=dates).shift(2) #s 2016-08-21 NaN 2016-08-22 NaN 2016-08-23 1.0 2016-08-24 3.0 2016-08-25 5.0 2016-08-26 NaN Freq: D, dtype: float64 #df A B C D 2016-08-21 -2.251855 0.967517 0.045508 1.011237 2016-08-22 2.092096 -1.978989 0.596700 0.440067 2016-08-23 0.755188 -0.153414 -1.965380 0.763453 2016-08-24 -0.012479 1.464521 2.744173 -0.196681 2016-08-25 0.041267 0.538850 0.419071 -0.776610 2016-08-26 -0.522202 -1.089871 -1.417252 -1.074622 # 二维 减去 序列 (空值不参与运算) #df.sub(s,axis='index') A B C D 2016-08-21 NaN NaN NaN NaN 2016-08-22 NaN NaN NaN NaN 2016-08-23 -0.244812 -1.153414 -2.965380 -0.236547 2016-08-24 -3.012479 -1.535479 -0.255827 -3.196681 2016-08-25 -4.958733 -4.461150 -4.580929 -5.776610 2016-08-26 NaN NaN NaN NaN
b. 数据整形
#数据整形 (把数据的行和列互换) %matplotlib inline import pandas as pd import numpy as np import matplotlib.pyplot as plt #行索引 #tuples = list(zip(*[['bar','bar','baz','baz','foo','foo','qux','qux'],['one','two','one','two','one','two','one','two']])) #tuples [('bar', 'one'), ('bar', 'two'), ('baz', 'one'), ('baz', 'two'), ('foo', 'one'), ('foo', 'two'), ('qux', 'one'), ('qux', 'two')] #index = pd.MultiIndex.from_tuples(tuples, names=['first','second']) #index MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']], labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]], names=['first', 'second']) #df = pd.DataFrame(np.random.randn(8,2),index=index,columns=['A','B']) #df A B first second bar one 1.109199 -0.482758 two 0.947951 1.284311 baz one -0.733705 -0.088907 two 1.146346 -0.946909 foo one 0.254840 1.868951 two 1.052604 -0.684604 qux one 0.133846 0.065647 two -1.137040 1.010329 #把列索引变成行索引 #stacked = df.stack() #stacked first second bar one A 1.109199 B -0.482758 two A 0.947951 B 1.284311 baz one A -0.733705 B -0.088907 two A 1.146346 B -0.946909 foo one A 0.254840 B 1.868951 two A 1.052604 B -0.684604 qux one A 0.133846 B 0.065647 two A -1.137040 B 1.010329