一.Numpy/Scipy
# coding=utf-8
# NumPy basics: array creation, determinant, arange, element-wise arithmetic,
# axis sums and numpy.where.
# Single-argument print(...) is used so the snippet runs on Python 2 and 3.
import numpy
import scipy

# 3x4 array filled with ones.
x = numpy.ones((3, 4))
print(x)
# Output:
# [[ 1.  1.  1.  1.]
#  [ 1.  1.  1.  1.]
#  [ 1.  1.  1.  1.]]

# 2x2 array built from nested lists.
y = numpy.array([[1, 2], [3, 4]])
print(y)
# Output:
# [[1 2]
#  [3 4]]

# Determinant: 1 * 4 - 2 * 3.
print(numpy.linalg.det(y))  # -2.0 (up to floating-point rounding)

# arange accepts a fractional step; the stop value 5 is excluded.
print(numpy.arange(1, 5, 0.5))  # [ 1.   1.5  2.   2.5  3.   3.5  4.   4.5]

a = numpy.array([[5, 5, 5], [5, 5, 5]])
b = numpy.array([[2, 2, 2], [2, 2, 2]])
# `*` multiplies element by element (it is not a matrix product).
print(a * b)
# Output:
# [[10 10 10]
#  [10 10 10]]

print(a.sum())  # 30
print(a.sum(axis=0))  # [10 10 10]
print(a.sum(axis=1))  # [15 15]

a = numpy.array([1, 3, 5])
b = numpy.array([2, 4, 6])
c = numpy.array([7, 8, 9])
# numpy.where is the vectorized form of the ternary expression
# `x if condition else y`. Here `a > 2` is [False, True, True], so the result
# takes c[0], b[1] and b[2].
print(numpy.where(a > 2, b, c))  # [7 4 6]
# coding=utf-8
# Build a multiplication table with numpy.fromfunction, then show the
# element-wise add/multiply ufuncs.
# Single-argument print(...) is used so the snippet runs on Python 2 and 3.
import numpy
import scipy


def fun(x, y):
    """Return ``(x + 1) * (y + 1)``.

    Below it is handed to ``numpy.fromfunction``, so the result at position
    (row, col) of the generated array is ``(row + 1) * (col + 1)``.
    """
    return (x + 1) * (y + 1)


# 9x9 multiplication table.
a = numpy.fromfunction(fun, (9, 9))
print(a)
# Output:
# [[  1.   2.   3.   4.   5.   6.   7.   8.   9.]
#  [  2.   4.   6.   8.  10.  12.  14.  16.  18.]
#  [  3.   6.   9.  12.  15.  18.  21.  24.  27.]
#  [  4.   8.  12.  16.  20.  24.  28.  32.  36.]
#  [  5.  10.  15.  20.  25.  30.  35.  40.  45.]
#  [  6.  12.  18.  24.  30.  36.  42.  48.  54.]
#  [  7.  14.  21.  28.  35.  42.  49.  56.  63.]
#  [  8.  16.  24.  32.  40.  48.  56.  64.  72.]
#  [  9.  18.  27.  36.  45.  54.  63.  72.  81.]]

a = numpy.array([[1, 2, 3]])
b = numpy.array([[3, 4, 5]])
print(numpy.add(a, b))  # [[4 6 8]]
print(numpy.multiply(a, b))  # [[ 3  8 15]]
# coding=utf-8
# k-means demo: cluster two synthetic 2-D point clouds and plot the result.
# Only the two names this snippet uses are imported, instead of the wildcard
# `from scipy.cluster.vq import *`.
from scipy.cluster.vq import kmeans, vq
import numpy as np
import matplotlib.pyplot as plt

# Two groups of 100 points each. randn draws from the standard normal
# distribution; the first group is scaled by 1.5, the second is shifted
# by (8, 8).
class1 = 1.5 * np.random.randn(100, 2)
class2 = np.random.randn(100, 2) + np.array([8, 8])

# Stack both groups into one (200, 2) feature matrix and ask for 2 clusters.
features = np.vstack((class1, class2))
centroids, variance = kmeans(features, 2)
# vq assigns every sample to a centroid; `code` holds that index per sample.
code, distance = vq(features, centroids)

plt.figure()
# Samples assigned to centroid 1 are drawn as stars.
ndx = np.where(code == 1)[0]
plt.plot(features[ndx, 0], features[ndx, 1], '*')
# Samples assigned to centroid 0 are drawn as red dots.
ndx = np.where(code == 0)[0]
plt.plot(features[ndx, 0], features[ndx, 1], 'r.')
# Centroids are drawn as green circles.
plt.plot(centroids[:, 0], centroids[:, 1], 'go')
plt.axis('off')
plt.show()
二.Pandas
# coding=utf-8
# pandas basics: Series construction and alignment, DataFrame selection.
# Single-argument print(...) is used so the snippet runs on Python 2 and 3.
from pandas import Series
import pandas as pd

a = Series([3, 5, 7], index=['a', 'b', 'c'])
print(a['a'])  # 3

# Building a Series from a dict with an explicit index keeps only the listed
# labels; a label missing from the dict ('d') becomes NaN.
data = {'a': 1, 'b': 2, 'c': 3}
sindex = ['a', 'b', 'd']
Ser = Series(data, index=sindex)
print(Ser)
# Output:
# a     1
# b     2
# d   NaN
# dtype: float64

print(Ser.isnull())
# Output:
# a    False
# b    False
# d     True
# dtype: bool

print(a)
# Output:
# a    3
# b    5
# c    7
# dtype: int64

# Arithmetic aligns on index labels; labels present on one side only
# ('c' and 'd') yield NaN.
b = {'a': 2, 'b': 3, 'd': 5}
print(Series(a) + Series(b))
# Output:
# a     5
# b     8
# c   NaN
# d   NaN
# dtype: float64

data = {'Name': ['a', 'b', 'c'], 'Num': [1, 2, 3]}
a = pd.DataFrame(data)
# Contents of `a`:
#   Name  Num
# 0    a    1
# 1    b    2
# 2    c    3

# Two equivalent ways to select a column.
print(a['Name'])
print(a.Name)
# Output (same for both):
# 0    a
# 1    b
# 2    c
# Name: Name, dtype: object

# Two ways to select the first two rows: a slice and a boolean mask.
print(a[0:2])
print(a[a.index < 2])
# Output (same for both):
#   Name  Num
# 0    a    1
# 1    b    2

# Row with index label 1. `.loc` replaces the removed `.ix` indexer; on this
# integer index `.ix[1]` was a label lookup too.
print(a.loc[1])
# Output:
# Name    b
# Num     2

# Drop a column in place.
del a['Name']
print(a)
# Output:
#    Num
# 0    1
# 1    2
# 2    3
三.Matplotlib
# coding=utf-8
# Load one year of daily quotes for ticker 'AXP' into a DataFrame and walk
# through selection, sorting, grouping, concat and merge.
# Single-argument print(...) is used so the snippet runs on Python 2 and 3.
# The outputs in the comments are the ones recorded by the original author;
# live data will differ.
from datetime import date

import numpy
import pandas as pd
# NOTE(review): matplotlib.finance was deprecated in matplotlib 2.0 and later
# removed (split out as the separate mpl_finance package). This import only
# works on old matplotlib releases -- confirm the target environment.
from matplotlib.finance import quotes_historical_yahoo

today = date.today()
start = (today.year - 1, today.month, today.day)
quote = quotes_historical_yahoo('AXP', start, today)
fields = ['date', 'open', 'close', 'high', 'low', 'volume']
# First version: rows labelled 1..N.
df = pd.DataFrame(quote, index=range(1, len(quote) + 1), columns=fields)
print(df.head(10))  # use df.tail(10) for the last ten rows
# Output:
#       date       open      close       high        low   volume
# 1   735663  79.412407  79.097240  79.924559  78.851015  6530200
# 2   735666  78.939660  79.294224  79.609392  78.742676  5846100
# 3   735667  78.526003  77.915364  78.565393  77.678990  7525000
# 4   735668  78.200986  78.250226  78.545699  77.915364  4546200
# 5   735669  78.890411  80.328364  80.761722  78.821469  9386600
# 6   735670  80.269272  79.382861  80.417009  78.742676  6919400
# 7   735673  79.658632  80.269272  80.417009  79.530598  5295100
# 8   735674  79.973799  79.835914  79.983650  79.333620  4258200
# 9   735675  79.363166  80.623836  81.076889  79.057843  6449400
# 10  735676  80.604134  80.308669  80.722325  79.717731  4677000

# Second version: the first field of each quote is an ordinal day number;
# turn it into a 'yy-mm-dd' string and use it as the row label.
list1 = [date.fromordinal(int(row[0])).strftime('%y-%m-%d') for row in quote]
df = pd.DataFrame(quote, index=list1, columns=fields)
df = df.drop(['date'], axis=1)
print(df)
# Output:
#                open      close       high        low   volume
# 15-03-06  79.412407  79.097240  79.924559  78.851015  6530200
# 15-03-09  78.939660  79.294224  79.609392  78.742676  5846100
# 15-03-10  78.526003  77.915364  78.565393  77.678990  7525000
# 15-03-11  78.200986  78.250226  78.545699  77.915364  4546200
# 15-03-12  78.890411  80.328364  80.761722  78.821469  9386600
# 15-03-13  80.269272  79.382861  80.417009  78.742676  6919400
# 15-03-16  79.658632  80.269272  80.417009  79.530598  5295100
# 15-03-17  79.973799  79.835914  79.983650  79.333620  4258200
# 15-04-01  77.128302  77.997907  78.383301  76.930659  6163500
# 15-04-02  77.997907  78.758811  78.827984  77.642158  5695200
# ........
# 16-03-04  58.439999  58.290001  58.650002  57.810001  5407400
#
# [252 rows x 5 columns]

# Label-based selection: all rows, two columns.
print(df.loc[:, ['open', 'close']])
# Output:
#                open      close
# 15-03-09  78.939660  79.294224
# 15-03-24  80.801118  80.141238
# 15-04-16  78.492002  79.954528
# 15-04-17  77.474167  76.406919
# 15-04-20  76.476093  76.317986
# ...             ...        ...
# 16-03-03  57.160000  58.090000
# 16-03-04  58.439999  58.290001
#
# [251 rows x 2 columns]

# Label slices include both end points.
print(df.loc['15-03-09':'15-03-20', ['open']])
# Output:
#                open
# 15-03-09  78.939660
# 15-03-10  78.526003
# 15-03-11  78.200986
# 15-03-12  78.890411
# 15-03-13  80.269272
# 15-03-16  79.658632
# 15-03-17  79.973799
# 15-03-18  79.363166
# 15-03-19  80.604134
# 15-03-20  80.554894

# Position-based selection: rows 1..9, column 1 ('close').
print(df.iloc[1:10, 1])
# Output:
# 15-03-10  77.915364
# 15-03-11  78.250226
# 15-03-12  80.328364
# 15-03-13  79.382861
# 15-03-16  80.269272
# 15-03-17  79.835914
# 15-03-18  80.623836
# 15-03-19  80.308669
# 15-03-20  81.451148

# Scalar access by label (at) and by position (iat).
print(df.at['15-03-09', 'open'])  # 78.9396603442
print(df.iat[0, 0])  # 78.9396603442
# Number of days that closed above their open.
print(len(df[df.close > df.open]))  # 120

# sort_values replaces the removed DataFrame.sort(columns=...).
print(df.sort_values(by='open'))
# Output:
#                open      close       high        low   volume
# 16-02-11  51.220001  51.110001  51.590000  50.270000  9142900
# 16-02-12  51.880001  52.660000  52.730000  51.639999  6083400
# 16-02-09  52.259998  52.630001  53.020000  51.910000  8455800
# 16-03-04  58.439999  58.290001  58.650002  57.810001  5407400
# ...             ...        ...        ...        ...      ...
# 15-03-24  80.801118  80.141238  80.919309  80.042742  5217800
# 15-03-23  81.451148  80.958698  82.278467  80.958698  7291700

print(df.sort_index())
# Output:
#                open      close       high        low   volume
# 15-03-09  78.939660  79.294224  79.609392  78.742676  5846100
# 15-03-27  77.216088  76.792580  77.452460  76.536504  9022600
# 15-03-30  77.166838  76.871373  77.560799  76.822125  7285200
# 15-03-31  76.970187  77.197475  77.602629  76.703380  5918300
# ...             ...        ...        ...        ...      ...
# 16-03-02  56.880001  57.119999  57.230000  56.570000  7264700
# 16-03-03  57.160000  58.090000  58.180000  57.160000  6841300
# 16-03-04  58.439999  58.290001  58.650002  57.810001  5407400

# numpy.diff returns the differences between adjacent elements, e.g.
# [1., 4., 7.] -> [3., 3.]; numpy.sign reduces each difference to -1, 0 or 1,
# i.e. whether the close fell, stayed flat or rose versus the previous row.
status = numpy.sign(numpy.diff(df.close))
print(status)
# Output:
# [-1. 1. 1. -1. 1. -1. 1. -1. 1. -1. -1. -1. -1. -1. 1. 1. 1. 1.
#  -1. -1. 1. 1. 1. -1. 1. 1. 1. -1. -1. 1. 1. -1. -1. -1. 0. -1.
#  1. 1. 1. -1. 1. 1. 1. 1. -1. 1. 1. -1. -1. 1. -1. -1. 1. -1.
#  -1. -1. -1. -1. 1. 1. -1. 1. -1. 1. 1. 1. -1. -1. 1. 1. 1. -1.
#  1. 1. -1. -1. 0. -1. 1. 1. -1. -1. -1. -1. 1. 1. 1. 1. -1. 1.
#  1. 1. -1. 1. -1. -1. -1. 1. 1. 1. -1. -1. -1. -1. -1. 1. 1. 1.
#  -1. -1. 1. 1. 1. -1. -1. -1. -1. -1. 1. 1. -1. 1. -1. 1. 1. -1.
#  1. -1. 1. 1. -1. 1. 1. -1. -1. 1. -1. -1. -1. 1. -1. -1. 1. -1.
#  1. 1. -1. 1. 0. 1. -1. -1. -1. 1. 1. -1. -1. -1. -1. 1. -1. -1.
#  1. -1. -1. 1. 1. 1. -1. 1. -1. -1. -1. -1. -1. 1. -1. 1. 1. -1.
#  -1. -1. 1. 1. -1. 1. -1. -1. 1. -1. -1. -1. 1. -1. -1. 1. 1. -1.
#  -1. 1. 1. 1. 1. -1. 1. -1. -1. -1. -1. -1. -1. -1. 1. 1. -1. 1.
#  -1. -1. 1. -1. -1. -1. 1. -1. -1. 1. 1. -1. 1. 1. -1. -1. 1. -1.
#  -1. 1. 1. 1. 1. 1. 1. -1. -1. 1. -1. 1. 1. 1. 1. 1.]

# Add a 'month' column: the middle part of each 'yy-mm-dd' row label.
month = [label.split('-')[1] for label in df.index]
df['month'] = month
print(df.groupby('month').count())
# Output:
# month  open  close  high  low  volume
# 01       19     19    19   19      19
# 02       20     20    20   20      20
# 03       21     21    21   21      21
# 04       21     21    21   21      21
# 05       20     20    20   20      20
# 06       22     22    22   22      22
# 07       22     22    22   22      22
# 08       21     21    21   21      21
# 09       21     21    21   21      21
# 10       22     22    22   22      22
# 11       20     20    20   20      20
# 12       22     22    22   22      22

print(df.groupby('month').sum().volume)
# Output:
# month
# 01  236344300
# 02  158919800
# 03  152726400
# 04  133853700
# 05  103420200
# 06  141794200
# 07  117895600
# 08  155122400
# 09  110385200
# 10  122095600
# 11  106839600
# 12  124219300

print(df.groupby('month').mean())
# Output:
# month       open      close       high        low           volume
# 01     61.257880  60.911867  61.901466  60.234361  12439173.684211
# 02     53.843500  53.965001  54.380000  53.313500   7945990.000000
# 03     75.035891  75.053161  75.592068  74.455664   7272685.714286
# 04     77.464754  77.617688  78.067080  77.056302   6373985.714286
# 05     78.718298  78.723239  79.189665  78.307704   5171010.000000
# 06     78.962402  78.858566  79.455639  78.498764   6445190.909091
# 07     76.915968  76.806860  77.310469  76.371782   5358890.909091
# 08     77.227940  77.476384  78.140478  76.495359   7386780.952381
# 09     74.696736  74.632971  75.198348  74.107744   5256438.095238
# 10     75.153440  75.341586  75.773063  74.656811   5549800.000000
# 11     72.393654  72.297575  72.752583  71.910767   5341980.000000
# 12     69.739317  69.640658  70.224466  69.159582   5646331.818182

print(df.groupby('month').min())
print(df.groupby('month').max())

# A small lookup table keyed by month, used to demonstrate concat and merge.
data = {'Name': ['a', 'b', 'c'], 'month': ['01', '02', '03']}
a = pd.DataFrame(data)
# concat stacks the rows of both frames; columns missing on one side are NaN.
print(pd.concat([df, a], ignore_index=True))
# Output:
#      Name      close       high        low  month       open   volume
# 0     NaN  79.294224  79.609392  78.742676     03  78.939660  5846100
# 1     NaN  77.915364  78.565393  77.678990     03  78.526003  7525000
# 2     NaN  78.250226  78.545699  77.915364     03  78.200986  4546200
# 3     NaN  80.328364  80.761722  78.821469     03  78.890411  9386600
# ..    ...        ...        ...        ...    ...        ...      ...
# 249   NaN  58.090000  58.180000  57.160000     03  57.160000  6841300
# 250   NaN  58.290001  58.650002  57.810001     03  58.439999  5407400
# 251     a        NaN        NaN        NaN     01        NaN      NaN
# 252     b        NaN        NaN        NaN     02        NaN      NaN
# 253     c        NaN        NaN        NaN     03        NaN      NaN

# merge joins on the shared 'month' column; in the recorded output only the
# months listed in `a` (01, 02, 03) appear.
print(pd.merge(df, a, on='month'))
# Output:
#          open      close       high        low    volume  month  Name
# 0   78.939660  79.294224  79.609392  78.742676   5846100     03     c
# 1   78.526003  77.915364  78.565393  77.678990   7525000     03     c
# 2   78.200986  78.250226  78.545699  77.915364   4546200     03     c
# 3   78.890411  80.328364  80.761722  78.821469   9386600     03     c
# 4   80.269272  79.382861  80.417009  78.742676   6919400     03     c
# 5   79.658632  80.269272  80.417009  79.530598   5295100     03     c
# 6   79.973799  79.835914  79.983650  79.333620   4258200     03     c
# 7   79.363166  80.623836  81.076889  79.057843   6449400     03     c
# 8   80.604134  80.308669  80.722325  79.717731   4677000     03     c
# 9   80.554894  81.451148  81.805713  80.269272   9338100     03     c
# 10  81.451148  80.958698  82.278467  80.958698   7291700     03     c
# 11  80.801118  80.141238  80.919309  80.042742   5217800     03     c
# 12  80.190478  78.900263  80.239726  78.821469   8908000     03     c
# 13  78.466905  77.294880  78.575245  76.713786  16181300     03     c
# 14  77.216088  76.792580  77.452460  76.536504   9022600     03     c
# 15  77.166838  76.871373  77.560799  76.822125   7285200     03     c
# 16  76.970187  77.197475  77.602629  76.703380   5918300     03     c
# 17  56.029999  56.799999  56.840000  55.619999   9147000     03     c
# 18  56.880001  57.119999  57.230000  56.570000   7264700     03     c
# 19  57.160000  58.090000  58.180000  57.160000   6841300     03     c
# 20  58.439999  58.290001  58.650002  57.810001   5407400     03     c
# 21  67.793285  67.295464  67.882897  66.479038   9248300     01     a
# 22  67.076429  66.260002  67.414943  65.383832  10809200     01     a
# 23  65.239998  64.419998  65.550003  64.239998   9752200     01     a
# 24  63.310001  63.840000  64.250000  63.080002  11323900     01     a
# 25  64.180000  63.630001  64.410004  63.570000  10003600     01     a
# 26  63.740002  64.050003  64.209999  63.099998   8157100     01     a
# 27  64.800003  64.400002  64.900002  63.599998   7560000     01     a
# 28  63.650002  62.849998  64.370003  62.230000  11291200     01     a
# 29  62.849998  63.290001  63.750000  62.369999   6664000     01     a
# 30  62.029999  62.910000  63.049999  61.500000   8643800     01     a
# 31  63.410000  62.639999  63.790001  62.240002   7336000     01     a
# 32  61.540001  63.029999  63.540001  61.290001   9026000     01     a
# 33  63.000000  62.639999  64.320000  62.509998   8832500     01     a
# 34  58.389999  55.060001  58.889999  54.139999  43731600     01     a
# 35  54.459999  55.020000  55.770000  54.139999  18498300     01     a
# 36  55.200001  55.090000  55.740002  54.959999  12834600     01     a
# 37  55.369999  54.520000  55.820000  54.419998  10852000     01     a
# 38  54.680000  52.880001  54.759998  52.150002  17859200     01     a
# 39  53.180000  53.500000  53.709999  53.049999  13920800     01     a
# 40  53.410000  54.700001  54.990002  53.000000   9860300     02     b
# 41  54.330002  53.660000  54.389999  53.490002  11664200     02     b
# 42  54.000000  54.110001  54.290001  52.830002   9728400     02     b
# 43  54.080002  54.380001  54.459999  53.810001   6870600     02     b
# 44  54.720001  53.980000  54.849998  53.810001   9091000     02     b
# 45  53.240002  52.400002  53.450001  52.230000  11815800     02     b
# 46  52.259998  52.630001  53.020000  51.910000   8455800     02     b
# 47  53.000000  52.290001  53.430000  52.279999   7040100     02     b
# 48  51.220001  51.110001  51.590000  50.270000   9142900     02     b
# 49  51.880001  52.660000  52.730000  51.639999   6083400     02     b
# 50  53.009998  53.180000  53.480000  52.730000   6945000     02     b
# 51  53.500000  53.610001  54.000000  53.299999   8610900     02     b
# 52  53.500000  54.150002  54.349998  53.450001   8502900     02     b
# 53  54.250000  54.709999  55.029999  54.029999   6549100     02     b
# 54  54.709999  55.630001  55.630001  54.709999   6535700     02     b
# 55  55.520000  55.110001  55.599998  54.959999   5866700     02     b
# 56  54.450001  54.639999  54.869999  53.560001   5585000     02     b
# 57  54.779999  55.389999  55.389999  54.299999   4379200     02     b
# 58  55.709999  55.380001  55.900002  55.150002   5885500     02     b
# 59  55.299999  55.580002  56.150002  54.810001  10307300     02     b
# Plot the monthly mean closing price plus a few matplotlib basics on a
# 2x2 grid of subplots.
# Relies on `df` (with its 'month' column) built by the preceding snippet.
import numpy
import matplotlib.pyplot as plt

# Mean closing price per month; the Series index holds the month labels.
closeMeansOK = df.groupby('month').mean().close
listOKIndex = closeMeansOK.index
# Take every value the Series has instead of assuming exactly 12 months and
# indexing a string-labelled Series by integer position.
listOK = closeMeansOK.tolist()

plt.figure(figsize=(8, 6), dpi=100)  # figure size and resolution
p1 = plt.subplot(221)
p2 = plt.subplot(222)
p3 = plt.subplot(223)
p4 = plt.subplot(224)

# Top-left: monthly means as red ('r') diamond ('D') markers.
p1.plot(listOKIndex, listOK, 'rD')

# Top-right: three curves in one call; the 'g--' (green, dashed) format
# string follows the last x/y pair, t ** 2.
t = numpy.arange(0, 4, 0.1)
p2.plot(t, t, t, t + 2, t, t ** 2, 'g--')

# Bottom-left: the same monthly means as a scatter of circles, with a title
# and axis labels.
p3.plot(listOKIndex, listOK, 'o')
p3.set_title("123")
p3.set_xlabel("x")
p3.set_ylabel('y')

# Bottom-right: five random integers in [1, 10], drawn as a red line over
# semi-transparent green bars.
data = numpy.random.randint(1, 11, 5)
x = numpy.arange(len(data))
p4.plot(x, data, color='r')
p4.bar(x, data, alpha=.5, color='g')

plt.show()