vectorized code 带来的好处。
"""Benchmark three ways of computing a k-NN distance matrix on MNIST.

The script times a double-loop, a single-loop and a fully vectorized
computation of the (num_test x num_train) Euclidean distance matrix, then
uses the vectorized result for a K-nearest-neighbour classification.
"""
import time

import numpy as np

num_train = 10000
num_test = 500
K = 10


def dists_two_loops(X_test, X_train):
    """Return the Euclidean distance matrix using two explicit loops.

    Args:
        X_test: array-like of shape (num_test, num_features).
        X_train: array-like of shape (num_train, num_features).

    Returns:
        Array of shape (num_test, num_train); entry [i, j] is the L2
        distance between X_test[i] and X_train[j].
    """
    X_test = np.asarray(X_test, dtype=float)
    X_train = np.asarray(X_train, dtype=float)
    n_test, n_train = X_test.shape[0], X_train.shape[0]
    dists = np.zeros((n_test, n_train))
    for i in range(n_test):
        for j in range(n_train):
            diff = X_test[i, :] - X_train[j, :]
            # Square first, then sum: sqrt(square(sum(diff))) would only be
            # |sum(diff)|, which is not a distance.
            dists[i, j] = np.sqrt(np.sum(np.square(diff)))
    return dists


def dists_one_loop(X_test, X_train):
    """Return the Euclidean distance matrix using one loop over test rows.

    Same arguments and result as ``dists_two_loops``; each test row is
    broadcast against the whole training matrix.
    """
    X_test = np.asarray(X_test, dtype=float)
    X_train = np.asarray(X_train, dtype=float)
    n_test, n_train = X_test.shape[0], X_train.shape[0]
    dists = np.zeros((n_test, n_train))
    for i in range(n_test):
        diff = X_test[i, :] - X_train
        dists[i, :] = np.sqrt(np.sum(np.square(diff), axis=1))
    return dists


def dists_vectorized(X_test, X_train):
    """Return the Euclidean distance matrix without any Python loop.

    Uses the expansion ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * a.b so the
    whole matrix comes from one matrix product plus broadcasting.
    Same arguments and result as ``dists_two_loops``.
    """
    X_test = np.asarray(X_test, dtype=float)
    X_train = np.asarray(X_train, dtype=float)
    test_sq = np.sum(np.square(X_test), axis=1)
    train_sq = np.sum(np.square(X_train), axis=1)
    cross = np.dot(X_test, X_train.T)
    sq_dists = test_sq[:, np.newaxis] + train_sq[np.newaxis, :] - 2 * cross
    # Rounding can push a true zero slightly below 0, which would make
    # sqrt return NaN; clamp before taking the root.
    return np.sqrt(np.maximum(sq_dists, 0))


def knn_predict(dists, Y_train, k):
    """Predict a label per test row by majority vote of the k nearest.

    Args:
        dists: array-like of shape (num_test, num_train) of distances.
        Y_train: array-like of num_train non-negative integer-valued labels.
        k: number of neighbours that vote.

    Returns:
        Integer array of length num_test. Ties go to the smallest label,
        because ``np.argmax`` returns the first maximum of the bincount.
    """
    dists = np.asarray(dists)
    Y_train = np.asarray(Y_train)
    # Slice the sorted indices before indexing the labels so only a
    # (num_test x k) label matrix is materialised.
    nearest = np.argsort(dists, axis=1)[:, :k]
    votes = Y_train[nearest].astype(int)
    return np.array([np.argmax(np.bincount(row)) for row in votes])


def load_mnist():
    """Fetch MNIST through whichever loader this scikit-learn provides.

    ``fetch_mldata`` was removed in scikit-learn 0.22; on versions without
    it, fall back to ``fetch_openml`` (``as_frame=False`` keeps the data as
    NumPy arrays).
    """
    try:
        from sklearn.datasets import fetch_mldata
    except ImportError:
        from sklearn.datasets import fetch_openml
        return fetch_openml('mnist_784', version=1, as_frame=False)
    return fetch_mldata('MNIST original')


def main():
    """Run the timing comparison and the k-NN accuracy check."""
    # Imported here so the NumPy-only helpers above can be imported without
    # matplotlib installed.
    import matplotlib.pyplot as plt

    mnist = load_mnist()

    X = mnist.data.astype(float)
    Y = mnist.target.astype(float)

    # Random split: the first num_train shuffled samples train, the next
    # num_test samples test.
    mask = np.random.permutation(X.shape[0])
    train_idx = mask[:num_train]
    test_idx = mask[num_train:num_train + num_test]

    X_train = X[train_idx]
    Y_train = Y[train_idx]

    # Centre with the training mean only, then scale by the pixel range.
    X_mean = np.mean(X_train, axis=0)
    X_train = (X_train - X_mean) / 255

    X_test = (X[test_idx] - X_mean) / 255
    Y_test = Y[test_idx]

    print('X_train', X_train.shape)
    print('Y_train', Y_train.shape)
    print('X_test', X_test.shape)
    print('Y_test', Y_test.shape)

    # Undo the normalisation to view one sample; round before the uint8
    # cast so values such as 254.999 are not truncated to 254.
    ex_image = np.rint(X_train[10, :] * 255 + X_mean).reshape(28, 28)
    ex_image = ex_image.astype(np.uint8)
    # NOTE: outside a notebook, plt.show() is presumably needed to display it.
    plt.imshow(ex_image, interpolation='nearest')

    # **Computing the distance matrix (num_test x num_train)**

    # Version 1 (Naive implementation using two for loops)
    start = time.time()
    dists_1 = dists_two_loops(X_test, X_train)
    time_taken = time.time() - start
    print('Time taken with two for loops: {}s'.format(time_taken))

    # Version 2 (Somewhat better implementation using one for loop)
    start = time.time()
    dists_2 = dists_one_loop(X_test, X_train)
    time_taken = time.time() - start
    print('Time taken with just one for loop: {}s'.format(time_taken))

    # Version 3 (Fully vectorized implementation with no for loop)
    start = time.time()
    dists_3 = dists_vectorized(X_test, X_train)
    time_taken = time.time() - start
    print('Time taken with no for loops: {}s'.format(time_taken))

    Y_pred = knn_predict(dists_3, Y_train, K)

    accuracy = np.count_nonzero(Y_pred == Y_test) / float(num_test)
    print('Prediction accuracy: {}%'.format(accuracy * 100))


if __name__ == "__main__":
    main()