    - KNN
    - cs231n
    - 机器学习
    date: 2019年9月16日 17:03:13



    k-Nearest Neighbor (kNN) exercise

    Complete and hand in this completed worksheet (including its outputs and any supporting code outside of the worksheet) with your assignment submission. For more details see the assignments page on the course website.

    The kNN classifier consists of two stages:

    • During training, the classifier takes the training data and simply remembers it
    • During testing, kNN classifies every test image by comparing to all training images and transferring the labels of the k most similar training examples
    • The value of k is cross-validated

    In this exercise you will implement these steps and understand the basic Image Classification pipeline, cross-validation, and gain proficiency in writing efficient, vectorized code.


    # Load the raw CIFAR-10 data from the course dataset directory.
    cifar10_dir = 'cs231n/datasets/cifar-10-batches-py'
    X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)
    # As a sanity check, we print out the size of the training and test data.
    # Each print takes one pre-formatted string in parentheses, so the text is
    # the same under the Python 2 print statement and the Python 3 print
    # function (the two spaces after the colon match the original output).
    print('Training data shape:  %s' % (X_train.shape,))
    print('Training labels shape:  %s' % (y_train.shape,))
    print('Test data shape:  %s' % (X_test.shape,))
    print('Test labels shape:  %s' % (y_test.shape,))
    Training data shape:  (50000, 32, 32, 3)
    Training labels shape:  (50000,)
    Test data shape:  (10000, 32, 32, 3)
    Test labels shape:  (10000,)


    # Subsample the data: keep only the first num_training training examples
    # and the first num_test test examples.
    num_training = 5000
    mask = np.arange(num_training)
    X_train, y_train = X_train[mask], y_train[mask]
    num_test = 500
    mask = np.arange(num_test)
    X_test, y_test = X_test[mask], y_test[mask]


    # Flatten every image into a single row vector: the first axis (one entry
    # per example) is kept and all remaining axes are collapsed by the -1.
    X_train = np.reshape(X_train, (X_train.shape[0], -1))
    X_test = np.reshape(X_test, (X_test.shape[0], -1))
    # One pre-formatted string keeps the output identical on Python 2 and 3.
    print('%s %s' % (X_train.shape, X_test.shape))
    (5000, 3072) (500, 3072)


     for i in xrange(num_test):
          for j in xrange(num_train):
            # TODO:                                                             #
            # Compute the l2 distance between the ith test point and the jth    #
            # training point, and store the result in dists[i, j]. You should   #
            # not use a loop over dimension.                                    #
            # pass
            dists[i][j] = np.sqrt(np.sum(np.square(X[i] - self.X_train[j])))
            #                       END OF YOUR CODE                            #
        return dists


    for i in xrange(num_test):
          # TODO:                                                               #
          # Compute the l2 distance between the ith test point and all training #
          # points, and store the result in dists[i, :].                        #
          # pass
          dists[i] = np.sqrt(np.sum(np.square(self.X_train - X[i]), axis = 1))
          #                         END OF YOUR CODE                            #
        return dists

    这里利用平方距离的展开式 ‖x−y‖² = ‖x‖² − 2x·y + ‖y‖²。如果测试集X是MxD,训练集self.X_train是NxD,那么交叉项d1是MxN;d2.shape=(N,),可以认为是N维行向量(每个训练样本的平方和);d3经过reshape之后是Mx1的列向量(每个测试样本的平方和)。三者的形状彼此兼容,所以可以直接相加,利用的是NumPy的广播机制。

        # TODO:                                                                 #
        # Compute the l2 distance between all test points and all training      #
        # points without using any explicit loops, and store the result in      #
        # dists.                                                                #
        #                                                                       #
        # You should implement this function using only basic array operations; #
        # in particular you should not use functions from scipy.                #
        #                                                                       #
        # HINT: Try to formulate the l2 distance using matrix multiplication    #
        #       and two broadcast sums.                                         #
        # pass
        d1 = -2*np.dot(X, self.X_train.T)
        d2 = np.sum(np.square(self.X_train), axis=1)
        d3 = np.sum(np.square(X), axis=1)
        d3 = d3.reshape(d3.shape[0],1)
        dists = np.sqrt(d1+d2+d3)
        #dists = np.sqrt(-2*np.dot(X, self.X_train.T) + np.sum(np.square(self.X_train), axis = 1) + np.transpose([np.sum(np.square(X), axis = 1)]))
        #                         END OF YOUR CODE                              #
        return dists


    def predict_labels(self, dists, k=1):
        """
        Given a matrix of distances between test points and training points,
        predict a label for each test point.

        Inputs:
        - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
          gives the distance between the ith test point and the jth training point.
        - k: The number of nearest neighbors that vote for each predicted label.

        Returns:
        - y: A numpy array of shape (num_test,) containing predicted labels for the
          test data, where y[i] is the predicted label for the test point X[i].
        """
        num_test = dists.shape[0]
        y_pred = np.zeros(num_test)
        # range (not xrange) keeps the loop valid on both Python 2 and Python 3.
        for i in range(num_test):
            # np.argsort returns the indices that sort its input in ascending
            # order, e.g. np.argsort([4, 2, 5, 1]) -> [3, 1, 0, 2]. The first k
            # indices of row i are therefore the k nearest training points, and
            # indexing self.y_train with them yields those neighbors' labels.
            closest_y = self.y_train[np.argsort(dists[i])[:k]]

            # np.bincount counts how often each non-negative integer occurs,
            # e.g. np.bincount([0, 1, 1, 3, 2, 1, 7]) -> [1, 3, 1, 1, 0, 0, 0, 1].
            # np.argmax returns the first index holding the maximum count, so a
            # tie between labels is broken in favor of the smaller label.
            y_pred[i] = np.argmax(np.bincount(closest_y))
        return y_pred





    我们可以通过classifier.predict_labels(dists, k=1)来从dists里提取最相近的图像的分类,并计算识别率。

    # Predict a label for every test point from the precomputed distance
    # matrix, using only the single nearest neighbor.
    y_test_pred = classifier.predict_labels(dists, k=1)
    # Compute and print the fraction of correctly predicted examples
    num_correct = np.sum(y_test_pred == y_test)
    accuracy = float(num_correct) / num_test
    # Parenthesized single-argument print: valid on both Python 2 and Python 3.
    print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))
    out: Got 137 / 500 correct => accuracy: 0.274000



    Two loop version took 29.503677 seconds
    One loop version took 155.006175 seconds
    No loop version took 0.291267 seconds



    在实际训练中,训练结果对训练集的拟合程度通常都比较好(对初始条件敏感),但对训练集之外的数据的拟合程度往往就不那么令人满意了。因此我们通常不会把所有数据都拿来训练,而是分出一部分(这一部分不参加训练)用来检验由训练集得到的参数,从而相对客观地判断这些参数对训练集之外数据的符合程度。把训练数据分成若干份(fold),轮流取其中一份做验证、其余各份做训练,再综合各次的结果,这种做法就称为交叉验证(Cross Validation)。



    # TODO: Split up the training data into folds. After splitting,
    # X_train_folds and y_train_folds should each be lists of length num_folds,
    # where y_train_folds[i] is the label vector for the points in
    # X_train_folds[i]. Hint: Look up the numpy array_split function.
    #
    # Reshape y_train into a column vector first, so that every label fold is a
    # 2-D column with one row per example.
    y_train_ = np.reshape(y_train, (-1, 1))
    X_train_folds = np.array_split(X_train, num_folds)
    y_train_folds = np.array_split(y_train_, num_folds)
    # END OF YOUR CODE

    使用k_to_accuracies = {}存储运算结果,k_to_accuracies是一个字典类型,其中k_to_accuracies[i]存储一个长度为num_folds的list,表示k=i时的交叉验证精度。

    # pass
    # Make sure every candidate k has a list to collect its per-fold accuracies.
    for k_ in k_choices:
        k_to_accuracies.setdefault(k_, [])
    for i in range(num_folds):
        # Hold out fold i for validation and train on all the other folds.
        classifier = KNearestNeighbor()
        X_val_train = np.vstack(X_train_folds[0:i] + X_train_folds[i+1:])
        y_val_train = np.vstack(y_train_folds[0:i] + y_train_folds[i+1:])
        # The label folds are column vectors; take column 0 to get a 1-D vector.
        y_val_train = y_val_train[:,0]
        classifier.train(X_val_train, y_val_train)
        # The held-out labels do not depend on k, so slice them once per fold
        # instead of once per (fold, k) pair.
        y_val_true = y_train_folds[i][:,0]
        for k_ in k_choices:
            y_val_pred = classifier.predict(X_train_folds[i], k=k_)
            num_correct = np.sum(y_val_pred == y_val_true)
            accuracy = float(num_correct) / len(y_val_pred)
            # Append in place rather than rebuilding the list by concatenation.
            k_to_accuracies[k_].append(accuracy)
    #                                 END OF YOUR CODE                             #


    k = 1, accuracy = 0.263000
    k = 1, accuracy = 0.257000
    k = 1, accuracy = 0.264000
    k = 1, accuracy = 0.278000
    k = 1, accuracy = 0.266000
    k = 3, accuracy = 0.239000
    k = 3, accuracy = 0.249000
    k = 3, accuracy = 0.240000
    k = 3, accuracy = 0.266000
    k = 3, accuracy = 0.254000
    k = 5, accuracy = 0.248000
    k = 5, accuracy = 0.266000
    k = 5, accuracy = 0.280000
    k = 5, accuracy = 0.292000
    k = 5, accuracy = 0.280000
    k = 8, accuracy = 0.262000
    k = 8, accuracy = 0.282000
    k = 8, accuracy = 0.273000
    k = 8, accuracy = 0.290000




    # Based on the cross-validation results above, choose the best value for k,
    # retrain the classifier using all the training data, and test it on the test
    # data. You should be able to get above 28% accuracy on the test data.
    best_k = 10
    classifier = KNearestNeighbor()
    classifier.train(X_train, y_train)
    y_test_pred = classifier.predict(X_test, k=best_k)
    # Compute and display the accuracy
    num_correct = np.sum(y_test_pred == y_test)
    accuracy = float(num_correct) / num_test
    # Parenthesized single-argument print: valid on both Python 2 and Python 3.
    print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))
    out: Got 141 / 500 correct => accuracy: 0.282000


    The End!

