• Improving Deep Neural Networks: Week 1 Programming Assignment (Initialization, Regularization, Gradient Checking)


    Initialization

    Initializing the parameters with zeros, with random values, and with He initialization, and comparing the results, shows that He initialization yields the highest accuracy.
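    For reference, below is a minimal sketch of the other two initializers used in this comparison (the He version is shown in full further down). The factor of 10 on the random weights follows the assignment's convention and is an assumption here.

    import numpy as np

    def initialize_parameters_zeros(layers_dims):
        # All weights and biases start at zero, so every unit in a layer computes the same thing.
        parameters = {}
        L = len(layers_dims)
        for l in range(1, L):
            parameters['W' + str(l)] = np.zeros((layers_dims[l], layers_dims[l-1]))
            parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
        return parameters

    def initialize_parameters_random(layers_dims):
        # Large random weights (scaled by 10, per the assignment's convention) to show slow/poor convergence.
        np.random.seed(3)
        parameters = {}
        L = len(layers_dims)
        for l in range(1, L):
            parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l-1]) * 10
            parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
        return parameters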

    Data loading:

    import numpy as np
    import matplotlib.pyplot as plt
    import sklearn
    import sklearn.datasets
    from init_utils import sigmoid, relu, compute_loss, forward_propagation, backward_propagation
    from init_utils import update_parameters, predict, load_dataset, plot_decision_boundary, predict_dec
    from math import sqrt
    
    # %matplotlib inline
    plt.rcParams['figure.figsize'] = (7.0, 4.0) # set default size of plots
    plt.rcParams['image.interpolation'] = 'nearest'
    plt.rcParams['image.cmap'] = 'gray'
    
    # load image dataset: blue/red dots in circles
    train_X, train_Y, test_X, test_Y = load_dataset()

    The code using He initialization is as follows:

    # three layers
    def model(X, Y, learning_rate=0.01, num_iterations=15000, print_cost=True, initialization="he"):
        """
        Implements a three-layer neural network: LINEAR->RELU->LINEAR->RELU->LINEAR->SIGMOID.

        Arguments:
        X -- input data, of shape (2, number of examples)
        Y -- true "label" vector (containing 0 for red dots; 1 for blue dots), of shape (1, number of examples)
        learning_rate -- learning rate for gradient descent
        num_iterations -- number of iterations to run gradient descent
        print_cost -- if True, print the cost every 1000 iterations
        initialization -- flag to choose which initialization to use ("zeros","random" or "he")

        Returns:
        parameters -- parameters learnt by the model
        """

        grads = {}
        costs = []     # to keep track of the loss
        m = X.shape[1] # number of examples
        layers_dims = [X.shape[0], 10, 5, 1]

        # Initialize parameters dictionary.
        parameters = initialize_parameters_he(layers_dims)

        # Loop (gradient descent)
        for i in range(0, num_iterations):

            # Forward propagation: LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID.
            a3, cache = forward_propagation(X, parameters)

            # Loss
            cost = compute_loss(a3, Y)

            # Backward propagation.
            grads = backward_propagation(X, Y, cache)

            # Update parameters.
            parameters = update_parameters(parameters, grads, learning_rate)

            # Print the loss every 1000 iterations
            if print_cost and i % 1000 == 0:
                print("Cost after iteration {}: {}".format(i, cost))
                costs.append(cost)

        # plot the loss
        plt.plot(costs)
        plt.ylabel('cost')
        plt.xlabel('iterations (per thousands)')
        plt.title("Learning rate =" + str(learning_rate))
        plt.show()

        return parameters


    # GRADED FUNCTION: initialize_parameters_he
    def initialize_parameters_he(layers_dims):
        """
        Arguments:
        layers_dims -- python array (list) containing the size of each layer.

        Returns:
        parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
                        W1 -- weight matrix of shape (layers_dims[1], layers_dims[0])
                        b1 -- bias vector of shape (layers_dims[1], 1)
                        ...
                        WL -- weight matrix of shape (layers_dims[L], layers_dims[L-1])
                        bL -- bias vector of shape (layers_dims[L], 1)
        """

        np.random.seed(3)
        parameters = {}
        L = len(layers_dims) - 1 # integer representing the number of layers

        for l in range(1, L + 1):
            ### START CODE HERE ### (≈ 2 lines of code)
            parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l-1]) * sqrt(2. / layers_dims[l-1])
            parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
            ### END CODE HERE ###
        return parameters


    parameters = initialize_parameters_he([2, 4, 1])
    print("W1 = " + str(parameters["W1"]))
    print("b1 = " + str(parameters["b1"]))
    print("W2 = " + str(parameters["W2"]))
    print("b2 = " + str(parameters["b2"]))


    parameters = model(train_X, train_Y, initialization = "he")
    print("On the train set:")
    predictions_train = predict(train_X, train_Y, parameters)
    print("On the test set:")
    predictions_test = predict(test_X, test_Y, parameters)


    plt.title("Model with He initialization")
    axes = plt.gca()
    axes.set_xlim([-1.5, 1.5])
    axes.set_ylim([-1.5, 1.5])
    plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)

    Prediction accuracy on the test set: 0.96.
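    He initialization draws each weight from a zero-mean Gaussian whose variance is scaled by the layer's fan-in, which keeps the variance of the ReLU activations roughly constant from layer to layer. In the notation of the code above:

    \[ W^{[l]} \sim \mathcal{N}\!\left(0,\ \frac{2}{n^{[l-1]}}\right), \qquad b^{[l]} = 0 \]

    where n^{[l-1]} is the number of units in the previous layer (layers_dims[l-1] in the code).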

    L2 Regularization

    Data loading:

    from reg_utils import sigmoid, relu, plot_decision_boundary, initialize_parameters, load_2D_dataset, predict_dec
    from reg_utils import compute_cost, predict, forward_propagation, backward_propagation, update_parameters
    import scipy.io
    from testCases_v3 import *

    train_X, train_Y, test_X, test_Y = load_2D_dataset()

    Without regularization:

    def model(X, Y, learning_rate = 0.3, num_iterations = 30000, print_cost = True, lambd = 0, keep_prob = 1):
        """
        Implements a three-layer neural network: LINEAR->RELU->LINEAR->RELU->LINEAR->SIGMOID.

        Arguments:
        X -- input data, of shape (input size, number of examples)
        Y -- true "label" vector (1 for blue dot / 0 for red dot), of shape (output size, number of examples)
        learning_rate -- learning rate of the optimization
        num_iterations -- number of iterations of the optimization loop
        print_cost -- If True, print the cost every 10000 iterations
        lambd -- regularization hyperparameter, scalar
        keep_prob -- probability of keeping a neuron active during drop-out, scalar

        Returns:
        parameters -- parameters learned by the model. They can then be used to predict.
        """

        grads = {}
        costs = []                            # to keep track of the cost
        m = X.shape[1]                        # number of examples
        layers_dims = [X.shape[0], 20, 3, 1]

        # Initialize parameters dictionary.
        parameters = initialize_parameters(layers_dims)

        # Loop (gradient descent)
        for i in range(0, num_iterations):

            # Forward propagation: LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID.
            if keep_prob == 1:
                a3, cache = forward_propagation(X, parameters)
            elif keep_prob < 1:
                a3, cache = forward_propagation_with_dropout(X, parameters, keep_prob)

            # Cost function
            if lambd == 0:
                cost = compute_cost(a3, Y)
            else:
                cost = compute_cost_with_regularization(a3, Y, parameters, lambd)

            # Backward propagation.
            assert (lambd == 0 or keep_prob == 1)   # it is possible to use both L2 regularization and dropout,
                                                    # but this assignment will only explore one at a time
            if lambd == 0 and keep_prob == 1:
                grads = backward_propagation(X, Y, cache)
            elif lambd != 0:
                grads = backward_propagation_with_regularization(X, Y, cache, lambd)
            elif keep_prob < 1:
                grads = backward_propagation_with_dropout(X, Y, cache, keep_prob)

            # Update parameters.
            parameters = update_parameters(parameters, grads, learning_rate)

            # Print the loss every 10000 iterations
            if print_cost and i % 10000 == 0:
                print("Cost after iteration {}: {}".format(i, cost))
            if print_cost and i % 1000 == 0:
                costs.append(cost)

        # plot the cost
        plt.plot(costs)
        plt.ylabel('cost')
        plt.xlabel('iterations (x1,000)')
        plt.title("Learning rate =" + str(learning_rate))
        plt.show()

        return parameters


    parameters = model(train_X, train_Y)
    print("On the training set:")
    predictions_train = predict(train_X, train_Y, parameters)
    print("On the test set:")
    predictions_test = predict(test_X, test_Y, parameters)


    plt.title("Model without regularization")
    axes = plt.gca()
    axes.set_xlim([-0.75, 0.40])
    axes.set_ylim([-0.75, 0.65])
    plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)

    Prediction accuracy on the test set: 0.915.

    With L2 regularization:
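    The "formula (2)" referenced in the docstring below is the cross-entropy cost (what compute_cost returns) plus an L2 penalty on all weight matrices; biases are not penalized:

    \[ J_{regularized} = J_{cross\text{-}entropy} + \frac{\lambda}{2m}\sum_{l}\sum_{k}\sum_{j}\left(W_{k,j}^{[l]}\right)^{2} \]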

    # GRADED FUNCTION: compute_cost_with_regularization
    def compute_cost_with_regularization(A3, Y, parameters, lambd):
        """
        Implement the cost function with L2 regularization. See formula (2) above.

        Arguments:
        A3 -- post-activation, output of forward propagation, of shape (output size, number of examples)
        Y -- "true" labels vector, of shape (output size, number of examples)
        parameters -- python dictionary containing parameters of the model

        Returns:
        cost -- value of the regularized loss function (formula (2))
        """
        m = Y.shape[1]
        W1 = parameters["W1"]
        W2 = parameters["W2"]
        W3 = parameters["W3"]

        cross_entropy_cost = compute_cost(A3, Y) # This gives you the cross-entropy part of the cost

        ### START CODE HERE ### (approx. 1 line)
        L2_regularization_cost = lambd / (2 * m) * (np.sum(np.square(W1)) + np.sum(np.square(W2)) + np.sum(np.square(W3)))
        ### END CODE HERE ###

        cost = cross_entropy_cost + L2_regularization_cost
        return cost

    A3, Y_assess, parameters = compute_cost_with_regularization_test_case()
    print("cost = " + str(compute_cost_with_regularization(A3, Y_assess, parameters, lambd = 0.1)))


    # GRADED FUNCTION: backward_propagation_with_regularization
    def backward_propagation_with_regularization(X, Y, cache, lambd):
        """
        Implements the backward propagation of our baseline model to which we added an L2 regularization.

        Arguments:
        X -- input dataset, of shape (input size, number of examples)
        Y -- "true" labels vector, of shape (output size, number of examples)
        cache -- cache output from forward_propagation()
        lambd -- regularization hyperparameter, scalar

        Returns:
        gradients -- A dictionary with the gradients with respect to each parameter, activation and pre-activation variables
        """

        m = X.shape[1]
        (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) = cache

        dZ3 = A3 - Y

        ### START CODE HERE ### (approx. 1 line)
        dW3 = np.dot(dZ3, A2.T) / m + lambd * W3 / m
        ### END CODE HERE ###
        db3 = 1. / m * np.sum(dZ3, axis=1, keepdims=True)

        dA2 = np.dot(W3.T, dZ3)
        dZ2 = np.multiply(dA2, np.int64(A2 > 0))
        ### START CODE HERE ### (approx. 1 line)
        dW2 = np.dot(dZ2, A1.T) / m + lambd * W2 / m
        ### END CODE HERE ###
        db2 = 1. / m * np.sum(dZ2, axis=1, keepdims=True)

        dA1 = np.dot(W2.T, dZ2)
        dZ1 = np.multiply(dA1, np.int64(A1 > 0))
        ### START CODE HERE ### (approx. 1 line)
        dW1 = np.dot(dZ1, X.T) / m + lambd * W1 / m
        ### END CODE HERE ###
        db1 = 1. / m * np.sum(dZ1, axis=1, keepdims=True)

        gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3, "dA2": dA2,
                     "dZ2": dZ2, "dW2": dW2, "db2": db2, "dA1": dA1,
                     "dZ1": dZ1, "dW1": dW1, "db1": db1}
        return gradients


    X_assess, Y_assess, cache = backward_propagation_with_regularization_test_case()
    grads = backward_propagation_with_regularization(X_assess, Y_assess, cache, lambd=0.7)
    print("dW1 = " + str(grads["dW1"]))
    print("dW2 = " + str(grads["dW2"]))
    print("dW3 = " + str(grads["dW3"]))


    parameters = model(train_X, train_Y, lambd=0.7)
    print("On the train set:")
    predictions_train = predict(train_X, train_Y, parameters)
    print("On the test set:")
    predictions_test = predict(test_X, test_Y, parameters)


    plt.title("Model with L2-regularization")
    axes = plt.gca()
    axes.set_xlim([-0.75, 0.40])
    axes.set_ylim([-0.75, 0.65])
    plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)

    Prediction accuracy on the test set: 0.93.
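    The L2 term adds (λ/m)·W^[l] to each dW^[l] (as implemented in the backward pass above), so every gradient-descent step multiplies the weights by a factor slightly smaller than 1; this is why L2 regularization is also called weight decay:

    \[ W^{[l]} \leftarrow W^{[l]} - \alpha\left(dW^{[l]}_{ce} + \frac{\lambda}{m}W^{[l]}\right) = \left(1 - \frac{\alpha\lambda}{m}\right)W^{[l]} - \alpha\, dW^{[l]}_{ce} \]

    where dW^{[l]}_{ce} denotes the cross-entropy part of the gradient.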

    Dropout Regularization

    # GRADED FUNCTION: forward_propagation_with_dropout
    def forward_propagation_with_dropout(X, parameters, keep_prob=0.5):
        """
        Implements the forward propagation: LINEAR -> RELU + DROPOUT -> LINEAR -> RELU + DROPOUT -> LINEAR -> SIGMOID.

        Arguments:
        X -- input dataset, of shape (2, number of examples)
        parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3":
                        W1 -- weight matrix of shape (20, 2)
                        b1 -- bias vector of shape (20, 1)
                        W2 -- weight matrix of shape (3, 20)
                        b2 -- bias vector of shape (3, 1)
                        W3 -- weight matrix of shape (1, 3)
                        b3 -- bias vector of shape (1, 1)
        keep_prob -- probability of keeping a neuron active during drop-out, scalar

        Returns:
        A3 -- last activation value, output of the forward propagation, of shape (1,1)
        cache -- tuple, information stored for computing the backward propagation
        """
        np.random.seed(1)

        # retrieve parameters
        W1 = parameters["W1"]
        b1 = parameters["b1"]
        W2 = parameters["W2"]
        b2 = parameters["b2"]
        W3 = parameters["W3"]
        b3 = parameters["b3"]

        # LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID
        Z1 = np.dot(W1, X) + b1
        A1 = relu(Z1)
        ### START CODE HERE ### (approx. 4 lines)        # Steps 1-4 below correspond to the Steps 1-4 described above.
        D1 = np.random.rand(A1.shape[0], A1.shape[1])    # Step 1: initialize matrix D1 = np.random.rand(..., ...)
        D1 = D1 < keep_prob                              # Step 2: convert entries of D1 to 0 or 1 (using keep_prob as the threshold)
        A1 = np.multiply(A1, D1)                         # Step 3: shut down some neurons of A1
        A1 /= keep_prob                                  # Step 4: scale the value of neurons that haven't been shut down
        ### END CODE HERE ###

        Z2 = np.dot(W2, A1) + b2
        A2 = relu(Z2)
        ### START CODE HERE ### (approx. 4 lines)
        D2 = np.random.rand(A2.shape[0], A2.shape[1])    # Step 1: initialize matrix D2 = np.random.rand(..., ...)
        D2 = D2 < keep_prob                              # Step 2: convert entries of D2 to 0 or 1 (using keep_prob as the threshold)
        A2 = np.multiply(A2, D2)                         # Step 3: shut down some neurons of A2
        A2 /= keep_prob                                  # Step 4: scale the value of neurons that haven't been shut down
        ### END CODE HERE ###

        Z3 = np.dot(W3, A2) + b3
        A3 = sigmoid(Z3)

        cache = (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3)
        return A3, cache

    X_assess, parameters = forward_propagation_with_dropout_test_case()
    A3, cache = forward_propagation_with_dropout(X_assess, parameters, keep_prob=0.7)
    print("A3 = " + str(A3))


    # GRADED FUNCTION: backward_propagation_with_dropout
    def backward_propagation_with_dropout(X, Y, cache, keep_prob):
        """
        Implements the backward propagation of our baseline model to which we added dropout.

        Arguments:
        X -- input dataset, of shape (2, number of examples)
        Y -- "true" labels vector, of shape (output size, number of examples)
        cache -- cache output from forward_propagation_with_dropout()
        keep_prob -- probability of keeping a neuron active during drop-out, scalar

        Returns:
        gradients -- A dictionary with the gradients with respect to each parameter, activation and pre-activation variables
        """

        m = X.shape[1]
        (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3) = cache

        dZ3 = A3 - Y
        dW3 = 1. / m * np.dot(dZ3, A2.T)
        db3 = 1. / m * np.sum(dZ3, axis=1, keepdims=True)

        dA2 = np.dot(W3.T, dZ3)
        ### START CODE HERE ### (≈ 2 lines of code)
        dA2 = dA2 * D2             # Step 1: Apply mask D2 to shut down the same neurons as during the forward propagation
        dA2 = dA2 / keep_prob      # Step 2: Scale the value of neurons that haven't been shut down
        ### END CODE HERE ###

        dZ2 = np.multiply(dA2, np.int64(A2 > 0))
        dW2 = 1. / m * np.dot(dZ2, A1.T)
        db2 = 1. / m * np.sum(dZ2, axis=1, keepdims=True)

        dA1 = np.dot(W2.T, dZ2)
        ### START CODE HERE ### (≈ 2 lines of code)
        dA1 = dA1 * D1             # Step 1: Apply mask D1 to shut down the same neurons as during the forward propagation
        dA1 = dA1 / keep_prob      # Step 2: Scale the value of neurons that haven't been shut down
        ### END CODE HERE ###

        dZ1 = np.multiply(dA1, np.int64(A1 > 0))
        dW1 = 1. / m * np.dot(dZ1, X.T)
        db1 = 1. / m * np.sum(dZ1, axis=1, keepdims=True)

        gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3, "dA2": dA2,
                     "dZ2": dZ2, "dW2": dW2, "db2": db2, "dA1": dA1,
                     "dZ1": dZ1, "dW1": dW1, "db1": db1}

        return gradients


    X_assess, Y_assess, cache = backward_propagation_with_dropout_test_case()
    gradients = backward_propagation_with_dropout(X_assess, Y_assess, cache, keep_prob=0.8)
    print("dA1 = " + str(gradients["dA1"]))
    print("dA2 = " + str(gradients["dA2"]))


    parameters = model(train_X, train_Y, keep_prob=0.86, learning_rate=0.3)
    print("On the train set:")
    predictions_train = predict(train_X, train_Y, parameters)
    print("On the test set:")
    predictions_test = predict(test_X, test_Y, parameters)


    plt.title("Model with dropout")
    axes = plt.gca()
    axes.set_xlim([-0.75, 0.40])
    axes.set_ylim([-0.75, 0.65])
    plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)

    Prediction accuracy on the test set: 0.95.
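    A quick standalone check of the inverted-dropout scaling in Steps 1-4 above (not part of the assignment): masking activations and dividing by keep_prob leaves their expected value roughly unchanged, which is why predict() can use the plain forward_propagation with no extra scaling at test time.

    import numpy as np

    np.random.seed(0)
    keep_prob = 0.86
    A = np.random.rand(3, 100000)              # stand-in activations, mean ~0.5
    D = np.random.rand(*A.shape) < keep_prob   # dropout mask
    A_drop = (A * D) / keep_prob               # shut down some entries, rescale the rest

    print(A.mean())       # ~0.5
    print(A_drop.mean())  # also ~0.5, so the test-time forward pass needs no rescaling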

    Gradient Checking
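    The two formulas referenced as (1) and (2) in the code below are the centered-difference approximation of the derivative and the relative difference used to compare it against the gradient from backpropagation:

    \[ \frac{\partial J}{\partial \theta} \approx \frac{J(\theta + \varepsilon) - J(\theta - \varepsilon)}{2\varepsilon} \tag{1} \]

    \[ \text{difference} = \frac{\lVert grad - gradapprox \rVert_2}{\lVert grad \rVert_2 + \lVert gradapprox \rVert_2} \tag{2} \]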

    One-dimensional gradient check:

    from testCases_v3 import gradient_check_n_test_case
    from gc_utils import sigmoid, relu, dictionary_to_vector, vector_to_dictionary, gradients_to_vector

    # One-dimensional gradient check
    # GRADED FUNCTION: forward_propagation
    def forward_propagation(x, theta):
        """
        Implement the linear forward propagation (compute J) presented in Figure 1 (J(theta) = theta * x)

        Arguments:
        x -- a real-valued input
        theta -- our parameter, a real number as well

        Returns:
        J -- the value of function J, computed using the formula J(theta) = theta * x
        """

        ### START CODE HERE ### (approx. 1 line)
        J = np.dot(theta, x)
        ### END CODE HERE ###
        return J

    x, theta = 2, 4
    J = forward_propagation(x, theta)
    print("J = " + str(J))


    # GRADED FUNCTION: backward_propagation
    def backward_propagation(x, theta):
        """
        Computes the derivative of J with respect to theta (see Figure 1).

        Arguments:
        x -- a real-valued input
        theta -- our parameter, a real number as well

        Returns:
        dtheta -- the gradient of the cost with respect to theta
        """
        ### START CODE HERE ### (approx. 1 line)
        dtheta = x
        ### END CODE HERE ###
        return dtheta

    x, theta = 2, 4
    dtheta = backward_propagation(x, theta)
    print("dtheta = " + str(dtheta))


    # GRADED FUNCTION: gradient_check
    def gradient_check(x, theta, epsilon=1e-7):
        """
        Implements the gradient check presented in Figure 1.

        Arguments:
        x -- a real-valued input
        theta -- our parameter, a real number as well
        epsilon -- tiny shift to the input to compute approximated gradient with formula (1)

        Returns:
        difference -- difference (2) between the approximated gradient and the backward propagation gradient
        """

        # Compute gradapprox using the left side of formula (1). epsilon is small enough, you don't need to worry about the limit.
        ### START CODE HERE ### (approx. 5 lines)
        theta1 = theta + epsilon                          # Step 1
        theta2 = theta - epsilon                          # Step 2
        J1 = forward_propagation(x, theta1)               # Step 3
        J2 = forward_propagation(x, theta2)               # Step 4
        gradapprox = (J1 - J2) / (2 * epsilon)            # Step 5
        ### END CODE HERE ###

        # Check if gradapprox is close enough to the output of backward_propagation()
        ### START CODE HERE ### (approx. 1 line)
        grad = backward_propagation(x, theta)
        ### END CODE HERE ###

        ### START CODE HERE ### (approx. 1 line)
        numerator = np.linalg.norm(grad - gradapprox)                      # Step 1'
        denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)    # Step 2'
        difference = numerator / denominator                               # Step 3'
        ### END CODE HERE ###

        if difference < 1e-7:
            print("The gradient is correct!")
        else:
            print("The gradient is wrong!")

        return difference

    x, theta = 2, 4
    difference = gradient_check(x, theta)
    print("difference = " + str(difference))

    Output:

    The gradient is correct!
    difference = 2.919335883291695e-10

    N-dimensional gradient check:

    # N-dimensional gradient check
    def forward_propagation_n(X, Y, parameters):
        """
        Implements the forward propagation (and computes the cost) presented in Figure 3.

        Arguments:
        X -- training set for m examples
        Y -- labels for m examples
        parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3":
                        W1 -- weight matrix of shape (5, 4)
                        b1 -- bias vector of shape (5, 1)
                        W2 -- weight matrix of shape (3, 5)
                        b2 -- bias vector of shape (3, 1)
                        W3 -- weight matrix of shape (1, 3)
                        b3 -- bias vector of shape (1, 1)

        Returns:
        cost -- the cost function (logistic cost for one example)
        """

        # retrieve parameters
        m = X.shape[1]
        W1 = parameters["W1"]
        b1 = parameters["b1"]
        W2 = parameters["W2"]
        b2 = parameters["b2"]
        W3 = parameters["W3"]
        b3 = parameters["b3"]

        # LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID
        Z1 = np.dot(W1, X) + b1
        A1 = relu(Z1)
        Z2 = np.dot(W2, A1) + b2
        A2 = relu(Z2)
        Z3 = np.dot(W3, A2) + b3
        A3 = sigmoid(Z3)

        # Cost
        logprobs = np.multiply(-np.log(A3), Y) + np.multiply(-np.log(1 - A3), 1 - Y)
        cost = 1. / m * np.sum(logprobs)

        cache = (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3)

        return cost, cache


    def backward_propagation_n(X, Y, cache):
        """
        Implement the backward propagation presented in figure 2.

        Arguments:
        X -- input datapoint, of shape (input size, 1)
        Y -- true "label"
        cache -- cache output from forward_propagation_n()

        Returns:
        gradients -- A dictionary with the gradients of the cost with respect to each parameter, activation and pre-activation variables.
        """

        m = X.shape[1]
        (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) = cache

        dZ3 = A3 - Y
        dW3 = 1. / m * np.dot(dZ3, A2.T)
        db3 = 1. / m * np.sum(dZ3, axis=1, keepdims=True)

        dA2 = np.dot(W3.T, dZ3)
        dZ2 = np.multiply(dA2, np.int64(A2 > 0))
        dW2 = 1. / m * np.dot(dZ2, A1.T) * 2  # Should not multiply by 2 (intentional bug)
        db2 = 1. / m * np.sum(dZ2, axis=1, keepdims=True)

        dA1 = np.dot(W2.T, dZ2)
        dZ1 = np.multiply(dA1, np.int64(A1 > 0))
        dW1 = 1. / m * np.dot(dZ1, X.T)
        db1 = 4. / m * np.sum(dZ1, axis=1, keepdims=True) # Should not multiply by 4 (intentional bug)

        gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3,
                     "dA2": dA2, "dZ2": dZ2, "dW2": dW2, "db2": db2,
                     "dA1": dA1, "dZ1": dZ1, "dW1": dW1, "db1": db1}

        return gradients


    # GRADED FUNCTION: gradient_check_n
    def gradient_check_n(parameters, gradients, X, Y, epsilon=1e-7):
        """
        Checks if backward_propagation_n computes correctly the gradient of the cost output by forward_propagation_n

        Arguments:
        parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3":
        gradients -- output of backward_propagation_n, contains gradients of the cost with respect to the parameters.
        X -- input datapoint, of shape (input size, 1)
        Y -- true "label"
        epsilon -- tiny shift to the input to compute approximated gradient with formula (1)

        Returns:
        difference -- difference (2) between the approximated gradient and the backward propagation gradient
        """

        # Set-up variables
        parameters_values, _ = dictionary_to_vector(parameters)
        grad = gradients_to_vector(gradients)
        num_parameters = parameters_values.shape[0]
        J_plus = np.zeros((num_parameters, 1))
        J_minus = np.zeros((num_parameters, 1))
        gradapprox = np.zeros((num_parameters, 1))

        # Compute gradapprox
        for i in range(num_parameters):

            # Compute J_plus[i]. Inputs: "parameters_values, epsilon". Output = "J_plus[i]".
            # "_" is used because the function outputs two values but we only care about the first one
            ### START CODE HERE ### (approx. 3 lines)
            theta1 = np.copy(parameters_values)                                       # Step 1
            theta1[i][0] += epsilon                                                   # Step 2
            J_plus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(theta1))  # Step 3
            ### END CODE HERE ###

            # Compute J_minus[i]. Inputs: "parameters_values, epsilon". Output = "J_minus[i]".
            ### START CODE HERE ### (approx. 3 lines)
            theta2 = np.copy(parameters_values)                                       # Step 1
            theta2[i][0] -= epsilon                                                   # Step 2
            J_minus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(theta2)) # Step 3
            ### END CODE HERE ###

            # Compute gradapprox[i]
            ### START CODE HERE ### (approx. 1 line)
            gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)
            ### END CODE HERE ###

        # Compare gradapprox to backward propagation gradients by computing difference.
        ### START CODE HERE ### (approx. 1 line)
        numerator = np.linalg.norm(grad - gradapprox)                      # Step 1'
        denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)    # Step 2'
        difference = numerator / denominator                               # Step 3'
        ### END CODE HERE ###

        if difference > 1e-7:
            print("\033[93m" + "There is a mistake in the backward propagation! difference = " + str(difference) + "\033[0m")
        else:
            print("\033[92m" + "Your backward propagation works perfectly fine! difference = " + str(difference) + "\033[0m")

        return difference


    X, Y, parameters = gradient_check_n_test_case()

    cost, cache = forward_propagation_n(X, Y, parameters)
    gradients = backward_propagation_n(X, Y, cache)
    difference = gradient_check_n(parameters, gradients, X, Y)

    Output:

    There is a mistake in the backward propagation! difference = 0.2850931566540251
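    This failure is expected: the two lines flagged as intentional bugs in backward_propagation_n (the extra *2 on dW2 and the 4./m factor on db1) are exactly what the check detects. Restoring them to the standard form, as sketched below, should bring the reported difference down to the order of 1e-7:

    # Corrected lines in backward_propagation_n (sketch)
    dW2 = 1. / m * np.dot(dZ2, A1.T)                   # no extra factor of 2
    db1 = 1. / m * np.sum(dZ1, axis=1, keepdims=True)  # 1./m instead of 4./m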

  • Original post: https://www.cnblogs.com/cxq1126/p/13093231.html