• An RNN example: walking through forward propagation and backpropagation through time (BPTT)
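  For reference, these are the equations the code below implements (written out from the code itself; x_t is the one-hot input at step t, y_t the one-hot label, and \hat{y}_t the prediction). In LaTeX notation:

    % Forward pass: tanh recurrence, softmax output
    h_t = \tanh(U x_t + W h_{t-1}), \qquad \hat{y}_t = \mathrm{softmax}(V h_t)
    % Cross-entropy loss summed over all time steps
    L = -\sum_t y_t^\top \log \hat{y}_t
    % Backward pass (BPTT): output error, then the delta recursion through time
    \delta^y_t = \hat{y}_t - y_t
    \delta_t = \mathrm{diag}(1 - h_t^2)\,\big(V^\top \delta^y_t + W^\top \delta_{t+1}\big)
    % Gradient accumulation at each step
    \partial L/\partial V \mathrel{+}= \delta^y_t h_t^\top, \quad
    \partial L/\partial W \mathrel{+}= \delta_t h_{t-1}^\top, \quad
    \partial L/\partial U \mathrel{+}= \delta_t x_t^\top

  At the last time step the W^\top \delta_{t+1} term is absent, which is the `if t == T - 1` branch in bptt().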


    import numpy as np
    from datetime import datetime

    class RNN:
        def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
            self.word_dim = word_dim
            self.hidden_dim = hidden_dim
            self.bptt_truncate = bptt_truncate
            # Randomly initialize the network parameters.
            # np.random.uniform(low, high, size=(m, n)) -> m x n matrix
            self.U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
            self.V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
            self.W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))

        def softmax(self, x):
            # Shift by the max for numerical stability; the constant cancels in the ratio
            exp_x = np.exp(x - np.max(x))
            return exp_x / np.sum(exp_x)

        def forward_propagation(self, x):
            # hidden states are h, predictions are y_hat
            T = len(x)
            # h has one extra row: h[-1] is the initial hidden state for step 0
            h = np.zeros((T + 1, self.hidden_dim))
            h[-1] = np.zeros(self.hidden_dim)
            y_hat = np.zeros((T, self.word_dim))
            # For each time step...
            for t in np.arange(T):
                x_t = np.array(x[t]).reshape(-1, 1)
                # h_t = tanh(U x_t + W h_{t-1}); tanh is needed here so that the
                # (1 - h^2) factor in bptt() below is the correct derivative
                h[t] = np.tanh(self.U.dot(x_t) + self.W.dot(h[t-1].reshape(-1, 1))).reshape(-1)
                o_t = self.V.dot(h[t])
                y_hat[t] = self.softmax(o_t)
            return y_hat, h

        def predict(self, x):
            # Perform forward propagation and return the index of the highest score at each step
            y_hat, h = self.forward_propagation(x)
            return np.argmax(y_hat, axis=1)

        def calculate_total_loss(self, x, labels):
            total_L = 0
            # For each sentence...
            for i in np.arange(len(labels)):
                y_hat, h = self.forward_propagation(x[i])
                # Cross-entropy: with one-hot labels, y_pred.dot(y_true) picks out
                # the predicted probability of the correct word
                total_L += -1 * sum(np.log(y_pred.dot(y_true)) for y_pred, y_true in zip(y_hat, np.array(labels[i])))
            return total_L

        def calculate_loss(self, x, labels):
            # Divide the total loss by the number of training words
            N = np.sum([len(label_i) for label_i in labels])
            return self.calculate_total_loss(x, labels) / N

        def bptt(self, x, label):
            T = len(label)
            # Perform forward propagation
            y_hat, h = self.forward_propagation(x)
            # We accumulate the gradients in these variables
            dLdU = np.zeros(self.U.shape)
            dLdV = np.zeros(self.V.shape)
            dLdW = np.zeros(self.W.shape)
            # delta_y -> dL/do_t = y_hat_t - y_t (softmax combined with cross-entropy)
            delta_y = np.zeros(y_hat.shape)
            # For each output, moving backwards through time
            # (this simplified version backpropagates through all T steps;
            # bptt_truncate is not applied here)
            for t in np.arange(T - 1, -1, -1):
                delta_y[t] = y_hat[t] - np.array(label[t])
                dLdV += delta_y[t].reshape(-1, 1) @ h[t].reshape(1, -1)
                # diag(1 - h_t^2) is the Jacobian of tanh at step t
                if t == T - 1:
                    # At the last step no gradient flows back from step t + 1
                    delta_t = np.diag(1 - np.power(h[t], 2)) @ self.V.T @ delta_y[t].reshape(-1, 1)
                else:
                    delta_t = np.diag(1 - np.power(h[t], 2)) @ (self.V.T @ delta_y[t].reshape(-1, 1) + self.W.T @ delta_t.reshape(-1, 1))
                dLdW += delta_t @ h[t - 1].reshape(1, -1)
                dLdU += delta_t @ np.array(x[t]).reshape(1, -1)
            return dLdU, dLdV, dLdW

        # Performs one step of SGD.
        def numpy_sgd_step(self, x, label, learning_rate):
            # Calculate the gradients
            dLdU, dLdV, dLdW = self.bptt(x, label)
            # Change parameters according to gradients and learning rate
            self.U -= learning_rate * dLdU
            self.V -= learning_rate * dLdV
            self.W -= learning_rate * dLdW

    # - model: The RNN model instance
    # - X_train: The training data set
    # - y_train: The training data labels
    # - learning_rate: Initial learning rate for SGD
    # - nepoch: Number of times to iterate through the complete dataset
    # - evaluate_loss_after: Evaluate the loss after this many epochs
    def train_with_sgd(model, X_train, y_train, learning_rate=0.005, nepoch=100, evaluate_loss_after=5):
        # We keep track of the losses so we can plot them later
        losses = []
        num_examples_seen = 0
        for epoch in range(nepoch):
            # Optionally evaluate the loss
            if epoch % evaluate_loss_after == 0:
                loss = model.calculate_loss(X_train, y_train)
                losses.append((num_examples_seen, loss))
                time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                print(f'{time} Loss after num_examples_seen={num_examples_seen} epoch={epoch}: {loss}')
                # Halve the learning rate if the loss increases
                if len(losses) > 1 and losses[-1][1] > losses[-2][1]:
                    learning_rate = learning_rate * 0.5
                    print('Setting learning rate to %f' % learning_rate)

            # For each training example...
            for i in range(len(y_train)):
                # One SGD step
                model.numpy_sgd_step(X_train[i], y_train[i], learning_rate)
                num_examples_seen += 1

    if __name__ == '__main__':
        # Two five-word "sentences" (the film titles "你好李焕英" and "夏洛特烦恼");
        # each word is a one-hot vector over the 10-word vocabulary
        s1 = '你 好 李 焕 英'
        s2 = '夏 洛 特 烦 恼'
        vocab_size = len(s1.split(' ')) + len(s2.split(' '))
        vocab = [[0] * vocab_size for _ in range(vocab_size)]
        for i in range(vocab_size): vocab[i][i] = 1
        # Labels are the inputs shifted left by one word (next-word prediction)
        x_sample = [vocab[:5]] + [vocab[5:]]
        labels = [vocab[1:6]] + [vocab[6:] + [vocab[0]]]

        rnn = RNN(10)
        train_with_sgd(rnn, x_sample, labels)
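
  As a sanity check on the BPTT gradients, here is a minimal numerical gradient check (a sketch, not part of the original post; the function name gradient_check and the eps/tol values are illustrative). It compares the analytic gradients from bptt() with centered finite differences of calculate_total_loss():

    def gradient_check(model, x, label, eps=1e-4, tol=1e-3):
        # Analytic gradients from backpropagation through time
        analytic = dict(zip(['U', 'V', 'W'], model.bptt(x, label)))
        for name in ['U', 'V', 'W']:
            param = getattr(model, name)
            it = np.nditer(param, flags=['multi_index'])
            while not it.finished:
                ix = it.multi_index
                original = param[ix]
                # Centered difference: (L(theta + eps) - L(theta - eps)) / (2 * eps)
                param[ix] = original + eps
                loss_plus = model.calculate_total_loss([x], [label])
                param[ix] = original - eps
                loss_minus = model.calculate_total_loss([x], [label])
                param[ix] = original  # restore the parameter
                numeric = (loss_plus - loss_minus) / (2 * eps)
                if abs(numeric - analytic[name][ix]) > tol:
                    print(f'{name}[{ix}]: analytic={analytic[name][ix]}, numeric={numeric}')
                it.iternext()
        print('gradient check done')

  Running it on a small model keeps the many loss evaluations cheap, e.g. gradient_check(RNN(10, hidden_dim=5), x_sample[0], labels[0]); the analytic and numeric gradients should agree once forward_propagation applies tanh.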

    Reference blog: https://zhuanlan.zhihu.com/p/371849556

  • Original post: https://www.cnblogs.com/randy-lo/p/15268683.html