▶ Recurrent Neural Networks
● Code, based on [https://zybuluo.com/hanbingtao/note/581764]. This implements a single-layer LSTM network class LstmLayer with forward and backward computation.
■ The original code has several errors, including but not limited to: Line 106/110/130: the activator argument passed in is never used; Line 254: delta_f_list should be delta_i_list; Line 256: delta_f_list should be delta_o_list. These were reported to the author on GitHub; it is unclear whether they have been fixed.
■ The original code also contains many inefficient loops and function calls, so the data structures were reorganized and the computations vectorized here. For example: the constructor of LstmLayer takes the number of samples, so that memory for all state vectors and matrices is allocated once instead of appending a row to every state array for each new input; the storage of nearly all vectors was reworked and the dimensionality of the matrices reduced, which makes the code easier to understand and debug; the delta and gradient computations in the training part were optimized with vector outer products and dot products to cut down on for loops and explicit summation (see the sketch right below); and a print function was added to inspect all parameters and states of the constructed network.
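As an illustration of that vectorization, the gradient of a recurrent weight matrix is a sum over time of outer products between the per-step delta and the previous step's output; NumPy computes this without element-wise loops, and the same sum can even be written as a single matrix product. A minimal stand-alone sketch (the arrays delta and h below are placeholders, not the attributes of LstmLayer):

import numpy as np

T, dCol = 5, 3
delta = np.random.rand(T + 1, dCol)   # per-step error terms, row 0 unused
h     = np.random.rand(T + 1, dCol)   # per-step outputs, row 0 is the initial state

# naive version: explicit loops over time steps and matrix entries
grad_loop = np.zeros((dCol, dCol))
for t in range(T):
    for a in range(dCol):
        for b in range(dCol):
            grad_loop[a, b] += delta[t + 1, a] * h[t, b]

# vectorized version: sum of outer products, as used in LstmLayer.backward
grad_outer = np.sum(np.array([np.outer(delta[t + 1], h[t]) for t in range(T)]), 0)

# equivalent single matrix product over the whole sequence
grad_matmul = delta[1:].T @ h[:T]

assert np.allclose(grad_loop, grad_outer) and np.allclose(grad_loop, grad_matmul)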
import numpy as np

global_epsilon = 1e-3
global_ita = 0.2
#np.random.seed(107)

class SigmoidActivator(object):  # the two activation functions
    def forward(self, weighted_input):
        return 1.0 / (1.0 + np.exp(-weighted_input))

    def backward(self, output):
        return output * (1 - output)

class TanhActivator(object):
    def forward(self, weighted_input):
        return 2.0 / (1.0 + np.exp(-2 * weighted_input)) - 1.0

    def backward(self, output):
        return 1 - output * output

class LstmLayer(object):
    def __init__(self, sCol, dCol, nSample, ita = global_ita):  # the constructor takes the sample count so all memory is allocated up front
        self.sCol = sCol
        self.dCol = dCol
        self.ita = ita
        self.nSample = nSample
        self.time = 0
        self.gActivator = SigmoidActivator()  # gate activation
        self.dActivator = TanhActivator()     # cell/output activation

        self.f = np.zeros((self.nSample + 1, self.dCol))  # state vectors; row 0 stays all zeros to simplify the recurrence
        self.i = np.zeros((self.nSample + 1, self.dCol))
        self.ct = np.zeros((self.nSample + 1, self.dCol))
        self.c = np.zeros((self.nSample + 1, self.dCol))
        self.o = np.zeros((self.nSample + 1, self.dCol))
        self.h = np.zeros((self.nSample + 1, self.dCol))
        self.Wfh, self.Wfx, self.bf = self.weightMatrix()
        self.Wih, self.Wix, self.bi = self.weightMatrix()
        self.Wch, self.Wcx, self.bct = self.weightMatrix()
        self.Woh, self.Wox, self.bo = self.weightMatrix()

    def weightMatrix(self):  # initialize one set of weight matrices
        Wh = np.random.uniform(-1, 1, (self.dCol, self.dCol))
        Wx = np.random.uniform(-1, 1, (self.dCol, self.sCol))
        b = np.zeros(self.dCol)
        return Wh, Wx, b

    def forward(self, x):  # forward pass for one time step
        self.time += 1
        tt = self.time
        self.f[tt] = self.gActivator.forward(np.dot(self.Wfh, self.h[tt - 1]) + np.dot(self.Wfx, x) + self.bf)
        self.i[tt] = self.gActivator.forward(np.dot(self.Wih, self.h[tt - 1]) + np.dot(self.Wix, x) + self.bi)
        self.ct[tt] = self.dActivator.forward(np.dot(self.Wch, self.h[tt - 1]) + np.dot(self.Wcx, x) + self.bct)  # note: the candidate state uses tanh, unlike the gates
        self.c[tt] = self.f[tt] * self.c[tt - 1] + self.i[tt] * self.ct[tt]
        self.o[tt] = self.gActivator.forward(np.dot(self.Woh, self.h[tt - 1]) + np.dot(self.Wox, x) + self.bo)
        self.h[tt] = self.o[tt] * self.dActivator.forward(self.c[tt])

    def backward(self, x, deltaNextLayer):  # backward pass
        self.deltaF = np.zeros((self.time + 1, self.dCol))  # error terms: initialize the delta arrays
        self.deltaI = np.zeros((self.time + 1, self.dCol))
        self.deltaO = np.zeros((self.time + 1, self.dCol))
        self.deltaCt = np.zeros((self.time + 1, self.dCol))
        self.deltaH = np.zeros((self.time + 1, self.dCol))  # deltaH is the error term of the output h (to be split among the gates)
        self.deltaH[-1] = deltaNextLayer  # error term passed down from the layer above
        for tt in range(self.time, 0, -1):  # propagate backwards through time within this layer
            f = self.f[tt]
            i = self.i[tt]
            ct = self.ct[tt]
            o = self.o[tt]
            h = self.deltaH[tt]
            cPre = self.c[tt-1]
            tanhC = self.dActivator.forward(self.c[tt])
            inverseTanhC = self.dActivator.backward(tanhC)

            self.deltaF[tt] = h * o * inverseTanhC * cPre * self.gActivator.backward(f)  # split the total delta of this step into the per-gate deltas
            self.deltaI[tt] = h * o * inverseTanhC * ct * self.gActivator.backward(i)
            self.deltaCt[tt] = h * o * inverseTanhC * i * self.dActivator.backward(ct)
            self.deltaO[tt] = h * tanhC * self.gActivator.backward(o)
            self.deltaH[tt-1] = (np.dot(self.deltaO[tt], self.Woh) + np.dot(self.deltaI[tt], self.Wih) +
                                 np.dot(self.deltaF[tt], self.Wfh) + np.dot(self.deltaCt[tt], self.Wch))  # combine the per-gate deltas into the total delta of the previous step; assumes this layer's input is the previous layer's output, so no extra derivative factor appears

        self.WfhGrad = np.sum(np.array([ np.outer(self.deltaF[1+i], self.h[i]) for i in range(self.time) ]), 0)  # gradients of the recurrent weights, fully vectorized with outer products (similar to a tensor contraction)
        self.WihGrad = np.sum(np.array([ np.outer(self.deltaI[1+i], self.h[i]) for i in range(self.time) ]), 0)
        self.WohGrad = np.sum(np.array([ np.outer(self.deltaO[1+i], self.h[i]) for i in range(self.time) ]), 0)
        self.WchGrad = np.sum(np.array([ np.outer(self.deltaCt[1+i], self.h[i]) for i in range(self.time) ]), 0)

        self.bfGrad = np.sum(self.deltaF[1:1+self.time], 0)  # bias gradients: the same sums without the outer product
        self.biGrad = np.sum(self.deltaI[1:1+self.time], 0)
        self.boGrad = np.sum(self.deltaO[1:1+self.time], 0)
        self.bcGrad = np.sum(self.deltaCt[1:1+self.time], 0)

        self.WfxGrad = np.outer(self.deltaF[-1], x)  # gradients of the input weights Wx (for the last time step's input x)
        self.WixGrad = np.outer(self.deltaI[-1], x)
        self.WoxGrad = np.outer(self.deltaO[-1], x)
        self.WcxGrad = np.outer(self.deltaCt[-1], x)

    def update(self):  # gradient-descent update of the weights
        self.Wfh -= self.ita * self.WfhGrad
        self.Wfx -= self.ita * self.WfxGrad
        self.bf -= self.ita * self.bfGrad
        self.Wih -= self.ita * self.WihGrad
        self.Wix -= self.ita * self.WixGrad
        self.bi -= self.ita * self.biGrad
        self.Woh -= self.ita * self.WohGrad
        self.Wox -= self.ita * self.WoxGrad
        self.bo -= self.ita * self.boGrad
        self.Wch -= self.ita * self.WchGrad
        self.Wcx -= self.ita * self.WcxGrad
        self.bct -= self.ita * self.bcGrad

    def reset(self):  # reset the time counter and all state vectors
        self.time = 0
        self.f = np.zeros((self.nSample + 1, self.dCol))
        self.i = np.zeros((self.nSample + 1, self.dCol))
        self.ct = np.zeros((self.nSample + 1, self.dCol))
        self.c = np.zeros((self.nSample + 1, self.dCol))
        self.o = np.zeros((self.nSample + 1, self.dCol))
        self.h = np.zeros((self.nSample + 1, self.dCol))

    def printLstmLayer(self):  # print all parameters and states of this layer
        print("sCol = %d, dCol = %d, ita = %d, nSample = %d, time = %d" % (self.sCol, self.dCol, self.ita, self.nSample, self.time))
        print("f= ", self.f, " i= ", self.i, " ct= ", self.ct, " c= ", self.c, " o= ", self.o, " h= ", self.h)
        print("Wfh= ", self.Wfh, " Wfx= ", self.Wfx, " bf= ", self.bf)
        print("Wih= ", self.Wih, " Wix= ", self.Wix, " bi= ", self.bi)
        print("Wch= ", self.Wch, " Wcx= ", self.Wcx, " bc= ", self.bct)
        print("Woh= ", self.Woh, " Wox= ", self.Wox, " bo= ", self.bo)

        print("deltaF= ", self.deltaF, " deltaI= ", self.deltaI, " deltaO= ", self.deltaO, " deltaCt= ", self.deltaCt, " deltaH= ", self.deltaH)
        print("WfhGrad= ", self.WfhGrad, " WfxGrad= ", self.WfxGrad, " bfGrad= ", self.bfGrad)
        print("WihGrad= ", self.WihGrad, " WixGrad= ", self.WixGrad, " biGrad= ", self.biGrad)
        print("WohGrad= ", self.WohGrad, " WoxGrad= ", self.WoxGrad, " boGrad= ", self.boGrad)
        print("WchGrad= ", self.WchGrad, " WcxGrad= ", self.WcxGrad, " bcGrad= ", self.bcGrad)

def createTestData():  # create test data
    s = [ np.array([1, 2, 3]), np.array([2, 3, 4]) ]
    d = np.array([1, 2])
    return s, d

def test():
    lstmLayer = LstmLayer(3, 2, 2)  # input dimension, output dimension, sample count; optional learning rate
    x, d = createTestData()
    lstmLayer.forward(x[0]), lstmLayer.forward(x[1])
    lstmLayer.backward(x[1], d)
    lstmLayer.printLstmLayer()

def gradCheck(epsilon = global_epsilon):
    lstm = LstmLayer(3, 2, 2, epsilon)
    s, d = createTestData()
    lstm.forward(s[0]), lstm.forward(s[1])
    lstm.backward(s[1], np.ones(lstm.h[-1].shape, dtype=np.float64))  # analytical gradients, taking the final error term to be an all-ones vector
    for i in range(lstm.Wfh.shape[0]):
        for j in range(lstm.Wfh.shape[1]):
            lstm.Wfh[i,j] += epsilon
            lstm.reset()
            lstm.forward(s[0]), lstm.forward(s[1])
            err1 = np.sum(lstm.h[-1])
            lstm.Wfh[i,j] -= 2*epsilon
            lstm.reset()
            lstm.forward(s[0]), lstm.forward(s[1])
            err2 = np.sum(lstm.h[-1])
            lstm.Wfh[i,j] += epsilon
            print('weights(%d,%d): expected <-> actual %.4e <-> %.4e' % (i, j, (err1 - err2) / (2 * epsilon), lstm.WfhGrad[i,j]))

if __name__ == "__main__":
    test()
    gradCheck()
● Output
sCol = 3, dCol = 2, ita = 0, nSample = 2, time = 2
f= [[0. 0.] [0.08364582 0.64637889] [0.04687823 0.75054254]]
i= [[0. 0.] [0.64994358 0.11231909] [0.71495984 0.07056953]]
ct= [[0. 0.] [-0.77851627 -0.99959223] [-0.75373868 -0.99999457]]
c= [[0. 0.] [-0.50599165 -0.11227329] [-0.56261288 -0.15483503]]
o= [[0. 0.] [0.05723445 0.06130245] [0.0173681 0.01644683]]
h= [[0. 0.] [-0.02671797 -0.00685385] [-0.00885623 -0.00252639]]
Wfh= [[-0.44650757 -0.34150997] [0.1461234 0.7320657]]
Wfx= [[0.61845573 -0.74104458 -0.51005937] [0.50410244 -0.08955573 0.09272295]]
bf= [0. 0.]
Wih= [[0.05587383 -0.25802153] [-0.73662134 -0.25832213]]
Wix= [[0.33294318 -0.38308928 0.35067554] [0.03109526 0.40860802 -0.97185992]]
bi= [0. 0.]
Wch= [[-0.16803787 -0.149016] [-0.68550217 0.24428858]]
Wcx= [[0.29142476 0.62232088 -0.85921977] [-0.81363189 -0.65205061 -0.71037887]]
bc= [0. 0.]
Woh= [[-0.09910883 -0.49439315] [-0.90781981 0.44788208]]
Wox= [[-0.23362093 -0.45101893 -0.55533428] [-0.88301662 0.34405375 -0.84458816]]
bo= [0. 0.]
deltaF= [[0. 0.] [0. 0.] [-0.00029056 -0.00067513]]
deltaI= [[0.00000000e+00 0.00000000e+00] [-4.89958523e-05 -1.29338011e-05] [-1.97417511e-03 -2.10655809e-03]]
deltaO= [[0.00000000e+00 0.00000000e+00] [-1.55659244e-04 -1.37923566e-05] [-8.70241385e-03 -4.96967325e-03]]
deltaCt= [[0.00000000e+00 0.00000000e+00] [7.08195651e-05 1.18852402e-08] [3.96844066e-03 2.46176389e-08]]
deltaH= [[2.28293924e-05 7.62122414e-05] [6.17970530e-03 2.14376899e-03] [1.00000000e+00 2.00000000e+00]]
WfhGrad= [[7.76324944e-06 1.99147518e-06] [1.80382086e-05 4.62726915e-06]]
WfxGrad= [[-0.00058113 -0.00087169 -0.00116225] [-0.00135027 -0.0020254 -0.00270054]]
bfGrad= [-0.00029056 -0.00067513]
WihGrad= [[5.27459502e-05 1.35307066e-05] [5.62829545e-05 1.44380401e-05]]
WixGrad= [[-0.00394835 -0.00592253 -0.0078967] [-0.00421312 -0.00631967 -0.00842623]]
biGrad= [-0.00202317 -0.00211949]
WohGrad= [[2.32510827e-04 5.96450681e-05] [1.32779578e-04 3.40614115e-05]]
WoxGrad= [[-0.01740483 -0.02610724 -0.03480966] [-0.00993935 -0.01490902 -0.01987869]]
boGrad= [-0.00885807 -0.00498347]
WchGrad= [[-1.06028676e-04 -2.71991102e-05] [-6.57733325e-10 -1.68725686e-10]]
WcxGrad= [[7.93688131e-03 1.19053220e-02 1.58737626e-02] [4.92352779e-08 7.38529168e-08 9.84705558e-08]]
bcGrad= [4.03926022e-03 3.65028791e-08]
weights(0,0): expected <-> actual 1.4570e-02 <-> 1.4570e-02
weights(0,1): expected <-> actual -2.4253e-02 <-> -2.4253e-02
weights(1,0): expected <-> actual -5.2460e-03 <-> -5.2460e-03
weights(1,1): expected <-> actual 8.7327e-03 <-> 8.7327e-03
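Note that test() and gradCheck() exercise only forward() and backward(); update() is never called. Actually training with this class would presumably look something like the sketch below (appended after the code above); the squared-error loss, its output delta, and the 100-epoch loop are illustrative assumptions, not part of the original code.

# Hypothetical training loop for LstmLayer (a sketch under an assumed squared-error loss; not from the original code).
def trainSketch(nEpoch = 100):
    xs, d = createTestData()          # reuse the two-sample test sequence and its target
    lstm = LstmLayer(3, 2, len(xs))
    for epoch in range(nEpoch):
        lstm.reset()                  # clear the time counter and state vectors before each pass
        for x in xs:
            lstm.forward(x)           # run the whole sequence forward
        delta = lstm.h[-1] - d        # output error term for E = 0.5 * ||h - d||^2
        lstm.backward(xs[-1], delta)  # back-propagate through time and accumulate gradients
        lstm.update()                 # gradient-descent step with learning rate ita
    return lstm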