Logistic Regression
Pay particular attention to how exp and log are used.
The sigmoid in its raw form is 1 / (1 + exp(-z)), but evaluating it directly at z = -710 overflows, because exp(710) exceeds the float64 range. For z < 0 we therefore use the equivalent form exp(z) / (1 + exp(z)); exp(-710) merely underflows to 0, which is harmless. This is exactly what the expit function in scipy does.
log_logistic = log(sigmoid). Like expit, it splits into the two cases z >= 0 and z < 0 so that neither branch can overflow.
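A minimal sketch of both tricks (the full logistic-regression code follows below); stable_sigmoid and stable_log_sigmoid are illustrative names rather than library functions, and the branching mirrors what expit and log_logistic do internally.

import numpy as np
from scipy.special import expit
from sklearn.utils.extmath import log_logistic


def stable_sigmoid(z):
    # For z >= 0, 1 / (1 + exp(-z)) cannot overflow; for z < 0 use exp(z) / (1 + exp(z)).
    out = np.empty_like(z, dtype=float)
    pos, neg = z >= 0, z < 0
    out[pos] = 1.0 / (1.0 + np.exp(-z[pos]))
    ez = np.exp(z[neg])
    out[neg] = ez / (1.0 + ez)
    return out


def stable_log_sigmoid(z):
    # log(sigmoid(z)) = -log(1 + exp(-z)) for z >= 0, and z - log(1 + exp(z)) for z < 0.
    out = np.empty_like(z, dtype=float)
    pos, neg = z >= 0, z < 0
    out[pos] = -np.log1p(np.exp(-z[pos]))
    out[neg] = z[neg] - np.log1p(np.exp(z[neg]))
    return out


z = np.array([-710.0, -1.0, 0.0, 1.0, 710.0])
print(np.allclose(stable_sigmoid(z), expit(z)))             # True, with no overflow
print(np.allclose(stable_log_sigmoid(z), log_logistic(z)))  # True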
import numpy as np
from scipy.special import expit
from sklearn.utils.extmath import log_logistic


def predict(theta, x):
    return expit(x.dot(theta))


def compute_loss(y, yz):
    # yz = y * z with labels y in {-1, +1}; loss = -sum(log(sigmoid(yz)))
    return -np.sum(log_logistic(yz))


def gradient_descent(x, y, theta, iterations=2000, lr=0.01):
    m = x.shape[0]
    loss_list = []
    for i in range(iterations):
        yhat = predict(theta, x)
        delta = x.T.dot(yhat - y) / m                        # gradient for labels y in {0, 1}
        loss = compute_loss(y, (2 * y - 1) * x.dot(theta))   # map {0, 1} labels to {-1, +1} for the loss
        loss_list.append(loss)
        theta = theta - lr * delta
    return theta, loss_list


# X: (m, n) design matrix, y: (m, 1) labels in {0, 1}
theta, loss_list = gradient_descent(X, y, np.zeros((n, 1)))
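As a usage sketch, the code above can be run end to end on made-up data, for example two Gaussian blobs with labels in {0, 1}:

import numpy as np

np.random.seed(0)
m, n = 200, 2
# made-up data: two Gaussian blobs, no intercept column
X = np.vstack([np.random.randn(m // 2, n) + 2, np.random.randn(m // 2, n) - 2])
y = np.vstack([np.ones((m // 2, 1)), np.zeros((m // 2, 1))])

theta, loss_list = gradient_descent(X, y, np.zeros((n, 1)))
print(loss_list[0], loss_list[-1])               # the loss should decrease
print(np.mean((predict(theta, X) > 0.5) == y))   # training accuracy, close to 1 here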
Kmeans
K-means is essentially the EM algorithm, except that the assignments are hard rather than soft. First initialize K centers; in the E step, assign each sample to its nearest center; in the M step, recompute each center so as to minimize the within-cluster distance.
import numpy as np


def calc_dist(x1, x2):
    # squared Euclidean distance
    return sum([(x1[i] - x2[i]) ** 2 for i in range(len(x1))])


# Assign samples to given centers
def E_step(X, cents):
    cent_dict = dict(zip(cents, [[] for _ in range(len(cents))]))
    for row in X:
        min_dist, best_cent = 1e10, None
        for cent in cent_dict:
            dist = calc_dist(row, cent)
            if dist < min_dist:
                min_dist = dist
                best_cent = cent
        cent_dict[best_cent] += [row.tolist()]
    return cent_dict


# Compute new centers
def M_step(cent_dict):
    new_cents = []
    for cent in cent_dict:
        new_cent = np.mean(np.array(cent_dict[cent]), axis=0)
        new_cents.append(tuple(new_cent))
    return new_cents


def Kmeans(X, K=3, max_iter=10):
    np.random.seed(1)
    # sample K distinct rows as the initial centers
    inds = np.random.choice(len(X), K, replace=False)
    init_cents = [tuple(X[i]) for i in inds]
    cents = init_cents
    for k in range(max_iter):
        cent_dict = E_step(X, cents)
        new_cents = M_step(cent_dict)
        move = sum([calc_dist(c1, c2) for c1, c2 in zip(cents, new_cents)])
        if move < 0.1:
            print('Converged in %s steps' % k)
            break
        cents = new_cents
    return cent_dict
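A quick sanity check of Kmeans on made-up data, e.g. three well-separated 2-D blobs, which the algorithm should ideally recover:

import numpy as np

np.random.seed(0)
# made-up data: three well-separated 2-D blobs
X = np.vstack([np.random.randn(50, 2) + c for c in ([0, 0], [5, 5], [0, 5])])

cent_dict = Kmeans(X, K=3, max_iter=10)
for cent, members in cent_dict.items():
    print(np.round(cent, 2), len(members))   # inspect the learned centers and cluster sizes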
Neural Network
When computing softmax, be careful about overflow in exp. The usual fix is to multiply both the numerator and the denominator of softmax by the same constant C with log(C) = -max(z); this is the shift_scores step below.
The logsumexp function from scipy is used here for the same reason as in LR: logsumexp(z) = log(sum(exp(z))), evaluated in a numerically stable way using the same max-shift trick.
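A minimal standalone sketch of the shift trick and of the logsumexp identity (the array below is made up; the class implementation follows):

import numpy as np
from scipy.special import logsumexp

z = np.array([[1000.0, 1001.0, 1002.0]])

# naive softmax overflows, since np.exp(1002) is inf:
# softmax_naive = np.exp(z) / np.sum(np.exp(z), axis=1, keepdims=True)   # -> nan

# shift by the row max: multiplying numerator and denominator by C = exp(-max(z)) leaves the ratio unchanged
shift = z - np.max(z, axis=1, keepdims=True)
softmax = np.exp(shift) / np.sum(np.exp(shift), axis=1, keepdims=True)
print(softmax)                                      # approx. [0.09, 0.24, 0.67]

# log-softmax via logsumexp, as used in the cross-entropy loss below
log_softmax = shift - logsumexp(shift, axis=1, keepdims=True)
print(np.allclose(np.exp(log_softmax), softmax))    # True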
from scipy.special import logsumexp
import numpy as np


class Neural_Network:

    def __init__(self, n, h, c, std=1e-4):
        # n: input dim, h: hidden dim, c: number of classes
        W1 = np.random.randn(n, h) * std
        b1 = np.zeros(h)
        W2 = np.random.randn(h, c) * std
        b2 = np.zeros(c)
        self.params = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}

    def forward_backward_prop(self, X, y):
        # X: (N, n) inputs, y: (N, c) one-hot labels
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']

        # forward prop
        hidden = X.dot(W1) + b1
        relu = np.maximum(0, hidden)
        scores = relu.dot(W2) + b2
        shift_scores = scores - np.max(scores, axis=1, keepdims=True)
        softmax = np.exp(shift_scores) / np.sum(np.exp(shift_scores), axis=1, keepdims=True)
        # cross-entropy loss with log-softmax = shift_scores - logsumexp(shift_scores)
        loss = -np.sum(y * (shift_scores - logsumexp(shift_scores, axis=1, keepdims=True))) / X.shape[0]

        # backward prop
        dscores = (softmax - y) / X.shape[0]
        drelu = dscores.dot(W2.T)
        dW2 = relu.T.dot(dscores)
        db2 = np.sum(dscores, axis=0)
        dhidden = (hidden > 0) * drelu
        dW1 = X.T.dot(dhidden)
        db1 = np.sum(dhidden, axis=0)

        grads = {'dW1': dW1, 'db1': db1, 'dW2': dW2, 'db2': db2}

        return loss, grads

    def train(self, X, y, lr=0.01, decay=0.95, iters=5000):
        loss_list, acc_list = [], []
        for it in range(iters):
            loss, grads = self.forward_backward_prop(X, y)
            loss_list.append(loss)
            self.params['W1'] -= lr * grads['dW1']
            self.params['b1'] -= lr * grads['db1']
            self.params['W2'] -= lr * grads['dW2']
            self.params['b2'] -= lr * grads['db2']

            # every 100 iterations, record training accuracy and decay the learning rate
            if it % 100 == 0:
                yhat = self.predict(X)
                acc = np.sum(np.argmax(y, axis=1) == yhat) / X.shape[0]
                acc_list.append(acc)
                lr *= decay

        return loss_list, acc_list

    def predict(self, X):
        hidden = X.dot(self.params['W1']) + self.params['b1']
        relu = np.maximum(0, hidden)
        scores = relu.dot(self.params['W2']) + self.params['b2']
        yhat = np.argmax(scores, axis=1)
        return yhat
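One way to sanity-check the backward pass is a centered finite-difference comparison on random data; the shapes below are arbitrary and chosen only for illustration.

import numpy as np

np.random.seed(0)
N, n, h, c = 5, 4, 10, 3
X = np.random.randn(N, n)
y = np.eye(c)[np.random.randint(0, c, size=N)]   # random one-hot labels

net = Neural_Network(n, h, c, std=0.1)
loss, grads = net.forward_backward_prop(X, y)

# centered finite-difference check on one entry of W2
eps = 1e-5
W2 = net.params['W2']                 # same array object used inside the class
i, j = 0, 0
W2[i, j] += eps
loss_plus, _ = net.forward_backward_prop(X, y)
W2[i, j] -= 2 * eps
loss_minus, _ = net.forward_backward_prop(X, y)
W2[i, j] += eps                       # restore the original value
numeric = (loss_plus - loss_minus) / (2 * eps)
print(numeric, grads['dW2'][i, j])    # the two values should agree closely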
Recurrent Neural Network
import numpy as np


def tanh(x):
    return (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))


def softmax(x):
    ex = np.exp(x - np.max(x, axis=0, keepdims=True))   # shift by the column max to avoid overflow
    return ex / ex.sum(axis=0)


class RNN:

    def __init__(self, na, nx, ny, m, seed=1):
        np.random.seed(seed)
        Waa = np.random.randn(na, na)
        Wax = np.random.randn(na, nx)
        Wya = np.random.randn(ny, na)
        ba = np.random.randn(na, 1)
        by = np.random.randn(ny, 1)
        self.a0 = np.random.randn(na, m)
        self.params = {'Waa': Waa, 'Wax': Wax, 'Wya': Wya, 'ba': ba, 'by': by}

    def RNN_cell_forward(self, xt, a_prev):
        """
        Inputs:
        xt -- Current input data, of shape (nx, m)
        a_prev -- Previous hidden state, of shape (na, m)

        Outputs:
        at -- Current hidden state, of shape (na, m)
        yt -- Current prediction, of shape (ny, m)
        """
        Waa, Wax, ba = self.params['Waa'], self.params['Wax'], self.params['ba']
        Wya, by = self.params['Wya'], self.params['by']

        at = tanh(Waa.dot(a_prev) + Wax.dot(xt) + ba)
        score = Wya.dot(at) + by
        yt = softmax(score)
        return at, yt

    def RNN_forward(self, X, y):
        """
        Inputs:
        X -- Input data for every time step, of shape (nx, m, Tx)
        y -- Target for every time step, of shape (ny, m, Tx)

        Outputs:
        loss -- Total cross-entropy loss over all time steps
        cache -- Tuple containing a (hidden states, shape (na, m, Tx)) and yhat (predictions, shape (ny, m, Tx))
        """
        a_prev = self.a0
        na, m = a_prev.shape
        ny = y.shape[0]
        Tx = X.shape[2]

        a = np.zeros((na, m, Tx))
        yhat = np.zeros((ny, m, Tx))
        loss = 0
        for t in range(Tx):
            a_next, yt = self.RNN_cell_forward(X[:, :, t], a_prev)
            yhat[:, :, t] = yt
            a[:, :, t] = a_next
            # cross-entropy with one-hot targets: only the predicted probability of the true class contributes
            loss -= np.sum(y[:, :, t] * np.log(yt))
            a_prev = a_next

        cache = (a, yhat)
        return loss, cache

    def RNN_cell_backward(self, dz, grads, cache):
        """
        Inputs:
        dz -- Gradient of the loss with respect to the score, of shape (ny, m)
        grads -- Dictionary containing all gradients
        cache -- Tuple containing xt, a_next, a_prev

        Outputs:
        grads -- Dictionary containing all gradients
        """
        xt, a_next, a_prev = cache
        Waa, Wax, ba = self.params['Waa'], self.params['Wax'], self.params['ba']
        Wya, by = self.params['Wya'], self.params['by']

        grads['dWya'] += dz.dot(a_next.T)
        grads['dby'] += np.sum(dz, axis=1, keepdims=True)
        da_y = Wya.T.dot(dz)
        da_a = grads['da_prev']
        da_next = da_y + da_a  # the gradient on a_next flows in from two paths: the output y_t and the hidden state at t+1
        dtanh = (1 - a_next**2) * da_next
        grads['dWaa'] += dtanh.dot(a_prev.T)
        grads['da_prev'] = Waa.T.dot(dtanh)
        grads['dWax'] += dtanh.dot(xt.T)
        grads['dba'] += np.sum(dtanh, axis=1, keepdims=True)

        return grads

    def RNN_backward(self, X, y, cache):
        """
        Inputs:
        X -- Input data for every time step, of shape (nx, m, Tx)
        y -- Target for every time step, of shape (ny, m, Tx)
        cache -- Tuple from RNN_forward, containing a and yhat

        Outputs:
        grads -- Dictionary containing all gradients
        a -- Hidden states for every time step, of shape (na, m, Tx)
        """
        a, yhat = cache
        Waa, Wax, ba = self.params['Waa'], self.params['Wax'], self.params['ba']
        Wya, by = self.params['Wya'], self.params['by']
        Tx = X.shape[2]

        grads = {}
        grads['dWya'], grads['dby'] = np.zeros_like(Wya), np.zeros_like(by)
        grads['dWaa'], grads['da_prev'] = np.zeros_like(Waa), np.zeros_like(self.a0)
        grads['dWax'], grads['dba'] = np.zeros_like(Wax), np.zeros_like(ba)

        for t in reversed(range(Tx)):
            # gradient of the cross-entropy loss with respect to the score (softmax with one-hot target)
            dz = yhat[:, :, t] - y[:, :, t]
            a_prev = a[:, :, t - 1] if t > 0 else self.a0
            cell_cache = X[:, :, t], a[:, :, t], a_prev
            grads = self.RNN_cell_backward(dz, grads, cell_cache)

        return grads, a

    def update_parameters(self, grads, lr):
        self.params['Wax'] -= lr * grads['dWax']
        self.params['Waa'] -= lr * grads['dWaa']
        self.params['Wya'] -= lr * grads['dWya']
        self.params['ba'] -= lr * grads['dba']
        self.params['by'] -= lr * grads['dby']

    def clip(self, grads, maxValue):
        # Clip gradients between -maxValue and maxValue, in place
        for key in ['dWax', 'dWaa', 'dWya', 'dba', 'dby']:
            gradient = grads[key]
            grads[key] = np.clip(gradient, -maxValue, maxValue, out=gradient)
        return grads

    def train(self, X, y, lr, iters=1):
        loss_list = []
        for it in range(iters):
            loss, cache = self.RNN_forward(X, y)
            grads, a = self.RNN_backward(X, y, cache)
            # Clip gradients between -5 (min) and 5 (max)
            grads = self.clip(grads, 5)
            self.update_parameters(grads, lr)
            loss_list.append(loss)
        return loss_list, grads, a
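A usage sketch on made-up data (all dimensions below are arbitrary); with random one-hot targets the model can only memorize, but the loss should still trend downward:

import numpy as np

np.random.seed(0)
na, nx, ny, m, Tx = 8, 5, 4, 2, 6      # illustrative dimensions
X = np.random.randn(nx, m, Tx)

# random one-hot targets of shape (ny, m, Tx)
y = np.zeros((ny, m, Tx))
labels = np.random.randint(0, ny, size=(m, Tx))
for i in range(m):
    for t in range(Tx):
        y[labels[i, t], i, t] = 1

rnn = RNN(na, nx, ny, m)
loss_list, grads, a = rnn.train(X, y, lr=0.01, iters=50)
print(loss_list[0], loss_list[-1])     # the total loss should trend downward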