相关的公式
证明参考PPT: http://wenku.baidu.com/link?url=dBZZq7TYJOnIw2mwilKsJT_swT52I0OoikmvmgBaYE_NvP_KChFZ-HOURH5LMiLEuSVFcGmJ0bQfkG-ZYk-IRJf7D-w6P9PBec8EZ9IxgFS
Python实现代码参考(数据在同文件夹)
@author: Paul Rothnie
email : paul.rothnie@googlemail.com
https://github.com/siddharth950/Sparse-Autoencoder
# coding: utf8 # Refer to https://github.com/siddharth950/Sparse-Autoencoder import numpy as np import scipy.io import scipy.optimize import matplotlib.pyplot import struct import array class sparse_autoencoder(object): # 稀疏自编码类 def __init__(self, visible_size, hidden_size, lambda_, rho, beta): self.visible_size = visible_size self.hidden_size = hidden_size self.lambda_ = lambda_ self.rho = rho self.beta = beta w_max = np.sqrt(6.0 / (visible_size + hidden_size + 1.0)) w_min = -w_max W1 = (w_max - w_min) * np.random.random_sample(size=(hidden_size, visible_size)) + w_min W2 = (w_max - w_min) * np.random.random_sample(size=(visible_size, hidden_size)) + w_min b1 = np.zeros(hidden_size) b2 = np.zeros(visible_size) self.idx_0 = 0 self.idx_1 = hidden_size * visible_size # 64*25 self.idx_2 = self.idx_1 + hidden_size * visible_size # 25*64 self.idx_3 = self.idx_2 + hidden_size # 64 self.idx_4 = self.idx_3 + visible_size # 25 self.initial_theta = np.concatenate((W1.flatten(), W2.flatten(), b1.flatten(), b2.flatten())) def sigmoid(self, x): # sigmoid函数 return 1.0 / (1.0 + np.exp(-x)) def unpack_theta(self, theta): # 获取传递给scipy.optimize.minimize的theta W1 = theta[self.idx_0: self.idx_1] W1 = np.reshape(W1, (self.hidden_size, self.visible_size)) W2 = theta[self.idx_1: self.idx_2] W2 = np.reshape(W2, (self.visible_size, self.hidden_size)) b1 = theta[self.idx_2: self.idx_3] b1 = np.reshape(b1, (self.hidden_size, 1)) b2 = theta[self.idx_3: self.idx_4] b2 = np.reshape(b2, (self.visible_size, 1)) return W1, W2, b1, b2 def cost(self, theta, visible_input): # cost函数 W1, W2, b1, b2 = self.unpack_theta(theta) # layer=f(w*l+b) hidden_layer = self.sigmoid(np.dot(W1, visible_input) + b1) output_layer = self.sigmoid(np.dot(W2, hidden_layer) + b2) m = visible_input.shape[1] error = -(visible_input - output_layer) sum_sq_error = 0.5 * np.sum(error * error, axis=0) avg_sum_sq_error = np.mean(sum_sq_error) reg_cost = self.lambda_ * (np.sum(W1 * W1) + np.sum(W2 * W2)) / 2.0 # L2正则化 rho_bar = np.mean(hidden_layer, axis=1) # 平均激活程度 KL_div = np.sum(self.rho * np.log(self.rho / rho_bar) + (1 - self.rho) * np.log((1 - self.rho) / (1 - rho_bar))) # 相对熵 cost = avg_sum_sq_error + reg_cost + self.beta * KL_div # 损失函数 KL_div_grad = self.beta * (- self.rho / rho_bar + (1 - self.rho) / (1 - rho_bar)) del_3 = error * output_layer * (1.0 - output_layer) del_2 = np.transpose(W2).dot(del_3) + KL_div_grad[:, np.newaxis] del_2 *= hidden_layer * (1 - hidden_layer) # *=残差项 W1_grad = del_2.dot(visible_input.transpose()) / m # delt_w=del*(l.T) W2_grad = del_3.dot(hidden_layer.transpose()) / m b1_grad = del_2 # delt_b=del b2_grad = del_3 W1_grad += self.lambda_ * W1 W2_grad += self.lambda_ * W2 b1_grad = b1_grad.mean(axis=1) b2_grad = b2_grad.mean(axis=1) theta_grad = np.concatenate((W1_grad.flatten(), W2_grad.flatten(), b1_grad.flatten(), b2_grad.flatten())) return [cost, theta_grad] def train(self, data, max_iterations): # 训练令cost最小 opt_soln = scipy.optimize.minimize(self.cost, self.initial_theta, args=(data,), method='L-BFGS-B', jac=True, options= {'maxiter': max_iterations}) opt_theta = opt_soln.x return opt_theta def normalize_data(data): # 0.1<=data[i][j]<=0.9 data = data - np.mean(data) pstd = 3 * np.std(data) data = np.maximum(np.minimum(data, pstd), -pstd) / pstd data = (data + 1.0) * 0.4 + 0.1 return data def loadMNISTImages(file_name): # 获取mnist数据 image_file = open(file_name, 'rb') head1 = image_file.read(4) head2 = image_file.read(4) head3 = image_file.read(4) head4 = image_file.read(4) num_examples = struct.unpack('>I', head2)[0] num_rows = struct.unpack('>I', head3)[0] num_cols = struct.unpack('>I', head4)[0] dataset = np.zeros((num_rows * num_cols, num_examples)) images_raw = array.array('B', image_file.read()) image_file.close() for i in range(num_examples): limit1 = num_rows * num_cols * i limit2 = num_rows * num_cols * (i + 1) dataset[:, i] = images_raw[limit1: limit2] return dataset / 255 def load_data(num_patches, patch_side): # 随机选取num_patches个数据 images = scipy.io.loadmat('IMAGES.mat') # 515*512*10 images = images['IMAGES'] patches = np.zeros((patch_side * patch_side, num_patches)) seed = 1234 rand = np.random.RandomState(seed) image_index = rand.random_integers(0, 512 - patch_side, size= (num_patches, 2)) image_number = rand.random_integers(0, 10 - 1, size=num_patches) for i in xrange(num_patches): idx_1 = image_index[i, 0] idx_2 = image_index[i, 1] idx_3 = image_number[i] patch = images[idx_1:idx_1 + patch_side, idx_2:idx_2 + patch_side, idx_3] patch = patch.flatten() patches[:, i] = patch patches = normalize_data(patches) return patches def visualizeW1(opt_W1, vis_patch_side, hid_patch_side): # 可视化 figure, axes = matplotlib.pyplot.subplots(nrows=hid_patch_side, ncols=hid_patch_side) index = 0 for axis in axes.flat: axis.imshow(opt_W1[index, :].reshape(vis_patch_side, vis_patch_side), cmap=matplotlib.pyplot.cm.gray, interpolation='nearest') axis.set_frame_on(False) axis.set_axis_off() index += 1 matplotlib.pyplot.show() def run_sparse_ae(): # 稀疏自编码器 beta = 3.0 lamda = 0.0001 rho = 0.01 visible_side = 8 hidden_side = 5 visible_size = visible_side * visible_side hidden_size = hidden_side * hidden_side m = 10000 max_iterations = 400 training_data = load_data(num_patches=m, patch_side=visible_side) sae = sparse_autoencoder(visible_size, hidden_size, lamda, rho, beta) opt_theta = sae.train(training_data, max_iterations) opt_W1 = opt_theta[0: visible_size * hidden_size].reshape(hidden_size, visible_size) visualizeW1(opt_W1, visible_side, hidden_side) def run_sparse_ae_MNIST(): # 矢量化MNIST beta = 3.0 lamda = 3e-3 rho = 0.1 visible_side = 28 hidden_side = 14 visible_size = visible_side * visible_side hidden_size = hidden_side * hidden_side m = 10000 max_iterations = 400 training_data = loadMNISTImages('train-images.idx3-ubyte') training_data = training_data[:, 0:m] sae = sparse_autoencoder(visible_size, hidden_size, lamda, rho, beta) opt_theta = sae.train(training_data, max_iterations) opt_W1 = opt_theta[0: visible_size * hidden_size].reshape(hidden_size, visible_size) visualizeW1(opt_W1, visible_side, hidden_side) if __name__ == "__main__": run_sparse_ae() # run_sparse_ae_MNIST()