BP算法在MNIST数据集上的简单实现

BP算法在MNIST数据集上的简单实现

数据：http://yann.lecun.com/exdb/mnist/

参考：blog,blog2,blog3,tensorflow

推导：http://www.cnblogs.com/yueshangzuo/p/8025157.html

基本实现

import struct
import random
import numpy as np
from math import sqrt

class Data:
    """Single-hidden-layer sigmoid network trained on MNIST by backpropagation.

    Typical use is ``Data()`` -> ``init_network()`` -> ``training()`` ->
    ``testing()``.  The hyper-parameters (``epoch``, ``node_hide``,
    ``study_rate``) may be overridden on the instance before
    ``init_network()`` is called.

    Changes relative to the first version of this class:
      * train, validation and test pixels are all scaled by 255.0
        (test pixels used to be divided by 256.0);
      * the bias vectors ``wj0`` and ``wi0`` are now trained, and the hidden
        layer error is propagated through the output weights as they were
        during the forward pass, so accuracies recorded with the earlier
        version will not reproduce exactly;
      * data files are closed after reading;
      * ``print(...)`` with a single argument is used so the class runs under
        both Python 2 and Python 3.
    """

    def __init__(self):
        """Set the data-set split sizes, layer sizes and hyper-parameters."""
        print('parameter initializing...')
        self.num_train = 50000    # leading samples of the training file used for training
        self.num_confirm = 10000  # samples that follow them, kept as a validation set
        self.num_test = 10000
        self.node_in = 28 * 28    # one input node per pixel
        self.node_out = 10        # one output node per digit
        # need to adjust
        # epoch:8 hide_node:39 accuracy:0.9613
        # epoch:8 hide_node:44 accuracy:0.9612
        # epoch:8 hide_node:48 accuracy:0.9624
        # epoch:9 hide_node:48 accuracy:0.9648
        # epoch:10 hide_node:200 accuracy:0.9724
        self.epoch = 15
        self.node_hide = 30
        self.study_rate = 0.05
        self.error_limit = 1e-2   # not read by any method of this class

    def _load_images(self, filename, count):
        """Return the first ``count`` images of an IDX image file.

        The result is a float array of shape (count, node_in) with pixel
        values scaled from 0..255 to 0..1.  ``np.frombuffer`` raises
        ``ValueError`` when the file holds fewer than ``count`` images.
        """
        # Header: magic number, image count, rows, columns (big-endian uint32).
        header_size = struct.calcsize('>IIII')
        with open(filename, 'rb') as binfile:
            buffer = binfile.read()
        pixels = np.frombuffer(buffer, dtype=np.uint8,
                               count=count * self.node_in, offset=header_size)
        return pixels.reshape(count, self.node_in) / 255.0

    def _load_labels(self, filename, count):
        """Return the first ``count`` labels of an IDX label file.

        The result is a float array of shape (count, 1).  ``np.frombuffer``
        raises ``ValueError`` when the file holds fewer than ``count`` labels.
        """
        # Header: magic number, label count (big-endian uint32).
        header_size = struct.calcsize('>II')
        with open(filename, 'rb') as binfile:
            buffer = binfile.read()
        labels = np.frombuffer(buffer, dtype=np.uint8,
                               count=count, offset=header_size)
        return labels.astype(np.float64).reshape(count, 1)

    def _one_hot(self, label):
        """Return a length-``node_out`` vector with a 1 at index ``label``."""
        target = np.zeros(self.node_out)
        target[int(label)] = 1
        return target

    def read_train_image(self, filename):
        """Load the training images and split them into train / validation."""
        print('reading train-image data...')
        images = self._load_images(filename, self.num_train + self.num_confirm)
        self.train_imag_list = images[:self.num_train]
        self.confirm_imag_list = images[self.num_train:]

    def read_train_label(self, filename):
        """Load the training labels and split them into train / validation."""
        print('reading train-label data...')
        labels = self._load_labels(filename, self.num_train + self.num_confirm)
        self.train_label_list = labels[:self.num_train]
        self.confirm_label_list = labels[self.num_train:]

    def read_test_image(self, filename):
        """Load ``num_test`` test images, scaled like the training images."""
        print('reading test-image data...')
        self.test_imag_list = self._load_images(filename, self.num_test)

    def read_test_label(self, filename):
        """Load ``num_test`` test labels."""
        print('reading test-label data...')
        self.test_label_list = self._load_labels(filename, self.num_test)

    def init_network(self):
        """Read the four MNIST files from the working directory and
        draw random initial weights and biases."""
        print('network initializing...')
        self.read_train_image('train-images.idx3-ubyte')
        self.read_train_label('train-labels.idx1-ubyte')
        self.read_test_image('t10k-images.idx3-ubyte')
        self.read_test_label('t10k-labels.idx1-ubyte')

        # Uniform values in [-1, 1) divided by sqrt(fan-in) of the layer.
        # The draw order (wjk, wj0, wij, wi0) is kept so that a seeded
        # np.random produces the same initial weights as before.
        self.wjk = (np.random.rand(self.node_hide, self.node_out) - 0.5) * 2 / sqrt(self.node_hide)
        self.wj0 = (np.random.rand(self.node_out) - 0.5) * 2 / sqrt(self.node_hide)
        self.wij = (np.random.rand(self.node_in, self.node_hide) - 0.5) * 2 / sqrt(self.node_in)
        self.wi0 = (np.random.rand(self.node_hide) - 0.5) * 2 / sqrt(self.node_in)

    def sigmode(self, x):
        """Logistic sigmoid, applied element-wise."""
        return 1.0 / (1.0 + np.exp(-x))

    def calc_yjzk(self, sample_i, imag_list):
        """Forward pass for row ``sample_i`` of ``imag_list``.

        Stores the hidden pre-activation/activation in ``netj``/``yj`` and
        the output pre-activation/activation in ``netk``/``zk``.
        """
        self.netj = np.dot(imag_list[sample_i], self.wij) + self.wi0
        self.yj = self.sigmode(self.netj)

        self.netk = np.dot(self.yj, self.wjk) + self.wj0
        self.zk = self.sigmode(self.netk)

    def calc_error(self):
        """Return the summed half squared error over the validation set."""
        total = 0.0
        for sample_i in range(self.num_confirm):
            self.calc_yjzk(sample_i, self.confirm_imag_list)
            target = self._one_hot(self.confirm_label_list[sample_i, 0])
            total = total + np.sum(np.square(target - self.zk)) / 2.0
        return total

    def training(self):
        """Run ``epoch`` passes of per-sample gradient descent.

        Each pass performs ``num_train`` updates on samples drawn uniformly
        at random (with replacement) from the training set.
        """
        print('training model...')
        for _ in range(self.epoch):
            for _ in range(self.num_train):
                sample_i = np.random.randint(0, self.num_train)
                self.calc_yjzk(sample_i, self.train_imag_list)
                image = self.train_imag_list[sample_i]
                target = self._one_hot(self.train_label_list[sample_i, 0])

                # Output-layer error: dE/dnet_k for squared error + sigmoid.
                delta_k = (self.zk - target) * self.zk * (1 - self.zk)
                # Hidden-layer error.  It must be computed before wjk is
                # changed so that it uses the weights of the forward pass.
                delta_j = np.dot(self.wjk, delta_k) * self.yj * (1 - self.yj)

                # update weight hide->out (broadcast product == outer product)
                self.wjk = self.wjk - self.study_rate * (self.yj[:, np.newaxis] * delta_k[np.newaxis, :])
                self.wj0 = self.wj0 - self.study_rate * delta_k
                # update weight in->hide
                self.wij = self.wij - self.study_rate * (image[:, np.newaxis] * delta_j[np.newaxis, :])
                self.wi0 = self.wi0 - self.study_rate * delta_j

    def testing(self):
        """Classify the test set, store the hit ratio in ``accuracy`` and print it."""
        print('testing...')
        num_right = 0.0
        for sample_i in range(self.num_test):
            self.calc_yjzk(sample_i, self.test_imag_list)
            if self.zk.argmax() == int(self.test_label_list[sample_i, 0]):
                num_right = num_right + 1
        self.accuracy = num_right / self.num_test
        print('accuracy: %.4f' % (self.accuracy * 100) + '%')
def main():
    """Build the network, train it and evaluate it on the test set."""
    net = Data()
    net.init_network()
    net.training()
    net.testing()

# Run the full pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()

注意

注意数据的编码格式（数据来源网站的最底部有说明）；该网站还列出了一些经典机器学习模型在MNIST数据集上的错误率，可供参考
权值合理的初始化，及迭代次数，学习速率，隐层节点数的设置可参考经验值
数据的归一化(防止sigmoid函数溢出)
矩阵乘法时注意行列条件的满足
合理的epoch(即迭代次数，学习速率小的时候可以大一点的迭代次数，学习速率大的时候迭代次数取较小值)
确认合适的迭代次数后可去掉确认集，用全部的样本数据训练模型
隐层节点在一定范围内越多越好，但并非无限制：从下面的试验结果看，200个节点的准确率最高，增加到1000个节点时准确率反而下降

调参脚本

import ann

# Grid search over the epoch count and the hidden-layer size.  One result line
# per configuration is appended to the file 'best_parameter'.
with open('best_parameter', 'a+') as f:
    for e in range(10, 40):
        for node in range(10, 50):
            data = ann.Data()
            data.node_hide = node
            data.epoch = e
            data.init_network()
            data.training()
            data.testing()
            ans = 'circling to get best parameter----->epoch:%d hide_node:%d accuracy:%.4f\n' % (e, node, data.accuracy)
            print(ans)
            f.write(ans)
            # Push each result to disk right away so an interrupted search
            # keeps the configurations that already finished.
            f.flush()

可以用上面的脚本遍历迭代次数和隐层节点数，观察它们对准确率的影响。大致规律是：学习速率为0.05时，迭代次数以10-15为宜，隐层节点数取30以上

一些试验的结果如下：

circling to get best parameter----->epoch:14 hide_node:43 accuracy:0.9656
circling to get best parameter----->epoch:14 hide_node:44 accuracy:0.9651
circling to get best parameter----->epoch:14 hide_node:45 accuracy:0.9638
circling to get best parameter----->epoch:14 hide_node:46 accuracy:0.9641
circling to get best parameter----->epoch:14 hide_node:47 accuracy:0.9649
circling to get best parameter----->epoch:14 hide_node:48 accuracy:0.9651
circling to get best parameter----->epoch:14 hide_node:49 accuracy:0.9671
circling to get best parameter----->epoch:15 hide_node:46 accuracy:0.9661
circling to get best parameter----->epoch:15 hide_node:47 accuracy:0.9660
circling to get best parameter----->epoch:15 hide_node:48 accuracy:0.9650
circling to get best parameter----->epoch:15 hide_node:49 accuracy:0.9655
circling to get best parameter----->epoch:10 hide_node:100 accuracy:0.9685
circling to get best parameter----->epoch:10 hide_node:200 accuracy:0.9724
circling to get best parameter----->epoch:10 hide_node:300 accuracy:0.9718
circling to get best parameter----->epoch:10 hide_node:1000 accuracy:0.9568

Tensorflow实现

import argparse

# Import data
from tensorflow.examples.tutorials.mnist import input_data

import tensorflow as tf

# Parsed command-line arguments; assigned in the __main__ block below and
# read by main() (FLAGS.data_dir).
FLAGS = None

def weight_variable(shape):
    """Return a tf.Variable of the given shape, initialised from a
    truncated normal distribution with standard deviation 0.1."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

def bias_variable(shape):
    """Return a tf.Variable of the given shape with every entry set to 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))

def conv2d(x, W):
    """Convolve x with filter W using stride 1 in every dimension and
    'SAME' padding."""
    unit_strides = [1, 1, 1, 1]
    return tf.nn.conv2d(x, W, strides=unit_strides, padding='SAME')

def max_pool_2x2(x):
    """Max-pool x with a 2x2 window and a stride of 2, 'SAME' padding."""
    window = [1, 2, 2, 1]
    step = [1, 2, 2, 1]
    return tf.nn.max_pool(x, ksize=window, strides=step, padding='SAME')


def add_layer(inputs, in_size, out_size, activation_function=None):
    """Add a fully connected layer and return its output tensor.

    The layer computes ``inputs x Weights + biases`` with Weights of shape
    [in_size, out_size].  When ``activation_function`` is given it is applied
    to that result; otherwise the linear result is returned unchanged.
    """
    # The weights are created before the biases, as in the original layout.
    layer_weights = weight_variable([in_size, out_size])
    layer_biases = bias_variable([out_size])
    pre_activation = tf.matmul(inputs, layer_weights) + layer_biases
    if activation_function is not None:
        return activation_function(pre_activation)
    return pre_activation


def main(_):
    """Build a two-convolution-layer CNN, train it on MNIST and report
    accuracy every 100 steps.

    The data set is read from ``FLAGS.data_dir``.  The single positional
    argument is ignored.
    """
    mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)

    # reshape the input to have batch size, width, height, channel size
    x = tf.placeholder(tf.float32, [None, 784])
    x_image = tf.reshape(x, [-1, 28, 28, 1])

    # 5*5 patch size, input channel is 1, output channel is 32
    W_conv1 = weight_variable([5, 5, 1, 32])

    # bias, same size with the output channel
    b_conv1 = bias_variable([32])

    # the first convolutional layer with a max pooling layer
    h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
    h_pool1 = max_pool_2x2(h_conv1)

    # after pooling, we have a tensor with shape[-1, 14, 14, 32]

    # the weights and bias for the second layer, we will get 64 channels
    W_conv2 = weight_variable([5, 5, 32, 64])
    b_conv2 = bias_variable([64])

    # the second convolutional layer with a max pooling layer
    h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
    h_pool2 = max_pool_2x2(h_conv2)

    # after pooling, we have a tensor with shape[-1, 7, 7, 64]

    # add a fully connected layer with 1024 neurons and use relu as the activation function
    h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
    h_fc1 = add_layer(h_pool2_flat, 7 * 7 * 64, 1024, tf.nn.relu)

    # we add dropout for the fully connected layer to avoid overfitting
    keep_prob = tf.placeholder(tf.float32)
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

    # finally, the output layer (logits, no activation)
    y_conv = add_layer(h_fc1_drop, 1024, 10, None)

    # loss function and so on
    y_ = tf.placeholder(tf.float32, [None, 10])
    cross_entropy = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits=y_conv, labels=y_))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
    correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # start training, and we test our model every 100 steps
    sess = tf.InteractiveSession()
    try:
        # tf.initialize_all_variables() is deprecated; this is its replacement.
        sess.run(tf.global_variables_initializer())
        for i in range(10000):
            batch = mnist.train.next_batch(100)
            if i % 100 == 0:
                # Dropout is disabled (keep_prob 1.0) while measuring accuracy.
                train_accuracy = accuracy.eval(feed_dict={x: batch[0], y_: batch[1], keep_prob: 1.0})
                test_accuracy = accuracy.eval(feed_dict={x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0})
                print("step %d, training accuracy %g, test accuracy %g" % (i, train_accuracy, test_accuracy))

            train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})
    finally:
        # Release the session's resources even if training is interrupted.
        sess.close()



if __name__ == '__main__':
    # Parse the command line into the module-level FLAGS, then hand control
    # to tf.app.run().
    parser = argparse.ArgumentParser()

    # modify the dir path to your own dataset
    parser.add_argument(
        '--data_dir',
        default='/tmp/mnist',
        type=str,
        help='Directory for storing data',
    )
    FLAGS = parser.parse_args()
    tf.app.run()

需要配置TensorFlow 1.x和Python 3的运行环境

结果如下

step 0, training accuracy 0.06, test accuracy 0.0892
step 100, training accuracy 0.86, test accuracy 0.8692
step 200, training accuracy 0.97, test accuracy 0.9207
step 300, training accuracy 0.92, test accuracy 0.9403
step 400, training accuracy 0.95, test accuracy 0.9485
step 500, training accuracy 0.91, test accuracy 0.9522
step 600, training accuracy 0.97, test accuracy 0.9565
step 700, training accuracy 0.97, test accuracy 0.9622
step 800, training accuracy 0.96, test accuracy 0.9638
step 900, training accuracy 0.98, test accuracy 0.9687
step 1000, training accuracy 0.97, test accuracy 0.9703

有任何环境配置的问题请联系，欢迎指出错误

相关阅读:
18.3.2从Class上获取信息(属性)
18.3.2从Class上获取信息(方法)
18.3.2从Class上获取信息(构造器)
18.3.1获得Class对象
 ClassLoader.loadClass和Class.forName的区别
 java线程池原理
 如何理解「不要用战术上的勤奋掩盖战略上的懒惰」？
(转)生产者/消费者问题的多种Java实现方式
 Machine learning system design---Error analysis
Machine learning system design---prioritizing what to work on
原文地址：https://www.cnblogs.com/yueshangzuo/p/8032300.html