1. Introduction to and understanding of Batch Normalization
In a neural network, each layer typically multiplies its input by a weight matrix, adds a bias, and passes the result through a nonlinear activation function. As the network gets deeper, however, the distribution of the activation function's input (W*X+B) shifts from layer to layer and from iteration to iteration.
The values may drift into the saturated regions of the activation function. With the sigmoid function, for example, when the input is a large positive or a large negative number, its derivative is close to zero, so the gradient vanishes and convergence becomes very slow, as shown in Figure 1.
Figure 1: The sigmoid activation function
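To make the saturation effect concrete, here is a minimal NumPy sketch (my own illustration, not part of the original post) that evaluates the sigmoid and its derivative sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x)) at a few points. For inputs of large magnitude the derivative is almost zero, which is exactly the vanishing-gradient problem described above.

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def sigmoid_grad(x):
    # Derivative of the sigmoid: sigmoid(x) * (1 - sigmoid(x))
    s = sigmoid(x)
    return s * (1.0 - s)

for x in [0.0, 2.0, 5.0, 10.0]:
    print("x = {:>5.1f}  sigmoid = {:.6f}  gradient = {:.6f}".format(
        x, sigmoid(x), sigmoid_grad(x)))
# x =   0.0  sigmoid = 0.500000  gradient = 0.250000
# x =   2.0  sigmoid = 0.880797  gradient = 0.104994
# x =   5.0  sigmoid = 0.993307  gradient = 0.006648
# x =  10.0  sigmoid = 0.999955  gradient = 0.000045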
Batch Normalization addresses this by transforming the data: the input to each layer's activation function is normalized to zero mean and unit variance (and then scaled and shifted by the learnable parameters gamma and beta, as in the code in Section 2). This keeps the values in the region where the activation function is sensitive and therefore speeds up convergence.
The BN procedure is shown in Figure 2.
Figure 2: The Batch Normalization algorithm
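Figure 2 corresponds to the standard batch-normalization transform: compute the mini-batch mean and variance, normalize, then scale and shift with the learnable parameters gamma and beta. The following NumPy sketch (again my own illustration, with made-up variable names) walks through those steps on a toy batch:

import numpy as np

def batch_norm(x, gamma, beta, epsilon=1e-3):
    """Batch Normalization over a mini-batch x of shape (batch, features)."""
    # 1) mini-batch mean and variance, computed per feature
    mean = x.mean(axis=0)
    var = x.var(axis=0)
    # 2) normalize to zero mean and unit variance
    x_hat = (x - mean) / np.sqrt(var + epsilon)
    # 3) scale and shift with the learnable parameters gamma and beta
    return gamma * x_hat + beta

# Toy usage: a batch of 4 samples with 3 features each
x = np.array([[1.0, 2.0, 3.0],
              [2.0, 4.0, 6.0],
              [3.0, 6.0, 9.0],
              [4.0, 8.0, 12.0]])
y = batch_norm(x, gamma=np.ones(3), beta=np.zeros(3))
print(y.mean(axis=0))  # close to 0 for every feature
print(y.std(axis=0))   # close to 1 for every feature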
2. Comparing training with and without Batch Normalization
The comparison is run on the MNIST dataset.
If is_bn is set to True, the model is trained with BN; if is_bn is set to False, it is trained without BN. The code is as follows:
import tensorflow as tf
import argparse

old_v = tf.logging.get_verbosity()
tf.logging.set_verbosity(tf.logging.ERROR)
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/home/sxj/MNIST_data/", one_hot=True)
tf.logging.set_verbosity(old_v)


def arg_parse():
    parser = argparse.ArgumentParser()
    parser.add_argument("--is_bn", default='True', type=str,
                        help='whether using batch normalization or not')
    return parser.parse_args()


def batch_norm_full(prev_layer, num_units):
    # BN for fully connected layers; always normalizes with the current batch statistics.
    gamma = tf.Variable(tf.ones([num_units]))
    beta = tf.Variable(tf.zeros([num_units]))
    epsilon = 1e-3
    batch_mean, batch_variance = tf.nn.moments(prev_layer, [0])
    ema = tf.train.ExponentialMovingAverage(decay=0.99)  # decay coefficient of the moving average

    def mean_var_with_update():
        ema_apply_op = ema.apply([batch_mean, batch_variance])
        with tf.control_dependencies([ema_apply_op]):
            return tf.identity(batch_mean), tf.identity(batch_variance)

    mean, var = mean_var_with_update()
    with tf.control_dependencies([mean, var]):
        batch_normalized_output = tf.nn.batch_normalization(prev_layer, mean, var, beta, gamma, epsilon)
    return batch_normalized_output


def batch_norm_conv(prev_layer, out_channels, is_training):
    # BN for convolutional layers; keeps population statistics for inference.
    gamma = tf.Variable(tf.ones([out_channels]))
    beta = tf.Variable(tf.zeros([out_channels]))
    pop_mean = tf.Variable(tf.zeros([out_channels]), trainable=False)
    pop_variance = tf.Variable(tf.ones([out_channels]), trainable=False)
    epsilon = 1e-3
    axis = list(range(len(prev_layer.get_shape()) - 1))

    def batch_norm_training():
        batch_mean, batch_variance = tf.nn.moments(prev_layer, axis, keep_dims=False)
        decay = 0.99  # decay coefficient of the moving average
        # Update the population statistics so that they can be used at inference time.
        train_mean = tf.assign(pop_mean, pop_mean * decay + batch_mean * (1 - decay))
        train_variance = tf.assign(pop_variance, pop_variance * decay + batch_variance * (1 - decay))
        with tf.control_dependencies([train_mean, train_variance]):
            return tf.nn.batch_normalization(prev_layer, batch_mean, batch_variance, beta, gamma, epsilon)

    def batch_norm_inference():
        return tf.nn.batch_normalization(prev_layer, pop_mean, pop_variance, beta, gamma, epsilon)

    batch_normalized_output = tf.cond(is_training, batch_norm_training, batch_norm_inference)
    return batch_normalized_output


def train_inference(num_batches, batch_size, learning_rate, is_bn=True):
    inputs = tf.placeholder(tf.float32, [None, 784])
    labels = tf.placeholder(tf.float32, [None, 10])
    is_training = tf.placeholder(tf.bool)

    # define weight
    def weight_variable(shape):
        initial = tf.truncated_normal(shape, stddev=0.1)
        return tf.Variable(initial)

    # define bias
    def bias_variable(shape):
        initial = tf.constant(0.1, shape=shape)
        return tf.Variable(initial)

    # define conv
    def conv2d(x, w):
        return tf.nn.conv2d(x, w, strides=[1, 1, 1, 1], padding="SAME")

    # define pooling
    def max_pool_2x2(x):
        return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME")

    x_image = tf.reshape(inputs, [-1, 28, 28, 1])

    # level 1
    w_conv1 = weight_variable([5, 5, 1, 32])
    b_conv1 = bias_variable([32])
    h_conv1 = tf.nn.relu(tf.add(conv2d(x_image, w_conv1), b_conv1))
    h_pool1 = max_pool_2x2(h_conv1)

    # level 2
    w_conv2 = weight_variable([5, 5, 32, 64])
    b_conv2 = bias_variable([64])
    if is_bn:
        # using Batch Normalization
        h_conv2 = tf.nn.relu(batch_norm_conv(tf.add(conv2d(h_pool1, w_conv2), b_conv2), 64, is_training))
        h_pool2 = max_pool_2x2(h_conv2)
    else:
        # not using Batch Normalization
        h_conv2 = tf.nn.relu(tf.add(conv2d(h_pool1, w_conv2), b_conv2))
        h_pool2 = max_pool_2x2(h_conv2)

    w_fc1 = weight_variable([7 * 7 * 64, 1024])
    b_fc1 = bias_variable([1024])
    if is_bn:
        # using Batch Normalization
        h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
        h_fc1 = tf.nn.relu(batch_norm_full(tf.add(tf.matmul(h_pool2_flat, w_fc1), b_fc1), 1024))
    else:
        # not using Batch Normalization
        h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
        h_fc1 = tf.layers.dense(h_pool2_flat, 1024, activation=tf.nn.relu)

    # dropout
    keep_prob = tf.placeholder(tf.float32)
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

    if is_bn:
        layer2 = tf.layers.dense(h_fc1_drop, 10, activation=None)
        logits = batch_norm_full(layer2, 10)
    else:
        logits = tf.layers.dense(h_fc1_drop, 10, activation=None)

    model_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels))
    train_opt = tf.train.AdamOptimizer(learning_rate).minimize(model_loss)

    correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(labels, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        if is_bn:
            file = open('with_bn.txt', 'w')
        else:
            file = open('without_bn.txt', 'w')

        for batch_i in range(num_batches):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            sess.run(train_opt, {inputs: batch_xs, labels: batch_ys, is_training: True, keep_prob: 0.5})
            if batch_i % 100 == 0:
                loss, acc = sess.run([model_loss, accuracy],
                                     {inputs: mnist.validation.images, labels: mnist.validation.labels,
                                      is_training: False, keep_prob: 1.0})
                print('Batch: {:>2}: Validation loss: {:>3.5f}, Validation accuracy: {:>3.5f}'.format(batch_i, loss, acc))
            elif batch_i % 10 == 0:
                loss, acc = sess.run([model_loss, accuracy],
                                     {inputs: batch_xs, labels: batch_ys, is_training: False, keep_prob: 1.0})
                print('Batch: {:>2}: Training loss: {:>3.5f}, Training accuracy: {:>3.5f}'.format(batch_i, loss, acc))
                # record the training curve: one "loss acc" pair every 10 iterations
                file.write(str(loss) + ' ' + str(acc) + ' ')

        acc = sess.run(accuracy, {inputs: mnist.validation.images, labels: mnist.validation.labels,
                                  is_training: False, keep_prob: 1.0})
        print('Final validation accuracy: {:>3.5f}'.format(acc))
        acc = sess.run(accuracy, {inputs: mnist.test.images, labels: mnist.test.labels,
                                  is_training: False, keep_prob: 1.0})
        print('Final test accuracy: {:>3.5f}'.format(acc))

        correct = 0
        for i in range(100):
            correct += sess.run(accuracy, feed_dict={inputs: [mnist.test.images[i]],
                                                     labels: [mnist.test.labels[i]],
                                                     is_training: False, keep_prob: 1.0})
        print("Accuracy on 100 samples:", correct / 100)
        file.close()


num_batches = 800
batch_size = 64
learning_rate = 0.002

tf.reset_default_graph()
with tf.Graph().as_default():
    args = arg_parse()
    if args.is_bn == 'True':
        is_bn = True
    elif args.is_bn == 'False':
        is_bn = False
    else:
        raise ValueError("Invalid is_bn, which should be 'True' or 'False'")
    train_inference(num_batches, batch_size, learning_rate, is_bn)
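As an aside, TensorFlow 1.x also ships a built-in layer, tf.layers.batch_normalization, which maintains the population statistics for you. The fragment below is only a sketch of how the hand-written batch_norm_conv call could be swapped for the built-in layer; it reuses names from the script above (conv2d, h_pool1, w_conv2, b_conv2, is_training, model_loss, learning_rate), and the intermediate name conv_out is mine. The layer's moving-average updates live in the UPDATE_OPS collection and must be run together with the training step.

# Sketch only: replaces the batch_norm_conv(...) call in the script above.
conv_out = tf.add(conv2d(h_pool1, w_conv2), b_conv2)   # conv_out is a hypothetical name
h_conv2 = tf.nn.relu(tf.layers.batch_normalization(conv_out, training=is_training))
h_pool2 = max_pool_2x2(h_conv2)

# The built-in layer registers its moving-average updates in UPDATE_OPS;
# attach them to the optimizer so they run on every training step.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_opt = tf.train.AdamOptimizer(learning_rate).minimize(model_loss)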
Export the recorded data and compare the two runs:
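The training script writes one pair of training loss and accuracy values every 10 iterations into with_bn.txt or without_bn.txt. A small matplotlib sketch along the following lines (file names taken from the code above; the plotting script itself is not part of the original post) can be used to draw the comparison:

import matplotlib.pyplot as plt

def read_curve(path):
    """Read the 'loss acc loss acc ...' values written by train_inference."""
    with open(path) as f:
        values = [float(v) for v in f.read().split()]
    losses = values[0::2]      # even positions: training loss
    accuracies = values[1::2]  # odd positions: training accuracy
    return losses, accuracies

_, acc_with_bn = read_curve('with_bn.txt')
_, acc_without_bn = read_curve('without_bn.txt')

plt.plot(acc_with_bn, label='with BN')
plt.plot(acc_without_bn, label='without BN')
plt.xlabel('x10 iterations')
plt.ylabel('training accuracy')
plt.legend()
plt.show()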
The comparison of training accuracy is shown below (one unit on the horizontal axis corresponds to 10 iterations):
As the curves show, training converges much faster with Batch Normalization.