
Implementation of and Thoughts on the MNIST Digit Recognition Program in 《TensorFlow 實戰Google深度學習框架》

The program from the book:

import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

__author__: str = 'zhangkun'
INPUT_NODE = 784  # number of input nodes
OUTPUT_NODE = 10  # number of output nodes

LAYER1_NODE = 500  # number of hidden-layer nodes
BATCH_SIZE = 100  # batch size

LEARNING_RATE_BASE = 0.8  # base learning rate
LEARNING_RATE_DECAY = 0.99  # learning rate decay rate

REGULARIZATION_RATE = 0.0001  # regularization coefficient
TRAINING_STEPS = 30000  # number of training iterations
MOVING_AVERAGE_DECAY = 0.99  # moving average decay rate


def inference(input_tensor, avg_class, weights1, biases1, weights2, biases2):  # what is avg_class?
    """
    :param input_tensor: the input
    :param avg_class: class used to compute the moving averages of the parameters
    :param weights1: first-layer weights
    :param biases1: first-layer biases
    :param weights2: second-layer weights
    :param biases2: second-layer biases
    :return: the forward-pass result of the network
    """
    # without moving averages
    if avg_class is None:
        layer1 = tf.nn.relu(tf.matmul(input_tensor, weights1) + biases1)
        return tf.matmul(layer1, weights2) + biases2
    # with moving averages
    else:
        layer1 = tf.nn.relu(
            tf.matmul(input_tensor, avg_class.average(weights1)) + avg_class.average(biases1)
        )
        return tf.matmul(layer1, avg_class.average(weights2)) + avg_class.average(biases2)


def train(mnist):
    x = tf.placeholder(tf.float32, [None, INPUT_NODE], name='x-input')
    # the ground-truth classification labels y_
    y_ = tf.placeholder(tf.float32, [None, OUTPUT_NODE], name='y-input')

    weights1 = tf.Variable(tf.truncated_normal([INPUT_NODE, LAYER1_NODE], stddev=0.1))  # convergence is very slow when stddev=0.1 is used, why?
    biases1 = tf.Variable(tf.constant(0.1, shape=[LAYER1_NODE]))

    weights2 = tf.Variable(tf.truncated_normal([LAYER1_NODE, OUTPUT_NODE], stddev=0.1))
    biases2 = tf.Variable(tf.constant(0.1, shape=[OUTPUT_NODE]))

    y = inference(x, None, weights1, biases1, weights2, biases2)

    global_step = tf.Variable(0, trainable=False)

    # moving-average class; it stores the moving-averaged parameters
    variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)

    variable_averages_op = variable_averages.apply(tf.trainable_variables())
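    # (note added) apply() creates a non-trainable shadow variable for every
    # trainable variable, plus an op that updates each one as
    #   shadow = decay * shadow + (1 - decay) * variable,
    # where decay = min(MOVING_AVERAGE_DECAY, (1 + global_step) / (10 + global_step)).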

    # y computed from the moving-averaged parameters
    average_y = inference(x, variable_averages, weights1, biases1, weights2, biases2)

    # classification loss; the sparse version expects class indices,
    # so tf.argmax(y_, 1) first converts the one-hot labels into indices
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf.argmax(y_, 1), logits=y)
    # cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf.argmax(y_, 1), logits=average_y)
    cross_entropy_mean = tf.reduce_mean(cross_entropy)

    regularizer = tf.contrib.layers.l2_regularizer(REGULARIZATION_RATE)
    regularization = regularizer(weights1) + regularizer(weights2)
    loss = cross_entropy_mean + regularization

    learning_rate = tf.train.exponential_decay(
        LEARNING_RATE_BASE,  # base learning rate
        global_step,  # current iteration number
        mnist.train.num_examples / BATCH_SIZE,  # iterations needed for one pass over the training data
        LEARNING_RATE_DECAY  # learning rate decay rate
    )
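    # (note added) with the default staircase=False this evaluates to
    #   learning_rate = LEARNING_RATE_BASE * LEARNING_RATE_DECAY ** (global_step / decay_steps)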

    # why not use Adam? is it here that weights1 and weights2 get updated?
    train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)

    #  run train_step and variable_averages_op in sequence:
    #  update the network parameters and the moving-average parameters;
    #  the moving-averaged values take no part in updating the network parameters
    with tf.control_dependencies([train_step, variable_averages_op]):
        train_op = tf.no_op(name='train')

    '''
    Flow control:
    the usage is simple: the operations inside the context manager only run
    after control_inputs have executed. For example:
     with tf.control_dependencies([a, b, c]):
      # `d` and `e` will only run after `a`, `b`, and `c` have executed.
      d = ...
      e = ...
    '''
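    # (note added) tf.group(train_step, variable_averages_op) would be an
    # equivalent way to build train_op here.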

    #  compute the accuracy
    correct_prediction = tf.equal(tf.argmax(average_y, 1), tf.argmax(y_, 1))
    # correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # initialize the session and start the training process.
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        validate_feed = {x: mnist.validation.images, y_: mnist.validation.labels}
        test_feed = {x: mnist.test.images, y_: mnist.test.labels}

        for i in range(TRAINING_STEPS):
            if i % 1000 == 0:
                validate_acc = sess.run(accuracy, feed_dict=validate_feed)
                print("after %d training steps, validation accuracy using the average model is %g" % (i, validate_acc))
            xs, ys = mnist.train.next_batch(BATCH_SIZE)
            sess.run(train_op, feed_dict={x: xs, y_: ys})

        test_acc = sess.run(accuracy, feed_dict=test_feed)
        print("after %d training steps, test accuracy using the average model is %g" % (TRAINING_STEPS, test_acc))


def main(argv=None):  # what is this for?
    mnist = input_data.read_data_sets("../MNIST_data/", one_hot=True)
    train(mnist)


if __name__ == '__main__':  # entry point
    tf.app.run()
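    # (note added) tf.app.run() parses command-line flags and then calls
    # main(argv), which is why main takes an argv parameter.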

While debugging I ran into a truly bizarre bug. The correct code is:

    else:
        layer1 = tf.nn.relu(
            tf.matmul(input_tensor, avg_class.average(weights1)) + avg_class.average(biases1)
        )
        return tf.matmul(layer1, avg_class.average(weights2)) + avg_class.average(biases2)

but I had misplaced the parentheses and written it as:

    else:
        layer1 = tf.nn.relu(
            tf.matmul(input_tensor, avg_class.average(weights1) + avg_class.average(biases1))
        )
        return tf.matmul(layer1, avg_class.average(weights2) + avg_class.average(biases2))

which produced wildly wrong accuracy numbers. The surprising part is that this version still runs without any error: adding biases1 (shape [500]) to weights1 (shape [784, 500]) simply broadcasts the bias across every row of the weight matrix, so every shape still checks out, while the computed predictions are wrong.
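
A minimal NumPy sketch (shapes copied from the network; names are illustrative) of why the misplaced parenthesis raises no shape error:

import numpy as np

weights = np.ones((784, 500))      # same shape as weights1
biases = np.full((500,), 0.1)      # same shape as biases1
x = np.ones((1, 784))              # one input image

# misplaced parenthesis: the bias broadcasts across every row of the
# weight matrix, so the shapes still line up and no error is raised
wrong = x @ (weights + biases)     # shape (1, 500), silently incorrect
# correct version: the bias is added after the matrix multiplication
right = x @ weights + biases       # shape (1, 500)
print(np.allclose(wrong, right))   # False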

Also, some thoughts on understanding the moving average:

The moving average exists to improve accuracy at evaluation time, but the averaged values should not serve as the parameters that training itself optimizes.
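
A minimal, self-contained sketch (variable names are illustrative) of what tf.train.ExponentialMovingAverage actually maintains:

import tensorflow as tf

v = tf.Variable(0.0)
step = tf.Variable(0, trainable=False)
ema = tf.train.ExponentialMovingAverage(0.99, step)
maintain_op = ema.apply([v])  # creates a shadow variable for v

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.assign(v, 5.0))
    sess.run(maintain_op)
    # with step = 0, decay = min(0.99, (1 + 0) / (10 + 0)) = 0.1,
    # so shadow = 0.1 * 0.0 + 0.9 * 5.0 = 4.5
    print(sess.run(ema.average(v)))  # 4.5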

Comparing the validation accuracy with and without the moving average (switch the correct_prediction line between average_y and y), using the moving average seems to give slightly better predictions.

So what happens if the moving-averaged parameters are used during training itself, i.e. the loss is built from average_y as in the commented-out cross_entropy line? You can see that the program then optimizes the parameters extremely slowly.
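
One intuition for that crawl, as a plain-Python sketch with the decay fixed at 0.99: each update moves the shadow value only 1% of the way toward the live parameter, so the averaged copy that the loss would see lags far behind:

decay = 0.99
param, shadow = 1.0, 1.0
for _ in range(100):
    param -= 0.1                              # pretend an optimizer step moved the parameter
    shadow = decay * shadow + (1 - decay) * param
print(param, round(shadow, 3))                # shadow lags far behind param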