Classic CNNs: VGGNet
VGGNet (Visual Geometry Group)
VGGNet is a deep convolutional network developed jointly by the Visual Geometry Group at the University of Oxford and DeepMind. In the 2014 ILSVRC competition it took second place in the classification task and first place in the localization task. VGGNet comes in six different configurations, but every configuration contains five groups of convolutions, each group using 3x3 kernels and followed by a 2x2 max pooling, with three fully connected layers at the end. When training the deeper configurations, a shallower configuration can be trained first and its weights used to initialize the deeper network, which speeds up convergence.
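As an illustrative sketch of that warm-start trick in TensorFlow 1.x (not part of this tutorial's code; the checkpoint path and the variable-name filter are hypothetical), only the variables the two configurations share are restored after initializing everything:

import tensorflow as tf

# Hypothetical sketch: warm-start a deeper configuration from a shallower run.
# Assumes both models give their common layers identical variable names, and
# that the deeper model has already been built in the default graph.
shared_vars = [v for v in tf.global_variables()
               if v.name.startswith('conv_w') or v.name.startswith('conv_b')]
restorer = tf.train.Saver(var_list=shared_vars)  # restores only the shared layers

sess = tf.Session()
sess.run(tf.global_variables_initializer())      # initialize every variable first
restorer.restore(sess, './shallow_vgg-1000')     # then overwrite the shared ones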
The different configurations of the network
Configuration C adds 1x1 convolution kernels. Their main role is to apply a linear transformation to each pixel position without changing the number of channels or the spatial size.
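As a quick illustration (a standalone snippet, not part of the model built below), a 1x1 convolution applies the same linear map to the channel vector at every spatial position:

import tensorflow as tf

x = tf.random_normal([1, 8, 8, 64])           # NHWC input with 64 channels
w = tf.get_variable('w_1x1', [1, 1, 64, 64])  # 1x1 kernel, channel count preserved
y = tf.nn.conv2d(x, w, strides=[1, 1, 1, 1], padding='SAME')
print(y.shape)  # (1, 8, 8, 64): spatial size and channel count are unchanged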
Network parameters
Some convolution groups stack several consecutive convolutional layers with 3x3 kernels. This design preserves the receptive field while reducing the number of parameters in the convolutional layers.
For example, two stacked 3x3 convolutional layers have the same effective receptive field as one 5x5 kernel, and three stacked 3x3 layers are equivalent to one 7x7 kernel, with fewer parameters: roughly (3*3*3)/(7*7) = 0.55 of a 7x7 convolutional layer, while keeping the same receptive field. In addition, stacking three layers applies the non-linearity more times, which gives the network a stronger capacity for learning features.
Two stacked 3x3 convolution kernels
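The 0.55 figure quoted above is easy to verify for a stack with C input and C output channels (ignoring biases):

C = 64
stacked_3x3 = 3 * (3 * 3 * C * C)       # three stacked 3x3 layers: 27 * C^2 parameters
single_7x7 = 7 * 7 * C * C              # one 7x7 layer: 49 * C^2 parameters
print(stacked_3x3 / float(single_7x7))  # 27/49 = 0.551...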
ImageNet is very large, so CIFAR-10 is used to test the network here (downloading the CIFAR-10 data is covered elsewhere), and only the most basic VGGNet is implemented. Because CIFAR-10 images are small, the pooling operations of the last two convolution groups are omitted, and the number of kernels is also adjusted to the image size.
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import time
import cifar10_input  # loads and preprocesses the data; from tensorflow/models/tutorials/image/cifar10
Network hyperparameters
max_steps = 20700  # about 20,000 training steps in total, run in two sessions; the variables were saved once in between
batch_size = 256   # mini-batch size
s_times = 20       # number of mini-batches per training round, each of size batch_size
learning_rate = 0.0001
data_dir = 'cifar10data/cifar-10-batches-bin'  # path to the data
Weight and bias initialization
# Xavier initialization
# convolution kernel weights
def init_conv_weights(shape, name):
    weights = tf.get_variable(name=name, shape=shape, dtype=tf.float32,
                              initializer=tf.contrib.layers.xavier_initializer_conv2d())
    return weights

# fully connected weights
def init_fc_weights(shape, name):
    weights = tf.get_variable(name=name, shape=shape, dtype=tf.float32,
                              initializer=tf.contrib.layers.xavier_initializer())
    return weights

# biases
def init_biases(shape, name):
    biases = tf.Variable(tf.random_normal(shape), name=name, dtype=tf.float32)
    return biases
Convolution, pooling, fully connected layers
# convolution
# arguments: input tensor, kernel weights, biases, kernel strides along the height and width dimensions
def conv2d(input_tensor, weights, biases, s_h, s_w):
    conv = tf.nn.conv2d(input_tensor, weights, [1, s_h, s_w, 1], padding='SAME')
    return tf.nn.relu(conv + biases)

# max pooling
# arguments: input tensor, pooling-kernel height and width, pooling strides along the height and width dimensions
def max_pool(input_tensor, k_h, k_w, s_h, s_w):
    return tf.nn.max_pool(input_tensor, ksize=[1, k_h, k_w, 1], strides=[1, s_h, s_w, 1], padding='SAME')

# fully connected layer (applies ReLU)
# arguments: input tensor, fully connected weights, biases
def fullc(input_tensor, weights, biases):
    return tf.nn.relu_layer(input_tensor, weights, biases)
Placeholder nodes
# input placeholders
images = tf.placeholder(tf.float32, [batch_size, 24, 24, 3])
labels = tf.placeholder(tf.int32, [batch_size])
# dropout keep probability (regularization)
keep_prob = tf.placeholder(tf.float32)
Building the network
# Wrapping the ops in name scopes makes the graph easier to read when it is visualized with TensorBoard.
# first convolution group: conv3-16
with tf.name_scope('conv_group_1'):
    cw1 = init_conv_weights([3, 3, 3, 16], name='conv_w1')
    cb1 = init_biases([16], name='conv_b1')
    conv1 = conv2d(images, cw1, cb1, 1, 1)

# 2x2 max pooling
pool1 = max_pool(conv1, 2, 2, 2, 2)

# second convolution group: conv3-32
with tf.name_scope('conv_group_2'):
    cw2 = init_conv_weights([3, 3, 16, 32], name='conv_w2')
    cb2 = init_biases([32], name='conv_b2')
    conv2 = conv2d(pool1, cw2, cb2, 1, 1)

# max pooling
pool2 = max_pool(conv2, 2, 2, 2, 2)

# third convolution group: conv3-64 conv3-64
with tf.name_scope('conv_group_3'):
    cw3 = init_conv_weights([3, 3, 32, 64], name='conv_w3')
    cb3 = init_biases([64], name='conv_b3')
    conv3 = conv2d(pool2, cw3, cb3, 1, 1)
    cw4 = init_conv_weights([3, 3, 64, 64], name='conv_w4')
    cb4 = init_biases([64], name='conv_b4')
    conv4 = conv2d(conv3, cw4, cb4, 1, 1)

# max pooling
pool3 = max_pool(conv4, 2, 2, 2, 2)

# fourth convolution group: conv3-128 conv3-128
with tf.name_scope('conv_group_4'):
    cw5 = init_conv_weights([3, 3, 64, 128], name='conv_w5')
    cb5 = init_biases([128], name='conv_b5')
    conv5 = conv2d(pool3, cw5, cb5, 1, 1)
    cw6 = init_conv_weights([3, 3, 128, 128], name='conv_w6')
    cb6 = init_biases([128], name='conv_b6')
    conv6 = conv2d(conv5, cw6, cb6, 1, 1)

# the feature maps are already 3x3 here; pooling again would shrink them to 2x2, so it is skipped
#pool4 = max_pool(conv6, 2, 2, 2, 2)
# fifth convolution group: conv3-128 conv3-128
# (the paper uses 256 kernels here; the count is reduced to suit the small CIFAR-10 images)
with tf.name_scope('conv_group_5'):
    cw7 = init_conv_weights([3, 3, 128, 128], name='conv_w7')
    cb7 = init_biases([128], name='conv_b7')
    conv7 = conv2d(conv6, cw7, cb7, 1, 1)
    cw8 = init_conv_weights([3, 3, 128, 128], name='conv_w8')
    cb8 = init_biases([128], name='conv_b8')
    conv8 = conv2d(conv7, cw8, cb8, 1, 1)
# a pooling layer would normally follow here as well, but the feature maps are only 3x3, so it is omitted
Flattening the output of the last convolution
# reshape the data
reshape_conv8 = tf.reshape(conv8, [batch_size, -1])
n_in = reshape_conv8.get_shape()[-1].value
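As a sanity check on the flattened size: the 24x24 input is halved by each of the three 2x2 max poolings, and the last convolution group outputs 128 feature maps, so n_in should be 3 * 3 * 128 = 1152:

size = 24
for _ in range(3):  # pool1, pool2, pool3 each halve the spatial size: 24 -> 12 -> 6 -> 3
    size //= 2
assert size * size * 128 == 1152  # matches n_in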
The fully connected part of the network (two hidden layers)
# name scope of the first fully connected layer
with tf.name_scope('fullc_1'):
    fw9 = init_fc_weights([n_in, 256], name='fullc_w9')
    fb9 = init_biases([256], name='fullc_b9')
    activation1 = fullc(reshape_conv8, fw9, fb9)

# dropout regularization
drop_act1 = tf.nn.dropout(activation1, keep_prob)

with tf.name_scope('fullc_2'):
    fw10 = init_fc_weights([256, 256], name='fullc_w10')
    fb10 = init_biases([256], name='fullc_b10')
    activation2 = fullc(drop_act1, fw10, fb10)

# dropout regularization
drop_act2 = tf.nn.dropout(activation2, keep_prob)

with tf.name_scope('fullc_3'):
    fw11 = init_fc_weights([256, 10], name='fullc_w11')
    fb11 = init_biases([10], name='full_b11')
    logits = tf.add(tf.matmul(drop_act2, fw11), fb11)
    output = tf.nn.softmax(logits)
Loss function and optimizer
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)
cost = tf.reduce_mean(cross_entropy, name='Train_Cost')
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
Evaluation function
# evaluates prediction accuracy
# the labels are int32 class indices, not one-hot encoded
def accuracy(labels, output):
    labels = tf.to_int64(labels)
    pred_result = tf.equal(labels, tf.argmax(output, 1))
    accu = tf.reduce_mean(tf.cast(pred_result, tf.float32))
    return accu
Training and test data
# load batch_size training examples, with augmentation: random cropping, flipping, and so on
train_images, train_labels = cifar10_input.distorted_inputs(batch_size=batch_size, data_dir=data_dir)
Filling queue with 20000 CIFAR images before starting to train. This will take a few minutes.
# load batch_size test examples, without augmentation
test_images, test_labels = cifar10_input.inputs(batch_size=batch_size, data_dir=data_dir, eval_data=True)
Training
# Training loop: each round runs s_times mini-batches, records the cost of the
# last one, and prints throughput statistics every `display` rounds
def training(sess, max_steps, s_times, keeprob, display):
    Cost = []
    for i in range(max_steps):
        for j in range(s_times):
            start = time.time()
            batch_images, batch_labels = sess.run([train_images, train_labels])
            opt = sess.run(optimizer, feed_dict={images: batch_images, labels: batch_labels,
                                                 keep_prob: keeprob})
            every_batch_time = time.time() - start
        c = sess.run(cost, feed_dict={images: batch_images, labels: batch_labels,
                                      keep_prob: keeprob})
        Cost.append(c)
        if i % display == 0:
            samples_per_sec = float(batch_size) / every_batch_time
            format_str = 'Epoch %d: %d samples/sec, %.4f sec/batch, Cost : %.5f'
            print(format_str % (i + display, samples_per_sec, every_batch_time, c))
    return Cost
Session
sess = tf.Session()
sess.run(tf.global_variables_initializer())
# image augmentation is accelerated with 16 threads; this starts 16 independent threads
tf.train.start_queue_runners(sess=sess)
[<Thread(Thread-4, started daemon 139861159835392)>,
<Thread(Thread-5, started daemon 139861147252480)>,
<Thread(Thread-6, started daemon 139861138859776)>,
.....
<Thread(Thread-35, started daemon 139859318511360)>,
<Thread(Thread-36, started daemon 139859310118656)>,
<Thread(Thread-37, started daemon 139859301725952)>]
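The queue runners above are started without keeping a handle on them. A common variant (shown only as a sketch here, not used in this run) adds a tf.train.Coordinator so the input threads can be stopped cleanly when training ends:

coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
# ... run the training ...
coord.request_stop()
coord.join(threads)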
Saving the binary file for visualizing the network structure
#writer = tf.summary.FileWriter('./VggNet_visual',sess.graph)
Loading the variables from the previous training run
# load the weights saved after 3,200 training steps
saver = tf.train.Saver()
saver.restore(sess,'./vgg_weights-3200')
INFO:tensorflow:Restoring parameters from ./vgg_weights-3200
train_cost = training(sess,17800,5,0.7,10)
Epoch 10: 474 samples/sec, 0.5400 sec/batch, Cost : 0.30948
Epoch 20: 482 samples/sec, 0.5302 sec/batch, Cost : 0.38076
Epoch 30: 479 samples/sec, 0.5339 sec/batch, Cost : 0.33604
..........
Epoch 17500: 473 samples/sec, 0.5411 sec/batch, Cost : 0.23114
fig, ax = plt.subplots(figsize=(13, 6))
ax.plot(train_cost)
plt.title('Train Cost')
plt.grid()
plt.show()
First 3,000 training steps
Subsequent 17,800 training steps
After about 15,000 steps the cost decreases very slowly, but the downward trend is still clearly visible.
# evaluation on a training batch
train__images, train__labels = sess.run([train_images, train_labels])
train_output = sess.run(output, feed_dict={images: train__images, keep_prob: 1.0})
train_accuracy = sess.run(accuracy(train__labels, output=train_output))

# evaluation on a test batch
test__images, test__labels = sess.run([test_images, test_labels])
test_output = sess.run(output, feed_dict={images: test__images, keep_prob: 1.0})
test_accuracy = sess.run(accuracy(test__labels, test_output))

print('train accuracy is: %.7f' % train_accuracy)
print('test accuracy is: %.7f' % test_accuracy)
train accuracy is: 0.8789062
test accuracy is: 0.8807812
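Both numbers above are measured on a single batch of 256 images, so they are noisy. A minimal sketch of a steadier estimate averages over enough batches to roughly cover the 10,000 test images:

n_batches = 39  # 39 * 256 ~= 10000 test images
total = 0.0
for _ in range(n_batches):
    batch_x, batch_y = sess.run([test_images, test_labels])
    batch_out = sess.run(output, feed_dict={images: batch_x, keep_prob: 1.0})
    total += sess.run(accuracy(batch_y, batch_out))  # mirrors the single-batch code above
print('mean test accuracy: %.4f' % (total / n_batches))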
Saving the variables after 3,200 training steps
ckpt_dir = './vgg_weights'
saver = tf.train.Saver()
saver.save(sess, save_path=ckpt_dir, global_step=3200)
'./vgg_weights-3200'
Visualization of the network structure
The paper on very deep convolutional networks:
Karen Simonyan, Andrew Zisserman, "Very Deep Convolutional Networks for Large-Scale Image Recognition": http://www.robots.ox.ac.uk/~vgg/research/very_deep/