TensorFlow學習--AlexNet實現&影象識別
AlexNet主要技術點
AlexNet使用的主要技術:
1. 使用ReLU作為CNN的啟用函式,解決了Sigmoid在較深網路中的梯度彌散問題(vanishing gradient problem).
2. 訓練時使用Dropout隨機忽略一部分神經元,避免了模型的過擬合問題.
3. 在CNN中使用重疊的最大池化,避免了平均池化造成的模糊效果.同時讓步長小於池化核的尺寸,使池化層的輸出發生重疊和覆蓋,提升特徵的豐富性.
4. 提出了LRN層,對區域性神經元的活動建立競爭機制,強化響應比較大的神經元,抑制反饋較小的神經元,增強模型的泛化能力.
5. 資料增強.隨機地從256*256的原始影象中擷取224*224大小的區域以及水平翻轉影象(相當於增加了(256-224)^2*2=2048倍的資料量),減輕過擬合,提升模型的泛化能力.
6. 使用CUDA加速深度卷積網路的訓練,利用GPU強大的並行運算能力,處理神經網路訓練時大量的矩陣運算.
AlexNet網路結構
AlexNet的網路結構:
5個卷積層+3個全連線層
AlexNet每層的超引數如圖.
兩個GPU,一個GPU執行圖形頂部的圖層部分,另一個執行圖層底部的圖層部分。 GPU僅在某些層進行通訊。
輸入的圖片規格為224*224*3,預處理後為227*227*3.
第一個卷積層使用96個較大的11*11尺寸的卷積核,步長為4,(採用了2個GPU處理,每個GPU處理48個).原影象為RGB 影象,是3通道,此處96個過濾器也是3通道的.得到的特徵圖大小new_feature_size=(img_size - filter_size)/stride +1 = (227-11)/4+1=55即大小為55*55.緊接著一個LRN層,然後是一個3*3的Max pooling最大池化層,步長為2.
AlexNet耗時測試
使用隨機圖片資料測試AlexNet前饋/反饋的平均耗時:
#!/usr/bin/python
# coding:utf-8
# TensorFlow實現AlexNet
from datetime import datetime
import math
import time
import tensorflow as tf
def convLayer(x, name, kh, kw, n_out, dh, dw, p):
    """Build a convolution + bias + ReLU layer and record its variables.

    Args:
        x: Input tensor; its last dimension is read as the channel count.
        name: Name scope of the layer, also used as the name of the ReLU op.
        kh: Kernel height.
        kw: Kernel width.
        n_out: Number of kernels, i.e. output channels.
        dh: Stride along the height axis.
        dw: Stride along the width axis.
        p: List collecting the trainable variables; it is extended in place.

    Returns:
        Tuple ``(activation, p)`` with the ReLU output and the extended list.
    """
    # int(...) accepts both a TF1 Dimension and a plain int, matching the
    # grouped convLayer used by the inference script.
    n_in = int(x.get_shape()[-1])
    with tf.name_scope(name) as scope:
        # Kernels of shape (kh, kw, n_in, n_out) drawn from a truncated normal.
        kernel = tf.Variable(
            tf.truncated_normal([kh, kw, n_in, n_out], dtype=tf.float32, stddev=1e-1),
            name='weights')
        # SAME padding pads the borders so every input position is covered.
        conv = tf.nn.conv2d(x, kernel, [1, dh, dw, 1], padding='SAME')
        # Biases start at zero.
        biases = tf.Variable(
            tf.constant(0.0, shape=[n_out], dtype=tf.float32),
            trainable=True, name='biases')
        bias = tf.nn.bias_add(conv, biases)
        activation = tf.nn.relu(bias, name=scope)
        # Register this layer's trainable variables with the caller's list.
        p += [kernel, biases]
        # Single-argument print() emits the same text on Python 2 and 3.
        print('%s   %s' % (activation.op.name, activation.get_shape().as_list()))
        return activation, p
# Total number of batches to run in the timing benchmark
num_batches = 100
# Fully connected layer
def fcLayer(x, inputData, outputData, reluFlag, name):
    """Apply ``x * w + b`` inside variable scope ``name``, optionally with ReLU.

    Args:
        x: Input tensor fed to ``tf.nn.xw_plus_b``.
        inputData: Number of input features (rows of ``w``).
        outputData: Number of output features (columns of ``w``, size of ``b``).
        reluFlag: When truthy the affine output is passed through ReLU.
        name: Variable scope that owns the ``w`` and ``b`` variables.

    Returns:
        The affine output, rectified when ``reluFlag`` is truthy.
    """
    with tf.variable_scope(name) as scope:
        weights = tf.get_variable('w', shape=[inputData, outputData], dtype='float')
        offsets = tf.get_variable('b', [outputData], dtype='float')
        affine = tf.nn.xw_plus_b(x, weights, offsets, name=scope.name)
        return tf.nn.relu(affine) if reluFlag else affine
# Takes images as input; returns the last pooling layer (pool5) and the list
# of trainable convolution parameters.
def AlexNet(images, classNum=None, dropoutrate=None):
    """Build the AlexNet graph used by the timing benchmark.

    Args:
        images: Input image tensor handed to the first convolution layer.
        classNum: Number of output classes for fc8. When None, the fully
            connected head is not built (previously this default crashed).
        dropoutrate: Value passed as the second positional argument of
            ``tf.nn.dropout`` (``keep_prob`` in TensorFlow 1.x). When None,
            dropout is skipped.

    Returns:
        Tuple ``(pool5, parameters)``: the output of the last pooling layer
        and the list of variables collected by the convolution layers.
    """
    parameters = []

    # Conv 1, followed by LRN and 3x3 / stride-2 max pooling. VALID padding
    # means the pooling window never reaches outside the feature map.
    conv1, parameters = convLayer(images, name='conv1', kh=11, kw=11, n_out=64, dh=4, dw=4, p=parameters)
    lrn1 = tf.nn.lrn(conv1, 4, bias=1.0, alpha=0.001/9, beta=0.75, name='lrn1')
    pool1 = tf.nn.max_pool(lrn1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='VALID', name='pool1')
    print('%s   %s' % (pool1.op.name, pool1.get_shape().as_list()))

    # Conv 2, followed by LRN and max pooling.
    conv2, parameters = convLayer(pool1, name='conv2', kh=5, kw=5, n_out=192, dh=1, dw=1, p=parameters)
    lrn2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001/9, beta=0.75, name='lrn2')
    pool2 = tf.nn.max_pool(lrn2, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='VALID', name='pool2')
    print('%s   %s' % (pool2.op.name, pool2.get_shape().as_list()))

    # Conv 3-5, then the final max pooling.
    conv3, parameters = convLayer(pool2, name='conv3', kh=3, kw=3, n_out=384, dh=1, dw=1, p=parameters)
    conv4, parameters = convLayer(conv3, name='conv4', kh=3, kw=3, n_out=256, dh=1, dw=1, p=parameters)
    conv5, parameters = convLayer(conv4, name='conv5', kh=3, kw=3, n_out=256, dh=1, dw=1, p=parameters)
    pool5 = tf.nn.max_pool(conv5, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='VALID', name='pool5')
    print('%s   %s' % (pool5.op.name, pool5.get_shape().as_list()))

    # Fully connected head. It is not part of the returned values, but its
    # variables are still created in the graph.
    if classNum is not None:
        fc_in = tf.reshape(pool5, [-1, 256 * 6 * 6])
        fc6 = fcLayer(fc_in, 256 * 6 * 6, 4096, True, 'fc6')
        if dropoutrate is not None:
            fc6 = tf.nn.dropout(fc6, dropoutrate)
        fc7 = fcLayer(fc6, 4096, 4096, True, 'fc7')
        if dropoutrate is not None:
            fc7 = tf.nn.dropout(fc7, dropoutrate)
        # fc8 yields class scores (logits), so no ReLU is applied here.
        fcLayer(fc7, 4096, classNum, False, 'fc8')

    return pool5, parameters
# Measure how long each AlexNet step takes.
def time_tensorflow_run(session, target, info_string):
    """Time repeated ``session.run(target)`` calls and print statistics.

    Runs ``num_batches + 10`` steps; the first 10 are warm-up steps whose
    durations are ignored. The step count is read from the module-level
    ``num_batches``.

    Args:
        session: Object exposing ``run(target)`` (a TensorFlow session).
        target: The op or tensor(s) to evaluate on every step.
        info_string: Label used in the summary line.
    """
    # Warm-up steps excluded from the statistics (e.g. memory loading).
    num_steps_burn_in = 10
    total_duration = 0.0
    total_duration_squared = 0.0
    for i in range(num_batches + num_steps_burn_in):
        start_time = time.time()
        _ = session.run(target)
        duration = time.time() - start_time
        if i >= num_steps_burn_in:
            # Report every 10th step once warm-up is over.
            if not i % 10:
                print('%s: step %d, duration = %.3f' % (
                    datetime.now().strftime('%X'), i - num_steps_burn_in, duration))
            total_duration += duration
            total_duration_squared += duration * duration
    # Mean and standard deviation of the per-step duration.
    mn = total_duration / num_batches
    vr = total_duration_squared / num_batches - mn * mn
    # E[x^2] - E[x]^2 can round to a tiny negative number when the durations
    # are almost identical; clamp so math.sqrt does not raise ValueError.
    sd = math.sqrt(max(vr, 0.0))
    print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' % (
        datetime.now().strftime('%X'), info_string, num_batches, mn, sd))
# Time the forward and forward-backward passes on random image data.
def run_benchmark():
    """Build AlexNet on random images and time forward / backward passes."""
    with tf.Graph().as_default():
        batch_size = 32
        image_size = 224
        # Random image batch: (batch_size, image_size, image_size, 3 channels).
        images = tf.Variable(tf.random_normal(
            [batch_size, image_size, image_size, 3],
            dtype=tf.float32,
            stddev=1e-1))
        # Build AlexNet and get pool5 plus the trainable parameter list.
        pool5, parameters = AlexNet(images, classNum=1000, dropoutrate=0.5)
        init = tf.global_variables_initializer()
        # The context manager closes the session even if a timing run raises.
        with tf.Session() as sess:
            sess.run(init)
            # Forward pass timing.
            time_tensorflow_run(sess, pool5, 'Forward')
            # Use an L2 loss on pool5 as the objective and take its gradients
            # with respect to all parameters to simulate a training step.
            objective = tf.nn.l2_loss(pool5)
            grad = tf.gradients(objective, parameters)
            # Forward + backward pass timing.
            time_tensorflow_run(sess, grad, 'Forward-backward')


if __name__ == '__main__':
    run_benchmark()
列印輸出:
conv1 [32, 56, 56, 64]
pool1 [32, 27, 27, 64]
conv2 [32, 27, 27, 192]
pool2 [32, 13, 13, 192]
conv3 [32, 13, 13, 384]
conv4 [32, 13, 13, 256]
conv5 [32, 13, 13, 256]
pool5 [32, 6, 6, 256]
19:43:26: step 0, duration = 1.526
19:43:43: step 10, duration = 2.018
19:44:03: step 20, duration = 1.618
19:44:19: step 30, duration = 1.583
19:44:37: step 40, duration = 1.808
19:44:56: step 50, duration = 1.749
19:45:13: step 60, duration = 1.849
19:45:32: step 70, duration = 1.837
19:45:49: step 80, duration = 1.587
19:46:06: step 90, duration = 1.663
19:46:23: Forward across 100 steps, 1.789 +/- 0.210 sec / batch
19:47:30: step 0, duration = 5.831
19:48:34: step 10, duration = 5.831
19:49:49: step 20, duration = 8.383
19:50:57: step 30, duration = 6.152
19:52:48: step 40, duration = 13.673
19:54:44: step 50, duration = 10.054
19:56:32: step 60, duration = 11.055
19:58:17: step 70, duration = 10.246
20:00:06: step 80, duration = 12.227
20:02:01: step 90, duration = 10.946
20:03:31: Forward-backward across 100 steps, 9.666 +/- 2.279 sec / batch
可以看到5個卷積層以及最後一個池化層,以及每一層輸出tensor的尺寸.
然後還可以看到forward以及backward運算的時間,此處沒有使用GPU,因此可以看到每輪迭代的時間消耗比較大.
AlexNet實現及影象識別
# AlexNet實現
import tensorflow as tf
import numpy as np
# Convolution layer.
# Per the original note, groups=2 corresponds to AlexNet's split into an
# upper and a lower half.
def convLayer(x, kHeight, kWidth, strideX, strideY, featureNum, name, padding="SAME", groups=1):
    """Grouped convolution + bias + ReLU inside variable scope ``name``.

    Args:
        x: Input tensor; its last dimension is read as the channel count.
        kHeight: Kernel height.
        kWidth: Kernel width.
        strideX: Stride along the width axis.
        strideY: Stride along the height axis.
        featureNum: Total number of output feature maps.
        name: Variable scope owning ``w`` and ``b``; also names the ReLU op.
        padding: Padding mode passed to ``tf.nn.conv2d``.
        groups: Number of groups the input channels and kernels are split into.

    Returns:
        The ReLU-activated feature maps of all groups, concatenated on axis 3.
    """
    channel = int(x.get_shape()[-1])

    def conv(a, b):
        return tf.nn.conv2d(a, b, strides=[1, strideY, strideX, 1], padding=padding)

    with tf.variable_scope(name) as scope:
        # Floor division keeps the shape an int on Python 3; true division
        # would yield a float and an invalid variable shape.
        w = tf.get_variable("w", shape=[kHeight, kWidth, channel // groups, featureNum])
        b = tf.get_variable("b", shape=[featureNum])
        # Split the input and the kernels into per-group slices on axis 3.
        xNew = tf.split(value=x, num_or_size_splits=groups, axis=3)
        wNew = tf.split(value=w, num_or_size_splits=groups, axis=3)
        # Convolve each group independently, then merge the feature maps.
        featureMap = [conv(t1, t2) for t1, t2 in zip(xNew, wNew)]
        mergeFeatureMap = tf.concat(axis=3, values=featureMap)
        out = tf.nn.bias_add(mergeFeatureMap, b)
        # bias_add keeps the input shape, so no reshape to the static shape is
        # needed; omitting it also allows an unknown batch dimension.
        return tf.nn.relu(out, name=scope.name)
# Fully connected layer
def fcLayer(x, inputD, outputD, reluFlag, name):
    """Compute ``x * w + b`` under variable scope ``name``; ReLU is optional.

    Args:
        x: Input tensor fed to ``tf.nn.xw_plus_b``.
        inputD: Number of input features (rows of ``w``).
        outputD: Number of output features (columns of ``w``, size of ``b``).
        reluFlag: When truthy the result is passed through ReLU.
        name: Variable scope that owns the ``w`` and ``b`` variables.

    Returns:
        The affine output, rectified when ``reluFlag`` is truthy.
    """
    with tf.variable_scope(name) as scope:
        weight = tf.get_variable("w", shape=[inputD, outputD], dtype="float")
        bias = tf.get_variable("b", [outputD], dtype="float")
        linear = tf.nn.xw_plus_b(x, weight, bias, name=scope.name)
        if not reluFlag:
            return linear
        return tf.nn.relu(linear)
# AlexNet model
class alexNet(object):
    """AlexNet graph whose weights can be loaded from a ``.npy`` file.

    After construction, ``self.fc3`` holds the output of the last fully
    connected layer (``fc8``).
    """

    def __init__(self, x, keepPro, classNum, modelPath="bvlc_alexnet.npy"):
        """Store the configuration and build the graph.

        Args:
            x: Input image tensor.
            keepPro: Value passed to ``tf.nn.dropout`` after fc6 and fc7.
            classNum: Number of output classes of fc8.
            modelPath: Path of the ``.npy`` weight file read by ``loadModel``.
        """
        self.X = x
        self.KEEPPRO = keepPro
        self.CLASSNUM = classNum
        self.MODELPATH = modelPath
        self.buildCNN()

    def buildCNN(self):
        """Build the five convolution and three fully connected layers."""
        # Conv 1, then 3x3 / stride-2 max pooling and LRN.
        conv1 = convLayer(self.X, 11, 11, 4, 4, 96, "conv1", "VALID")
        pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='VALID', name='pool1')
        lrn1 = tf.nn.lrn(pool1, depth_radius=2, alpha=2e-05, beta=0.75, bias=1.0, name='norm1')
        # Conv 2 (two groups), then max pooling and LRN.
        conv2 = convLayer(lrn1, 5, 5, 1, 1, 256, "conv2", groups=2)
        pool2 = tf.nn.max_pool(conv2, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='VALID', name='pool2')
        lrn2 = tf.nn.lrn(pool2, depth_radius=2, alpha=2e-05, beta=0.75, bias=1.0, name='lrn2')
        # Conv 3-5, then the final max pooling.
        conv3 = convLayer(lrn2, 3, 3, 1, 1, 384, "conv3")
        conv4 = convLayer(conv3, 3, 3, 1, 1, 384, "conv4", groups=2)
        conv5 = convLayer(conv4, 3, 3, 1, 1, 256, "conv5", groups=2)
        pool5 = tf.nn.max_pool(conv5, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='VALID', name='pool5')
        # Fully connected 1 (fc6) with dropout.
        fcIn = tf.reshape(pool5, [-1, 256 * 6 * 6])
        fc1 = fcLayer(fcIn, 256 * 6 * 6, 4096, True, "fc6")
        dropout1 = tf.nn.dropout(fc1, self.KEEPPRO)
        # Fully connected 2 (fc7) with dropout.
        fc2 = fcLayer(dropout1, 4096, 4096, True, "fc7")
        dropout2 = tf.nn.dropout(fc2, self.KEEPPRO)
        # Fully connected 3 (fc8): produces the class scores fed to softmax,
        # so it must not be rectified.
        self.fc3 = fcLayer(dropout2, 4096, self.CLASSNUM, False, "fc8")

    def loadModel(self, sess, skipLayers=()):
        """Assign the weights stored in ``self.MODELPATH`` to the graph.

        Args:
            sess: Session used to run the assign ops.
            skipLayers: Layer names whose stored weights are not loaded.
                The default loads every layer found in the file.
        """
        # The file stores a pickled dict, which NumPy >= 1.16.3 refuses to
        # load unless allow_pickle=True.
        # NOTE(review): unpickling can execute arbitrary code; only load
        # weight files obtained from a trusted source.
        wDict = np.load(self.MODELPATH, encoding="bytes", allow_pickle=True).item()
        for name in wDict:
            if name in skipLayers:
                continue
            with tf.variable_scope(name, reuse=True):
                for p in wDict[name]:
                    # One-dimensional arrays are biases; the rest are weights.
                    varName = 'b' if len(p.shape) == 1 else 'w'
                    sess.run(tf.get_variable(varName, trainable=False).assign(p))
import os
import cv2
import caffe_classes
# AlexNet inference demo
if __name__ == '__main__':
    dropoutPro = 1
    classNum = 1000
    testPath = "testimage"
    # Read the test images; cv2.imread returns None for files it cannot
    # decode, so those entries are skipped instead of crashing later.
    testImg = []
    for f in os.listdir(testPath):
        loaded = cv2.imread(os.path.join(testPath, f))
        if loaded is not None:
            testImg.append(loaded)
    # np.float was removed from NumPy; np.float64 is the same type.
    imgMean = np.array([104, 117, 124], np.float64)
    x = tf.placeholder("float", [1, 227, 227, 3])
    # Build the AlexNet model.
    model = alexNet(x, dropoutPro, classNum)
    score = model.fc3
    print(score)
    softmax = tf.nn.softmax(score)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Load the pretrained weights.
        model.loadModel(sess)
        for img in testImg:
            # Resize to the network input size and subtract the mean.
            test = cv2.resize(img.astype(np.float64), (227, 227)) - imgMean
            # Add the batch dimension.
            test = test.reshape((1, 227, 227, 3))
            # Index of the most probable class.
            maxx = np.argmax(sess.run(softmax, feed_dict={x: test}))
            res = caffe_classes.class_names[maxx]
            print(res)
            font = cv2.FONT_HERSHEY_SIMPLEX
            # putText expects the origin as (x, y) = (column, row), while
            # img.shape is (rows, columns, channels).
            origin = (int(img.shape[1] / 3), int(img.shape[0] / 3))
            cv2.putText(img, res, origin, font, 1, (0, 0, 255), 2)
            # Show the annotated image and wait for a key press.
            cv2.imshow("test", img)
            cv2.waitKey(0)
可以看到斑馬zebra和鶴crane的測試結果:
AlexNet相關連線:
- 訓練好的檔案bvlc_alexnet.npy以及與網路對應的類別檔案caffe_classes.py下載連結