1. 程式人生 > >TensorFlow學習--AlexNet實現&影象識別

TensorFlow學習--AlexNet實現&影象識別

AlexNet主要技術點

AlexNet使用的主要技術:
1. 使用ReLU作為CNN的啟用函式,解決了Sigmoid在較深網路中的梯度彌散問題(vanishing gradient problem).
2. 訓練時使用Dropout隨機忽略一部分神經元,避免了模型的過擬合問題.
3. 在CNN中使用重疊的最大池化,避免了平均池化造成的模糊效果.同時讓步長小於池化核的尺寸,使池化層的輸出發生重疊和覆蓋,提升特徵的豐富性.
4. 提出了LRN層,對區域性神經元的活動建立競爭機制,強化響應比較大的神經元,抑制反饋較小的神經元,增強模型的泛化能力.
5. 資料增強.隨機地從256*256的原始影象中擷取224*224大小的區域以及水平翻轉影象(相當於增加了 $(256-224)^2 \times 2 = 2048$ 倍的資料量).進行預測時,取圖片的四個角加中間共5個位置並進行翻轉,即10個影象,對其進行預測並對10次結果求均值.
6. 使用CUDA加速深度卷積網路的訓練,利用GPU強大的並行運算能力,處理神經網路訓練時大量的矩陣運算.

AlexNet網路結構

AlexNet的網路結構:

這裡寫圖片描述
5個卷積層+3個全連線層

AlexNet每層的超引數如圖.
兩個GPU,一個GPU執行圖形頂部的圖層部分,另一個執行圖層底部的圖層部分。 GPU僅在某些層進行通訊。
輸入的圖片規格為224*224*3,預處理後為227*227*3.
第一個卷積層使用96個較大的11*11尺寸的卷積核,步長為4,(採用了2個GPU處理,每個GPU處理48個).原影象為RGB 影象,是3通道,此處96個過濾器也是3通道的.得到的特徵圖大小new_feature_size=(img_size - filter_size)/stride +1 = (227-11)/4+1=55即大小為55*55.緊接著一個LRN層,然後是一個3*3的Max pooling最大池化層,步長為2.

AlexNet耗時測試

使用隨機圖片資料測試AlexNet前饋/反饋的平均耗時:

#!/usr/bin/python
# coding:utf-8

# TensorFlow實現AlexNet

from datetime import datetime
import math
import time
import tensorflow as tf


def convLayer(x, name, kh, kw, n_out, dh, dw, p):
    """Build one convolution + bias + ReLU layer and record its parameters.

    Args:
        x: input tensor; its last dimension is read as the channel count.
        name: name scope under which the layer's ops are created.
        kh, kw: kernel height and width.
        n_out: number of kernels (output channels).
        dh, dw: stride along height and width.
        p: list to which this layer's kernel and biases are appended in place.

    Returns:
        (activation, p): the ReLU output tensor and the same list ``p``.
    """
    # Number of input channels.
    n_in = int(x.get_shape()[-1])
    with tf.name_scope(name) as scope:
        # Kernels of shape kh*kw*n_in, n_out of them, drawn from a truncated normal.
        kernel = tf.Variable(
            tf.truncated_normal([kh, kw, n_in, n_out], dtype=tf.float32, stddev=1e-1),
            name='weights')
        # Stride dh*dw; SAME padding pads the border.
        conv = tf.nn.conv2d(x, kernel, [1, dh, dw, 1], padding='SAME')
        # Biases start at zero.
        biases = tf.Variable(
            tf.constant(0.0, shape=[n_out], dtype=tf.float32),
            trainable=True, name='biases')
        bias = tf.nn.bias_add(conv, biases)
        activation = tf.nn.relu(bias, name=scope)
        # Collect the trainable parameters (kernel, biases) in p.
        p += [kernel, biases]
        # Print the structure of the activation tensor.
        print('%s   %s' % (activation.op.name, activation.get_shape().as_list()))
        return activation, p


# Number of timed batches per measurement.
num_batches = 100


def fcLayer(x, inputData, outputData, reluFlag, name):
    """Build a fully connected layer ``x * w + b``, optionally followed by ReLU.

    Args:
        x: input tensor, multiplied by a weight of shape [inputData, outputData].
        inputData: input width of the weight matrix.
        outputData: output width of the weight matrix and length of the bias.
        reluFlag: when truthy, ReLU is applied to the result.
        name: variable scope holding the variables 'w' and 'b'.

    Returns:
        The layer output tensor.
    """
    with tf.variable_scope(name) as scope:
        w = tf.get_variable('w', shape=[inputData, outputData], dtype='float')
        b = tf.get_variable('b', [outputData], dtype='float')
        out = tf.nn.xw_plus_b(x, w, b, name=scope.name)
        if reluFlag:
            return tf.nn.relu(out)
        else:
            return out


def AlexNet(images, classNum=None, dropoutrate=None):
    """Build the AlexNet graph on ``images``.

    Args:
        images: input image tensor.
        classNum: output width of the final fully connected layer (fc8).
        dropoutrate: passed as the second positional argument to
            ``tf.nn.dropout`` after fc6 and fc7.

    Returns:
        (pool5, parameters): the last pooling layer and the list of
        convolution kernels/biases collected by ``convLayer``.
    """
    parameters = []
    # Convolution layer 1.
    conv1, parameters = convLayer(images, name='conv1', kh=11, kw=11, n_out=64,
                                  dh=4, dw=4, p=parameters)
    # LRN followed by 3*3 max pooling with stride 2*2; VALID padding means
    # the pooling window never reaches past the border.
    lrn1 = tf.nn.lrn(conv1, 4, bias=1.0, alpha=0.001/9, beta=0.75, name='lrn1')
    pool1 = tf.nn.max_pool(lrn1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                           padding='VALID', name='pool1')
    print('%s   %s' % (pool1.op.name, pool1.get_shape().as_list()))

    # Convolution layer 2, then LRN and max pooling.
    conv2, parameters = convLayer(pool1, name='conv2', kh=5, kw=5, n_out=192,
                                  dh=1, dw=1, p=parameters)
    lrn2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001/9, beta=0.75, name='lrn2')
    pool2 = tf.nn.max_pool(lrn2, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                           padding='VALID', name='pool2')
    print('%s   %s' % (pool2.op.name, pool2.get_shape().as_list()))

    # Convolution layers 3-5.
    conv3, parameters = convLayer(pool2, name='conv3', kh=3, kw=3, n_out=384,
                                  dh=1, dw=1, p=parameters)
    conv4, parameters = convLayer(conv3, name='conv4', kh=3, kw=3, n_out=256,
                                  dh=1, dw=1, p=parameters)
    conv5, parameters = convLayer(conv4, name='conv5', kh=3, kw=3, n_out=256,
                                  dh=1, dw=1, p=parameters)
    pool5 = tf.nn.max_pool(conv5, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                           padding='VALID', name='pool5')
    print('%s   %s' % (pool5.op.name, pool5.get_shape().as_list()))

    # Fully connected head. It is built into the graph but not returned.
    fc_in = tf.reshape(pool5, [-1, 256*6*6])
    fc6 = fcLayer(fc_in, 256*6*6, 4096, True, 'fc6')
    dropout6 = tf.nn.dropout(fc6, dropoutrate)
    fc7 = fcLayer(dropout6, 4096, 4096, True, 'fc7')
    dropout7 = tf.nn.dropout(fc7, dropoutrate)
    # fc8 produces class logits, so no ReLU is applied to it.
    fc8 = fcLayer(dropout7, 4096, classNum, False, 'fc8')
    return pool5, parameters


def time_tensorflow_run(session, target, info_string):
    """Time repeated ``session.run(target)`` calls and print the statistics.

    Runs ``num_batches`` timed iterations after 10 untimed warm-up
    iterations, printing a progress line every 10th step and finally the
    mean and standard deviation of the per-batch duration.

    Args:
        session: object whose ``run(target)`` method is timed.
        target: the op (or list of ops) handed to ``session.run``.
        info_string: label printed in the summary line.
    """
    # Warm-up rounds excluded from the statistics (e.g. memory loading).
    num_steps_burn_in = 10
    total_duration = 0.0
    total_duration_squared = 0.0
    for i in range(num_batches + num_steps_burn_in):
        start_time = time.time()
        _ = session.run(target)
        duration = time.time() - start_time
        # Only iterations after the warm-up are counted.
        if i >= num_steps_burn_in:
            if not i % 10:
                print('%s: step %d, duration = %.3f' %
                      (datetime.now().strftime('%X'), i - num_steps_burn_in, duration))
            total_duration += duration
            total_duration_squared += duration * duration
    # Mean (mn) and standard deviation (sd) of the per-batch duration.
    mn = total_duration / num_batches
    vr = total_duration_squared / num_batches - mn * mn
    # Floating-point rounding can leave vr slightly below zero when all
    # durations are nearly equal; clamp so sqrt does not raise ValueError.
    sd = math.sqrt(max(vr, 0.0))
    print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' %
          (datetime.now().strftime('%X'), info_string, num_batches, mn, sd))


def run_benchmark():
    """Time AlexNet forward and forward-backward passes on random images."""
    with tf.Graph().as_default():
        batch_size = 32
        image_size = 224
        # Random image data: [batch, height, width, channels].
        images = tf.Variable(tf.random_normal([batch_size,
                                               image_size, image_size,
                                               3],
                                              dtype=tf.float32,
                                              stddev=1e-1))
        # Build AlexNet; get pool5 and the trainable parameter list.
        pool5, parameters = AlexNet(images, classNum=1000, dropoutrate=0.5)
        init = tf.global_variables_initializer()
        # The context manager closes the session when the benchmark ends.
        with tf.Session() as sess:
            sess.run(init)
            # Forward pass timing.
            time_tensorflow_run(sess, pool5, 'Forward')
            # Use the L2 loss of pool5 as a stand-in objective and take its
            # gradients w.r.t. all parameters to imitate a training step.
            objective = tf.nn.l2_loss(pool5)
            grad = tf.gradients(objective, parameters)
            # Forward + backward pass timing.
            time_tensorflow_run(sess, grad, 'Forward-backward')


if __name__ == '__main__':
    run_benchmark()

列印輸出:

conv1   [32, 56, 56, 64]
pool1   [32, 27, 27, 64]
conv2   [32, 27, 27, 192]
pool2   [32, 13, 13, 192]
conv3   [32, 13, 13, 384]
conv4   [32, 13, 13, 256]
conv5   [32, 13, 13, 256]
pool5   [32, 6, 6, 256]

19:43:26: step 0, duration = 1.526
19:43:43: step 10, duration = 2.018
19:44:03: step 20, duration = 1.618
19:44:19: step 30, duration = 1.583
19:44:37: step 40, duration = 1.808
19:44:56: step 50, duration = 1.749
19:45:13: step 60, duration = 1.849
19:45:32: step 70, duration = 1.837
19:45:49: step 80, duration = 1.587
19:46:06: step 90, duration = 1.663
19:46:23: Forward across 100 steps, 1.789 +/- 0.210 sec / batch
19:47:30: step 0, duration = 5.831
19:48:34: step 10, duration = 5.831
19:49:49: step 20, duration = 8.383
19:50:57: step 30, duration = 6.152
19:52:48: step 40, duration = 13.673
19:54:44: step 50, duration = 10.054
19:56:32: step 60, duration = 11.055
19:58:17: step 70, duration = 10.246
20:00:06: step 80, duration = 12.227
20:02:01: step 90, duration = 10.946
20:03:31: Forward-backward across 100 steps, 9.666 +/- 2.279 sec / batch

可以看到5個卷積層以及最後一個池化層,以及每一層輸出tensor的尺寸.
然後還可以看到forward以及backward運算的時間,此處沒有使用GPU,因此可以看到每輪迭代的時間消耗比較大.

AlexNet實現及影象識別

# AlexNet實現
import tensorflow as tf
import numpy as np


# 卷積層
# group=2時等於AlexNet分上下兩部分
def convLayer(x, kHeight, kWidth, strideX, strideY, featureNum, name, padding="SAME", groups=1):
    """Build a (optionally grouped) convolution + bias + ReLU layer.

    The input and the weights are each split into ``groups`` pieces along
    axis 3, convolved pairwise, and the results concatenated along axis 3.

    Args:
        x: input tensor; its last dimension is read as the channel count.
        kHeight, kWidth: kernel height and width.
        strideX, strideY: strides, used as ``[1, strideY, strideX, 1]``.
        featureNum: number of output feature maps.
        name: variable scope holding the variables "w" and "b".
        padding: padding mode passed to ``tf.nn.conv2d``.
        groups: number of groups to split input and weights into.

    Returns:
        The ReLU-activated output tensor.

    Raises:
        ValueError: if ``groups`` does not divide both the input channel
            count and ``featureNum``.
    """
    # Number of input channels.
    channel = int(x.get_shape()[-1])
    if channel % groups or featureNum % groups:
        raise ValueError(
            "groups=%d must divide both the input channels (%d) and featureNum (%d)"
            % (groups, channel, featureNum))

    def conv(inputs, kernel):
        return tf.nn.conv2d(inputs, kernel, strides=[1, strideY, strideX, 1], padding=padding)

    with tf.variable_scope(name) as scope:
        # Integer division: "/" would give a float shape under Python 3.
        w = tf.get_variable("w", shape=[kHeight, kWidth, channel // groups, featureNum])
        b = tf.get_variable("b", shape=[featureNum])
        # Split the input and the weights into per-group sub-tensors.
        xNew = tf.split(value=x, num_or_size_splits=groups, axis=3)
        wNew = tf.split(value=w, num_or_size_splits=groups, axis=3)
        # Convolve each group separately, then merge the feature maps.
        featureMap = [conv(t1, t2) for t1, t2 in zip(xNew, wNew)]
        mergeFeatureMap = tf.concat(axis=3, values=featureMap)
        # bias_add keeps the shape of its input, so no reshape is needed;
        # reshaping to the static shape would fail if a dimension is unknown.
        out = tf.nn.bias_add(mergeFeatureMap, b)
        return tf.nn.relu(out, name=scope.name)

# 全連線層
def fcLayer(x, inputD, outputD, reluFlag, name):
    """Build a fully connected layer ``x * w + b`` with an optional ReLU.

    The variables "w" (shape [inputD, outputD]) and "b" (shape [outputD])
    are obtained via ``tf.get_variable`` inside the variable scope ``name``.

    Args:
        x: input tensor.
        inputD: input width of the weight matrix.
        outputD: output width of the weight matrix and length of the bias.
        reluFlag: when truthy, ReLU is applied to the affine result.
        name: variable scope name for the layer.

    Returns:
        The affine result, passed through ReLU when ``reluFlag`` is truthy.
    """
    with tf.variable_scope(name) as layerScope:
        weights = tf.get_variable("w", shape=[inputD, outputD], dtype="float")
        biases = tf.get_variable("b", [outputD], dtype="float")
        affine = tf.nn.xw_plus_b(x, weights, biases, name=layerScope.name)
        return tf.nn.relu(affine) if reluFlag else affine

# alexNet模型
class alexNet(object):
    """AlexNet graph whose variables can be filled from a saved .npy file.

    Constructing an instance builds the whole network on ``x``; the output
    of the last fully connected layer (fc8) is exposed as ``self.fc3``.
    """

    def __init__(self, x, keepPro, classNum, modelPath="bvlc_alexnet.npy"):
        """Store the configuration and build the graph.

        Args:
            x: input image tensor.
            keepPro: passed as the second positional argument to
                ``tf.nn.dropout`` after fc6 and fc7.
            classNum: output width of the final layer (fc8).
            modelPath: path of the .npy file read by ``loadModel``.
        """
        self.X = x
        self.KEEPPRO = keepPro
        self.CLASSNUM = classNum
        self.MODELPATH = modelPath
        self.buildCNN()

    def buildCNN(self):
        """Create all layers and set ``self.fc3`` to the fc8 output."""
        # Convolution layer 1.
        conv1 = convLayer(self.X, 11, 11, 4, 4, 96, "conv1", "VALID")
        # Max pooling, 3*3 window, stride 2*2.
        pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='VALID', name='pool1')
        lrn1 = tf.nn.lrn(pool1, depth_radius=2, alpha=2e-05, beta=0.75, bias=1.0, name='norm1')
        # Convolution layer 2 (two groups).
        conv2 = convLayer(lrn1, 5, 5, 1, 1, 256, "conv2", groups=2)
        # Max pooling, 3*3 window, stride 2*2.
        pool2 = tf.nn.max_pool(conv2, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='VALID', name='pool2')
        lrn2 = tf.nn.lrn(pool2, depth_radius=2, alpha=2e-05, beta=0.75, bias=1.0, name='lrn2')
        # Convolution layer 3.
        conv3 = convLayer(lrn2, 3, 3, 1, 1, 384, "conv3")
        # Convolution layer 4 (two groups).
        conv4 = convLayer(conv3, 3, 3, 1, 1, 384, "conv4", groups=2)
        # Convolution layer 5 (two groups).
        conv5 = convLayer(conv4, 3, 3, 1, 1, 256, "conv5", groups=2)
        # Max pooling, 3*3 window, stride 2*2.
        pool5 = tf.nn.max_pool(conv5, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='VALID', name='pool5')

        # Fully connected layer 1.
        fcIn = tf.reshape(pool5, [-1, 256 * 6 * 6])
        fc1 = fcLayer(fcIn, 256 * 6 * 6, 4096, True, "fc6")
        dropout1 = tf.nn.dropout(fc1, self.KEEPPRO)
        # Fully connected layer 2.
        fc2 = fcLayer(dropout1, 4096, 4096, True, "fc7")
        dropout2 = tf.nn.dropout(fc2, self.KEEPPRO)
        # Fully connected layer 3: class logits, so no ReLU is applied
        # (ReLU would clip every negative score to zero before softmax).
        self.fc3 = fcLayer(dropout2, 4096, self.CLASSNUM, False, "fc8")

    def loadModel(self, sess, skipLayers=()):
        """Assign the arrays stored in ``self.MODELPATH`` to the graph variables.

        The file is expected to hold a dict mapping a layer name to a
        sequence of arrays; within the variable scope of that name,
        one-dimensional arrays are assigned to "b" and all others to "w".

        Args:
            sess: session used to run the assign ops.
            skipLayers: layer names to leave untouched (default: none).
        """
        # The file stores a pickled dict, which NumPy >= 1.16.3 refuses to
        # load unless allow_pickle=True is given.
        # NOTE(security): unpickling can execute arbitrary code; only load
        # model files from a trusted source.
        wDict = np.load(self.MODELPATH, encoding="bytes", allow_pickle=True).item()
        # Layers stored in the model file.
        for name in wDict:
            if name in skipLayers:
                continue
            with tf.variable_scope(name, reuse=True):
                for p in wDict[name]:
                    # A bias has a single dimension; everything else is a weight.
                    varName = 'b' if len(p.shape) == 1 else 'w'
                    sess.run(tf.get_variable(varName, trainable=False).assign(p))


import os
import cv2
import caffe_classes


# AlexNet測試
if __name__=='__main__':
    # Classify every image in testPath with the pretrained AlexNet and show
    # each image with the predicted class name drawn on it.
    dropoutPro = 1
    classNum = 1000
    testPath = "testimage"
    # Read the test images.
    testImg = []
    for f in os.listdir(testPath):
        img = cv2.imread(os.path.join(testPath, f))
        # cv2.imread returns None for files it cannot decode; skip them
        # instead of crashing later on img.astype.
        if img is None:
            print("skip unreadable file: %s" % f)
            continue
        testImg.append(img)

    # np.float was removed in NumPy 1.24; it was an alias of the builtin
    # float, i.e. float64.
    imgMean = np.array([104, 117, 124], np.float64)
    x = tf.placeholder("float", [1, 227, 227, 3])
    # AlexNet model.
    model = alexNet(x, dropoutPro, classNum)
    score = model.fc3
    print(score)
    softmax = tf.nn.softmax(score)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Load the pretrained weights.
        model.loadModel(sess)
        for i, img in enumerate(testImg):
            # Resize to the network input size and subtract the mean.
            test = cv2.resize(img.astype(np.float64), (227, 227)) - imgMean
            # Add the batch dimension.
            test = test.reshape((1, 227, 227, 3))
            # Index of the most probable class.
            maxx = np.argmax(sess.run(softmax, feed_dict={x: test}))
            # Name of the most probable class.
            res = caffe_classes.class_names[maxx]
            print(res)
            # Font for the label.
            font = cv2.FONT_HERSHEY_SIMPLEX
            # putText expects the origin as (x, y) = (column, row), so x
            # comes from the width (shape[1]) and y from the height (shape[0]).
            cv2.putText(img, res, (int(img.shape[1] / 3), int(img.shape[0] / 3)), font, 1, (0, 0, 255), 2)
            # Show the result and wait for a key press.
            cv2.imshow("test", img)
            cv2.waitKey(0)

可以看到斑馬zebra和鶴crane的測試結果:

這裡寫圖片描述

這裡寫圖片描述

AlexNet相關連線:

  1. 訓練好的檔案bvlc_alexnet.npy以及與網路對應的類別檔案caffe_classes.py下載連結