
YOLOv3 parameters: explanations and annotations

Corrections are welcome for anything below that is wrong.

[net]
# Testing
#batch=1
#subdivisions=1
# Training
batch=64   				number of training samples per batch; the weights are updated once per batch of samples
subdivisions=64				batch/subdivisions is the number of samples fed to the trainer at one time;
					if memory is too small, the batch is split into subdivisions sub-batches

					if your machine has little memory, lower batch; a larger batch generally trains better,
					and a larger subdivisions value eases the load on the GPU
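
A quick illustration (not part of the cfg) of how batch and subdivisions relate to the number of images processed per forward pass; the variable names are made up for this sketch:

    batch = 64
    subdivisions = 64
    mini_batch = batch // subdivisions   # images sent through the network at once
    print(mini_batch)                    # 1 image per forward pass with this cfg
    print(batch // 16)                   # with subdivisions=16, 4 images per forward pass
    # the weights are still updated once per `batch` images, after accumulating
    # gradients over `subdivisions` forward/backward passes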

width=416			
height=416
channels=3
					the three parameters above describe the network input; width and height set the resolution
					the network works at and therefore affect precision, and may only be set to multiples of 32
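
A tiny helper, purely illustrative, for snapping a desired input size to a valid multiple of 32:

    def snap_to_32(value):
        # width/height in [net] must be divisible by 32 (the network downsamples by 32 overall)
        return max(32, int(round(value / 32)) * 32)

    print(snap_to_32(400))   # 416
    print(snap_to_32(620))   # 608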

momentum=0.9				momentum term of the gradient-descent optimizer; it affects how quickly
					gradient descent moves towards the optimum
decay=0.0005				weight-decay regularization term, used to prevent overfitting
angle=0					generate extra training samples by rotating images
saturation = 1.5			generate extra training samples by adjusting saturation
exposure = 1.5				generate extra training samples by adjusting exposure
hue=.1					generate extra training samples by adjusting hue

learning_rate=0.001			the learning rate controls how fast the weights are updated: too large and the result
					overshoots the optimum, too small and training converges too slowly.
					If you tune purely by hand you have to keep changing it: start training with a relatively
					high learning rate and lower it after a certain number of iterations.
					In practice the learning rate is scheduled dynamically based on the iteration count:
					at the start of training, 0.01 ~ 0.001 is a reasonable range;
					after a number of iterations, reduce it gradually;
					near the end of training it should have decayed by a factor of 100 or more.
					For learning-rate tuning see https://blog.csdn.net/qq_33485434/article/details/80452941

burn_in=1000				while the iteration count is below burn_in the learning rate is ramped up with its own rule;
					only after burn_in does the schedule defined by policy take over
max_batches = 500200		        training stops once max_batches iterations have been reached
policy=steps				learning-rate schedule; available policies are constant, steps, exp, poly, step, sig, random
					see https://nanfei.ink/2018/01/23/YOLOv2%E8%B0%83%E5%8F%82%E6%80%BB%E7%BB%93/#more
steps=40000,45000			steps and scales together define how the learning rate changes: at iteration 40000 it is
scales=.1,.1				multiplied by 0.1, and at iteration 45000 it is multiplied by 0.1 again
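
A minimal sketch of how burn_in, policy=steps, steps and scales combine into the learning rate used at a given iteration; the warm-up exponent power=4 matches Darknet's default, but the function below is an illustration, not the actual C implementation:

    def effective_lr(iteration, base_lr=0.001, burn_in=1000, power=4,
                     steps=(40000, 45000), scales=(0.1, 0.1)):
        # warm-up: ramp the learning rate up smoothly until burn_in iterations
        if iteration < burn_in:
            return base_lr * (iteration / burn_in) ** power
        # steps policy: multiply by the matching scale each time a step boundary is passed
        lr = base_lr
        for step, scale in zip(steps, scales):
            if iteration >= step:
                lr *= scale
        return lr

    print(effective_lr(500))      # 6.25e-05, still warming up
    print(effective_lr(20000))    # 0.001
    print(effective_lr(42000))    # 0.0001 after the first step
    print(effective_lr(46000))    # about 1e-05 after both steps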

[convolutional]
batch_normalize=1 			whether to apply batch normalization to this layer (1 = yes, 0 = no)
filters=32			        number of output feature maps
size=3					size of the convolution kernel
stride=1				stride of the convolution
pad=1					if pad is 0, the amount of padding is taken from the padding parameter;
					if pad is 1, the padding is size/2, i.e. the number of pixels of zero padding added
					around the borders of the input (see the sketch after this block)
activation=leaky			type of activation function
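
A minimal sketch, assuming pad=1 means padding = size // 2, of how a [convolutional] section maps input height/width to output height/width; this is the standard convolution output-size rule, not code lifted from Darknet:

    def conv_output_hw(h, w, size=3, stride=1, pad=1, padding=0):
        p = size // 2 if pad else padding
        out_h = (h + 2 * p - size) // stride + 1
        out_w = (w + 2 * p - size) // stride + 1
        return out_h, out_w

    print(conv_output_hw(416, 416, size=3, stride=1, pad=1))  # (416, 416): resolution preserved
    print(conv_output_hw(416, 416, size=3, stride=2, pad=1))  # (208, 208): the "# Downsample" blocks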

# Downsample

[convolutional]
batch_normalize=1
filters=64
size=3
stride=2
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=32
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

# Downsample

[convolutional]
batch_normalize=1
filters=128
size=3
stride=2
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

# Downsample

[convolutional]
batch_normalize=1
filters=256
size=3
stride=2
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear


[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

# Downsample

[convolutional]
batch_normalize=1
filters=512
size=3
stride=2
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear


[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear


[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear


[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear


[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear


[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

# Downsample

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=2
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

######################

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky

[convolutional]
size=1
stride=1
pad=1
filters=18				for the last convolutional layer before each [yolo]/[region] layer,
					filters = (number of anchors used by that layer, i.e. the length of its mask) * (classes + 5);
					the 5 stands for the 5 box terms tx, ty, tw, th, to from the paper (see the check after this block)
activation=linear
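
A small sanity check, purely illustrative, for the filters value of the 1x1 convolution feeding a [yolo] layer: one box per anchor in that layer's mask, and (tx, ty, tw, th, to) plus the class scores per box:

    def yolo_conv_filters(classes, anchors_per_scale=3):
        return anchors_per_scale * (classes + 5)

    print(yolo_conv_filters(classes=1))    # 18, as in this cfg (classes=1)
    print(yolo_conv_filters(classes=80))   # 255, as in the stock COCO yolov3.cfg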


[yolo] 					the [yolo] layer was called [region] in YOLOv2
mask = 6,7,8
anchors = 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
					anchors are prior boxes that can be computed in advance with a darknet command; they depend
					on the training images, width, height and the number of clusters (which should equal num below,
					i.e. the number of anchors you want to use); they can be picked by hand or learned from the
					training samples with k-means (see the k-means sketch after this block)

classes=1
num=9
jitter=.3
ignore_thresh = .5
truth_thresh = 1
random=1				setting random to 1 enables multi-scale training, which can improve detection precision
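
A minimal sketch of estimating anchors from labeled box sizes with k-means. The YOLOv2 paper clusters with a 1-IoU distance; the sketch below uses plain Euclidean distance on (w, h) for brevity, so it is only an illustration of the idea, not the darknet calc_anchors implementation:

    import random

    def kmeans_anchors(wh_pairs, k=9, iters=100):
        # wh_pairs: (width, height) of labeled boxes, in pixels at the network resolution
        centroids = random.sample(wh_pairs, k)
        for _ in range(iters):
            clusters = [[] for _ in range(k)]
            for w, h in wh_pairs:
                # assign each box to the nearest centroid
                j = min(range(k), key=lambda i: (w - centroids[i][0]) ** 2 + (h - centroids[i][1]) ** 2)
                clusters[j].append((w, h))
            for i, c in enumerate(clusters):
                if c:  # recompute each centroid as the mean of its cluster
                    centroids[i] = (sum(w for w, _ in c) / len(c), sum(h for _, h in c) / len(c))
        return sorted(centroids)

    boxes = [(random.uniform(10, 390), random.uniform(10, 390)) for _ in range(500)]
    print([(round(w), round(h)) for w, h in kmeans_anchors(boxes)])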


[route]
layers = -4

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[upsample]
stride=2

[route]
layers = -1, 61



[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky

[convolutional]
size=1
stride=1
pad=1
filters=18
activation=linear


[yolo]
mask = 3,4,5
anchors = 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
classes=1
num=9	  				total number of anchors; it must match the number of anchor pairs listed in anchors.
					If you want to use more anchors, increase num; and if Obj tends towards 0 during training
					after increasing num, try increasing object_scale
jitter=.3			        augment the data by jittering it; YOLOv2 uses crop and flip, plus the angle from the
					net section; the flip is random, and jitter is the crop parameter, so jitter=.3 means
					the crop amount is drawn from 0~0.3
ignore_thresh = .5			threshold that decides whether the IoU error is counted: above ignore_thresh, the IoU
					error is not added to the cost function
truth_thresh = 1
random=1				if 1, every few iterations the input size is chosen at random between 320 and 608 in
					steps of 32; if 0, training always uses the configured input size
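
A minimal sketch, illustration only, of what random=1 implies: every few batches the training resolution is redrawn from 320..608 in steps of 32:

    import random

    def pick_training_resolution(low=320, high=608, step=32):
        return random.randrange(low, high + 1, step)

    for _ in range(5):
        s = pick_training_resolution()
        print(f"train this group of batches at {s}x{s}")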



[route]
layers = -4

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[upsample]
stride=2

[route]
layers = -1, 36



[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=256
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=256
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=256
activation=leaky

[convolutional]
size=1
stride=1
pad=1
filters=18
activation=linear


[yolo]
mask = 0,1,2
anchors = 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
classes=1
num=9
jitter=.3
ignore_thresh = .5
truth_thresh = 1
random=1	

Key points from the official (AlexeyAB darknet) README:
    Training images can be collected yourself or taken from existing datasets; the COCO and VOC datasets may already contain usable samples

    Note: If during training you see nan values for avg (loss) field - then training goes wrong, 
    but if nan is in some other lines - then training goes well.

    When should I stop training:
    When you see that average loss 0.xxxxxx avg no longer decreases at many iterations then you should stop training.
    Once training is stopped, you should take some of last .weights-files from darknet\build\darknet\x64\backup and choose the best of them.

    Overfitting - is the case when you can detect objects on images from the training dataset, 
    but can't detect objects on any other images. You should get weights from the Early Stopping Point.
	
    IoU (intersection over union) - average intersection over union of objects and detections for a certain threshold = 0.24
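
    A minimal IoU sketch for axis-aligned boxes given as (x1, y1, x2, y2); illustration only, not darknet's code:

    def iou(a, b):
        ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
        ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
        inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
        area_a = (a[2] - a[0]) * (a[3] - a[1])
        area_b = (b[2] - b[0]) * (b[3] - b[1])
        return inter / (area_a + area_b - inter)

    print(iou((0, 0, 10, 10), (5, 5, 15, 15)))  # 25 / 175, roughly 0.14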

    How to improve object detection:
    Before training:
    set flag random=1 in your .cfg-file - it will increase precision by training Yolo for different resolutions.
    increase network resolution in your .cfg-file (height=608, width=608 or any value multiple of 32) - it will increase precision.
    recalculate anchors for your dataset for width and height from cfg-file: 
    darknet.exe detector calc_anchors data/obj.data -num_of_clusters 9 -width 416 -height 416 then set the same 9 anchors in each of 3 [yolo]-layers in your cfg-file
    i.e. recompute the anchors for your own dataset and set them in the cfg
	
    desirable that your training dataset include images with objects at different: 
    scales, rotations, lightings, from different sides, on different backgrounds
    make the training samples as diverse as possible: brightness, rotation, background, object position, scale

    desirable that your training dataset include images with non-labeled objects that you do not want to detect - negative samples without bounded box (empty .txt files)
    you can add images without any labeled boxes, together with their empty .txt files, as negative data
	
    for training with a large number of objects in each image, add the parameter max=200 or higher value in the last layer [region] in your cfg-file

    to speedup training (with decreasing detection accuracy) do Fine-Tuning instead of Transfer-Learning, 
    set param stopbackward=1 in one of the penultimate convolutional layers before the 1-st [yolo]-layer, 
    for example here: https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L598
    i.e. you can add stopbackward=1 at the end of one of the last [convolutional] layers before the first [yolo] layer to speed up training

    After training - for detection:
    Increase network-resolution by set in your .cfg-file (height=608 and width=608) or (height=832 and width=832) 
    or (any value multiple of 32) - this increases the precision and makes it possible to detect small objects,
    you do not need to train the network again, just use .weights-file already trained for 416x416 resolution.
    even after training at 416x416 you can set a larger width and height in the cfg to raise the network resolution; this makes small objects easier to detect and does not require retraining

    if error Out of memory occurs then in .cfg-file you should increase subdivisions=16, 32 or 64
    an Out of memory error is handled by increasing subdivisions

Reference article: YOLO parameters

[net]
batch=64			number of images per batch; within GPU memory limits, a larger batch usually converges
				more smoothly, although it may also converge to a local optimum
subdivisions=4			number of sub-batches the batch is split into
width=416
height=416			network input size
channels=3			number of image channels
momentum=0.9			plain SGD updates depend entirely on the gradient of the current batch, which makes them
				unstable. Momentum borrows the idea of inertia from physics: each update keeps part of the
				previous update direction and uses the current batch gradient only to fine-tune it. This adds
				stability, speeds up learning, and gives some ability to escape local optima:

				v_t = γ · v_{t-1} + α · ∇_Θ J(Θ)
				Θ = Θ - v_t

				Momentum looks at the accumulated gradient v_{t-1}: if the current gradient points in the same
				direction (the current sample is unlikely to be an outlier), that direction is reinforced; if it
				points the other way, the update is damped. A common picture: a ball rolling downhill gathers
				momentum and speeds up, with γ acting like air resistance; if the ball changes direction, the
				momentum decays. The momentum coefficient is the parameter we tune, typically 0.5, 0.9 or 0.99,
				and α can also be scheduled over time, starting small and growing later.
				In short: when consecutive gradients agree, learning accelerates; when they disagree, oscillation
				is suppressed.
decay=0.0005			in practice, to avoid overfitting, a regularization term is added to the cost function; with
				L2 weight decay the regularized cost is C'(w) = C(w) + (λ/2)·||w||². The basic idea is to shrink
				unimportant weights so they influence the result less, while genuinely useful weights are hardly
				affected. The larger this term, the stronger the protection against overfitting.
angle=0				rotation angle used for data augmentation
saturation = 1.5		saturation range for augmentation
exposure = 1.5			exposure range for augmentation
hue=.1				hue range for augmentation
learning_rate=0.001		initial learning rate
max_batches = 200000		training stops once max_batches is reached
policy=steps			learning-rate policy, one of: constant, steps, exp, poly, step, sig, random
				constant: keep the learning rate constant (called "fixed" in Caffe)
				steps: change the learning rate at the listed steps, for example
				    steps=-1,400,100000,150000
				    scales=.1,10,.1,.1
				where steps and scales correspond one-to-one. Some people find that training on your own dataset
				very often benefits from a warm-up trick, although this depends on the dataset.
				Warm-up methods: constant warm-up uses a small constant learning rate for the first few epochs
				(typically 5), but it does not initialize the network well when the target learning rate is large;
				gradual warm-up raises the learning rate from small to large over the first few epochs so training
				converges healthily at the start. Key reference: https://arxiv.org/abs/1706.02677
				exp: with gamma set (e.g. 0.98), returns base_lr * gamma^iter, where iter is the current iteration
				poly: polynomial decay of the learning rate, e.g. power=4, max_batches=800000
				sig: sigmoid decay of the learning rate, e.g. gamma=0.05, step=200
				step: returns net.learning_rate * pow(net.scale, batch_num / net.step)
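
A minimal sketch of one SGD step with momentum and weight decay as described above; an illustration under these assumptions, not Darknet's implementation:

    import numpy as np

    def sgd_momentum_step(w, v, grad, lr=0.001, momentum=0.9, decay=0.0005):
        g = grad + decay * w          # weight decay acts like adding decay * w to the gradient
        v = momentum * v - lr * g     # keep part of the previous direction, fine-tune with the current gradient
        return w + v, v

    w = np.array([0.5, -1.2])
    v = np.zeros_like(w)
    for step in range(3):
        grad = 2 * w                  # pretend the loss is ||w||^2, so the gradient is 2w
        w, v = sgd_momentum_step(w, v, grad)
        print(step, w)                # w shrinks toward the minimum at 0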

[convolutional]
batch_normalize=1		whether to apply batch normalization
filters=32			number of output feature maps
size=3				kernel size
stride=1			convolution stride
pad=1				if pad is 0, the padding comes from the padding parameter; if pad is 1, the padding is size/2
activation=leaky		available activations: logistic, loggy, relu, elu, relie, plse, hardtan, lhtan, linear, ramp,
				leaky, tanh, stair; leaky is the most common choice nowadays

[maxpool]
size=2				pooling window size
stride=2			pooling stride

Special case: the last convolutional layer before [region]
[convolutional]
size=1
stride=1
pad=1
filters=125			the filters value of the last convolution before the [region] layer is fixed:
				filters = num * (classes + 5); the 5 stands for the 5 box terms tx, ty, tw, th, to from the paper
activation=linear

[region]
anchors = 1.08,1.19,  3.42,4.41,  6.63,11.38,  9.42,5.11,  16.62,10.52
				initial widths and heights of the predicted boxes (w first, then h); the total count is num*2;
				a clustering script for computing them is available on GitHub
bias_match=1			only used to pick the anchor whose width and height are closest to the ground-truth box;
				width and height offsets are then regressed on top of that anchor, so the final predicted box
				does not have to match the anchor exactly
classes=20			number of classes
coords=4			the bounding-box terms tx, ty, tw, th; tx and ty are offsets relative to the top-left corner of
				the current grid cell, expressed as a fraction of the cell, while tw and th are the log of the
				width and height
num=5				number of bounding boxes predicted per grid cell
softmax=1
jitter=.2			augment the data by jittering it; YOLOv2 uses crop and flip plus the angle from the net section;
				the flip is random and jitter is the crop parameter, so jitter=.2 means the crop is drawn from 0~0.2
rescore=1			chooses how the IoU error is computed: if 1, the current best IoU is used; if 0, the constant 1 is used
object_scale=5
noobject_scale=1
class_scale=1
coord_scale=1			weights of the individual loss terms
absolute=1
thresh = .6			threshold that decides whether the IoU error is counted: above thresh, the IoU error is not
				added to the cost function
random=1			if 1, the input size is chosen at random between 320 and 608 in steps of 32 each iteration;
				if 0, training always uses the configured input size