
Deep learning toolbox DeepLearnToolbox-master: DBN code analysis

This post walks through the DBN example in the MATLAB deep learning toolbox DeepLearnToolbox, adding brief comments to every function involved so the code is easier to follow; the same annotations can later be reused on other data sets. Some of the comments are deliberately basic.

The function below is the DBN example itself: it prepares the data, sets the DBN options, and runs the two experiments.

test_example_DBN calls dbnsetup (builds the DBN), dbntrain (trains the DBN), dbnunfoldtonn (unfolds the DBN into an NN), nntrain and nntest. Each of these is shown in turn below.

%https://github.com/rasmusbergpalm/DeepLearnToolbox
function test_example_DBN
load mnist_uint8;

train_x = double(train_x) / 255;
test_x  = double(test_x)  / 255;
train_y = double(train_y);
test_y  = double(test_y);

%%  ex1 train a 100 hidden unit RBM and visualize its weights
%the hidden layer has 100 units; the visible-layer size equals the input dimensionality and is taken from the data
rand('state',0)
dbn.sizes = [100];
opts.numepochs =   1; %number of epochs, i.e. full passes over the training set
%with the same data set, reported results are roughly 11.41% error after 1 epoch, 4.2% after 5, 2.73% after 10
opts.batchsize = 100;  %mini-batch size: the weights are updated once per batchsize samples, not once per pass over the whole training set
opts.momentum  =   0; %momentum
opts.alpha     =   1; %learning rate
dbn = dbnsetup(dbn, train_x, opts);   %build the DBN and return it
dbn = dbntrain(dbn, train_x, opts);   %train the DBN on the training samples
figure; visualize(dbn.rbm{1}.W');     %  Visualize the RBM weights

%%  ex2 train a 100-100 hidden unit DBN and use its weights to initialize a NN
rand('state',0)
%train dbn
dbn.sizes = [100 100];
opts.numepochs = 1;
opts.batchsize = 100;
opts.momentum  = 0;
opts.alpha     = 1;
dbn = dbnsetup(dbn, train_x, opts);
dbn = dbntrain(dbn, train_x, opts);

%unfold dbn to nn
nn = dbnunfoldtonn(dbn, 10);       %10 is the number of output-layer nodes
nn.activation_function = 'sigm';   %nnsetup sets a default activation function; it is overridden here for this application

%train nn
opts.numepochs = 1;
opts.batchsize = 100;
%the final fine-tuning step is just ordinary supervised NN training
nn = nntrain(nn, train_x, train_y, opts);
[er, bad] = nntest(nn, test_x, test_y);   %evaluate on the test set
assert(er < 0.10, 'Too big error');
dbnsetup.m mainly assigns initial values to each RBM; it calls no other functions.
function dbn = dbnsetup(dbn, x, opts)  %build the DBN
    n = size(x, 2);   %number of columns = input dimensionality = number of visible units
    dbn.sizes = [n, dbn.sizes];  %visible-layer size followed by the hidden-layer sizes

	%numel(A) returns the number of elements in array A
    for u = 1 : numel(dbn.sizes) - 1   %number of RBMs; for ex1 dbn.sizes = [784,100], numel(...) = 2, so there is one RBM
	%in general dbn.sizes is [visible units of rbm1, hidden units of rbm1, hidden units of rbm2, hidden units of rbm3, ...]:
	%each RBM's visible layer is the previous RBM's hidden layer, so it is not listed again,
	%which fixes the number of RBMs at numel(dbn.sizes) - 1. Below, every RBM gets its parameters.
        dbn.rbm{u}.alpha    = opts.alpha;  %learning rate
        dbn.rbm{u}.momentum = opts.momentum;  %momentum

		
        dbn.rbm{u}.W  = zeros(dbn.sizes(u + 1), dbn.sizes(u));%weights, size (hidden units, visible units)
        dbn.rbm{u}.vW = zeros(dbn.sizes(u + 1), dbn.sizes(u));%weight momentum term, same size

		%biases
        dbn.rbm{u}.b  = zeros(dbn.sizes(u), 1);%visible bias, one entry per visible unit
        dbn.rbm{u}.vb = zeros(dbn.sizes(u), 1);

        dbn.rbm{u}.c  = zeros(dbn.sizes(u + 1), 1);%hidden bias, one entry per hidden unit; dbn.sizes(u + 1) is the hidden-layer size
        dbn.rbm{u}.vc = zeros(dbn.sizes(u + 1), 1);
    end

end
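To make the bookkeeping concrete, here is a minimal shape check (a sketch, not part of the toolbox; it assumes DeepLearnToolbox is on the path, and the names x_demo, dbn_demo and opts_demo are made up for illustration):

x_demo = rand(10, 784);          % 10 dummy samples with 784 features, as in mnist_uint8
dbn_demo.sizes = [100 100];      % two hidden layers of 100 units each
opts_demo.alpha = 1; opts_demo.momentum = 0;
dbn_demo = dbnsetup(dbn_demo, x_demo, opts_demo);
disp(dbn_demo.sizes)             % [784 100 100], i.e. numel(...)-1 = 2 RBMs
disp(size(dbn_demo.rbm{1}.W))    % 100 784  (hidden x visible)
disp(size(dbn_demo.rbm{2}.W))    % 100 100  (the second RBM's visible layer is the first RBM's hidden layer)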
dbntrain.m trains the DBN by training each RBM in turn; it calls rbmtrain and rbmup.
function dbn = dbntrain(dbn, x, opts)
    n = numel(dbn.rbm);  %number of RBMs

    dbn.rbm{1} = rbmtrain(dbn.rbm{1}, x, opts);  %train the first RBM
	%first argument: the RBM structure; second: the training data; third: the training options
    for i = 2 : n
        x = rbmup(dbn.rbm{i - 1}, x);%pass the data between RBMs: the previous RBM's output becomes the next RBM's input
        dbn.rbm{i} = rbmtrain(dbn.rbm{i}, x, opts); %then train the next RBM
    end

end
The two functions it calls are shown next.

rbmtrain.m trains a single RBM, updating the parameters W, vW, c, vc, b and vb.

function rbm = rbmtrain(rbm, x, opts)  %train a single RBM
    assert(isfloat(x), 'x must be a float');
    assert(all(x(:)>=0) && all(x(:)<=1), 'all data in x must be in [0:1]');  
    m = size(x, 1);  %number of samples
    numbatches = m / opts.batchsize;%batchsize is the number of samples per weight update; numbatches is the number of updates needed for one full pass over the data
    
    assert(rem(numbatches, 1) == 0, 'numbatches not integer'); %the number of mini-batches must be an integer, i.e. batchsize must divide the number of samples

    for i = 1 : opts.numepochs%number of epochs
        kk = randperm(m); %a random permutation of 1..m; each mini-batch takes the next batchsize samples from it
        err = 0; %accumulated reconstruction error
        for l = 1 : numbatches 
            batch = x(kk((l - 1) * opts.batchsize + 1 : l * opts.batchsize), :);%take the next batchsize samples
            
            v1 = batch;%visible-layer input
			%repmat replicates and tiles a matrix: here it gives every sample in the batch its own copy of the hidden bias
            h1 = sigmrnd(repmat(rbm.c', opts.batchsize, 1) + v1 * rbm.W');
            v2 = sigmrnd(repmat(rbm.b', opts.batchsize, 1) + h1 * rbm.W);
            h2 = sigm(repmat(rbm.c', opts.batchsize, 1) + v2 * rbm.W');

            c1 = h1' * v1;
            c2 = h2' * v2;

            rbm.vW = rbm.momentum * rbm.vW + rbm.alpha * (c1 - c2)     / opts.batchsize;
            rbm.vb = rbm.momentum * rbm.vb + rbm.alpha * sum(v1 - v2)' / opts.batchsize;
            rbm.vc = rbm.momentum * rbm.vc + rbm.alpha * sum(h1 - h2)' / opts.batchsize;

            rbm.W = rbm.W + rbm.vW;
            rbm.b = rbm.b + rbm.vb;
            rbm.c = rbm.c + rbm.vc;

            err = err + sum(sum((v1 - v2) .^ 2)) / opts.batchsize;
        end
        
        disp(['epoch ' num2str(i) '/' num2str(opts.numepochs)  '. Average reconstruction error is: ' num2str(err / numbatches)]);
        
    end
end
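Written out as equations, the inner loop is the standard CD-1 (one-step contrastive divergence) update averaged over a mini-batch of size $B$; this simply restates the code above, with momentum $m$ and learning rate $\alpha$:

$$\Delta W \leftarrow m\,\Delta W + \frac{\alpha}{B}\left(h_1^{\top}v_1 - h_2^{\top}v_2\right),\qquad W \leftarrow W + \Delta W$$
$$\Delta b \leftarrow m\,\Delta b + \frac{\alpha}{B}\sum_{\text{batch}}(v_1 - v_2)^{\top},\qquad b \leftarrow b + \Delta b$$
$$\Delta c \leftarrow m\,\Delta c + \frac{\alpha}{B}\sum_{\text{batch}}(h_1 - h_2)^{\top},\qquad c \leftarrow c + \Delta c$$

where $v_1$ is the data, $h_1 \sim \mathrm{sigmrnd}(c + v_1 W^{\top})$ is a sampled hidden state, $v_2 \sim \mathrm{sigmrnd}(b + h_1 W)$ is the reconstruction, and $h_2 = \mathrm{sigm}(c + v_2 W^{\top})$.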

rbmup.m propagates the data up through one RBM, preparing the input for training the next RBM.

function x = rbmup(rbm, x)%arguments: the already-trained RBM and the training data
    x = sigm(repmat(rbm.c', size(x, 1), 1) + x * rbm.W');
	%rbm.c' is the transposed hidden bias and size(x, 1) is the number of samples;
	%repmat replicates and tiles the bias so that every sample gets its own copy,
	%i.e. this computes sigm(W*x + c) for each sample,
	%which is how data is passed between RBMs: the previous RBM's output becomes the next RBM's input
end
After dbntrain has finished, dbnunfoldtonn.m uses the trained weights to initialize the NN; it calls nnsetup.
function nn = dbnunfoldtonn(dbn, outputsize)
%DBNUNFOLDTONN Unfolds a DBN to a NN
%   dbnunfoldtonn(dbn, outputsize ) returns the unfolded dbn with a final
%   layer of size outputsize added.
%   In other words, the DBN provides an unsupervised weight initialization; the final supervised step is still done by the NN.
    if(exist('outputsize','var'))
        size = [dbn.sizes outputsize];%append the output-layer size to the vector of layer sizes
    else
        size = [dbn.sizes];  
    end
    nn = nnsetup(size);  %build the network from the layer-size vector
	%each unfolded layer's weights initialize the corresponding NN weights
    %note that dbn.rbm{i}.c initializes the bias term
    for i = 1 : numel(dbn.rbm)  %here i = 1, 2
        nn.W{i} = [dbn.rbm{i}.c dbn.rbm{i}.W];
		%W{1} = [rbm{1}.c rbm{1}.W], W{2} = [rbm{2}.c rbm{2}.W]; c has one entry per hidden unit of that layer
		%c is (hidden units, 1) and W is (hidden units, visible units), so the combined W{i} is (hidden units, visible units + 1),
		%i.e. c becomes the bias of each hidden unit
    end
end
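A tiny numerical check of why prepending c to W works (an illustrative sketch with made-up sizes, not toolbox code): nnff later prepends a column of ones to each layer's activations, so multiplying by [c W]' reproduces W*x + c for every sample.

W = randn(3, 5); c = randn(3, 1);        % 3 hidden units, 5 visible units
x = randn(7, 5);                         % 7 samples
folded   = [ones(7,1) x] * [c W]';       % what nnff computes before the nonlinearity
explicit = x * W' + repmat(c', 7, 1);    % explicit W*x + c, sample by sample
disp(max(abs(folded(:) - explicit(:))))  % ~0, up to floating-point error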
nnsetup.m initializes the network parameters, in particular W and vW.
function nn = nnsetup(architecture)
%NNSETUP creates a Feedforward Backpropagate Neural Network
% nn = nnsetup(architecture) returns an neural network structure with n=numel(architecture)
% layers, architecture being a n x 1 vector of layer sizes e.g. [784 100 10]
%The overall network structure is read from architecture; compare with the example call nnsetup([784 100 10]) above
    nn.size   = architecture;
    nn.n      = numel(nn.size);%nn.n is the number of layers: one input layer, several hidden layers, one output layer; 4 for ex2
    %next comes a long list of hyperparameters; they are explained where they are used
    nn.activation_function              = 'tanh_opt';   %  Activation function of the hidden layers: 'sigm' (sigmoid) or 'tanh_opt' (optimal tanh).
    nn.learningRate                     = 2;            %  learning rate Note: typically needs to be lower when using 'sigm' activation function and non-normalized inputs.
    nn.momentum                         = 0.5;          %  Momentum
    nn.scaling_learningRate             = 1;            %  Scaling factor for the learning rate (each epoch)
    nn.weightPenaltyL2                  = 0;            %  L2 regularization
    nn.nonSparsityPenalty               = 0;            %  Non sparsity penalty
    nn.sparsityTarget                   = 0.05;         %  Sparsity target
    nn.inputZeroMaskedFraction          = 0;            %  Used for Denoising AutoEncoders
    nn.dropoutFraction                  = 0;            %  Dropout level (http://www.cs.toronto.edu/~hinton/absps/dropout.pdf)
    nn.testing                          = 0;            %  Internal variable. nntest sets this to one.
    nn.output                           = 'sigm';       %  output unit: 'sigm' (=logistic), 'softmax' or 'linear' (should this be switched to 'linear' for regression?)
	%each layer's parameters are now initialized: W, vW and p, of which W is the main one;
	%vW is the temporary variable used when updating the weights, and p is the sparsity statistic (discussed where it is used)
    for i = 2 : nn.n   %for ex2: i = 2, 3, 4
        % weights and weight momentum; the +1 adds the bias column
		%W{1},W{2},W{3}    vW{1},vW{2},vW{3}
		%W{1} connects the input layer to the first hidden layer
		%W{2} connects the first hidden layer to the second hidden layer
		%W{3} connects the second hidden layer to the output layer
		%W{1} is (first-hidden-layer units, input units + 1); the chain of factors below scales the uniform random numbers
		%into the symmetric range +/- 4*sqrt(6/(n_in + n_out)), a sigmoid-friendly variant of the Glorot/Xavier initialization
        nn.W{i - 1} = (rand(nn.size(i), nn.size(i - 1)+1) - 0.5) * 2 * 4 * sqrt(6 / (nn.size(i) + nn.size(i - 1)));
        nn.vW{i - 1} = zeros(size(nn.W{i - 1}));  %same size as W
        
        % average activations (for use with sparsity)
		%p{2}, p{3}, p{4}: one row vector per layer, with one entry per unit of that layer
        nn.p{i}     = zeros(1, nn.size(i));   
    end
end
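The weight initialization above therefore draws every entry of nn.W{i-1} uniformly from a symmetric interval (this just restates that line of code; the 4x factor is commonly used with sigmoid activations):

$$W^{(i-1)}_{jk} \sim \mathcal{U}\!\left(-4\sqrt{\frac{6}{n_i + n_{i-1}}},\; +4\sqrt{\frac{6}{n_i + n_{i-1}}}\right),\qquad n_i = \texttt{nn.size(i)}$$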
Next comes nntrain.m, which trains the NN.
function [nn, L]  = nntrain(nn, train_x, train_y, opts, val_x, val_y) 
%NNTRAIN trains a neural net
% [nn, L] = nnff(nn, x, y, opts) trains the neural network nn with input x and
% output y for opts.numepochs epochs, with minibatches of size
% opts.batchsize. Returns a neural network nn with updated activations,
% errors, weights and biases, (nn.a, nn.e, nn.W, nn.b) and L, the sum
% squared error for each training minibatch.

assert(isfloat(train_x), 'train_x must be a float');
assert(nargin == 4 || nargin == 6,'number of input arguments must be 4 or 6')

loss.train.e               = [];%stores the loss obtained from a forward pass over the training data
%updated inside nneval: loss.train.e(end + 1) = nn.L;
loss.train.e_frac          = [];%for classification problems, stores the misclassification rate on the training data,
%obtained by comparing the network's predicted classes with the true labels
%updated inside nneval: loss.train.e_frac(end+1) = er_train;
loss.val.e                 = [];%the same quantities for the validation set
loss.val.e_frac            = []; 
opts.validation = 0;
if nargin == 6
    opts.validation = 1;   %six arguments means a validation set was supplied
end

fhandle = []; 
if isfield(opts,'plot') && opts.plot == 1
    fhandle = figure();
end
%skip the input-validation code and go straight to the key part
%for the denoising part see the paper: Extracting and Composing Robust Features with Denoising Autoencoders
m = size(train_x, 1);  %m is the number of training samples
 
%opts was set by the caller; batchsize is the mini-batch size used for the gradient steps
batchsize = opts.batchsize;
numepochs = opts.numepochs;

numbatches = m / batchsize;%number of mini-batches

assert(rem(numbatches, 1) == 0, 'numbatches must be a integer');

L = zeros(numepochs*numbatches,1);  %L stores the sum squared error of each training mini-batch
n = 1;  %n indexes into L
%numepochs is the number of passes over the training set
for i = 1 : numepochs  %each epoch is timed with tic/toc
    tic;
    
    kk = randperm(m);
	%shuffle the samples before splitting them into mini-batches; randperm(m) is a random permutation of 1..m
    for l = 1 : numbatches
        batch_x = train_x(kk((l - 1) * batchsize + 1 : l * batchsize), :);  %the inputs of this mini-batch
        
        %Add noise to input (for use in denoising autoencoder)
		%this is the noise injection used by denoising autoencoders;
		%see the paper "Extracting and Composing Robust Features with Denoising Autoencoders"
        %concretely, a fraction inputZeroMaskedFraction of the input entries is set to zero
		if(nn.inputZeroMaskedFraction ~= 0)  %this parameter was set to 0 above, so the branch is skipped here
            batch_x = batch_x.*(rand(size(batch_x))>nn.inputZeroMaskedFraction);
			%(... > ...) is either 1 or 0, so each input entry is either kept or zeroed, i.e. noise is injected
        end
        %the three key functions:
        %nnff does the forward pass, nnbp the backward pass, nnapplygrads the gradient-descent update
        %their code is analyzed below
        batch_y = train_y(kk((l - 1) * batchsize + 1 : l * batchsize), :);  %the targets of this mini-batch
        
        nn = nnff(nn, batch_x, batch_y);%forward pass through all layers; computes the network output, error and loss
        nn = nnbp(nn);%computes the gradients dW for every weight matrix, from the output layer back to the input layer
        nn = nnapplygrads(nn);  %updates the weights and biases of every layer
        
        L(n) = nn.L;  %record the loss of this mini-batch
        
        n = n + 1;
    end  %continue with the next mini-batch
    
    t = toc;

	%after each epoch, nneval evaluates the network on the training data (and the validation data if given)
    if opts.validation == 1   %i.e. six arguments were passed
        loss = nneval(nn, loss, train_x, train_y, val_x, val_y);
        str_perf = sprintf('; Full-batch train mse = %f, val mse = %f', loss.train.e(end), loss.val.e(end));
    else  
        loss = nneval(nn, loss, train_x, train_y);
		%nneval evaluates the network, again on the training data, and records the loss and the misclassification rate,
		%updating the four fields mentioned above: loss.train.e, loss.train.e_frac, loss.val.e, loss.val.e_frac
        str_perf = sprintf('; Full-batch train err = %f', loss.train.e(end));%full-batch training error
    end
    if ishandle(fhandle)
        nnupdatefigures(nn, fhandle, loss, opts, i);   %plot the training curves
    end
        
	%progress report
    disp(['epoch ' num2str(i) '/' num2str(opts.numepochs) '. Took ' num2str(t) ' seconds' '. Mini-batch mean squared error on training set is ' num2str(mean(L((n-numbatches):(n-1)))) str_perf]);
    nn.learningRate = nn.learningRate * nn.scaling_learningRate;%scale the learning rate for the next epoch
end
end
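As the nargin check shows, nntrain can also be given a validation set, and setting opts.plot draws the training curves via nnupdatefigures. A usage sketch (variable names follow the example above; using the test set as a stand-in validation set here is only for illustration):

opts.plot = 1;                                                  % update the loss curves after every epoch
[nn, L] = nntrain(nn, train_x, train_y, opts, test_x, test_y);  % six arguments -> opts.validation = 1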
nntrain mainly calls nnff, nnbp, nnapplygrads and nneval.

nnff.m
%nnff is the feedforward pass: the whole network is simply run forward once
%it also contains the dropout and sparsity computations
%see the paper "Improving Neural Networks with Dropout" and the Autoencoders and Sparsity notes

function nn = nnff(nn, x, y)
%forward pass through all layers; computes the activations, the error and the loss (nn.a, nn.e and nn.L)
%NNFF performs a feedforward pass
% nn = nnff(nn, x, y) returns an neural network structure with updated
% layer activations, error and loss (nn.a, nn.e and nn.L)

    n = nn.n;%number of layers: one input layer, several hidden layers, one output layer; 4 for ex2
    m = size(x, 1);%number of samples
    
    x = [ones(m,1) x];%prepend a column of ones: the bias column
    nn.a{1} = x;  %nn.a{i} holds the activations of layer i
	%a{1} is the input layer, i.e. the samples themselves, ready to feed the next layer

    %feedforward pass
    for i = 2 : n-1   %compute the hidden layers' activations
		%the forward computation depends on the chosen activation function;
        %see the activation_function parameter in nnsetup:
        %sigm is the sigmoid and tanh_opt is a slightly modified tanh used by this toolbox,
        %tanh_opt(A) = 1.7159*tanh(2/3.*A)
        switch nn.activation_function 
            case 'sigm'
                % Calculate the unit's outputs (including the bias term)
				%the bias is folded into W{1}, W{2} (set up in dbnunfoldtonn); the input is the previous layer's output
                nn.a{i} = sigm(nn.a{i - 1} * nn.W{i - 1}');
				%a{i-1} is (samples, inputs+1) and W{i-1} is (this layer's units, inputs+1),
				%so W{i-1}' is (inputs+1, this layer's units) and
				%a{i} = sigm of a (samples, units) matrix: the output of this hidden layer
            case 'tanh_opt'
                nn.a{i} = tanh_opt(nn.a{i - 1} * nn.W{i - 1}');
        end
		
        %dropout; dropoutFraction is one of the parameters set in nnsetup
        %dropout
        if(nn.dropoutFraction > 0)%set to 0 in nnsetup, so this branch is skipped here
            if(nn.testing)
                nn.a{i} = nn.a{i}.*(1 - nn.dropoutFraction);
            else
                nn.dropOutMask{i} = (rand(size(nn.a{i}))>nn.dropoutFraction);
                nn.a{i} = nn.a{i}.*nn.dropOutMask{i};
            end
        end
        
		%sparsity; nonSparsityPenalty is the penalty weight for units that miss sparsityTarget
        %calculate running exponential activations for use with sparsity
        if(nn.nonSparsityPenalty>0)%also set to 0 in nnsetup, so skipped here; the p parameter therefore stays unused
            nn.p{i} = 0.99 * nn.p{i} + 0.01 * mean(nn.a{i}, 1);   %running average of each unit's activation
        end
        
        %Add the bias term
        nn.a{i} = [ones(m,1) nn.a{i}];  
		%a{i} computed above is (samples, units); prepend a column of ones, one bias entry per sample
    end
    switch nn.output  %compute the output layer's activations; nn.output was set in nnsetup
        case 'sigm'
            nn.a{n} = sigm(nn.a{n - 1} * nn.W{n - 1}');
        case 'linear'
            nn.a{n} = nn.a{n - 1} * nn.W{n - 1}';
        case 'softmax'
            nn.a{n} = nn.a{n - 1} * nn.W{n - 1}';
            nn.a{n} = exp(bsxfun(@minus, nn.a{n}, max(nn.a{n},[],2)));
            nn.a{n} = bsxfun(@rdivide, nn.a{n}, sum(nn.a{n}, 2)); 
    end

    %error and loss
    nn.e = y - nn.a{n};
	%the per-sample error: target minus network output
    
    switch nn.output
        case {'sigm', 'linear'}  
            nn.L = 1/2 * sum(sum(nn.e .^ 2)) / m;
        case 'softmax'
            nn.L = -sum(sum(y .* log(nn.a{n}))) / m;
    end
end
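For the 'softmax' output, the two bsxfun lines implement the numerically stable softmax: subtracting each row's maximum before exponentiating leaves the result unchanged but prevents overflow,

$$a^{(n)}_{kj} = \frac{\exp\!\left(z_{kj} - \max_{j'} z_{kj'}\right)}{\sum_{j'}\exp\!\left(z_{kj'} - \max_{j''} z_{kj''}\right)},\qquad z = a^{(n-1)}\,W^{(n-1)\top},$$

and the matching loss in the last case of the switch is the cross-entropy.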
nnbp.m
%nnbp is the backpropagation pass; it follows the standard derivation (as in the UFLDL Neural Network notes)
%the dropout and sparsity terms are the only non-standard parts
function nn = nnbp(nn)
%NNBP performs backpropagation
% nn = nnbp(nn) returns an neural network structure with updated weights 
    
    n = nn.n;%number of layers: one input layer, several hidden layers, one output layer; 4 for ex2
    sparsityError = 0;
    switch nn.output  %d{i} is the delta of layer i
        case 'sigm'
            d{n} = - nn.e .* (nn.a{n} .* (1 - nn.a{n}));%computed from the error and the output-layer activations
        case {'softmax','linear'}
            d{n} = - nn.e;
    end
    for i = (n - 1) : -1 : 2  %from the last hidden layer (n-1) back to the first hidden layer (2)
        % Derivative of the activation function
        switch nn.activation_function 
            case 'sigm'
                d_act = nn.a{i} .* (1 - nn.a{i});%computed from this layer's activations
            case 'tanh_opt'
                d_act = 1.7159 * 2/3 * (1 - 1/(1.7159)^2 * nn.a{i}.^2);
        end
        
        if(nn.nonSparsityPenalty>0)  %set to 0 here, so this branch is skipped
            pi = repmat(nn.p{i}, size(nn.a{i}, 1), 1);
            sparsityError = [zeros(size(nn.a{i},1),1) nn.nonSparsityPenalty * (-nn.sparsityTarget ./ pi + (1 - nn.sparsityTarget) ./ (1 - pi))];
        end
        
        % Backpropagate first derivatives
        if i+1==n % in this case in d{n} there is not the bias term to be removed (the output layer has no bias column)
            d{i} = (d{i + 1} * nn.W{i} + sparsityError) .* d_act; % Bishop (5.56)
        else % in this case in d{i} the bias term has to be removed
            d{i} = (d{i + 1}(:,2:end) * nn.W{i} + sparsityError) .* d_act;%the first column of d{i + 1} corresponds to the bias and is dropped
        end
        
        if(nn.dropoutFraction>0) %set to 0 here, so this branch is skipped
            d{i} = d{i} .* [ones(size(d{i},1),1) nn.dropOutMask{i}];
        end

    end

    for i = 1 : (n - 1)  %from the input layer to the last hidden layer; dW{i} is essentially the gradient
        if i+1==n
            nn.dW{i} = (d{i + 1}' * nn.a{i}) / size(d{i + 1}, 1);%d{i + 1} is the output layer, so there is no bias column to drop
        else
            nn.dW{i} = (d{i + 1}(:,2:end)' * nn.a{i}) / size(d{i + 1}, 1); %drop the bias column   
        end
    end
end
%d{i} in the code is the delta of layer i, as in the UFLDL notes
%dW{i} is essentially the gradient; nnapplygrads still modifies it (weight decay, momentum) before applying it

%for the dropout and sparsity details see the paper "Improving Neural Networks with Dropout" and the Autoencoders and Sparsity notes
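In equation form (leaving the bias-column bookkeeping aside), the pass above computes, for a mini-batch of size $B$,

$$\delta^{(n)} = -(y - a^{(n)}) \odot a^{(n)} \odot \left(1 - a^{(n)}\right) \quad\text{('sigm' output; for 'softmax' and 'linear' simply } \delta^{(n)} = -(y - a^{(n)})\text{)},$$
$$\delta^{(i)} = \left(\delta^{(i+1)} W^{(i)}\right) \odot f'\!\left(a^{(i)}\right),\qquad \nabla W^{(i)} = \frac{1}{B}\,\delta^{(i+1)\top} a^{(i)},$$

which is the standard backpropagation recursion referenced by the Bishop (5.56) comment.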
nnapplygrads.m
function nn = nnapplygrads(nn)
%NNAPPLYGRADS updates weights and biases with calculated gradients
%uses the gradients dW computed by nnbp to update the weights and biases
% nn = nnapplygrads(nn) returns an neural network structure with updated
% weights and biases
    
    for i = 1 : (nn.n - 1)  %update every layer's weights and biases
        if(nn.weightPenaltyL2>0)  %set to 0 in nnsetup, so no weight decay here
            dW = nn.dW{i} + nn.weightPenaltyL2 * [zeros(size(nn.W{i},1),1) nn.W{i}(:,2:end)];
        else
            dW = nn.dW{i};
        end
        
        dW = nn.learningRate * dW;
        
        if(nn.momentum>0)
            nn.vW{i} = nn.momentum*nn.vW{i} + dW;
            dW = nn.vW{i};
        end
            
        nn.W{i} = nn.W{i} - dW;
    end
end
%nn.weightPenaltyL2 is the weight-decay term, another parameter set in nnsetup;
%if it is non-zero a weight penalty is added to guard against overfitting, then the momentum term is applied, and finally nn.W{i} is updated
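Per weight matrix the resulting update is (with $\lambda$ = weightPenaltyL2, $\eta$ = learningRate, $m$ = momentum; the bias column is excluded from the penalty, and the momentum buffer $v^{(i)}$ is only used when $m > 0$):

$$g^{(i)} = \eta\left(\nabla W^{(i)} + \lambda\,\bigl[\,0 \;\; W^{(i)}_{:,\,2:\text{end}}\,\bigr]\right),\qquad v^{(i)} \leftarrow m\,v^{(i)} + g^{(i)},\qquad W^{(i)} \leftarrow W^{(i)} - v^{(i)}.$$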
nneval.m
function [loss] = nneval(nn, loss, train_x, train_y, val_x, val_y)
%NNEVAL evaluates the performance of the neural network
% Returns an updated loss struct
assert(nargin == 4 || nargin == 6, 'Wrong number of arguments');

nn.testing = 1;  
% training performance
nn                    = nnff(nn, train_x, train_y);
%forward pass over the training data; computes the activations, error and loss (nn.a, nn.e and nn.L)
loss.train.e(end + 1) = nn.L;%append the scalar loss computed by nnff

% validation performance
if nargin == 6
    nn                    = nnff(nn, val_x, val_y);
    loss.val.e(end + 1)   = nn.L;   
end
nn.testing = 0;
%calc misclassification rate if softmax
if strcmp(nn.output,'softmax')  %only for softmax outputs
    [er_train, dummy]               = nntest(nn, train_x, train_y);%first return value: the misclassification rate; second: the indices of the misclassified samples
    loss.train.e_frac(end+1)    = er_train; %append it to the field defined in nntrain
    
    if nargin == 6
        [er_val, dummy]             = nntest(nn, val_x, val_y);
        loss.val.e_frac(end+1)  = er_val;
    end
end

end
nntest.m
function [er, bad] = nntest(nn, x, y)%er is the misclassification rate, bad the indices of the misclassified samples
    labels = nnpredict(nn, x);  %labels holds the predicted classes
    [dummy, expected] = max(y,[],2);
	%y has 10 columns; max(y,[],2) returns, for every row (sample), the maximum dummy and its column index expected,
	%and that column index is the true class label
    bad = find(labels ~= expected);    %indices of the misclassified samples
    er = numel(bad) / size(x, 1); %misclassification rate
end
%nntest is trivial: it just calls nnpredict and compares the predictions with the test labels
nnpredict.m
function labels = nnpredict(nn, x)
    nn.testing = 1;  
    nn = nnff(nn, x, zeros(size(x,1), nn.size(end)));
	%a forward pass with dummy all-zero targets; only the activations nn.a are needed here
    nn.testing = 0;
    
    [dummy, i] = max(nn.a{end},[],2);%a{end} is the output layer's activations
    labels = i;
end
%again very simple: predicting is just one nnff pass to obtain the final output
%max(nn.a{end},[],2) returns each row's maximum and its column index, so labels are the predicted class indices
%(this test is specific to classification; for other tasks the nnff output itself is what matters)
By this point I have managed to thoroughly confuse myself...

To wrap up, here is the call hierarchy of the functions covered above:

test_example_DBN
    dbnsetup
    dbntrain
        rbmtrain
        rbmup
    dbnunfoldtonn
        nnsetup
    nntrain
        nnff
        nnbp
        nnapplygrads
        nneval
            nntest
                nnpredict
    nntest