隨機森林程式碼註釋(C++版本) — annotated random-forest implementation in C++

#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <set>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <ctime>

#include "random_forest.h"

using namespace std;

vector<decision_tree*>  alltrees;               // the forest: all trained decision trees

vector<TupleData>       trainAll,train,test; // full training set, per-tree bootstrap sample, test set

vector<int>                  attributes;            // attribute index list (0 .. MaxAttr-1)

int                     trainAllNum = 0;       // number of training samples read

int                     testAllNum  = 0;       // number of test samples read

int                     MaxAttr;             // total number of attributes per sample

int                     *ArrtNum;               // per-attribute value counts (max observed value + 1)

unsigned int            F;                      // attributes examined per split (set to ~log2(N)+1 in init)

int                     tree_num    = 100;     // number of trees in the forest

const int               leafattrnum = -1;       // attribute number that marks a leaf node

int                     TP          = 0,        // confusion-matrix counters and test-set

                        FN          = 0,        // class totals, accumulated by testData()

                        FP          = 0,

                        TN          = 0,

                        TestP       = 0,

                        TestN       = 0;

// 讀入資料

void init(char * trainname, char * testname)

{

   trainAllNum     =readData(trainAll, trainname);

   testAllNum      = readData(test,testname);

   calculate_attributes();

   double temp     =(double)trainAllNum;

   temp            =log(temp)/log(2.0);

   F               = (unsigned int)floor(temp+0.5)+1;

   if(F>MaxAttr) F = MaxAttr;

}

// 初始化訓練樣本子集

// Prepare the per-tree training state: draw a bootstrap sample from the
// full training set, then recompute the attribute value counts over it.
void sub_init()
{
   // Bootstrap: pick this tree's training subset (with replacement).
   RandomSelectData(trainAll, train);
   // Recompute per-attribute value counts for the new subset.
   calculate_ArrtNum();
}

// 讀資料

int readData(vector<TupleData>&data, const char* fileName)

{

   ifstream fin;

   fin.open(fileName);

   string line;

   int datanum=0;

   // 每行資料作為一個樣本

   while(getline(fin,line))

   {

       TupleData d;

       istringstream stream(line);

       string str;

       // 設定每個樣本的標籤和內容

       while(stream>>str)

       {

           if(str.find('+')==0)

           {

                d.label='+';

           }

           else if(str.find('-')==0)

           {

                d.label='-';

           }

           else

           {

                int j=stringtoint(str);

                d.A.push_back(j);

           }

       }

       data.push_back(d);

       datanum++;

   }

   fin.close();

   return datanum;

}

// 生成根節點的訓練樣本子集

voidRandomSelectData(vector<TupleData> &data, vector<TupleData>&subdata)

{

   int index;

   subdata.clear();

   int d = 0;

   while (d < trainAllNum)

   {

       index = rand() % trainAllNum;

       subdata.push_back(data.at(index));

       d++;

   }

}

// 計算屬性序列

void calculate_attributes()

{

   // 每個樣本必須具有相同的屬性個數

   TupleData d = trainAll.at(0);

   MaxAttr = d.A.size();

   attributes.clear();

   // 建立屬性集合attributes,元素為屬性序號

   for (int i = 0; i < MaxAttr; i++)

   {

       attributes.push_back(i);

   }

   // 初始化屬性最大值序列,元素為屬性最大值

   ArrtNum = new int[MaxAttr];

}

// 字串轉化為int

// Parse a decimal integer token into an int.
// Generalized over the original: accepts an optional leading '+' or '-'
// sign (digit-only tokens, the only input the current callers produce,
// parse exactly as before). Non-digit characters are not validated, as
// in the original.
int stringtoint(std::string s)
{
    int sum = 0;
    int sign = 1;
    std::string::size_type i = 0;
    if (!s.empty() && (s[0] == '+' || s[0] == '-'))
    {
        if (s[0] == '-')
            sign = -1;
        i = 1;
    }
    for (; i < s.size(); i++)
        sum = sum * 10 + (s[i] - '0');
    return sign * sum;
}

// 計算ArrtNum元素值

void calculate_ArrtNum()

{

   for(int i = 0; i < MaxAttr; i++) ArrtNum[i] = 0;

   // ArrtNum元素值為屬性最大值

   for (vector<TupleData>::const_iterator it = train.begin(); it !=train.end(); it++)

   {

       int i = 0;

       for (vector<int>::const_iterator intt=(*it).A.begin();intt!=(*it).A.end();intt++)

       {

           int valuemax=(*intt)+1;

           if(valuemax>ArrtNum[i]) ArrtNum[i]=valuemax;

           i++;

       }

   }

}

// 計算熵

// Binary entropy (in bits) of a set of s samples of which p are
// positive. Each empty class contributes 0, so log(0) is never taken.
double Entropy(double p, double s)
{
    const double neg = s - p;
    const double log2base = std::log(2.0);
    double bits = 0.0;
    if (neg != 0)
    {
        const double negFrac = neg / s;
        bits -= negFrac * std::log(negFrac) / log2base;
    }
    if (p != 0)
    {
        const double posFrac = p / s;
        bits -= posFrac * std::log(posFrac) / log2base;
    }
    return bits;
}

// 訓練一棵決策樹

int creat_classifier(decision_tree*&p, const vector<TupleData> &samples, vector<int>&attributes)

{

   if (p == NULL)

       p = new decision_tree();

   // 根據樣本真實類別,輸出葉子節點類別

   if (Allthesame(samples, '+'))

   {

       p->node.label = '+';

       p->node.attrNum = leafattrnum;

       p->childs.clear();

       return 1;

   }

   if (Allthesame(samples, '-'))

   {

       p->node.label = '-';

       p->node.attrNum = leafattrnum;

       p->childs.clear();

       return 1;

   }

   // 如果屬性序列為空,當前節點就為葉子節點

   if (attributes.size() == 0)

   {

       p->node.label = Majorityclass(samples);

       p->node.attrNum = leafattrnum;

       p->childs.clear();

       return 1;

   }

   // 計算當前節點的最優屬性

   p->node.attrNum = BestGainArrt(samples, attributes);

   // 中間節點無標籤

   p->node.label = ' ';

   // 計運算元節點候選屬性集合,候選集合元素越來越少

   vector<int> newAttributes;

   for (vector<int>::iterator it = attributes.begin(); it !=attributes.end(); it++)

       if ((*it) != p->node.attrNum)

           newAttributes.push_back((*it));

   // 初始化樣本子集,建立maxvalue個樣本子集,也就說明該節點有maxvalue個子節點

   // 為什麼不建立一個閾值,進行二分類?

   int maxvalue = ArrtNum[p->node.attrNum];

   vector<TupleData>* subSamples = newvector<TupleData>[maxvalue];

   for (int i = 0; i < maxvalue; i++)

       subSamples[i].clear();

   // 將樣本集合分為樣本子集

   for (vector<TupleData>::const_iterator it = samples.begin(); it !=samples.end(); it++)

   {

       // 對樣本進行分類,分別分到maxvalue個子節點中

       // p->node.attrNum是當前節點的最優屬性序號

       // (*it).A.at(p->node.attrNum)正是子節點的序號

       // 基於當前節點最優屬性,計算當前樣本的歸類

       subSamples[(*it).A.at(p->node.attrNum)].push_back((*it));

   }

   decision_tree *child;

   for (int i = 0; i < maxvalue; i++)

   {

       child = new decision_tree;

       if (subSamples[i].size() == 0)

           child->node.label = Majorityclass(samples);

       else

           creat_classifier(child, subSamples[i], newAttributes);

       p->childs.push_back(child);

   }

   delete[] subSamples;

   return 0;

}

// 計算節點處的資訊增益

int BestGainArrt(constvector<TupleData> &samples, vector<int> &attributes)

{

   int attr,

       bestAttr = 0,

       p = 0,

       s = (int)samples.size();

   // 計算正樣本個數

   for (vector<TupleData>::const_iterator it = samples.begin(); it !=samples.end(); it++)

   {

       if ((*it).label == '+')

           p++;

   }

   double infoD;

   double bestResult = 0;

   // 計算初始熵

   infoD = Entropy(p, s);

   vector<int> m_attributes;

   // 隨機確定候選屬性集

   RandomSelectAttr(attributes, m_attributes);

   // 遍歷屬性(即主題),通過資訊增益篩選最優屬性

   for (vector<int>::iterator it = m_attributes.begin(); it !=m_attributes.end(); it++)

   {

       attr            = (*it);

       double result   = infoD;

       // 第attr個屬性的最大屬性值

       int maxvalue    = ArrtNum[attr];

       // 正負樣本集

       int* subN       = newint[maxvalue];

       int* subP       = newint[maxvalue];

       int* sub        = newint[maxvalue];

       for (int i = 0; i < maxvalue; i++)

       {

           subN[i] = 0;

           subP[i] = 0;

           sub[i]  = 0;

       }

       // 基於特定屬性,對當前訓練樣本進行分類

       // 屬性計算這一步的確沒有,屬性值直接儲存在樣本中

       for (vector<TupleData>::const_iterator jt = samples.begin(); jt !=samples.end(); jt++)

       {

           if ((*jt).label == '+')

                subP[(*jt).A.at(attr)] ++;

           else

                subN[(*jt).A.at(attr)] ++;

           sub[(*jt).A.at(attr)]++;

       }

       // 計算特定屬性下資訊增益(相對熵)

       double SplitInfo = 0;

       for(int i = 0; i < maxvalue; i++)

       {

           double partsplitinfo;

           partsplitinfo   =-double(sub[i])/s*log(double(sub[i])/s)/log(2.0);

           SplitInfo       =SplitInfo+partsplitinfo;

       }

       double infoattr = 0;

       for (int i = 0; i < maxvalue; i++)

       {

           double partentropy;

           partentropy     = Entropy(subP[i],subP[i] + subN[i]);

           infoattr        =infoattr+((double)(subP[i] + subN[i])/(double)(s))*partentropy;

       }

       result = result - infoattr;

       result = result / SplitInfo;

       // 尋找最優屬性

       if (result > bestResult)

       {

           bestResult      = result;

           bestAttr        = attr;

       }

       delete[] subN;

       delete[] subP;

       delete[] sub;

   }

   if (bestResult == 0)

   {

       bestAttr=attributes.at(0);

   }

   return bestAttr;

}

void RandomSelectAttr(vector<int>&data, vector<int> &subdata)

{

   int index;

   unsigned int dataNum=data.size();

   subdata.clear();

   if(dataNum<=F)

   {

       for (vector<int>::iterator it = data.begin(); it != data.end();it++)

       {

           int attr = (*it);

           subdata.push_back(attr);

       }

   }

   else

   {

       set<int> AttrSet;

       AttrSet.clear();

       while (AttrSet.size() < F)

       {

           index = rand() % dataNum;

           if (AttrSet.count(index) == 0)

           {

                AttrSet.insert(index);

               subdata.push_back(data.at(index));

           }

       }

   }

}

bool Allthesame(constvector<TupleData> &samples, char ch)

{

   for (vector<TupleData>::const_iterator it = samples.begin(); it !=samples.end(); it++)

       if ((*it).label != ch)

           return false;

   return true;

}

// 確定節點中哪個類別樣本個數最多

char Majorityclass(constvector<TupleData> &samples)

{

   int p = 0, n = 0;

   for (vector<TupleData>::const_iterator it = samples.begin(); it !=samples.end(); it++)

       if ((*it).label == '+')

           p++;

       else

           n++;

   if (p >= n)

       return '+';

   else

       return '-';

}

// 測試階段

char testClassifier(decision_tree *p,TupleData d)

{

   // 抵達葉子節點

   if (p->node.label != ' ')

       return p->node.label;

   // 節點處最優屬性

   int attrNum = p->node.attrNum;

   // 錯誤樣本

   if (d.A.at(attrNum) < 0)

       return ' ';

   // 確定分支

   return testClassifier(p->childs.at(d.A.at(attrNum)), d);

}

void testData()

{

   for (vector<TupleData>::iterator it = test.begin(); it !=test.end(); it++)

   {

       printf("新樣本\n");

       if((*it).label=='+') TestP++;

       else TestN++;

       int p = 0, n = 0;

       for(int i = 0; i < tree_num; i++)

       {

           if(testClassifier(alltrees.at(i), (*it))=='+')  p++;

           else n++;

       }

       if(p>n)

       {

           if((*it).label=='+') TP++;

           else FP++;

       }

       else

       {

           if((*it).label=='+') FN++;

           else TN++;

       }

   }

}

// Post-order deletion of the tree rooted at p; safe to call on NULL.
void freeClassifier(decision_tree *p)
{
    if (p == NULL)
        return;
    // Free all subtrees before the node that owns them.
    for (size_t i = 0; i < p->childs.size(); i++)
        freeClassifier(p->childs[i]);
    delete p;
}

void freeArrtNum()

{

   delete[] ArrtNum;

}

// Print the dataset sizes and the confusion matrix accumulated by
// testData().
void showResult()
{
   cout << "Train size:   "<<trainAllNum<<endl;
   cout << "Test size:    "<<testAllNum<<endl;
   cout << "True positive:        "<< TP << endl;
   cout << "False negative:       "<<FN<<endl;
   cout << "False positive:       "<<FP<<endl;
   cout << "True negative:        "<<TN<<endl;
}

int main(int argc, char **argv)

{

   char * trainfile=argv[1];

    char* testfile=argv[2];

   srand((unsigned)time(NULL));

   // 初始化樣本

   init("1.txt", "2.txt");

   // 訓練階段

   for(int i = 0; i < tree_num; i++)

   {

       printf("第 %d 棵決策樹訓練開始\n", i);

       // 每棵樹的訓練樣本子集

       sub_init();

       // 訓練每棵決策樹

       decision_tree * root=NULL;

       creat_classifier(root, train, attributes);

       // 建立森林

       alltrees.push_back(root);