Random forest implementation with annotations (C++ version)
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <set>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include "random_forest.h"
using namespace std;
vector<decision_tree*> alltrees; // 森林(決策樹集合)
vector<TupleData> trainAll,train,test; // 樣本集
vector<int> attributes; // 屬性集(元素為屬性序號)
int trainAllNum = 0;
int testAllNum = 0;
int MaxAttr; // 屬性總數
int *ArrtNum; // 屬性個數集(元素為屬性最大值)
unsigned int F;
int tree_num = 100; // 決策樹個數
const int leafattrnum = -1; // 葉子節點的屬性序號
int TP = 0,
FN = 0,
FP = 0,
TN = 0,
TestP = 0,
TestN = 0;
// 讀入資料
void init(char * trainname, char * testname)
{
trainAllNum =readData(trainAll, trainname);
testAllNum = readData(test,testname);
calculate_attributes();
double temp =(double)trainAllNum;
temp =log(temp)/log(2.0);
F = (unsigned int)floor(temp+0.5)+1;
if(F>MaxAttr) F = MaxAttr;
}
// 初始化訓練樣本子集
void sub_init()
{
// 選取決策樹的訓練樣本集合
RandomSelectData(trainAll, train);
// 計算樣本屬性個數
calculate_ArrtNum();
}
// 讀資料
int readData(vector<TupleData>&data, const char* fileName)
{
ifstream fin;
fin.open(fileName);
string line;
int datanum=0;
// 每行資料作為一個樣本
while(getline(fin,line))
{
TupleData d;
istringstream stream(line);
string str;
// 設定每個樣本的標籤和內容
while(stream>>str)
{
if(str.find('+')==0)
{
d.label='+';
}
else if(str.find('-')==0)
{
d.label='-';
}
else
{
int j=stringtoint(str);
d.A.push_back(j);
}
}
data.push_back(d);
datanum++;
}
fin.close();
return datanum;
}
// 生成根節點的訓練樣本子集
voidRandomSelectData(vector<TupleData> &data, vector<TupleData>&subdata)
{
int index;
subdata.clear();
int d = 0;
while (d < trainAllNum)
{
index = rand() % trainAllNum;
subdata.push_back(data.at(index));
d++;
}
}
// 計算屬性序列
void calculate_attributes()
{
// 每個樣本必須具有相同的屬性個數
TupleData d = trainAll.at(0);
MaxAttr = d.A.size();
attributes.clear();
// 建立屬性集合attributes,元素為屬性序號
for (int i = 0; i < MaxAttr; i++)
{
attributes.push_back(i);
}
// 初始化屬性最大值序列,元素為屬性最大值
ArrtNum = new int[MaxAttr];
}
// 字串轉化為int
int stringtoint(string s)
{
int sum=0;
for(int i=0; s[i]!='\0';i++)
{
int j=int(s[i])-48;
sum=sum*10+j;
}
return sum;
}
// 計算ArrtNum元素值
void calculate_ArrtNum()
{
for(int i = 0; i < MaxAttr; i++) ArrtNum[i] = 0;
// ArrtNum元素值為屬性最大值
for (vector<TupleData>::const_iterator it = train.begin(); it !=train.end(); it++)
{
int i = 0;
for (vector<int>::const_iterator intt=(*it).A.begin();intt!=(*it).A.end();intt++)
{
int valuemax=(*intt)+1;
if(valuemax>ArrtNum[i]) ArrtNum[i]=valuemax;
i++;
}
}
}
// 計算熵
double Entropy(double p, double s)
{
double n = s - p;
double result = 0;
if (n != 0)
result += - double(n) / s * log(double(n) / s) / log(2.0);
if (p != 0)
result += double(-p) / s * log(double(p) / s) / log(2.0);
return result;
}
// 訓練一棵決策樹
int creat_classifier(decision_tree*&p, const vector<TupleData> &samples, vector<int>&attributes)
{
if (p == NULL)
p = new decision_tree();
// 根據樣本真實類別,輸出葉子節點類別
if (Allthesame(samples, '+'))
{
p->node.label = '+';
p->node.attrNum = leafattrnum;
p->childs.clear();
return 1;
}
if (Allthesame(samples, '-'))
{
p->node.label = '-';
p->node.attrNum = leafattrnum;
p->childs.clear();
return 1;
}
// 如果屬性序列為空,當前節點就為葉子節點
if (attributes.size() == 0)
{
p->node.label = Majorityclass(samples);
p->node.attrNum = leafattrnum;
p->childs.clear();
return 1;
}
// 計算當前節點的最優屬性
p->node.attrNum = BestGainArrt(samples, attributes);
// 中間節點無標籤
p->node.label = ' ';
// 計運算元節點候選屬性集合,候選集合元素越來越少
vector<int> newAttributes;
for (vector<int>::iterator it = attributes.begin(); it !=attributes.end(); it++)
if ((*it) != p->node.attrNum)
newAttributes.push_back((*it));
// 初始化樣本子集,建立maxvalue個樣本子集,也就說明該節點有maxvalue個子節點
// 為什麼不建立一個閾值,進行二分類?
int maxvalue = ArrtNum[p->node.attrNum];
vector<TupleData>* subSamples = newvector<TupleData>[maxvalue];
for (int i = 0; i < maxvalue; i++)
subSamples[i].clear();
// 將樣本集合分為樣本子集
for (vector<TupleData>::const_iterator it = samples.begin(); it !=samples.end(); it++)
{
// 對樣本進行分類,分別分到maxvalue個子節點中
// p->node.attrNum是當前節點的最優屬性序號
// (*it).A.at(p->node.attrNum)正是子節點的序號
// 基於當前節點最優屬性,計算當前樣本的歸類
subSamples[(*it).A.at(p->node.attrNum)].push_back((*it));
}
decision_tree *child;
for (int i = 0; i < maxvalue; i++)
{
child = new decision_tree;
if (subSamples[i].size() == 0)
child->node.label = Majorityclass(samples);
else
creat_classifier(child, subSamples[i], newAttributes);
p->childs.push_back(child);
}
delete[] subSamples;
return 0;
}
// 計算節點處的資訊增益
int BestGainArrt(constvector<TupleData> &samples, vector<int> &attributes)
{
int attr,
bestAttr = 0,
p = 0,
s = (int)samples.size();
// 計算正樣本個數
for (vector<TupleData>::const_iterator it = samples.begin(); it !=samples.end(); it++)
{
if ((*it).label == '+')
p++;
}
double infoD;
double bestResult = 0;
// 計算初始熵
infoD = Entropy(p, s);
vector<int> m_attributes;
// 隨機確定候選屬性集
RandomSelectAttr(attributes, m_attributes);
// 遍歷屬性(即主題),通過資訊增益篩選最優屬性
for (vector<int>::iterator it = m_attributes.begin(); it !=m_attributes.end(); it++)
{
attr = (*it);
double result = infoD;
// 第attr個屬性的最大屬性值
int maxvalue = ArrtNum[attr];
// 正負樣本集
int* subN = newint[maxvalue];
int* subP = newint[maxvalue];
int* sub = newint[maxvalue];
for (int i = 0; i < maxvalue; i++)
{
subN[i] = 0;
subP[i] = 0;
sub[i] = 0;
}
// 基於特定屬性,對當前訓練樣本進行分類
// 屬性計算這一步的確沒有,屬性值直接儲存在樣本中
for (vector<TupleData>::const_iterator jt = samples.begin(); jt !=samples.end(); jt++)
{
if ((*jt).label == '+')
subP[(*jt).A.at(attr)] ++;
else
subN[(*jt).A.at(attr)] ++;
sub[(*jt).A.at(attr)]++;
}
// 計算特定屬性下資訊增益(相對熵)
double SplitInfo = 0;
for(int i = 0; i < maxvalue; i++)
{
double partsplitinfo;
partsplitinfo =-double(sub[i])/s*log(double(sub[i])/s)/log(2.0);
SplitInfo =SplitInfo+partsplitinfo;
}
double infoattr = 0;
for (int i = 0; i < maxvalue; i++)
{
double partentropy;
partentropy = Entropy(subP[i],subP[i] + subN[i]);
infoattr =infoattr+((double)(subP[i] + subN[i])/(double)(s))*partentropy;
}
result = result - infoattr;
result = result / SplitInfo;
// 尋找最優屬性
if (result > bestResult)
{
bestResult = result;
bestAttr = attr;
}
delete[] subN;
delete[] subP;
delete[] sub;
}
if (bestResult == 0)
{
bestAttr=attributes.at(0);
}
return bestAttr;
}
void RandomSelectAttr(vector<int>&data, vector<int> &subdata)
{
int index;
unsigned int dataNum=data.size();
subdata.clear();
if(dataNum<=F)
{
for (vector<int>::iterator it = data.begin(); it != data.end();it++)
{
int attr = (*it);
subdata.push_back(attr);
}
}
else
{
set<int> AttrSet;
AttrSet.clear();
while (AttrSet.size() < F)
{
index = rand() % dataNum;
if (AttrSet.count(index) == 0)
{
AttrSet.insert(index);
subdata.push_back(data.at(index));
}
}
}
}
bool Allthesame(constvector<TupleData> &samples, char ch)
{
for (vector<TupleData>::const_iterator it = samples.begin(); it !=samples.end(); it++)
if ((*it).label != ch)
return false;
return true;
}
// 確定節點中哪個類別樣本個數最多
char Majorityclass(constvector<TupleData> &samples)
{
int p = 0, n = 0;
for (vector<TupleData>::const_iterator it = samples.begin(); it !=samples.end(); it++)
if ((*it).label == '+')
p++;
else
n++;
if (p >= n)
return '+';
else
return '-';
}
// 測試階段
char testClassifier(decision_tree *p,TupleData d)
{
// 抵達葉子節點
if (p->node.label != ' ')
return p->node.label;
// 節點處最優屬性
int attrNum = p->node.attrNum;
// 錯誤樣本
if (d.A.at(attrNum) < 0)
return ' ';
// 確定分支
return testClassifier(p->childs.at(d.A.at(attrNum)), d);
}
void testData()
{
for (vector<TupleData>::iterator it = test.begin(); it !=test.end(); it++)
{
printf("新樣本\n");
if((*it).label=='+') TestP++;
else TestN++;
int p = 0, n = 0;
for(int i = 0; i < tree_num; i++)
{
if(testClassifier(alltrees.at(i), (*it))=='+') p++;
else n++;
}
if(p>n)
{
if((*it).label=='+') TP++;
else FP++;
}
else
{
if((*it).label=='+') FN++;
else TN++;
}
}
}
void freeClassifier(decision_tree *p)
{
if (p == NULL)
return;
for (vector<decision_tree*>::iterator it = p->childs.begin();it != p->childs.end(); it++)
{
freeClassifier(*it);
}
delete p;
}
void freeArrtNum()
{
delete[] ArrtNum;
}
void showResult()
{
cout << "Train size: "<<trainAllNum<<endl;
cout << "Test size: "<<testAllNum<<endl;
cout << "True positive: "<< TP << endl;
cout << "False negative: "<<FN<<endl;
cout << "False positive: "<<FP<<endl;
cout << "True negative: "<<TN<<endl;
}
int main(int argc, char **argv)
{
char * trainfile=argv[1];
char* testfile=argv[2];
srand((unsigned)time(NULL));
// 初始化樣本
init("1.txt", "2.txt");
// 訓練階段
for(int i = 0; i < tree_num; i++)
{
printf("第 %d 棵決策樹訓練開始\n", i);
// 每棵樹的訓練樣本子集
sub_init();
// 訓練每棵決策樹
decision_tree * root=NULL;
creat_classifier(root, train, attributes);
// 建立森林
alltrees.push_back(root);