ID3演算法簡單例項(程式碼)

阿新 • • 發佈：2019-01-21

問題描述：資料集每筆資料有17個欄位（1個類別標籤 Class Name 加上16個屬性），背景是美國選舉；我們要做的就是根據除了 Class Name 以外的16個屬性，判斷這個人屬於哪個黨派（republican 或 democrat）。

std::string temp[17] = { "Class Name", "handicapped-infants", "water-project-cost-sharing",
"adoption-of-the-budget-resolution", "physician-fee-freeze",
"el-salvador-aid", "religious-groups-in-schools", "anti-satellite-test-ban",
"aid-to-nicaraguan-contras", "mx-missile", "immigration", "synfuels-corporation-cutback",
"education-spending", "superfund-right-to-use", "crime", "duty-free-exports",
"export-administration-act-south-africa" };

以下是原始碼:

#include <iostream>
#include <string>
#include <vector>
#include <map>
#include <algorithm>
#include <fstream>
#include <sstream>
#include <cmath>
using namespace std;
#define MAXLEN 17//number of fields in each input row

//Multi-way tree implementation.
//Every node keeps all of its children in a vector.
//The data-structure design matters; the original author notes that "option 5" suits this
//algorithm (the list of options being referred to is not part of this file). Take care to
//maintain the remaining samples and the remaining attributes: while building the tree,
//breadth is covered by looping over the values of an attribute and depth by recursive calls.

vector <vector <string> > state;    //training instances
vector <vector <string>> teststate;  //test instances
vector <string> item(MAXLEN);//one row of the instance set (not referenced by the code visible in this file)
vector <string> attribute_row;//the header row, i.e. the attribute names
string blank("");//empty string used as the initial value of Node fields and as a placeholder argument
map<string, vector < string > > map_attribute_values;//for each attribute name, all values it takes
int tree_size = 0;//node counter incremented by Treesize
//A node of the decision tree.
struct Node{
	string attribute;       //attribute tested at this node; leaves store the class label here
	string arrived_value;   //value of the parent's attribute on the branch leading to this node
	vector<Node *> childs;  //child nodes
	//Both strings start out as copies of the global empty string `blank`.
	Node() : attribute(blank), arrived_value(blank) {}
};
Node * root;//root of the decision tree; main stores the result of BulidDecisionTreeDFS here

//根據資料例項計算屬性與值組成的map
//Fills map_attribute_values: for each of the MAXLEN columns, the distinct values found in
//that column of `state` are stored (in order of first appearance) under the column's name
//taken from attribute_row.
void ComputeMap(){
	for (unsigned int column = 0; column < MAXLEN; ++column){
		std::vector<std::string> distinct;//values seen so far in this column
		for (unsigned int row = 0; row < state.size(); ++row){
			const std::string &cell = state[row][column];
			//append the cell only if it has not been recorded yet
			if (std::find(distinct.begin(), distinct.end(), cell) == distinct.end()){
				distinct.push_back(cell);
			}
		}
		map_attribute_values[attribute_row[column]] = distinct;
	}
}

//根據具體屬性和值來計算熵
//Entropy (base 2) of the class label over selected rows of remain_state.
//  remain_state : sample table; row 0 is skipped, so callers are expected to keep the header
//                 row there (BulidDecisionTreeDFS does this for the subsets it builds).
//  attribute    : attribute name; its column is looked up in attribute_row[1..MAXLEN-1].
//  value        : only rows whose `attribute` column equals this value are counted
//                 (ignored when ifparent is true).
//  ifparent     : true  -> count every row after row 0 (entropy before splitting);
//                 false -> count only the rows matching `value`.
//Returns 0 when the attribute is not found or when the counted rows all fall in one class.
//The table and the strings are taken by const reference so that the whole data set is not
//copied on every call.
double ComputeEntropy(const std::vector<std::vector<std::string> > &remain_state, const std::string &attribute, const std::string &value, bool ifparent){
	int count[2] = { 0, 0 };//count[0]: "republican" rows, count[1]: all other counted rows
	for (unsigned int column = 1; column < MAXLEN; column++){
		if (attribute_row[column].compare(attribute)) continue;//not the requested attribute
		for (std::size_t row = 1; row < remain_state.size(); row++){
			if (ifparent || !remain_state[row][column].compare(value)){
				if (!remain_state[row][0].compare("republican")) count[0]++;
				else count[1]++;
			}
		}
		break;//only the first matching column is used
	}
	if (count[0] == 0 || count[1] == 0) return 0;//all counted rows belong to a single class
	//log2 is obtained from the natural logarithm via the change-of-base formula
	const double sum = count[0] + count[1];
	const double p0 = count[0] / sum;
	const double p1 = count[1] / sum;
	return -p0 * log(p0) / log(2.0) - p1 * log(p1) / log(2.0);
}

//計算按照屬性attribute劃分當前剩餘例項的資訊增益
//Information gain obtained by splitting remain_state on `attribute`:
//entropy of all samples minus the weighted entropy of each value's subset.
//  remain_state : sample table whose row 0 is the header row; rows 1.. are the samples.
//  attribute    : name of the attribute to split on; its values come from
//                 map_attribute_values and its column from attribute_row.
//Returns 0 when there are no sample rows or the attribute is not found in attribute_row.
//Row 0 is excluded from the per-value counts so that numerator, denominator (size - 1) and
//ComputeEntropy all look at the same set of rows.
double ComputeGain(const std::vector<std::vector<std::string> > &remain_state, const std::string &attribute){
	if (remain_state.size() <= 1) return 0;//header only (or empty): nothing to split, avoids 0/0

	//entropy before splitting
	const double parent_entropy = ComputeEntropy(remain_state, attribute, blank, true);

	//locate the attribute's column once instead of once per value
	unsigned int column = 1;
	while (column < MAXLEN && attribute_row[column].compare(attribute)) column++;
	if (column >= MAXLEN) return 0;

	const std::vector<std::string> &values = map_attribute_values[attribute];
	const double data_rows = static_cast<double>(remain_state.size() - 1);
	double children_entropy = 0;
	for (std::size_t v = 0; v < values.size(); v++){
		int matches = 0;//number of samples taking values[v]
		for (std::size_t row = 1; row < remain_state.size(); row++){
			if (!remain_state[row][column].compare(values[v])) matches++;
		}
		if (matches == 0) continue;//an empty subset contributes nothing
		const double ratio = matches / data_rows;
		children_entropy += ratio * ComputeEntropy(remain_state, attribute, values[v], false);
	}
	return (parent_entropy - children_entropy);
}

//Returns the index (0..MAXLEN-1) of the first entry of attribute_row equal to `attri`.
//When no entry matches, a message is written to cerr and 0 is returned.
int FindAttriNumByName(string attri){
	int index = 0;
	while (index < MAXLEN){
		if (!attribute_row[index].compare(attri)) return index;
		++index;
	}
	cerr << "can't find the numth of attribute" << endl;
	return 0;
}

//找出樣例中佔多數的黨派
//Returns the party that labels the majority of the rows in remain_state.
//Column 0 of each row is the label. Only rows labelled exactly "republican" or "democrat"
//are counted, so a header row (or any other non-party row) does not add a democrat vote.
//Ties, including a table with no labelled rows, resolve to "republican".
std::string MostCommonLabel(const std::vector<std::vector<std::string> > &remain_state){
	int republicans = 0, democrats = 0;
	for (std::size_t i = 0; i < remain_state.size(); i++){
		const std::vector<std::string> &row = remain_state[i];
		if (row.empty()) continue;//no label column to read
		if (row[0] == "republican") republicans++;
		else if (row[0] == "democrat") democrats++;
	}
	if (republicans >= democrats) return "republican";
	return "democrat";
}

//判斷樣例是否為同一個黨派
//Tells whether all rows but one carry `label` in column 0.
//The "but one" accounts for the header row that callers keep in the table: the result is
//true exactly when the number of rows labelled `label` equals remain_state.size() - 1.
//An empty table yields false.
bool AllTheSameLabel(vector <vector <string> > remain_state, string label){
	const auto labelled = std::count_if(remain_state.begin(), remain_state.end(),
		[&label](const std::vector<std::string> &row){ return row[0].compare(label) == 0; });
	//written as "labelled + 1 == size" so an empty table never matches
	return static_cast<std::size_t>(labelled) + 1 == remain_state.size();
}

//計算資訊增益，DFS構建決策樹
//current_node為當前的節點
//remain_state為剩餘待分類的樣例
//remian_attribute為剩餘還沒有考慮的屬性
//返回根結點指標
//Builds the decision tree depth-first, choosing at every node the attribute with the
//largest information gain.
//  p                : node to fill in; a new Node is allocated when it is NULL
//  remain_state     : samples still to be classified (row 0 is expected to be the header row)
//  remain_attribute : attributes that have not been used on the path to this node
//Returns p (the node that was filled in).
//Leaves store the party name in `attribute`; inner nodes store the chosen attribute name and
//get one child per value listed in map_attribute_values for that attribute.
Node * BulidDecisionTreeDFS(Node * p, const std::vector<std::vector<std::string> > &remain_state, const std::vector<std::string> &remain_attribute){
	if (p == NULL)
		p = new Node();
	//leaf cases first: all samples agree on one party
	if (AllTheSameLabel(remain_state, "republican")){
		p->attribute = "republican";
		return p;
	}
	if (AllTheSameLabel(remain_state, "democrat")){
		p->attribute = "democrat";
		return p;
	}
	if (remain_attribute.empty()){//every attribute has been used but both parties remain
		p->attribute = MostCommonLabel(remain_state);
		return p;
	}

	//pick the attribute with the largest gain; the first one wins when no gain exceeds 0
	double max_gain = 0;
	std::size_t best = 0;
	for (std::size_t a = 0; a < remain_attribute.size(); a++){
		const double gain = ComputeGain(remain_state, remain_attribute[a]);
		if (gain > max_gain){
			max_gain = gain;
			best = a;
		}
	}
	const std::string best_attribute = remain_attribute[best];

	//attributes left for the children: everything except the chosen one
	std::vector<std::string> new_attribute;
	for (std::size_t a = 0; a < remain_attribute.size(); a++){
		if (remain_attribute[a] != best_attribute) new_attribute.push_back(remain_attribute[a]);
	}

	p->attribute = best_attribute;
	//copied on purpose: recursive calls also index map_attribute_values
	const std::vector<std::string> values = map_attribute_values[best_attribute];
	const int attribue_num = FindAttriNumByName(best_attribute);
	for (std::size_t v = 0; v < values.size(); v++){
		//subset for this value: header row first, then every matching sample
		std::vector<std::vector<std::string> > new_state;
		new_state.push_back(attribute_row);
		for (std::size_t i = 0; i < remain_state.size(); i++){
			if (!remain_state[i][attribue_num].compare(values[v])){
				new_state.push_back(remain_state[i]);
			}
		}
		Node * new_node = new Node();
		new_node->arrived_value = values[v];
		if (new_state.size() <= 1){
			//only the header row is present: no sample takes this value, so the child becomes
			//a leaf labelled with the majority party of the current samples
			new_node->attribute = MostCommonLabel(remain_state);
		}
		else
			BulidDecisionTreeDFS(new_node, new_state, new_attribute);
		p->childs.push_back(new_node);
	}
	return p;
}


//Reads the training set from "train.txt" into the global `state`.
//The file is read as whitespace-separated tokens; each token is one sample whose fields are
//separated by commas. At most 17 fields are kept per sample (extra fields are ignored, missing
//ones stay empty), and samples whose first field is empty are dropped.
//Prints the number of tokens read. If the file cannot be opened, a message is written to cerr
//and nothing is read.
void Input(){
	const std::size_t kFields = 17;//fields stored per sample
	std::ifstream in("train.txt");//read-only: the file is never written
	if (!in){
		std::cerr << "can't open train.txt" << std::endl;
		return;
	}
	int lines = 0;
	std::string buffer;
	//looping on the extraction itself stops cleanly at end of file or on a read error
	while (in >> buffer) {
		std::vector<std::string> temp(kFields);
		std::size_t begin_sign = 0;//start of the current field
		std::size_t field = 0;//index of the next field to store
		for (std::size_t i = 0; i <= buffer.length() && field < kFields; i++) {
			if (i == buffer.length() || buffer[i] == ',') {
				temp[field] = buffer.substr(begin_sign, i - begin_sign);
				begin_sign = i + 1;
				++field;
			}
		}
		lines++;
		if (temp[0] != "") {
			state.push_back(temp);
		}
	}
	std::cout << lines << std::endl;
}
bool Judge(Node* root, vector <string> teststate){
	if (!root){ cout << "error tree!"; exit(0); }
	bool istrue = false;
	for (vector<Node*>::iterator it = root->childs.begin(); it != root->childs.end(); it++){         //遍歷子節點
		if (((*it)->attribute == "republican" || (*it)->attribute == "democrat")){                  //如果這個節點是葉子節點，即黨派
			int num = FindAttriNumByName(root->attribute);
			if ((*it)->attribute == teststate[0] && (*it)->arrived_value==teststate[num]) {         
				istrue = true;
				return true;
			}
		}
		int sub = FindAttriNumByName(root->attribute);                                          //非葉子節點，根據到達值繼續搜尋
		if ((*it)->arrived_value == teststate[sub]){
			return	Judge(*it, teststate);
		}
	}
	return istrue;
}


//Reads the test set from "test.txt" into the global `teststate` and evaluates the tree on it.
//The file format is the same as for the training file: whitespace-separated tokens, each one a
//comma-separated sample; at most 17 fields are kept and samples with an empty first field
//are dropped. Prints the number of tokens read.
//  root : decision tree passed on to Judge
//Returns the fraction (0..1) of the samples stored in `teststate` for which Judge returns
//true, or 0 when the file cannot be opened or no sample is available.
double Inputtest(Node* root){
	const std::size_t kFields = 17;//fields stored per sample
	std::ifstream in("test.txt");//read-only: the file is never written
	if (!in){
		std::cerr << "can't open test.txt" << std::endl;
		return 0;
	}
	int lines = 0;
	std::string buffer;
	//looping on the extraction itself stops cleanly at end of file or on a read error
	while (in >> buffer) {
		std::vector<std::string> temp(kFields);
		std::size_t begin_sign = 0;//start of the current field
		std::size_t field = 0;//index of the next field to store
		for (std::size_t i = 0; i <= buffer.length() && field < kFields; i++) {
			if (i == buffer.length() || buffer[i] == ',') {
				temp[field] = buffer.substr(begin_sign, i - begin_sign);
				begin_sign = i + 1;
				++field;
			}
		}
		lines++;
		if (temp[0] != "") {
			teststate.push_back(temp);
		}
	}
	std::cout << lines << std::endl;
	if (teststate.empty()) return 0;//nothing to evaluate; avoids dividing by zero

	//iterate over the stored samples, not over the token count, so every index is valid
	int count = 0;
	for (std::size_t a = 0; a < teststate.size(); a++){
		if (Judge(root, teststate[a]))
			count++;
	}
	return count * 1.0 / teststate.size();
}

//Prints the subtree rooted at p to cout, one tab per level of depth.
//A node with a non-empty arrived_value prints that value on its own line and then its
//attribute one tab deeper; otherwise only the attribute is printed. Children follow at
//depth + 1.
void PrintTree(Node *p, int depth){
	//emits `tabs` tab characters
	const auto indent = [](int tabs){
		for (int t = 0; t < tabs; ++t) std::cout << '\t';
	};

	indent(depth);
	if (p->arrived_value.size() != 0){
		std::cout << p->arrived_value << std::endl;
		indent(depth + 1);
	}
	std::cout << p->attribute << std::endl;
	for (std::size_t c = 0; c < p->childs.size(); ++c){
		PrintTree(p->childs[c], depth + 1);
	}
}


//Adds the number of nodes in the subtree rooted at p to the global tree_size.
//A null p adds nothing. The counter is not reset here.
void Treesize(Node *p){
	if (!p) return;
	for (std::size_t c = 0; c < p->childs.size(); ++c){
		Treesize(p->childs[c]);//children are counted before the node itself
	}
	++tree_size;
}

//Program entry: reads the training set, builds the ID3 tree, prints the tree and its node
//count, then prints the accuracy measured on the test set.
int main(){
	Input();
	//column names; column 0 is the class label, columns 1..16 are the attributes
	const std::string temp[17] = { "Class Name", "handicapped-infants", "water-project-cost-sharing",
		"adoption-of-the-budget-resolution", "physician-fee-freeze",
		"el-salvador-aid", "religious-groups-in-schools", "anti-satellite-test-ban",
		"aid-to-nicaraguan-contras", "mx-missile", "immigration", "synfuels-corporation-cutback",
		"education-spending", "superfund-right-to-use", "crime", "duty-free-exports",
		"export-administration-act-south-africa" };
	std::vector<std::string> remain_attribute;
	for (int a = 0; a < 17; a++){
		attribute_row.push_back(temp[a]);
		if (a > 0) remain_attribute.push_back(temp[a]);//the class label is not a split candidate
	}

	//ComputeEntropy, ComputeGain and AllTheSameLabel treat row 0 of the table as the header
	//row, and BulidDecisionTreeDFS builds every subset that way, so the top-level table gets
	//the header row too. Without it the first training sample would be skipped.
	//NOTE(review): assumes train.txt itself carries no header line - confirm against the data file.
	std::vector<std::vector<std::string> > remain_state;
	remain_state.push_back(attribute_row);
	for (unsigned int i = 0; i < state.size(); i++){
		remain_state.push_back(state[i]);
	}
	ComputeMap();
	root = BulidDecisionTreeDFS(root, remain_state, remain_attribute);
	cout << "the decision tree is :" << endl;
	PrintTree(root, 0);
	Treesize(root);
	cout << endl;
	cout << "tree_size:" << tree_size << endl;
	cout<<"正確率"<<Inputtest(root)*100<<"%"<<endl;
	return 0;
}

ID3演算法簡單例項(程式碼)

問題描述：這裡有17個屬性，背景是美國選舉，然後我們需要做的就是根據除了Class Name的16個屬性判斷這個人是哪個黨派。 std::string temp[17] = { "Class Name", "handicapped-infants", "water-p

Java中的氣泡排序演算法-簡單例項

冒泡演算法的原理實現：（從小到大排序） 1：比較相鄰的兩個元素，如果第一個比第二個大就交換位置。 2：對每一對相鄰的元素進行比較，從開始第一對到結尾的最後一對，這樣最後的元素就是最大的了。

hadoop常用演算法簡單例項

例項一、對以下資料進行排序，根據收入減去支出得到最後結餘從大到小排序，資料如下： SumStep執行之後結果如下： SortStep執行之後結果為上圖根據結餘從大到小排序。程式碼如下： public class InfoBean implements Writabl

rabbitmq(一)搭建以及建立簡單的程式碼例項

使用Docker搭建的rabbitMqdocker pull拉取映象 docker pull rabbitmq:management 啟動映象 docker run -d --name rabbitmq -p 5671:5671 -p 5672:5672 -p 4369:4369 -p

基於內容的推薦演算法的實現程式碼例項

本次例項需要三個資料檔案分別為節目及其所屬標籤型別的01矩陣；使用者--節目評分矩陣；使用者收視了的節目--標籤01矩陣。可以直接下載下來使用https://download.csdn.net/download/qq_38281438/10757266 具體程式碼如下： #

通俗易懂的Adaboost演算法原理分析和例項+程式碼

【尊重原創，轉載請註明出處】 http://blog.csdn.net/guyuealian/article/details/70995333 本人最初瞭解AdaBoost演算法著實是花了幾天時間，才明白他的基本原理。也許是自己能力有限吧，很多資

Adaboost演算法原理分析和例項+程式碼（簡明易懂）

【尊重原創，轉載請註明出處】 http://blog.csdn.net/guyuealian/article/details/70995333 本人最初瞭解AdaBoost演算法著實是花了幾天時間，才明白他的基本原理。也許是自己能力有限吧，很多資料也是看得懵懵懂

遞迴演算法的簡單例項

一、什麼是遞迴（遞迴百度百科）程式呼叫自身的程式設計技巧稱為遞迴（recursion）。它通常把一個大型複雜的問題層層轉化為一個與原問題相似的規模較小的問題來求解，遞迴策略只需少量的程式就可描述出解題過程所需要的多次重複計算。二、如何使用遞迴解決問題

roc曲線簡單介紹及例項程式碼

今日筆記：roc曲線還是僅供自己參考學習。。首先要注意的是roc曲線僅適用於二分類問題，不是二分類問題應首先通過各種手段轉為二分類問題。 roc橫座標為FPR，縱座標為TPR，若要知道TPR,FPR，就要從混淆矩陣說起... 漏掉了f1..f1 = 2*p*r

機器學習實戰k-鄰近演算法(kNN)簡單實施程式碼解讀

一.概念 k-鄰近演算法是最簡單的機器學習演算法之一。 k-鄰近演算法採用測量不同特徵值之間的距離（具體說是歐氏距離）的方法進行分類。輸入待分類的資料後，計算輸入特徵與樣本集資料對應特徵的距離，選擇樣本集中與輸入特徵距離最小的前k個樣本，統計這k個樣本資

電影推薦演算法例項程式碼

# -*- coding=utf-8 -*- import math import sys from texttable import Texttable # 使用 |A&B|/sqrt(|A || B |)計算餘弦距離 def calcCos

統計學習方法 k 近鄰演算法(附簡單模型程式碼)

1. k 近鄰演算法 k近鄰法（k-nearest neighbor， k-NN）是一種基本分類與迴歸方法。 k近鄰法的輸入為例項的特徵向量，對應於特徵空間的點；輸出為例項的類別，可以取多類。 k近鄰法假設給定一個訓練資料集，其中的例項類別已定。分類時，對新的例項，根

k最鄰近演算法-KNN，及python3 例項程式碼

剛讀了《machine learning in action》的KNN演算法。 K最近鄰演算法（kNN，k-NearestNeighbor），即計算到每個樣本的距離，選取前k個。從前k個選擇出大多數屬於的class來進行分類，以下特點： 1. 簡單，無需訓練 2. 樣本數量不

java網路程式設計：12、基於UDP的socket程式設計（二）程式碼通訊-簡單例項

宣告：本教程不收取任何費用，歡迎轉載，尊重作者勞動成果，不得用於商業用途，侵權必究！！！文章目錄一、基於UDP伺服器端程式的編寫二、基於UDP客戶端程式的編寫三、系列文章（java網路程式設計）通過上篇文章瞭解了基於UDP通訊的理論、基本步驟以及它跟TCP的區別

Java socket通訊例項，簡單入門socket例項程式碼

是不是看了許多socket入門知識，卻還是不能實際運用呢，這篇文章通過利用簡單例項程式講解通過socket實現客戶端與伺服器之間的通訊。這篇文章可以讓你不需要了解socket原理也能利用，便於應急，但建議之後要好好補補關於socket的基礎知識。首先就是上程式碼，先是伺服器

簡單易懂的KMP，NEXT陣列，BF演算法（例項講解）！！！

去了360面試，問了一個關於KMP的知識點，呀，完全忘了啊，太不應該了，然後就打算看看這個KMP，，，看了好多關於KMP演算法的書籍和資料，總感覺沒有說的很清楚，為什麼會產生next陣列，為什麼給出了那麼簡短的程式，沒有一個過程，而有的帖子雖然next及其字串匹配說的很清

寶具滑 / JS簡單實現決策樹(ID3演算法)

簡單Java程式碼例項助你通俗易懂的理解什麼是裝飾(者)設計模式 (Decorator)

首先拋開到處都有的文字概念。來看下面的例子。現在以：人吃飯。這個需求為例。來講解這個設計模式。 1.0：想當年，人們都比較樸實。吃飯就是簡簡單單的吃飯。那麼PersonBefore類裡面一個簡單的

資料探勘之clara演算法原理及例項(程式碼中有bug)

繼上兩篇文章介紹聚類中基於劃分思想的k-means演算法和k-mediod演算法本文將繼續介紹另外一種基於劃分思想的k-mediod演算法-----clara演算法 clara演算法可以說是對k-mediod演算法的一種改進,就如同k-mediod演算法對 k-m

Ajax 簡單的例項程式碼

AJAX = Asynchronous JavaScript and XML（非同步的 JavaScript 和 XML）。 AJAX 不是新的程式語言，而是一種使用現有標準的新方法。 AJAX 是與伺服器交換資料並更新部分網頁的藝術，在不重新載入整個頁面的情況下。

ID3演算法簡單例項(程式碼)

相關推薦