KNN Classification and Regression: A C++ Implementation
阿新 · Published: 2019-02-14
#include <iostream>
#include <algorithm>
#include <iomanip>
#include <fstream>
#include <sstream>
#include <cstring>
#include <string>
#include <vector>
#include <cmath>
#include <time.h>
#include <map>
using namespace std;

struct train_data {
    int index;              // training sample index
    int emotion_value;      // emotion value
    string emotion;         // emotion label
    vector<string> word;    // distinct words of the training sample
    int onehot[1000];       // entries of the one-hot (bag-of-words) vector
    double distance;        // similarity to the current test sample
    train_data(int a = 0, int b = 0, string c = "", double d = 0.0) {
        index = a;
        emotion_value = b;
        emotion = c;
        distance = d;
        word.clear();
        for (int i = 0; i < 1000; i++) onehot[i] = 0;
    }
};

struct ct {
    string s;   // emotion label
    int num;    // vote count
    ct(string a = "", int b = 0) {
        s = a;
        num = b;
    }
};

vector<string> train_text;      // each full training line
vector<string> all_words;       // all distinct words (vertical axis)
vector<train_data> all_trains;  // all training samples (horizontal axis)
int right_sum;                  // number of correct predictions

void reading_file(void);
void get_onehot(void);
void class_calculating(int);
double edistance(train_data, train_data);
double mdistance(train_data, train_data);
bool cmp(const train_data &, const train_data &);
bool cmp2(const ct &, const ct &);

int main() {
    int k;
    for (k = 1; k < 15; k++) {
        train_text.clear();
        all_words.clear();
        all_trains.clear();
        reading_file();
        get_onehot();
        cout << "input k = " << k << endl;
        cout << "number of distinct words: " << all_words.size() << endl;
        class_calculating(k);
        break;  // only k = 1 is evaluated; remove the break to sweep k from 1 to 14
    }
    return 0;
}

void reading_file() {
    ifstream train("train.txt");
    char read[100];
    string temp;
    train.getline(read, 100);             // skip the header line
    while (train.getline(read, 100)) {    // read every remaining line
        temp = read;
        train_text.push_back(temp);
    }
    train.close();

    stringstream s;
    int index;
    int emotion_value;
    string emotion;
    string word;
    for (int i = 0; i < (int)train_text.size(); i++) {
        s.str(train_text[i]);
        s >> index;
        s >> emotion_value;
        s >> emotion;
        // build a new training sample
        train_data new_train;
        new_train.index = index;
        new_train.emotion_value = emotion_value;
        new_train.emotion = emotion;
        while (s >> word) {               // read words until the line is exhausted
            // collect the global vocabulary
            bool flag1 = true;
            for (int j = 0; j < (int)all_words.size(); j++) {
                if (all_words[j] == word) {
                    flag1 = false;
                    break;
                }
            }
            if (flag1) all_words.push_back(word);
            // collect the distinct words of this sample
            bool flag2 = true;
            for (int j = 0; j < (int)new_train.word.size(); j++) {
                if (new_train.word[j] == word) {
                    flag2 = false;
                    break;
                }
            }
            if (flag2) new_train.word.push_back(word);
        }
        s.clear();
        all_trains.push_back(new_train);
    }
}

void get_onehot() {
    // set a 1 for every vocabulary word that appears in a sample
    for (int i = 0; i < (int)all_trains.size(); i++) {
        for (int j = 0; j < (int)all_trains[i].word.size(); j++) {
            for (int k = 0; k < (int)all_words.size(); k++) {
                if (all_trains[i].word[j] == all_words[k])
                    all_trains[i].onehot[k] = 1;
            }
        }
    }
}

void class_calculating(int k) {
    ifstream t("test.txt");
    right_sum = 0;
    char c[100];
    string temp;
    t.getline(c, 100);                    // skip the header line
    while (t.getline(c, 100)) {
        train_data test_train;
        char *p = strtok(c, " ");
        p = strtok(NULL, " ");
        p = strtok(NULL, " ");
        temp = p;
        test_train.emotion = temp;        // ground-truth label of the test sample
        p = strtok(NULL, " ");
        while (p != NULL) {
            temp = p;
            bool flag = true;
            for (int i = 0; i < (int)test_train.word.size(); i++) {
                if (test_train.word[i] == temp) {
                    flag = false;
                    break;
                }
            }
            if (flag) test_train.word.push_back(temp);
            p = strtok(NULL, " ");
        }
        // d1 counts test words that never appear in the training vocabulary;
        // they enlarge the test vector's norm but cannot change the training samples
        double d1 = 0;
        for (int i = 0; i < (int)test_train.word.size(); i++) {
            bool flag3 = true;
            for (int j = 0; j < (int)all_words.size(); j++) {
                if (test_train.word[i] == all_words[j]) {
                    test_train.onehot[j] = 1;
                    flag3 = false;
                    break;
                }
            }
            if (flag3) d1++;
        }
        // cosine similarity between the test sample and every training sample
        // alternative (Euclidean): all_trains[i].distance = sqrt(d1 + edistance(all_trains[i], test_train));
        for (int i = 0; i < (int)all_trains.size(); i++) {
            double part1 = 0;
            for (int j = 0; j < (int)all_words.size(); j++)
                part1 += all_trains[i].onehot[j] * test_train.onehot[j];
            all_trains[i].distance =
                part1 / (sqrt((double)all_trains[i].word.size()) *
                         sqrt(test_train.word.size() + d1));
        }
        // distance holds a similarity, so sort in descending order
        // and vote among the k most similar training samples
        sort(all_trains.begin(), all_trains.end(), cmp);

        vector<ct> v;
        v.push_back(ct("anger", 0));
        v.push_back(ct("disgust", 0));
        v.push_back(ct("fear", 0));
        v.push_back(ct("joy", 0));
        v.push_back(ct("sad", 0));
        v.push_back(ct("surprise", 0));
        for (int i = 0; i < k; i++) {
            for (int j = 0; j < (int)v.size(); j++) {
                if (all_trains[i].emotion == v[j].s) {
                    v[j].num++;
                    break;
                }
            }
        }
        sort(v.begin(), v.end(), cmp2);
        if (v.back().s == test_train.emotion) right_sum++;
    }
    cout << "correct predictions: " << right_sum << endl;
}

double edistance(train_data a, train_data b) {
    // Euclidean distance (before the square root)
    double total = 0.0;
    for (int i = 0; i < (int)all_words.size(); i++)
        total += pow(a.onehot[i] - b.onehot[i], 2);
    return total;
}

double mdistance(train_data a, train_data b) {
    // Manhattan distance
    double total = 0.0;
    for (int i = 0; i < (int)all_words.size(); i++)
        total += abs(a.onehot[i] - b.onehot[i]);
    return total;
}

bool cmp(const train_data &a, const train_data &b) {
    // higher cosine similarity first
    return a.distance > b.distance;
}

bool cmp2(const ct &a, const ct &b) {
    return a.num < b.num;
}
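The classifier above scores every training sample with plain cosine similarity between binary bag-of-words vectors; the `d1` counter only makes the test norm equal the square root of the total number of distinct test words, including those missing from the training vocabulary. The following is a minimal standalone sketch of that measure, not an extract from the program: the function name and the `std::set` inputs are illustrative assumptions.

#include <cmath>
#include <set>
#include <string>

// Illustrative sketch only: cosine similarity between two documents
// represented as sets of distinct words (binary bag-of-words).
// Out-of-vocabulary test words still count towards the test norm,
// mirroring the d1 adjustment in class_calculating().
double cosine_similarity(const std::set<std::string> &train_words,
                         const std::set<std::string> &test_words) {
    double dot = 0.0;
    for (const std::string &w : test_words)
        if (train_words.count(w)) dot += 1.0;   // shared word contributes 1 * 1
    return dot / (std::sqrt((double)train_words.size()) *
                  std::sqrt((double)test_words.size()));
}

Since the similarity is higher for closer samples, the k neighbours are taken from the top of a descending sort, and the predicted emotion is the label with the most votes among them.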
KNN Regression: Cosine Similarity
#include <iostream>
#include <algorithm>
#include <iomanip>
#include <fstream>
#include <sstream>
#include <cstring>
#include <string>
#include <vector>
#include <cmath>
#include <time.h>
#include <map>
using namespace std;

struct train_data {
    int index;               // training sample index
    int emotion_value;       // emotion value
    string emotion;          // emotion label
    vector<string> word;     // distinct words of the training sample
    int onehot[1000];        // entries of the one-hot (bag-of-words) vector
    double distance;         // cosine similarity to the current test sample
    vector<double> fre_set;  // the sample's six emotion probabilities
    train_data(int a = 0, int b = 0, string c = "", double d = 0.0) {
        index = a;
        emotion_value = b;
        emotion = c;
        distance = d;
        word.clear();
        fre_set.clear();
        for (int i = 0; i < 1000; i++) onehot[i] = 0;
    }
};

struct ct {
    string s;
    int num;
    ct(string a = "", int b = 0) {
        s = a;
        num = b;
    }
};

vector<string> train_text;      // each full training line
vector<string> all_words;       // all distinct words (vertical axis)
vector<train_data> all_trains;  // all training samples (horizontal axis)
int right_sum;                  // number of correct predictions

void reading_file(void);
void get_onehot(void);
void regre_calculating(int);
double edistance(train_data, train_data);
double mdistance(train_data, train_data);
bool cmp(const train_data &, const train_data &);

int main() {
    train_text.clear();
    all_words.clear();
    all_trains.clear();
    reading_file();     // the training vocabulary contains 904 distinct words
    get_onehot();
    int k;
    cout << "input k = ";
    cin >> k;
    regre_calculating(k);
    return 0;
}

void reading_file() {
    ifstream t("Dataset_train.csv");
    char c[150];
    string temp;
    t.getline(c, 150);                 // skip the header line
    while (t.getline(c, 150)) {
        train_data new_train;
        char d[150];
        strcpy(d, c);                  // keep a copy: strtok modifies its input
        char *p = strtok(c, ",");
        p = strtok(NULL, ",");         // second column: the text of the sample
        char *p2 = strtok(p, " ");
        while (p2 != NULL) {
            string word = p2;
            // collect the global vocabulary
            bool flag1 = true;
            for (int i = 0; i < (int)all_words.size(); i++) {
                if (all_words[i] == word) {
                    flag1 = false;
                    break;
                }
            }
            if (flag1) all_words.push_back(word);
            // collect the distinct words of this sample
            bool flag2 = true;
            for (int i = 0; i < (int)new_train.word.size(); i++) {
                if (new_train.word[i] == word) {
                    flag2 = false;
                    break;
                }
            }
            if (flag2) new_train.word.push_back(word);
            p2 = strtok(NULL, " ");
        }
        // parse the six emotion probabilities from the copied line
        char *p3 = strtok(d, ",");
        p3 = strtok(NULL, ",");
        p3 = strtok(NULL, ",");
        stringstream ss;
        double fre;
        while (p3 != NULL) {
            temp = p3;
            ss.str(temp);
            ss >> fre;
            new_train.fre_set.push_back(fre);
            ss.clear();
            p3 = strtok(NULL, ",");
        }
        all_trains.push_back(new_train);
    }
}

void get_onehot() {
    // set a 1 for every vocabulary word that appears in a sample
    for (int i = 0; i < (int)all_trains.size(); i++) {
        for (int j = 0; j < (int)all_trains[i].word.size(); j++) {
            for (int k = 0; k < (int)all_words.size(); k++) {
                if (all_trains[i].word[j] == all_words[k])
                    all_trains[i].onehot[k] = 1;
            }
        }
    }
}

void regre_calculating(int k) {
    ifstream t("Dataset_validation.csv");
    char c[150];
    string temp;
    t.getline(c, 150);                 // skip the header line
    ofstream out("14353324_xiangketing_regression.txt");
    while (t.getline(c, 150)) {
        train_data test_train;
        char *p = strtok(c, ",");
        p = strtok(NULL, ",");         // second column: the text of the sample
        char *p2 = strtok(p, " ");
        while (p2 != NULL) {
            temp = p2;
            bool flag = true;
            for (int i = 0; i < (int)test_train.word.size(); i++) {
                if (test_train.word[i] == temp) {
                    flag = false;
                    break;
                }
            }
            if (flag) test_train.word.push_back(temp);
            p2 = strtok(NULL, " ");
        }
        // d1 counts test words missing from the training vocabulary;
        // they enlarge the test vector's norm but cannot change the training samples
        double d1 = 0;
        for (int i = 0; i < (int)test_train.word.size(); i++) {
            bool flag3 = true;
            for (int j = 0; j < (int)all_words.size(); j++) {
                if (test_train.word[i] == all_words[j]) {
                    test_train.onehot[j] = 1;
                    flag3 = false;
                    break;
                }
            }
            if (flag3) d1++;
        }
        // cosine similarity between the test sample and every training sample
        // alternative (Euclidean): all_trains[i].distance = sqrt(d1 + edistance(all_trains[i], test_train));
        for (int i = 0; i < (int)all_trains.size(); i++) {
            double part1 = 0;
            for (int j = 0; j < (int)all_words.size(); j++)
                part1 += all_trains[i].onehot[j] * test_train.onehot[j];
            all_trains[i].distance =
                part1 / (sqrt((double)all_trains[i].word.size()) *
                         sqrt(test_train.word.size() + d1));
        }
        // higher similarity first, then take the k nearest neighbours
        sort(all_trains.begin(), all_trains.end(), cmp);
        // similarity-weighted average of the neighbours' probability vectors,
        // normalised so the six outputs sum to 1
        double a[6];
        double sum = 0;
        for (int i = 0; i < 6; i++) {
            double value = 0;
            for (int j = 0; j < k; j++)
                value += all_trains[j].fre_set[i] * all_trains[j].distance;
            a[i] = value;
            sum += value;
        }
        out << a[0] / sum << '\t' << a[1] / sum << '\t' << a[2] / sum << '\t'
            << a[3] / sum << '\t' << a[4] / sum << '\t' << a[5] / sum << endl;
    }
}

double edistance(train_data a, train_data b) {
    // Euclidean distance (before the square root)
    double total = 0.0;
    for (int i = 0; i < (int)all_words.size(); i++)
        total += pow(a.onehot[i] - b.onehot[i], 2);
    return total;
}

double mdistance(train_data a, train_data b) {
    // Manhattan distance
    double total = 0.0;
    for (int i = 0; i < (int)all_words.size(); i++)
        total += abs(a.onehot[i] - b.onehot[i]);
    return total;
}

bool cmp(const train_data &a, const train_data &b) {
    // higher cosine similarity first
    return a.distance > b.distance;
}
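The regression step differs from the classifier only in how the k neighbours are combined: instead of voting on a label, it averages their six emotion probability vectors, weighting each neighbour by its cosine similarity and renormalising so the outputs sum to 1. The following is a minimal sketch of just that combination step under assumed inputs; `weighted_prediction`, `neighbour_probs`, and `similarities` are hypothetical names, not part of the program above.

#include <vector>

// Illustrative sketch only: given the k nearest neighbours' probability
// vectors (k x 6) and their cosine similarities (length k), compute the
// weighted, renormalised prediction that regre_calculating() writes out.
std::vector<double> weighted_prediction(
        const std::vector<std::vector<double> > &neighbour_probs,
        const std::vector<double> &similarities) {
    std::vector<double> pred(6, 0.0);
    double sum = 0.0;
    for (size_t j = 0; j < neighbour_probs.size(); j++) {
        for (int i = 0; i < 6; i++) {
            pred[i] += neighbour_probs[j][i] * similarities[j];
            sum += neighbour_probs[j][i] * similarities[j];
        }
    }
    for (int i = 0; i < 6; i++) pred[i] /= sum;   // the six outputs sum to 1
    return pred;
}

Dividing by the total weight keeps the prediction a valid probability distribution even when the similarity weights do not sum to 1.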