KNN Classification and Regression: A C++ Implementation
阿新 · Published: 2019-02-14
#include <iostream>
#include <algorithm>
#include <iomanip>
#include <fstream>
#include <sstream>
#include <cstring>
#include <string>
#include <vector>
#include <cmath>
#include <time.h>
#include <map>
using namespace std;

struct train_data {
    int index;              // training sample index
    int emotion_value;      // emotion value
    string emotion;         // emotion label
    vector<string> word;    // distinct words of the training sample
    int onehot[1000];       // entries of the one-hot (bag-of-words) vector
    double distance;        // similarity to the current test sample
    train_data(int a = 0, int b = 0, string c = "", double d = 0.0) {
        index = a;
        emotion_value = b;
        emotion = c;
        distance = d;
        word.clear();
        for (int i = 0; i < 1000; i++) onehot[i] = 0;
    }
};

struct ct {
    string s;   // emotion label
    int num;    // vote count
    ct(string a = "", int b = 0) {
        s = a;
        num = b;
    }
};

vector<string> train_text;      // each full training line
vector<string> all_words;       // all distinct words (vertical axis)
vector<train_data> all_trains;  // all training samples (horizontal axis)
int right_sum;                  // number of correct predictions

void reading_file(void);
void get_onehot(void);
void class_calculating(int);
double edistance(train_data, train_data);
double mdistance(train_data, train_data);
bool cmp(const train_data &, const train_data &);
bool cmp2(const ct &, const ct &);

int main() {
    int k;
    for (k = 1; k < 15; k++) {
        train_text.clear();
        all_words.clear();
        all_trains.clear();
        reading_file();
        get_onehot();
        cout << "input k = " << k << endl;
        cout << "number of distinct words: " << all_words.size() << endl;
        class_calculating(k);
        break;  // only k = 1 is evaluated; remove the break to sweep k from 1 to 14
    }
    return 0;
}

void reading_file() {
    ifstream train("train.txt");
    char read[100];
    string temp;
    train.getline(read, 100);             // skip the header line
    while (train.getline(read, 100)) {    // read every remaining line
        temp = read;
        train_text.push_back(temp);
    }
    train.close();

    stringstream s;
    int index;
    int emotion_value;
    string emotion;
    string word;
    for (int i = 0; i < (int)train_text.size(); i++) {
        s.str(train_text[i]);
        s >> index;
        s >> emotion_value;
        s >> emotion;
        // build a new training sample
        train_data new_train;
        new_train.index = index;
        new_train.emotion_value = emotion_value;
        new_train.emotion = emotion;
        while (s >> word) {               // read words until the line is exhausted
            // collect the global vocabulary
            bool flag1 = true;
            for (int j = 0; j < (int)all_words.size(); j++) {
                if (all_words[j] == word) {
                    flag1 = false;
                    break;
                }
            }
            if (flag1) all_words.push_back(word);
            // collect the distinct words of this sample
            bool flag2 = true;
            for (int j = 0; j < (int)new_train.word.size(); j++) {
                if (new_train.word[j] == word) {
                    flag2 = false;
                    break;
                }
            }
            if (flag2) new_train.word.push_back(word);
        }
        s.clear();
        all_trains.push_back(new_train);
    }
}

void get_onehot() {
    // set a 1 for every vocabulary word that appears in a sample
    for (int i = 0; i < (int)all_trains.size(); i++) {
        for (int j = 0; j < (int)all_trains[i].word.size(); j++) {
            for (int k = 0; k < (int)all_words.size(); k++) {
                if (all_trains[i].word[j] == all_words[k])
                    all_trains[i].onehot[k] = 1;
            }
        }
    }
}

void class_calculating(int k) {
    ifstream t("test.txt");
    right_sum = 0;
    char c[100];
    string temp;
    t.getline(c, 100);                    // skip the header line
    while (t.getline(c, 100)) {
        train_data test_train;
        char *p = strtok(c, " ");
        p = strtok(NULL, " ");
        p = strtok(NULL, " ");
        temp = p;
        test_train.emotion = temp;        // ground-truth label of the test sample
        p = strtok(NULL, " ");
        while (p != NULL) {
            temp = p;
            bool flag = true;
            for (int i = 0; i < (int)test_train.word.size(); i++) {
                if (test_train.word[i] == temp) {
                    flag = false;
                    break;
                }
            }
            if (flag) test_train.word.push_back(temp);
            p = strtok(NULL, " ");
        }
        // d1 counts test words that never appear in the training vocabulary;
        // they enlarge the test vector's norm but cannot change the training samples
        double d1 = 0;
        for (int i = 0; i < (int)test_train.word.size(); i++) {
            bool flag3 = true;
            for (int j = 0; j < (int)all_words.size(); j++) {
                if (test_train.word[i] == all_words[j]) {
                    test_train.onehot[j] = 1;
                    flag3 = false;
                    break;
                }
            }
            if (flag3) d1++;
        }
        // cosine similarity between the test sample and every training sample
        // alternative (Euclidean): all_trains[i].distance = sqrt(d1 + edistance(all_trains[i], test_train));
        for (int i = 0; i < (int)all_trains.size(); i++) {
            double part1 = 0;
            for (int j = 0; j < (int)all_words.size(); j++)
                part1 += all_trains[i].onehot[j] * test_train.onehot[j];
            all_trains[i].distance =
                part1 / (sqrt((double)all_trains[i].word.size()) *
                         sqrt(test_train.word.size() + d1));
        }
        // distance holds a similarity, so sort in descending order
        // and vote among the k most similar training samples
        sort(all_trains.begin(), all_trains.end(), cmp);

        vector<ct> v;
        v.push_back(ct("anger", 0));
        v.push_back(ct("disgust", 0));
        v.push_back(ct("fear", 0));
        v.push_back(ct("joy", 0));
        v.push_back(ct("sad", 0));
        v.push_back(ct("surprise", 0));
        for (int i = 0; i < k; i++) {
            for (int j = 0; j < (int)v.size(); j++) {
                if (all_trains[i].emotion == v[j].s) {
                    v[j].num++;
                    break;
                }
            }
        }
        sort(v.begin(), v.end(), cmp2);
        if (v.back().s == test_train.emotion) right_sum++;
    }
    cout << "correct predictions: " << right_sum << endl;
}

double edistance(train_data a, train_data b) {
    // Euclidean distance (before the square root)
    double total = 0.0;
    for (int i = 0; i < (int)all_words.size(); i++)
        total += pow(a.onehot[i] - b.onehot[i], 2);
    return total;
}

double mdistance(train_data a, train_data b) {
    // Manhattan distance
    double total = 0.0;
    for (int i = 0; i < (int)all_words.size(); i++)
        total += abs(a.onehot[i] - b.onehot[i]);
    return total;
}

bool cmp(const train_data &a, const train_data &b) {
    // higher cosine similarity first
    return a.distance > b.distance;
}

bool cmp2(const ct &a, const ct &b) {
    return a.num < b.num;
}
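The classifier above scores every training sample with plain cosine similarity between binary bag-of-words vectors; the `d1` counter only makes the test norm equal the square root of the total number of distinct test words, including those missing from the training vocabulary. The following is a minimal standalone sketch of that measure, not an extract from the program: the function name and the `std::set` inputs are illustrative assumptions.

#include <cmath>
#include <set>
#include <string>

// Illustrative sketch only: cosine similarity between two documents
// represented as sets of distinct words (binary bag-of-words).
// Out-of-vocabulary test words still count towards the test norm,
// mirroring the d1 adjustment in class_calculating().
double cosine_similarity(const std::set<std::string> &train_words,
                         const std::set<std::string> &test_words) {
    double dot = 0.0;
    for (const std::string &w : test_words)
        if (train_words.count(w)) dot += 1.0;   // shared word contributes 1 * 1
    return dot / (std::sqrt((double)train_words.size()) *
                  std::sqrt((double)test_words.size()));
}

Since the similarity is higher for closer samples, the k neighbours are taken from the top of a descending sort, and the predicted emotion is the label with the most votes among them.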
KNN Regression: Cosine Similarity
#include <iostream>
#include <algorithm>
#include <iomanip>
#include <fstream>
#include <sstream>
#include <cstring>
#include <string>
#include <vector>
#include <cmath>
#include <time.h>
#include <map>
using namespace std;

struct train_data {
    int index;               // training sample index
    int emotion_value;       // emotion value
    string emotion;          // emotion label
    vector<string> word;     // distinct words of the training sample
    int onehot[1000];        // entries of the one-hot (bag-of-words) vector
    double distance;         // cosine similarity to the current test sample
    vector<double> fre_set;  // the sample's six emotion probabilities
    train_data(int a = 0, int b = 0, string c = "", double d = 0.0) {
        index = a;
        emotion_value = b;
        emotion = c;
        distance = d;
        word.clear();
        fre_set.clear();
        for (int i = 0; i < 1000; i++) onehot[i] = 0;
    }
};

struct ct {
    string s;
    int num;
    ct(string a = "", int b = 0) {
        s = a;
        num = b;
    }
};

vector<string> train_text;      // each full training line
vector<string> all_words;       // all distinct words (vertical axis)
vector<train_data> all_trains;  // all training samples (horizontal axis)
int right_sum;                  // number of correct predictions

void reading_file(void);
void get_onehot(void);
void regre_calculating(int);
double edistance(train_data, train_data);
double mdistance(train_data, train_data);
bool cmp(const train_data &, const train_data &);

int main() {
    train_text.clear();
    all_words.clear();
    all_trains.clear();
    reading_file();     // the training vocabulary contains 904 distinct words
    get_onehot();
    int k;
    cout << "input k = ";
    cin >> k;
    regre_calculating(k);
    return 0;
}

void reading_file() {
    ifstream t("Dataset_train.csv");
    char c[150];
    string temp;
    t.getline(c, 150);                 // skip the header line
    while (t.getline(c, 150)) {
        train_data new_train;
        char d[150];
        strcpy(d, c);                  // keep a copy: strtok modifies its input
        char *p = strtok(c, ",");
        p = strtok(NULL, ",");         // second column: the text of the sample
        char *p2 = strtok(p, " ");
        while (p2 != NULL) {
            string word = p2;
            // collect the global vocabulary
            bool flag1 = true;
            for (int i = 0; i < (int)all_words.size(); i++) {
                if (all_words[i] == word) {
                    flag1 = false;
                    break;
                }
            }
            if (flag1) all_words.push_back(word);
            // collect the distinct words of this sample
            bool flag2 = true;
            for (int i = 0; i < (int)new_train.word.size(); i++) {
                if (new_train.word[i] == word) {
                    flag2 = false;
                    break;
                }
            }
            if (flag2) new_train.word.push_back(word);
            p2 = strtok(NULL, " ");
        }
        // parse the six emotion probabilities from the copied line
        char *p3 = strtok(d, ",");
        p3 = strtok(NULL, ",");
        p3 = strtok(NULL, ",");
        stringstream ss;
        double fre;
        while (p3 != NULL) {
            temp = p3;
            ss.str(temp);
            ss >> fre;
            new_train.fre_set.push_back(fre);
            ss.clear();
            p3 = strtok(NULL, ",");
        }
        all_trains.push_back(new_train);
    }
}

void get_onehot() {
    // set a 1 for every vocabulary word that appears in a sample
    for (int i = 0; i < (int)all_trains.size(); i++) {
        for (int j = 0; j < (int)all_trains[i].word.size(); j++) {
            for (int k = 0; k < (int)all_words.size(); k++) {
                if (all_trains[i].word[j] == all_words[k])
                    all_trains[i].onehot[k] = 1;
            }
        }
    }
}

void regre_calculating(int k) {
    ifstream t("Dataset_validation.csv");
    char c[150];
    string temp;
    t.getline(c, 150);                 // skip the header line
    ofstream out("14353324_xiangketing_regression.txt");
    while (t.getline(c, 150)) {
        train_data test_train;
        char *p = strtok(c, ",");
        p = strtok(NULL, ",");         // second column: the text of the sample
        char *p2 = strtok(p, " ");
        while (p2 != NULL) {
            temp = p2;
            bool flag = true;
            for (int i = 0; i < (int)test_train.word.size(); i++) {
                if (test_train.word[i] == temp) {
                    flag = false;
                    break;
                }
            }
            if (flag) test_train.word.push_back(temp);
            p2 = strtok(NULL, " ");
        }
        // d1 counts test words missing from the training vocabulary;
        // they enlarge the test vector's norm but cannot change the training samples
        double d1 = 0;
        for (int i = 0; i < (int)test_train.word.size(); i++) {
            bool flag3 = true;
            for (int j = 0; j < (int)all_words.size(); j++) {
                if (test_train.word[i] == all_words[j]) {
                    test_train.onehot[j] = 1;
                    flag3 = false;
                    break;
                }
            }
            if (flag3) d1++;
        }
        // cosine similarity between the test sample and every training sample
        // alternative (Euclidean): all_trains[i].distance = sqrt(d1 + edistance(all_trains[i], test_train));
        for (int i = 0; i < (int)all_trains.size(); i++) {
            double part1 = 0;
            for (int j = 0; j < (int)all_words.size(); j++)
                part1 += all_trains[i].onehot[j] * test_train.onehot[j];
            all_trains[i].distance =
                part1 / (sqrt((double)all_trains[i].word.size()) *
                         sqrt(test_train.word.size() + d1));
        }
        // higher similarity first, then take the k nearest neighbours
        sort(all_trains.begin(), all_trains.end(), cmp);
        // similarity-weighted average of the neighbours' probability vectors,
        // normalised so the six outputs sum to 1
        double a[6];
        double sum = 0;
        for (int i = 0; i < 6; i++) {
            double value = 0;
            for (int j = 0; j < k; j++)
                value += all_trains[j].fre_set[i] * all_trains[j].distance;
            a[i] = value;
            sum += value;
        }
        out << a[0] / sum << '\t' << a[1] / sum << '\t' << a[2] / sum << '\t'
            << a[3] / sum << '\t' << a[4] / sum << '\t' << a[5] / sum << endl;
    }
}

double edistance(train_data a, train_data b) {
    // Euclidean distance (before the square root)
    double total = 0.0;
    for (int i = 0; i < (int)all_words.size(); i++)
        total += pow(a.onehot[i] - b.onehot[i], 2);
    return total;
}

double mdistance(train_data a, train_data b) {
    // Manhattan distance
    double total = 0.0;
    for (int i = 0; i < (int)all_words.size(); i++)
        total += abs(a.onehot[i] - b.onehot[i]);
    return total;
}

bool cmp(const train_data &a, const train_data &b) {
    // higher cosine similarity first
    return a.distance > b.distance;
}
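The regression step differs from the classifier only in how the k neighbours are combined: instead of voting on a label, it averages their six emotion probability vectors, weighting each neighbour by its cosine similarity and renormalising so the outputs sum to 1. The following is a minimal sketch of just that combination step under assumed inputs; `weighted_prediction`, `neighbour_probs`, and `similarities` are hypothetical names, not part of the program above.

#include <vector>

// Illustrative sketch only: given the k nearest neighbours' probability
// vectors (k x 6) and their cosine similarities (length k), compute the
// weighted, renormalised prediction that regre_calculating() writes out.
std::vector<double> weighted_prediction(
        const std::vector<std::vector<double> > &neighbour_probs,
        const std::vector<double> &similarities) {
    std::vector<double> pred(6, 0.0);
    double sum = 0.0;
    for (size_t j = 0; j < neighbour_probs.size(); j++) {
        for (int i = 0; i < 6; i++) {
            pred[i] += neighbour_probs[j][i] * similarities[j];
            sum += neighbour_probs[j][i] * similarities[j];
        }
    }
    for (int i = 0; i < 6; i++) pred[i] /= sum;   // the six outputs sum to 1
    return pred;
}

Dividing by the total weight keeps the prediction a valid probability distribution even when the similarity weights do not sum to 1.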