
Softmax Regression in C++

Preface

The theory behind the Softmax regression model was covered in the previous post. The C++ code comes from an open-source project whose link I have unfortunately forgotten; I will add it if I ever find it again. My changes to the original source are small, mostly a few extensions.

Environment

Visual Studio 2013

Data

The dataset contains 20,000 samples, each with 16 integer features; it is loaded below from letter-recognition.data.
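Each line holds a capital-letter label followed by the 16 comma-separated integer features, which is exactly the layout loadData below expects. A representative line (shown only to illustrate the format, as it appears in the public UCI copy of the dataset):

T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8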

Goal

Classify the letter samples in the dataset.

Code

1. Define a LogisticRegression class:

Header file LogisticRegression.h

#include <iostream>
#include <cmath>
#include <cstdio>   // fopen/fgets/sscanf, used in loadData
#include <cstring>  // strchr, used in loadData
#include <algorithm>
#include <functional>
#include <string>
#include <cassert>
#include <vector>
using namespace std;
class LogisticRegression {
public:
	LogisticRegression(int inputSize, int k, int dataSize, int num_iters, double learningRate);
	~LogisticRegression();
	bool loadData(const string& filename); // load the training data
	void train();                          // training routine
	void softmax(double* thetaX);          // turn theta*x scores into class probabilities
	double predict(double* x);             // predict the class index of a sample
	double** getX();
	double** getY();
	void printX();
	void printY();
	void printTheta();
private:
	int inputSize;  // number of input features, excluding the bias term
	int k;          // number of classes
	int dataSize;   // number of samples
	int num_iters;  // number of training iterations
	double **theta; // learned weight parameters, k x inputSize
	double alpha;   // learning rate
	double** x;     // training data, dataSize x inputSize
	double** y;     // one-hot labels for the training data, dataSize x k
};

Implementation file LogisticRegression.cpp

#include "LogisticRegression.h"
LogisticRegression::LogisticRegression(int in, int out,int size, int num_iters,double learningRate) {
	inputSize = in;
	k = out;
	alpha = learningRate;
	dataSize = size; 
	this->num_iters = num_iters;
	// initialize theta
	theta = new double*[k];
	for (int i = 0; i<k; i++) theta[i] = new double[inputSize];
	for (int i = 0; i<k; i++) {
		for (int j = 0; j<inputSize; j++) {
			theta[i][j] = 0;
		}
	}
	//initialize x
	x = new double*[dataSize];
	for (int i = 0; i<dataSize; i++) x[i] = new double[inputSize];
	for (int i = 0; i<dataSize; i++) {
		for (int j = 0; j<inputSize; j++) {
			x[i][j] = 0;
		}
	}
	//initialize y
	y = new double*[dataSize];
	for (int i = 0; i<dataSize; i++) y[i] = new double[k];
	for (int i = 0; i<dataSize; i++) {
		for (int j = 0; j<k; j++) {
			y[i][j] = 0;
		}
	}
}

LogisticRegression::~LogisticRegression() {
	for (int i = 0; i<k; i++) delete[] theta[i];
	delete[] theta;
	for (int i = 0; i < dataSize; i++)
	{
		delete[] x[i];
		delete[] y[i];
	}
	delete[] x;
	delete[] y;
}

void LogisticRegression::train() {
	for (int n = 0; n < num_iters; n++)
	{
		for (int s = 0; s < dataSize; s++)
		{
			double *py_x = new double[k];
			double *dy = new double[k];
			//1. compute the scores theta * x for sample s
			for (int i = 0; i<k; i++) {
				py_x[i] = 0;
				for (int j = 0; j<inputSize; j++) {
					py_x[i] += theta[i][j] * x[s][j];
				}
			}
			//2. convert the scores into class probabilities
			softmax(py_x);
			for (int i = 0; i<k; i++) {
				dy[i] = y[s][i] - py_x[i]; // difference between the one-hot label and the prediction

				for (int j = 0; j<inputSize; j++) {
					theta[i][j] += alpha * dy[i] * x[s][j] / dataSize;
				}
			}
			delete[] py_x;
			delete[] dy;
		}
	}
}
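For reference, the update implemented above is per-sample gradient ascent on the softmax log-likelihood (with the extra 1/dataSize scaling the original code applies to each step):

$$p_i = \frac{e^{\theta_i^\top x}}{\sum_{c=1}^{k} e^{\theta_c^\top x}}, \qquad \theta_i \leftarrow \theta_i + \frac{\alpha}{N}\,\bigl(\mathbf{1}\{y = i\} - p_i\bigr)\,x$$

where $\mathbf{1}\{y=i\}$ is the one-hot entry y[s][i], $p_i$ is py_x[i] after softmax, and $N$ is dataSize.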

void LogisticRegression::softmax(double *x) {
	double max = x[0]; // start from x[0]: initializing to 0 would be wrong if all scores are negative
	double sum = 0.0;

	for (int i = 1; i<k; i++) if (max < x[i]) max = x[i];
	for (int i = 0; i<k; i++) {
		x[i] = exp(x[i] - max); // shift by the max to avoid overflow in exp
		sum += x[i];
	}

	for (int i = 0; i<k; i++) x[i] /= sum;
}
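The shift by the max matters more than it might look: with 16 integer features and growing weights, the raw scores theta*x can get large. A minimal standalone check (my example, not part of the project) shows the failure mode:

#include <cmath>
#include <cstdio>
int main() {
	double score = 1000.0;              // a large raw score
	printf("%g\n", exp(score));         // inf: exp overflows a double at roughly 709
	printf("%g\n", exp(score - score)); // 1: after shifting by the max, exp stays in range
	return 0;
}

Subtracting the max leaves the probabilities unchanged, because the common factor exp(-max) cancels in the normalization.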


double LogisticRegression::predict(double *x) {
	double clsLabel = 0;
	double* predictY = new double[k];
	for (int i = 0; i < k; i++) {
		predictY[i] = 0;
		for (int j = 0; j < inputSize; j++) {
			predictY[i] += theta[i][j] * x[j];
		}
	}
	softmax(predictY);
	double max = 0;
	for (int i = 0; i < k; i++)
	{
		if (predictY[i] > max) {
			clsLabel = i;
			max = predictY[i];
		}
	}
	delete[] predictY; // the original leaked this buffer on every call
	return clsLabel;   // index of the most probable class
}

double** LogisticRegression::getX()
{
	return x;
}
double** LogisticRegression::getY()
{
	return y;
}

bool LogisticRegression::loadData (const string& filename)
{
	const int M = 1024;
	char buf[M + 2];
	int i;
	FILE* f = fopen(filename.c_str(), "rt");
	if (!f)
	{
		cout << "Could not read the database " << filename << endl;
		return false;
	}
	int rowIndex = 0;
	for (;;)
	{
		char* ptr;
		if (!fgets(buf, M, f) || !strchr(buf, ','))// stop on EOF or on a line with no comma (not a data row)
			break;
		y[rowIndex][buf[0] - 'A'] = 1; // the first character is the letter label; one-hot encode it
		ptr = buf + 2;                 // skip the label and the comma after it
		for (i = 0; i < inputSize; i++)
		{
			int n = 0; // %n receives how many characters sscanf consumed
			int m = 0;
			sscanf(ptr, "%d%n", &m, &n); // read the next integer feature from the line
			x[rowIndex][i] = m;
			ptr += n + 1;                // advance past the number and the following comma
		}
		rowIndex++;
		if (rowIndex >= dataSize) break;
		if (i < inputSize)
			break;
	}
	fclose(f);
	cout << "The database " << filename << " is loaded.\n";
	return true;
}

void LogisticRegression::printX()
{
	for (int i = 0; i<dataSize; i++) {
		for (int j = 0; j<inputSize; j++) {
			cout << x[i][j] << " ";
		}
		cout << endl;
	}
}
void LogisticRegression::printY()
{
	for (int i = 0; i<dataSize; i++) {
		for (int j = 0; j<k; j++) {
			cout << y[i][j] << " ";
		}
		cout << endl;
	}
}

void LogisticRegression::printTheta()
{
	for (int i = 0; i < k; i++) {
		for (int j = 0; j < inputSize; j++) {
			cout << theta[i][j] << " ";
		}
		cout << endl;
	}

}
2. Test the code:
#include "LogisticRegression.h"
void letter_recog()
{
	double learning_rate = 0.1;
	int num_iters = 500;  // number of training iterations
	int train_N = 10000;  // number of training samples
	int test_N = 8;       // number of test samples
	int n_in = 16;        // input feature dimension
	int n_out = 26;       // number of classes (letters A..Z)
	LogisticRegression classifier(n_in, n_out, train_N, num_iters, learning_rate);
	classifier.loadData("letter-recognition.data");
	// train
	classifier.train();
	// test data
	double test_X[8][16] = {
		{ 5, 10, 6, 8, 4, 7, 7, 12, 2, 7, 9, 8, 9, 6, 0, 8 },//M
		{ 6, 12, 7, 6, 5, 8, 8, 3, 3, 6, 9, 7, 10, 10, 3, 6 },//W
		{ 3, 8, 4, 6, 4, 7, 7, 12, 1, 6, 6, 8, 5, 8, 0, 8 },//N
		{ 1, 0, 1, 0, 0, 7, 8, 10, 1, 7, 5, 8, 2, 8, 0, 8 },//H
		{ 3, 6, 5, 5, 6, 6, 8, 3, 3, 6, 5, 9, 6, 7, 5, 9 },//R
		{ 7, 11, 11, 8, 7, 4, 8, 2, 9, 10, 11, 9, 5, 8, 5, 4 }, //X
		{ 6, 9, 6, 4, 4, 8, 9, 5, 3, 10, 5, 5, 5, 10, 5, 6 },//P	
		{ 4, 7, 6, 5, 5, 8, 5, 7, 4, 6, 7, 9, 3, 7, 6, 9 }//Q
	};
	// test
	for (int i = 0; i<test_N; i++) {
		double predict = classifier.predict(test_X[i]);
		char char_predict = 'A' + (int)predict; // cast the returned class index before adding the character offset
		cout << "predict:" << char_predict << endl;
	}
}

int main() {
	letter_recog();
	getchar(); // keep the console window open on exit
	return 0;
}

Output:

The classifier was trained on the first 10,000 samples, and the whole training run took 328.117 s. To speed things up, I used OpenMP to parallelize the for loops. Enabling OpenMP in Visual Studio is simple: open Project --> Properties, and under C/C++ --> Language turn on Open MP Support. The modified train function:
void LogisticRegression::train() {
	for (int n = 0; n < num_iters; n++)
	{
#pragma omp parallel for  
		for (int s = 0; s < dataSize; s++)
		{
			// per-sample scratch buffers: each thread allocates its own, so these stay race-free
			double *py_x = new double[k];
			double *dy = new double[k];
			//1. compute the scores theta * x for sample s
			for (int i = 0; i<k; i++) {
				py_x[i] = 0;
				for (int j = 0; j<inputSize; j++) {
					py_x[i] += theta[i][j] * x[s][j];
				}
			}
			//2. convert the scores into class probabilities
			softmax(py_x);
			// note: all threads read and update theta concurrently without locks below,
			// so writes can race (Hogwild-style); SGD usually tolerates this
#pragma omp parallel for // inert inside the outer parallel region unless nested parallelism is enabled
			for (int i = 0; i<k; i++) {
				dy[i] = y[s][i] - py_x[i]; // difference between the one-hot label and the prediction
				for (int j = 0; j<inputSize; j++) {
					theta[i][j] += alpha * dy[i] * x[s][j] / dataSize; //- lambda*theta[i][j];
				}
			}
			delete[] py_x;
			delete[] dy;
		}
	}
}
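If the racy weight updates bother you, one race-free variant (my sketch, not part of the original code) makes each update atomic, trading some speed for clean writes. Replace the innermost j loop above with:

			for (int j = 0; j < inputSize; j++) {
				double delta = alpha * dy[i] * x[s][j] / dataSize;
#pragma omp atomic
				theta[i][j] += delta; // serializes concurrent writes to the same weight
			}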
The modified softmax function:
void LogisticRegression::softmax(double *x) {
	// when called from inside train()'s parallel region, these pragmas run on the calling thread only
	double max = x[0];
	double sum = 0.0;
	for (int i = 1; i<k; i++) if (max < x[i]) max = x[i];
#pragma omp parallel for reduction(+:sum) // the reduction clause avoids a data race on sum
	for (int i = 0; i<k; i++) {
		x[i] = exp(x[i] - max); // shift by the max to avoid overflow
		sum += x[i];
	}
#pragma omp parallel for  
	for (int i = 0; i<k; i++) x[i] /= sum;
}
Output: training time dropped from 328.117 s to 49.081 s, a 6.68x speedup.
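The post does not show the timing code; a minimal way to reproduce the measurement (my addition, using std::chrono) is to wrap the call to train() in letter_recog():

#include <chrono>
// ...
auto t0 = std::chrono::steady_clock::now();
classifier.train();
auto t1 = std::chrono::steady_clock::now();
cout << "training took "
	<< std::chrono::duration<double>(t1 - t0).count() << " s" << endl;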
Judging from the test results, the classifier mispredicts R as K and X as U. This post does not rigorously measure the classifier's accuracy; interested readers can test it themselves.
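If you want a quick number, here is a rough sketch (my addition, built only on the class's public API): it loads all 20,000 rows into a second, untrained LogisticRegression used purely as a data holder, then scores the 10,000 rows the classifier never saw during training:

// Held-out accuracy on rows 10000..19999 (assumes classifier was trained on the first 10000).
LogisticRegression holder(n_in, n_out, 20000, 0, 0.0); // data holder only; never trained
holder.loadData("letter-recognition.data");
double **hx = holder.getX();
double **hy = holder.getY();
int correct = 0;
for (int s = 10000; s < 20000; s++) {
	int pred = (int)classifier.predict(hx[s]);
	if (hy[s][pred] == 1) correct++; // one-hot label: correct if the predicted column is set
}
cout << "held-out accuracy: " << 100.0 * correct / 10000.0 << "%" << endl;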