Python分類演算法交叉驗證
阿新 • • 發佈:2019-01-24
我們使用Sklearn-train_test_split隨機劃分訓練集和測試集
http://blog.csdn.net/cherdw/article/details/54881167
實驗程式碼:
import gensim
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import train_test_split
# Corpus file: one (class-balanced, label-free) document per line.
file = open(u'/home/ubuntu/file/資料平衡無分類', encoding='utf-8')
# TaggedLineDocument tags each line of the corpus with its line number.
documents = gensim.models.doc2vec.TaggedLineDocument(file)
# Output file for hyper-parameter tuning results (written by the disabled tuning loop).
fileresult=open(u'/home/ubuntu/file/調參結果' ,'w')
# How to compute a vector for a new document from the model?
# Generate the document vectors
# Use logistic regression for prediction
def LR():
    """Build and return a fresh, untrained logistic-regression classifier."""
    return LogisticRegression()
def getRecognitionRate(testPre, testClass):
    """Return the fraction of predictions that match the true labels.

    Parameters:
        testPre: sequence of predicted labels.
        testClass: sequence of true labels, same length as ``testPre``.

    Returns:
        float in [0, 1]; 0.0 for an empty input (the original raised
        ZeroDivisionError when ``testPre`` was empty).
    """
    total = len(testPre)
    if not total:  # guard: empty test set must not divide by zero
        return 0.0
    correct = sum(1 for pred, true in zip(testPre, testClass) if pred == true)
    return correct / total
def getData(model):
    """Split the doc2vec document vectors and their labels into train/test sets.

    Parameters:
        model: a trained gensim Doc2Vec model; its docvecs are the features.

    Returns:
        (X_train, y_train, X_test, y_test) from a 60/40 random split
        (note the return order differs from train_test_split's own order).
    """
    # Build a pandas DataFrame of document vectors.
    tigs = []
    data_dict = {}
    # Labels file: one class label per line, aligned with the corpus lines.
    fileclass = open(u'/home/ubuntu/file/資料平衡分類結果', encoding='utf-8')
    for tig in fileclass:
        tigs.append(tig.strip())
    # NOTE(review): range stops at len-1, dropping the last document vector —
    # inconsistent with the second script's getData, and if `tigs` then has
    # one more entry than `data` has rows, train_test_split will raise.
    # Confirm the off-by-one is intentional.
    for i in range(len(model.docvecs)-1):
        data_dict['p'+str(i)] = model.docvecs[i]
    data = pd.DataFrame(data_dict)
    # Transpose so rows = documents, columns = vector dimensions.
    data = data.T
    # data['class0'] = tigs
    X_train1, X_test1, y_train1, y_test1 = train_test_split(data, tigs, test_size=0.4, random_state=1)
    fileclass.close()
    return X_train1, y_train1, X_test1, y_test1
# Hyper-parameter tuning loop (disabled): sweep i, j and log each result.
# for i in range(20,100):
# for j in range(10,100):
# print('引數值:'+str(i)+":"+str(j))
# fileresult.write('引數值:'+str(i)+":"+str(j)+'\n')
# Train doc2vec: 80-dim vectors, window 9, ignore words seen < 40 times.
model = gensim.models.Doc2Vec(documents, size=80, window=9, min_count=40, workers=8)
T = getData(model)
# Unpack the split in getData's return order: train X/y, then test X/y.
trainMatrix, trainClass, testMatrix, testClass = T[0], T[1], T[2], T[3]
clf_LR=LR()
clf_LR.fit(trainMatrix, trainClass)
print('Logistic Regression recognition rate: '+str(getRecognitionRate(clf_LR.predict(testMatrix), testClass)))
# fileresult.write('Logistic Regression recognition rate: '+str(getRecognitionRate(clf_LR.predict(testMatrix), testClass))+'\n')
# fileresult.close()
# How to draw the ROC curve? (not implemented here)
# ROC_curve(clf_LR,testClass)
使用交叉驗證方法計算平均正確率:
import gensim
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc
# Corpus file: one (class-balanced, label-free) document per line.
file = open(u'/home/ubuntu/file/資料平衡無分類', encoding='utf-8')
# Labels file: one class label per line, aligned with the corpus lines.
fileclass=open(u'/home/ubuntu/file/資料平衡分類結果',encoding='utf-8')
documents = gensim.models.doc2vec.TaggedLineDocument(file)
# Train doc2vec: 81-dim vectors, window 8, ignore words seen < 39 times.
model = gensim.models.Doc2Vec(documents, size=81, window=8, min_count=39, workers=8)
# How to compute a vector for a new document from the model?
# Generate the document vectors
# Use logistic regression for prediction
def LR():
    """Factory: return a new, untrained LogisticRegression classifier."""
    classifier = LogisticRegression()
    return classifier
def getRecognitionRate(testPre, testClass):
    """Return classification accuracy: the share of matching label pairs.

    Parameters:
        testPre: sequence of predicted labels.
        testClass: sequence of true labels, same length as ``testPre``.

    Returns:
        float in [0, 1]; 0.0 for an empty input (the original raised
        ZeroDivisionError when ``testPre`` was empty).
    """
    total = len(testPre)
    if not total:  # guard: empty test set must not divide by zero
        return 0.0
    correct = sum(1 for pred, true in zip(testPre, testClass) if pred == true)
    return correct / total
def getData():
    """Build the doc2vec feature matrix, attach labels, and return a
    60/40 random train/test split.

    Reads the module-level ``fileclass`` handle (one label per line) and the
    module-level ``model`` (trained Doc2Vec).

    Returns:
        (X_train, y_train, X_test, y_test) — note the order differs from
        train_test_split's own return order.
    """
    # Build a pandas DataFrame of document vectors.
    tigs = []
    data_dict = {}
    for tig in fileclass:
        tigs.append(tig.strip())
    for i in range(len(model.docvecs)):
        data_dict['p' + str(i)] = model.docvecs[i]
    data = pd.DataFrame(data_dict)
    # Transpose so rows = documents, columns = vector dimensions.
    data = data.T
    data['class0'] = tigs
    # BUG FIX: the label column must not be part of the feature matrix,
    # otherwise the classifier is trained on the answer (label leakage).
    features = data.drop(columns=['class0'])
    X_train1, X_test1, y_train1, y_test1 = train_test_split(features, data['class0'], test_size=0.4, random_state=0)
    return X_train1, y_train1, X_test1, y_test1
# def ROC_curve(lr,y_test):
# pred_probas = lr.predict_proba(testMatrix)[:,1]
# fpr,tpr,_ = roc_curve(y_test, pred_probas)
# roc_auc = auc(fpr,tpr)
# plt.plot(fpr,tpr,label='area = %.2f' %roc_auc)
# plt.plot([0, 1], [0, 1], 'k--')
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])
# plt.show()
def getData_3():
    """Build the doc2vec sample matrix and split it with stratified 2-fold CV.

    Reads the module-level ``fileclass`` handle (one label per line) and the
    module-level ``model`` (trained Doc2Vec). For each fold k (1-based) it
    stores four numpy arrays in a dict under the keys
    '<k>train', '<k>trainclass', '<k>test', '<k>testclass'.

    Returns:
        dict mapping those string keys to numpy arrays.
    """
    tigs = []
    data_dict = {}
    # Build a pandas DataFrame of document vectors.
    for tig in fileclass:
        tigs.append(tig.strip())
    # NOTE(review): range stops at len-1, dropping the last docvec; if `tigs`
    # then has one more entry than the matrix has rows, the 'class0'
    # assignment below will raise — confirm the off-by-one is intentional.
    for i in range(len(model.docvecs)-1):
        data_dict['p' + str(i)] = model.docvecs[i]
    data = pd.DataFrame(data_dict)
    # Transpose so rows = documents; append the label as the last column.
    dataMatrix = data.T
    dataMatrix['class0'] = tigs
    print(dataMatrix)
    sampleData = []
    sampleClass = []
    # Split each row back into features (all but last) and label (last).
    for i in dataMatrix.index:
        tempList = dataMatrix.loc[i].values
        print(tempList)
        sampleClass.append(tempList[-1])
        sampleData.append(tempList[0:-1])
    sampleM = np.array(sampleData)  # 2-D matrix: one row per sample, columns = features
    classM = np.array(sampleClass)  # 1-D vector: class label of each sample
    # Use StratifiedKFold to generate train/test index sets for each fold.
    skf = StratifiedKFold(n_splits=2)
    setDict = {}  # dict holding the generated train/test sets of every fold
    count = 1
    for trainI, testI in skf.split(sampleM, classM):
        trainSTemp = []  # training sample data drawn in this fold
        trainCTemp = []  # training sample labels drawn in this fold
        testSTemp = []  # test sample data drawn in this fold
        testCTemp = []  # test sample labels drawn in this fold
        # Collect the training set for this fold.
        trainIndex = list(trainI)
        for t1 in range(0, len(trainIndex)):
            trainNum = trainIndex[t1]
            trainSTemp.append(list(sampleM[trainNum, :]))
            trainCTemp.append(list(classM)[trainNum])
        setDict[str(count) + 'train'] = np.array(trainSTemp)
        setDict[str(count) + 'trainclass'] = np.array(trainCTemp)
        # Collect the test set for this fold.
        testIndex = list(testI)
        for t2 in range(0, len(testIndex)):
            testNum = testIndex[t2]
            testSTemp.append(list(sampleM[testNum, :]))
            testCTemp.append(list(classM)[testNum])
        setDict[str(count) + 'test'] = np.array(testSTemp)
        setDict[str(count) + 'testclass'] = np.array(testCTemp)
        count += 1
    return setDict
# One classifier instance, re-fit on each fold below.
clf_LR=LR()
# How to draw the ROC curve? (not implemented here)
# ROC_curve(clf_LR,testClass)
setDict = getData_3()
setNums = len(setDict.keys())  # 4 entries per fold; not otherwise used below
print(setDict.keys())
LR_rate = 0.0
# Fold indices are hard-coded 1..2 to match StratifiedKFold(n_splits=2).
for i in range(1, 3):
    print(i)
    trainMatrix = setDict[str(i) + 'train']
    trainClass = setDict[str(i) + 'trainclass']
    print(len(trainClass))
    testMatrix = setDict[str(i) + 'test']
    testClass = setDict[str(i) + 'testclass']
    print(len(testClass))
    clf_LR.fit(trainMatrix, trainClass)
    # Accumulate per-fold accuracy; averaged over the 2 folds after the loop.
    LR_rate += getRecognitionRate(clf_LR.predict(testMatrix), testClass)
print('Logistic Regression mean recognition rate: ', LR_rate / 2)
getData_3方法把資料集進行分割,分別對於不同的分割進行計算正確率,最後計算平均正確率。