1. 程式人生 > Python分類演算法交叉驗證

Python分類演算法交叉驗證

我們使用Sklearn-train_test_split隨機劃分訓練集和測試集

http://blog.csdn.net/cherdw/article/details/54881167

實驗程式碼:

import gensim
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import train_test_split

# Corpus: one document per line; TaggedLineDocument tags each line by its index.
file = open(u'/home/ubuntu/file/資料平衡無分類', encoding='utf-8')
documents = gensim.models.doc2vec.TaggedLineDocument(file)
# Output file for parameter-tuning results.
fileresult = open(u'/home/ubuntu/file/調參結果', 'w')


def LR():
    """Return a fresh, untrained logistic-regression classifier."""
    return LogisticRegression()


def getRecognitionRate(testPre, testClass):
    """Return the fraction of predictions in testPre matching testClass."""
    testNum = len(testPre)
    if testNum == 0:  # guard: avoid ZeroDivisionError on empty input
        return 0.0
    rightNum = 0
    for i in range(testNum):
        if testClass[i] == testPre[i]:
            rightNum += 1
    return float(rightNum) / float(testNum)


def getData(model):
    """Split the Doc2Vec document vectors and their labels 60/40.

    Returns (X_train, y_train, X_test, y_test) — note the non-standard order.
    """
    tigs = []
    fileclass = open(u'/home/ubuntu/file/資料平衡分類結果', encoding='utf-8')
    for tig in fileclass:
        tigs.append(tig.strip())
    fileclass.close()
    data_dict = {}
    # NOTE(review): the original iterated range(len(model.docvecs) - 1),
    # dropping the last document vector; kept as-is, but train_test_split
    # requires len(data) == len(tigs) — confirm the label file really has
    # one line fewer than the corpus.
    for i in range(len(model.docvecs) - 1):
        data_dict['p' + str(i)] = model.docvecs[i]
    data = pd.DataFrame(data_dict)
    data = data.T  # one row per document, one column per vector dimension
    X_train1, X_test1, y_train1, y_test1 = train_test_split(
        data, tigs, test_size=0.4, random_state=1)
    return X_train1, y_train1, X_test1, y_test1


# Train the Doc2Vec model (gensim pre-4.0 API: `size` = vector dimensionality).
model = gensim.models.Doc2Vec(documents, size=80, window=9, min_count=40, workers=8)
trainMatrix, trainClass, testMatrix, testClass = getData(model)
clf_LR = LR()
clf_LR.fit(trainMatrix, trainClass)
print('Logistic Regression recognition rate: '
      + str(getRecognitionRate(clf_LR.predict(testMatrix), testClass)))
# Close the handles the original script leaked.
file.close()
fileresult.close()

使用交叉驗證方法計算平均正確率:

import gensim
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc

# Corpus (one document per line) and its parallel class-label file.
# NOTE(review): neither handle is closed, and `fileclass` is a one-shot
# iterator — it is fully consumed the first time getData()/getData_3() runs;
# a second call would see an empty label list.
file = open(u'/home/ubuntu/file/資料平衡無分類', encoding='utf-8')
fileclass=open(u'/home/ubuntu/file/資料平衡分類結果',encoding='utf-8')
documents = gensim.models.doc2vec.TaggedLineDocument(file)
# Train Doc2Vec (gensim pre-4.0 API: `size` is the vector dimensionality).
model = gensim.models.Doc2Vec(documents, size=81, window=8, min_count=39, workers=8)
# How to infer a vector for a new document from the model?
# Generate the document vectors.
# Predict with logistic regression.
def LR():
    """Build and return a fresh, untrained logistic-regression classifier."""
    return LogisticRegression()
def getRecognitionRate(testPre, testClass):
    """Return the recognition (accuracy) rate of a prediction.

    Parameters:
        testPre:   sequence of predicted labels.
        testClass: sequence of true labels, parallel to testPre.

    Returns:
        float in [0.0, 1.0] — fraction of positions where the prediction
        equals the true label; 0.0 for empty input (the original raised
        ZeroDivisionError there).
    """
    total = len(testPre)
    if not total:  # robustness: empty prediction list
        return 0.0
    correct = sum(1 for pred, true in zip(testPre, testClass) if pred == true)
    return float(correct) / float(total)
def getData():
    """Split the document vectors and their labels into a 60/40 hold-out set.

    Reads labels from the module-level ``fileclass`` handle and vectors from
    the module-level ``model``.  Returns (X_train, y_train, X_test, y_test)
    — note the non-standard order.
    """
    # Labels: one stripped line per document.
    labels = [line.strip() for line in fileclass]
    # One named row per document vector; transpose so rows are samples.
    vectors = {'p' + str(idx): model.docvecs[idx]
               for idx in range(len(model.docvecs))}
    frame = pd.DataFrame(vectors).T
    frame['class0'] = labels
    train_X, test_X, train_y, test_y = train_test_split(
        frame, frame['class0'], test_size=0.4, random_state=0)
    return train_X, train_y, test_X, test_y
# def ROC_curve(lr, X_test, y_test):  # NOTE(review): original read the global testMatrix; pass the data in instead
#     pred_probas = lr.predict_proba(X_test)[:, 1]
#     fpr, tpr, _ = roc_curve(y_test, pred_probas)
#     roc_auc = auc(fpr, tpr)
#     plt.plot(fpr, tpr, label='area = %.2f' % roc_auc)  # requires: import matplotlib.pyplot as plt
#     plt.plot([0, 1], [0, 1], 'k--')
#     plt.xlim([0.0, 1.0])
#     plt.ylim([0.0, 1.05])
#     plt.show()
def getData_3():
    """Build stratified 2-fold train/test splits from the document vectors.

    Reads labels from the module-level ``fileclass`` handle and vectors from
    the module-level ``model``.

    Returns:
        dict keyed '<fold>train', '<fold>trainclass', '<fold>test',
        '<fold>testclass' (folds numbered from 1), each value a numpy array.
    """
    tigs = [line.strip() for line in fileclass]
    data_dict = {}
    # NOTE(review): drops the last document vector (-1), unlike getData();
    # dataMatrix['class0'] = tigs needs matching lengths — confirm the label
    # file really has one line fewer than the corpus.
    for i in range(len(model.docvecs) - 1):
        data_dict['p' + str(i)] = model.docvecs[i]
    dataMatrix = pd.DataFrame(data_dict).T  # rows = samples
    dataMatrix['class0'] = tigs
    # Sample matrix (one row per sample) and parallel class-label vector.
    sampleData = []
    sampleClass = []
    for idx in dataMatrix.index:
        row = dataMatrix.loc[idx].values
        sampleClass.append(row[-1])
        sampleData.append(row[0:-1])
    sampleM = np.array(sampleData)
    classM = np.array(sampleClass)
    # Stratified 2-fold split.  numpy fancy indexing replaces the original
    # per-element copy loops, which rebuilt list(classM) on EVERY iteration
    # (accidental O(n^2)); debug prints removed.
    skf = StratifiedKFold(n_splits=2)
    setDict = {}
    for count, (trainI, testI) in enumerate(skf.split(sampleM, classM), start=1):
        setDict[str(count) + 'train'] = sampleM[trainI]
        setDict[str(count) + 'trainclass'] = classM[trainI]
        setDict[str(count) + 'test'] = sampleM[testI]
        setDict[str(count) + 'testclass'] = classM[testI]
    return setDict
# Cross-validation driver: average the recognition rate over all folds.
clf_LR = LR()
setDict = getData_3()
# Each fold contributes exactly four dict entries (train/trainclass/test/
# testclass), so the fold count is len(setDict) // 4.  The original
# hard-coded range(1, 3) and a final division by 2 even though it computed
# setNums and never used it.
folds = len(setDict) // 4
LR_rate = 0.0
for i in range(1, folds + 1):
    trainMatrix = setDict[str(i) + 'train']
    trainClass = setDict[str(i) + 'trainclass']
    testMatrix = setDict[str(i) + 'test']
    testClass = setDict[str(i) + 'testclass']
    clf_LR.fit(trainMatrix, trainClass)
    LR_rate += getRecognitionRate(clf_LR.predict(testMatrix), testClass)
print('Logistic Regression mean recognition rate: ', LR_rate / folds)

getData_3方法把資料集進行分割,分別對於不同的分割進行計算正確率,最後計算平均正確率。