Python分類演算法交叉驗證
阿新 • • 發佈:2019-01-24
我們使用Sklearn-train_test_split隨機劃分訓練集和測試集
http://blog.csdn.net/cherdw/article/details/54881167
實驗程式碼:
import gensim
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import train_test_split
# Corpus file: one (class-balanced, label-free) document per line.
file = open(u'/home/ubuntu/file/資料平衡無分類', encoding='utf-8')
# TaggedLineDocument tags each line of the corpus with its line number.
documents = gensim.models.doc2vec.TaggedLineDocument(file)
# Output file for hyper-parameter tuning results (written by the disabled tuning loop).
fileresult=open(u'/home/ubuntu/file/調參結果' ,'w')
# How to compute a vector for a new document from the model?
# Generate the document vectors
# Use logistic regression for prediction
def LR():
    """Build and return a fresh, untrained logistic-regression classifier."""
    return LogisticRegression()
def getRecognitionRate(testPre, testClass):
    """Return the fraction of predictions that match the true labels.

    Parameters:
        testPre: sequence of predicted labels.
        testClass: sequence of true labels, same length as ``testPre``.

    Returns:
        float in [0, 1]; 0.0 for an empty input (the original raised
        ZeroDivisionError when ``testPre`` was empty).
    """
    total = len(testPre)
    if not total:  # guard: empty test set must not divide by zero
        return 0.0
    correct = sum(1 for pred, true in zip(testPre, testClass) if pred == true)
    return correct / total
def getData(model):
    """Split the doc2vec document vectors and their labels into train/test sets.

    Parameters:
        model: a trained gensim Doc2Vec model; its docvecs are the features.

    Returns:
        (X_train, y_train, X_test, y_test) from a 60/40 random split
        (note the return order differs from train_test_split's own order).
    """
    # Build a pandas DataFrame of document vectors.
    tigs = []
    data_dict = {}
    # Labels file: one class label per line, aligned with the corpus lines.
    fileclass = open(u'/home/ubuntu/file/資料平衡分類結果', encoding='utf-8')
    for tig in fileclass:
        tigs.append(tig.strip())
    # NOTE(review): range stops at len-1, dropping the last document vector —
    # inconsistent with the second script's getData, and if `tigs` then has
    # one more entry than `data` has rows, train_test_split will raise.
    # Confirm the off-by-one is intentional.
    for i in range(len(model.docvecs)-1):
        data_dict['p'+str(i)] = model.docvecs[i]
    data = pd.DataFrame(data_dict)
    # Transpose so rows = documents, columns = vector dimensions.
    data = data.T
    # data['class0'] = tigs
    X_train1, X_test1, y_train1, y_test1 = train_test_split(data, tigs, test_size=0.4, random_state=1)
    fileclass.close()
    return X_train1, y_train1, X_test1, y_test1
# Hyper-parameter tuning loop (disabled): sweep i, j and log each result.
# for i in range(20,100):
# for j in range(10,100):
# print('引數值:'+str(i)+":"+str(j))
# fileresult.write('引數值:'+str(i)+":"+str(j)+'\n')
# Train doc2vec: 80-dim vectors, window 9, ignore words seen < 40 times.
model = gensim.models.Doc2Vec(documents, size=80, window=9, min_count=40, workers=8)
T = getData(model)
# Unpack the split in getData's return order: train X/y, then test X/y.
trainMatrix, trainClass, testMatrix, testClass = T[0], T[1], T[2], T[3]
clf_LR=LR()
clf_LR.fit(trainMatrix, trainClass)
print('Logistic Regression recognition rate: '+str(getRecognitionRate(clf_LR.predict(testMatrix), testClass)))
# fileresult.write('Logistic Regression recognition rate: '+str(getRecognitionRate(clf_LR.predict(testMatrix), testClass))+'\n')
# fileresult.close()
# How to draw the ROC curve? (not implemented here)
# ROC_curve(clf_LR,testClass)
使用交叉驗證方法計算平均正確率:
import gensim
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc
# Corpus file: one (class-balanced, label-free) document per line.
file = open(u'/home/ubuntu/file/資料平衡無分類', encoding='utf-8')
# Labels file: one class label per line, aligned with the corpus lines.
fileclass=open(u'/home/ubuntu/file/資料平衡分類結果',encoding='utf-8')
documents = gensim.models.doc2vec.TaggedLineDocument(file)
# Train doc2vec: 81-dim vectors, window 8, ignore words seen < 39 times.
model = gensim.models.Doc2Vec(documents, size=81, window=8, min_count=39, workers=8)
# How to compute a vector for a new document from the model?
# Generate the document vectors
# Use logistic regression for prediction
def LR():
    """Factory: return a new, untrained LogisticRegression classifier."""
    classifier = LogisticRegression()
    return classifier
def getRecognitionRate(testPre, testClass):
    """Return classification accuracy: the share of matching label pairs.

    Parameters:
        testPre: sequence of predicted labels.
        testClass: sequence of true labels, same length as ``testPre``.

    Returns:
        float in [0, 1]; 0.0 for an empty input (the original raised
        ZeroDivisionError when ``testPre`` was empty).
    """
    total = len(testPre)
    if not total:  # guard: empty test set must not divide by zero
        return 0.0
    correct = sum(1 for pred, true in zip(testPre, testClass) if pred == true)
    return correct / total
def getData():
    """Build the doc2vec feature matrix, attach labels, and return a
    60/40 random train/test split.

    Reads the module-level ``fileclass`` handle (one label per line) and the
    module-level ``model`` (trained Doc2Vec).

    Returns:
        (X_train, y_train, X_test, y_test) — note the order differs from
        train_test_split's own return order.
    """
    # Build a pandas DataFrame of document vectors.
    tigs = []
    data_dict = {}
    for tig in fileclass:
        tigs.append(tig.strip())
    for i in range(len(model.docvecs)):
        data_dict['p' + str(i)] = model.docvecs[i]
    data = pd.DataFrame(data_dict)
    # Transpose so rows = documents, columns = vector dimensions.
    data = data.T
    data['class0'] = tigs
    # BUG FIX: the label column must not be part of the feature matrix,
    # otherwise the classifier is trained on the answer (label leakage).
    features = data.drop(columns=['class0'])
    X_train1, X_test1, y_train1, y_test1 = train_test_split(features, data['class0'], test_size=0.4, random_state=0)
    return X_train1, y_train1, X_test1, y_test1
# def ROC_curve(lr,y_test):
# pred_probas = lr.predict_proba(testMatrix)[:,1]
# fpr,tpr,_ = roc_curve(y_test, pred_probas)
# roc_auc = auc(fpr,tpr)
# plt.plot(fpr,tpr,label='area = %.2f' %roc_auc)
# plt.plot([0, 1], [0, 1], 'k--')
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])
# plt.show()
def getData_3():
    """Build the doc2vec sample matrix and split it with stratified 2-fold CV.

    Reads the module-level ``fileclass`` handle (one label per line) and the
    module-level ``model`` (trained Doc2Vec). For each fold k (1-based) it
    stores four numpy arrays in a dict under the keys
    '<k>train', '<k>trainclass', '<k>test', '<k>testclass'.

    Returns:
        dict mapping those string keys to numpy arrays.
    """
    tigs = []
    data_dict = {}
    # Build a pandas DataFrame of document vectors.
    for tig in fileclass:
        tigs.append(tig.strip())
    # NOTE(review): range stops at len-1, dropping the last docvec; if `tigs`
    # then has one more entry than the matrix has rows, the 'class0'
    # assignment below will raise — confirm the off-by-one is intentional.
    for i in range(len(model.docvecs)-1):
        data_dict['p' + str(i)] = model.docvecs[i]
    data = pd.DataFrame(data_dict)
    # Transpose so rows = documents; append the label as the last column.
    dataMatrix = data.T
    dataMatrix['class0'] = tigs
    print(dataMatrix)
    sampleData = []
    sampleClass = []
    # Split each row back into features (all but last) and label (last).
    for i in dataMatrix.index:
        tempList = dataMatrix.loc[i].values
        print(tempList)
        sampleClass.append(tempList[-1])
        sampleData.append(tempList[0:-1])
    sampleM = np.array(sampleData)  # 2-D matrix: one row per sample, columns = features
    classM = np.array(sampleClass)  # 1-D vector: class label of each sample
    # Use StratifiedKFold to generate train/test index sets for each fold.
    skf = StratifiedKFold(n_splits=2)
    setDict = {}  # dict holding the generated train/test sets of every fold
    count = 1
    for trainI, testI in skf.split(sampleM, classM):
        trainSTemp = []  # training sample data drawn in this fold
        trainCTemp = []  # training sample labels drawn in this fold
        testSTemp = []  # test sample data drawn in this fold
        testCTemp = []  # test sample labels drawn in this fold
        # Collect the training set for this fold.
        trainIndex = list(trainI)
        for t1 in range(0, len(trainIndex)):
            trainNum = trainIndex[t1]
            trainSTemp.append(list(sampleM[trainNum, :]))
            trainCTemp.append(list(classM)[trainNum])
        setDict[str(count) + 'train'] = np.array(trainSTemp)
        setDict[str(count) + 'trainclass'] = np.array(trainCTemp)
        # Collect the test set for this fold.
        testIndex = list(testI)
        for t2 in range(0, len(testIndex)):
            testNum = testIndex[t2]
            testSTemp.append(list(sampleM[testNum, :]))
            testCTemp.append(list(classM)[testNum])
        setDict[str(count) + 'test'] = np.array(testSTemp)
        setDict[str(count) + 'testclass'] = np.array(testCTemp)
        count += 1
    return setDict
# One classifier instance, re-fit on each fold below.
clf_LR=LR()
# How to draw the ROC curve? (not implemented here)
# ROC_curve(clf_LR,testClass)
setDict = getData_3()
setNums = len(setDict.keys())  # 4 entries per fold; not otherwise used below
print(setDict.keys())
LR_rate = 0.0
# Fold indices are hard-coded 1..2 to match StratifiedKFold(n_splits=2).
for i in range(1, 3):
    print(i)
    trainMatrix = setDict[str(i) + 'train']
    trainClass = setDict[str(i) + 'trainclass']
    print(len(trainClass))
    testMatrix = setDict[str(i) + 'test']
    testClass = setDict[str(i) + 'testclass']
    print(len(testClass))
    clf_LR.fit(trainMatrix, trainClass)
    # Accumulate per-fold accuracy; averaged over the 2 folds after the loop.
    LR_rate += getRecognitionRate(clf_LR.predict(testMatrix), testClass)
print('Logistic Regression mean recognition rate: ', LR_rate / 2)
getData_3方法把資料集進行分割,分別對於不同的分割進行計算正確率,最後計算平均正確率。