機器學習一個小目標——Task7
1. 任務
【任務六-模型融合】用你目前評分最高的模型作為基準模型,和其他模型進行stacking融合,得到最終模型及評分
2. Stacking融合
按照自己的理解
第一層:
使用交叉驗證的劃分方法,將訓練集劃分成5份,
使用第一個基分類器對劃分之後得到的驗證集(該折留出的test部分)進行預測,得到5個predict檔案(每個維數約為 n/5 × 1),縱向拼接得到1個Predict檔案(維數 n × 1)
使用第一個基分類器對整個Test進行預測,得到5個預測檔案p_t(每個維數 m × 1),橫向拼接後求平均值,得到1個Pt檔案(維數 m × 1)
使用第二個基分類器,
。。。
得到5個Predict檔案維數
,5個Pt檔案維數
第二層
將第一層得到的5個Predict檔案維數
橫向拼接,再和訓練集拼接,得到新的訓練集Train,維數
,
將第一層得到的5個Pt檔案(各維數 m × 1)
橫向拼接,再和測試集拼接,得到新的測試集Test,維數 m × (5 + 原特徵數)
對第二層的訓練集進行訓練,得到新的模型,
第三層:
使用新的模型對測試集進行預測
3. 實現程式碼
3.1 以下是按自己的理解寫的程式碼:
#!/usr/bin/env python 3.6
# -*- coding:utf-8 -*-
# @File : CV1.py
# @Date : 2018-11-22
# @Author : 黑桃
# @Software: PyCharm
from pandas import Series, DataFrame
import pickle
import pandas as pd
from sklearn.externals import joblib
from pandas import Series, DataFrame
from sklearn import svm
from sklearn.model_selection import * # 劃分資料 交叉驗證
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, roc_curve
import warnings
warnings.filterwarnings("ignore")
# Root of the project; all feature/model artifacts are resolved against it.
path = "E:/MyPython/Machine_learning_GoGoGo/"
"""=====================================================================================================================
1 讀取資料
"""
# Load the pre-built feature matrices and label vectors produced by an
# earlier feature-engineering step.
# NOTE(review): pickle.load is only safe because this file is generated
# locally by a previous step of the same project.
print("0 讀取特徵")
f = open(path + 'feature/feature_V3.pkl', 'rb')
train, test, y_train, y_test = pickle.load(f)
f.close()
"""=====================================================================================================================
2 進行K次訓練;用K個模型分別對測試集進行預測,並得到K個結果,再進行結果的融合
"""
"""=====================================================================================================================
3 交叉驗證方式
"""
## Candidate cross-validation strategies (fold count, split ratio, etc.);
## exactly one of them is selected as `cv` below.
kf = KFold(n_splits=5, random_state=1)
loo = LeaveOneOut()  # each test set holds 1 sample, the train set holds n-1
lpo = LeavePOut(p=2000)  ## each test set holds p samples, the train set holds n-p
ss = ShuffleSplit(n_splits=5, test_size=.25, random_state=0)
tss = TimeSeriesSplit(n_splits=5)
logo = LeaveOneGroupOut()
lpgo = LeavePGroupsOut(n_groups=3)
gss = GroupShuffleSplit(n_splits=4, test_size=.5, random_state=0)
gkf = GroupKFold(n_splits=2)
"""【配置交叉驗證方式】"""
cv = kf
"""=====================================================================================================================
2 讀取模型
"""
# Load the previously trained first-level base models from disk.
# NOTE(review): sklearn.externals.joblib was removed in modern scikit-learn;
# switch to the standalone `joblib` package when upgrading.
print("1 讀取模型")
SVM_linear = joblib.load( path + "model/model_file/SVM_linear.pkl")
SVM_poly = joblib.load( path + "model/model_file/SVM_poly.pkl")
SVM_rbf = joblib.load( path + "model/model_file/SVM_rbf.pkl")
SVM_sigmoid = joblib.load( path + "model/model_file/SVM_sigmoid.pkl")
lg_120 = joblib.load( path + "model/model_file/lg_120.pkl")
DT = joblib.load( path + "model/model_file/DT.pkl")
xgb_sklearn = joblib.load( path + "model/model_file/xgb_sklearn.pkl")
lgb_sklearn = joblib.load( path + "model/model_file/lgb_sklearn.pkl")
xgb = joblib.load( path + "model/model_file/xgb.pkl")
lgb = joblib.load( path + "model/model_file/lgb.pkl")
# The original data's index does not start at 0, so reset it here to make
# positional fold indexing (y_train[idx]) line up with the feature rows.
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
"""=====================================================================================================================
3 【第一層】用預測結果構建特徵
"""
def get_feature(clf, train, y_train, test, y_test, cv):
    """Build first-level stacking features from out-of-fold predictions.

    For each CV fold, fit ``clf`` on the fold's training part, predict the
    held-out part (stacked vertically into one out-of-fold column for the
    new training set), and predict the full ``test`` set (averaged over all
    folds into one column for the new test set).

    Parameters
    ----------
    clf : estimator with ``fit``/``predict``.
    train : array-like, shape (n, d) — first-level training features.
    y_train : array-like, shape (n,) — labels, positionally indexable.
    test : array-like, shape (m, d) — first-level test features.
    y_test : unused; kept for backward compatibility with existing callers.
    cv : cross-validator exposing ``split(train, y_train)``.

    Returns
    -------
    (Test_i, Train_i) : pd.Series
        Fold-averaged test predictions (length m) and vertically stacked
        out-of-fold predictions (length n, in fold order — this matches the
        original row order for a non-shuffled KFold).
    """
    fold_train_preds = []
    fold_test_preds = []
    for train_idx, vali_idx in cv.split(train, y_train):
        # Fit on this fold's training part only.
        f_train_x = DataFrame(train[train_idx])
        f_train_y = DataFrame(y_train[train_idx])
        f_vali_x = DataFrame(train[vali_idx])
        clf.fit(f_train_x, f_train_y)
        # Out-of-fold prediction for the held-out rows, plus a full
        # test-set prediction from this fold's model.
        fold_train_preds.append(clf.predict(f_vali_x))
        fold_test_preds.append(clf.predict(test))
    preds_Train = DataFrame(fold_train_preds).T  # (fold_size, n_folds)
    preds_Test = DataFrame(fold_test_preds).T    # (m, n_folds)
    # Stack the out-of-fold predictions vertically, one fold after another.
    Train_i = pd.concat([preds_Train[c] for c in preds_Train.columns], axis=0)
    # BUG FIX: the per-fold test predictions were summed and divided by 3
    # even though the CV above uses 5 folds; the column-wise mean averages
    # correctly for any fold count.
    Test_i = preds_Test.mean(axis=1)
    return Test_i, Train_i
"""=====================================================================================================================
4 【第二層】特徵組合
"""
# First level: each base model contributes one out-of-fold prediction
# column (TrainK) and one fold-averaged test prediction column (Test_K).
Test_1,Train1 = get_feature(SVM_linear,train,y_train,test,y_test,cv)
Test_2,Train2 = get_feature(lg_120,train,y_train,test,y_test,cv)
Test_3,Train3 = get_feature(DT,train,y_train,test,y_test,cv)
Test_4,Train4 = get_feature(SVM_rbf,train,y_train,test,y_test,cv)
Test_5,Train5 = get_feature(lgb_sklearn,train,y_train,test,y_test,cv)
# Second level: put the five meta-feature columns side by side, then append
# the original features so the meta-model sees both.
# NOTE(review): the test meta-features are cast to int while the train
# meta-features are not — confirm this asymmetry is intentional.
Train = pd.concat(objs=[Train1, Train2, Train3, Train4, Train5], axis=1).reset_index(drop=True)
Test = pd.concat(objs=[Test_1, Test_2, Test_3, Test_4, Test_5], axis=1).astype(int).reset_index(drop=True)
train = DataFrame(train).reset_index(drop=True)
test = DataFrame(test).reset_index(drop=True)
Train = pd.concat(objs=[Train, train], axis=1)
Test = pd.concat(objs=[Test, test], axis=1)
"""=====================================================================================================================
【LGB_sklearn介面訓練】
"""
# Meta-model: a fresh LightGBM classifier trained on the stacked features.
import lightgbm as lgbm
lgb_sklearn = lgbm.LGBMClassifier(learning_rate=0.1,
                                  max_bin=150,
                                  num_leaves=32,
                                  max_depth=11,
                                  reg_alpha=0.1,
                                  reg_lambda=0.2,
                                  # objective='multiclass',
                                  n_estimators=300,)
lgb_sklearn.fit(Train,y_train)
# y_lgb_pre = lgb_sklearn.predict(Test)
y_lgb_pre = lgb_sklearn.predict(Test)
print( "lgb_sklearn_Train_Score :{}".format(lgb_sklearn.score(Train, y_train)))
print("lgb_sklearn_Test_Score :{}".format(lgb_sklearn.score(Test, y_test)))
# print("lgb_sklearn_Train_AUC Score :{:.4f}".format(roc_auc_score(y_train, y_lgb_pre)))
# NOTE(review): AUC computed from hard predict() labels understates ranking
# quality; predict_proba is the usual input — confirm which is intended.
print("lgb_sklearn_Test_AUC Score :{}".format(roc_auc_score(y_test, y_lgb_pre)))
3.2 調包實現的程式碼:
#!/usr/bin/env python 3.6
#-*- coding:utf-8 -*-
# @File : Stacking2.py
# @Date : 2018-11-25
# @Author : 黑桃
# @Software: PyCharm
from sklearn import datasets
import warnings
import pickle
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier
import numpy as np
from sklearn.externals import joblib
warnings.filterwarnings("ignore")
# NOTE(review): `iris` is loaded but never used below — likely leftover from
# the mlxtend example this script was adapted from.
iris = datasets.load_iris()
path = "E:/MyPython/Machine_learning_GoGoGo/"
"""=====================================================================================================================
1 讀取特徵
"""
# Load the same pre-built features used by the hand-rolled stacking script.
print("0 讀取特徵")
f = open(path + 'feature/feature_V3.pkl', 'rb')
train, test, y_train,y_test= pickle.load(f)
f.close()
X, y = train, y_train
"""=====================================================================================================================
2 讀取模型
"""
# Load the previously trained first-level base models from disk.
print("1 讀取模型")
SVM_linear = joblib.load( path + "model/model_file/SVM_linear.pkl")
SVM_poly = joblib.load( path + "model/model_file/SVM_poly.pkl")
SVM_rbf = joblib.load( path + "model/model_file/SVM_rbf.pkl")
SVM_sigmoid = joblib.load( path + "model/model_file/SVM_sigmoid.pkl")
lg_120 = joblib.load( path + "model/model_file/lg_120.pkl")
DT = joblib.load( path + "model/model_file/DT.pkl")
xgb_sklearn = joblib.load( path + "model/model_file/xgb_sklearn.pkl")
lgb_sklearn = joblib.load( path + "model/model_file/lgb_sklearn.pkl")
xgb = joblib.load( path + "model/model_file/xgb.pkl")
lgb = joblib.load( path + "model/model_file/lgb.pkl")
# Base (level-1) classifiers handed to the mlxtend StackingClassifier.
clf1 =SVM_linear
clf2 = lg_120
clf3 = DT
clf4 = SVM_rbf
clf5 = lgb_sklearn
# NOTE(review): `lr` is created but never used — the meta-classifier below is
# lgb_sklearn, which is also base classifier clf5; that same object serving
# both roles may explain why the stacking score matches lgb_sklearn's alone.
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3,clf4,clf5],meta_classifier=lgb_sklearn)
print('5-fold cross validation:\n')
# Cross-validated accuracy for each base model and the stacked ensemble.
for clf, label in zip([clf1, clf2, clf3,clf4,clf5 , sclf],
                      ['SVM_linear','lg_120','DT','SVM_rbf','lgb_sklearn','StackingClassifier']):
    scores = model_selection.cross_val_score(clf, X, y,cv=5, scoring='accuracy')
    print("Accuracy: %s (+/- %0.9f) [%s]"
          % (scores.mean(), scores.std(), label))
4. 實驗結果
自己的程式碼結果:
一級分類器 | 二級lgb | Stacking之前 | Stacking之後 |
---|---|---|---|
SVM_linear、DT、SVM_rbf、lg_120、LGB_sklearn | LGB_sklearn介面(predict) AUC Score | 0.7951391197086869 | 0.78980256597753 |
SVM_linear、DT、SVM_rbf、lg_120、LGB_sklearn | LGB_sklearn介面(proba) AUC Score | 0.6481179876945349 | 0.6372122138757783 |
調包實現的結果:
Accuracy: 0.7845685143591137 (+/- 0.008116008) [SVM_linear]
Accuracy: 0.7946686730541058 (+/- 0.008620505) [lg_120]
Accuracy: 0.7671842760458581 (+/- 0.017846894) [DT]
Accuracy: 0.7514728483069482 (+/- 0.000409207) [SVM_rbf]
Accuracy: 0.7831687376559587 (+/- 0.012327716) [lgb_sklearn]
Accuracy: 0.7831687376559587 (+/- 0.012327716) [StackingClassifier]
5. 遇到的問題
- 調包實現時,出現問題,五個一級分類器分別是,一級分類器 二級lgb Stacking之前 Stacking之後
SVM_linear、DT、SVM_rbf、lg_120、LGB_sklearn,但是最終的StackingClassifier分數始終是LGB_sklearn的分數?