模型融合—Stacking初涉
阿新 • • 發佈:2018-12-15
最近在學習kaggle baseline中遇到了號稱kaggle大殺器的stacking,簡單來說,它就是整合學習的一種方法,如果你還沒有了解過這個,請點選以下兩篇博文,進行掃盲: 大話機器學習之STACKing,一個讓諸葛亮都吃癟的神技 (看名字這一篇就一定通俗易懂) 模型融合:bagging、Boosting、Blending、Stacking (這篇呢,就抓主要特點介紹了幾種整合學習的方法,重點看看stacking的那個圖哦)
談談實現
以前寫ML的東西,都是注重調庫啊,什麼的,現在是不行了,演算法崗要難死人了,所以那些演算法,該自己實現的都要自己實現一遍啊~
實現stacking方法
#!/usr/bin/env python
# coding: utf-8
import numpy as np
from sklearn.model_selection import KFold
# 實現stacking方法
def get_stacking(clf, x_train, y_train, x_test, n_folds=10):
    """Build second-level (meta) features for one base learner via K-fold CV.

    Parameters
    ----------
    clf : estimator with scikit-learn style ``fit``/``predict``.
    x_train, y_train : numpy.ndarray
        First-level training data.
    x_test : numpy.ndarray
        First-level test data.
    n_folds : int, default 10
        Number of cross-validation folds.

    Returns
    -------
    tuple of numpy.ndarray
        ``(second_level_train_set, second_level_test_set)`` — the
        out-of-fold predictions for the train set, and the test-set
        predictions averaged over the ``n_folds`` fitted models.
    """
    train_num, test_num = x_train.shape[0], x_test.shape[0]
    second_level_train_set = np.zeros((train_num,))
    second_level_test_set = np.zeros((test_num,))
    test_nfolds_sets = np.zeros((test_num, n_folds))
    kf = KFold(n_splits=n_folds)

    # Train on each in-fold split, predict the held-out fold.
    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tra, y_tra = x_train[train_index], y_train[train_index]
        x_tst = x_train[test_index]  # held-out labels are not needed here
        clf.fit(x_tra, y_tra)
        # Out-of-fold predictions become the meta training feature.
        second_level_train_set[test_index] = clf.predict(x_tst)
        # Each fold's model also predicts the full test set.
        test_nfolds_sets[:, i] = clf.predict(x_test)

    # Average the per-fold test predictions into a single meta feature.
    second_level_test_set[:] = test_nfolds_sets.mean(axis=1)
    return second_level_train_set, second_level_test_set
# Use five first-level classification algorithms as the base learners.
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC

rf_model = RandomForestClassifier()
adb_model = AdaBoostClassifier()
gdbc_model = GradientBoostingClassifier()
et_model = ExtraTreesClassifier()
svc_model = SVC()

# Use the iris data set with train_test_split to make some toy data.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
train_x, test_x, train_y, test_y = train_test_split(iris.data, iris.target, test_size=0.2)

# Collect one meta-feature column per base learner.
train_sets = []
test_sets = []
for clf in [rf_model, adb_model, gdbc_model, et_model, svc_model]:
    train_set, test_set = get_stacking(clf, train_x, train_y, test_x)
    train_sets.append(train_set)
    test_sets.append(test_set)

# Stack the 1-D per-model columns into second-level train/test matrices
# (equivalent to reshape(-1, 1) + concatenate, but idiomatic).
meta_train = np.column_stack(train_sets)
meta_test = np.column_stack(test_sets)

# Use a decision tree as the second-level (meta) classifier.
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier()
dt_model.fit(meta_train, train_y)
dt_predict = dt_model.predict(meta_test)
print(dt_predict)
構造stacking類
# 構造stacking類
from sklearn.model_selection import KFold
from sklearn.base import BaseEstimator,RegressorMixin,TransformerMixin,clone
import numpy as np
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Stacking ensemble estimator.

    Base models are trained with K-fold CV; their out-of-fold predictions
    form the training features of a meta model that makes the final
    prediction.
    """

    def __init__(self, base_models, meta_model, n_folds=5):
        # Per sklearn convention, __init__ stores constructor args
        # untouched; fitted clones go into the *_-suffixed attributes
        # created in fit(). (The original code inverted this, so fit()
        # raised AttributeError on self.base_models.)
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    def fit(self, X, y):
        """Fit cloned base models via K-fold CV, build the out-of-fold
        prediction matrix, and fit the meta model on it.

        Returns self, as sklearn estimators do.
        """
        self.base_models_ = [list() for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)

        # One column of out-of-fold predictions per base model.
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                # Clone first, then fit, then keep the fitted clone.
                # (The original appended `instance` before it was
                # assigned, which raised NameError.)
                instance = clone(model)
                instance.fit(X[train_index], y[train_index])
                self.base_models_[i].append(instance)
                out_of_fold_predictions[holdout_index, i] = instance.predict(X[holdout_index])

        # Train the meta model on the second-level training set.
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self

    def predict(self, X):
        """Average each base model's per-fold predictions into one meta
        feature column, then let the meta model predict from them.
        """
        # Fix: the original referenced bare `base_models_` / `base_models`
        # without `self.`, which raised NameError.
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in fold_models]).mean(axis=1)
            for fold_models in self.base_models_
        ])
        return self.meta_model_.predict(meta_features)
閒扯
給自己定的學習時間是996,希望提高效率的同時,能把難點逐漸攻克。不知道明年能不能找到好工作?想去北京的網際網路公司。