1. 程式人生 > 其它 >機器學習sklearn(47): 特徵工程(十四) 特徵選擇(五)Embedded嵌入法/Wrapper包裝法

機器學習sklearn(47): 特徵工程(十四) 特徵選擇(五)Embedded嵌入法/Wrapper包裝法

1 Embedded 嵌入法

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier as RFC
# cross_val_score is used throughout this section but never imported in the
# visible snippet; importing it here is harmless if it was imported earlier.
from sklearn.model_selection import cross_val_score

import numpy as np
import matplotlib.pyplot as plt

# Small random forest used both as the importance estimator inside
# SelectFromModel and as the model scored by cross-validation.
RFC_ = RFC(n_estimators=10, random_state=0)

# Keep only a limited set of features. For data with 780 features, 0.005 is
# a very high threshold, because on average each feature only receives about
# 0.001 of feature_importances_.
X_embedded = SelectFromModel(RFC_, threshold=0.005).fit_transform(X, y)
X_embedded.shape  # the dimensionality is clearly reduced

# Likewise, we can plot a learning curve to find the best threshold.
# ====== [TIME WARNING: 10 mins] ======
# Fit once and reuse the importances (the original fitted the forest twice
# just to read the same attribute).
importances = RFC_.fit(X, y).feature_importances_
threshold = np.linspace(0, importances.max(), 20)

# NOTE(review): the selector is fitted on all of X, y before cross-validation,
# so these CV scores are likely optimistic; putting the selector and the
# classifier in a Pipeline would avoid that leakage.
score = []
for i in threshold:
    X_embedded = SelectFromModel(RFC_, threshold=i).fit_transform(X, y)
    once = cross_val_score(RFC_, X_embedded, y, cv=5).mean()
    score.append(once)
plt.plot(threshold, score)
plt.show()
# Re-select features with a fixed importance threshold of 0.00067
# (presumably the best value read off the coarse learning curve), then
# inspect the resulting shape and the 5-fold cross-validated mean score.
coarse_selector = SelectFromModel(RFC_, threshold=0.00067)
X_embedded = coarse_selector.fit_transform(X, y)
X_embedded.shape
cross_val_score(RFC_, X_embedded, y, cv=5).mean()
# ====== [TIME WARNING: 10 mins] ======
# Finer learning curve: sweep 20 evenly spaced thresholds in [0, 0.00134]
# and record the 5-fold mean score for each. The grid is named once so the
# loop, the plot and the x-ticks all use the same values.
fine_grid = np.linspace(0, 0.00134, 20)
score2 = []
for cutoff in fine_grid:
    X_embedded = SelectFromModel(RFC_, threshold=cutoff).fit_transform(X, y)
    score2.append(cross_val_score(RFC_, X_embedded, y, cv=5).mean())

plt.figure(figsize=[20, 5])
plt.plot(fine_grid, score2)
plt.xticks(fine_grid)
plt.show()
# Select features with the threshold 0.000564 (presumably picked from the
# finer learning curve), then report the shape and the 5-fold mean score.
best_selector = SelectFromModel(RFC_, threshold=0.000564)
X_embedded = best_selector.fit_transform(X, y)
X_embedded.shape
cross_val_score(RFC_, X_embedded, y, cv=5).mean()

# ===== [TIME WARNING: 2 min] =====
# We may already have found the best result for the current model; what
# happens if we tune the random forest's parameters?
bigger_forest = RFC(n_estimators=100, random_state=0)
cross_val_score(bigger_forest, X_embedded, y, cv=5).mean()

2 Wrapper 包裝法

from sklearn.feature_selection import RFE
# Recursive feature elimination with a 10-tree random forest: drop 50
# features per iteration until 340 remain.
RFC_ = RFC(n_estimators=10, random_state=0)
selector = RFE(RFC_, n_features_to_select=340, step=50)
selector.fit(X, y)

selector.support_.sum()  # number of features kept by the selector
selector.ranking_        # per-feature ranking produced by RFE

# Reduce X to the selected features and score the forest on them (5-fold mean).
X_wrapper = selector.transform(X)
cross_val_score(RFC_, X_wrapper, y, cv=5).mean()
我們也可以對包裝法畫學習曲線:
# ====== [TIME WARNING: 15 mins] ======
# Learning curve for the wrapper method: for each target feature count
# (1, 51, ..., 701) run RFE and record the 5-fold mean score.
feature_counts = range(1, 751, 50)
score = []
for n_keep in feature_counts:
    X_wrapper = RFE(RFC_, n_features_to_select=n_keep, step=50).fit_transform(X, y)
    score.append(cross_val_score(RFC_, X_wrapper, y, cv=5).mean())

plt.figure(figsize=[20, 5])
plt.plot(feature_counts, score)
plt.xticks(feature_counts)
plt.show()

3 特徵選擇總結