機器學習sklearn(47): 特徵工程(十四) 特徵選擇(五)Embedded嵌入法/Wrapper包裝法
阿新 • • 發佈:2021-06-25
1. Embedded嵌入法
# Embedded feature selection with a random forest, followed by a learning
# curve over the importance threshold.
# NOTE(review): X (feature matrix) and y (labels) are not defined in this
# section; they are assumed to be loaded earlier -- confirm.
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier as RFC
# Imported here so this section runs on its own; harmless if already imported.
from sklearn.model_selection import cross_val_score
import numpy as np
import matplotlib.pyplot as plt

RFC_ = RFC(n_estimators=10, random_state=0)
X_embedded = SelectFromModel(RFC_, threshold=0.005).fit_transform(X, y)
# Here I only want to keep a limited number of features. For data with 780
# features, 0.005 is a very high threshold, because on average each feature
# only receives about 0.001 of feature_importances_.
X_embedded.shape
# The dimensionality of the model is clearly reduced.

# Likewise, we can plot a learning curve to find the best threshold.
# ====== [TIME WARNING: 10 mins] ====== #
# Fit the forest once and reuse its importances; the original fitted it twice
# on the full data only to read the same attribute.
importances = RFC_.fit(X, y).feature_importances_
# 20 evenly spaced candidate thresholds from 0 up to the largest importance.
threshold = np.linspace(0, importances.max(), 20)

score = []
for i in threshold:
    # Keep only the features whose importance reaches the threshold, then
    # record the mean 5-fold cross-validation score on the reduced data.
    X_embedded = SelectFromModel(RFC_, threshold=i).fit_transform(X, y)
    once = cross_val_score(RFC_, X_embedded, y, cv=5).mean()
    score.append(once)
plt.plot(threshold, score)
plt.show()
# Re-select features with the importance threshold 0.00067, inspect the shape
# of the reduced matrix, and evaluate it with 5-fold cross-validation.
X_embedded = SelectFromModel(RFC_, threshold=0.00067).fit(X, y).transform(X)
X_embedded.shape
cross_val_score(estimator=RFC_, X=X_embedded, y=y, cv=5).mean()
# ====== [TIME WARNING: 10 mins] ====== #
# Second learning curve: scan 20 evenly spaced thresholds in [0, 0.00134].
# The grid is built once and shared by the loop, the plot and the x-ticks so
# the three can never drift apart.
fine_thresholds = np.linspace(0, 0.00134, 20)

score2 = []
for i in fine_thresholds:
    # Select features at this threshold and record the mean 5-fold CV score.
    X_embedded = SelectFromModel(RFC_, threshold=i).fit_transform(X, y)
    once = cross_val_score(RFC_, X_embedded, y, cv=5).mean()
    score2.append(once)

plt.figure(figsize=[20, 5])
plt.plot(fine_thresholds, score2)
# One tick per scanned threshold so each point can be read off the axis.
plt.xticks(fine_thresholds)
plt.show()
# Re-select features with the importance threshold 0.000564, inspect the
# shape of the reduced matrix, and score it with 5-fold cross-validation.
X_embedded = SelectFromModel(RFC_, threshold=0.000564).fit(X, y).transform(X)
X_embedded.shape
cross_val_score(estimator=RFC_, X=X_embedded, y=y, cv=5).mean()

# ===== [TIME WARNING: 2 min] ===== #
# We may already have found the best result for the current model; what if we
# adjust the random forest's parameters?
cross_val_score(
    estimator=RFC(n_estimators=100, random_state=0),
    X=X_embedded,
    y=y,
    cv=5,
).mean()
2. Wrapper包裝法
# Wrapper feature selection: recursive feature elimination (RFE) driven by a
# random forest.
# NOTE(review): X, y, RFC and cross_val_score are assumed to be defined or
# imported earlier in the file -- confirm.
from sklearn.feature_selection import RFE

RFC_ = RFC(n_estimators=10, random_state=0)
# Eliminate 50 features per iteration until 340 remain.
selector = RFE(RFC_, n_features_to_select=340, step=50).fit(X, y)
# support_ is the boolean mask of kept features, so its sum is their count.
selector.support_.sum()
# ranking_ holds the per-feature ranking produced by the elimination.
selector.ranking_

X_wrapper = selector.transform(X)
cross_val_score(RFC_, X_wrapper, y, cv=5).mean()
# We can also plot a learning curve for the wrapper method:
# ====== [TIME WARNING: 15 mins] ====== #
# Learning curve for RFE: try keeping 1, 51, 101, ..., 701 features.
# The grid is defined once and shared by the loop, the plot and the x-ticks.
feature_counts = range(1, 751, 50)

score = []
for i in feature_counts:
    # Run RFE down to i features (dropping 50 per iteration), then record the
    # mean 5-fold cross-validation score on the reduced data.
    X_wrapper = RFE(RFC_, n_features_to_select=i, step=50).fit_transform(X, y)
    once = cross_val_score(RFC_, X_wrapper, y, cv=5).mean()
    score.append(once)

plt.figure(figsize=[20, 5])
plt.plot(feature_counts, score)
plt.xticks(feature_counts)
plt.show()