Learning the sklearn Library: Decision Tree Ensembles
阿新 · Published 2018-12-16
Decision Tree Ensembles
Ensembles are methods that combine multiple machine learning models to build a more powerful model. This post covers two ensemble models: random forests and gradient boosted decision trees.
Random Forests
The idea: build many trees (the number of trees is something you choose), each of which predicts reasonably well but overfits in a different way; averaging their results then reduces the overfitting. The trees in a random forest are randomized in two ways (a short sketch illustrating both follows below):
- by choosing the data points used to build each tree, i.e., taking a bootstrap sample of the data
- by choosing the features considered in each split test, i.e., at each node the algorithm randomly selects a subset of the features and looks for the best test involving one of those features
For regression, the final prediction is the average of the predictions of all the trees. For classification, a soft voting strategy is used: the class probabilities predicted by all trees are averaged, and the class with the highest average probability wins.
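To make this concrete, here is a minimal sketch (seeds and sizes are illustrative) that draws one bootstrap sample and then checks that a fitted forest's class probabilities are the average of its trees' probabilities:

import numpy as np
from sklearn.datasets import make_moons
from sklearn.ensemble import RandomForestClassifier
X, y = make_moons(n_samples=100, noise=0.25, random_state=3)
# Bootstrap sampling: draw n_samples indices with replacement, so some points
# appear several times and others not at all
rng = np.random.default_rng(0)
boot = rng.integers(0, len(X), size=len(X))
print("distinct points in one bootstrap sample:", len(np.unique(boot)))
# Soft voting: average the per-tree class probabilities; the class with the
# highest averaged probability is the forest's prediction
forest = RandomForestClassifier(n_estimators=5, random_state=2).fit(X, y)
per_tree = np.stack([tree.predict_proba(X) for tree in forest.estimators_])
print(np.allclose(per_tree.mean(axis=0), forest.predict_proba(X)))  # True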
The trees in a random forest tend to be deeper than a single decision tree. You can pass n_jobs=-1 to use all the CPU cores of your machine when building the forest. Random forests do not perform well on very high-dimensional, sparse data. For classification, the default is max_features=sqrt(n_features); for regression, the default is max_features=n_features.
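As a quick illustration of these parameters (the values are placeholders, not tuned recommendations):

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
# n_jobs=-1 builds the trees on all available CPU cores
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)  # max_features defaults to sqrt(n_features)
reg = RandomForestRegressor(n_estimators=100, n_jobs=-1)   # max_features defaults to n_features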
# Apply a random forest of five trees to the two_moons dataset studied earlier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import mglearn
X, y = make_moons(n_samples=100, noise=0.25, random_state=3)  # 100 data points to bootstrap-sample from
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
forest = RandomForestClassifier(n_estimators=5, random_state=2)  # five trees
forest.fit(X_train, y_train)
fig, axes = plt.subplots(2, 3, figsize=(20, 10))
# the individual trees are stored in the estimators_ attribute
for i, (ax, tree) in enumerate(zip(axes.ravel(), forest.estimators_)):
    ax.set_title("Tree {}".format(i))
    mglearn.plots.plot_tree_partition(X_train, y_train, tree, ax=ax)  # each tree's partition of the training data
# decision boundary of the whole forest in the last panel
mglearn.plots.plot_2d_separator(forest, X_train, fill=True, ax=axes[-1, -1], alpha=.4)
axes[-1, -1].set_title("Random Forest")
mglearn.discrete_scatter(X_train[:, 0], X_train[:, 1], y_train)  # overlay the training points
# Apply a random forest of 100 trees to the breast cancer dataset
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
import matplotlib.pyplot as plt
import numpy as np
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=0)
forest = RandomForestClassifier(n_estimators=100, random_state=0)  # 100 trees; max_features or pre-pruning could also be tuned
forest.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(forest.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(forest.score(X_test, y_test)))
# Random forests also provide feature importances, obtained by averaging the
# feature importances of all the trees in the forest.
# Visualize the feature importances
def plot_feature_importances_cancer(model):
    n_features = cancer.data.shape[1]
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), cancer.feature_names)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
plot_feature_importances_cancer(forest)
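The comment in the code above mentions tuning max_features or applying pre-pruning; the following sketch, reusing the train/test split from the block above, shows what that looks like (max_features=5 and max_depth=10 are arbitrary illustrative values, not tuned):

# Illustrative only; in practice choose these values by validation
forest_tuned = RandomForestClassifier(n_estimators=100, max_features=5, max_depth=10, random_state=0)
forest_tuned.fit(X_train, y_train)
print("Accuracy on test set: {:.3f}".format(forest_tuned.score(X_test, y_test)))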
Gradient Boosted Regression Trees
Gradient boosting builds trees serially, with each new tree trying to correct the mistakes of the previous ones; the individual trees are very shallow (depth 1 to 5). Despite the "regression" in the name, these models can be used for both regression and classification. One important parameter is learning_rate (the learning rate), which controls how strongly each tree corrects the mistakes of its predecessors; the other main parameter is the number of trees, n_estimators.
# Apply GradientBoostingClassifier to the breast cancer dataset
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=0)
gbrt = GradientBoostingClassifier(random_state=0)  # defaults: 100 trees of max depth 3, learning rate 0.1
gbrt.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(gbrt.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(gbrt.score(X_test, y_test)))
# To reduce overfitting, apply stronger pre-pruning by limiting the maximum depth
gbrt = GradientBoostingClassifier(random_state=0, max_depth=1)
gbrt.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(gbrt.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(gbrt.score(X_test, y_test)))
# Alternatively, lower the learning rate
gbrt = GradientBoostingClassifier(random_state=0, learning_rate=0.01)
gbrt.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(gbrt.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(gbrt.score(X_test, y_test)))
# Visualize the feature importances (same helper as in the random forest section)
import matplotlib.pyplot as plt
import numpy as np
def plot_feature_importances_cancer(model):
    n_features = cancer.data.shape[1]
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), cancer.feature_names)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
plot_feature_importances_cancer(gbrt)