sklearn: Decision Tree Classification on the Wine Dataset
阿新 • Published: 2020-12-31
from sklearn import tree
from sklearn.datasets import load_wine  # the wine dataset
from sklearn.model_selection import train_test_split

wine = load_wine()  # load the data
wine
{'data': array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00, 1.065e+03],
        [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00, 1.050e+03],
        [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00, 1.185e+03],
        ...,
        [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00, 8.350e+02],
        [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00, 8.400e+02],
        [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00, 5.600e+02]]),
 'target': array([0, 0, ..., 0, 1, 1, ..., 1, 2, 2, ..., 2]),  # 59 × class 0, 71 × class 1, 48 × class 2
 'target_names': array(['class_0', 'class_1', 'class_2'], dtype='<U7'),
 'DESCR': '......rics).\n',
 'feature_names': ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium',
                   'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins',
                   'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']}
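load_wine returns a Bunch object; before modelling, a quick shape check confirms what we are working with (a sanity check of my own, not from the original post):

print(wine.data.shape)    # (178, 13): 178 samples, 13 features
print(wine.target.shape)  # (178,): one label per sample
print(wine.target_names)  # ['class_0' 'class_1' 'class_2']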
import pandas as pd

# Concatenate the feature data with the target column
wine_df = pd.concat([pd.DataFrame(wine.data), pd.DataFrame(wine.target)], axis=1)
wine_df.columns = list(wine.feature_names) + ['target']  # attach feature names to the columns
wine_df['target'] = wine_df['target'].map(dict(zip(range(3), wine.target_names)))  # show class names
wine_df
|     | alcohol | malic_acid | ash  | alcalinity_of_ash | magnesium | total_phenols | flavanoids | nonflavanoid_phenols | proanthocyanins | color_intensity | hue  | od280/od315_of_diluted_wines | proline | target  |
|-----|---------|------------|------|-------------------|-----------|---------------|------------|----------------------|-----------------|-----------------|------|------------------------------|---------|---------|
| 0   | 14.23   | 1.71       | 2.43 | 15.6              | 127.0     | 2.80          | 3.06       | 0.28                 | 2.29            | 5.64            | 1.04 | 3.92                         | 1065.0  | class_0 |
| 1   | 13.20   | 1.78       | 2.14 | 11.2              | 100.0     | 2.65          | 2.76       | 0.26                 | 1.28            | 4.38            | 1.05 | 3.40                         | 1050.0  | class_0 |
| 2   | 13.16   | 2.36       | 2.67 | 18.6              | 101.0     | 2.80          | 3.24       | 0.30                 | 2.81            | 5.68            | 1.03 | 3.17                         | 1185.0  | class_0 |
| 3   | 14.37   | 1.95       | 2.50 | 16.8              | 113.0     | 3.85          | 3.49       | 0.24                 | 2.18            | 7.80            | 0.86 | 3.45                         | 1480.0  | class_0 |
| 4   | 13.24   | 2.59       | 2.87 | 21.0              | 118.0     | 2.80          | 2.69       | 0.39                 | 1.82            | 4.32            | 1.04 | 2.93                         | 735.0   | class_0 |
| ... | ...     | ...        | ...  | ...               | ...       | ...           | ...        | ...                  | ...             | ...             | ...  | ...                          | ...     | ...     |
| 173 | 13.71   | 5.65       | 2.45 | 20.5              | 95.0      | 1.68          | 0.61       | 0.52                 | 1.06            | 7.70            | 0.64 | 1.74                         | 740.0   | class_2 |
| 174 | 13.40   | 3.91       | 2.48 | 23.0              | 102.0     | 1.80          | 0.75       | 0.43                 | 1.41            | 7.30            | 0.70 | 1.56                         | 750.0   | class_2 |
| 175 | 13.27   | 4.28       | 2.26 | 20.0              | 120.0     | 1.59          | 0.69       | 0.43                 | 1.35            | 10.20           | 0.59 | 1.56                         | 835.0   | class_2 |
| 176 | 13.17   | 2.59       | 2.37 | 20.0              | 120.0     | 1.65          | 0.68       | 0.53                 | 1.46            | 9.30            | 0.60 | 1.62                         | 840.0   | class_2 |
| 177 | 14.13   | 4.10       | 2.74 | 24.5              | 96.0      | 2.05          | 0.76       | 0.56                 | 1.35            | 9.20            | 0.61 | 1.60                         | 560.0   | class_2 |

178 rows × 14 columns
# Split the data into a training set and a test set
# (no random_state is fixed here, so the split, and every score below, varies between runs)
X_train, X_test, Y_train, Y_test = train_test_split(wine.data, wine.target, test_size=0.3)

clf = tree.DecisionTreeClassifier(criterion="entropy")
clf = clf.fit(X_train, Y_train)
score = clf.score(X_test, Y_test)  # returns the accuracy on the test set
score
0.9444444444444444
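Because no random_state is fixed in the split above, a single hold-out score can swing noticeably from run to run. A steadier estimate comes from cross-validation; a minimal sketch (the cv=10 choice is mine, not from the original post):

from sklearn.model_selection import cross_val_score

# Score the same model on 10 different train/test folds and average
cv_clf = tree.DecisionTreeClassifier(criterion="entropy")
cv_scores = cross_val_score(cv_clf, wine.data, wine.target, cv=10)
print(cv_scores.mean(), cv_scores.std())  # mean accuracy and its spread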
import graphviz  # graphviz must be installed beforehand

dot_data = tree.export_graphviz(clf
                                , feature_names=wine.feature_names  # feature names
                                , class_names=wine.target_names     # class label names
                                , filled=True                       # fill nodes with color
                                , rounded=True                      # rounded node borders
                                )
graph = graphviz.Source(dot_data)
graph
clf.feature_importances_  # inspect each feature's importance; features not used in the tree get importance 0
array([0. , 0. , 0. , 0. , 0. ,
0. , 0.3918564 , 0. , 0. , 0.1160134 ,
0.02128596, 0. , 0.47084424])
dict(zip(wine.feature_names, clf.feature_importances_))  # pair each feature name with its importance
{'alcohol': 0.0,
'malic_acid': 0.0,
'ash': 0.0,
'alcalinity_of_ash': 0.0,
'magnesium': 0.0,
'total_phenols': 0.0,
'flavanoids': 0.26190367697120653,
'nonflavanoid_phenols': 0.0,
'proanthocyanins': 0.0,
'color_intensity': 0.11601339710491781,
'hue': 0.0,
'od280/od315_of_diluted_wines': 0.15123868318487035,
'proline': 0.47084424273900527}
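The values in this dict differ from the array above, which suggests the tree was refit between the two cells (remember the split has no fixed random_state). To rank the features at a glance, the importances can be turned into a sorted pandas Series; a small sketch of my own, not from the original post:

# pd was imported earlier; sort importances in descending order (zeros are unused features)
imp = pd.Series(clf.feature_importances_, index=wine.feature_names)
print(imp.sort_values(ascending=False))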
Increasing the randomness of the decision tree

- A decision tree's randomness tends to pay off on high-dimensional datasets
- On low-dimensional datasets (such as the iris dataset), the added randomness does not perform as well
clf = tree.DecisionTreeClassifier(criterion="entropy"
                                  , random_state=50  # random seed
                                  # splitter defaults to "best": even with randomness, the split is
                                  # still made on the most important feature (the importances above).
                                  # "random" makes the tree more random, so it grows larger and deeper.
                                  , splitter="random"
                                  )
clf = clf.fit(X_train, Y_train)
score = clf.score(X_test, Y_test)  # returns the accuracy on the test set
score
0.8888888888888888
import graphviz

dot_data = tree.export_graphviz(clf
                                , feature_names=wine.feature_names  # feature names
                                , class_names=wine.target_names     # class label names
                                , filled=True                       # fill nodes with color
                                , rounded=True                      # rounded node borders
                                )
graph = graphviz.Source(dot_data)
graph
Pruning parameters: min_samples_leaf & min_samples_split

- The goal is to give the decision tree stronger generalization ability
- Limit the maximum depth of the tree (max_depth); starting at 3 and trying deeper values gradually is recommended
- Require a minimum number of samples in every leaf node (min_samples_leaf)
- Require a minimum number of samples before an internal node may be split (min_samples_split)
import graphviz

clf = tree.DecisionTreeClassifier(criterion="entropy"
                                  , random_state=50
                                  , splitter="random"
                                  , max_depth=3
                                  , min_samples_leaf=10   # prune away leaves holding fewer than 10 samples
                                  , min_samples_split=10  # only split internal nodes holding at least 10 samples
                                  )
clf = clf.fit(X_train, Y_train)
score = clf.score(X_test, Y_test)  # returns the accuracy on the test set
print(score)
0.8518518518518519
dot_data = tree.export_graphviz(clf
                                , feature_names=wine.feature_names  # feature names
                                , class_names=wine.target_names     # class label names
                                , filled=True                       # fill nodes with color
                                , rounded=True                      # rounded node borders
                                )
graph = graphviz.Source(dot_data)
graph
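How much did the pruning actually shrink the tree? A fitted classifier exposes get_depth() and get_n_leaves() (available in recent sklearn versions); a quick check of my own, not from the original post:

# Compare the pruned tree's size against an unpruned one trained on the same data
full = tree.DecisionTreeClassifier(criterion="entropy", random_state=50).fit(X_train, Y_train)
print(clf.get_depth(), clf.get_n_leaves())    # capped at max_depth=3, few leaves
print(full.get_depth(), full.get_n_leaves())  # typically deeper, with more leaves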
max_features & min_impurity_decrease

- max_features: an upper limit on the number of features considered; features beyond the limit are discarded. It acts as a crude form of dimensionality reduction and is rarely used.
- min_impurity_decrease: a lower bound on the information gain; when a split's gain falls below this value, the branch is not grown any further.
import graphviz

clf = tree.DecisionTreeClassifier(criterion="entropy"
                                  , random_state=50
                                  # , splitter="random"
                                  , max_depth=5
                                  # , min_samples_leaf=10   # prune away leaves holding fewer than 10 samples
                                  # , min_samples_split=10  # only split internal nodes holding at least 10 samples
                                  # , max_features=2
                                  , min_impurity_decrease=0.1
                                  )
clf = clf.fit(X_train, Y_train)
score = clf.score(X_test, Y_test)  # returns the accuracy on the test set
print(score)
0.9444444444444444
dot_data = tree.export_graphviz(clf
                                , feature_names=wine.feature_names  # feature names
                                , class_names=wine.target_names     # class label names
                                , filled=True                       # fill nodes with color
                                , rounded=True                      # rounded node borders
                                )
graph = graphviz.Source(dot_data)
graph
Finding the optimal parameter: plotting a learning curve
import matplotlib.pyplot as plt

depth_scores = []  # test accuracy for each tried depth
for dep in range(1, 10):
    clf = tree.DecisionTreeClassifier(criterion="entropy"
                                      , max_depth=dep
                                      )
    clf = clf.fit(X_train, Y_train)
    score = clf.score(X_test, Y_test)  # returns the accuracy on the test set
    depth_scores.append(score)
plt.plot(range(1, 10), depth_scores)
plt.show()
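Scanning one parameter at a time gets tedious; a cross-validated grid search can sweep several pruning parameters at once. A minimal sketch using GridSearchCV (the grid below is my own illustrative choice):

from sklearn.model_selection import GridSearchCV

# Search all combinations of depth and leaf size with 5-fold cross-validation
param_grid = {
    "max_depth": list(range(1, 10)),
    "min_samples_leaf": [1, 5, 10],
}
gs = GridSearchCV(tree.DecisionTreeClassifier(criterion="entropy"), param_grid, cv=5)
gs.fit(X_train, Y_train)
print(gs.best_params_, gs.best_score_)  # best combination and its cross-validated accuracy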
Class weight parameters

- class_weight & min_weight_fraction_leaf
- Note: sklearn's interfaces do not accept one-dimensional arrays as feature input; a single sample must be reshaped to 2-D first

class_weight  # per-class weights; accepts a dict (or a list of dicts), or "balanced"
min_weight_fraction_leaf  # weight-based pruning parameter, used together with class_weight; being weight-aware, it is less biased toward the dominant class than min_samples_leaf
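A short sketch of how class_weight might be passed in practice (both options below are illustrative, not from the original post):

# "balanced" reweights classes inversely to their frequencies in Y_train
clf = tree.DecisionTreeClassifier(criterion="entropy", class_weight="balanced")

# or weight class 2 three times as heavily as the others, explicitly
clf = tree.DecisionTreeClassifier(criterion="entropy", class_weight={0: 1, 1: 1, 2: 3})
clf = clf.fit(X_train, Y_train)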
Other common interfaces

# Return the index of the leaf node that each sample falls into
clf.apply(X_test)
array([ 5, 5, 5, 4, 3, 5, 5, 5, 5, 10, 10, 5, 5, 3, 10, 10, 10,
5, 4, 5, 10, 4, 5, 10, 5, 5, 4, 5, 4, 4, 5, 4, 4, 10,
10, 5, 4, 5, 5, 5, 4, 10, 10, 10, 5, 5, 10, 4, 10, 10, 5,
5, 5, 10], dtype=int64)
# Return the predicted labels
clf.predict(X_test)
array([1, 1, 1, 2, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 2, 1, 0, 2,
1, 0, 1, 1, 2, 1, 2, 2, 1, 2, 2, 0, 0, 1, 2, 1, 1, 1, 2, 0, 0, 0,
1, 1, 0, 2, 0, 0, 1, 1, 1, 0])
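Two more things worth knowing: predict_proba returns per-class probability estimates, and, as noted above, a single sample must be made 2-D before it can be passed to any of these methods. A small sketch of my own:

# One row per test sample, one column per class
proba = clf.predict_proba(X_test)

# Predicting a single sample: reshape the 1-D row to shape (1, n_features)
one_sample = X_test[0].reshape(1, -1)
print(clf.predict(one_sample), proba[0])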