1. 程式人生 > 實用技巧 >sklearn:決策分類樹_紅酒資料集

sklearn:決策分類樹_紅酒資料集

from sklearn import tree
from sklearn.datasets import load_wine  # bundled wine classification data
from sklearn.model_selection import train_test_split

# Load the wine dataset: 178 samples, 13 numeric features, 3 classes.
wine = load_wine()
wine  # notebook-style echo of the raw Bunch object
{'data': array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
         1.065e+03],
        [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
         1.050e+03],
        [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
         1.185e+03],
        ...,
        [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
         8.350e+02],
        [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
         8.400e+02],
        [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
         5.600e+02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2]),
 'target_names': array(['class_0', 'class_1', 'class_2'], dtype='<U7'),
 'DESCR': '......rics).\n',
 'feature_names': ['alcohol',
  'malic_acid',
  'ash',
  'alcalinity_of_ash',
  'magnesium',
  'total_phenols',
  'flavanoids',
  'nonflavanoid_phenols',
  'proanthocyanins',
  'color_intensity',
  'hue',
  'od280/od315_of_diluted_wines',
  'proline']}
import pandas as pd

# Join the feature matrix and the target vector into one DataFrame.
features = pd.DataFrame(wine.data)
labels = pd.DataFrame(wine.target)
wine_df = pd.concat([features, labels], axis=1)

# Attach readable column names, then replace the integer targets
# (0/1/2) with their class-name strings for display.
wine_df.columns = list(wine.feature_names) + ['target']
wine_df['target'] = wine_df['target'].map(dict(zip(range(3), wine.target_names)))
wine_df
alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue od280/od315_of_diluted_wines proline target
0 14.23 1.71 2.43 15.6 127.0 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065.0 class_0
1 13.20 1.78 2.14 11.2 100.0 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050.0 class_0
2 13.16 2.36 2.67 18.6 101.0 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185.0 class_0
3 14.37 1.95 2.50 16.8 113.0 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480.0 class_0
4 13.24 2.59 2.87 21.0 118.0 2.80 2.69 0.39 1.82 4.32 1.04 2.93 735.0 class_0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
173 13.71 5.65 2.45 20.5 95.0 1.68 0.61 0.52 1.06 7.70 0.64 1.74 740.0 class_2
174 13.40 3.91 2.48 23.0 102.0 1.80 0.75 0.43 1.41 7.30 0.70 1.56 750.0 class_2
175 13.27 4.28 2.26 20.0 120.0 1.59 0.69 0.43 1.35 10.20 0.59 1.56 835.0 class_2
176 13.17 2.59 2.37 20.0 120.0 1.65 0.68 0.53 1.46 9.30 0.60 1.62 840.0 class_2
177 14.13 4.10 2.74 24.5 96.0 2.05 0.76 0.56 1.35 9.20 0.61 1.60 560.0 class_2

178 rows × 14 columns

# Split the data into a training set (70%) and a test set (30%).
# random_state pins the shuffle so the reported accuracy is reproducible;
# without it every run produces a different split and a different score.
X_train, X_test, Y_train, Y_test = train_test_split(
    wine.data, wine.target, test_size=0.3, random_state=420)

# Fit an entropy-criterion decision tree and measure held-out accuracy.
clf = tree.DecisionTreeClassifier(criterion="entropy")
clf = clf.fit(X_train, Y_train)
score = clf.score(X_test, Y_test)  # mean accuracy on the test set
score
0.9444444444444444
import graphviz  # the graphviz package must be installed separately

# Export the fitted tree to DOT format and render it as a diagram.
dot_data = tree.export_graphviz(
    clf,
    feature_names=wine.feature_names,  # human-readable feature labels
    class_names=wine.target_names,     # human-readable class labels
    filled=True,                       # colour nodes by majority class
    rounded=True,                      # rounded node corners
)
graph = graphviz.Source(dot_data)
graph

clf.feature_importances_  # per-feature importance; features never used in a split get importance 0
array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.3918564 , 0.        , 0.        , 0.1160134 ,
       0.02128596, 0.        , 0.47084424])
# Pair each feature name with its importance score.
{name: importance for name, importance in zip(wine.feature_names, clf.feature_importances_)}
{'alcohol': 0.0,
 'malic_acid': 0.0,
 'ash': 0.0,
 'alcalinity_of_ash': 0.0,
 'magnesium': 0.0,
 'total_phenols': 0.0,
 'flavanoids': 0.26190367697120653,
 'nonflavanoid_phenols': 0.0,
 'proanthocyanins': 0.0,
 'color_intensity': 0.11601339710491781,
 'hue': 0.0,
 'od280/od315_of_diluted_wines': 0.15123868318487035,
 'proline': 0.47084424273900527}

增加決策樹隨機性

  • 決策樹的隨機性在高維度的資料集中表現的會比較好
  • 在低維度資料集(比如鳶尾花資料集中),隨機性就表現得不夠好
# splitter="random" makes the split-feature choice stochastic instead of
# always taking the best (most important) feature; the resulting tree
# tends to grow larger and deeper.
clf = tree.DecisionTreeClassifier(
    criterion="entropy",
    random_state=50,    # random seed for reproducibility
    splitter="random",
)
clf = clf.fit(X_train, Y_train)
score = clf.score(X_test, Y_test)  # test-set accuracy
score
0.8888888888888888
import graphviz

# Visualise the randomised tree the same way as before.
dot_data = tree.export_graphviz(
    clf,
    feature_names=wine.feature_names,  # feature labels
    class_names=wine.target_names,     # class labels
    filled=True,                       # colour fill per class
    rounded=True,                      # rounded boxes
)
graph = graphviz.Source(dot_data)
graph


剪枝引數:min_samples_leaf & min_samples_split

  • 為了使決策樹具有更大的泛化能力
  • 限制樹的最大深度,建議從3開始逐漸嘗試
  • 限制葉子節點數量
  • 限制劃分節點數量
import graphviz

# Pre-pruning: cap the depth and drop tiny nodes so the tree generalises better.
clf = tree.DecisionTreeClassifier(
    criterion="entropy",
    random_state=50,
    splitter="random",
    max_depth=3,           # limit tree depth (start from 3 and tune upward)
    min_samples_leaf=10,   # prune leaves holding fewer than 10 samples
    min_samples_split=10,  # an internal node needs >= 10 samples to split
)
clf = clf.fit(X_train, Y_train)
score = clf.score(X_test, Y_test)  # test-set accuracy
print(score)

dot_data = tree.export_graphviz(
    clf,
    feature_names=wine.feature_names,  # feature labels
    class_names=wine.target_names,     # class labels
    filled=True,                       # colour fill per class
    rounded=True,                      # rounded boxes
)
graph = graphviz.Source(dot_data)
graph
0.8518518518518519


max_features & min_impurity_decrease

  • max_features:最大特徵數量限制,超過限制的特徵會被捨棄,是一種降維方式,使用較少
  • min_impurity_decrease:限制資訊增益大小,當資訊增益小於這個值,就不再進行分支了
import graphviz

# Stop splitting once the information gain falls below a threshold.
# (splitter="random", min_samples_leaf/split and max_features are left
# at their defaults here; enable them to combine pruning strategies.)
clf = tree.DecisionTreeClassifier(
    criterion="entropy",
    random_state=50,
    max_depth=5,
    min_impurity_decrease=0.1,  # require at least 0.1 impurity decrease per split
)
clf = clf.fit(X_train, Y_train)
score = clf.score(X_test, Y_test)  # test-set accuracy
print(score)

dot_data = tree.export_graphviz(
    clf,
    feature_names=wine.feature_names,  # feature labels
    class_names=wine.target_names,     # class labels
    filled=True,                       # colour fill per class
    rounded=True,                      # rounded boxes
)
graph = graphviz.Source(dot_data)
graph
0.9444444444444444


確認最優引數,畫學習曲線

import matplotlib.pyplot as plt

# Learning curve over max_depth: fit one tree per depth 1..9
# and record the test-set accuracy of each.
depth_scores = []
for depth in range(1, 10):
    clf = tree.DecisionTreeClassifier(
        criterion="entropy",
        max_depth=depth,
    )
    clf = clf.fit(X_train, Y_train)
    score = clf.score(X_test, Y_test)  # test-set accuracy
    depth_scores.append(score)

plt.plot(range(1, 10), depth_scores)


目標權重引數

  • class_weight & min_weight_fraction_leaf
  • 注意:sklearn不接受一維矩陣
# class_weight  # 目標型別的權重,其資料型別為dict或者列表內的dict,或者為"balanced"
# min_weight_fraction_leaf  # 權重剪枝引數,搭配目標權重使用,比min_samples_leaf更偏向於主導類

其他常用介面

# Return the index of the leaf node each test sample ends up in.
clf.apply(X_test)
array([ 5,  5,  5,  4,  3,  5,  5,  5,  5, 10, 10,  5,  5,  3, 10, 10, 10,
        5,  4,  5, 10,  4,  5, 10,  5,  5,  4,  5,  4,  4,  5,  4,  4, 10,
       10,  5,  4,  5,  5,  5,  4, 10, 10, 10,  5,  5, 10,  4, 10, 10,  5,
        5,  5, 10], dtype=int64)
# Return the predicted class label for each test sample.
# (Fixed: the original call was missing its closing parenthesis,
# which is a SyntaxError as written.)
clf.predict(X_test)
array([1, 1, 1, 2, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 2, 1, 0, 2,
       1, 0, 1, 1, 2, 1, 2, 2, 1, 2, 2, 0, 0, 1, 2, 1, 1, 1, 2, 0, 0, 0,
       1, 1, 0, 2, 0, 0, 1, 1, 1, 0])