Li Hang, Statistical Learning Methods: Chapter 5, Decision Tree Models
By 阿新 · Published 2019-02-09
Since derivations of the theory are widely available online, this post skips them and only gives the blogger's Python implementations, for reference.
Applicable problem: multi-class classification.
Three steps: feature selection, decision-tree generation, and decision-tree pruning.
Common decision-tree algorithms (their splitting criteria are written out right after this list):
- ID3: splits on information gain
- C4.5: splits on information gain ratio
- CART: splits on the Gini index
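For reference, the three criteria in the book's notation, where D is the training set, C_k the subset of D with class k, and D_i the subset on which feature A takes its i-th value:

$$H(D) = -\sum_{k=1}^{K} \frac{|C_k|}{|D|} \log_2 \frac{|C_k|}{|D|}$$
$$g(D,A) = H(D) - H(D \mid A), \quad H(D \mid A) = \sum_{i=1}^{n} \frac{|D_i|}{|D|} H(D_i)$$
$$g_R(D,A) = \frac{g(D,A)}{H_A(D)}, \quad H_A(D) = -\sum_{i=1}^{n} \frac{|D_i|}{|D|} \log_2 \frac{|D_i|}{|D|}$$
$$\mathrm{Gini}(D) = 1 - \sum_{k=1}^{K} \left(\frac{|C_k|}{|D|}\right)^2$$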
Test data set: train.csv
ID3 algorithm code:
# encoding=utf-8
import cv2
import time
import numpy as np
import pandas as pd
# sklearn.cross_validation was removed in scikit-learn 0.20; use model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Binarization: map pixels > 50 to 0 and pixels <= 50 to 1 (THRESH_BINARY_INV)
def binaryzation(img):
    cv_img = img.astype(np.uint8)
    cv2.threshold(cv_img, 50, 1, cv2.THRESH_BINARY_INV, cv_img)
    return cv_img

# Turn every image in the training set into a flat vector of binary features
def binaryzation_features(trainset):
    features = []
    for img in trainset:
        img = np.reshape(img, (28, 28))
        img_b = binaryzation(img.astype(np.uint8))
        features.append(img_b)
    features = np.array(features)
    features = np.reshape(features, (-1, feature_len))
    return features
class Tree(object):
    def __init__(self, node_type, Class=None, feature=None):
        self.node_type = node_type  # node type: 'internal' or 'leaf'
        self.dict = {}              # maps each value a_i of the splitting feature A_g to its subtree
        self.Class = Class          # class of a leaf node; None for internal nodes
        self.feature = feature      # index of the feature this node splits on (the feature with maximal information gain at this node)

    def add_tree(self, key, tree):
        self.dict[key] = tree

    def predict(self, features):
        # A leaf returns its class; a feature value never seen during training
        # falls back to self.Class (None on internal nodes)
        if self.node_type == 'leaf' or (features[self.feature] not in self.dict):
            return self.Class
        tree = self.dict.get(features[self.feature])
        return tree.predict(features)
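# Minimal usage sketch (hypothetical stump, not part of the original post):
# a root that tests feature 0 and routes to two leaves.
#   root = Tree('internal', feature=0)
#   root.add_tree(0, Tree('leaf', Class=3))
#   root.add_tree(1, Tree('leaf', Class=7))
#   root.predict(np.array([1, 0]))   # -> 7
#   root.predict(np.array([2, 0]))   # unseen value 2 -> falls back to None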
# Empirical entropy H(X) of a value vector x
def calc_ent(x):
    x_value_list = set([x[i] for i in range(x.shape[0])])
    ent = 0.0
    for x_value in x_value_list:
        p = float(x[x == x_value].shape[0]) / x.shape[0]
        logp = np.log2(p)
        ent -= p * logp
    return ent
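# Sanity check (assumed interactive session, not in the original post):
#   calc_ent(np.array([0, 0, 1, 1]))  # -> 1.0, two equally likely values
#   calc_ent(np.array([1, 1, 1, 1]))  # -> 0.0, a pure set has zero entropy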
# Conditional entropy H(Y|X)
def calc_condition_ent(x, y):
    x_value_list = set([x[i] for i in range(x.shape[0])])
    ent = 0.0
    for x_value in x_value_list:
        sub_y = y[x == x_value]
        temp_ent = calc_ent(sub_y)
        ent += (float(sub_y.shape[0]) / y.shape[0]) * temp_ent
    return ent
# Information gain g(D,A) = H(D) - H(D|A)
def calc_ent_grap(x, y):
    base_ent = calc_ent(y)
    condition_ent = calc_condition_ent(x, y)
    ent_grap = base_ent - condition_ent
    return ent_grap
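# Sanity check (assumed interactive session, not in the original post):
#   calc_ent_grap(np.array([0, 0, 1, 1]), np.array([0, 0, 1, 1]))  # -> 1.0, feature determines the label
#   calc_ent_grap(np.array([0, 1, 0, 1]), np.array([0, 0, 1, 1]))  # -> 0.0, feature is uninformative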
# ID3 algorithm
def recurse_train(train_set, train_label, features):
    LEAF = 'leaf'
    INTERNAL = 'internal'

    # Step 1: if all instances in train_set belong to a single class Ck, return a leaf
    label_set = set(train_label)
    if len(label_set) == 1:
        return Tree(LEAF, Class=label_set.pop())

    # Step 2: if the feature set is empty, return a leaf labelled with the majority class
    class_len = [(i, len(list(filter(lambda x: x == i, train_label)))) for i in range(class_num)]  # occurrences of each class
    (max_class, max_len) = max(class_len, key=lambda x: x[1])
    if len(features) == 0:
        return Tree(LEAF, Class=max_class)

    # Step 3: compute the information gain of every feature and pick the maximum
    max_feature = 0
    max_gda = 0
    D = train_label
    for feature in features:
        A = np.array(train_set[:, feature].flat)  # column `feature` of the training set, i.e. the feature's values
        gda = calc_ent_grap(A, D)
        if gda > max_gda:
            max_gda, max_feature = gda, feature

    # Step 4: if the maximal gain is below the threshold epsilon, return a leaf
    if max_gda < epsilon:
        return Tree(LEAF, Class=max_class)

    # Step 5: split on the chosen feature and recurse on each non-empty subset
    sub_features = list(filter(lambda x: x != max_feature, features))
    tree = Tree(INTERNAL, feature=max_feature)
    max_feature_col = np.array(train_set[:, max_feature].flat)
    feature_value_list = set([max_feature_col[i] for i in range(max_feature_col.shape[0])])  # possible values of the chosen feature
    for feature_value in feature_value_list:
        index = []
        for i in range(len(train_label)):
            if train_set[i][max_feature] == feature_value:
                index.append(i)
        sub_train_set = train_set[index]
        sub_train_label = train_label[index]
        sub_tree = recurse_train(sub_train_set, sub_train_label, sub_features)
        tree.add_tree(feature_value, sub_tree)
    return tree
def train(train_set, train_label, features):
    return recurse_train(train_set, train_label, features)

def predict(test_set, tree):
    result = []
    for features in test_set:
        tmp_predict = tree.predict(features)
        result.append(tmp_predict)
    return np.array(result)
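# Toy end-to-end check (assumed interactive session, not in the original post):
# learn the AND of two binary features.
#   X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
#   y = np.array([0, 0, 0, 1])
#   t = train(X, y, [0, 1])
#   predict(X, t)  # -> array([0, 0, 0, 1])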
class_num = 10     # the MNIST dataset has 10 labels: 0 through 9
feature_len = 784  # each MNIST image has 28*28 = 784 features (pixels)
epsilon = 0.001    # information-gain threshold
if __name__ == '__main__':
    print("Start reading data...")
    time_1 = time.time()

    raw_data = pd.read_csv('../data/train.csv', header=0)  # read the csv data
    data = raw_data.values
    imgs = data[:, 1:]
    features = binaryzation_features(imgs)  # binarize the images (important: accuracy is very low without it)
    labels = data[:, 0]

    # Randomly hold out 33% of the data as a test set; the rest is the training set
    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.33, random_state=0)
    time_2 = time.time()
    print('reading data cost %f seconds' % (time_2 - time_1))

    # Build the decision tree with the ID3 algorithm
    print('Start training...')
    tree = train(train_features, train_labels, list(range(feature_len)))
    time_3 = time.time()
    print('training cost %f seconds' % (time_3 - time_2))

    print('Start predicting...')
    test_predict = predict(test_features, tree)
    time_4 = time.time()
    print('predicting cost %f seconds' % (time_4 - time_3))

    # Samples that reach an internal node with no matching branch are predicted None;
    # replace them with a dummy label so accuracy_score counts them as errors
    for i in range(len(test_predict)):
        if test_predict[i] is None:
            test_predict[i] = epsilon
    score = accuracy_score(test_labels, test_predict)
    print("The accuracy score is %f" % score)
C4.5 algorithm code (identical to the ID3 listing except for step 3, where the gain is normalized to a gain ratio):
# encoding=utf-8
import cv2
import time
import numpy as np
import pandas as pd
# sklearn.cross_validation was removed in scikit-learn 0.20; use model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Binarization: map pixels > 50 to 0 and pixels <= 50 to 1 (THRESH_BINARY_INV)
def binaryzation(img):
    cv_img = img.astype(np.uint8)
    cv2.threshold(cv_img, 50, 1, cv2.THRESH_BINARY_INV, cv_img)
    return cv_img

# Turn every image in the training set into a flat vector of binary features
def binaryzation_features(trainset):
    features = []
    for img in trainset:
        img = np.reshape(img, (28, 28))
        img_b = binaryzation(img.astype(np.uint8))
        features.append(img_b)
    features = np.array(features)
    features = np.reshape(features, (-1, feature_len))
    return features
class Tree(object):
    def __init__(self, node_type, Class=None, feature=None):
        self.node_type = node_type  # node type: 'internal' or 'leaf'
        self.dict = {}              # maps each value a_i of the splitting feature A_g to its subtree
        self.Class = Class          # class of a leaf node; None for internal nodes
        self.feature = feature      # index of the feature this node splits on (the feature with maximal gain ratio at this node)

    def add_tree(self, key, tree):
        self.dict[key] = tree

    def predict(self, features):
        # A leaf returns its class; a feature value never seen during training
        # falls back to self.Class (None on internal nodes)
        if self.node_type == 'leaf' or (features[self.feature] not in self.dict):
            return self.Class
        tree = self.dict.get(features[self.feature])
        return tree.predict(features)
# Empirical entropy H(X) of a value vector x
def calc_ent(x):
    x_value_list = set([x[i] for i in range(x.shape[0])])
    ent = 0.0
    for x_value in x_value_list:
        p = float(x[x == x_value].shape[0]) / x.shape[0]
        logp = np.log2(p)
        ent -= p * logp
    return ent
# Conditional entropy H(Y|X)
def calc_condition_ent(x, y):
    x_value_list = set([x[i] for i in range(x.shape[0])])
    ent = 0.0
    for x_value in x_value_list:
        sub_y = y[x == x_value]
        temp_ent = calc_ent(sub_y)
        ent += (float(sub_y.shape[0]) / y.shape[0]) * temp_ent
    return ent
# Information gain g(D,A) = H(D) - H(D|A)
def calc_ent_grap(x, y):
    base_ent = calc_ent(y)
    condition_ent = calc_condition_ent(x, y)
    ent_grap = base_ent - condition_ent
    return ent_grap
# C4.5 algorithm
def recurse_train(train_set, train_label, features):
    LEAF = 'leaf'
    INTERNAL = 'internal'

    # Step 1: if all instances in train_set belong to a single class Ck, return a leaf
    label_set = set(train_label)
    if len(label_set) == 1:
        return Tree(LEAF, Class=label_set.pop())

    # Step 2: if the feature set is empty, return a leaf labelled with the majority class
    class_len = [(i, len(list(filter(lambda x: x == i, train_label)))) for i in range(class_num)]  # occurrences of each class
    (max_class, max_len) = max(class_len, key=lambda x: x[1])
    if len(features) == 0:
        return Tree(LEAF, Class=max_class)

    # Step 3: compute the information gain ratio of every feature and pick the maximum
    max_feature = 0
    max_gda = 0
    D = train_label
    for feature in features:
        A = np.array(train_set[:, feature].flat)  # column `feature` of the training set, i.e. the feature's values
        gda = calc_ent_grap(A, D)
        if calc_ent(A) != 0:  # divide by the feature's own entropy to get the gain ratio -- the only difference from ID3
            gda /= calc_ent(A)
        if gda > max_gda:
            max_gda, max_feature = gda, feature

    # Step 4: if the maximal gain ratio is below the threshold epsilon, return a leaf
    if max_gda < epsilon:
        return Tree(LEAF, Class=max_class)

    # Step 5: split on the chosen feature and recurse on each non-empty subset
    sub_features = list(filter(lambda x: x != max_feature, features))
    tree = Tree(INTERNAL, feature=max_feature)
    max_feature_col = np.array(train_set[:, max_feature].flat)
    feature_value_list = set([max_feature_col[i] for i in range(max_feature_col.shape[0])])  # possible values of the chosen feature
    for feature_value in feature_value_list:
        index = []
        for i in range(len(train_label)):
            if train_set[i][max_feature] == feature_value:
                index.append(i)
        sub_train_set = train_set[index]
        sub_train_label = train_label[index]
        sub_tree = recurse_train(sub_train_set, sub_train_label, sub_features)
        tree.add_tree(feature_value, sub_tree)
    return tree
def train(train_set, train_label, features):
    return recurse_train(train_set, train_label, features)

def predict(test_set, tree):
    result = []
    for features in test_set:
        tmp_predict = tree.predict(features)
        result.append(tmp_predict)
    return np.array(result)
class_num = 10     # the MNIST dataset has 10 labels: 0 through 9
feature_len = 784  # each MNIST image has 28*28 = 784 features (pixels)
epsilon = 0.001    # gain-ratio threshold
if __name__ == '__main__':
    print("Start reading data...")
    time_1 = time.time()

    raw_data = pd.read_csv('../data/train.csv', header=0)  # read the csv data
    data = raw_data.values
    imgs = data[:, 1:]
    features = binaryzation_features(imgs)  # binarize the images (important: accuracy is very low without it)
    labels = data[:, 0]

    # Randomly hold out 33% of the data as a test set; the rest is the training set
    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.33, random_state=0)
    time_2 = time.time()
    print('reading data cost %f seconds' % (time_2 - time_1))

    # Build the decision tree with the C4.5 algorithm
    print('Start training...')
    tree = train(train_features, train_labels, list(range(feature_len)))
    time_3 = time.time()
    print('training cost %f seconds' % (time_3 - time_2))

    print('Start predicting...')
    test_predict = predict(test_features, tree)
    time_4 = time.time()
    print('predicting cost %f seconds' % (time_4 - time_3))

    # Samples that reach an internal node with no matching branch are predicted None;
    # replace them with a dummy label so accuracy_score counts them as errors
    for i in range(len(test_predict)):
        if test_predict[i] is None:
            test_predict[i] = epsilon
    score = accuracy_score(test_labels, test_predict)
    print("The accuracy score is %f" % score)
The code is available at decision_tree/C45.py.
CART algorithm code (implemented with sklearn):
# encoding=utf-8
import pandas as pd
import time
# sklearn.cross_validation was removed in scikit-learn 0.20; use model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

if __name__ == '__main__':
    print("Start reading data...")
    time_1 = time.time()

    raw_data = pd.read_csv('../data/train.csv', header=0)
    data = raw_data.values
    features = data[:, 1:]
    labels = data[:, 0]

    # Randomly hold out 33% of the data as a test set; the rest is the training set
    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.33, random_state=0)
    time_2 = time.time()
    print('reading data cost %f seconds' % (time_2 - time_1))

    print('Start training...')
    # criterion is 'gini' (the default, corresponding to CART) or 'entropy' (information gain, ID3-style)
    clf = DecisionTreeClassifier(criterion='gini')
    clf.fit(train_features, train_labels)
    time_3 = time.time()
    print('training cost %f seconds' % (time_3 - time_2))

    print('Start predicting...')
    test_predict = clf.predict(test_features)
    time_4 = time.time()
    print('predicting cost %f seconds' % (time_4 - time_3))

    score = accuracy_score(test_labels, test_predict)
    print("The accuracy score is %f" % score)