Tree Regression: Building Regression Trees and Model Trees with the CART Algorithm (Code Notes)
阿新 • Published: 2019-01-02
Classification And Regression Trees (CART) is a supervised learning method for constructing tree models.
Compared with the ID3 decision tree:
1. ID3 splits the data directly on the best feature each time: if the current feature has 4 possible values, the data is split into 4 subsets. It handles nominal data and cannot work with continuous values directly. CART instead uses binary splits to handle continuous variables: at each step it finds the best feature and a threshold for it, and splits the data set into two parts, the left subtree and the right subtree.
2. CART uses a variance-based measure in place of Shannon entropy, but the goal is the same: to find the best feature to split on.
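Concretely, the impurity of a node D is the total squared error of its target values,

    S(D) = Σ_{i∈D} (y_i − ȳ)² = Var(y_D) · |D|,

and the best split is the feature/threshold pair minimizing S(D_left) + S(D_right). This is exactly what regErr() and chooseBestSplit() compute in the code below.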
import numpy as np

'''
CART handles continuous variables with binary splits.
A regression tree is similar to a classification tree, except that the leaf
nodes hold continuous rather than discrete values.
(Not truly "continuous" either: a split still keys on attribute values; those
values just happen to be floats.)
Two kinds of CART follow: the regression tree and the model tree.
'''

def loadData(filename):
    dataM = []
    with open(filename) as fr:
        for line in fr.readlines():
            curLine = line.strip().split('\t')
            fltLine = list(map(float, curLine))  # store each row as a list of floats
            dataM.append(fltLine)
    return dataM

# ----------------- Regression tree: each leaf holds a single constant -----------------

def regLeaf(data):
    # Builds a leaf node (a constant) when the data need not be split further
    return np.mean(data[:, -1])

def regErr(data):
    # Error is the total squared deviation: variance times the sample count
    return np.var(data[:, -1]) * np.shape(data)[0]

# Find the best feature and threshold to split on
def chooseBestSplit(data, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    tolS = ops[0]  # minimum error reduction required to keep splitting
    tolN = ops[1]  # minimum number of samples allowed on each side of a split
    if len(set(data[:, -1].T.tolist()[0])) == 1:  # all target values identical (one class)
        return None, leafType(data)
    m, n = np.shape(data)
    S = errType(data)  # total squared error of the target variable
    bestS = np.inf
    bestIdx = 0
    bestVal = 0
    for featIdx in range(n - 1):
        for splitVal in set(data[:, featIdx].T.tolist()[0]):
            mat0, mat1 = binSplitData(data, featIdx, splitVal)
            if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN):
                continue  # split leaves too few samples, skip it
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIdx = featIdx
                bestVal = splitVal
                bestS = newS
    if (S - bestS) < tolS:
        return None, leafType(data)  # exit if the error reduction is too small
    mat0, mat1 = binSplitData(data, bestIdx, bestVal)
    if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN):
        return None, leafType(data)  # exit if either split is too small
    return bestIdx, bestVal

# Split the data set in two on a feature/value pair
def binSplitData(data, feature, value):
    mat0 = data[np.nonzero(data[:, feature] > value)[0], :]   # left subtree
    mat1 = data[np.nonzero(data[:, feature] <= value)[0], :]  # right subtree
    return mat0, mat1

def createTree(data, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    feat, val = chooseBestSplit(data, leafType, errType, ops)
    if feat is None:  # chooseBestSplit() decided not to split the data further
        return val    # val is the leaf built by leafType() (here a constant: the target mean)
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lfData, rtData = binSplitData(data, feat, val)
    retTree['left'] = createTree(lfData, leafType, errType, ops)
    retTree['right'] = createTree(rtData, leafType, errType, ops)
    return retTree

# ----------------- Model tree: each leaf holds a linear equation -----------------

def linearNode(data):
    m, n = np.shape(data)
    x = np.mat(np.ones((m, n)))
    x[:, 1:n] = data[:, 0:n-1]  # column 0 stays 1 for the intercept term
    y = data[:, -1]
    xTx = x.T * x
    if np.linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse')
    w = xTx.I * (x.T * y)  # ordinary least squares
    return w, x, y

def modelLeaf(data):
    # Builds a leaf node (a linear function) when the data need not be split further
    w, x, y = linearNode(data)
    return w

def modelErr(data):
    # Error is the sum of squared residuals of the linear fit
    w, x, y = linearNode(data)
    yHat = x * w
    return np.sum(np.power(y - yHat, 2))

# A model tree is built with the same createTree(); just pass the model-tree leaf
# and error functions. feat being None again means chooseBestSplit() chose not to
# split, and val is then the leaf built by leafType() (here a line: regression
# coefficients):
#     createTree(data, leafType=modelLeaf, errType=modelErr, ops=(1, 4))

# ----------------- Prediction with regression trees and model trees -----------------

def regTreeEval(treeNode, xdata):
    # Leaf node is a constant value (xdata is unused; kept for a uniform interface)
    return float(treeNode)

def modelTreeEval(treeNode, xdata):
    # Leaf node holds regression coefficients; augment the input with a 1 for the intercept
    n = np.shape(xdata)[1]
    x = np.mat(np.ones((1, n + 1)))
    x[:, 1:n+1] = xdata
    return float(x * treeNode)

def isTree(obj):
    return (type(obj).__name__ == 'dict')

# modelEval selects the tree type, i.e. how the two kinds of leaves are evaluated
def treePredict(tree, xTest, modelEval=regTreeEval):
    if not isTree(tree):
        return modelEval(tree, xTest)
    if xTest[tree['spInd']] > tree['spVal']:  # feature value above threshold: left subtree
        if isTree(tree['left']):              # left subtree branches further
            return treePredict(tree['left'], xTest, modelEval)
        else:                                 # left subtree is already a leaf
            return modelEval(tree['left'], xTest)
    else:                                     # feature value at or below threshold: right subtree
        if isTree(tree['right']):
            return treePredict(tree['right'], xTest, modelEval)
        else:
            return modelEval(tree['right'], xTest)
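To round out the notes, here is a minimal usage sketch. The piecewise data below is made up for illustration (the seed, sizes, and jump point are arbitrary assumptions, not the book's sample files):

import numpy as np

# Toy piecewise data: y jumps from about 1.0 to about 3.0 at x = 0.5
np.random.seed(0)
x = np.random.rand(200, 1)
y = np.where(x[:, 0] < 0.5, 1.0, 3.0) + 0.05 * np.random.randn(200)
data = np.mat(np.column_stack((x, y)))

# Regression tree: leaves are constants, so predictions are step-like
regTree = createTree(data, regLeaf, regErr, ops=(1, 4))
print(treePredict(regTree, [0.2], regTreeEval))  # close to 1.0
print(treePredict(regTree, [0.8], regTreeEval))  # close to 3.0

# Model tree: leaves are regression coefficients; test points are passed as
# 1 x n matrices so modelTreeEval() can build the augmented input
modTree = createTree(data, modelLeaf, modelErr, ops=(1, 4))
print(treePredict(modTree, np.mat([[0.2]]), modelTreeEval))  # close to 1.0
print(treePredict(modTree, np.mat([[0.8]]), modelTreeEval))  # close to 3.0

With ops=(1, 4), tolS=1 stops splitting once a split no longer reduces the total squared error by at least 1, so on this data both trees should end up with essentially a single split near x = 0.5.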