決策樹(挖坑待填)
阿新 • • 發佈:2021-01-07
決策樹
ID3 和 C4.5
from math import log
import operator


def calcShannonEnt(dataSet, id):
    """Return the Shannon entropy of column `id` of dataSet.

    `id` selects the column to measure (use -1 for the class-label column).
    An empty dataset has zero entropy by convention.
    """
    numEntries = len(dataSet)
    if numEntries == 0:
        # guard: splitDataSet can hand us an empty subset
        return 0.0
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[id]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = count / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt


def createDataSet():
    """Return the toy loan-approval dataset and its feature names."""
    dataSet = [[0, 0, 0, 0, 'no'],
               [0, 0, 0, 1, 'no'],
               [0, 1, 0, 1, 'yes'],
               [0, 1, 1, 0, 'yes'],
               [0, 0, 0, 0, 'no'],
               [1, 0, 0, 0, 'no'],
               [1, 0, 0, 1, 'no'],
               [1, 1, 1, 1, 'yes'],
               [1, 0, 1, 2, 'yes'],
               [1, 0, 1, 2, 'yes'],
               [2, 0, 1, 2, 'yes'],
               [2, 0, 1, 1, 'yes'],
               [2, 1, 0, 1, 'yes'],
               [2, 1, 0, 2, 'yes'],
               [2, 0, 0, 0, 'no']]
    # feature names: age, has-job, owns-house, credit rating
    labels = ['年齡', '有工作', '有自己的房子', '信貸情況']
    return dataSet, labels


def splitDataSet(dataSet, axis, value):
    """Return rows whose column `axis` equals `value`, with that column removed."""
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet


def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the highest gain ratio (C4.5).

    Returns -1 when no feature yields a positive gain ratio.
    """
    numFeatures = len(dataSet[0]) - 1          # last column is the class label
    baseEntropy = calcShannonEnt(dataSet, -1)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        uniqueVals = {example[i] for example in dataSet}
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet, -1)
        infoGain = baseEntropy - newEntropy    # information gain of splitting on i
        IV = calcShannonEnt(dataSet, i)        # intrinsic value of feature i
        if IV == 0:
            # fix: a single-valued feature used to raise ZeroDivisionError;
            # it cannot split the data, so skip it
            continue
        gainRatio = infoGain / IV
        if gainRatio > bestInfoGain:
            bestInfoGain = gainRatio
            bestFeature = i
    return bestFeature


def majorityCnt(classList):
    """Return the most common class label in classList (majority vote)."""
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def createTree(dataSet, labels):
    """Recursively build a decision tree stored as nested dicts.

    NOTE: mutates `labels` (the chosen feature name is deleted), matching
    the original behaviour.
    """
    classList = [example[-1] for example in dataSet]
    # stop: all samples share one class
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # stop: no features left — majority vote
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    if bestFeat == -1:
        # fix: no usable feature (all single-valued) — majority vote
        return majorityCnt(classList)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}               # tree as nested dicts
    del(labels[bestFeat])
    featValues = [example[bestFeat] for example in dataSet]
    for value in set(featValues):
        subLabels = labels[:]                  # copy so sibling branches don't interfere
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree


if __name__ == '__main__':
    dataSet, labels = createDataSet()
    print(createTree(dataSet, labels))
CART迴歸樹
# -*- coding: utf-8 -*- """ Created on Wed Jan 6 13:54:43 2021 @author: koneko """ import numpy as np def loadDataSet(fileName): dataMat = [] fr = open(fileName) for line in fr.readlines(): curLine = line.strip().split('\t') fitLine = list(map(float, curLine)) #python3 裡面map返回的是一個物件,需要型別轉換 dataMat.append(fitLine) return dataMat def binSplitDataSet(dataSet, feature, value): mat0 = dataSet[np.nonzero(dataSet[:,feature]>value)[0],:] mat1 = dataSet[np.nonzero(dataSet[:,feature]<=value)[0],:] return mat0,mat1 def regLeaf(dataSet): return np.mean(dataSet[:,-1]) def regErr(dataSet): return np.var(dataSet[:,-1])*np.shape(dataSet)[0] def chooseBestSplit(dataSet, leafType=regLeaf,errType=regErr,ops=(1,4)): tolS = ops[0] #使用者允許的誤差下降值 tolN = ops[1] #切分的最少樣本數 #如果所有值都相等則退出,也即是隻有一種類別,不需要進行分割 if len(set(dataSet[:,-1].T.tolist()[0])) == 1: return None, leafType(dataSet) m,n = np.shape(dataSet) S = errType(dataSet) bestS = np.inf bestIndex = 0 bestValue = 0 #遍歷集合中的各種特徵 for featIndex in range(n-1): #遍歷該特徵在集合中出現的各種可能取值 for splitVal in set(dataSet[:,featIndex].T.A.tolist()[0]): #注意這裡是Python3語法上有點不相容 #用該取值進行二元分割 mat0, mat1 = binSplitDataSet(dataSet,featIndex,splitVal) #如果其中一個分支的樣本數少於切分的最少樣本數則不切分 if (np.shape(mat0)[0]<tolN) or (np.shape(mat1)[0]<tolN): continue #計算切分之後的資料集的誤差S newS = errType(mat0) + errType(mat1) #如果誤差比較小則更新最小誤差 if newS < bestS: bestIndex = featIndex bestValue = splitVal bestS = newS #如果誤差減少不大則退出 if (S - bestS) < tolS: return None, leafType(dataSet) mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue) #如果切分出的資料集很小則退出 if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN): return None, leafType(dataSet) return bestIndex, bestValue def createTree(dataSet,leafType=regLeaf,errType=regErr,ops=(1,4)): feat, val = chooseBestSplit(dataSet,leafType,errType,ops) if feat == None: #滿足停止條件時返回 return val retTree = {} retTree['spInd'] = feat retTree['spVal'] = val lSet, rSet = binSplitDataSet(dataSet, feat, val) retTree['left'] = createTree(lSet,leafType,errType,ops) 
retTree['right'] = createTree(rSet,leafType,errType,ops) return retTree myData = loadDataSet('ex00.txt') myMat = np.mat(myData) tree = createTree(myMat) print(tree)
CART分類樹
from math import log
import operator


def createDataSet():
    """Return the watermelon test dataset (class label in last column)."""
    dataSet = [
        # 1
        ['青綠', '蜷縮', '濁響', '清晰', '凹陷', '硬滑', '好瓜'],
        # 2
        ['烏黑', '蜷縮', '沉悶', '清晰', '凹陷', '硬滑', '好瓜'],
        # 3
        ['烏黑', '蜷縮', '濁響', '清晰', '凹陷', '硬滑', '好瓜'],
        # 4
        ['青綠', '蜷縮', '沉悶', '清晰', '凹陷', '硬滑', '好瓜'],
        # 5
        ['淺白', '蜷縮', '濁響', '清晰', '凹陷', '硬滑', '好瓜'],
        # 6
        ['青綠', '稍蜷', '濁響', '清晰', '稍凹', '軟粘', '好瓜'],
        # 7
        ['烏黑', '稍蜷', '濁響', '稍糊', '稍凹', '軟粘', '好瓜'],
        # 8
        ['烏黑', '稍蜷', '濁響', '清晰', '稍凹', '硬滑', '好瓜'],
        # ----------------------------------------------------
        # 9
        ['烏黑', '稍蜷', '沉悶', '稍糊', '稍凹', '硬滑', '壞瓜'],
        # 10
        ['青綠', '硬挺', '清脆', '清晰', '平坦', '軟粘', '壞瓜'],
        # 11
        ['淺白', '硬挺', '清脆', '模糊', '平坦', '硬滑', '壞瓜'],
        # 12
        ['淺白', '蜷縮', '濁響', '模糊', '平坦', '軟粘', '壞瓜'],
        # 13
        ['青綠', '稍蜷', '濁響', '稍糊', '凹陷', '硬滑', '壞瓜'],
        # 14
        ['淺白', '稍蜷', '沉悶', '稍糊', '凹陷', '硬滑', '壞瓜'],
        # 15
        ['烏黑', '稍蜷', '濁響', '清晰', '稍凹', '軟粘', '壞瓜'],
        # 16
        ['淺白', '蜷縮', '濁響', '模糊', '平坦', '硬滑', '壞瓜'],
        # 17
        ['青綠', '蜷縮', '沉悶', '稍糊', '稍凹', '硬滑', '壞瓜']
    ]
    # feature names: colour, root, knock sound, texture, navel, touch
    labels = ['色澤', '根蒂', '敲擊', '紋理', '臍部', '觸感']
    return dataSet, labels


def calcGini(dataSet):
    """Return the Gini impurity of the class column (last column)."""
    numEntries = len(dataSet)
    labelCounts = {}
    # count occurrences of each class label
    for featVec in dataSet:
        currentLabel = featVec[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    Gini = 1.0
    for key in labelCounts:
        p = float(labelCounts[key]) / numEntries
        Gini -= p * p
    return Gini


def createDataSet1():
    """Return a tiny gender toy dataset (hair length, voice)."""
    dataSet = [['長', '粗', '男'],
               ['短', '粗', '男'],
               ['短', '粗', '男'],
               ['長', '細', '女'],
               ['短', '細', '女'],
               ['短', '粗', '女'],
               ['長', '粗', '女'],
               ['長', '粗', '女']]
    labels = ['頭髮', '聲音']   # two features
    return dataSet, labels


def createDataSet2():
    """Return the loan-approval dataset and its feature names."""
    dataSet = [['青年', '否', '否', '一般', '不同意'],
               ['青年', '否', '否', '好', '不同意'],
               ['青年', '是', '否', '好', '同意'],
               ['青年', '是', '是', '一般', '同意'],
               ['青年', '否', '否', '一般', '不同意'],
               ['中年', '否', '否', '一般', '不同意'],
               ['中年', '否', '否', '好', '不同意'],
               ['中年', '是', '是', '好', '同意'],
               ['中年', '否', '是', '非常好', '同意'],
               ['中年', '否', '是', '非常好', '同意'],
               ['老年', '否', '是', '非常好', '同意'],
               ['老年', '否', '是', '好', '同意'],
               ['老年', '是', '否', '好', '同意'],
               ['老年', '是', '否', '非常好', '同意'],
               ['老年', '否', '否', '一般', '不同意']]
    labels = ['年齡', '有工作', '有房子', '信貸情況']
    return dataSet, labels


def binSplitDataSet(dataSet, index, value):
    """Split rows on whether column `index` equals `value`.

    Returns (matching rows, non-matching rows), each with that column removed.
    """
    set1 = []
    set2 = []
    for featVec in dataSet:
        reducedFeatVec = featVec[:index]
        reducedFeatVec.extend(featVec[index + 1:])
        if featVec[index] == value:
            set1.append(reducedFeatVec)
        else:
            set2.append(reducedFeatVec)
    return set1, set2


def chooseBestFeatureToSplit(dataSet):
    """Return (feature index, value) of the binary split minimising weighted Gini."""
    numFeatures = len(dataSet[0]) - 1
    nD = len(dataSet)
    bestGini_feat = 100
    bestFeature = -1
    bestVal = None   # fix: was unbound when no candidate beat the sentinel
    for feat in range(numFeatures):
        featvals = {example[feat] for example in dataSet}
        for val in featvals:
            set0, set1 = binSplitDataSet(dataSet, feat, val)
            # weighted Gini impurity of the two branches
            newGini_feat = (len(set0) / float(nD)) * calcGini(set0)
            newGini_feat += (len(set1) / float(nD)) * calcGini(set1)
            if newGini_feat < bestGini_feat:
                bestGini_feat = newGini_feat
                bestFeature = feat
                bestVal = val
    return bestFeature, bestVal


def majorityCnt(classList):
    """Return the most frequent class label (majority vote)."""
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def createTree(dataSet, labels):
    """Recursively build a CART classification tree as nested dicts.

    The left child key is the chosen value; the right child key is a tuple of
    the remaining values. NOTE: mutates `labels` like the original.
    """
    classList = [a[-1] for a in dataSet]
    # stop: only one class left
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # stop: no features left — majority vote
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat, bestVal = chooseBestFeatureToSplit(dataSet)
    mat0, mat1 = binSplitDataSet(dataSet, bestFeat, bestVal)
    if not mat0 or not mat1:
        # fix: a degenerate split (single-valued feature, or no split found)
        # used to crash the recursion on an empty branch — majority vote instead
        return majorityCnt(classList)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}              # tree as nested dicts
    del(labels[bestFeat])
    left = bestVal
    right = set([a[bestFeat] for a in dataSet])
    right.remove(bestVal)
    right = tuple(right)                      # remaining values grouped on the right
    # fix: each branch gets its OWN copy of labels — the original passed one
    # shared subLabels list, which the left recursion mutated (del) before the
    # right branch ran, misaligning label indices for the right subtree
    myTree[bestFeatLabel][left] = createTree(mat0, labels[:])
    myTree[bestFeatLabel][right] = createTree(mat1, labels[:])
    return myTree


if __name__ == '__main__':
    # fix: demo run moved under a main guard; debug print(right) removed
    dataSet, labels = createDataSet2()
    myTree = createTree(dataSet, labels)
    print(myTree)