決策樹(decision tree)
阿新 • • 發佈:2018-01-18
(garbled line from page extraction — appears to be the blog platform's tag list, original text unrecoverable)
代碼還好懂,但是後面選擇更好的劃分數據集的方法,有點不知道為什麽那樣選。
還要好好理解推導。
from math import log


def calcShannonEnt(dataSet):
    """Return the Shannon entropy of *dataSet*.

    Each sample is a list whose LAST element is the class label; the
    entropy is computed over the label distribution. An empty data set
    yields 0.0 (the label-count dict stays empty, so the sum is never
    entered and no division by zero occurs).
    """
    numEntries = len(dataSet)
    labelCount = {}
    for featVector in dataSet:
        currentLabel = featVector[-1]  # class label is the last column
        labelCount[currentLabel] = labelCount.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for key in labelCount:
        prob = float(labelCount[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)  # H = -sum(p * log2(p))
    return shannonEnt


def createDataSet():
    """Return the toy training set and its feature-name labels.

    NOTE: the original used curly quotes (‘yes‘), which is a Python
    syntax error — replaced with plain ASCII quotes.
    """
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels


def splitDataSet(dataSet, axis, value):
    """Return the samples whose feature at *axis* equals *value*,
    with that feature column removed from each returned sample.

    BUG FIX: the original compared featVec[0] instead of featVec[axis],
    so the axis argument was ignored when filtering (the demo only ever
    passed axis=0, which masked the bug).
    """
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            # Rebuild the sample without the column used for the split.
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet


def main():
    dataSet, labels = createDataSet()
    # shannonEnt = calcShannonEnt(dataSet)  # Shannon entropy
    # print(shannonEnt)
    print(splitDataSet(dataSet, 0, 1))
    print(splitDataSet(dataSet, 0, 0))


if __name__ == '__main__':
    main()
append和extend區別:
# list.append vs. list.extend:
# a.append(b) inserts b itself as ONE nested element at the end of a,
# while c.extend(b) splices b's items individually onto the end of c.
a = [1, 2, 3]
b = [4, 5, 6]
c = [1, 2, 3]
a.append(b)   # -> [1, 2, 3, [4, 5, 6]]
c.extend(b)   # -> [1, 2, 3, 4, 5, 6]
print(a)
print(c)
[1, 2, 3, [4, 5, 6]]
[1, 2, 3, 4, 5, 6]
決策樹(decision tree)