機器學習實戰決策樹演算法筆記

trees.py 原始碼部分：

from math import log
import operator
def calcShannonEnt(dataSet):
    """Return the Shannon entropy (log base 2) of the class labels in dataSet.

    Args:
        dataSet: sequence of records; the last element of each record is
            used as its class label.

    Returns:
        The entropy as a float. An empty dataSet yields 0.0 because the
        summation loop never runs.
    """
    numEntries = len(dataSet)
    # Count how many records carry each class label.
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for count in labelCounts.values():
        # float() keeps the division exact under Python 2 integer semantics.
        prob = float(count) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
def creataDataSet():
    """Return the small sample data set and its feature labels.

    Returns:
        A ``(dataSet, labels)`` tuple. Each record in ``dataSet`` holds two
        feature values followed by a 'yes'/'no' class label; ``labels`` names
        the two feature columns. Fresh lists are built on every call.
    """
    dataSet = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels
def splitDataSet(dataSet, axis, value):
    """Return the records whose column ``axis`` equals ``value``, minus that column.

    Args:
        dataSet: sequence of records (lists).
        axis: index of the feature column to test and then drop.
        value: feature value a record must have at ``axis`` to be kept.

    Returns:
        A new list of new records; neither ``dataSet`` nor its records are
        modified.
    """
    # Slicing on both sides of ``axis`` builds a fresh record without the
    # tested column, so the input rows are left untouched.
    return [
        featVec[:axis] + featVec[axis + 1:]
        for featVec in dataSet
        if featVec[axis] == value
    ]
def chooesBestFeatureToSplit(dataSet):
    """Return the index of the feature whose split gives the largest information gain.

    Args:
        dataSet: non-empty sequence of records; every column except the last
            is treated as a feature, the last as the class label.

    Returns:
        The column index of the best feature, or -1 when no feature yields
        a strictly positive information gain.
    """
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)
    totalCount = float(len(dataSet))  # loop-invariant, hoisted
    bestInFoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        uniqueVals = set(example[i] for example in dataSet)
        # Entropy after the split: subset entropies weighted by subset size.
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / totalCount
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInFoGain:
            bestInFoGain = infoGain
            bestFeature = i
    return bestFeature
def majorityCnt(classList):
    """Return the class label that occurs most often in classList.

    Args:
        classList: non-empty sequence of hashable class labels.

    Returns:
        The most frequent label. On a tie, the winner is the first
        maximal entry in the dictionary's iteration order.

    Raises:
        ValueError: if classList is empty.
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    return max(classCount.items(), key=operator.itemgetter(1))[0]
def createTree(dataSet, labels):
    """Recursively build a decision tree as nested dictionaries.

    Args:
        dataSet: non-empty sequence of records; the last element of each
            record is its class label.
        labels: names of the feature columns, in column order. This list
            is not modified.

    Returns:
        Either a class label (leaf) or a dict of the form
        ``{featureLabel: {featureValue: subtree_or_label, ...}}``.
    """
    classList = [example[-1] for example in dataSet]
    # Every record has the same class: this branch is a leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the class column is left: fall back to a majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooesBestFeatureToSplit(dataSet)
    # -1 means no feature gives a positive information gain. Using it as an
    # index would select the class column, so stop with a majority vote.
    if bestFeat < 0:
        return majorityCnt(classList)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    # Build the reduced label list without mutating the caller's list.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    uniqueVals = set(example[bestFeat] for example in dataSet)
    for value in uniqueVals:
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
def classify(inputTree, featLabels, testVec):
    """Classify testVec by walking the decision tree inputTree.

    Args:
        inputTree: non-empty dict of the form
            ``{featureLabel: {featureValue: subtree_or_label, ...}}``.
        featLabels: list of feature names; the position of a name gives the
            index of that feature in ``testVec``.
        testVec: sequence of feature values to classify.

    Returns:
        The class label stored at the leaf that is reached.

    Raises:
        ValueError: if the tree's feature label is not in ``featLabels``
            (raised by ``list.index``).
        KeyError: if the tree has no branch for the feature value found
            in ``testVec``.
    """
    # next(iter(...)) fetches the single root key on both Python 2 and 3;
    # dict.keys() is not indexable on Python 3.
    firstStr = next(iter(inputTree))
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    featValue = testVec[featIndex]
    if featValue not in secondDict:
        raise KeyError(
            "feature %r has no branch for value %r" % (firstStr, featValue))
    subTree = secondDict[featValue]
    if isinstance(subTree, dict):
        return classify(subTree, featLabels, testVec)
    return subTree

treePlotter.py 原始碼部分：

import matplotlib.pyplot as plt
# Box style used for internal (decision) nodes.
decisionNode = {"boxstyle": "sawtooth", "fc": "0.2"}
# Box style used for leaf nodes.
leafNode = {"boxstyle": "round4", "fc": "0.8"}
# Arrow properties passed as ``arrowprops`` when annotating a node.
arrow_args = {"arrowstyle": "->"}
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    """Draw one boxed text node with an arrow on the axes in ``createPlot.ax1``.

    Args:
        nodeTxt: text shown inside the box.
        centerPt: (x, y) position of the text, in axes-fraction coordinates.
        parentPt: (x, y) of the annotated point, in axes-fraction coordinates.
        nodeType: dict of box properties passed as ``bbox``.
    """
    createPlot.ax1.annotate(
        nodeTxt,
        xy=parentPt,
        xycoords='axes fraction',
        xytext=centerPt,
        textcoords='axes fraction',
        va="center",
        ha="center",
        bbox=nodeType,
        arrowprops=arrow_args,
    )
def createPlot():
    """Show a demo figure with one decision node and one leaf node.

    Clears figure 1, stores a frameless subplot on ``createPlot.ax1`` (the
    axes that ``plotNode`` draws on), plots the two sample nodes and
    displays the figure.
    """
    fig = plt.figure(1, facecolor='blue')
    fig.clf()
    # The axes are kept as a function attribute so plotNode can reach them.
    createPlot.ax1 = plt.subplot(111, frameon=False)
    plotNode('a decisionNode', (0.5, 0.1), (0.1, 0.5), decisionNode)
    plotNode('a leafNode', (0.8, 0.1), (0.3, 0.5), leafNode)
    plt.show()
def getNumLeafs(myTree):
    """Return the number of leaf nodes in the nested-dict tree myTree.

    Args:
        myTree: non-empty dict of the form
            ``{featureLabel: {featureValue: subtree_or_label, ...}}``.
            Any child that is not a dict counts as one leaf.
    """
    # next(iter(...)) fetches the root key on both Python 2 and 3;
    # dict.keys() is not indexable on Python 3.
    firstStr = next(iter(myTree))
    numLeafs = 0
    for child in myTree[firstStr].values():
        if isinstance(child, dict):
            numLeafs += getNumLeafs(child)
        else:
            numLeafs += 1
    return numLeafs
def getTreeDepth(myTree):
    """Return the depth of the nested-dict tree myTree.

    The depth is the largest number of decision nodes on any path from the
    root to a leaf.

    Args:
        myTree: non-empty dict of the form
            ``{featureLabel: {featureValue: subtree_or_label, ...}}``.
    """
    # next(iter(...)) fetches the root key on both Python 2 and 3;
    # dict.keys() is not indexable on Python 3.
    firstStr = next(iter(myTree))
    maxDepth = 0
    for child in myTree[firstStr].values():
        if isinstance(child, dict):
            thisDepth = 1 + getTreeDepth(child)
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth
def retrieveTree(i):
    """Return one of two hard-coded sample trees, selected by index i.

    Args:
        i: 0 for the two-level tree, 1 for the three-level tree.

    Returns:
        A freshly built nested-dict tree.

    Raises:
        IndexError: if i is outside the list of sample trees.
    """
    listOfTree = [
        {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}},
        {'no surfacing': {0: 'no', 1: {'flippers': {
            0: {'head': {0: 'no', 1: 'yes'}}, 1: 'no'}}}},
    ]
    return listOfTree[i]
def plotMidText(cntrPt, parentPt, txtString):
    """Write txtString at the midpoint between cntrPt and parentPt.

    Args:
        cntrPt: (x, y) of the child node.
        parentPt: (x, y) of the parent node.
        txtString: text to draw on the axes in ``createPlot.ax1``.
    """
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString)
def plotTree(myTree, parentPt, nodeTxt):
    """Recursively draw myTree below parentPt.

    The drawing cursor and scale live in function attributes that the
    caller must set before the first call: ``plotTree.xOff``,
    ``plotTree.yOff``, ``plotTree.totalW`` and ``plotTree.totalD``.

    Args:
        myTree: non-empty dict of the form
            ``{featureLabel: {featureValue: subtree_or_label, ...}}``.
        parentPt: (x, y) of the parent node.
        nodeTxt: text written on the edge between the parent and this node.
    """
    numLeafs = getNumLeafs(myTree)
    # next(iter(...)) fetches the root key on both Python 2 and 3.
    # The unused local ``depth`` from the original listing was removed.
    firstStr = next(iter(myTree))
    # The x position is offset from the cursor by a share of this
    # subtree's leaf count.
    cntrPt = (
        plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW,
        plotTree.yOff,
    )
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    # Step down one level before drawing the children.
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD
    for key in secondDict:
        child = secondDict[key]
        if isinstance(child, dict):
            plotTree(child, cntrPt, str(key))
        else:
            # Each leaf advances the horizontal cursor by one slot.
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            plotNode(child, (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    # Restore the vertical cursor for the caller's remaining siblings.
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD
def createPlot2(inTree):
    """Draw the decision tree inTree in matplotlib figure 1 and show it.

    Args:
        inTree: non-empty dict of the form
            ``{featureLabel: {featureValue: subtree_or_label, ...}}``.
    """
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])  # hide the axis ticks
    # plotNode and plotMidText draw on createPlot.ax1, so the axes are
    # stored there and not on this function.
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    # Leaf count and depth, stored as the scale factors plotTree divides by.
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    # Start half a leaf slot left of 0 so each 1/totalW step lands on the
    # centre of a slot.
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()

筆記部分：

注：一般情況下，如果出現錯誤，多半是 Python 縮排的問題：Tab 鍵和空格鍵不能混用。我用的是 Notepad++ 文字工具，沒有自動縮排功能，
相當麻煩……也沒有特意去下載別的編輯器，就隨手寫著，結果發現縮排很麻煩，而且函式、for、if 的範圍全靠縮排格數來區分，
如果縮排格數錯了，程式就直接出錯。真懷念 C/Java 的大括號……(*^__^*) 嘻嘻…… 因為這個問題出過很大的錯誤，導致我現在對 Python
有點煩……不過 Python 確實比 Java/C 方便多了，就是這個縮排問題……改天換個編輯器試試，看能不能解決，要是不能解決我就放棄了。

1:夏農熵：集合資訊的度量方式稱為夏農熵或者簡稱熵，這個名字來源於資訊理論之父克勞德·夏農。熵定義為資訊的期望值。
如果待分類的事物可能劃分在多個分類中，則類別 $x_i$ 的資訊定義為 $l(x_i) = -\log_2 p(x_i)$，其中 $p(x_i)$ 表示選擇該類別的概率。期望的計算就是對「值 × 概率」求和，
因此熵為 $H = -\sum_{i=1}^{n} p(x_i)\log_2 p(x_i)$。我們計算夏農熵也遵循著這種方式，下面是計算給定資料集的夏農熵：
from math import log /*匯入math模組中的log函式，因為計算X的資訊時需要用到log函式
def calcShannonEnt(dataSet) /* 輸入資料集
numEntries=len(dataSet) /*統計資料集中例項的總數
labelCounts={} /*定義一個字典
for featVec in dataSet: /*依照每行遍歷資料集
currentLabel=featVec[-1] /*取每行中最後一個元素
if currentLabel not in labelCounts.keys(): /*如果最後一個元素不是鍵值
labelCounts[currentLabel]=0 /*設定currentLabel為鍵，值為0
labelCounts[currentLabel] +=1 /*由於字典中出現了一個currentLabel，就是上面被置為0的，所以變為1，
/*上面一條語句是新建，若沒執行if則說明已有一條currentLabel存在，即+1
shannonEnt = 0.0
for key in labelCounts:
prob = float(labelCounts[key])/numEntries /*求概率，出現次數/總數
shannonEnt -=prob*log(prob,2) /*求對數，累加的結果就是期望，期望就是對概率和資訊量乘積的求和
return shannonEnt /*返回資料集的熵
第一個for迴圈本質是記錄鍵出現的次數，若沒出現則新建，新建後置次數為0 ，然後通過語句+1，這是對+=1這條語句的利用
2:根據資料集的特徵進行劃分。
def splitDataSet(dataSet,axis,value): /*輸入樣本集， axis代表樣本集中第i行的第axis號元素，表示特徵屬性
value代表想要測量第axis號元素的值表示特徵屬性的值
retDataSet=[] /*為了儲存分類完畢的集合
for featVec in dataSet:
if featVec[axis] == value: /*若第 axis號元素等於初始給定的value值則記錄下來
reducedFeatVec=featVec[:axis]
reducedFeatVec.extend(featVec[axis+1:]) /*以上兩條語句是把featVec陣列中除了特徵值的元素給保留下來
例如 [1,0,yes]，若以第1號元素（值為0）作為特徵值，則最後儲存的是[1,yes]
retDataSet.append(reducedFeatVec)
return retDataSet
3:選擇最好的資料集劃分方式。我們是按照獲取最大資訊增益來劃分資料集，也就是使劃分後的熵最小。初始熵是dataSet的熵，
劃分後的熵則是：每個子集佔原始資料的比例乘以該子集的熵，再對所有子集求和
注意！！！注意區分【特徵&特徵值】
def chooesBestFeatureToSplit(dataSet):
numFeatures=len(dataSet[0])-1 /* 特徵值的數量減去yes or no
baseEntropy=calcShannonEnt(dataSet) /* 先以未劃分時資料集的熵作為基準熵
bestInFoGain=0.0;bestFeature=-1 /* 預設最好的特徵為-1
for i in range(numFeatures): /*在特徵數量內遍歷所有的特徵 i代表的是axis
featList=[example[i] for example in dataSet] /* featList陣列存放的是特徵值，共存放len(dataSet)個
uniqueVals=set(featList) /* set集合中存放的是不同的在本列中是[0,1]
所代表的的是特徵值也就是value
newEntropy=0.0
for value in uniqueVals: /* 進行遍歷呼叫splitDataSet()進行劃分
subDataSet=splitDataSet(dataSet,i,value)
prob=len(subDataSet)/float(len(dataSet)) /*劃分後在原集合中佔的比例
newEntropy +=prob*calcShannonEnt(subDataSet) /* 比例乘以劃分後集合的熵相加表示兩個比例相加為1
然後各自乘以各自劃分集合的熵與原資料的熵比較
infoGain=baseEntropy-newEntropy /* 表示的是未劃分集合(原集合dataSet)的熵減去劃分好各自熵的和
if(infoGain>bestInFoGain): /* 與0相比若大於則表示熵減小了也就是這種劃分方式是有利的
bestInFoGain=infoGain /*用這個劃分與原資料的熵的差值代替0，並設定該劃分為當前最優劃分
在隨後的迴圈中直接計算下次劃分與當前最優劃分的熵的大小
bestFeature=i
return bestFeature /* 輸出的是最好的劃分中的特徵
4:通過多數表決的方式確定葉子節點的分類
def majorityCnt(classList):
classCount={}
for vote in classlist:
if vote not in classCount.keys():
classCount[vote]=0
classCount[vote]+=1
sortedClassCount=sorted(classCount.iteritems(),key=operator.itemgetter(1),reverse=True)
/* iteritems 返回字典中 (鍵, 值) 對的迭代器（僅 Python 2 可用；Python 3 請改用 items()）

機器學習實戰決策樹演算法筆記

機器學習實戰決策樹演算法筆記

機器學習實戰——決策樹Python實現問題記錄

機器學習_8.決策樹演算法

機器學習實戰-決策樹-畫圖

機器學習實戰--決策樹（一）

機器學習實戰--決策樹

機器學習實戰決策樹（一）——資訊增益與劃分資料集

【機器學習】決策樹演算法（二）— 程式碼實現

機器學習之決策樹演算法詳解

機器學習實戰——決策樹

機器學習實戰-決策樹

【機器學習】決策樹演算法的基本原理

機器學習之決策樹演算法（一）

機器學習之決策樹演算法（1）

機器學習之決策樹演算法python實現

機器學習之決策樹演算法

【機器學習】決策樹（基於ID3,C4.5,CART分類迴歸樹演算法）—— python3 實現方案

[機器學習]ID3決策樹詳細計算流程周志華機器學習筆記原創Excel手算方法

機器學習之決策樹機器學習之K-近鄰演算法

【機器學習實戰系列】讀書筆記之AdaBoost演算法公式推導和例子講解（一）

機器學習實戰 決策樹 演算法 筆記

相關推薦

機器學習實戰決策樹演算法筆記