Machine Learning in Action Study Notes 2: Building a Decision Tree
阿新 · Published 2018-11-24
Decision Trees
A decision tree is a tree built from a sequence of strategic choices.
The machine learning technique that produces a decision tree from data is called decision tree learning.
Data form: each step of the decision process is a yes/no question.
Applicable data types: numeric and nominal.
Nominal data is simply discrete data: the variable takes its value from a finite set of possibilities.
Information Gain
Information entropy:
Entropy measures the disorder of information; the more ordered the information, the lower its entropy.
Information gain:
The larger the information gain, the better the split. We use it to find the best feature for partitioning the dataset.
The guiding principle when splitting a dataset: make the disordered data more ordered.
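For reference, the entropy that calcShannonEnt computes below is the Shannon entropy, and the gain that chooseBestFeatureToSplit maximizes is the drop in entropy after a split (the notation here is the standard one, not from the original post):

H(D) = -\sum_{k=1}^{K} p_k \log_2 p_k

\mathrm{Gain}(D, A) = H(D) - \sum_{v \in \mathrm{values}(A)} \frac{|D_v|}{|D|} H(D_v)

where p_k is the fraction of samples in D belonging to class k, and D_v is the subset of D in which feature A takes value v.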
from math import log
import operator

def createDataSet():
    # Five training examples; each row is one sample.
    # Column 1: can it survive without surfacing? (1 = yes)
    # Column 2: does it have flippers? (1 = yes)
    # Column 3: class label 'yes'/'no' (is it a fish?)
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']  # feature names, in column order
    return dataSet, labels  # the dataset and its feature labels
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:  # count how often each class label occurs
        currentLabel = featVec[-1]  # the label is in the last column
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)  # Shannon entropy, log base 2
    return shannonEnt
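A quick check in the interpreter: the sample data has two 'yes' and three 'no' labels, and the value below follows directly from the entropy formula above.

>>> myDat, labels = createDataSet()
>>> calcShannonEnt(myDat)
0.9709505944546686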
def splitDataSet(dataSet, axis, value):
    # Return the rows whose column `axis` equals `value`,
    # with that column removed from each row.
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]  # chop out the axis used for splitting
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet
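For example, splitting the sample data on feature 0:

>>> splitDataSet(myDat, 0, 1)
[[1, 'yes'], [1, 'yes'], [0, 'no']]
>>> splitDataSet(myDat, 0, 0)
[[1, 'no'], [1, 'no']]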
def chooseBestFeatureToSplit(dataSet):
    # Number of features per row (the last column is the class label).
    numFeatures = len(dataSet[0]) - 1
    # Entropy of the whole dataset (its overall disorder).
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0; bestFeature = -1
    for i in range(numFeatures):  # iterate over all the features
        # Collect every value this feature takes in the dataset...
        featList = [example[i] for example in dataSet]
        # ...and deduplicate them.
        uniqueVals = set(featList)
        newEntropy = 0.0  # entropy after splitting on feature i
        for value in uniqueVals:
            # Subset of rows where feature i equals this value.
            subDataSet = splitDataSet(dataSet, i, value)
            # Weight each subset by its probability...
            prob = len(subDataSet) / float(len(dataSet))
            # ...and accumulate the weighted entropy.
            newEntropy += prob * calcShannonEnt(subDataSet)
        # Information gain: the reduction in entropy achieved by this split.
        infoGain = baseEntropy - newEntropy
        # Keep the feature with the largest gain; the split that leaves the
        # data most ordered becomes the root of the (sub)tree.
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature  # returns an integer index
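On the sample data this picks feature 0 ('no surfacing'), whose gain (about 0.420) beats feature 1's (about 0.171):

>>> chooseBestFeatureToSplit(myDat)
0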
def majorityCnt(classList):
    # Return the class label that occurs most often in classList.
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote] = 0
        classCount[vote] += 1
    # Sort by count, descending (reverse=True).
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
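createTree below falls back on majorityCnt when the features are exhausted but the labels still disagree, e.g.:

>>> majorityCnt(['yes', 'no', 'no'])
'no'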
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]  # class labels of all rows
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # stop splitting: all labels are identical
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)  # no features left: return the majority label
    bestFeat = chooseBestFeatureToSplit(dataSet)  # pick the best feature
    bestFeatLabel = labels[bestFeat]  # its human-readable name
    myTree = {bestFeatLabel: {}}  # grow the tree from this feature
    del labels[bestFeat]  # this feature is now used up
    featValues = [example[bestFeat] for example in dataSet]  # all values of the best feature
    uniqueVals = set(featValues)  # deduplicate
    for value in uniqueVals:  # recurse into each branch
        subLabels = labels[:]  # copy so recursion doesn't clobber this level's labels
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
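On the sample data (note that createTree deletes entries from labels, so pass a copy if you still need the original list):

>>> myDat, labels = createDataSet()
>>> myTree = createTree(myDat, labels[:])
>>> myTree
{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}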
def classify(inputTree, featLabels, testVec):
    '''
    inputTree: the decision tree model
    featLabels: the feature names, in column order
    testVec: the feature vector to classify
    Returns: classLabel, the predicted class; map it back to a
    feature label if you need the human-readable name
    '''
    firstStr = list(inputTree)[0]  # key of the tree's root node
    secondDict = inputTree[firstStr]  # the subtree under the root
    # Look up which column of testVec the root's feature lives in.
    featIndex = featLabels.index(firstStr)
    key = testVec[featIndex]
    valueOfFeat = secondDict[key]
    if isinstance(valueOfFeat, dict):  # still a subtree: keep descending
        classLabel = classify(valueOfFeat, featLabels, testVec)
    else:  # a leaf: this is the prediction
        classLabel = valueOfFeat
    return classLabel
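Classifying two test vectors against the tree built above (featLabels must be the original, unmutated label list):

>>> classify(myTree, labels, [1, 0])
'no'
>>> classify(myTree, labels, [1, 1])
'yes'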
def storeTree(inputTree, filename):
    import pickle
    fw = open(filename, 'wb')
    pickle.dump(inputTree, fw)
    fw.close()

def grabTree(filename):
    import pickle
    fr = open(filename, 'rb')
    return pickle.load(fr)
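Round-tripping the tree through disk (the filename is arbitrary):

>>> storeTree(myTree, 'classifierStorage.txt')
>>> grabTree('classifierStorage.txt')
{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}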
Summary:
1. The difference between extend and append
>>> A = ['q', 'w', 'e', 'r']
>>> A.extend(['t', 'y'])
>>> A
['q', 'w', 'e', 'r', 't', 'y']
>>> len(A)
6
>>> B = ['q', 'w', 'e', 'r']
>>> B.append(['t', 'y'])
>>> B
['q', 'w', 'e', 'r', ['t', 'y']]
>>> len(B)
5
extend splices its argument's elements into the list one by one, while append adds the whole argument as a single element, which is why len(A) is 6 but len(B) is 5.
Drawing tree nodes with text annotations
import matplotlib.pyplot as plt

decisionNode = dict(boxstyle="sawtooth", fc='0.8')  # style for internal (decision) nodes
leafNode = dict(boxstyle="round4", fc='0.8')        # style for leaf nodes
arrow_args = dict(arrowstyle="<-")                  # arrow style for edges

def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    # Draw one node, with an arrow from its parent.
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction',
                            xytext=centerPt, textcoords='axes fraction', va='center',
                            ha='center', bbox=nodeType, arrowprops=arrow_args)

def plotMidText(cntrPt, parentPt, txtString):
    # Place the edge label midway between parent and child.
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString)
def plotTree(myTree, parentPt, nodeTxt):
    numLeafs = getNumLeafs(myTree)  # leaf count determines the tree's width
    depth = getTreeDepth(myTree)    # number of levels (the y step uses plotTree.totalD)
    firstStr = next(iter(myTree))
    # Center this node above its leaves.
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW, plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)  # label the incoming edge
    plotNode(firstStr, cntrPt, parentPt, decisionNode)  # draw the decision node
    secondDict = myTree[firstStr]  # descend into the children
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD  # move down one level
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':  # a dict means another subtree
            plotTree(secondDict[key], cntrPt, str(key))  # recurse to draw it
        else:  # otherwise it's a leaf: draw it and label its edge
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD  # move back up after recursion
def createPlot(inTree):
    fig = plt.figure(1, facecolor='white')  # create the figure
    fig.clf()  # clear it
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)  # hide the x and y axes
    plotTree.totalW = float(getNumLeafs(inTree))  # total number of leaves
    plotTree.totalD = float(getTreeDepth(inTree))  # total depth
    plotTree.xOff = -0.5 / plotTree.totalW; plotTree.yOff = 1.0  # initial offsets
    plotTree(inTree, (0.5, 1.0), '')  # draw the tree from the root
    plt.show()  # display the result
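plotTree and createPlot call getNumLeafs and getTreeDepth, which this post doesn't define. Here is a minimal sketch consistent with how they are used above: a non-dict value is a leaf, and depth is the longest chain of nested dicts.

def getNumLeafs(myTree):
    # Count leaf nodes: anything that isn't a nested dict is a leaf.
    numLeafs = 0
    secondDict = myTree[next(iter(myTree))]
    for key in secondDict.keys():
        if isinstance(secondDict[key], dict):
            numLeafs += getNumLeafs(secondDict[key])
        else:
            numLeafs += 1
    return numLeafs

def getTreeDepth(myTree):
    # Depth = 1 + depth of the deepest subtree under the root.
    maxDepth = 0
    secondDict = myTree[next(iter(myTree))]
    for key in secondDict.keys():
        if isinstance(secondDict[key], dict):
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:
            thisDepth = 1
        maxDepth = max(maxDepth, thisDepth)
    return maxDepth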
Storing the decision tree with the pickle module
def storeTree(inputTree, filename):
    import pickle
    fw = open(filename, 'wb')  # pickle needs binary mode in Python 3
    pickle.dump(inputTree, fw)
    fw.close()

def grabTree(filename):
    import pickle
    fr = open(filename, 'rb')
    return pickle.load(fr)
# Example: build and plot a tree from the contact-lenses dataset.
fr = open('lenses.txt')
lenses = [inst.strip().split('\t') for inst in fr.readlines()]
lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
lensesTree = createTree(lenses, lensesLabels)
print(lensesTree)
createPlot(lensesTree)