機器學習實戰-3決策樹
阿新 • 發佈:2019-02-06
劃分依據
決策樹劃分的主要依據為資訊增益:選取資訊增益最大的特徵作為分類依據。流程
建立資料集 –> 計算各特徵的資訊增益,取最大者作為結點並劃分子資料集 –> 遞迴構建。程式碼
from math import log
import operator
'''
機器學習實戰-第三章(決策樹)
'''
# Build the toy data set used throughout this chapter.
def createDataSet():
    """Return a small sample data set and its feature labels.

    Each row is [feature0, feature1, class]; `labels` names the two
    feature columns.
    """
    samples = [
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    feature_names = ['good', 'bad']
    return samples, feature_names
# Compute the Shannon entropy of the class column of a data set.
def calcShannonEnt(dataset):
    """Return the Shannon entropy (base 2) of the last column of *dataset*.

    The last element of every row is treated as the class label.
    """
    total = len(dataset)
    # Count occurrences of each class label.
    counts = {}
    for row in dataset:
        label = row[-1]
        counts[label] = counts.get(label, 0) + 1
    # H = -sum(p * log2(p)) over the label distribution.
    entropy = 0.0
    for count in counts.values():
        p = count / total
        entropy -= p * log(p, 2)
    return entropy
# Select the rows whose feature at `axis` equals `value`, dropping that column.
def splitDataSet(dataSet, axis, value):
    """Return the rows of *dataSet* whose column *axis* equals *value*,
    with that column removed from each returned row.

    Builds new row lists, so the input rows are never mutated.
    """
    return [
        row[:axis] + row[axis + 1:]
        for row in dataSet
        if row[axis] == value
    ]
# 補充:Python 中可變物件(如列表)按引用共享,修改前需先建立副本;不可變物件(如元組、字串)無法就地修改
# append 與 extend 的區別:append 把參數整體作為單一元素加入;extend 則把參數展開,逐項加入原列表
# Pick the feature index whose split yields the largest information gain.
def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature that best splits *dataSet*.

    "Best" means the largest information gain: the entropy of the whole
    data set minus the weighted entropy of the subsets produced by
    splitting on that feature.  Returns -1 when no split reduces entropy.
    """
    numFeatures = len(dataSet[0]) - 1  # last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeatureIndex = -1
    for i in range(numFeatures):
        # Distinct values of feature i (BUG FIX: original always read column 0).
        values = {row[i] for row in dataSet}
        # Weighted entropy of the subsets after splitting on feature i.
        newEntropy = 0.0
        for value in values:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        # BUG FIX: the original never subtracted from the base entropy and
        # compared/assigned the gain against bestFeatureIndex instead of
        # bestInfoGain, so the returned index was essentially meaningless.
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeatureIndex = i
    return bestFeatureIndex
# Recursively build the decision tree as nested dicts.
def createTree(dataSet, labels):
    """Build a decision tree for *dataSet*.

    dataSet -- rows of [feature..., class]
    labels  -- feature names, parallel to the feature columns
    Returns a class label (leaf) or a nested dict of the form
    {feature_label: {feature_value: subtree, ...}}.
    """
    classList = [row[-1] for row in dataSet]
    # All rows share one class: nothing left to split.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the class column remains: fall back to a majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeatureIndex = chooseBestFeatureToSplit(dataSet)
    bestFeatureLabel = labels[bestFeatureIndex]
    myTree = {bestFeatureLabel: {}}
    # Distinct values of the chosen feature (BUG FIX: the original always
    # collected values from column 0 instead of the chosen column).
    featureValues = {row[bestFeatureIndex] for row in dataSet}
    # Build the label list for the subtrees without mutating the caller's
    # list (the original del'd from `labels` in place).
    subLabels = labels[:bestFeatureIndex] + labels[bestFeatureIndex + 1:]
    for value in featureValues:
        myTree[bestFeatureLabel][value] = createTree(
            splitDataSet(dataSet, bestFeatureIndex, value), subLabels[:])
    return myTree
# Return the class label that occurs most often in classList.
def majorityCnt(classList):
    """Majority vote: return the most frequent label in *classList*.

    Used when no features remain but the remaining rows still carry
    more than one class label.
    """
    counts = {}
    for label in classList:
        counts[label] = counts.get(label, 0) + 1
    # BUG FIX: dict.iteritems() is Python 2 only; the rest of the file
    # (print() call) targets Python 3, where it raises AttributeError.
    ranked = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[0][0]
if __name__ == "__main__":
    # Demo: build the tree for the toy data set and print it.  Guarded so
    # importing this module does not trigger the computation/print.
    dataSet, labels = createDataSet()
    myTree = createTree(dataSet, labels)
    print(myTree)
歡迎使用 {小書匠}(xiaoshujiang)編輯器,您可以通過==設定==裡的修改模板來改變新建文章的內容。