決策樹——ID3演算法
阿新 • • 發佈:2019-02-16
先上程式碼,理論有空補上,採用python 3.X編寫,沒有剪枝部分
import math
import operator


def calcEntropy(data):
    """Return the Shannon entropy of the class labels in *data*.

    Each sample is a list whose last element is its class label.
    """
    numClass = {}
    label = [sample[-1] for sample in data]
    for c in label:
        numClass[c] = numClass.get(c, 0) + 1
    entropy = 0.0
    for c in numClass:
        prob = numClass[c] / len(label)      # probability of class c
        entropy -= prob * math.log(prob, 2)  # sum of -p*log2(p) per class
    return entropy


def splitData(data, i, setValue):
    """Return the samples whose feature *i* equals *setValue*.

    Feature *i* is removed from each returned sample, since ID3 never
    reuses a feature below the node that split on it.
    """
    subData = []
    for sample in data:
        if sample[i] == setValue:
            reducedSample = sample[:i]          # drop column i
            reducedSample.extend(sample[i + 1:])
            subData.append(reducedSample)
    return subData


def selAttribute(data):
    """Return the index of the feature with the largest information gain.

    BUG FIX: the original shadowed the feature index ``i`` with an inner
    loop variable (so ``splitData`` was called with a feature *value* as
    the index) and returned the first feature whose gain exceeded the
    running value instead of the argmax over all features.
    """
    totalEntropy = calcEntropy(data)
    bestGain = 0.0
    bestFeature = 0
    for i in range(len(data[0]) - 1):  # every feature column (last is label)
        valueList = [sample[i] for sample in data]
        numvalue = {}
        for v in valueList:            # count occurrences of each value
            numvalue[v] = numvalue.get(v, 0) + 1
        condEntropy = 0.0
        for value in set(valueList):   # conditional entropy H(D|feature i)
            subData = splitData(data, i, value)
            prob = numvalue[value] / len(valueList)
            condEntropy += prob * calcEntropy(subData)
        gain = totalEntropy - condEntropy  # information gain of feature i
        if gain > bestGain:
            bestGain = gain
            bestFeature = i
    return bestFeature


def majorVote(classList):
    """Return the majority class label in *classList* (ties broken by order).

    BUG FIX: the original called ``sorted(classCount.items, ...)`` without
    parentheses, which raises ``TypeError`` at runtime.
    """
    classCount = {}
    for c in classList:
        classCount[c] = classCount.get(c, 0) + 1
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def createTree(data, attribute):
    """Recursively build an ID3 decision tree as nested dicts.

    *data* is the sample list, *attribute* the feature names (mutated:
    the chosen feature is deleted at each level, as in the original).
    Leaves are class labels; internal nodes are {feature: {value: subtree}}.
    """
    classList = [sample[-1] for sample in data]
    if len(set(classList)) == 1:
        return classList[0]          # pure node: all samples share one label
    if len(data[0]) == 1:
        return majorVote(classList)  # no features left: majority vote
    attributeIndex = selAttribute(data)       # best split feature index
    bestAttribute = attribute[attributeIndex]
    myTree = {bestAttribute: {}}
    del attribute[attributeIndex]             # feature consumed at this node
    branch = set(sample[attributeIndex] for sample in data)
    for value in branch:
        subattribute = attribute[:]  # copy so sibling branches don't share it
        subData = splitData(data, attributeIndex, value)
        myTree[bestAttribute][value] = createTree(subData, subattribute)
    return myTree


def createDataSet():
    """Return the toy "is it a fish" dataset and its feature names."""
    data = [[1, 1, 'yes'],
            [1, 1, 'yes'],
            [1, 0, 'no'],
            [0, 1, 'no'],
            [0, 1, 'no']]
    attribute = ['no surfacing', 'flippers']
    return data, attribute


if __name__ == '__main__':
    data, attribute = createDataSet()
    Tree = createTree(data, attribute)
    print(Tree)