
Decision Tree: the ID3 Algorithm

Code first; the theory will be filled in when time allows. Written in Python 3.x, without the pruning step.
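In the meantime, a brief summary of the two standard quantities the code computes: the entropy of a data set D whose classes occur with proportions p_k, and the information gain of splitting D on attribute a into subsets D^v, one per value v of a:

H(D) = -\sum_k p_k \log_2 p_k

\mathrm{Gain}(D, a) = H(D) - \sum_v \frac{|D^v|}{|D|} H(D^v)

ID3 greedily splits each node on the attribute with the largest gain.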

import math
import operator


# Compute the information entropy of a data set
def calcEntropy(data):
    numClass = {}
    Entropy = 0.0
    label = [sample[-1] for sample in data]   # the class label is the last column
    for i in label:
        numClass[i] = numClass.get(i, 0) + 1
    for i in numClass:
        prob = numClass[i] / len(label)       # probability of each class
        Entropy -= prob * math.log(prob, 2)   # accumulate -p * log2(p) over the classes
    return Entropy
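A quick sanity check against the sample data defined at the bottom of the script (two 'yes' labels, three 'no'):

data = [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
print(calcEntropy(data))    # -(2/5)*log2(2/5) - (3/5)*log2(3/5) ≈ 0.971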


# Extract the samples in data whose attribute i equals setValue
def splitData(data, i, setValue):
    subData = []
    for sample in data:
        if sample[i] == setValue:
            reducedSample = sample[:i]           # drop attribute i from the sample
            reducedSample.extend(sample[i+1:])
            subData.append(reducedSample)
    return subData
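For example, selecting the samples whose attribute 0 equals 1 (and dropping that attribute):

data = [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
print(splitData(data, 0, 1))    # [[1, 'yes'], [1, 'yes'], [0, 'no']]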


# Select the attribute with the largest information gain (the ID3 criterion)
def selAttribute(data):
    totalEntropy = calcEntropy(data)     # entropy of the whole data set
    bestGainEn = 0.0                     # best information gain seen so far
    bestIndex = -1                       # index of the best splitting attribute
    for i in range(len(data[0]) - 1):    # iterate over every attribute in data
        valueList = [sample[i] for sample in data]   # value of attribute i in each sample
        numValue = {}
        Entropy = 0.0
        for v in valueList:              # count how often each value occurs
            numValue[v] = numValue.get(v, 0) + 1
        values = set(valueList)          # all distinct values of attribute i
        for value in values:
            subData = splitData(data, i, value)
            subEntropy = calcEntropy(subData)
            prob = numValue[value] / len(valueList)
            Entropy += prob * subEntropy     # conditional entropy given attribute i
        GainEn = totalEntropy - Entropy      # information gain
        if GainEn > bestGainEn:              # keep the attribute with the largest gain
            bestGainEn = GainEn
            bestIndex = i
    return bestIndex    # index of the splitting attribute
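On the sample data, attribute 0 ('no surfacing') has gain ≈ 0.971 - 0.551 = 0.420 and attribute 1 ('flippers') has gain ≈ 0.971 - 0.800 = 0.171, so the best split is attribute 0:

data = [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
print(selAttribute(data))    # 0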


# For the final split, label the leaf with the most common class among its samples
def majorVote(classList):
    classCount = {}
    for i in classList:
        classCount[i] = classCount.get(i, 0) + 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
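For example:

print(majorVote(['yes', 'no', 'no']))    # 'no'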


def createTree(data, attribute):        # data is the data set, attribute its list of attribute names
    classList = [sample[-1] for sample in data]
    if len(set(classList)) == 1:    # all samples share one class: label the leaf with it
        return classList[0]
    if len(data[0]) == 1:   # only the class column remains: label the leaf by majority vote
        return majorVote(classList)
    attributeIndex = selAttribute(data)         # index of the best splitting attribute
    bestAttribute = attribute[attributeIndex]   # look up its name in the attribute list
    myTree = {bestAttribute: {}}    # the tree is stored as nested dicts
    del attribute[attributeIndex]   # remove the attribute just used
    attributeValue = [sample[attributeIndex] for sample in data]
    branch = set(attributeValue)    # one branch per distinct value
    for value in branch:
        subattribute = attribute[:]  # copy, so recursive calls cannot modify it
        subData = splitData(data, attributeIndex, value)
        myTree[bestAttribute][value] = createTree(subData, subattribute)
    return myTree
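The nested-dict tree can then be walked to label a new sample. A minimal sketch (the classify helper below is not part of the original post; it assumes attribute is the original, unmodified attribute-name list):

def classify(tree, attribute, sample):
    root = next(iter(tree))                  # attribute name stored at this node
    value = sample[attribute.index(root)]    # the sample's value for that attribute
    subtree = tree[root][value]
    if isinstance(subtree, dict):            # internal node: keep descending
        return classify(subtree, attribute, sample)
    return subtree                           # leaf: the class label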


def createDataSet():
    data = [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
    attribute = ['no surfacing', 'flippers']
    # data = [[1, 0, 'good'], [1, 0, 'good'], [0, 0, 'bad'], [0, 1, 'bad'], [1, 1, 'bad']]
    # attribute = ['根蒂', '紋理']
    return data, attribute


if __name__ == '__main__':
    data, attribute = createDataSet()
    Tree = createTree(data, attribute)
    print(Tree)
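Since createTree deletes entries from attribute as it recurses, pass a copy (e.g. attribute[:]) if the list is needed afterwards, for instance by the classify sketch above. With the sample data the script prints the familiar tree (up to key order):

{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}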