
Decision Tree Based on the ID3 Algorithm


from math import log
import operator

"""
使用ID3演算法劃分資料集,ID3演算法可以用於劃分標稱型資料集

決策樹分類器就像帶有終止塊的流程圖,終止塊表示分類結果。
開始處理資料集時,首先需要測量集合中資料的不一致,
然後尋找最優方案劃分資料集,直到資料集中的所有資料屬於同一分類
"""
def createDataSet():
    """ 不浮出水面是否可以生存     是否有腳蹼     屬於魚類 """
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    #change to discrete values
    return dataSet, labels

def calcShannonEnt(dataSet):
    """計算資訊熵(夏農熵)
    H(x) = -∑P(xi)log(2,P(xi)) (i=1,2,..n)
    H 表示資訊熵
    P 表示某種語言文字的字元出現的概率
    LOG2是以二為底的對數,用的是二進位制,資訊熵的單位是位元(BIT,即二進位制的0和1)

    熵也可以作為一個系統的混亂程度的標準

    另一種:基尼不純度 也可以作為衡量系統混亂程度的標準
        基尼 = 1 − ∑P(xi)^2 (i=1,2,..n)

    主要的區別就是基尼中把logP(xi)換成了P(xi),相比於熵,基尼反而有計算量小的優勢
    """
    numEntries = len(dataSet)  # total number of instances in the data set
    labelCounts = {}  # occurrence count per class label
    for featVec in dataSet:  # count the unique class labels and their occurrences
        currentLabel = featVec[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0  # information entropy
    for key in labelCounts:
        prob = float(labelCounts[key])/numEntries  # frequency of this class label
        shannonEnt -= prob * log(prob, 2)  # log base 2
    return shannonEnt
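
# Worked example (added sketch, not in the original post): the demo data set
# has 2 'yes' and 3 'no' labels, so
#     H = -(2/5)*log2(2/5) - (3/5)*log2(3/5) ≈ 0.9710 bits,
# which is what calcShannonEnt(createDataSet()[0]) returns.
#
# The docstring above also mentions Gini impurity; here is a minimal sketch of
# it with the same interface as calcShannonEnt (my addition, not from the post):
def calcGini(dataSet):
    """Gini impurity: 1 - sum_i P(x_i)^2."""
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    gini = 1.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        gini -= prob * prob  # subtract P(x_i)^2 for each class
    return gini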

def splitDataSet(dataSet, axis, value):
    """
    按照給定特徵劃分資料集  第axis個特徵是value的資料集
    :param dataSet: 待劃分的資料集
    :param axis: 劃分資料集的特徵索引
    :param value: 需要返回的特徵的值
    :return: 符合的資料集
    """
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]  # chop out the feature used for splitting
            reducedFeatVec.extend(featVec[axis+1:])  # row without the split column
            retDataSet.append(reducedFeatVec)  # collect the matching rows
    return retDataSet
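
# Examples (added; these match the commented-out calls in __main__ below):
#     splitDataSet(dataSet, 1, 0) -> [[1, 'no']]
#     splitDataSet(dataSet, 0, 0) -> [[1, 'no'], [1, 'no']]
# Note that the split column is removed from every returned row.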

def chooseBestFeatureToSplit(dataSet):
    """
    選擇最好的資料集劃分方式
    選擇熵最小的,也就是資料最純的
    :param dataSet:
    :return: 最好特徵劃分的索引值
    """
    numFeatures = len(dataSet[0]) - 1  # 特徵數,最後一個為標籤 #the last column is used for the labels
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):        #iterate over all the features
        featList = [example[i] for example in dataSet]#create a list of all the examples of this feature
        uniqueVals = set(featList)  # set of unique values of this feature
        newEntropy = 0.0
        for value in uniqueVals:  # split the data set once for each unique value of this feature
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet)/float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy     #calculate the info gain; ie reduction in entropy
        if (infoGain > bestInfoGain):       #compare this to the best gain so far
            bestInfoGain = infoGain         #if better than current best, set to best
            bestFeature = i
    return bestFeature                      #returns an integer
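
# Worked numbers for the demo data (added sketch): baseEntropy ≈ 0.9710.
# Feature 0: newEntropy = (3/5)*0.9183 + (2/5)*0.0 ≈ 0.5510, infoGain ≈ 0.4200.
# Feature 1: newEntropy = (4/5)*1.0 + (1/5)*0.0 = 0.8000, infoGain ≈ 0.1710.
# Feature 0 has the larger gain, so chooseBestFeatureToSplit(dataSet) returns 0.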

def majorityCnt(classList):
    """出現次數最多的分類名稱"""
    classCount={}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
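
# Example (added): majorityCnt(['yes', 'no', 'no']) -> 'no'. The standard
# library offers an equivalent one-liner, if you prefer it (a sketch):
#     from collections import Counter
#     Counter(classList).most_common(1)[0][0]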

def createTree(dataSet, labels):
    """
    構建決策數
    :param dataSet: 資料集
    :param labels: 標籤列表
    :return: 樹
    """
    print(dataSet)
    classList = [example[-1] for example in dataSet]  # 包含資料集的所有類標籤
    if classList.count(classList[0]) == len(classList):  # 所有類標籤完全相同,直接返回該標籤
        return classList[0] #stop splitting when all of the classes are equal
    if len(dataSet[0]) == 1:  # no features left to split on: return the majority class
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)  # index of the best feature to split on
    bestFeatLabel = labels[bestFeat]  # label of the best feature
    myTree = {bestFeatLabel: {}}
    del(labels[bestFeat])  # remove the used label (note: this mutates the caller's list)
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]  # copy the labels so recursion doesn't mess up the existing list
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
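
# Note (added): createTree deletes entries from `labels` as it recurses, so the
# caller's list is consumed. Pass a copy if you still need the original:
#     myTree = createTree(dataSet, labels[:])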


def classify(inputTree, featLabels, testVec):
    """
    決策樹分類
    :param inputTree: 決策樹
    :param featLabels: 標籤
    :param testVec:
    :return:
    """
    firstStr = list(inputTree)[0]   # 相當於list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    key = testVec[featIndex]
    valueOfFeat = secondDict[key]
    if isinstance(valueOfFeat, dict):
        classLabel = classify(valueOfFeat, featLabels, testVec)
    else: classLabel = valueOfFeat
    return classLabel
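
# Example (added sketch) with the tree built from the demo data:
#     myTree = {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
#     classify(myTree, ['no surfacing', 'flippers'], [1, 0]) -> 'no'
#     classify(myTree, ['no surfacing', 'flippers'], [1, 1]) -> 'yes'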

"""
 儲存決策樹
"""
def storeTree(inputTree, filename):
    import pickle
    fw = open(filename, 'wb')
    pickle.dump(inputTree, fw)
    fw.close()

def grabTree(filename):
    import pickle
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
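
# Usage (added; the filename is illustrative):
#     storeTree(myTree, 'classifierStorage.txt')
#     grabTree('classifierStorage.txt')  # -> the same nested-dict tree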


if __name__ == '__main__':
    # the higher the entropy, the more mixed the data
    dataSet, labels = createDataSet()
    # dataSet[0][-1] = 'maybe'
    # print(calcShannonEnt(dataSet))
    # print(splitDataSet(dataSet,1,0))
    # print(splitDataSet(dataSet,0,0))
    # print(chooseBestFeatureToSplit(dataSet))  # feature 0 is the best feature to split on

    """ 不浮出水面是否可以生存   是否有腳蹼   屬於魚類
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    
    按照第一個特徵屬性劃分資料
        特徵是1的:兩個魚類,一個不是魚類
        特徵是0的:都是魚類
    按照第二個特徵屬性劃分資料
        特徵是1的:兩個魚類,兩個不是魚類
        特徵是0的:都不是魚類   
    比較得出第一種分組的輸出結果較好 
    """
    # print(createTree(dataSet, labels))  # {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
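
    # End-to-end demo (added sketch, not in the original post). createTree
    # consumes `labels`, so keep a copy for classify(); the pickle filename
    # is illustrative.
    featLabels = labels[:]
    myTree = createTree(dataSet, labels)
    print(myTree)  # {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
    print(classify(myTree, featLabels, [1, 1]))  # 'yes'
    storeTree(myTree, 'classifierStorage.txt')
    print(grabTree('classifierStorage.txt'))  # round-trips via pickle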