
Decision Tree Based on the ID3 Algorithm


from math import log
import operator

"""
使用ID3演算法劃分資料集,ID3演算法可以用於劃分標稱型資料集

決策樹分類器就像帶有終止塊的流程圖,終止塊表示分類結果。
開始處理資料集時,首先需要測量集合中資料的不一致,
然後尋找最優方案劃分資料集,直到資料集中的所有資料屬於同一分類
"""
def createDataSet():
    """ 不浮出水面是否可以生存     是否有腳蹼     屬於魚類 """
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    #change to discrete values
    return dataSet, labels

def calcShannonEnt(dataSet):
    """計算資訊熵(夏農熵)
    H(x) = -∑P(xi)log(2,P(xi)) (i=1,2,..n)
    H 表示資訊熵
    P 表示某種語言文字的字元出現的概率
    LOG2是以二為底的對數,用的是二進位制,資訊熵的單位是位元(BIT,即二進位制的0和1)

    熵也可以作為一個系統的混亂程度的標準

    另一種:基尼不純度 也可以作為衡量系統混亂程度的標準
        基尼 = 1 − ∑P(xi)^2 (i=1,2,..n)

    主要的區別就是基尼中把logP(xi)換成了P(xi),相比於熵,基尼反而有計算量小的優勢
    """
    numEntries = len(dataSet)  # total number of instances in the data set
    labelCounts = {}  # occurrence count per class label
    for featVec in dataSet:  # count the unique class labels and their occurrences
        currentLabel = featVec[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0  # information entropy
    for key in labelCounts:
        prob = float(labelCounts[key])/numEntries  # frequency of this class label
        shannonEnt -= prob * log(prob, 2)  # log base 2
    return shannonEnt
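
# Worked example (added sketch, not in the original post): the demo data set
# has 2 'yes' and 3 'no' labels, so
#     H = -(2/5)*log2(2/5) - (3/5)*log2(3/5) ≈ 0.9710 bits,
# which is what calcShannonEnt(createDataSet()[0]) returns.
#
# The docstring above also mentions Gini impurity; here is a minimal sketch of
# it with the same interface as calcShannonEnt (my addition, not from the post):
def calcGini(dataSet):
    """Gini impurity: 1 - sum_i P(x_i)^2."""
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    gini = 1.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        gini -= prob * prob  # subtract P(x_i)^2 for each class
    return gini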

def splitDataSet(dataSet, axis, value):
    """
    按照給定特徵劃分資料集  第axis個特徵是value的資料集
    :param dataSet: 待劃分的資料集
    :param axis: 劃分資料集的特徵索引
    :param value: 需要返回的特徵的值
    :return: 符合的資料集
    """
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]  # chop out the feature used for splitting
            reducedFeatVec.extend(featVec[axis+1:])  # row without the split column
            retDataSet.append(reducedFeatVec)  # collect the matching rows
    return retDataSet
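
# Examples (added; these match the commented-out calls in __main__ below):
#     splitDataSet(dataSet, 1, 0) -> [[1, 'no']]
#     splitDataSet(dataSet, 0, 0) -> [[1, 'no'], [1, 'no']]
# Note that the split column is removed from every returned row.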

def chooseBestFeatureToSplit(dataSet):
    """
    選擇最好的資料集劃分方式
    選擇熵最小的,也就是資料最純的
    :param dataSet:
    :return: 最好特徵劃分的索引值
    """
    numFeatures = len(dataSet[0]) - 1  # 特徵數,最後一個為標籤 #the last column is used for the labels
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):        #iterate over all the features
        featList = [example[i] for example in dataSet]#create a list of all the examples of this feature
        uniqueVals = set(featList)  # set of unique values of this feature
        newEntropy = 0.0
        for value in uniqueVals:  # split the data set once for each unique value of this feature
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet)/float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy     #calculate the info gain; ie reduction in entropy
        if (infoGain > bestInfoGain):       #compare this to the best gain so far
            bestInfoGain = infoGain         #if better than current best, set to best
            bestFeature = i
    return bestFeature                      #returns an integer
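
# Worked numbers for the demo data (added sketch): baseEntropy ≈ 0.9710.
# Feature 0: newEntropy = (3/5)*0.9183 + (2/5)*0.0 ≈ 0.5510, infoGain ≈ 0.4200.
# Feature 1: newEntropy = (4/5)*1.0 + (1/5)*0.0 = 0.8000, infoGain ≈ 0.1710.
# Feature 0 has the larger gain, so chooseBestFeatureToSplit(dataSet) returns 0.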

def majorityCnt(classList):
    """出現次數最多的分類名稱"""
    classCount={}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
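
# Example (added): majorityCnt(['yes', 'no', 'no']) -> 'no'. The standard
# library offers an equivalent one-liner, if you prefer it (a sketch):
#     from collections import Counter
#     Counter(classList).most_common(1)[0][0]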

def createTree(dataSet, labels):
    """
    構建決策數
    :param dataSet: 資料集
    :param labels: 標籤列表
    :return: 樹
    """
    print(dataSet)
    classList = [example[-1] for example in dataSet]  # 包含資料集的所有類標籤
    if classList.count(classList[0]) == len(classList):  # 所有類標籤完全相同,直接返回該標籤
        return classList[0] #stop splitting when all of the classes are equal
    if len(dataSet[0]) == 1:  # no features left to split on: return the majority class
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)  # index of the best feature to split on
    bestFeatLabel = labels[bestFeat]  # label of the best feature
    myTree = {bestFeatLabel: {}}
    del(labels[bestFeat])  # remove the used label (note: this mutates the caller's list)
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]  # copy the labels so recursion doesn't mess up the existing list
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
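
# Note (added): createTree deletes entries from `labels` as it recurses, so the
# caller's list is consumed. Pass a copy if you still need the original:
#     myTree = createTree(dataSet, labels[:])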


def classify(inputTree, featLabels, testVec):
    """
    決策樹分類
    :param inputTree: 決策樹
    :param featLabels: 標籤
    :param testVec:
    :return:
    """
    firstStr = list(inputTree)[0]   # 相當於list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    key = testVec[featIndex]
    valueOfFeat = secondDict[key]
    if isinstance(valueOfFeat, dict):
        classLabel = classify(valueOfFeat, featLabels, testVec)
    else: classLabel = valueOfFeat
    return classLabel
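
# Example (added sketch) with the tree built from the demo data:
#     myTree = {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
#     classify(myTree, ['no surfacing', 'flippers'], [1, 0]) -> 'no'
#     classify(myTree, ['no surfacing', 'flippers'], [1, 1]) -> 'yes'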

"""
 儲存決策樹
"""
def storeTree(inputTree, filename):
    import pickle
    fw = open(filename, 'wb')
    pickle.dump(inputTree, fw)
    fw.close()

def grabTree(filename):
    import pickle
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
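
# Usage (added; the filename is illustrative):
#     storeTree(myTree, 'classifierStorage.txt')
#     grabTree('classifierStorage.txt')  # -> the same nested-dict tree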


if __name__ == '__main__':
    # the higher the entropy, the more mixed the data
    dataSet, labels = createDataSet()
    # dataSet[0][-1] = 'maybe'
    # print(calcShannonEnt(dataSet))
    # print(splitDataSet(dataSet,1,0))
    # print(splitDataSet(dataSet,0,0))
    # print(chooseBestFeatureToSplit(dataSet))  # feature 0 is the best feature to split on

    """ 不浮出水面是否可以生存   是否有腳蹼   屬於魚類
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    
    按照第一個特徵屬性劃分資料
        特徵是1的:兩個魚類,一個不是魚類
        特徵是0的:都是魚類
    按照第二個特徵屬性劃分資料
        特徵是1的:兩個魚類,兩個不是魚類
        特徵是0的:都不是魚類   
    比較得出第一種分組的輸出結果較好 
    """
    # print(createTree(dataSet, labels))  # {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
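
    # End-to-end demo (added sketch, not in the original post). createTree
    # consumes `labels`, so keep a copy for classify(); the pickle filename
    # is illustrative.
    featLabels = labels[:]
    myTree = createTree(dataSet, labels)
    print(myTree)  # {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
    print(classify(myTree, featLabels, [1, 1]))  # 'yes'
    storeTree(myTree, 'classifierStorage.txt')
    print(grabTree('classifierStorage.txt'))  # round-trips via pickle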