Decision Tree Based on the ID3 Algorithm
阿新 • Published: 2020-12-30
from math import log
import operator
import pickle

"""
Split a data set with the ID3 algorithm. ID3 can be used to partition
nominal (discrete-valued) data sets.
A decision-tree classifier is like a flowchart with terminating blocks,
where each terminating block represents a classification result.
To process a data set, first measure the inconsistency (entropy) of the
collection, then find the best split, and repeat until all records in a
subset belong to the same class.
"""


def createDataSet():
    """
    Columns: can survive without surfacing | has flippers | is a fish
    """
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']  # change to discrete values
    return dataSet, labels


def calcShannonEnt(dataSet):
    """Compute the information entropy (Shannon entropy).

    H(X) = -Σ P(xi) * log2(P(xi)),  i = 1, 2, ..., n

    H is the information entropy and P(xi) the probability of class xi
    (in Shannon's original setting, the frequency with which a character
    appears in some language's text). log2 is the base-2 logarithm, so
    entropy is measured in bits (binary 0s and 1s).
    Entropy can serve as a measure of how disordered a system is.

    An alternative measure of disorder is Gini impurity:
        Gini = 1 - Σ P(xi)^2,  i = 1, 2, ..., n
    The main difference is that Gini replaces log P(xi) with P(xi), which
    makes it cheaper to compute than entropy.
    """
    numEntries = len(dataSet)        # total number of instances in the data set
    labelCounts = {}                 # occurrences of each class label
    for featVec in dataSet:          # count the unique labels
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0                 # information entropy
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries  # frequency of the class
        shannonEnt -= prob * log(prob, 2)            # log base 2
    return shannonEnt


def splitDataSet(dataSet, axis, value):
    """Split the data set on a given feature: keep the records whose
    feature `axis` equals `value`, with that feature column removed.
    :param dataSet: data set to split
    :param axis: index of the feature to split on
    :param value: feature value the returned records must have
    :return: the matching subset
    """
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]          # chop out the feature used for splitting
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet


def chooseBestFeatureToSplit(dataSet):
    """Choose the best way to split the data set: the split with the
    lowest resulting entropy, i.e. the purest subsets.
    :param dataSet:
    :return: index of the best feature to split on
    """
    numFeatures = len(dataSet[0]) - 1    # the last column holds the class labels
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):         # iterate over all the features
        featList = [example[i] for example in dataSet]  # all values of this feature
        uniqueVals = set(featList)       # unique values of the current feature
        newEntropy = 0.0
        for value in uniqueVals:         # split the data once per unique value
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy  # information gain: the reduction in entropy
        if infoGain > bestInfoGain:      # compare to the best gain so far
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature                   # returns an integer


def majorityCnt(classList):
    """Return the class name that occurs most often."""
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def createTree(dataSet, labels):
    """Build the decision tree.
    :param dataSet: data set
    :param labels: list of feature labels
    :return: the tree, as nested dicts
    """
    # print(dataSet)  # debug: show the subset handled by this call
    classList = [example[-1] for example in dataSet]  # all class labels in this data set
    if classList.count(classList[0]) == len(classList):
        return classList[0]      # stop splitting when all the classes are equal
    if len(dataSet[0]) == 1:     # stop splitting when there are no more features
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)  # index of the best split feature
    bestFeatLabel = labels[bestFeat]              # label of the best split feature
    myTree = {bestFeatLabel: {}}
    del labels[bestFeat]         # remove the label that has been used
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]    # copy the labels so recursive calls don't mess up the existing list
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree


def classify(inputTree, featLabels, testVec):
    """Classify a feature vector with a decision tree.
    :param inputTree: the decision tree
    :param featLabels: list of feature labels
    :param testVec: feature vector to classify
    :return: the predicted class label
    """
    firstStr = list(inputTree)[0]   # same as list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    key = testVec[featIndex]
    valueOfFeat = secondDict[key]
    if isinstance(valueOfFeat, dict):   # internal node: keep descending
        classLabel = classify(valueOfFeat, featLabels, testVec)
    else:                               # leaf node: its value is the class
        classLabel = valueOfFeat
    return classLabel


# Persisting the decision tree.

def storeTree(inputTree, filename):
    """Store the decision tree on disk with pickle."""
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)


def grabTree(filename):
    """Load a pickled decision tree from disk."""
    with open(filename, 'rb') as fr:
        return pickle.load(fr)


if __name__ == '__main__':
    # The higher the entropy, the more mixed the data.
    dataSet, labels = createDataSet()
    # dataSet[0][-1] = 'maybe'
    # print(calcShannonEnt(dataSet))
    # print(splitDataSet(dataSet, 1, 0))
    # print(splitDataSet(dataSet, 0, 0))
    # print(chooseBestFeatureToSplit(dataSet))  # feature 0 is the best one to split on
    """
    Columns: can survive without surfacing | has flippers | is a fish
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    Splitting on the first feature:
        records with value 1: two fish, one non-fish
        records with value 0: none are fish
    Splitting on the second feature:
        records with value 1: two fish, two non-fish
        records with value 0: none are fish
    Comparing the two, the first grouping gives the better result.
    """
    # print(createTree(dataSet, labels))
    # {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
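To make the entropy and information-gain numbers concrete, here is the hand calculation for the sample data set, worked out here for illustration (it is not in the original post). Of the 5 class labels, 2 are 'yes' and 3 are 'no':

from math import log

# Base entropy of the sample data set: 2 'yes' and 3 'no' out of 5.
h = -(2/5) * log(2/5, 2) - (3/5) * log(3/5, 2)
print(round(h, 3))  # 0.971

# Information gain of each feature, following chooseBestFeatureToSplit:
# feature 0: subsets have entropies 0.918 (value 1: [yes, yes, no]) and
#            0.0 (value 0: [no, no]), so gain = 0.971 - (3/5)*0.918 ≈ 0.420
# feature 1: subsets have entropies 1.0 (value 1: [yes, yes, no, no]) and
#            0.0 (value 0: [no]),     so gain = 0.971 - (4/5)*1.0   ≈ 0.171
# Feature 0 has the larger gain, which is why it becomes the root of the tree.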
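A minimal end-to-end driver, sketched here rather than taken from the original post, shows how the pieces fit together: build the tree, classify two test vectors, and round-trip the tree through pickle. The filename 'classifierStorage.txt' is an illustrative choice. Note that a copy of `labels` is passed to createTree, because createTree deletes used labels in place and classify still needs the full list:

dataSet, labels = createDataSet()
myTree = createTree(dataSet, labels[:])  # pass a copy: createTree mutates its label list
print(myTree)
# {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}

print(classify(myTree, labels, [1, 0]))  # 'no'  (doesn't have flippers)
print(classify(myTree, labels, [1, 1]))  # 'yes' (a fish)

storeTree(myTree, 'classifierStorage.txt')  # illustrative filename
print(grabTree('classifierStorage.txt'))    # the same tree, loaded back from disk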
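The calcShannonEnt docstring mentions Gini impurity as a cheaper alternative to entropy. A minimal sketch of that criterion follows; calcGini is a hypothetical helper, not part of the original code:

def calcGini(dataSet):
    """Hypothetical helper: Gini impurity, Gini = 1 - Σ P(xi)^2.
    Cheaper than entropy because it avoids the logarithm."""
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:                  # count the occurrences of each class
        currentLabel = featVec[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    gini = 1.0
    for count in labelCounts.values():
        prob = count / float(numEntries)     # frequency of the class
        gini -= prob * prob
    return gini

Swapping calcGini in for calcShannonEnt inside chooseBestFeatureToSplit gives a CART-style split criterion; on this toy data set both criteria pick feature 0 first and build the same tree.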