機器學習實戰例項練習-計算給定資料集的夏農熵
阿新 • 發佈:2019-02-13
本文內容以程式碼為主(詳細請參考<機器學習實戰>書籍),主要用於自讀回顧,故註釋未精簡化,若發現錯誤還望各位前輩批評指正.
資訊熵:用來描述系統資訊量的不確定度.
from math import log


def calcShannonEnt(dataSet):
    """Compute the Shannon entropy of the class labels in dataSet.

    Each row of dataSet is a list whose LAST element is the class label.
    Returns -sum(p * log2(p)) over the observed label frequencies.
    """
    numEntries = len(dataSet)
    labelCounts = {}
    # Build a frequency table of every label seen in the data set.
    for featVec in dataSet:
        currentLabel = featVec[-1]  # last column is the label
        if currentLabel not in labelCounts:  # idiomatic: no .keys() needed
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries  # label probability
        shannonEnt -= prob * log(prob, 2)  # accumulate -p * log2(p)
    return shannonEnt


# Manual check: Ent = -(0.4 * log2(0.4)) - (0.6 * log2(0.6))
# Ent_manual = -(0.4 * log(0.4, 2)) - (0.6 * log(0.6, 2))
# print(Ent_manual)


def createDataSet():
    """Return a tiny demo data set and its feature-name labels.

    Columns: [no surfacing, flippers, class label].
    """
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels


def splitDataSet(dataSet, axis, value):
    """Return the rows whose feature at index `axis` equals `value`,
    with that feature column removed from each returned row.

    dataSet -- list of feature rows (last element is the label)
    axis    -- index of the feature column to split on
    value   -- feature value that a row must match to be kept
    """
    retDataSet = []  # new list so the input data set is not mutated
    for featVec in dataSet:
        if featVec[axis] == value:
            # Copy everything except the column we split on.
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet


def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the highest information gain.

    Prints the per-feature split details (tutorial trace output).
    Returns -1 if no split improves on the base entropy.
    """
    numFeatures = len(dataSet[0]) - 1  # last column is the label, not a feature
    baseEntropy = calcShannonEnt(dataSet)  # entropy before any split
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        # Unique values taken by feature i across the data set.
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        newEntropy = 0.0
        # Weighted entropy of the partition induced by feature i.
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            print(subDataSet)  # tutorial trace: show each partition
            prob = len(subDataSet) / float(len(dataSet))
            # NOTE: entropy of the SUBSET, not the whole data set.
            newEntropy += prob * calcShannonEnt(subDataSet)
        print("當i={}時得到的熵為".format(i), newEntropy)
        infoGain = baseEntropy - newEntropy  # information gain of this split
        if (infoGain > bestInfoGain):
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


if __name__ == "__main__":
    myDat, labels = createDataSet()
    a = splitDataSet(myDat, 0, 0)
    # print(a)
    b = chooseBestFeatureToSplit(myDat)
    print(b)

# append() vs extend():
# a = [1, 2, 3]; b = [4, 5, 6]; c = [7, 8, 9]
# a.append(b)  ->  [1, 2, 3, [4, 5, 6]]   (nests the whole list as one element)
# b.extend(c)  ->  [4, 5, 6, 7, 8, 9]     (concatenates element by element)