機器學習演算法與程式設計--鄭捷 C45D演算法 python3實現 修改部分
此演算法需要更改的地方,除了上篇寫到的loadDataSet函式之外,還有課本中getBestFeat()函式的資訊增益計算公式:該處給出的矩陣相除在py3無法執行,需要改為dot(A,B.T)形式。
具體程式碼
def getBestFeat(self,dataSet):
    """Select the feature with the highest information-gain ratio (C4.5).

    For every feature column the method computes
    ``(base entropy - conditional entropy) / split information`` and
    returns the column whose ratio is largest.

    Args:
        dataSet: Non-empty sequence of rows; the last element of each row
            is treated as the class label and every preceding element as
            a feature value.

    Returns:
        A ``(bestFeatureIndex, featureValues)`` tuple, where
        ``featureValues`` is the value list that
        ``self.computeSplitInfo`` returned for the chosen column.

    Raises:
        IndexError: If ``dataSet`` is empty or its rows hold no feature
            columns.
    """
    num_feats = len(dataSet[0][:-1])
    totality = float(len(dataSet))
    base_entropy = self.computeEntropy(dataSet)
    gain_ratios = []
    all_feat_values = []
    for f in range(num_feats):
        feat_column = [example[f] for example in dataSet]
        split_info, feature_values = self.computeSplitInfo(feat_column)
        all_feat_values.append(feature_values)
        # Conditional entropy: entropy of each partition weighted by the
        # fraction of rows that fall into it.
        condition_entropy = 0.0
        for value in feature_values:
            subset = self.splitDataSet(dataSet, f, value)
            condition_entropy += (len(subset) / totality) * self.computeEntropy(subset)
        info_gain = base_entropy - condition_entropy
        # The gain ratio is a per-feature quotient.  A dot product would
        # fold every feature into one scalar and always select index 0.
        # A column with a single distinct value has zero split information;
        # score it 0.0 rather than dividing by zero.
        gain_ratios.append(info_gain / split_info if split_info > 0 else 0.0)
    best_feature_index = argsort(-array(gain_ratios))[0]
    return best_feature_index, all_feat_values[best_feature_index]