Watermelon Book (西瓜書) Exercise 4.4: Implementing a Gini-index decision tree in code
Data and code: https://github.com/qdbszsj/decisionTreeGini
The code here needs only small changes on top of the information-entropy decision tree from the previous exercise: there, the splitting attribute was the one with the largest information gain; here, the tree is built by picking the attribute with the smallest Gini index, which reflects the purity of the sample set D (lower means purer).
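For reference, here are formulas 4.5 and 4.6 from the book as a minimal, self-contained sketch. The function names gini/gini_index and the values/labels arguments are mine for illustration, not part of the repository code. Note also that this sketch keeps the |D^v|/|D| weight that formula 4.6 specifies, whereas the listing further down sums the per-branch Gini values unweighted.

from collections import Counter

def gini(labels):
    """Formula 4.5: Gini(D) = 1 - sum_k p_k^2 over the class frequencies."""
    total = len(labels)
    return 1 - sum((c / total) ** 2 for c in Counter(labels).values())

def gini_index(values, labels):
    """Formula 4.6: Gini_index(D, a) = sum_v |D^v|/|D| * Gini(D^v),
    where D is split into subsets D^v by the values of attribute a."""
    total = len(labels)
    by_value = {}
    for v, y in zip(values, labels):
        by_value.setdefault(v, []).append(y)
    return sum(len(ys) / total * gini(ys) for ys in by_value.values())

# e.g. gini_index(['凹陷', '凹陷', '平坦'], ['是', '是', '否']) -> 0.0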
A lot of people online complain that the tree they build doesn't look like the one in the book. Of course it doesn't: the tree on page 81 of the book is built from entropy, while ours is built from the Gini index. On top of that, ties between attributes are common. At the root node, for example, 臍部 (navel) and 色澤 (colour) both come out with a Gini index of 0.875, so either one is a valid split. I therefore break such ties randomly (see the sketch below), which means the tree this code produces changes from run to run; in theory the exercise has many correct answers.
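The tie-breaking itself is only a couple of lines; a minimal sketch, where the score values are made up for illustration:

import random

# Hypothetical per-attribute Gini-index values with a tie at the minimum.
scores = {'臍部': 0.875, '色澤': 0.875, '敲聲': 1.25}
best = min(scores.values())
candidates = [a for a, s in scores.items() if s == best]
chosen = random.choice(candidates)  # any tied attribute is an equally valid split
print(chosen)  # '臍部' or '色澤', varying between runs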
For a step-by-step explanation of the code, see my previous post on exercise 4.3; the changes here are small.
One thing to watch out for: a bug that took me a long time to track down. The parameter A I pass around should be a list, but I initialised it with np.ones, and then tried to copy it with newA = A[:]. That goes wrong: slicing a numpy array with A[:] does not produce an independent copy, only a view of the same buffer, so modifying the "copy" modifies A as well. This bit me inside the recursive tree-building function. There are two fixes: convert with list(A) up front, after which B = A[:] makes an independent copy (a shallow copy, which is enough for a flat list of numbers); or keep the numpy array, import copy, and use B = copy.deepcopy(A).
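Here is a minimal demonstration of the difference (the original listing carried a similar commented-out scratch test at the very end):

import copy
import numpy as np

a = np.ones(3)
b = a[:]          # numpy slicing returns a VIEW of the same buffer
b[0] = 999
print(a)          # [999.   1.   1.] -- a changed too

a = list(np.ones(3))
b = a[:]          # slicing a plain list makes an independent copy
b[0] = 999
print(a)          # [1.0, 1.0, 1.0] -- a is untouched

a = np.ones(3)
b = copy.deepcopy(a)  # or a.copy(); both give an independent array
b[0] = 999
print(a)          # [1. 1. 1.]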
At the bottom of the program I use the finished tree to predict the 7 test cases and print the confusion matrix. Since the tree comes out differently on every run, the confusion matrix varies a lot between runs as well; on one lucky run I hit a perfect result, 100% accuracy. In practice this is also how model selection works: among such candidate models we keep the one that performs best on held-out data (strictly speaking a validation set, so the final test set stays untouched).
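For reference, here is how the 2x2 matrix printed by treePredictSet maps to accuracy; the matrix values below are made up for illustration:

import numpy as np

# Hypothetical confusion matrix: rows = true class ('否', '是'),
# columns = predicted class; the diagonal counts correct predictions.
cm = np.array([[3., 0.],
               [1., 3.]])
accuracy = np.trace(cm) / cm.sum()  # (3 + 3) / 7 ≈ 0.857
print(accuracy)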
Pruning is not implemented here; I will add it when I get the time.
# Build watermelon 2.0 data from watermelon 3.0 (run once, then keep commented out):
# import numpy as np
# import pandas as pd
# dataset = pd.read_csv('/home/parker/watermelonData/watermelon_3.csv', delimiter=",")
# del dataset['密度']      # drop the two continuous columns (density, sugar content)
# del dataset['含糖率']
# dataset.to_csv('/home/parker/watermelonData/watermelon_2.csv', header=True, index=False)

# Split watermelon 2.0 into the train/test sets used in the book (run once):
# import numpy as np
# import pandas as pd
# dataset = pd.read_csv('/home/parker/watermelonData/watermelon_2.csv', delimiter=",")
# trainID = [0, 1, 2, 5, 6, 9, 13, 14, 15, 16]
# testID = [3, 4, 7, 8, 10, 11, 12]
# trainData = dataset.iloc[trainID, range(8)]
# testData = dataset.iloc[testID, range(8)]
# print(trainData)
# print(testData)
# trainData.to_csv('/home/parker/watermelonData/watermelon_2train.csv', header=True, index=False)
# testData.to_csv('/home/parker/watermelonData/watermelon_2test.csv', header=True, index=False)

import copy
import random

import numpy as np
import pandas as pd

dataset = pd.read_csv('/home/parker/watermelonData/watermelon_2train.csv', delimiter=",")
testData = pd.read_csv('/home/parker/watermelonData/watermelon_2test.csv', delimiter=",")
print(dataset)

Attributes = dataset.columns
m, n = np.shape(dataset)
dataset = np.matrix(dataset)

# Collect the full value set of every column, so that condition (3) below can
# enumerate attribute values that are missing from a subset D.
attributeSet = []
for i in range(n):
    curSet = set()
    for j in range(m):
        curSet.add(dataset[j, i])
    attributeSet.append(curSet)

DD = np.arange(0, m, 1)   # indices of all training samples
AA = np.ones(n)
AA = list(AA)             # must be a list: slicing a numpy array gives a view, not a copy
AA[0] = AA[n - 1] = -1    # exclude column 0 (the ID column) and column n-1 (the label)
EPS = 1


class Node(object):
    def __init__(self, title):
        self.title = title    # edge label from the parent (attribute value)
        self.v = 1            # attribute question, or a class label at a leaf
        self.children = []
        self.deep = 0         # for plotting
        self.ID = -1          # for plotting; leaves get IDs >= 0


def treeGenerate(D, A, title):
    node = Node(title)
    if isSameY(D):  # p74 condition (1): all samples in D belong to the same class
        node.v = dataset[D[0], n - 1]
        return node
    if isBlankA(A) or isSameAinD(D, A):  # condition (2): A is empty, or D is identical on every remaining attribute
        node.v = mostCommonY(D)
        return node
    # Choose the attribute with the smallest Gini index (formula 4.6).
    giniGain = n  # initial upper bound; any real Gini-index value is smaller
    floatV = 0
    sameValue = []  # attributes tied at the current minimum, for random tie-breaking
    for i in range(len(A)):
        if A[i] > 0:
            curGini, divideV = giniIndex(D, i)  # formula 4.6
            if curGini <= giniGain:
                if curGini < giniGain:
                    sameValue = [i]
                    giniGain = curGini
                    floatV = divideV
                else:  # a tie: remember it and pick randomly afterwards
                    sameValue.append(i)
    p = sameValue[random.randint(0, len(sameValue) - 1)]
    if isSameValue(-1000, floatV, EPS):  # not a continuous split
        node.v = Attributes[p] + "=?"
        curSet = attributeSet[p]
        for i in curSet:
            Dv = []
            for j in range(len(D)):
                if dataset[D[j], p] == i:
                    Dv.append(D[j])
            if Dv == []:  # condition (3): no sample in D takes this value
                nextNode = Node(i)
                nextNode.v = mostCommonY(D)
                node.children.append(nextNode)
                # The book says to return here, but I think we should continue
                # with the remaining values.
            else:
                # newA = A[:] also works now that A is a plain list
                newA = copy.deepcopy(A)
                newA[p] = -1
                node.children.append(treeGenerate(Dv, newA, i))
    else:  # continuous split; floatV is the threshold (unused for watermelon 2.0)
        Dleft = []
        Dright = []
        node.v = Attributes[p] + "<=" + str(floatV) + "?"
        for i in range(len(D)):
            if dataset[D[i], p] <= floatV:
                Dleft.append(D[i])
            else:
                Dright.append(D[i])
        # A[:] must behave as an independent copy. This is where the bug was:
        # on a numpy array, A[:] is only a view, so A is converted to a list
        # up front (copy.deepcopy would also work).
        node.children.append(treeGenerate(Dleft, A[:], "yes"))
        node.children.append(treeGenerate(Dright, A[:], "no"))
    return node


def isSameY(D):
    curY = dataset[D[0], n - 1]
    for i in range(1, len(D)):
        if dataset[D[i], n - 1] != curY:
            return False
    return True


def isBlankA(A):
    for i in range(n):
        if A[i] > 0:
            return False
    return True


def isSameAinD(D, A):
    for i in range(n):
        if A[i] > 0:
            for j in range(1, len(D)):
                if not isSameValue(dataset[D[0], i], dataset[D[j], i], EPS):
                    return False
    return True


def isSameValue(v1, v2, EPS):
    # For continuous attributes this would be abs(v1 - v2) < EPS;
    # watermelon 2.0 only has discrete values, so plain equality suffices.
    return v1 == v2


def mostCommonY(D):
    res = dataset[D[0], n - 1]  # a class label, '是' or '否'
    maxC = 1
    count = {res: 1}
    for i in range(1, len(D)):
        curV = dataset[D[i], n - 1]
        if curV not in count:
            count[curV] = 1
        else:
            count[curV] += 1
        if count[curV] > maxC:
            maxC = count[curV]
            res = curV
    return res


def gini(D):  # formula 4.5: Gini(D) = 1 - sum_k p_k^2
    types = []
    count = {}
    for i in range(len(D)):
        curY = dataset[D[i], n - 1]
        if curY not in count:
            count[curY] = 1
            types.append(curY)
        else:
            count[curY] += 1
    ans = 1
    total = len(D)
    for i in range(len(types)):
        ans -= (count[types[i]] / total) * (count[types[i]] / total)
    return ans


def giniIndex(D, p):  # formula 4.6
    # The continuous-attribute branch from exercise 4.3 is removed here,
    # since watermelon 2.0 has no continuous attributes.
    types = []
    count = {}
    for i in range(len(D)):
        a = dataset[D[i], p]
        if a not in count:
            count[a] = [D[i]]
            types.append(a)
        else:
            count[a].append(D[i])
    res = 0
    for i in range(len(types)):
        # Note: formula 4.6 weights each branch by |Dv|/|D|; this sums the
        # branch Gini values unweighted, which is why ties such as 0.875 appear.
        res += gini(count[types[i]])
    divideV = -1000  # sentinel: this was a discrete split
    return res, divideV


myDecisionTreeRoot = treeGenerate(DD, AA, "root")


def countLeaf(root, deep):  # count the leaves and measure the depth, for plotting
    root.deep = deep
    res = 0
    if root.v == '是' or root.v == '否':  # leaf: v holds a class label
        res += 1
        return res, deep
    curdeep = deep
    for i in root.children:
        a, b = countLeaf(i, deep + 1)
        res += a
        if b > curdeep:
            curdeep = b
    return res, curdeep


cnt, deep = countLeaf(myDecisionTreeRoot, 0)


def giveLeafID(root, ID):  # number the leaves left to right, for plotting
    if root.v == '是' or root.v == '否':
        root.ID = ID
        ID += 1
        return ID
    for i in root.children:
        ID = giveLeafID(i, ID)
    return ID


giveLeafID(myDecisionTreeRoot, 0)

import matplotlib as mpl
import matplotlib.pyplot as plt

decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")


def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    plt.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction', xytext=centerPt,
                 textcoords='axes fraction', va="center", ha="center",
                 bbox=nodeType, arrowprops=arrow_args)


fig = plt.figure(1, facecolor='white')
mpl.rcParams[u'font.sans-serif'] = ['simhei']  # a Chinese font, so node labels render
mpl.rcParams['axes.unicode_minus'] = False


def dfsPlot(root):
    if root.ID == -1:  # internal node: place it above the mean x of its children
        childrenPx = []
        meanPx = 0
        for i in root.children:
            cur = dfsPlot(i)
            meanPx += cur
            childrenPx.append(cur)
        meanPx = meanPx / len(root.children)
        c = 0
        for i in root.children:
            nodetype = leafNode
            if i.ID < 0:
                nodetype = decisionNode
            plotNode(i.v, (childrenPx[c], 0.9 - i.deep * 0.8 / deep),
                     (meanPx, 0.9 - root.deep * 0.8 / deep), nodetype)
            plt.text((childrenPx[c] + meanPx) / 2,
                     (0.9 - i.deep * 0.8 / deep + 0.9 - root.deep * 0.8 / deep) / 2,
                     i.title)
            c += 1
        return meanPx
    else:  # leaf: spread leaves evenly along the x axis by their ID
        return 0.1 + root.ID * 0.8 / (cnt - 1)


rootX = dfsPlot(myDecisionTreeRoot)
plotNode(myDecisionTreeRoot.v, (rootX, 0.9), (rootX, 0.9), decisionNode)

testData = np.matrix(testData)


def treePredictSet(root, testSet):  # returns the 2x2 confusion matrix
    testM, testN = np.shape(testSet)
    confusionMatrix = np.zeros((2, 2))
    for i in range(testM):
        predictV = treePredictOne(root, testSet, i)
        trueV = testSet[i, testN - 1]
        # rows = true class ('否', '是'), columns = predicted class
        if predictV == trueV:
            if trueV == '否':
                confusionMatrix[0, 0] += 1
            else:
                confusionMatrix[1, 1] += 1
        else:
            if trueV == '否':
                confusionMatrix[0, 1] += 1
            else:
                confusionMatrix[1, 0] += 1
    return confusionMatrix


def treePredictOne(root, testSet, p):  # walk one sample down the tree; discrete attributes only
    while True:
        if root.children == []:  # leaf node (ID != -1)
            return root.v
        curAttribute = root.v
        for i in range(len(Attributes)):
            if Attributes[i] in curAttribute:  # substring match: node.v looks like '臍部=?'
                curAttribute = i
                break
        title = testSet[p, curAttribute]
        for i in root.children:
            if i.title == title:
                root = i
                break


print(treePredictSet(myDecisionTreeRoot, testData))
plt.show()
Best answer: [tree plot omitted]
Other 1: [tree plot omitted]
Other 2: [tree plot omitted]
Other 3: [tree plot omitted]
Other 4: [tree plot omitted] (this run also scores 100%, but its tree differs from the first one)
There are plenty of other valid combinations; I'll leave it at that.