MachineLearning: CART Classification and Regression Tree in Python
阿新 • Published: 2019-01-06
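CART grows a binary tree by picking, at every node, the split that minimizes the weighted Gini index Gini(D) = 1 - Σ_k p_k², where p_k is the proportion of samples in class k. As a quick worked example, a node holding 3 positive and 1 negative sample has Gini = 1 - (3/4)² - (1/4)² = 0.375, while a pure node has Gini = 0; this is exactly what the calGini function below computes.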
# -*- coding: utf-8 -*-
import operator
import re
import itertools
import pandas as pd


def calGini(dataSet):
    """Gini index of a data set: 1 minus the sum of squared class proportions."""
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    gini = 1.0
    for label in labelCounts:
        prop = float(labelCounts[label]) / numEntries
        gini -= prop * prop
    return gini


# Given the list of values a feature takes, return every way of splitting
# those values into two groups.
def featuresplit(features):
    count = len(features)   # number of distinct feature values
    if count < 2:           # only a single value, e.g. 'cold_blood'
        print("please check sample's features: only one feature value")
        return (features,)  # wrap the list in a tuple
    # Each branch needs at least one value, so enumerate all combinations of
    # size 1 .. count-1; itertools.combinations(seq, k) yields every
    # k-element combination of seq.
    combinationsList = []
    for i in range(1, count):
        combinationsList.extend(itertools.combinations(features, i))
    combiLen = len(combinationsList)
    # The combinations come out in a symmetric order, so pairing the first
    # half with the reversed second half gives each binary split exactly once;
    # zip pairs up elements at matching positions.
    resList = list(zip(combinationsList[0:combiLen // 2],
                       combinationsList[combiLen - 1:combiLen // 2 - 1:-1]))
    return resList   # all binary partitions of the feature values


# Binary split over discrete feature values:
# def splitDataSet(dataSet, axis, values):
#     retDataSet = []
#     for featVec in dataSet:
#         for value in values:
#             if featVec[axis] == value:
#                 reducedFeatVec = featVec[:axis]   # drop the chosen feature column
#                 reducedFeatVec.extend(featVec[axis + 1:])
#                 retDataSet.append(reducedFeatVec)
#     return retDataSet   # keep the samples whose feature equals one of the values

# Variant that lets a multi-valued feature be reused on deeper branches,
# e.g. the 'cover' feature:
# def splitDataSet(dataSet, axis, values):
#     retDataSet = []
#     if len(values) < 2:   # this branch holds a single feature value
#         for featVec in dataSet:
#             if featVec[axis] == values[0]:   # one value left: drop the feature column
#                 reducedFeatVec = featVec[:axis]
#                 reducedFeatVec.extend(featVec[axis + 1:])
#                 retDataSet.append(reducedFeatVec)
#     else:
#         for featVec in dataSet:
#             for value in values:
#                 if featVec[axis] == value:   # several values: keep the feature column
#                     retDataSet.append(featVec)
#     return retDataSet

# Continuous features: keep the samples on one side of the threshold.
def splitDataSet(dataSet, axis, value, threshold):
    retDataSet = []
    if threshold == 'lt':
        for featVec in dataSet:
            if featVec[axis] <= value:
                retDataSet.append(featVec)
    else:
        for featVec in dataSet:
            if featVec[axis] > value:
                retDataSet.append(featVec)
    return retDataSet


# Discrete version: return the best feature and the binary split of its values.
"""
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1
    bestGiniGain = 1.0
    bestFeature = -1
    bestBinarySplit = ()
    for i in range(numFeatures):                          # iterate over features
        featList = [example[i] for example in dataSet]    # this feature's column
        uniqueVals = list(set(featList))                  # distinct feature values
        # The binary splits of three feature values look like:
        # [(('young',), ('old', 'middle')), (('old',), ('young', 'middle')),
        #  (('middle',), ('young', 'old'))]
        for split in featuresplit(uniqueVals):            # every binary split of this feature
            GiniGain = 0.0
            if len(split) == 1:   # a lone value such as 'cold_blood' cannot be split further
                continue
            (left, right) = split
            # Weighted Gini index of each candidate split:
            left_subDataSet = splitDataSet(dataSet, i, left)
            left_prob = len(left_subDataSet) / float(len(dataSet))
            GiniGain += left_prob * calGini(left_subDataSet)
            right_subDataSet = splitDataSet(dataSet, i, right)
            right_prob = len(right_subDataSet) / float(len(dataSet))
            GiniGain += right_prob * calGini(right_subDataSet)
            if GiniGain <= bestGiniGain:   # keep the best split so far
                bestGiniGain = GiniGain    # record the best result and feature
                bestFeature = i
                bestBinarySplit = (left, right)
    return bestFeature, bestBinarySplit
"""
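As a quick sanity check, calling featuresplit on the three values from the comment above reproduces the listed partitions:

for left, right in featuresplit(['young', 'old', 'middle']):
    print(left, right)
# ('young',) ('old', 'middle')
# ('old',) ('young', 'middle')
# ('middle',) ('young', 'old')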
# Continuous version: return the best feature and the best threshold value.
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1
    bestGiniGain = 1.0
    bestFeature = -1
    bestValue = ""
    for i in range(numFeatures):                          # iterate over features
        featList = [example[i] for example in dataSet]    # this feature's column
        uniqueVals = list(set(featList))                  # distinct values of the feature
        uniqueVals.sort()
        for value in uniqueVals:                          # try every value as a threshold
            GiniGain = 0.0
            # weighted Gini index of the left branch
            left_subDataSet = splitDataSet(dataSet, i, value, 'lt')
            left_prob = len(left_subDataSet) / float(len(dataSet))
            GiniGain += left_prob * calGini(left_subDataSet)
            # weighted Gini index of the right branch
            right_subDataSet = splitDataSet(dataSet, i, value, 'gt')
            right_prob = len(right_subDataSet) / float(len(dataSet))
            GiniGain += right_prob * calGini(right_subDataSet)
            if GiniGain < bestGiniGain:   # keep the best split so far
                bestGiniGain = GiniGain   # record the best result and feature
                bestFeature = i
                bestValue = value
    return bestFeature, bestValue


def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]   # the majority label


# Discrete version of tree building:
"""
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]       # all labels identical, nothing left to split
    if len(dataSet) == 1:         # no usable features left: majority vote
        return majorityCnt(classList)
    bestFeat, bestBinarySplit = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    if bestFeat == -1:
        return majorityCnt(classList)
    myTree = {bestFeatLabel: {}}
    for value in bestBinarySplit:
        subLabels = labels[:]     # copy so other callers' label list is untouched
        if len(value) < 2:
            del(subLabels[bestFeat])
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
"""


# Continuous version; labels holds the feature names.
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]       # all labels identical, nothing left to split
    if len(dataSet) == 1:         # no usable features left: majority vote
        return majorityCnt(classList)
    bestFeat, bestValue = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    if bestFeat == -1:
        return majorityCnt(classList)
    myTree = {bestFeatLabel: {}}
    subLabels = labels[:]
    threshold = str(round(float(bestValue), 3))
    myTree[bestFeatLabel][bestFeatLabel + '<=' + threshold] = createTree(
        splitDataSet(dataSet, bestFeat, bestValue, 'lt'), subLabels)
    myTree[bestFeatLabel][bestFeatLabel + '>' + threshold] = createTree(
        splitDataSet(dataSet, bestFeat, bestValue, 'gt'), subLabels)
    return myTree   # perfect, no problems!!!
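Before pointing the script at a real CSV, a tiny made-up dataset shows the nested-dict shape that createTree returns (the numbers here are invented purely for illustration):

# Hypothetical toy data: two numeric features plus a class label.
toy_data = [[1.0, 2.5, 'no'],
            [1.5, 3.0, 'no'],
            [3.2, 1.1, 'yes'],
            [3.8, 0.9, 'yes']]
toy_labels = ['f1', 'f2']
print(createTree(toy_data, toy_labels))
# {'f1': {'f1<=1.5': 'no', 'f1>1.5': 'yes'}}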
#### Testing classification
# In the tree, a continuous feature's branch keys have the form
# feature<=value, so a regular expression is used to split the key back
# into the feature name and the threshold (other approaches work too).
def classify(inputTree, featLabels, testVec):
    firstStr = list(list(inputTree.values())[0].keys())   # branch keys of the best feature
    # continuous values: make sure the '<=' key comes first
    if '<=' not in firstStr[0]:
        firstStr.reverse()
    featvalue = float(re.compile("(<=.+)").search(firstStr[0]).group()[2:])   # e.g. the 97 in a key like money<=97
    featkey = re.compile("(.+<=)").search(firstStr[0]).group()[:-2]           # e.g. the money in money<=97
    featIndex = featLabels.index(featkey)   # position of this feature in the label list
    if testVec[featIndex] <= featvalue:     # compare the test sample's value at that position
        secondDict = list(inputTree.values())[0][firstStr[0]]
    else:
        secondDict = list(inputTree.values())[0][firstStr[1]]
    if isinstance(secondDict, dict):
        classLabel = classify(secondDict, featLabels, testVec)   # recurse down the branch
    else:
        classLabel = secondDict
    # else: purely discrete features are not handled here
    return classLabel   # the predicted label


def testing(myTree, data_test, labels):
    error = 0.0
    for i in range(len(data_test)):   # test the samples one by one
        if classify(myTree, labels, data_test[i]) != data_test[i][-1]:   # prediction differs from the true label
            error += 1
    # prints the accuracy on the test set; note the %f, since accuracy is a
    # float (%d, used earlier, truncated it to an integer)
    print('myTree %f' % ((len(data_test) - error) / len(data_test)))
    return None


df = pd.read_csv('C:/Users/test_5.csv')
data = df.values[:280, 1:].tolist()        # rows from 0, columns from 1: feature values plus the class label
data_full = data[:]
data_test = df.values[280:, 1:].tolist()   # the remaining rows form the test set
labels = df.columns.values[1:-1].tolist()  # feature names: color root knocks texture navel touch; 400 samples in total
labels_full = labels[:]
myTree = createTree(data, labels)
testing(myTree, data_test, labels_full)

import treePlotter
treePlotter.createPlot(myTree)
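The toy tree from the earlier sketch can also be used to exercise classify directly; the regular expressions pull 'f1' and 1.5 back out of the 'f1<=1.5' branch key:

# Minimal usage sketch, reusing the hypothetical toy tree from above.
toy_tree = {'f1': {'f1<=1.5': 'no', 'f1>1.5': 'yes'}}
print(classify(toy_tree, ['f1', 'f2'], [1.2, 2.0]))   # no   (1.2 <= 1.5)
print(classify(toy_tree, ['f1', 'f2'], [4.0, 0.5]))   # yes  (4.0 >  1.5)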