Decision Trees (work in progress)

ID3 and C4.5
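
The listing below follows the ID3 recipe, except that features are scored by the C4.5 gain ratio: the information gain of a split divided by the intrinsic value (IV) of the feature, which is simply the entropy of that feature's own value distribution. As a quick worked check of calcShannonEnt on the sample loan data below: the class column contains 9 'yes' and 6 'no' entries, so its entropy is -(9/15)*log2(9/15) - (6/15)*log2(6/15) ≈ 0.971, which is the baseEntropy every candidate split is compared against.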

from math import log
import operator

# id selects the column whose values are used to compute the entropy (use -1 for the class column)
def calcShannonEnt(dataSet, id):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[id]  # value of the chosen column for this sample
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0
    for key in labelCounts:
        prob = float(labelCounts[key])/numEntries
        shannonEnt -= prob*log(prob,2)
    return shannonEnt
    

def createDataSet():
    dataSet=[[0, 0, 0, 0, 'no'],
            [0, 0, 0, 1, 'no'],
            [0, 1, 0, 1, 'yes'],
            [0, 1, 1, 0, 'yes'],
            [0, 0, 0, 0, 'no'],
            [1, 0, 0, 0, 'no'],
            [1, 0, 0, 1, 'no'],
            [1, 1, 1, 1, 'yes'],
            [1, 0, 1, 2, 'yes'],
            [1, 0, 1, 2, 'yes'],
            [2, 0, 1, 2, 'yes'],
            [2, 0, 1, 1, 'yes'],
            [2, 1, 0, 1, 'yes'],
            [2, 1, 0, 2, 'yes'],
            [2, 0, 0, 0, 'no']]
    # feature labels
    labels=['年齡','有工作','有自己的房子','信貸情況']
    return dataSet, labels


# Return the subset of samples whose feature at position axis equals value (with that feature column removed)
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet


def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0])-1
    baseEntropy = calcShannonEnt(dataSet,-1)
    bestInfoGain = 0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        newEntropy = 0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet)/float(len(dataSet))
            newEntropy += prob*calcShannonEnt(subDataSet,-1)
        infoGain = baseEntropy - newEntropy  # information gain of this split
        IV = calcShannonEnt(dataSet,i)  # intrinsic value of feature i
        infoGain = infoGain/IV  # gain ratio (C4.5)
        print(i,value,infoGain," ",bestInfoGain)
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i  # remember the best feature
            
    print("bestGain = ",bestInfoGain)
    print("\n")
    return bestFeature


# Majority vote: count the classes and return the most common one
def majorityCnt(classList):
    classCount = {}
    for vote in classList:                                                 
        if vote not in classCount.keys():
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    # all samples belong to the same class
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    if len(dataSet[0])==1:  # no features left: fall back to majority vote
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)  # pick the best feature
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel:{}}  # store the tree as a nested dict
    del(labels[bestFeat])  # remove the used feature label
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree


if __name__=='__main__':
    dataSet, labels = createDataSet()
    print(createTree(dataSet,labels))
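
The script only prints the nested dict that createTree builds. As a minimal sketch of how such a tree could be used for prediction (the classify helper below is an assumption, not part of the original code), a recursive lookup against the original feature labels works:

def classify(inputTree, featLabels, testVec):
    firstStr = next(iter(inputTree))           # feature label stored at this node
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)     # column of that feature in the original data
    for key in secondDict:
        if testVec[featIndex] == key:
            if isinstance(secondDict[key], dict):
                return classify(secondDict[key], featLabels, testVec)
            return secondDict[key]             # leaf node: the predicted class
    return None                                # feature value never seen during training

dataSet, labels = createDataSet()
featLabels = labels[:]                         # keep an unchanged copy; createTree mutates labels
tree = createTree(dataSet, labels)
print(classify(tree, featLabels, [0, 1, 0, 1]))  # row 3 of the training data, so 'yes' is expected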
 

CART Regression Tree
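
In the regression tree below, each leaf stores the mean of the target values that reach it (regLeaf), and split quality is the total squared error of the target column (regErr, i.e. variance times sample count). chooseBestSplit greedily picks the (feature, value) pair that minimizes the summed error of the two halves, and ops=(tolS, tolN) controls pre-pruning: the minimum error reduction and the minimum leaf size.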

# -*- coding: utf-8 -*-
"""
Created on Wed Jan  6 13:54:43 2021

@author: koneko
"""

import numpy as np

def loadDataSet(fileName):
    dataMat = []
    fr = open(fileName)
    for line in fr.readlines():
        curLine = line.strip().split('\t')
        fitLine = list(map(float, curLine))  # in Python 3, map returns an iterator, so convert it to a list
        dataMat.append(fitLine)
    return dataMat

def binSplitDataSet(dataSet, feature, value):
    mat0 = dataSet[np.nonzero(dataSet[:,feature]>value)[0],:]
    mat1 = dataSet[np.nonzero(dataSet[:,feature]<=value)[0],:]
    return mat0,mat1

def regLeaf(dataSet):
    return np.mean(dataSet[:,-1])

def regErr(dataSet):
    return np.var(dataSet[:,-1])*np.shape(dataSet)[0]

def chooseBestSplit(dataSet, leafType=regLeaf,errType=regErr,ops=(1,4)):
    tolS = ops[0]  # minimum error reduction required to keep a split
    tolN = ops[1]  # minimum number of samples allowed in a split
    # if all target values are identical there is nothing to split: return a leaf
    if len(set(dataSet[:,-1].T.tolist()[0])) == 1:
        return None, leafType(dataSet)
    m,n = np.shape(dataSet)
    S = errType(dataSet)
    bestS = np.inf
    bestIndex = 0
    bestValue = 0
    # try every feature
    for featIndex in range(n-1):
        # try every value this feature takes in the data
        for splitVal in set(dataSet[:,featIndex].T.A.tolist()[0]):  # note: written this way for Python 3 compatibility
            # binary split on this value
            mat0, mat1 = binSplitDataSet(dataSet,featIndex,splitVal)
            # skip this split if either branch ends up with fewer than tolN samples
            if (np.shape(mat0)[0]<tolN) or (np.shape(mat1)[0]<tolN):
                continue
            # total error of the data after this split
            newS = errType(mat0) + errType(mat1)
            # keep the split with the smallest error so far
            if newS < bestS:
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    # give up if the error reduction is smaller than tolS
    if (S - bestS) < tolS:
        return None, leafType(dataSet)
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    # give up if either resulting subset is too small
    if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN):
        return None, leafType(dataSet)
    return bestIndex, bestValue

def createTree(dataSet,leafType=regLeaf,errType=regErr,ops=(1,4)):
    feat, val = chooseBestSplit(dataSet,leafType,errType,ops)
    if feat is None:  # a stopping condition was met: return a leaf value
        return val
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = createTree(lSet,leafType,errType,ops)
    retTree['right'] = createTree(rSet,leafType,errType,ops)
    return retTree
    
myData = loadDataSet('ex00.txt')
myMat = np.mat(myData)
tree = createTree(myMat)
print(tree)
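
To use the fitted regression tree on a new input, one can descend it recursively. The helper below is a minimal sketch, not part of the original code, and it assumes ex00.txt has its single input feature in column 0 (as the call above implies):

def regTreePredict(tree, testVec):
    # leaves are plain floats; internal nodes are dicts with 'spInd' and 'spVal'
    if not isinstance(tree, dict):
        return float(tree)
    if testVec[tree['spInd']] > tree['spVal']:   # binSplitDataSet sends the ">" side to 'left'
        return regTreePredict(tree['left'], testVec)
    return regTreePredict(tree['right'], testVec)

print(regTreePredict(tree, [0.5]))  # mean target value of the leaf that x = 0.5 falls into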

CART Classification Tree
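
Here split quality is measured with the Gini index instead of entropy, and every split is binary: one branch takes a single feature value, the other takes all remaining values. As a worked check of calcGini on the loan data actually used below (createDataSet2): it has 9 '同意' and 6 '不同意' samples, so its Gini index is 1 - (9/15)^2 - (6/15)^2 = 0.48.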

from math import log
import operator

def createDataSet():
    """
    建立測試的資料集
    :return:
    """
    dataSet = [
        # 1
        ['青綠', '蜷縮', '濁響', '清晰', '凹陷', '硬滑', '好瓜'],
        # 2
        ['烏黑', '蜷縮', '沉悶', '清晰', '凹陷', '硬滑', '好瓜'],
        # 3
        ['烏黑', '蜷縮', '濁響', '清晰', '凹陷', '硬滑', '好瓜'],
        # 4
        ['青綠', '蜷縮', '沉悶', '清晰', '凹陷', '硬滑', '好瓜'],
        # 5
        ['淺白', '蜷縮', '濁響', '清晰', '凹陷', '硬滑', '好瓜'],
        # 6
        ['青綠', '稍蜷', '濁響', '清晰', '稍凹', '軟粘', '好瓜'],
        # 7
        ['烏黑', '稍蜷', '濁響', '稍糊', '稍凹', '軟粘', '好瓜'],
        # 8
        ['烏黑', '稍蜷', '濁響', '清晰', '稍凹', '硬滑', '好瓜'],

        # ----------------------------------------------------
        # 9
        ['烏黑', '稍蜷', '沉悶', '稍糊', '稍凹', '硬滑', '壞瓜'],
        # 10
        ['青綠', '硬挺', '清脆', '清晰', '平坦', '軟粘', '壞瓜'],
        # 11
        ['淺白', '硬挺', '清脆', '模糊', '平坦', '硬滑', '壞瓜'],
        # 12
        ['淺白', '蜷縮', '濁響', '模糊', '平坦', '軟粘', '壞瓜'],
        # 13
        ['青綠', '稍蜷', '濁響', '稍糊', '凹陷', '硬滑', '壞瓜'],
        # 14
        ['淺白', '稍蜷', '沉悶', '稍糊', '凹陷', '硬滑', '壞瓜'],
        # 15
        ['烏黑', '稍蜷', '濁響', '清晰', '稍凹', '軟粘', '壞瓜'],
        # 16
        ['淺白', '蜷縮', '濁響', '模糊', '平坦', '硬滑', '壞瓜'],
        # 17
        ['青綠', '蜷縮', '沉悶', '稍糊', '稍凹', '硬滑', '壞瓜']
    ]

    # feature labels
    labels = ['色澤', '根蒂', '敲擊', '紋理', '臍部', '觸感']
    return dataSet,labels

def calcGini(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1] # last column holds the class label
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1  # count samples per class
    Gini = 1
    for key in labelCounts:
        p = float(labelCounts[key])/numEntries
        Gini -= p*p
    return Gini


def createDataSet1():    # a small example dataset
    dataSet = [['長', '粗', '男'],
               ['短', '粗', '男'],
               ['短', '粗', '男'],
               ['長', '細', '女'],
               ['短', '細', '女'],
               ['短', '粗', '女'],
               ['長', '粗', '女'],
               ['長', '粗', '女']]
    labels = ['頭髮','聲音']  # two features
    return dataSet,labels

def createDataSet2():
    """
    Build the example dataset.
    @return dataSet, labels: data set and feature labels
    """
    # data set
    dataSet = [['青年', '否', '否', '一般', '不同意'],
               ['青年', '否', '否', '好', '不同意'],
               ['青年', '是', '否', '好', '同意'],
               ['青年', '是', '是', '一般', '同意'],
               ['青年', '否', '否', '一般', '不同意'],
               ['中年', '否', '否', '一般', '不同意'],
               ['中年', '否', '否', '好', '不同意'],
               ['中年', '是', '是', '好', '同意'],
               ['中年', '否', '是', '非常好', '同意'],
               ['中年', '否', '是', '非常好', '同意'],
               ['老年', '否', '是', '非常好', '同意'],
               ['老年', '否', '是', '好', '同意'],
               ['老年', '是', '否', '好', '同意'],
               ['老年', '是', '否', '非常好', '同意'],
               ['老年', '否', '否', '一般', '不同意']]
    labels = ['年齡', '有工作', '有房子', '信貸情況']
    return dataSet,labels

# Split the data into two subsets according to whether the given feature equals value (the feature column itself is removed)
def binSplitDataSet(dataSet,index,value):
    set1=[]
    set2=[]
    for featVec in dataSet:
        reducedFeatVec = featVec[:index]
        reducedFeatVec.extend(featVec[index+1:])
        if featVec[index] == value:
            set1.append(reducedFeatVec)
        else:
            set2.append(reducedFeatVec)
    return set1,set2

          
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0])-1
    nD = len(dataSet)
    bestGini_feat = 100  # start above the largest possible Gini index (which is below 1)
    bestFeature = -1
    for feat in range(numFeatures):
        featvals = [example[feat] for example in dataSet]
        featvals = set(featvals)
        for val in featvals:
            set0,set1 = binSplitDataSet(dataSet,feat,val)
            # weighted Gini index of the binary split on (feat, val)
            newGini_feat = (len(set0)/float(nD)) * calcGini(set0)
            newGini_feat += (len(set1)/float(nD)) * calcGini(set1)
            if newGini_feat < bestGini_feat:
                bestGini_feat = newGini_feat
                bestFeature = feat
                bestVal = val
    return bestFeature,bestVal
                 

def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(),key=operator.itemgetter(1),reverse=True)
    return sortedClassCount[0][0]
      

def createTree(dataSet, labels):
    classList = [a[-1] for a in dataSet]
    # all samples share one class
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # no features left to split on: fall back to majority vote
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    # choose the best feature and split value
    bestFeat,bestVal = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel:{}} # store the tree as a nested dict
    del(labels[bestFeat])  # remove the used feature label
    mat0,mat1 = binSplitDataSet(dataSet,bestFeat,bestVal)
    left = bestVal  # left branch: samples whose feature equals bestVal
    right = set([a[bestFeat] for a in dataSet])
    right.remove(bestVal)
    right = tuple(right)  # right branch: all remaining values, keyed as a tuple
    print(right)  # debug: show the values grouped into the right branch
    subLabels = labels[:]
    myTree[bestFeatLabel][left] = createTree(mat0,subLabels)
    myTree[bestFeatLabel][right] = createTree(mat1,subLabels)
    
    return myTree
    
dataSet, labels = createDataSet2()

myTree = createTree(dataSet,labels)
print(myTree)
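
As with the ID3 tree, the nested dict can be walked to classify a new sample; the only extra case is the tuple key used for the "everything else" branch. The sketch below is an assumption, not part of the original code:

def classifyCART(tree, featLabels, testVec):
    featLabel = next(iter(tree))               # feature used at this node
    featIndex = featLabels.index(featLabel)    # its column in the original data
    for key, subtree in tree[featLabel].items():
        # left-branch keys are single values; right-branch keys are tuples of the remaining values
        hit = testVec[featIndex] in key if isinstance(key, tuple) else testVec[featIndex] == key
        if hit:
            return classifyCART(subtree, featLabels, testVec) if isinstance(subtree, dict) else subtree
    return None

_, featLabels = createDataSet2()               # fresh labels: createTree mutated the original list
print(classifyCART(myTree, featLabels, ['青年', '是', '否', '好']))  # row 3 of the data, so '同意' is expected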