1. 程式人生 > >程式設計實現C4.5演算法

程式設計實現C4.5演算法

以下是我根據資訊增益率原理實作的C4.5演算法。sklearn的決策樹採用最佳化後的CART演算法,並未提供C4.5,因此只能藉助其他框架或自行編寫程式來實現C4.5演算法。

# --*-- coding:utf-8 --*--

import numpy as np

# 計算資訊熵
def cal_ent(list):
    """Return the Shannon entropy (base 2) of the last column of ``list``.

    ``list`` is a sequence of sample rows; the final element of each row is
    treated as its label.  An empty data set yields ``0.0``.
    """
    total = len(list)

    # Tally how many rows carry each label.
    freq = {}
    for row in list:
        label = row[-1]
        freq[label] = freq.get(label, 0) + 1

    # Accumulate -p * log2(p) over the labels in first-seen order.
    ent = 0.0
    for count in freq.values():
        p = count / total
        ent += -p * np.log2(p)
    return ent


# 按照特定特徵分類資料集並輸出
def splitdataset(list, vec_order, val):
    """Return the rows whose column ``vec_order`` equals ``val``.

    The matching column is removed from every returned row, so each result
    row is one element shorter than its source row.  Rows that do not match
    are left out entirely; previously they contributed an empty list, which
    distorted the subset size and broke entropy computation on the result.

    Args:
        list: sequence of sample rows (each row an indexable sequence).
        vec_order: index of the column to test and drop.
        val: value the column must equal for a row to be kept.

    Returns:
        A new list of new row lists; the input is not modified.
    """
    return [
        [*row[:vec_order], *row[vec_order + 1:]]
        for row in list
        if row[vec_order] == val
    ]


# 抽取特徵函式

def get_feat(list):
    """Split ``list`` into its columns, wrapping every value in its own row.

    The result has one entry per column of the first row; entry ``c`` is a
    list of single-element lists ``[row[c]]``, one per sample.  Raises
    ``IndexError`` when ``list`` is empty, because the column count is read
    from the first row.
    """
    width = len(list[0])
    return [[[row[col]] for row in list] for col in range(width)]


# 抽取單一特徵函式
def get_feat0(list, i):
    """Return column ``i`` of ``list`` as a flat list, one value per row."""
    return [row[i] for row in list]


# 選取最好劃分特徵:
def get_bestfeature(list):
    """Return the index of the feature with the highest information gain ratio.

    Every column except the last is a candidate feature; the last column is
    the class label.  For each candidate the gain ratio is

        (H(labels) - sum_v |D_v| / |D| * H(labels of D_v)) / H(feature)

    where ``D_v`` is the subset of rows whose feature equals ``v``.

    Args:
        list: non-empty sequence of sample rows of equal length.

    Returns:
        The column index of the best feature.  ``0`` is returned when no
        candidate has a strictly positive gain ratio; ties keep the lowest
        index.
    """
    # cal_ent reads the last element of each row, and get_feat wraps every
    # value in its own one-element row, so this yields the entropy of each
    # column: entry i is the split information of feature i and the last
    # entry is the entropy of the class labels.
    ent_list = [cal_ent(column) for column in get_feat(list)]
    base_ent = ent_list[-1]
    total = len(list)

    max_rate = 0.0
    best_vate = 0
    for i in range(len(list[0]) - 1):
        split_info = ent_list[i]
        if split_info == 0:
            # A single-valued feature cannot separate anything and would make
            # the ratio a division by zero, so it is never selected.
            continue

        # Conditional entropy of the labels given feature i.
        f_score = 0.0
        for j in set(get_feat0(list, i)):
            # Count only real samples: an empty row carries no label, would
            # inflate the subset weight and cannot be fed to cal_ent.
            son_list = [row for row in splitdataset(list, i, j) if row]
            f_score += (len(son_list) / total) * cal_ent(son_list)

        rate = float((base_ent - f_score) / split_info)
        if max_rate < rate:
            max_rate = rate
            best_vate = i
    return best_vate


def _majority_label(labels):
    """Return the most frequent label; ties go to the label seen first."""
    counts = {}
    for label in labels:
        counts[label] = counts.get(label, 0) + 1
    return max(counts, key=counts.get)


def _c45_build(rows, columns):
    """Recursively build the subtree for ``rows``.

    ``columns[k]`` is the index, in the data set originally passed to
    ``c45``, of the feature stored at position ``k`` of every row.
    """
    labels = get_feat0(rows, -1)
    if len(set(labels)) == 1:
        # Pure subset: leaf carrying the class label.
        return {'node': labels[0], 'son_tree': {}}
    if not columns:
        # No feature left to split on: fall back to the majority class.
        return {'node': _majority_label(labels), 'son_tree': {}}

    # get_bestfeature answers relative to the current (already reduced) rows;
    # translate that position back to the original column index.
    best_feat = get_bestfeature(rows)
    tree = {'node': columns[best_feat], 'son_tree': {}}
    remaining = columns[:best_feat] + columns[best_feat + 1:]

    # One child per observed value of the chosen feature.  Each level drops a
    # column, so the recursion depth is bounded by the number of features.
    for value in set(get_feat0(rows, best_feat)):
        # Keep only non-empty rows: a row without data is not a sample.
        subset = [row for row in splitdataset(rows, best_feat, value) if row]
        tree['son_tree'][value] = _c45_build(subset, remaining)
    return tree


def c45(list):
    """Build a C4.5 decision tree from ``list``.

    Args:
        list: sequence of sample rows; the last element of each row is the
            class label and all preceding elements are discrete features.

    Returns:
        A nested dict ``{'node': ..., 'son_tree': {...}}``.

        * Leaf: ``son_tree`` is empty and ``node`` is the predicted label.
        * Internal node: ``node`` is the column index (in the original data
          set) of the feature tested, and ``son_tree`` maps each observed
          value of that feature to the subtree for the matching samples.

        An empty data set yields ``{'node': '', 'son_tree': {}}``.
    """
    if len(list) == 0:
        return {'node': '', 'son_tree': {}}
    return _c45_build(list, [*range(len(list[0]) - 1)])