程式設計實現C4.5演算法
阿新 • • 發佈:2019-01-02
以下是我根據資訊增益率(gain ratio)原理實現的C4.5演算法。在分類演算法中,sklearn的決策樹只實現了優化版的CART演算法,並未提供C4.5,因此只能藉助其他框架或自行編寫程式來實現C4.5演算法。
# --*-- coding:utf-8 --*--
# C4.5-style decision tree for categorical data, using the information gain
# ratio as the split criterion.
#
# A dataset is a sequence of rows.  Each row is a sequence whose last element
# is the class label and whose other elements are feature values.  Labels and
# feature values must be hashable.
#
# NOTE: the dataset parameter of the public functions is named ``list`` for
# backward compatibility with existing callers; it shadows the builtin inside
# those functions, so the builtin ``list`` is deliberately never used there.
from collections import Counter

import numpy as np


def cal_ent(list):
    """Return the Shannon entropy (base 2) of the labels in a dataset.

    Args:
        list: Sequence of rows; the last element of each row is its label.

    Returns:
        The entropy of the label distribution, ``0.0`` for an empty dataset.
    """
    total = len(list)
    if total == 0:
        return 0.0
    label_counts = Counter(row[-1] for row in list)
    ent = 0.0
    for count in label_counts.values():
        prob = count / total
        ent += -prob * np.log2(prob)
    return ent


def splitdataset(list, vec_order, val):
    """Select the rows whose feature equals ``val`` and drop that feature.

    Args:
        list: Sequence of rows.
        vec_order: Column index of the feature to split on.
        val: Feature value a row must have to be kept.

    Returns:
        A new list of lists: the matching rows with column ``vec_order``
        removed.
    """
    return [
        [*row[:vec_order], *row[vec_order + 1:]]
        for row in list
        if row[vec_order] == val
    ]


def get_feat(list):
    """Return every column of the dataset as its own one-column dataset.

    Args:
        list: Sequence of equally long rows.

    Returns:
        One entry per column; each entry is a list of single-element lists,
        so it can be passed straight to ``cal_ent``.  Empty input gives ``[]``.
    """
    if not list:
        return []
    return [[[row[col]] for row in list] for col in range(len(list[0]))]


def get_feat0(list, i):
    """Return the values of column ``i`` as a flat list."""
    return [row[i] for row in list]


def get_bestfeature(list):
    """Return the index of the feature with the highest gain ratio.

    Gain ratio is the information gain of a split divided by the entropy of
    the feature's own value distribution (the split information).

    Args:
        list: Sequence of rows; the last element of each row is its label.

    Returns:
        The column index of the best feature.  ``0`` is returned when the
        dataset is empty, has no feature columns, or no feature achieves a
        positive gain ratio.
    """
    rows = list
    if not rows:
        return 0

    total = len(rows)
    base_ent = cal_ent(rows)
    best_rate = 0.0
    best_index = 0
    for index in range(len(rows[0]) - 1):
        column = get_feat0(rows, index)
        split_info = cal_ent([[value] for value in column])
        if split_info == 0:
            # A single-valued feature cannot separate anything and its gain
            # ratio would be 0/0, so it is skipped.
            continue
        # dict.fromkeys de-duplicates while keeping first-seen order, which
        # keeps the floating-point summation order deterministic.
        cond_ent = 0.0
        for value in dict.fromkeys(column):
            subset = splitdataset(rows, index, value)
            cond_ent += (len(subset) / total) * cal_ent(subset)
        rate = float((base_ent - cond_ent) / split_info)
        if rate > best_rate:
            best_rate = rate
            best_index = index
    return best_index


def _build_tree(rows, feature_ids):
    """Recursively build the tree for ``rows``.

    ``feature_ids[k]`` is the original column index of column ``k`` of
    ``rows``; it is needed because ``splitdataset`` removes the column that
    was split on, shifting the positions of the remaining ones.
    """
    tree = {'node': '', 'son_tree': {}}
    labels = get_feat0(rows, -1)

    if len(set(labels)) == 1:
        # Pure node: every sample has the same class.
        tree['node'] = labels[0]
        return tree

    if not feature_ids:
        # No feature left to split on: fall back to the majority class.
        tree['node'] = Counter(labels).most_common(1)[0][0]
        return tree

    best = get_bestfeature(rows)
    tree['node'] = feature_ids[best]
    remaining = feature_ids[:best] + feature_ids[best + 1:]
    for value in dict.fromkeys(get_feat0(rows, best)):
        subset = splitdataset(rows, best, value)
        tree['son_tree'][value] = _build_tree(subset, remaining)
    return tree


def c45(list):
    """Build a decision tree using the gain ratio criterion.

    Args:
        list: Sequence of equally long rows; the last element of each row is
            its class label, the others are categorical feature values.

    Returns:
        A nested dict ``{'node': ..., 'son_tree': {...}}``.  For a leaf,
        ``node`` is the class label and ``son_tree`` is empty.  For an
        internal node, ``node`` is the column index (in the dataset passed to
        this function) of the feature that is split on, and ``son_tree`` maps
        each observed value of that feature to the subtree for that branch.
        An empty dataset yields ``{'node': '', 'son_tree': {}}``.
    """
    rows = list
    if not rows:
        return {'node': '', 'son_tree': {}}
    return _build_tree(rows, [*range(len(rows[0]) - 1)])