ID3決策樹的生成
阿新 • • 發佈:2019-01-27
# coding=utf-8 import math ''' 決策樹模型,假設有三個條件 年齡,有三個選項 1 表示老年人 2 表示中年 人 3 表示青年人 工作,有兩個選項 1 表示有工作 2表示 沒有工作 房子,有兩個選項 2 表示有房子 2表示 沒有房子 信貸情況 1表示一般 2表示號 3表示非常好 輸出,有兩個選項 1 表示給貸款 2表示 不予貸款 要求: 依次獲得每個選項的資訊增益 ''' output = [] class Node: def __init__(self, desc, child_list_input=[], child_desc=[], child=[]): self.desc = desc self.child = [] self.child_list_input = child_list_input self.child_desc = child_desc def set_child(self, child): self.child = child def append_child(self, child): self.child.append(child) g_desc = [ {1: "老年人", 2: "中年人", 3: "青年人"}, {1: "有工作", 2: "沒工作"}, {1: "有房子", 2: "沒有房子"}, {1: "信貸一般", 2: "信貸好", 3: "信貸非常好"} ] g_columns = ["年齡", "工作", "房子", "信貸"] limit_low_shang = 0.2 sample_input = [[3, 2, 2, 1, 2], [3, 2, 2, 2, 2], [3, 1, 2, 2, 1], [3, 1, 1, 1, 1], [3, 2, 2, 1, 2], [2, 2, 2, 1, 2], [2, 2, 2, 2, 2], [2, 1, 1, 2, 1], [2, 2, 1, 3, 1], [2, 2, 1, 3, 1], [1, 2, 1, 3, 1], [1, 2, 1, 2, 1], [1, 1, 2, 2, 1], [1, 1, 2, 3, 1], [1, 2, 2, 1, 2]] # 計算 H(D) def get_proper_column_index(sample_input, desc=""): # 獲得輸出的經驗熵 if len(sample_input) == 0: return out_put_array = {} for i in sample_input: index = len(i) - 1 if i[index] in out_put_array: out_put_array[i[index]] = out_put_array[i[index]] + 1 else: out_put_array[i[index]] = 1 N = len(sample_input) for i in out_put_array.keys(): sample = out_put_array[i] out_put_array[i] = {} pi = sample / N out_put_array[i]['pi'] = pi out_put_array[i]['log'] = math.log(pi, 2) out_put_array[i]['count'] = sample HD = 0 for i in out_put_array.keys(): HD += 0 - (out_put_array[i]['pi'] * out_put_array[i]['log']) columns_num = len(sample_input[0]) max_shang = 0 current_index = 0 for i in range(0, columns_num - 1): result = getTezhengX(sample_input, i, N, HD) if result > max_shang: max_shang = result current_index = i global limit_low_shang if max_shang < limit_low_shang: #print("未能成功分類:" + desc) output.append(desc) # 返回葉子節點 return Node(desc, []) # 根據 current位置的資訊進行樣本分割 result = {} global g_desc for item in sample_input: value = item[current_index] if value in result.keys(): result[value]['item'].append(item) else: result[value] = {} result[value]['item'] = [] result[value]['desc'] = g_desc[current_index][value] result[value]['item'].append(item) data_list = [] desc_list = [] for item in result.keys(): data_list.append(result[item]['item']) desc_list.append(result[item]['desc']) # get_proper_column_index(result[item]['item'], result[item]['desc']) return Node(g_columns[current_index], data_list, desc_list) # 接下來計算年齡特徵的資訊增益 def getTezhengX(sample_input, index, N, HD): result = {} for i in sample_input: key = i[index] if key not in result.keys(): result[key] = {} result[key]['count'] = 0 # 行元素最後一個是輸出 out = i[len(i) - 1] if out in result[key].keys(): result[key][out] = result[key][out] + 1 else: result[key][out] = 1 result[key]['count'] = result[key]['count'] + 1 all_sum = 0 # result.keys 裡面的值是 第i列的不同的值得陣列 for key in result.keys(): # 當前輸出類的概率是 item = result[key] # 去掉count元素 sum_count = item['count'] result[key].pop('count') item = result[key] # 特徵X 在所有樣本中的概率 sum_value = sum_count / N tmp_sum = 0 for tmp in item.keys(): pi = item[tmp] / sum_count log = math.log(pi, 2) tmp_sum = tmp_sum + pi * log sum_value = sum_value * (0 - tmp_sum) all_sum = all_sum + sum_value return HD - all_sum def proc_Node(node): if len(node.child_list_input) == 0: # 說明是葉子節點 return node else: node_list = [] for k, item in enumerate(node.child_list_input): # print(node.child_list_input[k]) tmp = get_proper_column_index(item, node.child_desc[k]) current_node = proc_Node(tmp) node_list.append(current_node) node.child = node_list return node def bianli_node(node, depth=0, parent=""): if depth == 0: print("根節點:"+node.desc) else: print("父節點:" + parent + " 當前節點:" + node.desc + " 深度:" + str(depth)) for i, item in enumerate(node.child_list_input): bianli_node(node.child[i], depth + 1, node.desc) root = get_proper_column_index(sample_input) root = proc_Node(root) bianli_node(root)