Machine Learning: Random Forest — A Python Implementation of the CART Model
By 阿新 · Published 2018-12-13
These notes record the process. A random forest is a classifier that trains on and predicts samples with an ensemble of decision trees; compared with a single decision tree, it averages out errors. The CART model it builds on:
- Binary decision tree: each node tests a feature and branches only on "yes" or "no";
- How the input features are split: a heuristic search. Suppose the splitting variable is the $j$-th input and the splitting point is a value $s$ of that input; the split then defines two regions:
  $$R_1(j,s)=\{x \mid x^{(j)} \le s\},\qquad R_2(j,s)=\{x \mid x^{(j)} > s\}$$
  The optimal pair $(j,s)$ is found by solving
  $$\min_{j,s}\Big[\min_{c_1}\sum_{x_i\in R_1(j,s)}(y_i-c_1)^2+\min_{c_2}\sum_{x_i\in R_2(j,s)}(y_i-c_2)^2\Big]$$
  where each $c_m$ can be taken as the region average
  $$\hat{c}_m=\frac{1}{N_m}\sum_{x_i\in R_m(j,s)}y_i,\quad m=1,2$$
  Traversing all input variables yields the optimal pair $(j,s)$, which splits the input space into two regions; repeating the procedure on each region until a stopping condition is met grows a regression tree.
- To grow a classification tree, CART selects the optimal feature with the Gini index (see the formula after this list);
- Selecting different sub-datasets of the training data yields different CART trees; the test set is passed through the whole collection of trees, and a majority vote gives the final result.
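
For completeness (this formula is the standard definition, not spelled out in the original post): for a group whose samples belong to class $k$ with proportion $p_k$, the Gini index is

$$\mathrm{Gini}=\sum_{k}p_k(1-p_k)=1-\sum_{k}p_k^2$$

A 50/50 two-class group scores $0.5$ and a pure group scores $0$; the `gini_index` function below sums this quantity over the two groups of a candidate split, so smaller values indicate a better split.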
The implementation below was found online and lightly modified; the dataset is UCI's Wine Data Set (link). It runs as-is.
Python code:

```python
#-*- coding: utf-8 -*-
# Random Forest Algorithm on the Wine dataset
from random import seed
from random import randrange
from csv import reader
from math import sqrt
from math import log

# Load a data file
def load_data(filename, ty):   # reads either a csv or a comma-separated txt file
    dataset = list()
    with open(filename, 'r') as file:
        if ty == 'csv':
            readers = reader(file)
            for row in readers:
                if not row:
                    continue
                dataset.append(row)
        else:   # txt file: one comma-separated sample per line
            while True:
                line = file.readline()
                if not line:
                    break
                p_tmp = [float(i) for i in line.split(',')]
                dataset.append(p_tmp)
    # The wine dataset keeps the class label in the first column;
    # move it to the last column so that row[-1] is always the label.
    # (The original slice data[1:len(data)-1] silently dropped the last feature.)
    sets = []
    for data in dataset:
        temp = data[1:] + [data[0]]
        sets.append(temp)
    return sets

# Convert a string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column])

# Convert the string class column to integers 0, 1, ...
def str_column_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = dict()   # map each distinct label to an integer code
    for i, value in enumerate(unique):
        lookup[value] = i
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    # Split dataset into n_folds folds of len(dataset) // n_folds rows each;
    # rows are drawn at random and each row is used at most once.
    dataset_split = list()
    dataset_copy = list(dataset)   # copy outside the loop, otherwise rows could recur across folds
    fold_size = len(dataset) // n_folds   # integer division ('/' would yield a float under Python 3)
    for i in range(n_folds):
        fold = list()   # start a fresh fold each iteration
        while len(fold) < fold_size:   # 'while', not 'if': keep drawing until the fold is full
            index = randrange(len(dataset_copy))
            # pop() removes the chosen row and returns it, so it cannot be drawn again
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split   # list of n_folds folds, used for cross-validation

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):   # compares actual and predicted labels
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

# Split a dataset based on a feature index and a feature value
def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right

# Calculate the Gini index for a split dataset (the purer the groups, the smaller the Gini)
def gini_index(groups, class_values):
    gini = 0.0
    for class_value in class_values:   # e.g. class_values = [0, 1, 2]
        for group in groups:   # groups = (left, right)
            size = len(group)
            if size == 0:
                continue
            proportion = [row[-1] for row in group].count(class_value) / float(size)
            gini += (proportion * (1.0 - proportion))
    return gini

# Select the best split point for a dataset: returns the best feature index b_index,
# the split value b_value, and the resulting groups b_groups = (left, right)
def get_split(dataset, n_features):
    class_values = list(set(row[-1] for row in dataset))
    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    features = list()
    # Draw n_features candidate feature indices at random (n_features ~ sqrt of the feature count).
    # Only these candidates are searched, not every feature, which keeps the trees diverse.
    while len(features) < n_features:
        index = randrange(len(dataset[0]) - 1)   # 0..n_inputs-1; the original randrange(1, len(dataset[0])) skipped feature 0 and could hit the label column
        if index not in features:
            features.append(index)
    for index in features:
        for row in dataset:
            # try every observed value row[index] of the feature as a split value
            groups = test_split(index, row[index], dataset)
            gini = gini_index(groups, class_values)
            if gini < b_score:   # keep the split with the lowest Gini cost
                b_index, b_value, b_score, b_groups = index, row[index], gini, groups
    return {'index': b_index, 'value': b_value, 'groups': b_groups}

# Create a terminal node value: the most frequent label in the group
def to_terminal(group):
    outcomes = [row[-1] for row in group]
    # with key=outcomes.count, max() returns the label with the highest count
    return max(set(outcomes), key=outcomes.count)

# Create child splits for a node or make it terminal; recurses until the tree is grown
# (typical settings: max_depth = 10, min_size = 1, n_features = int(sqrt(n_inputs)))
def split(node, max_depth, min_size, n_features, depth):
    left, right = node['groups']
    del(node['groups'])
    # check for a no split
    if not left or not right:
        node['left'] = node['right'] = to_terminal(left + right)
        return
    # check for max depth
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return
    # process left child
    if len(left) <= min_size:
        node['left'] = to_terminal(left)
    else:
        # node['left'] is itself a dict {'index':..., 'value':..., 'groups':...},
        # so the whole tree ends up as nested dictionaries
        node['left'] = get_split(left, n_features)
        split(node['left'], max_depth, min_size, n_features, depth + 1)   # depth+1 tracks recursion depth
    # process right child
    if len(right) <= min_size:
        node['right'] = to_terminal(right)
    else:
        node['right'] = get_split(right, n_features)
        split(node['right'], max_depth, min_size, n_features, depth + 1)

# Build a decision tree
def build_tree(train, max_depth, min_size, n_features):
    root = get_split(train, n_features)
    split(root, max_depth, min_size, n_features, 1)
    return root

# Make a prediction with a decision tree
def predict(node, row):
    if row[node['index']] < node['value']:
        # isinstance() checks the node type: an internal node is a dict, a leaf is a label
        if isinstance(node['left'], dict):
            return predict(node['left'], row)
        else:
            return node['left']
    else:
        if isinstance(node['right'], dict):
            return predict(node['right'], row)
        else:
            return node['right']

# Make a prediction with a list of bagged trees
def bagging_predict(trees, row):
    # every tree predicts the row, then a simple majority vote decides the class
    predictions = [predict(tree, row) for tree in trees]
    return max(set(predictions), key=predictions.count)

# Create a random subsample from the dataset with replacement (bootstrap sampling)
def subsample(dataset, ratio):
    sample = list()
    n_sample = round(len(dataset) * ratio)   # round() returns the nearest integer
    while len(sample) < n_sample:
        # sampling with replacement: some rows recur, others never appear,
        # which keeps each tree's training set different
        index = randrange(len(dataset))
        sample.append(dataset[index])
    return sample

# Random Forest Algorithm
def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features):
    trees = list()
    for i in range(n_trees):   # n_trees = number of trees in the forest
        sample = subsample(train, sample_size)   # bootstrap sampling keeps the trees' training sets distinct
        tree = build_tree(sample, max_depth, min_size, n_features)
        trees.append(tree)
    predictions = [bagging_predict(trees, row) for row in test]
    return predictions
```
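
A minimal driver in the same style, sketched here under the assumption that the Wine data is saved locally as `wine.data` (the file name and hyperparameter values are my assumptions, not from the original post):

```python
# Illustrative driver; 'wine.data' and the hyperparameters below are assumptions
seed(1)
dataset = load_data('wine.data', 'csv')
for i in range(len(dataset[0]) - 1):              # features now occupy columns 0..n-2
    str_column_to_float(dataset, i)
str_column_to_int(dataset, len(dataset[0]) - 1)   # class label sits in the last column
n_folds, max_depth, min_size, sample_size = 5, 10, 1, 1.0
n_features = int(sqrt(len(dataset[0]) - 1))
folds = cross_validation_split(dataset, n_folds)
scores = list()
for fold in folds:
    train_set = sum([f for f in folds if f is not fold], [])   # flatten the other folds
    test_set = [list(row) for row in fold]
    predicted = random_forest(train_set, test_set, max_depth, min_size, sample_size, 10, n_features)
    actual = [row[-1] for row in fold]
    scores.append(accuracy_metric(actual, predicted))
print('Scores: %s' % scores)
print('Mean accuracy: %.3f%%' % (sum(scores) / float(len(scores))))
```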