基於R的C4.5決策樹的建立

阿新 • • 發佈：2019-01-07

下邊的程式碼是一個簡單的C4.5決策樹建立過程。該決策樹不包含剪枝過程，並且屬性必須是離散的類別型變數（字元、整數或因子），不支援連續屬性值。
# Compute the Shannon entropy (base 2) of the class column.
#   data        - data frame holding the raw observations
#   class.index - column index of the class attribute
# Returns a single number; 0 for a pure data set and for a data set in which
# no class value occurs (e.g. zero rows).
entropy <- function(data, class.index) {
  class.counts <- as.vector(table(as.factor(data[, class.index])))

  # Keep only the classes that actually occur: 0 * log2(0) is taken to be 0,
  # and dropping empty classes also avoids 0 / 0 = NaN when `data` has no rows
  # but the class factor still carries its levels.
  class.counts <- class.counts[class.counts > 0]
  if (length(class.counts) == 0) {
    return(0)
  }

  class.freq.vec <- class.counts / sum(class.counts)
  -sum(class.freq.vec * log2(class.freq.vec))
}

# Weighted entropy contribution of one attribute value.
#   x           - the attribute value selecting the partition
#   data        - data frame holding the observations
#   attr.index  - column index of the attribute being evaluated
#   class.index - column index of the class attribute
# Returns (rows in partition / rows in data) * entropy of the partition.
subGain <- function(x, data, attr.index, class.index) {
  matching.rows <- which(x == data[, attr.index])
  partition <- data[matching.rows, ]
  partition.entropy <- entropy(partition, class.index)
  (nrow(partition) / nrow(data)) * partition.entropy
}

# Information gain of a single attribute: the entropy of the class column
# minus the sum of the weighted entropies of the partitions induced by each
# distinct value of the attribute.
#   data        - data frame holding the observations
#   attr.index  - column index of the attribute being evaluated
#   class.index - column index of the class attribute
gain <- function(data, attr.index, class.index) {
  distinct.values <- unique(data[, attr.index])
  weighted.parts <- lapply(distinct.values, function(value) {
    subGain(value, data = data, attr.index = attr.index, class.index = class.index)
  })
  entropy(data, class.index) - sum(unlist(weighted.parts))
}

# Split information of an attribute: the base-2 entropy of the distribution
# of the attribute's own values over the rows of `data`.
#   data        - data frame holding the observations
#   attr.index  - column index of the attribute being evaluated
#   class.index - accepted for a uniform signature; not used here
splitInfo <- function(data, attr.index, class.index) {
  attr.column <- data[, attr.index]
  row.count <- nrow(data)
  share.terms <- lapply(unique(attr.column), function(value) {
    share <- length(which(value == attr.column)) / row.count
    share * log2(share)
  })
  -sum(unlist(share.terms))
}

# Gain ratio of one attribute: information gain divided by split information.
#   x           - column index of the attribute being evaluated
#   data        - data frame holding the observations
#   class.index - column index of the class attribute
# When the split information is 0 the attribute takes a single value in
# `data`, so it carries no information and its ratio is defined as 0. The
# ratio never exceeds 1, so the previous fallback of 1 made such constant
# attributes win the best-attribute selection.
gainRation <- function(x, data, class.index) {
  split.info <- splitInfo(data, x, class.index)
  info.gain <- gain(data, x, class.index)
  # ifelse() is kept so that a missing split.info yields NA rather than an
  # error from `if`.
  ifelse(split.info == 0, 0, info.gain / split.info)
}

# Recursively build one node (class "TreeNode") of the decision tree.
#   data         - data frame with the observations that reach this node
#   tree.max.num - remaining depth budget; children become leaves when it is 0
#   tree.level   - depth of this node (the root is level 0)
#   class.index  - column index of the class attribute in `data`
#   node.index   - name of the attribute the parent split on (NA for the root)
#   node.val     - value of that attribute leading to this node (NA for the root)
# Returns a list of class "TreeNode". A leaf has split.index = NA.
# Stops with an error when the selected attribute is not character, integer
# or factor.
# NOTE(review): node.label is as.numeric() of a factor taken from table(), i.e.
# the position of the majority class among the sorted class values, not the
# class value itself; ties yield several labels. Kept as in the original.
createNode <- function(data, tree.max.num, tree.level, class.index, node.index = NA, node.val = NA) {
  attr.vec <- seq_len(ncol(data))[-class.index]
  gain.ration.vec <- unlist(lapply(attr.vec, gainRation, data = data, class.index = class.index))

  # index of the attribute with the highest gain ratio
  attr.best.index <- attr.vec[which(max(gain.ration.vec) == gain.ration.vec)]

  # select the first best attribute when there exist multiple best attributes
  if (length(attr.best.index) != 1) {
    attr.best.index <- attr.best.index[1]
  }

  # the tree only supports label (discrete) attributes
  attr.best.col <- data[, attr.best.index]
  if (!is.character(attr.best.col) && !is.integer(attr.best.col) && !is.factor(attr.best.col)) {
    stop("The values of attributes are characters, integer or factor")
  }

  # row positions belonging to each distinct value of the best attribute
  attr.best.val.vec <- unique(attr.best.col)
  result.list <- lapply(attr.best.val.vec, function(x) which(x == attr.best.col))

  # majority class label for this node
  class.matrix <- as.data.frame(table(data[, class.index]))
  class.label <- as.numeric(class.matrix[which(max(class.matrix[, 2]) == class.matrix[, 2]), 1])

  # create TreeNode object
  node <- list(
    split.vec = NA, split.index = NA, nodes.list = list(), split.num = 0,
    node.label = class.label, node.index = NA, node.val = NA,
    level = tree.level, node.num = 0
  )
  class(node) <- "TreeNode"

  # when the node is not the root node
  if (!is.na(node.val)) {
    node$node.val <- node.val
    node$node.index <- node.index
    node$node.num <- nrow(data)
  }

  # a pure node is a leaf
  if (length(unique(data[, class.index])) == 1) {
    return(node)
  }

  node$split.vec <- as.character(attr.best.val.vec)
  node$split.num <- length(attr.best.val.vec)
  node$split.index <- names(data)[attr.best.index]

  if (length(attr.vec) == 1 || tree.max.num == 0) {
    # Last attribute or depth budget exhausted: every child is a leaf labelled
    # with the majority class of its partition. The partition is taken from
    # the unmodified `data`, so the original class.index is the right column.
    for (i in seq_along(attr.best.val.vec)) {
      class.sub.matrix <- as.data.frame(table(data[result.list[[i]], class.index]))
      class.sub.label <- class.sub.matrix[which(class.sub.matrix[, 2] == max(class.sub.matrix[, 2])), 1]
      # split.index = NA denotes a leaf node
      node.new <- list(
        split.vec = NA, split.index = NA, nodes.list = list(), split.num = 0,
        node.label = as.numeric(class.sub.label), node.index = NA, node.val = NA,
        level = tree.level + 1, node.num = 0
      )
      class(node.new) <- "TreeNode"
      node.new$node.index <- node$split.index
      node.new$node.val <- node$split.vec[i]
      node.new$node.num <- length(result.list[[i]])
      node$nodes.list[[i]] <- node.new
    }
  } else {
    # Recurse on each partition with the split attribute removed. Removing a
    # column to the left of the class column shifts the class index by one.
    data.sub <- data[, -attr.best.index, drop = FALSE]
    class.index.sub <- if (attr.best.index < class.index) class.index - 1 else class.index
    for (i in seq_along(attr.best.val.vec)) {
      node.new <- createNode(
        data.sub[result.list[[i]], , drop = FALSE], tree.max.num - 1, tree.level + 1,
        class.index.sub, node$split.index, node$split.vec[i]
      )
      node$nodes.list[[i]] <- node.new
    }
  }
  return(node)
}

# Build a decision tree from `data`.
#   data         - data frame holding the observations
#   tree.max.num - maximum height of the tree; must be greater than 0
#   class.index  - column index of the class attribute
# Returns a list of class "Tree" whose `root` element is the node produced by
# createNode() at level 0.
createTree <- function(data, tree.max.num, class.index) {
  if (tree.max.num <= 0) {
    stop("The height of tree must be more than 0")
  }
  structure(
    list(root = createNode(data, tree.max.num, 0, class.index)),
    class = "Tree"
  )
}

# Predict using a tree.
#   tree.ins - presumably the tree object to predict with (unused)
#   x.ins    - presumably the observation(s) to classify (unused)
# NOTE: this is an unimplemented placeholder; the body is empty, so a call
# returns NULL.
# NOTE(review): the name `predict.tree` reads as an S3 method for class "tree"
# (lowercase) - confirm the intended class name before implementing.

predict.tree <- function(tree.ins, x.ins){

}

# S3 print method for "TreeNode" objects.
#   x   - a TreeNode (list with level, split.index, node.index, node.val,
#         node.label, node.num and nodes.list elements)
#   ... - accepted for compatibility with the print() generic; ignored
# Writes one line for the node, indented by three tabs per tree level, and
# then prints every child. A leaf (split.index is NA) line also shows the
# node label and the number of rows in the node. Returns `x` invisibly.
print.TreeNode <- function(x, ...) {
  for (i in seq_len(x$level)) {
    cat("\t\t\t")
  }

  if (is.na(x$split.index)) {
    # leaf node: the label lives in `node.label` (there is no `label` field)
    cat(x$node.index, "=", x$node.val, ": ", x$node.label, "(", x$node.num, ")\n")
  } else {
    cat(x$node.index, "=", x$node.val, "\n")
    for (i in seq_along(x$nodes.list)) {
      print(x$nodes.list[[i]])
    }
  }
  invisible(x)
}

# S3 print method for "Tree" objects.
#   x   - a Tree (list whose `root` element is the root TreeNode)
#   ... - accepted for compatibility with the print() generic; ignored
# Prints every child of the root node; the root itself is not printed. When
# the root is missing or has no children, prints "The root is empty" and
# stops there. Returns `x` invisibly.
print.Tree <- function(x, ...) {
  node.root <- x$root

  # The root is a list, so is.na() on it would give one value per element and
  # cannot be used as a scalar condition; test the structure instead.
  if (!is.list(node.root) || length(node.root$nodes.list) == 0) {
    cat("The root is empty\n")
    return(invisible(x))
  }

  for (i in seq_along(node.root$nodes.list)) {
    print(node.root$nodes.list[[i]])
  }
  invisible(x)
}

注：葉子節點的split.index=NA，root節點的node.index=NA和node.val=NA。

基於R的C4.5決策樹的建立

基於R的C4.5決策樹的建立

C4.5決策樹學習(基於集體智慧程式設計程式碼)

模式識別筆記5-決策樹

基於sklearn的決策樹演算法

西瓜書課後習題4.3 基於資訊熵決策樹，連續和離散屬性，並驗證模型

Python3實現機器學習經典演算法（四）C4.5決策樹

基於邏輯迴歸/決策樹/隨機森林/多層感知分類器/xgboost/樸素貝葉斯分類的資訊多分類效能對比

西瓜書習題4.3 基於資訊熵決策樹，連續和離散屬性

機器學習與人工智障(5):決策樹與隨機森林

機器學習筆記（7）——C4.5決策樹中的缺失值處理

C4.5決策樹演算法（Python實現）

機器學習筆記（6）——C4.5決策樹中的剪枝處理和Python實現

機器學習筆記（5）——C4.5決策樹中的連續值處理和Python實現

ID3和C4.5決策樹演算法總結

Thinking in SQL系列之四：資料探勘C4.5決策樹演算法

R_針對churn資料用id3、cart、C4.5和C5.0建立決策樹模型進行判斷哪種模型更合適

【機器學習】決策樹（基於ID3,C4.5,CART分類迴歸樹演算法）—— python3 實現方案

演算法-基於ID3和C4.5的決策樹演算法

基於資訊增益的決策樹歸納的Python實現【CD4.5演算法】

分類算法：決策樹（C4.5）(轉)

基於R的C4.5決策樹的建立

相關推薦