基於R的C4.5決策樹的建立
阿新 • • 發佈:2019-01-07
下邊的程式碼是一個簡單的C4.5決策樹建立過程。該決策樹不含剪枝的過程,並且只支援離散(標稱)型屬性,不支援連續屬性值。
# Compute the Shannon entropy (base 2) of the class column.
#
# Args:
#   data:        data frame holding the original records.
#   class.index: column index of the class variable in `data`.
#
# Returns: a single number, -sum(p * log2(p)) over the class frequencies.
entropy <- function(data, class.index) {
  class.values <- as.factor(data[, class.index])
  class.counts <- as.data.frame(table(class.values))$Freq
  class.prob <- class.counts / sum(class.counts)
  log.prob <- log2(class.prob)
  # log2(0) is -Inf; treat it as 0 so that empty factor levels contribute
  # nothing to the sum instead of producing NaN.
  if (any(class.prob == 0)) {
    log.prob[which(class.prob == 0)] <- 0
  }
  -sum(class.prob * log.prob)
}
# Weighted entropy contribution of one attribute value.
#
# Args:
#   x:           the attribute value that selects the rows.
#   data:        data frame holding the records.
#   attr.index:  column index of the attribute being examined.
#   class.index: column index of the class variable.
#
# Returns: (rows with value x / all rows) * entropy of the class column
# within those rows.
subGain <- function(x, data, attr.index, class.index) {
  matching.rows <- which(x == data[, attr.index])
  subset.data <- data[matching.rows, ]
  subset.weight <- nrow(subset.data) / nrow(data)
  subset.weight * entropy(subset.data, class.index)
}
# Information gain of a single attribute.
#
# Args:
#   data:        data frame holding the records.
#   attr.index:  column index of the attribute being examined.
#   class.index: column index of the class variable.
#
# Returns: entropy of the class column minus the sum of the weighted
# entropies of the subsets induced by each distinct attribute value.
gain <- function(data, attr.index, class.index) {
  distinct.values <- unique(data[, attr.index])
  weighted.entropies <- unlist(lapply(distinct.values, function(value) {
    subGain(value, data = data, attr.index = attr.index, class.index = class.index)
  }))
  entropy(data, class.index) - sum(weighted.entropies)
}
# Split information of an attribute: the entropy of the attribute's own
# value distribution.
#
# Args:
#   data:        data frame holding the records.
#   attr.index:  column index of the attribute being examined.
#   class.index: column index of the class variable (not used here; kept so
#                the signature parallels gain()).
#
# Returns: -sum(f * log2(f)) where f is the share of rows taking each
# distinct attribute value.
splitInfo <- function(data, attr.index, class.index) {
  attr.column <- data[, attr.index]
  total.rows <- nrow(data)
  weighted.log <- function(value) {
    share <- length(which(value == attr.column)) / total.rows
    share * log2(share)
  }
  terms <- unlist(lapply(unique(attr.column), weighted.log))
  -sum(terms)
}
# Gain ratio of one attribute: information gain divided by split information.
#
# Args:
#   x:           column index of the attribute (first argument so the function
#                can be mapped over a vector of attribute indices).
#   data:        data frame holding the records.
#   class.index: column index of the class variable.
#
# Returns: gain / splitInfo, or 0 when the split information is 0.
gainRation <- function(x, data, class.index) {
  attr.index <- x
  split.info <- splitInfo(data, attr.index, class.index)
  # A split information of 0 means the attribute takes a single value, so it
  # cannot separate the classes. Rank it last (0) rather than first: a ratio
  # of 1 is the largest value a gain ratio can take and would let a constant
  # attribute beat or tie every informative one.
  if (isTRUE(split.info == 0)) {
    return(0)
  }
  gain(data, attr.index, class.index) / split.info
}
# Recursively build one node of the decision tree.
#
# Args:
#   data:         data frame with the attribute columns and the class column.
#   tree.max.num: remaining depth budget; when it is 0 the children of this
#                 node are created as leaves instead of recursing.
#   tree.level:   depth of this node (createTree() passes 0 for the root).
#   class.index:  column index of the class variable in `data`.
#   node.index:   name of the attribute the parent split on (NA for the root).
#   node.val:     attribute value on the branch leading here (NA for the root).
#
# Returns: a list of class "TreeNode" with fields split.vec, split.index,
# nodes.list, split.num, node.label, node.index, node.val, level, node.num.
# A leaf keeps split.index = NA. node.label is the integer level code of the
# majority class (first one on ties), not the class value itself.
#
# Raises: stops when the selected attribute is not character, integer or
# factor.
createNode <- function(data, tree.max.num, tree.level, class.index, node.index = NA, node.val = NA) {
  # Keep the class column as a factor so the numeric label codes refer to the
  # same level set in every node; row subsets of a factor keep all levels.
  data[, class.index] <- as.factor(data[, class.index])
  class.values <- data[, class.index]

  # Level code of the most frequent class; which.max() takes the first on ties
  # so the label is always a single value.
  majority.label <- function(values) {
    as.numeric(which.max(table(values)))
  }

  # Blank node template shared by this node and its leaf children.
  new.node <- function(label, level) {
    blank <- list(split.vec = NA, split.index = NA, nodes.list = list(), split.num = 0,
                  node.label = label, node.index = NA, node.val = NA,
                  level = level, node.num = 0)
    class(blank) <- "TreeNode"
    blank
  }

  node <- new.node(majority.label(class.values), tree.level)
  # Only non-root nodes know which branch they hang from.
  if (!is.na(node.val)) {
    node$node.val <- node.val
    node$node.index <- node.index
    node$node.num <- dim(data)[1]
  }

  attr.vec <- seq_len(dim(data)[2])[-class.index]
  # Nothing left to split on: the node is a leaf.
  if (length(attr.vec) == 0) {
    return(node)
  }

  gain.ration.vec <- unlist(lapply(attr.vec, gainRation, data = data, class.index = class.index))
  # Pick the attribute with the largest gain ratio, first one on ties.
  attr.best.index <- attr.vec[which(max(gain.ration.vec) == gain.ration.vec)][1]
  best.column <- data[, attr.best.index]

  # Only categorical (label) attributes are supported.
  if (!is.character(best.column) && !is.integer(best.column) && !is.factor(best.column)) {
    stop("The values of attributes are characters, integer or factor")
  }

  # A pure node needs no further split.
  if (length(unique(class.values)) == 1) {
    return(node)
  }

  attr.best.val.vec <- unique(best.column)
  # Row positions belonging to each distinct value of the chosen attribute.
  result.list <- lapply(attr.best.val.vec, function(value) which(value == best.column))
  node$split.vec <- as.character(attr.best.val.vec)
  node$split.num <- length(attr.best.val.vec)
  node$split.index <- names(data)[attr.best.index]

  if (length(attr.vec) == 1 || tree.max.num == 0) {
    # Last attribute or depth budget exhausted: children are leaves labelled
    # with the majority class of their rows. The class column is read from the
    # full `data`, so the unshifted class.index applies here.
    for (i in seq_along(attr.best.val.vec)) {
      child <- new.node(majority.label(class.values[result.list[[i]]]), tree.level + 1)
      child$node.index <- node$split.index
      child$node.val <- node$split.vec[i]
      child$node.num <- length(result.list[[i]])
      node$nodes.list[[i]] <- child
    }
  } else {
    # Drop the used attribute; the class column shifts left by one when the
    # removed column sat before it.
    data.sub <- data[, -attr.best.index, drop = FALSE]
    class.index.sub <- if (attr.best.index < class.index) class.index - 1 else class.index
    for (i in seq_along(attr.best.val.vec)) {
      node$nodes.list[[i]] <- createNode(data.sub[result.list[[i]], , drop = FALSE],
                                         tree.max.num - 1, tree.level + 1, class.index.sub,
                                         node$split.index, node$split.vec[i])
    }
  }
  node
}
# Build a decision tree from a data frame.
#
# Args:
#   data:         data frame with the attribute columns and the class column.
#   tree.max.num: maximum height of the tree; must be greater than 0.
#   class.index:  column index of the class variable in `data`.
#
# Returns: a list of class "Tree" whose single element `root` is the node
# returned by createNode() at level 0.
createTree <- function(data, tree.max.num, class.index) {
  if (tree.max.num <= 0) {
    stop("The height of tree must be more than 0")
  }
  structure(
    list(root = createNode(data, tree.max.num, 0, class.index)),
    class = "Tree"
  )
}
# Placeholder for prediction with a built tree: not implemented.
# Accepts a tree and an instance, ignores both and returns NULL.
predict.tree <- function(tree.ins, x.ins) NULL
# Print a TreeNode and, recursively, its children.
#
# Args:
#   x:   a "TreeNode" list.
#   ...: ignored; present to match the print() generic.
#
# Each node is indented by three tabs per level. A leaf (split.index is NA)
# prints "<attribute> = <value> :  <label> ( <rows> )"; an inner node prints
# "<attribute> = <value>" followed by its children.
#
# Returns: `x`, invisibly.
print.TreeNode <- function(x, ...) {
  for (i in seq_len(x$level)) {
    cat("\t\t\t")
  }
  if (is.na(x$split.index)) {
    # The field is named node.label; `x$label` does not partial-match it and
    # always yields NULL, so the label was never printed.
    cat(x$node.index, "=", x$node.val, ": ", x$node.label, "(", x$node.num, ")\n")
  } else {
    cat(x$node.index, "=", x$node.val, "\n")
    for (child in x$nodes.list) {
      print(child)
    }
  }
  invisible(x)
}
# Print a Tree by printing every child of its root node.
#
# Args:
#   x:   a "Tree" list with element `root`.
#   ...: ignored; present to match the print() generic.
#
# Prints "The root is empty" and stops when the root is missing (NULL or a
# single NA) or has no children.
#
# Returns: `x`, invisibly.
print.Tree <- function(x, ...) {
  node.root <- x$root
  # A real root is a list, and is.na() on a list returns one value per field,
  # which is not a valid `||` operand. Test for a scalar NA explicitly.
  root.missing <- is.null(node.root) ||
    (is.atomic(node.root) && length(node.root) == 1 && is.na(node.root))
  if (root.missing || length(node.root$nodes.list) == 0) {
    cat("The root is empty\n")
    # Return here: there is nothing to iterate over.
    return(invisible(x))
  }
  for (child in node.root$nodes.list) {
    print(child)
  }
  invisible(x)
}
# Compute the Shannon entropy (base 2) of the class column.
#
# Args:
#   data:        data frame holding the original records.
#   class.index: column index of the class variable in `data`.
#
# Returns: a single number, -sum(p * log2(p)) over the class frequencies.
entropy <- function(data, class.index) {
  class.values <- as.factor(data[, class.index])
  class.counts <- as.data.frame(table(class.values))$Freq
  class.prob <- class.counts / sum(class.counts)
  log.prob <- log2(class.prob)
  # log2(0) is -Inf; treat it as 0 so that empty factor levels contribute
  # nothing to the sum instead of producing NaN.
  if (any(class.prob == 0)) {
    log.prob[which(class.prob == 0)] <- 0
  }
  -sum(class.prob * log.prob)
}
# Weighted entropy contribution of one attribute value.
#
# Args:
#   x:           the attribute value that selects the rows.
#   data:        data frame holding the records.
#   attr.index:  column index of the attribute being examined.
#   class.index: column index of the class variable.
#
# Returns: (rows with value x / all rows) * entropy of the class column
# within those rows.
subGain <- function(x, data, attr.index, class.index) {
  matching.rows <- which(x == data[, attr.index])
  subset.data <- data[matching.rows, ]
  subset.weight <- nrow(subset.data) / nrow(data)
  subset.weight * entropy(subset.data, class.index)
}
# Information gain of a single attribute.
#
# Args:
#   data:        data frame holding the records.
#   attr.index:  column index of the attribute being examined.
#   class.index: column index of the class variable.
#
# Returns: entropy of the class column minus the sum of the weighted
# entropies of the subsets induced by each distinct attribute value.
gain <- function(data, attr.index, class.index) {
  distinct.values <- unique(data[, attr.index])
  weighted.entropies <- unlist(lapply(distinct.values, function(value) {
    subGain(value, data = data, attr.index = attr.index, class.index = class.index)
  }))
  entropy(data, class.index) - sum(weighted.entropies)
}
# Split information of an attribute: the entropy of the attribute's own
# value distribution.
#
# Args:
#   data:        data frame holding the records.
#   attr.index:  column index of the attribute being examined.
#   class.index: column index of the class variable (not used here; kept so
#                the signature parallels gain()).
#
# Returns: -sum(f * log2(f)) where f is the share of rows taking each
# distinct attribute value.
splitInfo <- function(data, attr.index, class.index) {
  attr.column <- data[, attr.index]
  total.rows <- nrow(data)
  weighted.log <- function(value) {
    share <- length(which(value == attr.column)) / total.rows
    share * log2(share)
  }
  terms <- unlist(lapply(unique(attr.column), weighted.log))
  -sum(terms)
}
# Gain ratio of one attribute: information gain divided by split information.
#
# Args:
#   x:           column index of the attribute (first argument so the function
#                can be mapped over a vector of attribute indices).
#   data:        data frame holding the records.
#   class.index: column index of the class variable.
#
# Returns: gain / splitInfo, or 0 when the split information is 0.
gainRation <- function(x, data, class.index) {
  attr.index <- x
  split.info <- splitInfo(data, attr.index, class.index)
  # A split information of 0 means the attribute takes a single value, so it
  # cannot separate the classes. Rank it last (0) rather than first: a ratio
  # of 1 is the largest value a gain ratio can take and would let a constant
  # attribute beat or tie every informative one.
  if (isTRUE(split.info == 0)) {
    return(0)
  }
  gain(data, attr.index, class.index) / split.info
}
# Recursively build one node of the decision tree.
#
# Args:
#   data:         data frame with the attribute columns and the class column.
#   tree.max.num: remaining depth budget; when it is 0 the children of this
#                 node are created as leaves instead of recursing.
#   tree.level:   depth of this node (createTree() passes 0 for the root).
#   class.index:  column index of the class variable in `data`.
#   node.index:   name of the attribute the parent split on (NA for the root).
#   node.val:     attribute value on the branch leading here (NA for the root).
#
# Returns: a list of class "TreeNode" with fields split.vec, split.index,
# nodes.list, split.num, node.label, node.index, node.val, level, node.num.
# A leaf keeps split.index = NA. node.label is the integer level code of the
# majority class (first one on ties), not the class value itself.
#
# Raises: stops when the selected attribute is not character, integer or
# factor.
createNode <- function(data, tree.max.num, tree.level, class.index, node.index = NA, node.val = NA) {
  # Keep the class column as a factor so the numeric label codes refer to the
  # same level set in every node; row subsets of a factor keep all levels.
  data[, class.index] <- as.factor(data[, class.index])
  class.values <- data[, class.index]

  # Level code of the most frequent class; which.max() takes the first on ties
  # so the label is always a single value.
  majority.label <- function(values) {
    as.numeric(which.max(table(values)))
  }

  # Blank node template shared by this node and its leaf children.
  new.node <- function(label, level) {
    blank <- list(split.vec = NA, split.index = NA, nodes.list = list(), split.num = 0,
                  node.label = label, node.index = NA, node.val = NA,
                  level = level, node.num = 0)
    class(blank) <- "TreeNode"
    blank
  }

  node <- new.node(majority.label(class.values), tree.level)
  # Only non-root nodes know which branch they hang from.
  if (!is.na(node.val)) {
    node$node.val <- node.val
    node$node.index <- node.index
    node$node.num <- dim(data)[1]
  }

  attr.vec <- seq_len(dim(data)[2])[-class.index]
  # Nothing left to split on: the node is a leaf.
  if (length(attr.vec) == 0) {
    return(node)
  }

  gain.ration.vec <- unlist(lapply(attr.vec, gainRation, data = data, class.index = class.index))
  # Pick the attribute with the largest gain ratio, first one on ties.
  attr.best.index <- attr.vec[which(max(gain.ration.vec) == gain.ration.vec)][1]
  best.column <- data[, attr.best.index]

  # Only categorical (label) attributes are supported.
  if (!is.character(best.column) && !is.integer(best.column) && !is.factor(best.column)) {
    stop("The values of attributes are characters, integer or factor")
  }

  # A pure node needs no further split.
  if (length(unique(class.values)) == 1) {
    return(node)
  }

  attr.best.val.vec <- unique(best.column)
  # Row positions belonging to each distinct value of the chosen attribute.
  result.list <- lapply(attr.best.val.vec, function(value) which(value == best.column))
  node$split.vec <- as.character(attr.best.val.vec)
  node$split.num <- length(attr.best.val.vec)
  node$split.index <- names(data)[attr.best.index]

  if (length(attr.vec) == 1 || tree.max.num == 0) {
    # Last attribute or depth budget exhausted: children are leaves labelled
    # with the majority class of their rows. The class column is read from the
    # full `data`, so the unshifted class.index applies here.
    for (i in seq_along(attr.best.val.vec)) {
      child <- new.node(majority.label(class.values[result.list[[i]]]), tree.level + 1)
      child$node.index <- node$split.index
      child$node.val <- node$split.vec[i]
      child$node.num <- length(result.list[[i]])
      node$nodes.list[[i]] <- child
    }
  } else {
    # Drop the used attribute; the class column shifts left by one when the
    # removed column sat before it.
    data.sub <- data[, -attr.best.index, drop = FALSE]
    class.index.sub <- if (attr.best.index < class.index) class.index - 1 else class.index
    for (i in seq_along(attr.best.val.vec)) {
      node$nodes.list[[i]] <- createNode(data.sub[result.list[[i]], , drop = FALSE],
                                         tree.max.num - 1, tree.level + 1, class.index.sub,
                                         node$split.index, node$split.vec[i])
    }
  }
  node
}
# Build a decision tree from a data frame.
#
# Args:
#   data:         data frame with the attribute columns and the class column.
#   tree.max.num: maximum height of the tree; must be greater than 0.
#   class.index:  column index of the class variable in `data`.
#
# Returns: a list of class "Tree" whose single element `root` is the node
# returned by createNode() at level 0.
createTree <- function(data, tree.max.num, class.index) {
  if (tree.max.num <= 0) {
    stop("The height of tree must be more than 0")
  }
  structure(
    list(root = createNode(data, tree.max.num, 0, class.index)),
    class = "Tree"
  )
}
# Placeholder for prediction with a built tree: not implemented.
# Accepts a tree and an instance, ignores both and returns NULL.
predict.tree <- function(tree.ins, x.ins) NULL
# Print a TreeNode and, recursively, its children.
#
# Args:
#   x:   a "TreeNode" list.
#   ...: ignored; present to match the print() generic.
#
# Each node is indented by three tabs per level. A leaf (split.index is NA)
# prints "<attribute> = <value> :  <label> ( <rows> )"; an inner node prints
# "<attribute> = <value>" followed by its children.
#
# Returns: `x`, invisibly.
print.TreeNode <- function(x, ...) {
  for (i in seq_len(x$level)) {
    cat("\t\t\t")
  }
  if (is.na(x$split.index)) {
    # The field is named node.label; `x$label` does not partial-match it and
    # always yields NULL, so the label was never printed.
    cat(x$node.index, "=", x$node.val, ": ", x$node.label, "(", x$node.num, ")\n")
  } else {
    cat(x$node.index, "=", x$node.val, "\n")
    for (child in x$nodes.list) {
      print(child)
    }
  }
  invisible(x)
}
# Print a Tree by printing every child of its root node.
#
# Args:
#   x:   a "Tree" list with element `root`.
#   ...: ignored; present to match the print() generic.
#
# Prints "The root is empty" and stops when the root is missing (NULL or a
# single NA) or has no children.
#
# Returns: `x`, invisibly.
print.Tree <- function(x, ...) {
  node.root <- x$root
  # A real root is a list, and is.na() on a list returns one value per field,
  # which is not a valid `||` operand. Test for a scalar NA explicitly.
  root.missing <- is.null(node.root) ||
    (is.atomic(node.root) && length(node.root) == 1 && is.na(node.root))
  if (root.missing || length(node.root$nodes.list) == 0) {
    cat("The root is empty\n")
    # Return here: there is nothing to iterate over.
    return(invisible(x))
  }
  for (child in node.root$nodes.list) {
    print(child)
  }
  invisible(x)
}
注:葉子節點的split.index=NA,root節點的node.index=NA和node.val=NA。