
R programming for feature selection and regression

  • data introduction
  • select packages
  • data splitting
  • feature selection
  • tune parameters
  • prediction

1. data introduction

My dataset contains more than 1,500 samples with 92 attributes. The targets to predict include a classification problem such as gender as well as a regression problem such as age.
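Before modelling, a quick look at the raw CSV helps confirm this layout (a minimal sketch: the file path is the one used in data_splitting below, and the gender column name is an assumption based on the targets described above):

data_frame <- read.csv("C:/Users/ntu/Downloads/LEARNING/personality/pers_desk.csv", header = TRUE, sep = ",")
dim(data_frame)          # expect roughly 1,500 rows and 100 columns
str(data_frame[, 1:5])   # peek at the first few columns
table(data_frame$gender) # class balance of the classification target (column name assumed)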

2. select packages

library(ggplot2) # Data visualization
library(randomForest)
library(ROCR) #visualizing classifier performance in R
library(caret) 
library("nnet")
# library(rpart)
# library("rfUtilities") # for crross validation 
library("openxlsx") # write to excel

3. data splitting

data_splitting <- function(number_of_predictors,prediction_name){
	set.seed(123)
	#get selected predictors
	if (number_of_predictors < 92){
		#get the variables
		set.seed(100)
		# this file is produced by feature_selection(); adjust the path to wherever it is written
		variables <- read.csv(file.path(paste0("D:/Program Files/R/R-3.5.1/bin/","thr_",prediction_name,".csv")), header = TRUE, sep = ",")
		variables <- variables[,"thres"]
		var_vec <- as.vector(variables)
		# indices in the threshold file are relative to the feature block, which
		# starts at column 4 of the full data frame, hence the +3 offset
		vars_inds <- var_vec[1:number_of_predictors]+3
	}else{
		vars_inds <- seq(4,95)
		#print(vars_inds)
	}
	#select data according to selected predictors
	data_frame <- read.csv(file.path("C:/Users/ntu/Downloads/LEARNING/personality/pers_desk.csv"), header = TRUE, sep = ",")
	features <- data_frame[,vars_inds]
	##the target columns (e.g. gender, age) awaiting prediction
	lab <- data_frame[c(2,3,96,97,98,99,100)]
	
	#split training and test dataset, 70% training
	sample.ind <- sample(2,nrow(features),replace = T,prob = c(0.7,0.3))
	train <- features[sample.ind==1,]
	test <- features[sample.ind==2,]
	train_label <- lab[sample.ind==1,]
	test_label <- lab[sample.ind==2,]
	#print(test[,c(1,2)])
	
	return (list(train = train, train_label = train_label, test = test, test_label = test_label))
}
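A quick usage sketch ("all" is passed as prediction_name because no threshold file is needed when all 92 predictors are used, as in main below):

splitted <- data_splitting(number_of_predictors = 92, prediction_name = "all")
nrow(splitted$train)       # roughly 70% of the samples
nrow(splitted$test)        # roughly 30% of the samples
head(splitted$train_label) # the label columns for the training rows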

4. feature selection via VSURF

feature_selection <- function(label_name, x, y){

	library("VSURF")
	
	vsurf <- VSURF(x = x, y = y, mtry = 100)
	print(summary(vsurf))
	#plot(vsurf, step = "thres", imp.sd = FALSE, var.names = TRUE)
	imp_dec = vsurf$imp.mean.dec
	imp_ind = vsurf$imp.mean.dec.ind
	thres = vsurf$varselect.thres
	interp = vsurf$varselect.interp
	pred = vsurf$varselect.pred
	thr_df = data.frame(thres)
	int_df = data.frame(interp)
	prd_df = data.frame(pred)
	imp_df = data.frame(imp_ind,imp_dec)
	# persist the threshold variable indices; data_splitting() reads this file back
	write.csv(thr_df,paste0("thr_",label_name,".csv"))
	# varselect.* holds variable indices, so length() gives the count kept at each step
	n_thres <- length(thres)
	n_interp <- length(interp)
	n_pred <- length(pred)
	print(c(n_thres, n_interp, n_pred))
	return (c(92, n_thres, n_interp, n_pred))
}

4.1 VSURF overview

VSURF partitions all the attributes into three nested sets: threshold, interpretation and prediction. length(vsurf$varselect.thres) gives the number of threshold attributes, and vsurf$varselect.thres returns the indices of the threshold variables. In general the prediction set is the smallest and the threshold set the largest. The package describes the selection procedure as follows: "Three steps variable selection procedure based on random forests. First step is dedicated to eliminate irrelevant variables from the dataset. Second step aims to select all variables related to the response for interpretation purpose. Third step refines the selection by eliminating redundancy in the set of variables selected by the second step, for prediction purpose." In other words, VSURF is still built on random forest variable importance, but on top of that it uses the regression or classification performance of different attribute subsets to divide the attributes into the three levels. This feature selection is therefore supervised: it essentially runs random forests to obtain importance scores for the attributes and analyses model performance to decide which attributes should be used.
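As a minimal, self-contained illustration, running VSURF on the built-in iris data exposes the three nested variable sets (exact selections depend on the random seed):

library(VSURF)
set.seed(123)
# run the three-step selection on the four iris measurements
iris.vsurf <- VSURF(x = iris[, 1:4], y = iris[, 5])
iris.vsurf$varselect.thres          # indices kept by the thresholding step
iris.vsurf$varselect.interp         # indices kept by the interpretation step
iris.vsurf$varselect.pred           # indices kept by the prediction step
length(iris.vsurf$varselect.thres)  # count of threshold variables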


4.2 VSURF explained

  • Step 1 ("thresholding step"): first, nfor.thres random forests are computed using the function randomForest with importance = TRUE and the default values of ntree and mtry. The attributes are then sorted in decreasing order of their mean variable importance (VI). Next, a threshold is computed: min.thres, the minimum predicted value of a pruned CART tree fitted to the curve of the standard deviations of the VI. Finally, the actual thresholding is performed: only variables whose mean VI is larger than nmin * min.thres are kept.
  • Step 2 ("interpretation step"): the variables selected in step 1 are considered. Nested random forest models are grown (nfor.interp forests each), starting with the most important variable only and adding variables until all those kept in step 1 are included. Then err.min, the minimum mean out-of-bag (OOB) error of these models, and its associated standard deviation sd.min are computed. Finally, the smallest model (and hence its variables) whose mean OOB error is less than err.min + nsd * sd.min is selected.
  • Step 3 ("prediction step"): the starting point is the same as in step 2, but now variables are added to the model stepwise. mean.jump, the mean jump value, is computed using the variables left out by step 2, and is defined as the mean absolute difference between the mean OOB error of a model and that of its first following model. A variable is included in the model only if it decreases the mean OOB error by more than nmj * mean.jump. These constants map onto VSURF arguments, as sketched below.
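Here is how those constants appear in a VSURF call (a sketch only: x and y stand for the feature matrix and response from feature_selection above, and the values shown are the package defaults to the best of my knowledge):

vsurf <- VSURF(x = x, y = y,     # x, y as in feature_selection() above
               nfor.thres = 50,  # number of forests grown in step 1
               nmin = 1,         # multiplier of min.thres in step 1
               nfor.interp = 25, # number of forests grown in step 2
               nsd = 1,          # multiplier of sd.min in step 2
               nfor.pred = 25,   # number of forests grown in step 3
               nmj = 1)          # multiplier of mean.jump in step 3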

5. tune parameters

For each prediction target, choose the optimal ntree value for the fitted model.
5.1 Regression
choose_best_rf_reg <- function(train, test, train_label, test_label){

	testing_mse_arr <- c()
	training_mse_arr <- c()
	variance_explained <- c()
	correlation <- c()
	num_trees <- c()
	for (n_trees in seq(10,510,20)){
		i <- (n_trees+10)/20
		#fit Random Forest Model
		rf <- randomForest(x = train, y = train_label, ntree = n_trees, importance = TRUE)
		#using model rf fit on training data to predict test data
		predicted <- predict(rf, test)
		#calculate mse
		testing_mse_arr[i] <- mean((predicted-test_label)^2)
		training_mse_arr[i] <- min(rf$mse)
		#calculate variance explained
		variance_explained[i] <- max(100*rf$rsq)
		#calculate correlation between prediction and actual test_labels
		correlation[i] <- cor(predicted,test_label)
		num_trees[i] <- n_trees
		#print(testing_mse_arr[i])
	}
	metrics <- data.frame(num_trees = num_trees, training_mse = training_mse_arr, variance_explained = variance_explained,testing_mse = testing_mse_arr, correlation = correlation)
	#print("metrics generated")
	#print(testing_mse_arr)
	# map the index of the smallest test MSE back to its ntree value (i -> 20*i - 10)
	n = which.min(testing_mse_arr)*20-10
	print(n)
	#print("metrics generated")
	rf = randomForest(x = train, y = train_label, ntree = n, importance = TRUE)
	imp = importance(rf)
	importances = data.frame(att_names = rownames(imp), imp = imp)
	# base R write.csv (write_csv would require the readr package)
	write.csv(importances, "imp.csv", row.names = FALSE)
	#print(importances)
	return (list(met=metrics,imp=importances))
}
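A usage sketch for a regression target such as age (the column name is an assumption based on the data introduction):

d <- data_splitting(number_of_predictors = 92, prediction_name = "all")
res <- choose_best_rf_reg(d$train, d$test,
                          d$train_label[, "age"],  # "age" column assumed
                          d$test_label[, "age"])
head(res$met)  # MSE, variance explained and correlation for each ntree value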
5.2 Classification
choose_best_rf_clf <- function(train, test, train_label, test_label){

	accuracy <- c()
	precision <- c()
	recall <- c()
	F1 <- c()
	for (n_trees in seq(10,510,20)){
		i <- (n_trees+10)/20
		#Fit Random Forest Model
		rf <- randomForest(x = train, y = as.factor(train_label), ntree = n_trees, importance = TRUE)
		#rf.cv <- rf.crossValidation(rf, train, p = 0.1, n = 10)
		# using model rf fit on training data to predict test data
		predicted <- predict(rf, test)  
		#print(predicted)
		# write results to CSV file
		cm <-  confusionMatrix(data = predicted, factor(test_label, levels = 1:2))
		acc <- cm$overall['Accuracy']
		p <- posPredValue(predicted, as.factor(test_label), positive = "1")
		r <- sensitivity(predicted, as.factor(test_label), positive = "1")

		f <- (2 * p * r) / (p + r)

		#print(acc)
		accuracy[i] <- acc
		precision[i] <- p
		recall[i] <- r
		F1[i] <- f
		#print(i,accuracy[i])
	}
	metrics <- data.frame(Accuracy = accuracy, P = precision, R = recall, F1 = F1)
	#write_csv(index_datafram,paste0(prediction_name, number_of_predictors,"_metrics.csv"))
	# map the index of the best F1 back to its ntree value (i -> 20*i - 10)
	n_trees = which.max(F1)*20-10
	print(n_trees)
	# the response must be a factor for classification
	rf = randomForest(x = train, y = as.factor(train_label), ntree = n_trees, importance = TRUE)
	importances = importance(rf)
	write.csv(metrics, "gender_metrics.csv", row.names = FALSE)
	return (list(met=metrics,imp=importances))
}
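And the classification counterpart for gender:

d <- data_splitting(number_of_predictors = 92, prediction_name = "all")
res <- choose_best_rf_clf(d$train, d$test,
                          d$train_label[, "gender"], d$test_label[, "gender"])
res$met[which.max(res$met$F1), ]  # the row with the best F1 score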

6. prediction

main <- function(prediction_name){
	splitted_data4_fs = data_splitting(number_of_predictors = 92, "all")
	splitted_data4_fs$train_label = splitted_data4_fs$train_label[,prediction_name]
	print(paste("selecting feature for", prediction_name))
	selected_numbers_list = feature_selection(prediction_name, splitted_data4_fs$train, splitted_data4_fs$train_label)
	#selected_numbers_list = c(92,37,6,6)
	print("predicting")
	# one sheet per predictor set for the metrics, plus one each for the importances
	wb <- createWorkbook()
	sheet_names = c("all","thre","inte","pred","all_imp","thre_imp","inte_imp","pred_imp")
	for (sheet in sheet_names){
		addWorksheet(wb, sheet)
	}
	i = 0
	for (num_of_predictors in selected_numbers_list){
		i = i + 1
		print(i)
		data = data_splitting(num_of_predictors, prediction_name)
		data$train_label = data$train_label[,prediction_name]
		data$test_label = data$test_label[,prediction_name]
		# gender is a classification target; everything else (e.g. age) is regression
		if (prediction_name == "gender"){
			met_and_imp <- choose_best_rf_clf(data$train, data$test, data$train_label, data$test_label)
		}else{
			met_and_imp <- choose_best_rf_reg(data$train, data$test, data$train_label, data$test_label)
		}
		writeData(wb, sheet_names[i], met_and_imp$met, rowNames = TRUE, colNames = TRUE)
		writeData(wb, sheet_names[i+4], met_and_imp$imp, rowNames = TRUE, colNames = TRUE)
	}
	saveWorkbook(wb, paste0(prediction_name,"_metrics_and_imp.xlsx"), overwrite = TRUE)
}

main("gender")

Note: openxlsx is recommended for writing the results: first create a workbook, then name each sheet and add it to the workbook, which leaves you with an empty workbook; then write the data into the sheets and save.

wb <- createWorkbook() # create
addWorksheet(wb, "all") # add sheet
addWorksheet(wb, "thre")
addWorksheet(wb, "inte")
addWorksheet(wb, "pred")
writeData(wb,"all", df_arr, rowNames = TRUE, colNames = TRUE) # write data
saveWorkbook(wb, paste0(prediction_name,"_metrics_and_imp.xlsx"), overwrite = TRUE) # save