學習筆記資料清洗--異常值檢驗
阿新 • • 發佈:2020-12-12
學習筆記資料清洗--異常值檢驗
文章目錄
前言
。
一、異常值檢驗
#' Interactively inspect and optionally blank out outliers in one column
#'
#' Draws a 2 x 2 panel (boxplot and histogram, with and without outliers),
#' prints summary statistics, then asks on the console whether the outliers
#' should be replaced with `NA`. Outliers are the points reported by
#' `boxplot.stats()` (its `$out` component).
#'
#' @param dt A data frame (or list) containing the column to check.
#' @param var Unquoted column name, evaluated inside `dt`.
#' @return Invisibly: the modified `dt` when the user answers "y"/"yes",
#'   otherwise the vector with outliers already set to `NA`.
#'
#' NOTE(review): on "yes" the modified data is also written to a variable
#' in `.GlobalEnv` named after the `dt` argument. This global side effect is
#' kept for backward compatibility with existing callers.
outlierKD <- function(dt, var) {
  # Capture the unevaluated arguments before `dt` is modified locally;
  # after modification substitute(dt) would return the value, not the name.
  dt_expr <- substitute(dt)
  var_expr <- substitute(var)

  # Evaluate the column expression with `dt` as the data mask and the
  # caller's frame as the enclosure.
  values <- eval(var_expr, dt, parent.frame())
  if (!is.numeric(values)) {
    stop("`var` must evaluate to a numeric vector", call. = FALSE)
  }

  n_valid <- sum(!is.na(values))
  if (n_valid == 0L) {
    stop("`var` contains no non-missing values", call. = FALSE)
  }
  n_na_before <- sum(is.na(values))
  mean_before <- mean(values, na.rm = TRUE)

  # par() returns the previous settings; restore them on exit so the
  # caller's graphics device is not left in a 2 x 2 layout.
  old_par <- par(mfrow = c(2, 2), oma = c(0, 0, 3, 0))
  on.exit(par(old_par), add = TRUE)

  boxplot(values, main = "with outliers")
  hist(values, main = "with outliers", xlab = NA, ylab = NA)

  # boxplot.stats() computes the statistics behind a boxplot; `$out` holds
  # the points lying beyond the whiskers.
  outliers <- boxplot.stats(values)$out
  mean_outliers <- mean(outliers)

  cleaned <- values
  cleaned[cleaned %in% outliers] <- NA

  boxplot(cleaned, main = "without outliers", xlab = NA, ylab = NA)
  hist(cleaned, main = "without outliers", xlab = NA, ylab = NA)
  title("outlier check", outer = TRUE)

  n_outliers <- sum(is.na(cleaned)) - n_na_before
  mean_after <- mean(cleaned, na.rm = TRUE)

  cat("outliers identified:", n_outliers, "\n")
  # Round the percentage itself (the count is already an integer).
  cat("propotion (%) of outliers", round(n_outliers / n_valid * 100, 2), "\n")
  cat("mean of the outliers:", round(mean_outliers, 2), "\n")
  cat("mean without removing outliers:", round(mean_before, 2), "\n")
  cat("mean if we remove outliers:", round(mean_after, 2), "\n")

  # In a non-interactive session readline() returns "", which falls through
  # to the "nothing changed" branch.
  answer <- readline(
    prompt = "do you want to remove outliers\nand to replace with NA? [yes/no]:"
  )
  answer <- tolower(trimws(answer))

  if (answer %in% c("y", "yes")) {
    if (!is.name(var_expr)) {
      stop("`var` must be a bare column name to write the result back",
           call. = FALSE)
    }
    dt[[as.character(var_expr)]] <- cleaned

    # Overwrite the caller's object by name only when `dt` was passed as a
    # plain variable; an arbitrary expression has no name to assign to.
    if (is.name(dt_expr)) {
      assign(as.character(dt_expr), dt, envir = .GlobalEnv)
    } else {
      warning("`dt` is not a plain variable name; global copy not updated",
              call. = FALSE)
    }
    cat("outliers successfully removed", "\n")
    return(invisible(dt))
  }

  cat("nothing changed", "\n")
  invisible(cleaned)
}
例子
# Example: 1000 random readings drawn from 80..250, followed by a missing
# value, an extreme value (390) and an ordinary value (100); then run the
# interactive outlier check on the `bp` column.
df <- data.frame(bp = c(sample(80:250, 1000, replace = TRUE), NA, 390, 100))
outlierKD(df, bp)
結果
函式執行到最後會在主控台詢問是否剔除異常值:輸入 y 或 yes 會將異常值以 NA 取代並寫回資料框,否則資料保持不變。