在R環境下基於鳶尾花資料做聚類分析
title: "cluster with graphs (Iris species)"
author: "gongzi liu"
date: "2017/06/02"
output: html_notebook
# Point the session at the folder that holds Iris.csv, print the working
# directory and its contents as a sanity check, then load the data set.
# String literals use plain ASCII quotes; typographic quotes do not parse in R.
# NOTE(review): the absolute path is machine-specific -- adjust before running.
setwd("C:/Users/Administrator/Desktop/初學R語言")
getwd()
dir()
Data <- read.csv("Iris.csv")
1.利用箱線圖觀察變數均值的差異(二維與四維)
library(mclust)  # provides Mclust()

# Fit a Gaussian mixture model on the four measurement columns. They are
# selected by name so that any extra column in the CSV (for example a row id)
# cannot end up in the model by accident.
measure_cols <- c(
  "SepalLengthCm", "SepalWidthCm",
  "PetalLengthCm", "PetalWidthCm"
)
iris.mclus <- Mclust(Data[, measure_cols])
summary(iris.mclus)

# NOTE(review): the four original boxplot() calls were cut off right after
# `Data`. One box plot per measurement, grouped by Species, is a reconstruction
# based on the column names used elsewhere in this script -- confirm.
boxplot(SepalLengthCm ~ Species, data = Data)
boxplot(SepalWidthCm ~ Species, data = Data)
boxplot(PetalLengthCm ~ Species, data = Data)
boxplot(PetalWidthCm ~ Species, data = Data)
library(dplyr)
library(tidyr)
library(ggplot2)
# Reshape to long form (one row per flower x measurement) so the four
# measurements can be drawn as a faceted box plot.
# The column names look like "SepalLengthCm" and contain no ".", so splitting
# on a literal dot never separates them; capture the flower part and the
# measure with an explicit pattern instead.
long_Data <- Data %>%
  gather(
    part, value,
    SepalLengthCm, SepalWidthCm, PetalLengthCm, PetalWidthCm
  ) %>%
  tidyr::extract(
    part, c("part", "measure"),
    regex = "^(Sepal|Petal)(Length|Width)Cm$"
  )

# Jittered points plus box plots per species, coloured by flower part, with
# one facet per measure (Length / Width).
ggplot(long_Data, aes(x = Species, y = value, col = part)) +
  geom_jitter(alpha = 0.3, size = 0.8) +
  stat_boxplot(alpha = 0.5) +
  facet_grid(. ~ measure)
2.找到每種型別中 Length 與 Width 之間的線性關係，便於後期研究
glimpse(Data)

# Per-flower id so that each flower's Length and Width values land on the same
# row after spreading. seq_len() stays correct even for an empty data frame.
Data$Flower <- seq_len(nrow(Data))

# Long -> wide: one row per (flower, part) with Length and Width as columns.
# Only the four measurement columns are gathered, so any other column in the
# CSV is carried along as an identifier instead of being mixed into `value`.
# The names contain no ".", hence the explicit capture pattern.
wide_Data <- Data %>%
  gather(
    key, value,
    SepalLengthCm, SepalWidthCm, PetalLengthCm, PetalWidthCm
  ) %>%
  tidyr::extract(
    key, c("part", "measure"),
    regex = "^(Sepal|Petal)(Length|Width)Cm$"
  ) %>%
  spread(measure, value)
head(wide_Data, 10)

# Width vs. Length with one linear fit per species, extended over the full
# x range of the plot.
ggplot(wide_Data, aes(x = Width, y = Length, col = Species)) +
  geom_point(alpha = 0.4, size = 0.8) +
  stat_smooth(method = "lm", fullrange = TRUE, size = 0.5)
3.K均值聚類
# K-means with three clusters on the four measurement columns, selected by
# name so that any extra column in the CSV (for example a row id) is not
# clustered.
Data_training <- Data[, c(
  "SepalLengthCm", "SepalWidthCm",
  "PetalLengthCm", "PetalWidthCm"
)]

# NOTE(review): columns 2 and 3 are picked by position; which measurements
# these are depends on the CSV layout -- confirm and consider naming them.
Data_plot_1 <- Data[, c(2, 3)]

# kmeans() picks random starting centres, so cluster labels can differ
# between runs.
model <- kmeans(Data_training, 3)
summary(model)

# Scatter of the two plotting columns, coloured by assigned cluster.
plot(Data_plot_1, col = model$cluster, main = "K-Means")
4.畫出四個變數所有種類的折線圖
library(highcharter)  # provides highchart() and the hc_*() helpers

# Line chart with one series per measurement across all flowers.
# NOTE(review): the original hc_xAxis() call and the first hc_add_series()
# call were cut off right after `Data`. Using Species for the axis categories,
# and adding the two *Length* series next to the surviving *Width* ones, is a
# reconstruction from the section heading ("four variables") -- confirm.
hc <- highchart() %>%
  hc_xAxis(categories = Data$Species) %>%
  hc_add_series(name = "SepalLengthCm", data = Data$SepalLengthCm) %>%
  hc_add_series(name = "SepalWidthCm", data = Data$SepalWidthCm) %>%
  hc_add_series(name = "PetalLengthCm", data = Data$PetalLengthCm) %>%
  hc_add_series(name = "PetalWidthCm", data = Data$PetalWidthCm) %>%
  hc_add_theme(hc_theme_google()) %>%
  hc_title(text = "All species with Sepal and petal")

# Print the widget so it actually renders in the notebook.
hc