1. 程式人生 > >在R環境下基於鳶尾花資料做聚類分析

在R環境下基於鳶尾花資料做聚類分析

title: "cluster with graphs (Iris species)"
author: "gongzi liu"
date: "2017/06/02"

output: html_notebook

# Point the session at the folder holding the data set, show where we are and
# what files are there, then read the iris measurements into `Data`.
# NOTE(review): the working directory is a hard-coded, machine-specific path;
# adjust it (or open the project from that folder) before running elsewhere.
setwd("C:/Users/Administrator/Desktop/初學R語言")
getwd()
dir()
Data <- read.csv("Iris.csv")

1.利用箱線圖觀察變數均值的差異(二維與四維)

library(mclust)  # provides Mclust()

# The four measurement columns this script refers to by name.
# NOTE(review): selecting by name (rather than by position 1:4) keeps any
# extra leading column the CSV may contain out of the clustering input --
# confirm against the actual layout of Iris.csv.
measure_cols <- c(
  "SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"
)

# Fit the Mclust model on the measurements and print its summary.
iris.mclus <- Mclust(Data[, measure_cols])
summary(iris.mclus)

# One box plot per measurement, grouped by the cluster label that Mclust
# assigned to each row.
boxplot(
  Data$PetalLengthCm ~ iris.mclus$classification,
  ylab = "Petal length", xlab = "M cluster"
)
boxplot(
  Data$SepalLengthCm ~ iris.mclus$classification,
  ylab = "Sepal length", xlab = "M cluster"
)
boxplot(
  Data$SepalWidthCm ~ iris.mclus$classification,
  ylab = "Sepal Width", xlab = "M cluster"
)
boxplot(
  Data$PetalWidthCm ~ iris.mclus$classification,
  ylab = "Petal Width", xlab = "M cluster"
)

library(dplyr)
library(tidyr)
library(ggplot2)

# Reshape to long form for the faceted box plot: one row per
# (flower, part, measure). The column names contain no "." separator, so the
# part ("Sepal"/"Petal") and the measure ("Length"/"Width") are pulled out of
# names such as "SepalLengthCm" with an explicit pattern.
long_Data <- Data %>%
  gather(
    part, value,
    SepalLengthCm, SepalWidthCm, PetalLengthCm, PetalWidthCm
  ) %>%
  tidyr::extract(
    part, c("part", "measure"),
    regex = "^(Sepal|Petal)(Length|Width)Cm$"
  )

# Box plots of every measurement per species: colour = flower part,
# one facet per measure.
ggplot(long_Data, aes(x = Species, y = value, col = part)) +
  geom_jitter(alpha = 0.3, size = 0.8) +
  stat_boxplot(alpha = 0.5) +
  facet_grid(. ~ measure)

2.找到每種型別中Length 與width之間的線性關係便於後期研究

glimpse(Data)

# Give every row (flower) a unique identifier so that spread() can match the
# Length and Width values that belong to the same flower and part.
Data$Flower <- seq_len(nrow(Data))

# Reshape so each row is one (flower, part) pair with `Length` and `Width`
# columns. Only the four measurement columns are gathered; their names hold
# no "." separator, so part and measure are extracted with an explicit
# pattern (e.g. "SepalLengthCm" -> part "Sepal", measure "Length").
wide_Data <- Data %>%
  gather(
    key, value,
    SepalLengthCm, SepalWidthCm, PetalLengthCm, PetalWidthCm
  ) %>%
  tidyr::extract(
    key, c("part", "measure"),
    regex = "^(Sepal|Petal)(Length|Width)Cm$"
  ) %>%
  spread(measure, value)
head(wide_Data, 10)

# Scatter plot of Length against Width with one linear fit per species.
ggplot(wide_Data, aes(x = Width, y = Length, col = Species)) +
  geom_point(alpha = 0.4, size = 0.8) +
  stat_smooth(method = "lm", fullrange = TRUE, size = 0.5)

3.K均值聚類

# Cluster on the four measurement columns, selected by name.
# NOTE(review): selection by name (rather than by position 1:4) keeps any
# extra leading column the CSV may contain out of the clustering input --
# confirm against the actual layout of Iris.csv.
Data_training <- Data[
  , c("SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm")
]

# NOTE(review): the two plotted variables are taken by position (columns 2
# and 3); confirm these are the pair intended for the scatter plot.
Data_plot_1 <- Data[, c(2, 3)]

# k-means with 3 centres; kmeans() picks its starting centres at random, so
# cluster numbering (and possibly the partition) can differ between runs.
model <- kmeans(Data_training, 3)
summary(model)

# Scatter plot of the two selected variables, coloured by assigned cluster.
plot(Data_plot_1, col = model$cluster, main = "K-Means")

4.畫出四個變數所有種類的折線圖

library(highcharter)  # provides highchart(), hc_xAxis(), hc_add_series(), ...

# Chart with one series per measurement column; the x axis is labelled with
# the species of each row.
hc <- highchart() %>%
  hc_xAxis(categories = Data$Species) %>%
  hc_add_series(name = "SepalLengthCm", data = Data$SepalLengthCm) %>%
  hc_add_series(name = "SepalWidthCm", data = Data$SepalWidthCm) %>%
  hc_add_series(name = "PetalLengthCm", data = Data$PetalLengthCm) %>%
  hc_add_series(name = "PetalWidthCm", data = Data$PetalWidthCm) %>%
  hc_add_theme(hc_theme_google()) %>%
  hc_title(text = "All species with Sepal and petal")

# Print the widget so the chart is actually rendered.
hc