1. 程式人生 > >聚類-----KMeans

聚類-----KMeans

create mllib edi cit clust package contex kmean local

package Spark_MLlib

import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.linalg.{Vector, Vectors}

/**
  * K均值
  */
case class features_schema(features:Vector)
object 聚類__KMeans {
       val spark=SparkSession.builder().master("local[2]").getOrCreate()
       import spark.implicits._
  def main(args: Array[String]): Unit 
= { val data=spark.sparkContext.textFile("file:///home/soyo/桌面/spark編程測試數據/soyo2.txt") .map(_.split(",")).map(x=>features_schema(Vectors.dense(x(0).toDouble,x(1).toDouble,x(2).toDouble,x(3).toDouble))).toDF() data.show() val KMeansModel=new KMeans().setK(7).setFeaturesCol("
features").setPredictionCol("prediction").fit(data) val results=KMeansModel.transform(data) results.show(150) //模型所有的聚類中心(指最後生成的聚類中心,K是幾就有幾組)的情況 KMeansModel.clusterCenters.foreach(println) //集合內誤差平方和(選取K的大小可以參照,使用場景+最大的集合內誤差平方的值=較合適的K) val cost=KMeansModel.computeCost(data) println(cost) } }

結果:

+-----------------+
| features|
+-----------------+
|[5.1,3.5,1.4,0.2]|
|[4.9,3.0,1.4,0.2]|
|[4.7,3.2,1.3,0.2]|
|[4.6,3.1,1.5,0.2]|
|[5.0,3.6,1.4,0.2]|
|[5.4,3.9,1.7,0.4]|
|[4.6,3.4,1.4,0.3]|
|[5.0,3.4,1.5,0.2]|
|[4.4,2.9,1.4,0.2]|
|[4.9,3.1,1.5,0.1]|
|[5.4,3.7,1.5,0.2]|
|[4.8,3.4,1.6,0.2]|
|[4.8,3.0,1.4,0.1]|
|[4.3,3.0,1.1,0.1]|
|[5.8,4.0,1.2,0.2]|
|[5.7,4.4,1.5,0.4]|
|[5.4,3.9,1.3,0.4]|
|[5.1,3.5,1.4,0.3]|
|[5.7,3.8,1.7,0.3]|
|[5.1,3.8,1.5,0.3]|
+-----------------+
only showing top 20 rows

+-----------------+----------+
| features|prediction|
+-----------------+----------+
|[5.1,3.5,1.4,0.2]| 0|
|[4.9,3.0,1.4,0.2]| 0|
|[4.7,3.2,1.3,0.2]| 0|
|[4.6,3.1,1.5,0.2]| 0|
|[5.0,3.6,1.4,0.2]| 0|
|[5.4,3.9,1.7,0.4]| 0|
|[4.6,3.4,1.4,0.3]| 0|
|[5.0,3.4,1.5,0.2]| 0|
|[4.4,2.9,1.4,0.2]| 0|
|[4.9,3.1,1.5,0.1]| 0|
|[5.4,3.7,1.5,0.2]| 0|
|[4.8,3.4,1.6,0.2]| 0|
|[4.8,3.0,1.4,0.1]| 0|
|[4.3,3.0,1.1,0.1]| 0|
|[5.8,4.0,1.2,0.2]| 0|
|[5.7,4.4,1.5,0.4]| 0|
|[5.4,3.9,1.3,0.4]| 0|
|[5.1,3.5,1.4,0.3]| 0|
|[5.7,3.8,1.7,0.3]| 0|
|[5.1,3.8,1.5,0.3]| 0|
+-----------------+----------+
only showing top 20 rows

[5.005999999999999,3.4180000000000006,1.4640000000000002,0.2439999999999999]
[6.8538461538461535,3.076923076923076,5.715384615384614,2.0538461538461537]
[5.883606557377049,2.740983606557377,4.388524590163936,1.4344262295081966]
78.94506582597859

聚類-----KMeans