SparkML in Practice, Part 2: KMeans
阿新 · Published 2019-02-08
package class8
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
/**
 * Created by root on 16-1-12.
 */
object Kmeans {
  def main(args: Array[String]) {
    // Suppress unnecessary log output on the console
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
    // Set up the execution environment
    val conf = new SparkConf().setAppName("Kmeans").setMaster("local[4]")
    // To run on a cluster, use setMaster("spark://moon:7077") and package the job as a jar
    // sc.addJar("/path/to/jarfile")
    val sc = new SparkContext(conf)
    // Load the data set, whose lines look like:
    // 0.0 0.0 0.0
    // 0.1 0.1 0.1
    // 0.2 0.2 0.2
    // 9.0 9.0 9.0
    // 9.1 9.1 9.1
    // 9.2 9.2 9.2
    val data = sc.textFile("/usr/local/spark/spark-data/data/class8/kmeans_data.txt", 1)
    val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble)))
    // Cluster the data into 2 classes with 20 iterations to train the model
    val numClusters = 2
    val numIterations = 20
    val model = KMeans.train(parsedData, numClusters, numIterations)
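    // Note (an assumption beyond the original example): Spark 1.x MLlib also
    // exposes an overload of KMeans.train that sets the number of parallel
    // runs and the initialization mode ("k-means||" or "random"), e.g.:
    // val model2 = KMeans.train(parsedData, numClusters, numIterations, 1, KMeans.K_MEANS_PARALLEL)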
    // Print the cluster centers of the model
    println("Cluster centers:")
    for (c <- model.clusterCenters) {
      println("  " + c.toString)
    }
    // Evaluate the model by the sum of squared errors (the model's cost on the training set)
    val cost = model.computeCost(parsedData)
    println("Within Set Sum of Squared Errors = " + cost)
    // Use the model to classify individual test points
    println("Vector 0.2 0.2 0.2 belongs to cluster: " +
      model.predict(Vectors.dense("0.2 0.2 0.2".split(' ').map(_.toDouble)))) // 1
    println("Vector 0.25 0.25 0.25 belongs to cluster: " +
      model.predict(Vectors.dense("0.25 0.25 0.25".split(' ').map(_.toDouble))))
    println("Vector 8 8 8 belongs to cluster: " +
      model.predict(Vectors.dense("8 8 8".split(' ').map(_.toDouble))))
    // Cross evaluation 1: return only the predicted cluster for each point
    // (testdata is the same as parsedData)
    val testdata = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble)))
    val result1 = model.predict(testdata)
    // result1.saveAsTextFile("/usr/local/spark/spark-data/data/class8/result_kmeans1")
    result1.foreach(println)
    // Cross evaluation 2: return each data point together with its predicted cluster
    // val result2 = data.map { line =>
    //   val lineVector = Vectors.dense(line.split(' ').map(_.toDouble))
    //   val prediction = model.predict(lineVector)
    //   line + " " + prediction
    // }.saveAsTextFile("/usr/local/spark/spark-data/data/class8/result_kmeans2")
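    // Persisting the trained model (a sketch; KMeansModel.save/load exist since
    // Spark 1.4 and require importing org.apache.spark.mllib.clustering.KMeansModel;
    // the output path below is hypothetical):
    // model.save(sc, "/usr/local/spark/spark-data/data/class8/kmeans_model")
    // val sameModel = KMeansModel.load(sc, "/usr/local/spark/spark-data/data/class8/kmeans_model")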
    sc.stop()
  }
}
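For reference, kmeans_data.txt holds the six space-separated points listed in the comments above, one per line:

0.0 0.0 0.0
0.1 0.1 0.1
0.2 0.2 0.2
9.0 9.0 9.0
9.1 9.1 9.1
9.2 9.2 9.2

To run on a cluster instead of local mode, package the class into a jar and submit it with spark-submit (the jar path is a placeholder, as in the comments above):

spark-submit --class class8.Kmeans --master spark://moon:7077 /path/to/jarfile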