
Section 7: Spark Data Splitting

Key Points

Data splitting function: data.randomSplit
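Before the full case below, a minimal self-contained sketch of the call itself (the synthetic one-column DataFrame and the seed value 42 are illustrative assumptions, not part of the case):

import org.apache.spark.sql.SparkSession

object RandomSplitSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("randomSplit Sketch").getOrCreate()
    import spark.implicits._

    // A tiny synthetic DataFrame of 100 rows, used only to demonstrate the call
    val df = (1 to 100).toDF("value")

    // The weights are relative proportions, not exact row counts;
    // passing a seed makes the split reproducible across runs
    val Array(train, test) = df.randomSplit(Array(0.8, 0.2), seed = 42L)
    println(s"train=${train.count()}, test=${test.count()}")

    spark.stop()
  }
}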

1. Case 1

package chapterFour

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

case class NEW(id: String, name: String, url: String, age: String)
object MyDataSplit {

  def main(args: Array[String]): Unit = {

    Logger.getLogger("org").setLevel(Level.ERROR)
    Logger.getLogger("akka").setLevel(Level.ERROR)

    // Set up the SparkSession used for interactions with Spark
    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("Data Splitting")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()
    import spark.implicits._

    // TODO: read the file
    // Dataset: http://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip
    val data = spark.read
      .csv("E:\\Spark\\sparkml\\MLDemo\\src\\main\\resources\\data\\newsCorpora.csv")
      .toDF("id", "name", "age", "url")

    // TODO: check the row count
    val rowCount = data.count()
    data.show(5)
    println("Original RowCount=" + rowCount)

    // TODO: split the DataFrame with randomSplit (80% training, 20% test)
    val splitData = data.randomSplit(Array(0.8, 0.2))

    // TODO: get the training set and the test set
    val trainingSet = splitData(0)
    val testSet = splitData(1)

    val trainingSetCount = trainingSet.count()
    val testSetCount = testSet.count()
    println("trainingSet RowCount=" + trainingSetCount)
    println("testSet RowCount=" + testSetCount)
    println("Combined RowCount=" + (trainingSetCount + testSetCount))

    spark.stop()
  }
}
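Two behaviors of randomSplit are worth noting: the requested 0.8/0.2 proportions are only approximate, since each row is assigned independently and the counts vary from run to run, and the weights array can have any length, which is the usual way to carve out a validation set as well. A minimal sketch of a seeded three-way split (the synthetic DataFrame, the 6/2/2 weights, and the seed are illustrative assumptions; Spark normalizes weights that do not sum to 1):

import org.apache.spark.sql.SparkSession

object ThreeWaySplit {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("Three-Way Split").getOrCreate()
    import spark.implicits._

    val df = (1 to 1000).toDF("id")

    // Three relative weights yield three Datasets; the fixed seed makes the split repeatable
    val Array(train, validation, test) = df.randomSplit(Array(6.0, 2.0, 2.0), seed = 12345L)
    println(s"train=${train.count()}, validation=${validation.count()}, test=${test.count()}")

    spark.stop()
  }
}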