Section 7: Spark Data Splitting
阿新 · Published 2021-06-10
Key points
Data-splitting function: data.randomSplit
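randomSplit takes an array of weights and an optional random seed, and returns an array of DataFrames, one per weight; Spark normalizes the weights if they do not sum to 1. A minimal sketch of the call, assuming an existing SparkSession named spark (the toy DataFrame and the seed value are illustrative, not part of the example below):

import spark.implicits._

// Toy DataFrame just to demonstrate the call
val df = Seq((1, "a"), (2, "b"), (3, "c"), (4, "d"), (5, "e")).toDF("id", "value")

// 80/20 split; the fixed seed makes the split reproducible across runs
val Array(train, test) = df.randomSplit(Array(0.8, 0.2), seed = 42L)
println(s"train=${train.count()}, test=${test.count()}")

Because each row is assigned to a split independently, the resulting counts only approximate the 80/20 ratio, especially on small data.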
1. Example 1
package chapterFour

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

// Declared in the original example but not used below
case class NEW(id: String, name: String, url: String, age: String)

object MyDataSplit {
  def main(args: Array[String]): Unit = {
    // Silence Spark's and Akka's verbose logging
    Logger.getLogger("org").setLevel(Level.ERROR)
    Logger.getLogger("akka").setLevel(Level.ERROR)

    // Set up the SparkSession used for all interactions with Spark
    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("Data Splitting")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()
    import spark.implicits._

    // Read the file.
    // Dataset: http://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip
    // newsCorpora.csv in that archive is tab-separated with 8 columns:
    // ID, TITLE, URL, PUBLISHER, CATEGORY, STORY, HOSTNAME, TIMESTAMP
    val data = spark.read
      .option("sep", "\t")
      .csv("E:\\Spark\\sparkml\\MLDemo\\src\\main\\resources\\data\\newsCorpora.csv")
      .toDF("id", "title", "url", "publisher", "category", "story", "hostname", "timestamp")

    // Check the size of the original data
    val rowCount = data.count()
    data.show(5)
    println("Original RowCount=" + rowCount)

    // Split the DataFrame with randomSplit
    val splitData = data.randomSplit(Array(0.8, 0.2))

    // Extract the training set and the test set
    val trainingSet = splitData(0)
    val testSet = splitData(1)
    val trainingSetCount = trainingSet.count()
    val testSetCount = testSet.count()
    println("trainingSet RowCount=" + trainingSetCount)
    println("testSet RowCount=" + testSetCount)
    println("Combined RowCount=" + (trainingSetCount + testSetCount))

    spark.stop()
  }
}
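Note that the example calls randomSplit without a seed, so the train/test partition changes on every run. For reproducible experiments, or when a validation set is also needed, the same call extends naturally; a sketch under the same setup (the weights and seed below are arbitrary illustrative choices):

// Three-way train/validation/test split with a fixed seed for reproducibility.
// Weights need not sum to 1; Spark normalizes them internally.
val Array(trainingData, validationData, testData) =
  data.randomSplit(Array(7.0, 1.5, 1.5), seed = 1234L)
println("training=" + trainingData.count() +
  ", validation=" + validationData.count() +
  ", test=" + testData.count())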