Spark DataFrame常用API
阿新 • • 發佈:2020-08-20
Spark DataFrame常用API
package com.imooc.bigdata.chapter04 import org.apache.spark.sql.{DataFrame, SparkSession} object DataFrameAPIApp { def main(args: Array[String]): Unit = { val spark = SparkSession.builder().master("local").appName("DataFrameAPIApp").getOrCreate() import spark.implicits._ val people: DataFrame = spark.read.json("E:\\06-work\\03-java\\01-JavaCodeDome\\SparkSqlCode\\sparksql-train\\data\\people.json") people.printSchema() // 檢視DF的內部結構:列名、列的資料型別、是否可以為空 people.show() // 展示出DF內部的資料 // TODO... DF裡面有兩列,只要name列 ==> select name from people people.select("name").show() people.select($"name").show() // TODO... select * from people where age > 21 people.filter($"age" > 21).show() people.filter("age > 21").show() // TODO... select age, count(1) from people group by age people.groupBy("age").count().show() // TODO... select name,age+10 from people people.select($"name", ($"age"+10).as("new_age")).show() // TODO... 使用SQL的方式操作 people.createOrReplaceTempView("people") spark.sql("select name from people where age > 21").show() val zips: DataFrame = spark.read.json("E:\\06-work\\03-java\\01-JavaCodeDome\\SparkSqlCode\\sparksql-train\\data\\zips.json") zips.printSchema() // 檢視schema資訊 /** * 1)loc的資訊沒用展示全,超過一定長度就使用...來展示 * 2)只顯示了前20條 * show() ==> show(20) ==> show(numRows, truncate = true) */ zips.show(10, false) zips.head(3).foreach(println) zips.first() zips.take(5) val count: Long = zips.count() println(s"Total Counts: $count") // 過濾出大於40000,withColumnRenamed:欄位重新命名 zips.filter(zips.col("pop") > 40000).withColumnRenamed("_id","new_id").show(10,false) import org.apache.spark.sql.functions._ // 統計加州pop最多的10個城市名稱和ID desc是一個內建函式 zips.select("_id","city","pop","state").filter(zips.col("state") === "CA").orderBy(desc("pop")).show(10,false) zips.createOrReplaceTempView("zips") spark.sql("select _id,city,pop,state from zips where state='CA' order by pop desc limit 10").show() spark.stop() } }