Spark DataFrame Common APIs

package com.imooc.bigdata.chapter04

import org.apache.spark.sql.{DataFrame, SparkSession}

object DataFrameAPIApp {

  def main(args: Array[String]): Unit = {

    val spark = SparkSession.builder().master("local").appName("DataFrameAPIApp").getOrCreate()
    import spark.implicits._


    val people: DataFrame = spark.read.json("E:\\06-work\\03-java\\01-JavaCodeDome\\SparkSqlCode\\sparksql-train\\data\\people.json")

    people.printSchema()  // inspect the DataFrame's structure: column names, column data types, nullability

    people.show() // display the data held in the DataFrame

    // TODO... the DF has two columns; we only want the name column ==> select name from people
    people.select("name").show()
    people.select($"name").show()
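    // Several columns can be selected in one call ==> select name, age from people
    people.select($"name", $"age").show()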

    // TODO...  select * from people where age > 21
    people.filter($"age" > 21).show()
    people.filter("age > 21").show()
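    // Column predicates compose with && / ||; for example, also guard against null ages
    // (the null check is just a defensive sketch, not required by the query above)
    people.filter($"age".isNotNull && $"age" > 21).show()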

    // TODO... select age, count(1) from people group by age
    people.groupBy("age").count().show()
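    // count() produces a "count" column, so the groups can be ordered by size directly
    people.groupBy("age").count().orderBy($"count".desc).show()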

    // TODO... select name,age+10 from people
    people.select($"name", ($"age" + 10).as("new_age")).show()
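    // Alternatively, withColumn keeps all existing columns and appends the derived one
    people.withColumn("new_age", $"age" + 10).show()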


    // TODO... the same operations expressed in SQL
    people.createOrReplaceTempView("people")
    spark.sql("select name from people where age > 21").show()
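    // The temp view also supports the earlier aggregation in SQL form
    spark.sql("select age, count(1) as cnt from people group by age").show()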


    val zips: DataFrame = spark.read.json("E:\\06-work\\03-java\\01-JavaCodeDome\\SparkSqlCode\\sparksql-train\\data\\zips.json")
    zips.printSchema()  // inspect the schema

    /**
      * By default, show() truncates the output:
      * 1) long values such as loc are not shown in full; anything beyond a certain length is displayed as ...
      * 2) only the first 20 rows are shown
      * show() ==> show(20) ==> show(numRows, truncate = true)
      */
    zips.show(10, false)

    zips.head(3).foreach(println)
    zips.first()
    zips.take(5)
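    // head(n)/take(n) return an Array[Row] and first() a single Row to the driver;
    // as an illustration, pull one field out of the first row (assuming city is a string column)
    val firstCity: String = zips.first().getAs[String]("city")
    println(s"First city: $firstCity")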

    val count: Long = zips.count()
    println(s"Total Counts: $count")

    // filter for pop > 40000; withColumnRenamed renames a column
    zips.filter(zips.col("pop") > 40000).withColumnRenamed("_id", "new_id").show(10, false)
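    // drop removes columns; for example, hide the verbose loc column before showing the result
    zips.filter(zips.col("pop") > 40000).drop("loc").show(10, false)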


    import org.apache.spark.sql.functions._
    // top 10 California cities by population (name and ID); desc is a built-in function
    zips.select("_id", "city", "pop", "state").filter(zips.col("state") === "CA").orderBy(desc("pop")).show(10, false)
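    // With the functions import in scope, aggregations can also be written on the DataFrame directly,
    // e.g. a sketch of the total population per state, largest first
    zips.groupBy("state").agg(sum("pop").as("total_pop")).orderBy(desc("total_pop")).show(10, false)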

    zips.createOrReplaceTempView("zips")
    spark.sql("select _id,city,pop,state from zips where state='CA' order by pop desc limit 10").show()
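    // ... and its SQL equivalent over the zips temp view
    spark.sql("select state, sum(pop) as total_pop from zips group by state order by total_pop desc limit 10").show()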


    spark.stop()
  }
}