1. 程式人生 > >Spark core分組取topN案例

Spark core分組取topN案例

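描述:在HDFS上有訂單資料order.txt檔案,檔案欄位的分割符號為",",樣本資料如下:

Order_00001,Pdt_01,222.8
Order_00001,Pdt_05,25.8
Order_00002,Pdt_03,522.8
Order_00002,Pdt_04,122.4
Order_00002,Pdt_05,722.4
Order_00003,Pdt_01,222.8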

其中欄位依次表示訂單id,商品id,交易額

問題:使用sparkcore,求每個訂單中成交額最大的商品id

程式碼如下:

package com.company.sparkcore

import org.apache.spark.{SparkConf, SparkContext}

/**
 * Spark Core job that reports, for every order, the item with the largest
 * transaction amount.
 *
 * Each input line is expected to look like `orderId,itemId,total`, e.g.
 * `Order_00002,Pdt_05,722.4`.
 */
object TopOrderItem {

  import scala.util.Try

  /** Input location used when no path is passed on the command line. */
  private val DefaultInputPath = "file:///e:/order.txt"

  /**
   * Parses one comma-separated order line.
   *
   * @param line raw text line in the form `orderId,itemId,total`
   * @return `Some((orderId, (itemId, total)))` when the line has at least three
   *         fields and the third one is a valid number; `None` otherwise, so that
   *         blank or malformed lines are skipped instead of failing the whole job
   */
  private def parseOrderLine(line: String): Option[(String, (String, Double))] =
    line.split(",") match {
      case Array(orderId, itemId, total, _*) =>
        Try(total.toDouble).toOption.map(amount => (orderId, (itemId, amount)))
      case _ => None
    }

  /**
   * Entry point.
   *
   * @param args optional; `args(0)` overrides the input path, otherwise
   *             [[DefaultInputPath]] is read
   */
  def main(args: Array[String]): Unit = {
    val inputPath = args.headOption.getOrElse(DefaultInputPath)
    val conf = new SparkConf().setAppName("top n order and item").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      // Keyed records look like: (Order_00002, (Pdt_05, 722.4))
      val orderItems = sc.textFile(inputPath)
        .flatMap(line => parseOrderLine(line).iterator)

      // reduceByKey keeps only the running maximum per order, so it avoids
      // materialising and sorting every item of an order as groupByKey would.
      // On equal totals the item seen first is kept.
      val topItemPerOrder = orderItems.reduceByKey { (best, candidate) =>
        if (candidate._2 > best._2) candidate else best
      }

      // Example output for the sample data (string text is emitted verbatim):
      //   ...Order_00003 ... Pdt_01
      //   ...Order_00002 ... Pdt_05
      //   ...Order_00001 ... Pdt_01
      topItemPerOrder.foreach { case (orderId, (itemId, _)) =>
        println("最大成交額的訂單ID為:" + orderId + " ,對應的商品ID為:" + itemId)
      }
    } finally {
      // Always release the Spark context, even when the job fails.
      sc.stop()
    }
  }

}