Spark core分組取topN案例
阿新 • • 發佈:2018-12-17
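描述:在HDFS上有訂單資料檔案 order.txt,欄位之間以逗號","分隔,樣本資料如下(原文的樣本資料遺失,以下依程式碼註解中的分組結果還原,行的順序僅供參考):
Order_00001,Pdt_01,222.8
Order_00001,Pdt_05,25.8
Order_00002,Pdt_03,522.8
Order_00002,Pdt_04,122.4
Order_00002,Pdt_05,722.4
Order_00003,Pdt_01,222.8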
其中欄位依次表示訂單id,商品id,交易額
問題:使用 Spark Core,求出每個訂單中成交額最大的商品 id
程式碼如下:
package com.company.sparkcore

import org.apache.spark.{SparkConf, SparkContext}

import scala.util.Try

/**
 * Finds, for every order, the item with the largest transaction amount.
 *
 * Each input line is expected to look like `orderId,itemId,amount`.
 * Lines that have fewer than three comma-separated fields, or whose third
 * field is not a number, are skipped instead of aborting the whole job.
 */
object TopOrderItem {

  /** Input file used when no path is passed on the command line. */
  private val DefaultInputPath = "file:///e:/order.txt"

  /**
   * Program entry point.
   *
   * @param args optional; `args(0)` overrides the input path, otherwise
   *             [[DefaultInputPath]] is read
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("top n order and item").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      val inputPath = args.headOption.getOrElse(DefaultInputPath)

      // reduceByKey keeps only the running maximum per order, so the full
      // list of items of an order is never shuffled, materialised or sorted.
      // On equal amounts the left operand wins; which record that is depends
      // on the order in which Spark combines the values.
      val topItemPerOrder = sc.textFile(inputPath)
        .flatMap(line => parseOrderLine(line).toList)
        .reduceByKey((left, right) => if (left._2 >= right._2) left else right)

      // For the sample data this reports Pdt_01 for Order_00003,
      // Pdt_05 for Order_00002 and Pdt_01 for Order_00001.
      topItemPerOrder.foreach { case (orderId, (itemId, _)) =>
        println(s"最大成交額的訂單ID為:${orderId} ,對應的商品ID為:${itemId}")
      }
    } finally {
      // Always release the context, even when the job fails.
      sc.stop()
    }
  }

  /**
   * Parses one `orderId,itemId,amount` line.
   *
   * @param line raw text line from the input file
   * @return `Some((orderId, (itemId, amount)))`, or `None` when the line has
   *         fewer than three fields or the amount is not a valid number
   */
  private def parseOrderLine(line: String): Option[(String, (String, Double))] =
    line.split(",") match {
      case Array(orderId, itemId, total, _*) =>
        Try(total.toDouble).toOption.map(amount => (orderId, (itemId, amount)))
      case _ =>
        None
    }
}