Spark 演算法練習兩則
阿新 • • 發佈:2019-01-01
練習一:求最大最小值
在檔案中獲取資料求浮點數和整數的最大值和最小值
1,2.33,4,1.55,2.56,
55,55,55,23.77,1.88987,0.3324,
22.567,5.5567,7.8895,33
import org.apache.spark.{SparkConf, SparkContext}

/** Exercise 1: find the maximum and minimum of the comma-separated numbers in a file. */
object demo01 {
  def main(args: Array[String]): Unit = {
    println("求最大最小值")
    val conf = new SparkConf().setMaster("local").setAppName("minmax")
    val sc = new SparkContext(conf)
    val data = sc.textFile("D:\\resource\\minmax.txt", 1)
    // Split every line on commas; drop empty tokens left by trailing commas
    // (e.g. "1,2.33,") before parsing.
    val nums = data
      .flatMap(_.split(","))
      .map(_.trim)
      .filter(_.nonEmpty)
      .map(_.toDouble)
    // Use distributed RDD reductions. The original mutated driver-local vars
    // (minn/maxn) inside a traversal of the RDD; on a real cluster the closure
    // gets a copy of those vars per executor, so the driver never sees the
    // updates — it only appeared to work because the master was "local".
    if (!nums.isEmpty()) {
      println("最大值:" + nums.max() + ", 最小值:" + nums.min())
    }
    sc.stop()
  }
}
練習二:求爺孫關係
janet winnie
winnie poul
sam dida
helen janet
helen jack
jack salon
salon rose
rose tom
jack nicole
sam flitter
janet sam
sam sisi
/**
 * Exercise 2: find grandparent/grandchild pairs.
 * Input lines are "parent child"; for every (a, b) and (b, c), prints a as
 * the grandparent of c.
 */
def ancesor(sc: SparkContext): Unit = {
  val data = sc.textFile("D:\\resource\\ancesor.txt", 2)
  val cps = data.map { line =>
    val parts = line.split(" ")
    (parts(0), parts(1))
  }
  // Collect all pairs to the driver and group children by parent.
  // The original used collectAsMap(), which keeps only ONE child per parent
  // (Map semantics) — e.g. sam's three children collapsed to one, silently
  // dropping grandparent pairs.
  val childrenByParent: Map[String, Array[String]] =
    cps.collect().groupBy(_._1).map { case (p, arr) => (p, arr.map(_._2)) }
  // For each (grandparent, parent), emit every child of that parent.
  cps.foreach { case (grand, parent) =>
    childrenByParent.getOrElse(parent, Array.empty[String]).foreach { grandchild =>
      println("祖輩:" + grand + ",孫輩:" + grandchild)
    }
  }
}
練習三:排序
對csv檔案進行排序
name,score
helen,40
tom,50
mary,69
ben,60
sasa,70
marier,76
dida,78
/** Exercise 3: sort a CSV file (columns: name,score) by score, descending. */
object demo01 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("sort1").setMaster("local")
    val sc = new SparkContext(conf)
    val sqc = new SQLContext(sc)
    // header=true: the first CSV line holds the column names.
    // (.toDF() after read.csv was redundant — csv already returns a DataFrame.)
    val data2 = sqc.read.option("header", "true").csv("d:\\demo.csv")
    data2.createOrReplaceTempView("tb_score")
    // CSV columns come in as strings, so a plain "order by score desc" sorts
    // lexicographically (e.g. "9" > "10"); cast to int for a numeric sort.
    sqc.sql("select * from tb_score order by cast(score as int) desc").show()
  }
}
對普通檔案進行排序
hello 2
say 4
dida 5
discuss 6
subway 10
/** Exercise 3b: sort "word count" text lines by the numeric count. */
object demo01 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("sort2").setMaster("local")
    val sc = new SparkContext(conf)
    val sqc = new SQLContext(sc)
    // Lines look like "hello 2": keep the word and parse the count as Int
    // so the sort is numeric rather than lexicographic.
    val data = sc.textFile("d:\\demo2.txt").map { line =>
      val parts = line.split(" ")
      (parts(0), parts(1).toInt)
    }
    val df = sqc.createDataFrame(data).toDF("name", "num")
    // Method 1: DataFrame API. The original called orderBy("num"), which is
    // ASCENDING even though its comment promised descending — fixed with .desc.
    df.orderBy(df("num").desc).show()
    // Method 2: register a temp view and sort via SQL, where the direction
    // can be stated explicitly (asc/desc).
    df.createOrReplaceTempView("tb_s")
    sqc.sql("select * from tb_s order by num").show()
  }
}
對json檔案進行排序
{"id":1, "name":"leo", "age":18}
{"id":2, "name":"jack", "age":19}
{"id":3, "name":"marry", "age":17}
/** Exercise 3c: sort a JSON-lines file by id via Spark SQL. */
object demo01 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("json")
    val sc = new SparkContext(conf)
    val sqc = new SQLContext(sc)
    // createOrReplaceTempView returns Unit; the original bound it to an
    // unused `val jdata`, which was misleading — just register the view.
    sqc.read.json("d:/demo3.json").createOrReplaceTempView("demo3")
    sqc.sql("select id,name,age from demo3 order by id").show()
  }
}
練習四:二次排序
hello 2 23
baby 2 44
hello 1 22
hello 3 55
nice 2 58
kitty 3 66
apple 1 44
使用SparkSQL
/** Exercise 4 (Spark SQL): secondary sort — ascending by class, descending by score. */
object demo01 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("second")
    val sc = new SparkContext(conf)
    val sqc = new SQLContext(sc)
    // Lines look like "hello 2 23": (name, class, score). Parse class and
    // score as Int — the original kept them as strings, so "order by" was
    // lexicographic (e.g. "9" would sort after "10").
    val data = sc.textFile("d:/demo4.txt").map { line =>
      val parts = line.split(" ")
      (parts(0), parts(1).toInt, parts(2).toInt)
    }
    sqc.createDataFrame(data).toDF("name", "class", "score")
      .createOrReplaceTempView("demo4")
    sqc.sql("select * from demo4 order by class, score desc").show()
  }
}
使用Spark
//用於排序的類,注意第一行的寫法
/**
 * Sort key for the secondary sort: records order ascending by class and,
 * within the same class, descending by score. Serializable so Spark can
 * ship instances to executors.
 */
class record(val clss: Int, val score: Int) extends Ordered[record] with Serializable {
  def compare(other: record): Int =
    if (clss == other.clss) other.score.compareTo(score) // same class: higher score first
    else clss.compareTo(other.clss)                      // otherwise: ascending by class
}
//用於處理的Spark
/**
 * Exercise 4 (plain Spark): secondary sort of "name class score" lines using
 * the `record` key class, printing each sorted line with spaces replaced by commas.
 */
object demo01 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("second")
    val sc = new SparkContext(conf)
    // Key each line by a record(class, score); record's Ordered implementation
    // drives the order used by sortByKey.
    val keyed = sc.textFile("d:/demo4.txt").map { line =>
      val fields = line.split(" ")
      (new record(fields(1).toInt, fields(2).toInt), line)
    }
    val sorted = keyed.sortByKey(true)
    sorted
      .map { case (_, line) => line.replace(" ", ",") }
      .foreach(println)
  }
}
練習五:倒排索引
搜尋引擎的索引法則
/**
 * Exercise 5: inverted index — maps each word to the comma-joined, de-duplicated
 * list of file names (without extension) that contain it.
 */
object demo01 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("second")
    val sc = new SparkContext(conf)
    sc.wholeTextFiles("D:\\resource\\daopai", 2)
      .flatMap { case (path, content) =>
        // File name = last path segment with its 4-char extension stripped
        // (assumes extensions like ".txt" — TODO confirm for other suffixes).
        val file = path.split("/").last.dropRight(4)
        // Normalize Windows line endings to spaces, then split into words.
        content.split("\r\n").mkString(" ").split(" ").map(word => (word, file))
      }
      // (The original had a no-op identity .map(x => (x._1, x._2)) here; removed.)
      .groupByKey()
      .map { case (word, files) => (word, files.toList.distinct.mkString(",")) }
      .foreach(println)
  }
}
練習六:??