Real-Time Website Clickstream Statistics with Spark Streaming and Kafka
阿新 · Published: 2018-12-25
- Install and configure ZooKeeper
- Install and configure Kafka
- Start ZooKeeper
- Start Kafka
- Create the topic:
bin/kafka-topics.sh --create --zookeeper node1.itcast.cn:2181,node2.itcast.cn:2181 \
--replication-factor 3 --partitions 3 --topic urlcount
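With the topic created, the streaming job needs click data to consume. The sketch below is a minimal Scala producer that pushes a few access-log-style lines into the urlcount topic; the broker address and the sample log lines are assumptions for illustration, not part of the original setup.

import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

object LogProducer {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("bootstrap.servers", "node1.itcast.cn:9092") // assumed broker address; adjust to your cluster
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    val producer = new KafkaProducer[String, String](props)
    // Hypothetical access-log lines: the clicked URL sits in field 7 (index 6),
    // which is what UrlCount later extracts with x.split(" ")(6)
    val lines = Seq(
      "1.1.1.1 - - [25/Dec/2018:10:00:00] \"GET\" 200 /index.html",
      "2.2.2.2 - - [25/Dec/2018:10:00:01] \"GET\" 200 /cart.html"
    )
    lines.foreach(l => producer.send(new ProducerRecord[String, String]("urlcount", l)))
    producer.close()
  }
}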
package cn.itcast.spark.streaming

import org.apache.spark.{HashPartitioner, SparkConf}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object UrlCount {
  // For each key, sum the counts from the current batch and add the previous running total
  val updateFunc = (iterator: Iterator[(String, Seq[Int], Option[Int])]) => {
    iterator.flatMap { case (x, y, z) => Some(y.sum + z.getOrElse(0)).map(n => (x, n)) }
  }

  def main(args: Array[String]) {
    // Read the command-line arguments
    val Array(zkQuorum, groupId, topics, numThreads, hdfs) = args
    // Create a SparkConf and set the application name
    val conf = new SparkConf().setAppName("UrlCount")
    // Create a StreamingContext with a 2-second batch interval
    val ssc = new StreamingContext(conf, Seconds(2))
    // Set the checkpoint directory (required by updateStateByKey)
    ssc.checkpoint(hdfs)
    // Build the topic map: topic name -> number of consumer threads
    val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
    // Pull data from Kafka and create a DStream
    val lines = KafkaUtils.createStream(ssc, zkQuorum, groupId, topicMap, StorageLevel.MEMORY_AND_DISK).map(_._2)
    // Split each line and extract the URL the user clicked (field 7 of the access log)
    val urls = lines.map(x => (x.split(" ")(6), 1))
    // Maintain a running click count per URL
    val result = urls.updateStateByKey(updateFunc, new HashPartitioner(ssc.sparkContext.defaultParallelism), true)
    // Print the results to the console
    result.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
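The heart of the job is updateFunc, which updateStateByKey invokes once per batch: for every URL it receives the counts that arrived in the current batch (y) and the previously stored total (z), and emits the updated total. A small sketch with hypothetical inputs shows the semantics:

// Hypothetical batch: /index.html got 3 new clicks on top of a stored total of 10;
// /cart.html is seen for the first time, so its previous state is None
val batch = Iterator(
  ("/index.html", Seq(1, 1, 1), Some(10)),
  ("/cart.html",  Seq(1),       None)
)
// Prints List((/index.html,13), (/cart.html,1))
println(UrlCount.updateFunc(batch).toList)

To run the job, submit the assembled jar with five arguments in the order zkQuorum groupId topics numThreads hdfs, for example the ZooKeeper quorum from the topic-creation command above, a consumer group name, the topic urlcount, a thread count, and an HDFS checkpoint path; the group name and checkpoint path are placeholders you choose for your cluster.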