Spark Kafka(createDirectStream)自己管理offset
阿新 • • 發佈:2019-02-09
4、使用Java來管理offset
// Note: this class must be placed in exactly this package
// (presumably because KafkaCluster is not accessible from outside Spark's own packages - confirm).
package org.apache.spark.streaming.kafka;

import kafka.common.TopicAndPartition;
import kafka.message.MessageAndMetadata;
import kafka.serializer.StringDecoder;
import org.apache.spark.SparkException;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;
import scala.collection.JavaConversions;
import scala.collection.mutable.ArrayBuffer;
import scala.util.Either;

import java.io.Serializable;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

/**
 * Creates a Kafka direct stream whose consumer offsets are stored and updated
 * explicitly through {@code KafkaCluster} instead of being left to Spark.
 *
 * @author wei
 * @date 10/24/17
 */
public class JavaKafkaManager implements Serializable {

    private final scala.collection.immutable.Map<String, String> kafkaParams;
    private final KafkaCluster kafkaCluster;

    /**
     * @param kafkaParams Kafka configuration; it is converted to an immutable Scala map
     *                    and used to build the underlying {@code KafkaCluster}
     */
    public JavaKafkaManager(Map<String, String> kafkaParams) {
        this.kafkaParams = toScalaImmutableMap(kafkaParams);
        this.kafkaCluster = new KafkaCluster(this.kafkaParams);
    }

    /**
     * Creates a direct stream that starts from the offsets stored for the consumer group.
     * Before reading them, the stored offsets are initialised or repaired by
     * {@link #setOrUpdateOffsets(Set, String)}.
     *
     * @param jssc        streaming context the stream is attached to
     * @param kafkaParams Kafka parameters; "group.id" is read from this map
     * @param topics      topics to consume
     * @return a stream of message values
     * @throws SparkException if partitions or consumer offsets cannot be fetched
     */
    public JavaInputDStream<String> createDirectStream(
            JavaStreamingContext jssc,
            Map<String, String> kafkaParams,
            Set<String> topics) throws SparkException {

        String groupId = kafkaParams.get("group.id");

        // Bring the stored offsets into a usable state before they are read back below.
        setOrUpdateOffsets(topics, groupId);

        // Read the stored offsets and start consuming from them.
        scala.collection.immutable.Set<String> immutableTopics =
                JavaConversions.asScalaSet(topics).toSet();
        scala.collection.immutable.Set<TopicAndPartition> partitions =
                rightOrThrow(kafkaCluster.getPartitions(immutableTopics),
                        "get kafka partition failed");
        scala.collection.immutable.Map<TopicAndPartition, Object> storedOffsets =
                rightOrThrow(kafkaCluster.getConsumerOffsets(groupId, partitions),
                        "get kafka consumer offsets failed");

        // The Java API of KafkaUtils expects boxed Long values, so copy into a typed map.
        Map<TopicAndPartition, Long> fromOffsets = new HashMap<TopicAndPartition, Long>();
        for (Map.Entry<TopicAndPartition, Object> entry
                : JavaConversions.mapAsJavaMap(storedOffsets).entrySet()) {
            fromOffsets.put(entry.getKey(), (Long) entry.getValue());
        }

        return KafkaUtils.createDirectStream(
                jssc,
                String.class,
                String.class,
                StringDecoder.class,
                StringDecoder.class,
                String.class,
                kafkaParams,
                fromOffsets,
                new Function<MessageAndMetadata<String, String>, String>() {
                    @Override
                    public String call(MessageAndMetadata<String, String> v) throws Exception {
                        return v.message();
                    }
                });
    }

    /**
     * Before the stream is created, initialises or repairs the stored consumer offsets
     * of every topic according to what has actually been consumed.
     *
     * @param topics  topics to check
     * @param groupId consumer group whose offsets are maintained
     * @throws SparkException if partitions or leader offsets cannot be fetched
     */
    private void setOrUpdateOffsets(Set<String> topics, String groupId) throws SparkException {
        for (String topic : topics) {
            HashSet<String> topicSet = new HashSet<>();
            topicSet.add(topic);
            scala.collection.immutable.Set<String> immutableTopic =
                    JavaConversions.asScalaSet(topicSet).toSet();

            scala.collection.immutable.Set<TopicAndPartition> partitions =
                    rightOrThrow(kafkaCluster.getPartitions(immutableTopic),
                            "get kafka partition failed");

            Either<ArrayBuffer<Throwable>, scala.collection.immutable.Map<TopicAndPartition, Object>> consumerOffsetsE =
                    kafkaCluster.getConsumerOffsets(groupId, partitions);

            // A failed lookup is treated as "this group has never consumed the topic".
            if (consumerOffsetsE.isLeft()) {
                initializeOffsets(groupId, partitions);
            } else {
                resetStaleOffsets(groupId, partitions, consumerOffsetsE.right().get());
            }
        }
    }

    /**
     * Handles a topic that has been consumed before.
     *
     * If the job fails with kafka.common.OffsetOutOfRangeException, the stored offsets are
     * stale: Kafka's retention clean-up has already deleted the log segments containing them.
     * A stored offset smaller than the earliest leader offset is therefore replaced by the
     * earliest leader offset. Only the stale partitions are rewritten, because possibly just
     * some of the partitions are affected.
     */
    private void resetStaleOffsets(
            String groupId,
            scala.collection.immutable.Set<TopicAndPartition> partitions,
            scala.collection.immutable.Map<TopicAndPartition, Object> consumerOffsets) throws SparkException {

        scala.collection.immutable.Map<TopicAndPartition, KafkaCluster.LeaderOffset> earliestLeaderOffsets =
                rightOrThrow(kafkaCluster.getEarliestLeaderOffsets(partitions),
                        "get earliest leader offsets failed");

        HashMap<TopicAndPartition, Object> offsets = new HashMap<>();
        for (Map.Entry<TopicAndPartition, Object> entry
                : JavaConversions.mapAsJavaMap(consumerOffsets).entrySet()) {
            TopicAndPartition key = entry.getKey();
            long n = (Long) entry.getValue();
            long earliestLeaderOffset = earliestLeaderOffsets.get(key).get().offset();
            if (n < earliestLeaderOffset) {
                System.out.println("consumer group:" + groupId + ",topic:" + key.topic()
                        + ",partition:" + key.partition()
                        + " offsets已經過時,更新為" + earliestLeaderOffset);
                offsets.put(key, earliestLeaderOffset);
            }
        }

        if (!offsets.isEmpty()) {
            kafkaCluster.setConsumerOffsets(groupId, toScalaImmutableMap(offsets));
        }
    }

    /**
     * Handles a topic that has never been consumed by the group: stores the earliest leader
     * offsets when "auto.offset.reset" is "smallest" (case-insensitive), otherwise - including
     * when the setting is absent - the latest leader offsets.
     */
    private void initializeOffsets(
            String groupId,
            scala.collection.immutable.Set<TopicAndPartition> partitions) throws SparkException {

        // The setting is optional; an absent value must not blow up with None.get.
        scala.Option<String> configuredReset = kafkaParams.get("auto.offset.reset");
        boolean fromEarliest = configuredReset.isDefined()
                && "smallest".equals(configuredReset.get().toLowerCase());

        scala.collection.immutable.Map<TopicAndPartition, KafkaCluster.LeaderOffset> leaderOffsets;
        if (fromEarliest) {
            leaderOffsets = rightOrThrow(kafkaCluster.getEarliestLeaderOffsets(partitions),
                    "get earliest leader offsets failed");
        } else {
            leaderOffsets = rightOrThrow(kafkaCluster.getLatestLeaderOffsets(partitions),
                    "get latest leader offsets failed");
        }

        Map<TopicAndPartition, Object> offsets = new HashMap<>();
        for (Map.Entry<TopicAndPartition, KafkaCluster.LeaderOffset> entry
                : JavaConversions.mapAsJavaMap(leaderOffsets).entrySet()) {
            offsets.put(entry.getKey(), entry.getValue().offset());
        }

        kafkaCluster.setConsumerOffsets(groupId, toScalaImmutableMap(offsets));
    }

    /**
     * Stores the end offset of every offset range of the given batch for the consumer group.
     * A failed update is only reported on standard output; it does not abort the batch.
     *
     * @param rdd a batch RDD of the direct stream; it is cast to {@code HasOffsetRanges}
     */
    public void updateZKOffsets(JavaRDD<String> rdd) {
        String groupId = kafkaParams.get("group.id").get();

        OffsetRange[] offsetRanges = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
        for (OffsetRange offset : offsetRanges) {
            TopicAndPartition topicAndPartition =
                    new TopicAndPartition(offset.topic(), offset.partition());
            Map<TopicAndPartition, Object> offsets = new HashMap<>();
            offsets.put(topicAndPartition, offset.untilOffset());

            Either<ArrayBuffer<Throwable>, scala.collection.immutable.Map<TopicAndPartition, Object>> o =
                    kafkaCluster.setConsumerOffsets(groupId, toScalaImmutableMap(offsets));
            if (o.isLeft()) {
                System.out.println("Error updating the offset to Kafka cluster: " + o.left().get());
            }
        }
    }

    /**
     * Returns the right-hand value of a KafkaCluster result, or throws a SparkException
     * whose message contains the collected errors.
     *
     * @param result outcome of a KafkaCluster call
     * @param what   description of the failed action, used as the message prefix
     */
    private static <T> T rightOrThrow(Either<ArrayBuffer<Throwable>, T> result, String what)
            throws SparkException {
        if (result.isLeft()) {
            // Java has no string interpolation: the errors must be concatenated explicitly.
            throw new SparkException(what + ": " + result.left().get());
        }
        return result.right().get();
    }

    /**
     * Converts a java.util.Map into an immutable Scala Map.
     *
     * @param javaMap map to convert
     * @param <K>     key type
     * @param <V>     value type
     * @return an immutable Scala map with the same entries
     */
    @SuppressWarnings("unchecked")
    private static <K, V> scala.collection.immutable.Map<K, V> toScalaImmutableMap(java.util.Map<K, V> javaMap) {
        final java.util.List<scala.Tuple2<K, V>> list = new java.util.ArrayList<>(javaMap.size());
        for (final java.util.Map.Entry<K, V> entry : javaMap.entrySet()) {
            list.add(scala.Tuple2.apply(entry.getKey(), entry.getValue()));
        }
        final scala.collection.Seq<Tuple2<K, V>> seq =
                scala.collection.JavaConverters.asScalaBufferConverter(list).asScala().toSeq();
        return (scala.collection.immutable.Map<K, V>) scala.collection.immutable.Map$.MODULE$.apply(seq);
    }
}
import org.apache.spark.SparkConf; import org.apache.spark.SparkException; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.function.VoidFunction; import org.apache.spark.streaming.Durations; import org.apache.spark.streaming.api.java.JavaInputDStream; import org.apache.spark.streaming.api.java.JavaStreamingContext; import org.apache.spark.streaming.kafka.JavaKafkaManager; import java.util.HashMap; import java.util.HashSet; import java.util.Map; /** * Created by weiw\ on 10/24/17. */ public class KafkaManagerDemo { public static void main(String[] args) throws SparkException, InterruptedException { SparkConf sparkConf = new SparkConf().setAppName(KafkaManagerDemo.class.getName()); sparkConf.setMaster("local[3]"); sparkConf.set("spark.streaming.kafka.maxRatePerPartition", "5"); sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf); JavaStreamingContext javaStreamingContext = new JavaStreamingContext(javaSparkContext, Durations.seconds(5)); javaStreamingContext.sparkContext().setLogLevel("WARN"); String brokers = "localhost:9092"; String topics = "finance_test2"; String groupId = "test22"; HashSet<String> topcisSet = new HashSet<>(); topcisSet.add(topics); Map<String,String> kafkaParams = new HashMap<>(); kafkaParams.put("metadata.broker.list", brokers); kafkaParams.put("group.id", groupId); kafkaParams.put("auto.offset.reset", "smallest"); JavaKafkaManager javaKafkaManager = new JavaKafkaManager(kafkaParams); JavaInputDStream<String> message = javaKafkaManager.createDirectStream(javaStreamingContext, kafkaParams, topcisSet); message.transform(new Function<JavaRDD<String>, JavaRDD<String>>() { @Override public JavaRDD<String> call(JavaRDD<String> v1) throws Exception { return v1; } }).foreachRDD(new VoidFunction<JavaRDD<String>>() { @Override 
public void call(JavaRDD<String> rdd) throws Exception { System.out.println(rdd); if (!rdd.isEmpty()){ rdd.foreach(new VoidFunction<String>() { @Override public void call(String r) throws Exception { System.out.println(r); } }); javaKafkaManager.updateZKOffsets(rdd); } } }); javaStreamingContext.start(); javaStreamingContext.awaitTermination(); } }
5、使用Scala來管理offset
package org.apache.spark.streaming.kafka

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.Decoder
import org.apache.spark.SparkException
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaCluster.LeaderOffset

import scala.reflect.ClassTag

/**
 * Creates Kafka direct streams whose consumer offsets are managed explicitly
 * through `KafkaCluster`.
 *
 * @param kafkaParams Kafka configuration used to build the `KafkaCluster`; also the source
 *                    of "auto.offset.reset" and, in `updateZKOffsets`, of "group.id"
 */
class KafkaManager(val kafkaParams: Map[String, String]) extends Serializable {

  private val kc = new KafkaCluster(kafkaParams)

  /**
   * Creates a direct stream that starts from the offsets stored for the consumer group.
   * The stored offsets are initialised or repaired first, see `setOrUpdateOffsets`.
   *
   * @param ssc         streaming context the stream is attached to
   * @param kafkaParams Kafka parameters; "group.id" is read from this map
   * @param topics      topics to consume
   * @return a stream of (key, message) pairs
   * @throws SparkException if "group.id" is missing or partitions / offsets cannot be fetched
   */
  def createDirectStream[K: ClassTag, V: ClassTag, KD <: Decoder[K]: ClassTag, VD <: Decoder[V]: ClassTag](
      ssc: StreamingContext,
      kafkaParams: Map[String, String],
      topics: Set[String]): InputDStream[(K, V)] = {
    val groupId = requireGroupId(kafkaParams)

    // Bring the stored offsets into a usable state before they are read back below.
    setOrUpdateOffsets(topics, groupId)

    // Read the stored offsets and start consuming from them.
    val partitions = orFail(kc.getPartitions(topics), "get kafka partition failed")
    val consumerOffsets =
      orFail(kc.getConsumerOffsets(groupId, partitions), "get kafka consumer offsets failed")

    KafkaUtils.createDirectStream[K, V, KD, VD, (K, V)](
      ssc, kafkaParams, consumerOffsets, (mmd: MessageAndMetadata[K, V]) => (mmd.key, mmd.message))
  }

  /**
   * Before the stream is created, initialises or repairs the stored consumer offsets
   * of every topic according to what has actually been consumed.
   *
   * @param topics  topics to check
   * @param groupId consumer group whose offsets are maintained
   */
  private def setOrUpdateOffsets(topics: Set[String], groupId: String): Unit = {
    topics.foreach { topic =>
      val partitions = orFail(kc.getPartitions(Set(topic)), "get kafka partition failed")

      kc.getConsumerOffsets(groupId, partitions) match {
        case Right(consumerOffsets) =>
          // Consumed before.
          // If the job fails with kafka.common.OffsetOutOfRangeException, the stored offsets
          // are stale: Kafka's retention clean-up has already deleted the log segments that
          // contained them. A stored offset smaller than the earliest leader offset is
          // therefore replaced by the earliest leader offset.
          val earliestLeaderOffsets =
            orFail(kc.getEarliestLeaderOffsets(partitions), "get earliest leader offsets failed")

          // Possibly only some partitions are stale, so only those are rewritten.
          val staleOffsets: Map[TopicAndPartition, Long] = consumerOffsets.collect {
            case (tp, n) if n < earliestLeaderOffsets(tp).offset =>
              tp -> earliestLeaderOffsets(tp).offset
          }
          staleOffsets.foreach { case (tp, earliestLeaderOffset) =>
            println("consumer group:" + groupId + ",topic:" + tp.topic + ",partition:" +
              tp.partition + " offsets已經過時,更新為" + earliestLeaderOffset)
          }
          if (staleOffsets.nonEmpty) {
            kc.setConsumerOffsets(groupId, staleOffsets)
          }

        case Left(_) =>
          // Never consumed: a failed lookup is treated as "no offsets stored yet".
          // Start from the earliest offsets for "smallest", otherwise from the latest.
          val reset = kafkaParams.get("auto.offset.reset").map(_.toLowerCase)
          val leaderOffsets: Map[TopicAndPartition, LeaderOffset] =
            if (reset.exists(_ == "smallest")) {
              orFail(kc.getEarliestLeaderOffsets(partitions), "get earliest leader offsets failed")
            } else {
              orFail(kc.getLatestLeaderOffsets(partitions), "get latest leader offsets failed")
            }
          val offsets = leaderOffsets.map { case (tp, leaderOffset) => (tp, leaderOffset.offset) }
          kc.setConsumerOffsets(groupId, offsets)
      }
    }
  }

  /**
   * Stores the end offset of every offset range of the given batch for the consumer group.
   * A failed update is only printed; it does not abort the batch.
   *
   * The element type is irrelevant here, so RDDs of any key/value type produced by
   * `createDirectStream` are accepted.
   *
   * @param rdd a batch RDD of the direct stream; it is cast to `HasOffsetRanges`
   * @throws SparkException if "group.id" is missing from the constructor parameters
   */
  def updateZKOffsets(rdd: RDD[_]): Unit = {
    val groupId = requireGroupId(kafkaParams)
    val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges

    offsetRanges.foreach { range =>
      val topicAndPartition = TopicAndPartition(range.topic, range.partition)
      kc.setConsumerOffsets(groupId, Map(topicAndPartition -> range.untilOffset)) match {
        case Left(errs) => println(s"Error updating the offset to Kafka cluster: $errs")
        case Right(_)   => ()
      }
    }
  }

  /** Reads "group.id" from `params`, failing with a descriptive error when it is absent. */
  private def requireGroupId(params: Map[String, String]): String =
    params.getOrElse("group.id",
      throw new SparkException("kafka parameter 'group.id' is required but was not set"))

  /** Unwraps a successful KafkaCluster result or throws a SparkException listing the errors. */
  private def orFail[A](result: Either[Iterable[Throwable], A], what: String): A =
    result match {
      case Right(value) => value
      case Left(errs)   => throw new SparkException(s"$what: $errs")
    }
}
import kafka.serializer.StringDecoder
import org.apache.spark.rdd.RDD
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.KafkaManager
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
* Created by root on 10/24/17.
*/
object SparkKafkaStreaming {

  /**
   * Processes one batch: counts its records and prints the resulting (1, count) pair.
   *
   * @param rdd batch of (key, message) pairs
   */
  def processRdd(rdd: RDD[(String, String)]): Unit = {
    // Every record is mapped to the same key, so the reduction yields the batch size.
    val counts = rdd.map(_ => (1, 1)).reduceByKey(_ + _)
    counts.foreach(println)
  }

  /**
   * Entry point. Expects `<brokers> <topics> <groupid>`; any further arguments are ignored.
   */
  def main(args: Array[String]): Unit = {
    if (args.length < 3) {
      System.err.println(
        s"""
           |Usage: DirectKafkaWordCount <brokers> <topics> <groupid>
           | <brokers> is a list of one or more Kafka brokers
           | <topics> is a list of one or more kafka topics to consume from
           | <groupid> is a consume group
           |
           |""".stripMargin)
      System.exit(1)
    }
    Logger.getLogger("org").setLevel(Level.WARN)

    // Only the first three arguments are used; destructuring `args` directly would
    // throw a MatchError as soon as extra arguments are supplied.
    val Array(brokers, topics, groupId) = args.take(3)

    // Create context with 5 second batch interval
    val sparkConf = new SparkConf().setAppName("DirectKafkaWordCount")
    sparkConf.setMaster("local[3]")
    sparkConf.set("spark.streaming.kafka.maxRatePerPartition", "5")
    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val ssc = new StreamingContext(sparkConf, Seconds(5))
    ssc.sparkContext.setLogLevel("WARN")

    // Create direct kafka stream with brokers and topics
    val topicsSet = topics.split(",").toSet
    val kafkaParams = Map[String, String](
      "metadata.broker.list" -> brokers,
      "group.id" -> groupId,
      "auto.offset.reset" -> "smallest"
    )
    val km = new KafkaManager(kafkaParams)
    val messages = km.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topicsSet)

    messages.foreachRDD { rdd =>
      if (!rdd.isEmpty()) {
        // Process the messages first ...
        processRdd(rdd)
        // ... then update the offsets.
        km.updateZKOffsets(rdd)
      }
    }

    ssc.start()
    ssc.awaitTermination()
  }
}