1. 程式人生 > >kafka&&sparkstreaming整合入門之Wordcount

Kafka 與 Spark Streaming 整合入門之 WordCount

/**
 * @author Mr.lu
 * @Title: KafkaStreamingWordCount
 * @ProjectName spark-scala
 * @Description: Word count over Kafka topics using Spark Streaming
 * @date 2018/11/19:16:58
 */
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;

import org.apache.spark.*;
import org.apache.spark.api.java.function.*;
import org.apache.spark.streaming.*;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaPairReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;

import scala.Tuple2;

import com.google.common.collect.Lists;
public class KafkaStreamingWordCount {

    public static void main(String[] args) throws InterruptedException {
        //設定匹配模式,以空格分隔
        final Pattern SPACE = Pattern.compile(" ");
        //接收資料的地址和埠
        String zkQuorum = "localhost:2181";
        //話題所在的組
        String group = "1";
        //話題名稱以“,”分隔
        String topics = "top1,top2";
        //每個話題的分片數
        int numThreads = 2;
        SparkConf sparkConf = new SparkConf().setAppName("KafkaWordCount").setMaster("local[2]");
        JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(10000));
//        jssc.checkpoint("checkpoint"); //設定檢查點
        //存放話題跟分片的對映關係
        Map<String, Integer> topicmap = new HashMap();
        String[] topicsArr = topics.split(",");
        int n = topicsArr.length;
        for(int i=0;i<n;i++){
            topicmap.put(topicsArr[i], numThreads);
        }
        //從Kafka中獲取資料轉換成RDD
        JavaPairReceiverInputDStream<String, String> lines = KafkaUtils.createStream(jssc, zkQuorum, group, topicmap);
        //從話題中過濾所需資料
        JavaDStream<String> words = lines.flatMap(new FlatMapFunction<Tuple2<String, String>, String>() {

            //@Override
            public Iterable<String> call(Tuple2<String, String> arg0)
                    throws Exception {
                return Lists.newArrayList(SPACE.split(arg0._2));
            }
        });
        //對其中的單詞進行統計
        JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
                new PairFunction<String, String, Integer>() {
                    //@Override
                    public Tuple2<String, Integer> call(String s) {
                        return new Tuple2<String, Integer>(s, 1);
                    }
                }).reduceByKey(new Function2<Integer, Integer, Integer>() {
            //@Override
            public Integer call(Integer i1, Integer i2) {
                return i1 + i2;
            }
        });
        //列印結果
        wordCounts.print();
        jssc.start();
        jssc.awaitTermination();

    }

}

 

相關依賴

<!-- Kafka integration -->
        <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming-kafka -->
        <!-- NOTE(review): this artifact carries the _2.10 suffix and version 1.0.0, while the
             Spark artifacts further down carry the _2.11 suffix and version 2.3.0, and
             scala-library is 2.11.7. Confirm that the intended Scala and Spark versions
             line up across all of these entries. -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka_2.10</artifactId>
            <version>1.0.0</version>
        </dependency>
        <!-- Kafka broker library -->
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka_2.11</artifactId>
            <version>1.0.0</version>
        </dependency>
        <!-- Kafka client library -->
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>1.0.0</version>
        </dependency>
        <!-- Spark Streaming dependency -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>2.3.0</version>
        </dependency>
        <!-- Spark core dependency -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.3.0</version>
        </dependency>
        <!-- Scala library dependency -->
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>2.11.7</version>
        </dependency>