Spark WordCount and Sorting Hot Search Terms from Logs
Preface
A Spark word count whose results are sorted by word count in descending order; building on that, we derive a ranking of the hottest search terms in a query log.
WordCount sorted by word count
If anything is unclear, print the output of each step and the effect becomes obvious. Note: once the data has been sorted, do not sort it again in later steps.
/**
 * Simulates word count, sorted by word count in descending order.
 */
public static void sortBysearch() {
    JavaRDD<String> lines = sc.parallelize(Arrays.asList("ahello", "bwod", "grq", "grq", "grq"));
    // Count each word, then sort alphabetically by word (key).
    JavaPairRDD<String, Integer> reduceByKey = lines.mapToPair(t -> new Tuple2<String, Integer>(t, 1)).reduceByKey((a, b) -> a + b).sortByKey();
    System.out.println(reduceByKey.collect());
    // [(ahello,1), (bwod,1), (grq,3)]
    // Swap (word, count) to (count, word) and sort by count descending.
    // This is the sort that matters; do not sort again after this.
    JavaPairRDD<Integer, String> secondStepRdd = reduceByKey.mapToPair(t -> new Tuple2<Integer, String>(t._2, t._1)).sortByKey(false);
    System.out.println(secondStepRdd.collect());
    // [(3,grq), (1,ahello), (1,bwod)]
    // Swap back to (word, count); the descending order by count is preserved.
    JavaPairRDD<String, Integer> resultRdd = secondStepRdd.mapToPair(t -> new Tuple2<String, Integer>(t._2, t._1));
    System.out.println(resultRdd.collect());
    // [(grq,3), (ahello,1), (bwod,1)]
}
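As a side note, if only the top N words are needed rather than a fully sorted RDD, the swap-sort-swap dance can be skipped. Below is a minimal sketch, not part of the original code, reusing the reduceByKey pair RDD from above; the top2 name and the limit of 2 are ours, and the comparator must be serializable, hence the intersection cast:
import java.io.Serializable;
import java.util.Comparator;
import java.util.List;

// Take the 2 pairs with the highest counts, ordered descending by count.
List<Tuple2<String, Integer>> top2 = reduceByKey.takeOrdered(2,
        (Comparator<Tuple2<String, Integer>> & Serializable) (a, b) -> Integer.compare(b._2, a._2));
System.out.println(top2);
// e.g. [(grq,3), (ahello,1)] -- order among the count-1 words may vary
Unlike collect(), takeOrdered only ships N elements to the driver, which matters once the vocabulary gets large.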
Data source
The data I used has the format shown below (a parsing sketch follows the sample). If the data set is too large, `head` is a convenient way to slice off a sample.
20111230000005 57375476989eea12893c0c3811607bcf 奇藝高清 1 1 http://www.qiyi.com/
20111230000005 66c5bb7774e31d0a22278249b26bc83a 凡人修仙傳 3 1 http://www.booksky.org/BookDetail.aspx?BookID=1050804&Level=1
20111230000007 b97920521c78de70ac38e3713f524b50 本本聯盟 1 1 http://www.bblianmeng.com/
20111230000008 6961d0c97fe93701fc9c0d861d096cd9 華南師範大學圖書館 1 1 http://lib.scnu.edu.cn/
20111230000008 f2f5a21c764aebde1e8afcc2871e086f 線上代理 2 1 http://proxyie.cn/
20111230000009 96994a0480e7e1edcaef67b20d8816b7 偉大導演 1 1 http://movie.douban.com/review/1128960/
20111230000009 698956eb07815439fe5f46e9a4503997 youku 1 1 http://www.youku.com/
20111230000009 599cd26984f72ee68b2b6ebefccf6aed 安徽合肥365房產網 1 1 http://hf.house365.com/
20111230000010 f577230df7b6c532837cd16ab731f874 哈薩克網址大全 1 1 http://www.kz321.com/
20111230000010 285f88780dd0659f5fc8acc7cc4949f2 IQ數碼 1 1 http://www.iqshuma.com/
20111230000010 f4ba3f337efb1cc469fcd0b34feff9fb 推薦待機時間長的手機 1 1 http://mobile.zol.com.cn/148/1487938.html
20111230000010 3d1acc7235374d531de1ca885df5e711 滿江紅 1 1 http://baike.baidu.com/view/6500.htm
20111230000010 dbce4101683913365648eba6a85b6273 游標下載 1 1 http://zhidao.baidu.com/question/38626533
20111230000011 58e7d0caec23bcb4daa7bbcc4d37f008 張國立的電視劇 2 1 http://tv.sogou.com/vertical/2xc3t6wbuk24jnphzlj35zy.html?p=40230600
20111230000011 a3b83dc38b2bbc35660dffcab4ed9da8 吹暖花開性吧 1 1 http://www.7183.info/
20111230000011 b89952902d7821db37e8999776b32427 怎麼罵一個人不帶髒字 1 1 http://wenwen.soso.com/z/q131927207.htm
20111230000011 7c54c43f3a8a0af0951c26d94a57d6c8 百度一下 你就知道 1 1 http://www.baidu.com/
20111230000011 2d6c22c084a501c0b8f7f0a845aefd9f 快播爽吧 5 1 http://www.dy241.com/
20111230000011 11097724dae8b9fdcc60bd6fa4ce4df2 118相簿 2 1 http://118123.net/
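Each line appears to follow the standard Sogou query-log layout: six tab-separated fields. A minimal parsing sketch under that assumption (the field names are ours, and the meanings of fields 3 and 4 are assumed from the usual Sogou format; sortLogByKey() below only uses field index 2, the query term):
String line = "20111230000005\t57375476989eea12893c0c3811607bcf\t奇藝高清\t1\t1\thttp://www.qiyi.com/";
String[] fields = line.split("\t");
String time   = fields[0]; // timestamp of the query
String userId = fields[1]; // hashed user id
String query  = fields[2]; // search term -- the key the job counts
String rank   = fields[3]; // rank of the clicked URL in the results (assumed)
String order  = fields[4]; // sequence number of the user's click (assumed)
String url    = fields[5]; // clicked URL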
Sorting result
Code
import java.io.File;
import java.io.IOException;
import java.util.Arrays;

import org.apache.commons.io.FileUtils;
import org.apache.ibatis.io.Resources;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.joda.time.DateTime;
import scala.Tuple2;
/**
 * @Title: SouGou.java
 * @Package cn.zhuzi.sparksp01
 * @Description: Sogou query log ranking
 * @author grq
 * @version Created: 2018-11-21 11:03:30
 *
 */
public class SouGou {
    static SparkSession sparkSession;
    static JavaSparkContext sc;

    static {
        if (sparkSession == null) {
            sparkSession = buildSparkSession();
            sc = new JavaSparkContext(sparkSession.sparkContext());
        }
    }
    public static void main(String[] args) throws IOException {
        // sortLogByKey();
        sortBysearch();
    }
    private static void sortLogByKey() throws IOException {
        String filePath = Resources.getResourceAsFile("data/txt/20.TXT").getAbsolutePath();
        JavaRDD<String> fileStrRdd = sc.textFile(filePath);
        // Keep only well-formed lines: 6 tab-separated fields.
        JavaRDD<String> filter = fileStrRdd.filter(t -> t.split("\t").length == 6);
        // Field index 2 is the query term; pair each query with a count of 1.
        JavaPairRDD<String, Integer> mapToPair = filter.mapToPair(t -> new Tuple2<String, Integer>(t.split("\t")[2], 1));
        // Sum the counts, swap to (count, query), sort descending, swap back.
        JavaPairRDD<String, Integer> resultRDD = mapToPair.reduceByKey((a, b) -> a + b).mapToPair(t -> new Tuple2<Integer, String>(t._2, t._1)).sortByKey(false).mapToPair(t -> new Tuple2<String, Integer>(t._2, t._1));
        File file = FileUtils.getFile("E:/had/spark/out/a_wc" + new DateTime().toString("yyyyMMdd_HHmm_ss"));
        resultRDD.saveAsTextFile(file.getAbsolutePath());
    }
    /**
     * Simulates word count, sorted by word count in descending order.
     */
    public static void sortBysearch() {
        JavaRDD<String> lines = sc.parallelize(Arrays.asList("ahello", "bwod", "grq", "grq", "grq"));
        // Count each word, then sort alphabetically by word (key).
        JavaPairRDD<String, Integer> reduceByKey = lines.mapToPair(t -> new Tuple2<String, Integer>(t, 1)).reduceByKey((a, b) -> a + b).sortByKey();
        System.out.println(reduceByKey.collect());
        // [(ahello,1), (bwod,1), (grq,3)]
        // Swap to (count, word) and sort by count descending.
        // This is the sort that matters; do not sort again after this.
        JavaPairRDD<Integer, String> secondStepRdd = reduceByKey.mapToPair(t -> new Tuple2<Integer, String>(t._2, t._1)).sortByKey(false);
        System.out.println(secondStepRdd.collect());
        // [(3,grq), (1,ahello), (1,bwod)]
        // Swap back to (word, count); descending order by count is preserved.
        JavaPairRDD<String, Integer> resultRdd = secondStepRdd.mapToPair(t -> new Tuple2<String, Integer>(t._2, t._1));
        System.out.println(resultRdd.collect());
        // [(grq,3), (ahello,1), (bwod,1)]
    }
    /**
     * Builds the session the way the official example does.
     */
    public static SparkSession buildSparkSession() {
        SparkSession sparkSession = SparkSession.builder().appName("JavaSparkPi")
                // .master("spark://hadoop:7077") remote cluster address
                .master("local").getOrCreate();
        return sparkSession;
    }
}
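Note that saveAsTextFile writes a directory of part files under the timestamped path, not a single file. For a quick sanity check one could print the hottest queries instead of saving; a sketch reusing resultRDD from sortLogByKey() (needs java.util.List), where take(10) suffices because the RDD is already sorted descending by count:
List<Tuple2<String, Integer>> top10 = resultRDD.take(10);
top10.forEach(t -> System.out.println(t._1 + "\t" + t._2)); // query <tab> count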