ElasticSearch_仿京東搜尋
阿新 • • 發佈:2020-11-02
仿京東搜尋
目錄本部落格內容根據狂神說整理
https://www.bilibili.com/video/BV17a4y1x7zq
1. 爬蟲
首先明確資料從哪裡來
- 資料庫中獲取
- 訊息佇列中獲取
- 爬蟲
- ......
爬取資料 : 獲取請求返回的頁面資訊, 篩選出我們想要的資訊就可以了!
匯入 JSoup 依賴, 用來解析網頁 (如果要爬取電影、音樂等檔案, 則改用 tika)
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup --> <dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.13.1</version> </dependency>
1. 測試使用爬蟲
package com.wang.wangesjd.utils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.io.IOException; import java.net.URL; public class HtmlParseUtil { public static void main(String[] args) throws IOException { //獲取請求 https://search.jd.com/Search?keyword=java //前提: 需要聯網, 而且不能獲取到AJAX! String url = "https://search.jd.com/Search?keyword=java"; //設定超時時間 30S int timeOut = 30000; //解析網頁 ==> Document就是瀏覽器的Document物件 Document document = Jsoup.parse(new URL(url), timeOut); //所有你在JS中可以使用的方法, 這裡都能用! Element element = document.getElementById("J_goodsList"); // System.out.println(element.html()); //獲取所有的li元素 Elements elements = element.getElementsByTag("li"); //獲取元素中的內容, 這裡的el就是每一個li標籤了 for (Element el : elements) { //關於這種圖片特別多的網站, 所有的圖片都是延遲載入的! //JD 放在了這個class data-lazy-img String img = el.getElementsByTag("img").eq(0).attr("data-lazy-img"); String price = el.getElementsByClass("p-price").eq(0).text(); String title = el.getElementsByClass("p-name").eq(0).text(); System.out.println("==================================================="); System.out.println(img); System.out.println(price); System.out.println(title); } } }
- 注意
- JD 的商品圖片是延遲載入 (lazy load) 的, 並不是反爬蟲: 伺服器返回的原始 HTML 把圖片地址放在 data-lazy-img 屬性中, 而 JSoup 不會執行頁面上的 JS, 因此要獲取的屬性名和我們在瀏覽器前端除錯時看到的不一樣
2. 提取工具類
package com.wang.wangesjd.utils; import com.wang.wangesjd.pojo.Content; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.io.IOException; import java.net.URL; import java.net.URLEncoder; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; public class HtmlParseUtil { public static void main(String[] args) throws IOException { //URL會將符號轉義! // HtmlParseUtil.parseJD("C%2B%2B").forEach(System.out::println); //查詢中文需要URL轉碼 // HtmlParseUtil.parseJD("心理學").forEach(System.out::println); HtmlParseUtil.parseJD("C++").forEach(System.out::println); } public static List<Content> parseJD(String keywords) throws IOException { //URL會對符號和漢字轉碼 //要先轉碼再拼接, 否則URL無法解析 (因為會將url中的符號也一起轉碼, 無法識別) String urlKeywords = URLEncoder.encode(keywords, "UTF-8"); //獲取請求 https://search.jd.com/Search?keyword=java //前提: 需要聯網, 而且不能獲取到AJAX! String url ="https://search.jd.com/Search?keyword=" + urlKeywords + "&enc=utf-8"; //設定超時時間 30S int timeOut = 30000; //解析網頁 ==> Document就是瀏覽器的Document物件 Document document = Jsoup.parse(new URL(url), timeOut); //所有你在JS中可以使用的方法, 這裡都能用! Element element = document.getElementById("J_goodsList"); //獲取所有的li元素 Elements elements = element.getElementsByTag("li"); List<Content> goodsList = new ArrayList<>(); //獲取元素中的內容, 這裡的el就是每一個li標籤了 for (Element el : elements) { //關於這種圖片特別多的網站, 所有的圖片都是延遲載入的! //JD 放在了這個class data-lazy-img String img = el.getElementsByTag("img").eq(0).attr("data-lazy-img"); String price = el.getElementsByClass("p-price").eq(0).text(); String title = el.getElementsByClass("p-name").eq(0).text(); Content content = new Content(); content.setImg(img) .setPrice(price) .setTitle(title); goodsList.add(content); } return goodsList; } }
- 注意
- URL解析時會轉義符號和中文, 因此如果我們想傳遞中文或者符號的關鍵字, 需要先轉義
- 不能將拼接後的url轉義, 這樣會導致URL中正常的符號也被轉義, 導致無法識別, 正確的做法是先將被拼接的轉義, 再拼接即可
3. 編寫實體類和業務層
1. 實體類
package com.wang.wangesjd.pojo;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.Accessors;
/**
 * One product scraped from a search result page.
 *
 * <p>Lombok generates the getters, chainable setters, {@code equals}/{@code hashCode}/
 * {@code toString}, a no-argument constructor and an all-arguments constructor whose
 * parameters follow the field order below.
 */
@NoArgsConstructor
@AllArgsConstructor
@Accessors(chain = true)
@Data
public class Content {

    /** Image URL of the product. */
    private String img;

    /** Price text as it appears on the page. */
    private String price;

    /** Product title. */
    private String title;
}
2. 業務層
這裡有個小坑 ==> 工具類必須交給 Spring 管理 (例如加上 @Component) 才能用 @Autowired 注入; 而靜態方法屬於類本身, 不依賴注入的例項, 直接用 類名.方法名 (HtmlParseUtil.parseJD(...)) 呼叫即可
package com.wang.wangesjd.service;
import com.alibaba.fastjson.JSON;
import com.wang.wangesjd.pojo.Content;
import com.wang.wangesjd.utils.HtmlParseUtil;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.query.MatchQueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.index.query.TermQueryBuilder;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightField;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.stereotype.Service;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
 * Business logic of the demo: crawls goods into Elasticsearch and searches them again.
 */
@Service
public class ContentService {

    /** Name of the Elasticsearch index that stores the crawled goods. */
    private static final String INDEX = "jd_goods";

    /** Document field that is searched and highlighted. */
    private static final String TITLE_FIELD = "title";

    @Autowired
    @Qualifier("restHighLevelClient")
    private RestHighLevelClient rest;

    @Autowired
    private HtmlParseUtil htmlParseUtil;

    /**
     * Crawls the goods for {@code keywords} and bulk-indexes them into the {@code jd_goods} index.
     *
     * @param keywords raw search keywords handed to the crawler
     * @return {@code true} if every crawled item was indexed without failure; {@code false} if
     *         the crawl returned nothing or at least one bulk item failed
     * @throws IOException if crawling or the Elasticsearch request fails
     */
    public Boolean parseContent(String keywords) throws IOException {
        List<Content> contents = htmlParseUtil.parseJD(keywords);

        // A bulk request without any action is rejected by the client's request validation,
        // so report "nothing indexed" instead of sending it.
        if (contents == null || contents.isEmpty()) {
            return false;
        }

        BulkRequest bulkRequest = new BulkRequest();
        bulkRequest.timeout(TimeValue.timeValueMinutes(2L));
        for (Content content : contents) {
            bulkRequest.add(new IndexRequest(INDEX)
                    .source(JSON.toJSONString(content), XContentType.JSON));
        }

        BulkResponse bulk = rest.bulk(bulkRequest, RequestOptions.DEFAULT);
        return !bulk.hasFailures();
    }

    /**
     * Searches the indexed goods by title and returns one page of hits with the matched
     * terms in the title wrapped in a red {@code <span>}.
     *
     * @param keyword  text to match against the {@code title} field
     * @param pageNo   1-based page number; values below 1 are treated as 1
     * @param pageSize number of hits per page
     * @return the source of every hit on the requested page; {@code title} holds the
     *         highlighted HTML when Elasticsearch returned a highlight for it
     * @throws IOException if the Elasticsearch request fails
     */
    public List<Map<String, Object>> searchPage(String keyword, int pageNo, int pageSize) throws IOException {
        if (pageNo <= 1) {
            pageNo = 1;
        }

        SearchRequest searchRequest = new SearchRequest(INDEX);
        SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();

        // Full-text match: the keyword is analyzed before it is compared with the title.
        MatchQueryBuilder matchQueryBuilder = QueryBuilders.matchQuery(TITLE_FIELD, keyword);
        sourceBuilder.query(matchQueryBuilder)
                .timeout(TimeValue.timeValueMinutes(1L));

        // Highlighting: wrap matched terms of the title in a red <span>.
        HighlightBuilder highlightBuilder = new HighlightBuilder();
        highlightBuilder.field(TITLE_FIELD)
                .preTags("<span style='color:red'>")
                .postTags("</span>")
                .requireFieldMatch(false); // highlight even if the query targeted another field
        sourceBuilder.highlighter(highlightBuilder);

        // Paging: "from" is the offset of the first hit, not a page number.
        sourceBuilder.from((pageNo - 1) * pageSize)
                .size(pageSize);

        searchRequest.source(sourceBuilder);
        SearchResponse searchResponse = rest.search(searchRequest, RequestOptions.DEFAULT);

        List<Map<String, Object>> list = new ArrayList<>();
        for (SearchHit hit : searchResponse.getHits()) {
            // Original document, without highlighting.
            Map<String, Object> sourceAsMap = hit.getSourceAsMap();

            HighlightField title = hit.getHighlightFields().get(TITLE_FIELD);
            if (title != null) {
                // Join the highlight fragments and use them in place of the plain title.
                StringBuilder highlightTitle = new StringBuilder();
                for (Text fragment : title.fragments()) {
                    highlightTitle.append(fragment);
                }
                sourceAsMap.put(TITLE_FIELD, highlightTitle.toString());
            }
            list.add(sourceAsMap);
        }
        return list;
    }
}
4. 頁面跳轉
package com.wang.wangesjd.controller;
import com.wang.wangesjd.service.ContentService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.RestController;
import java.io.IOException;
import java.util.List;
import java.util.Map;
/**
 * REST endpoints of the demo; every call is delegated to {@link ContentService}.
 */
@RestController
public class ContentController {

    @Autowired
    private ContentService contentService;

    /**
     * Crawls the goods for the given keyword and stores them in the search index.
     *
     * @param keyword keyword taken from the request path
     * @return the result reported by {@link ContentService#parseContent(String)}
     * @throws IOException propagated from the service
     */
    @GetMapping("/parse/{keyword}")
    public Boolean parse(@PathVariable String keyword) throws IOException {
        Boolean indexed = contentService.parseContent(keyword);
        return indexed;
    }

    /**
     * Returns one page of search hits for the given keyword.
     *
     * @param keyword  keyword taken from the request path
     * @param pageNo   page number taken from the request path
     * @param pageSize page size taken from the request path
     * @return the hits reported by {@link ContentService#searchPage(String, int, int)}
     * @throws IOException propagated from the service
     */
    @GetMapping("/search/{keyword}/{pageNo}/{pageSize}")
    public List<Map<String, Object>> search(
            @PathVariable(name = "keyword") String keyword,
            @PathVariable(name = "pageNo") int pageNo,
            @PathVariable(name = "pageSize") int pageSize) throws IOException {
        List<Map<String, Object>> hits = contentService.searchPage(keyword, pageNo, pageSize);
        return hits;
    }
}
使用RestFul風格
5. 前端
<!DOCTYPE html>
<html xmlns:th="http://www.thymeleaf.org">
<head>
<meta charset="utf-8"/>
<title>ES仿京東實戰</title>
<link rel="stylesheet" th:href="@{/css/style.css}"/>
<!-- axios and Vue are loaded from public CDNs; no version is pinned in either URL -->
<script src="https://unpkg.com/axios/dist/axios.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/vue/dist/vue.js"></script>
</head>
<body class="pg">
<!-- Root element the Vue instance is mounted on (el: '#app') -->
<div class="page" id="app">
<div id="mallPage" class=" mallist tmall- page-not-market ">
<!-- Header with the search box -->
<div id="header" class=" header-list-app">
<div class="headerLayout">
<div class="headerCon ">
<!-- Logo -->
<h1 id="mallLogo">
<img th:src="@{/images/jdlogo.png}" alt="">
</h1>
<div class="header-extra">
<!-- Search form -->
<div id="mallSearch" class="mall-search">
<form name="searchTop" class="mallSearch-form clearfix">
<fieldset>
<legend>天貓搜尋</legend>
<div class="mallSearch-input clearfix">
<div class="s-combobox" id="s-combobox-685">
<div class="s-combobox-input-wrap">
<!-- v-model binds this input to the Vue data property "keyword" -->
<input v-model="keyword" type="text" autocomplete="off" value="dd"
id="mq"
class="s-combobox-input" aria-haspopup="true">
</div>
</div>
<!-- .prevent suppresses the default form submit; the click calls searchKey instead -->
<button type="submit" @click.prevent="searchKey" id="searchbtn">搜尋</button>
</div>
</fieldset>
</form>
<ul class="relKeyTop">
<li><a>狂神說Java</a></li>
<li><a>狂神說前端</a></li>
<li><a>狂神說Linux</a></li>
<li><a>狂神說大資料</a></li>
<li><a>狂神聊理財</a></li>
</ul>
</div>
</div>
</div>
</div>
</div>
<!-- Product listing page -->
<div id="content">
<div class="main">
<!-- Brand filter (static placeholder entries) -->
<form class="navAttrsForm">
<div class="attrs j_NavAttrs" style="display:block">
<div class="brandAttr j_nav_brand">
<div class="j_Brand attr">
<div class="attrKey">
品牌
</div>
<div class="attrValues">
<ul class="av-collapse row-2">
<li><a href="#"> 狂神說 </a></li>
<li><a href="#"> Java </a></li>
</ul>
</div>
</div>
</div>
</div>
</form>
<!-- Sort options (static links, no handlers attached) -->
<div class="filter clearfix">
<a class="fSort fSort-cur">綜合<i class="f-ico-arrow-d"></i></a>
<a class="fSort">人氣<i class="f-ico-arrow-d"></i></a>
<a class="fSort">新品<i class="f-ico-arrow-d"></i></a>
<a class="fSort">銷量<i class="f-ico-arrow-d"></i></a>
<a class="fSort">價格<i class="f-ico-triangle-mt"></i><i class="f-ico-triangle-mb"></i></a>
</div>
<!-- Product cards: one card is rendered per entry of the Vue data property "results" -->
<div class="view grid-nosku">
<div class="product" v-for="result in results">
<div class="product-iWrap">
<!-- Product image -->
<div class="productImg-wrap">
<a class="productImg">
<img :src="result.img">
</a>
</div>
<!-- Price -->
<p class="productPrice">
<em><b>¥</b>{{result.price}}</em>
</p>
<!-- Title: the backend sends it as an HTML fragment (highlight markup), hence v-html -->
<p class="productTitle">
<a v-html="result.title"></a>
</p>
<!-- Shop name (static placeholder) -->
<div class="productShop">
<span>店鋪: 狂神說Java </span>
</div>
<!-- Sales information (static placeholder) -->
<p class="productStatus">
<span>月成交<em>999筆</em></span>
<span>評價 <a>3</a></span>
</p>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
<!-- The front end uses Vue and talks to the backend over HTTP, keeping front end and back end separate -->
<script>
    // Vue instance backing the page: holds the search keyword and the result list,
    // and fetches results from the backend search endpoint.
    new Vue({
        el: '#app',
        data: {
            keyword: '', // keyword typed into the search box
            results: []  // hits returned by the backend
        },
        methods: {
            // Requests the first page (10 hits) for the current keyword and binds the response.
            searchKey() {
                const keyword = this.keyword.trim();
                // A blank keyword would produce the path "search//..." which matches no route.
                if (!keyword) {
                    return;
                }
                // The keyword becomes a path segment, so characters such as '#', '?' or '/'
                // must be escaped. Pages are 1-based: the backend treats anything below 1 as 1.
                axios.get('search/' + encodeURIComponent(keyword) + '/1/10')
                    .then(response => {
                        this.results = response.data;
                    })
                    .catch(error => {
                        // Keep the previous results on screen and report the failure.
                        console.error('search request failed', error);
                    });
            }
        }
    })
</script>
</body>
</html>
- 注意
- 由於後端高亮傳回來的標題是一段 html, 不能用 {{ }} 文字插值 (標籤會被轉義, 原樣顯示成文字), 因此要用 v-html 讓瀏覽器把它解析成 html; 要注意 v-html 會直接渲染內容, 資料來源不可信時有 XSS 風險