1. 程式人生 > 實用技巧 >Elasticsearch實現搜尋推薦詞

Elasticsearch實現搜尋推薦詞

本篇介紹的是基於Elasticsearch實現搜尋推薦詞,其中需要用到Elasticsearch的pinyin外掛以及ik分詞外掛,程式碼的實現這裡提供了java跟C#的版本方便大家參考。

1.實現的結果

①當搜尋【qiy】的時候,能匹配企業、祈願等

②當搜尋【qi業】的時候,只能匹配到企業;如果沒有企業,將使用模糊查詢,匹配祈願。

③當搜尋【q業】的時候結果同②。

④當搜尋【企y】或【企ye】的時候結果同②。

⑤當搜尋【qy】的時候,能匹配企業、祈願等。

2.實現的邏輯

中文匹配字首==》全拼匹配字首==》拼音首字母匹配字首==》拼音模糊匹配字首

優先順序從左到右,當前面三個有結果的時候不建議用模糊匹配,這樣結果更加精確。比如需要獲取8個推薦詞,先獲取中文的,如果足夠8個將不再獲取之後的匹配結果。但是當模糊匹配之前已經存在匹配結果了,即使數量沒有達到8個,也不再繼續獲取模糊匹配結果。

3.外掛準備

ik分詞外掛安裝相對簡單,網上教程也多,這裡不做介紹。這裡講解下pinyin外掛,官方版本的拼音外掛不支援中文,處理結果只有拼音的,這樣會出現同音字匹配,結果不準確。

這裡感謝小夥伴分享的拼音外掛修改方法:https://www.cnblogs.com/danvid/p/10691547.html

按照裡面的操作處理後的外掛將實現:

企業畫報:{"qi","企","ye","業","hua","畫","bao","報"}

拼音外掛的各項具體屬性參考:https://blog.csdn.net/a1148233614/article/details/80280024,裡面有詳細介紹。

4.Elasticsearch建立index

這裡使用的ES版本為7.0.1,建立索引時mappings中不再需要指定type(對映型別),建立程式碼如下:

PUT /suggest_tset
{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0,
    "analysis": {
      "analyzer": {
        "prefix_pinyin_analyzer": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "prefix_pinyin"
          ]
        },
        "full_pinyin_analyzer": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "full_pinyin"
          ]
        },
        "like_pinyin_analyzer": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "like_pinyin"
          ]
        }
      },
      "filter": {
        "_pattern": {
          "type": "pattern_capture",
          "preserve_original": true,
          "patterns": [
            "([0-9])",
            "([a-z])"
          ]
        },
        "prefix_pinyin": {
          "type": "pinyin",
          "keep_first_letter": true,
          "keep_full_pinyin": false,
          "none_chinese_pinyin_tokenize": false,
          "keep_separate_chinese": true,
          "keep_original": false
        },
        "full_pinyin": {
          "type": "pinyin",
          "keep_first_letter": false,
          "keep_full_pinyin": true,
          "keep_original": false,
          "keep_separate_chinese": true,
          "keep_none_chinese_in_first_letter": false
        },
        "like_pinyin": {
          "type": "pinyin",
          "keep_first_letter": true,
          "keep_full_pinyin": true,
          "keep_joined_full_pinyin": false,
          "keep_original": false,
          "keep_separate_chinese": false,
          "keep_none_chinese_in_first_letter": false
        }
      }
    }
  },
  "mappings": {
    "dynamic": false,
    "properties": {
      "kwsuggest": {
        "type": "text",
        "fields": {
          "suggestText": {
            "type": "completion",
            "analyzer": "standard",
            "preserve_separators": false,
            "preserve_position_increments": true,
            "max_input_length": 50
          },
          "prefix_pinyin": {
            "type": "completion",
            "analyzer": "prefix_pinyin_analyzer",
            "search_analyzer": "standard",
            "preserve_separators": false
          },
          "full_pinyin": {
            "type": "completion",
            "analyzer": "full_pinyin_analyzer",
            "search_analyzer": "standard",
            "preserve_separators": false
          },
          "like_pinyin": {
            "type": "completion",
            "analyzer": "like_pinyin_analyzer",
            "preserve_separators": false
          }
        }
      }
    }
  }
}

 這裡插入幾條測試資料

POST /_bulk?refresh=true
{ "index" : { "_index" : "suggest_tset" } }
{ "kwsuggest": "企業規劃"}
{ "index" : { "_index" : "suggest_tset" } }
{ "kwsuggest": "祈願設計 這是啥呢"}
{ "index" : { "_index" : "suggest_tset" } }
{ "kwsuggest": "懸崖的圖片 美景"}
{ "index" : { "_index" : "suggest_tset" } }
{ "kwsuggest": "縣衙地址 那裡呢"}
{ "index" : { "_index" : "suggest_tset" } }
{ "kwsuggest": "懸崖風景圖"}
{ "index" : { "_index" : "suggest_tset" } }
{ "kwsuggest": "起夜的風光 真的美"}
{ "index" : { "_index" : "suggest_tset" } }
{ "kwsuggest": "起夜第二個詞 測試使用"}
{ "index" : { "_index" : "suggest_tset" } }
{ "kwsuggest": "需要一半留下一半打一字謎"}
{ "index" : { "_index" : "suggest_tset" } }
{ "kwsuggest": "許亞為"}
{ "index" : { "_index" : "suggest_tset" } }
{ "kwsuggest": "許雅非測試"}
{ "index" : { "_index" : "suggest_tset" } }
{ "kwsuggest": "徐楊是誰"}

 

下面為測試的查詢語句

GET /suggest_tset/_search
{
  "suggest": {
    "suggestText": {
      "prefix": "qi業",
      "completion": { "field": "kwsuggest.suggestText", "skip_duplicates": true }
    },
    "full_pinyin": {
      "prefix": "qi業",
      "completion": { "field": "kwsuggest.full_pinyin", "skip_duplicates": true }
    },
    "prefix_pinyin": {
      "prefix": "qi業",
      "completion": { "field": "kwsuggest.prefix_pinyin", "skip_duplicates": true }
    },
    "like_pinyin": {
      "prefix": "qi業",
      "completion": {
        "field": "kwsuggest.like_pinyin",
        "skip_duplicates": true,
        "fuzzy": { "fuzziness": 1 }
      }
    }
  }
}

當輸入查詢條件為【qiy】的時候,結果為:

{
  "took" : 17,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 0,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "suggest" : {
    "full_pinyin" : [
      {
        "text" : "qiy",
        "offset" : 0,
        "length" : 3,
        "options" : [
          {
            "text" : "起夜的風光 真的美",
            "_index" : "suggest_tset",
            "_type" : "_doc",
            "_id" : "-jgnlHMBSEyTxFiDO4lU",
            "_score" : 1.0,
            "_source" : {
              "kwsuggest" : "起夜的風光 真的美"
            }
          },
          {
            "text" : "起夜第二個詞 測試使用",
            "_index" : "suggest_tset",
            "_type" : "_doc",
            "_id" : "aDg3lHMBSEyTxFiDXprV",
            "_score" : 1.0,
            "_source" : {
              "kwsuggest" : "起夜第二個詞 測試使用"
            }
          }
        ]
      }
    ],
    "like_pinyin" : [
      {
        "text" : "qiy",
        "offset" : 0,
        "length" : 3,
        "options" : [
          {
            "text" : "企業規劃",
            "_index" : "suggest_tset",
            "_type" : "_doc",
            "_id" : "9TgnlHMBSEyTxFiDO4lU",
            "_score" : 2.0,
            "_source" : {
              "kwsuggest" : "企業規劃"
            }
          },
          {
            "text" : "祈願設計 這是啥呢",
            "_index" : "suggest_tset",
            "_type" : "_doc",
            "_id" : "9jgnlHMBSEyTxFiDO4lU",
            "_score" : 2.0,
            "_source" : {
              "kwsuggest" : "祈願設計 這是啥呢"
            }
          },
          {
            "text" : "起夜的風光 真的美",
            "_index" : "suggest_tset",
            "_type" : "_doc",
            "_id" : "-jgnlHMBSEyTxFiDO4lU",
            "_score" : 2.0,
            "_source" : {
              "kwsuggest" : "起夜的風光 真的美"
            }
          },
          {
            "text" : "起夜第二個詞 測試使用",
            "_index" : "suggest_tset",
            "_type" : "_doc",
            "_id" : "aDg3lHMBSEyTxFiDXprV",
            "_score" : 2.0,
            "_source" : {
              "kwsuggest" : "起夜第二個詞 測試使用"
            }
          }
        ]
      }
    ],
    "prefix_pinyin" : [
      {
        "text" : "qiy",
        "offset" : 0,
        "length" : 3,
        "options" : [ ]
      }
    ],
    "suggestText" : [
      {
        "text" : "qiy",
        "offset" : 0,
        "length" : 3,
        "options" : [ ]
      }
    ]
  }
}  

 輸入【qi業】的查詢結果為

{
  "took" : 2,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 0,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "suggest" : {
    "full_pinyin" : [
      {
        "text" : "qi業",
        "offset" : 0,
        "length" : 3,
        "options" : [
          {
            "text" : "企業規劃",
            "_index" : "suggest_tset",
            "_type" : "_doc",
            "_id" : "9TgnlHMBSEyTxFiDO4lU",
            "_score" : 1.0,
            "_source" : {
              "kwsuggest" : "企業規劃"
            }
          }
        ]
      }
    ],
    "like_pinyin" : [
      {
        "text" : "qi業",
        "offset" : 0,
        "length" : 3,
        "options" : [
          {
            "text" : "企業規劃",
            "_index" : "suggest_tset",
            "_type" : "_doc",
            "_id" : "9TgnlHMBSEyTxFiDO4lU",
            "_score" : 2.0,
            "_source" : {
              "kwsuggest" : "企業規劃"
            }
          },
          {
            "text" : "祈願設計 這是啥呢",
            "_index" : "suggest_tset",
            "_type" : "_doc",
            "_id" : "9jgnlHMBSEyTxFiDO4lU",
            "_score" : 2.0,
            "_source" : {
              "kwsuggest" : "祈願設計 這是啥呢"
            }
          },
          {
            "text" : "起夜的風光 真的美",
            "_index" : "suggest_tset",
            "_type" : "_doc",
            "_id" : "-jgnlHMBSEyTxFiDO4lU",
            "_score" : 2.0,
            "_source" : {
              "kwsuggest" : "起夜的風光 真的美"
            }
          },
          {
            "text" : "起夜第二個詞 測試使用",
            "_index" : "suggest_tset",
            "_type" : "_doc",
            "_id" : "aDg3lHMBSEyTxFiDXprV",
            "_score" : 2.0,
            "_source" : {
              "kwsuggest" : "起夜第二個詞 測試使用"
            }
          }
        ]
      }
    ],
    "prefix_pinyin" : [
      {
        "text" : "qi業",
        "offset" : 0,
        "length" : 3,
        "options" : [ ]
      }
    ],
    "suggestText" : [
      {
        "text" : "qi業",
        "offset" : 0,
        "length" : 3,
        "options" : [ ]
      }
    ]
  }
}

  輸入【qy】的結果為

{
  "took" : 1,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 0,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "suggest" : {
    "full_pinyin" : [
      {
        "text" : "qy",
        "offset" : 0,
        "length" : 2,
        "options" : [ ]
      }
    ],
    "like_pinyin" : [
      {
        "text" : "qy",
        "offset" : 0,
        "length" : 2,
        "options" : [
          {
            "text" : "起夜的風光 真的美",
            "_index" : "suggest_tset",
            "_type" : "_doc",
            "_id" : "-jgnlHMBSEyTxFiDO4lU",
            "_score" : 2.0,
            "_source" : {
              "kwsuggest" : "起夜的風光 真的美"
            }
          },
          {
            "text" : "起夜第二個詞 測試使用",
            "_index" : "suggest_tset",
            "_type" : "_doc",
            "_id" : "aDg3lHMBSEyTxFiDXprV",
            "_score" : 2.0,
            "_source" : {
              "kwsuggest" : "起夜第二個詞 測試使用"
            }
          }
        ]
      }
    ],
    "prefix_pinyin" : [
      {
        "text" : "qy",
        "offset" : 0,
        "length" : 2,
        "options" : [
          {
            "text" : "起夜的風光 真的美",
            "_index" : "suggest_tset",
            "_type" : "_doc",
            "_id" : "-jgnlHMBSEyTxFiDO4lU",
            "_score" : 1.0,
            "_source" : {
              "kwsuggest" : "起夜的風光 真的美"
            }
          },
          {
            "text" : "起夜第二個詞 測試使用",
            "_index" : "suggest_tset",
            "_type" : "_doc",
            "_id" : "aDg3lHMBSEyTxFiDXprV",
            "_score" : 1.0,
            "_source" : {
              "kwsuggest" : "起夜第二個詞 測試使用"
            }
          }
        ]
      }
    ],
    "suggestText" : [
      {
        "text" : "qy",
        "offset" : 0,
        "length" : 2,
        "options" : [ ]
      }
    ]
  }
}

  

5.java版本程式碼

這裡使用elasticsearch-rest-high-level-client

application.yml新增配置

# Elasticsearch settings.
# Each address is quoted so the colon in host:port is always read as part of
# a plain string, never as a YAML mapping indicator or a base-60 number.
elasticsearch:
  ipAddress:
    - "127.0.0.1:9200"

新增配置類

/**
 * Spring configuration that builds the Elasticsearch REST clients from the
 * addresses bound under the {@code elasticsearch} configuration prefix.
 */
@Component
@Configuration
@ConfigurationProperties(prefix = "elasticsearch")
@Data
public class ElasticsearchRestClientConfig {
    private Logger logger = LoggerFactory.getLogger(getClass());

    /** Number of parts expected after splitting an address on ':' (host and port). */
    private static final int ADDRESS_LENGTH = 2;
    private static final String HTTP_SCHEME = "http";

    /**
     * Node addresses; host and port are separated by a colon.
     */
    public String[] ipAddress;

    /**
     * Builds the low-level client builder from every well-formed entry of {@link #ipAddress}.
     *
     * @return a builder configured with all parsable hosts
     * @throws IllegalStateException when no usable address is configured
     */
    @Bean
    public RestClientBuilder restClientBuilder() {
        if (ipAddress == null || ipAddress.length == 0) {
            throw new IllegalStateException("elasticsearch.ipAddress must contain at least one host:port entry");
        }
        HttpHost[] hosts = Arrays.stream(ipAddress)
                .map(this::makeHttpHost)
                .filter(Objects::nonNull)
                .toArray(HttpHost[]::new);
        if (hosts.length == 0) {
            // Fail with an explicit message instead of letting the builder reject an empty array.
            throw new IllegalStateException("elasticsearch.ipAddress contains no valid host:port entry");
        }
        logger.debug("hosts:{}", Arrays.toString(hosts));
        return RestClient.builder(hosts);
    }


    /**
     * Wraps the low-level builder into a high-level REST client.
     *
     * @param restClientBuilder the builder produced by {@link #restClientBuilder()}
     * @return the high-level client bean named {@code highLevelClient}
     */
    @Bean(name = "highLevelClient")
    public RestHighLevelClient highLevelClient(@Autowired RestClientBuilder restClientBuilder) {
        return new RestHighLevelClient(restClientBuilder);
    }


    /**
     * Parses one {@code host:port} string.
     *
     * @param s the configured address
     * @return the parsed host, or {@code null} when the entry is empty or malformed
     */
    private HttpHost makeHttpHost(String s) {
        // Explicit check: a Java assert is skipped unless the JVM runs with -ea.
        if (!StringUtils.isNotEmpty(s)) {
            logger.warn("Skipping empty Elasticsearch address");
            return null;
        }
        String[] address = s.trim().split(":");
        if (address.length != ADDRESS_LENGTH) {
            logger.warn("Skipping malformed Elasticsearch address (expected host:port): {}", s);
            return null;
        }
        try {
            String ip = address[0].trim();
            int port = Integer.parseInt(address[1].trim());
            return new HttpHost(ip, port, HTTP_SCHEME);
        } catch (NumberFormatException e) {
            // A non-numeric port is treated like any other malformed entry.
            logger.warn("Skipping Elasticsearch address with invalid port: {}", s);
            return null;
        }
    }
}

實現的程式碼:

/**
 * Keyword suggestion service backed by the completion sub-fields of
 * {@code kwsuggest} in the {@code suggest_tset} index.
 */
@Service
public class KwSuggestService implements IKwSuggest {
    /** Maximum number of suggestions returned to the caller. */
    private static final int MAX_SUGGESTIONS = 5;
    /** 1-based position of the last non-fuzzy suggester in the priority list. */
    private static final int LAST_EXACT_STEP = 3;
    /** Inputs longer than this are not sent on to the fuzzy suggester's results. */
    private static final int MAX_FUZZY_KEYWORD_LENGTH = 3;

    @Autowired
    RestHighLevelClient highLevelClient;

    /**
     * Returns up to {@value #MAX_SUGGESTIONS} distinct suggestions for {@code kw}.
     *
     * <p>Suggesters are consulted in priority order: Chinese prefix, full pinyin,
     * pinyin first letters, then fuzzy pinyin. Fuzzy results are only used when the
     * first three produced nothing and the input is at most
     * {@value #MAX_FUZZY_KEYWORD_LENGTH} characters long.
     *
     * @param kw the text typed by the user; {@code null} or empty yields an empty list
     * @return the suggestions in priority order, or an empty list when the search fails
     */
    @Override
    public List<String> GetKwSuggestList(String kw){
        List<String> result = new ArrayList<>();
        if (kw == null || kw.isEmpty()) {
            return result;
        }

        SearchRequest searchRequest = new SearchRequest("suggest_tset");
        SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
        SuggestBuilder suggestBuilder = new SuggestBuilder();
        suggestBuilder.addSuggestion("suggestText", SuggestBuilders.completionSuggestion("kwsuggest.suggestText").prefix(kw).skipDuplicates(true).size(MAX_SUGGESTIONS));
        suggestBuilder.addSuggestion("full_pinyin", SuggestBuilders.completionSuggestion("kwsuggest.full_pinyin").prefix(kw).skipDuplicates(true).size(MAX_SUGGESTIONS));
        suggestBuilder.addSuggestion("prefix_pinyin", SuggestBuilders.completionSuggestion("kwsuggest.prefix_pinyin").prefix(kw).skipDuplicates(true).size(MAX_SUGGESTIONS));
        suggestBuilder.addSuggestion("like_pinyin", SuggestBuilders.completionSuggestion("kwsuggest.like_pinyin").prefix(kw, Fuzziness.fromEdits(1)).skipDuplicates(true).size(MAX_SUGGESTIONS));
        sourceBuilder.suggest(suggestBuilder);
        sourceBuilder.timeout(new TimeValue(10, TimeUnit.SECONDS));
        searchRequest.source(sourceBuilder);

        // Priority order; the fuzzy suggester must stay last.
        List<String> suggestionList = Arrays.asList("suggestText", "full_pinyin", "prefix_pinyin", "like_pinyin");
        try {
            SearchResponse response = highLevelClient.search(searchRequest, RequestOptions.DEFAULT);
            Suggest suggestions = response.getSuggest();
            if (suggestions == null) {
                return result;
            }
            int index = 1;
            for (String suggestionType : suggestionList) {
                CompletionSuggestion completionSuggestion = suggestions.getSuggestion(suggestionType);
                if (completionSuggestion != null) {
                    for (CompletionSuggestion.Entry entry : completionSuggestion.getEntries()) {
                        for (CompletionSuggestion.Entry.Option option : entry) {
                            Object value = option.getHit().getSourceAsMap().get("kwsuggest");
                            if (value == null) {
                                continue;
                            }
                            String suggestText = value.toString();
                            // The same document can match several suggesters; keep it once.
                            if (!result.contains(suggestText)) {
                                result.add(suggestText);
                            }
                        }
                    }
                }
                // Enough suggestions collected; lower-priority suggesters are not needed.
                if (result.size() >= MAX_SUGGESTIONS) {
                    break;
                }
                // After the three exact suggesters, skip the fuzzy one when there is
                // already a result or the input is too long for fuzzy matching to be useful.
                if (index == LAST_EXACT_STEP
                        && (!result.isEmpty() || kw.length() > MAX_FUZZY_KEYWORD_LENGTH)) {
                    break;
                }
                index++;
            }
            if (result.size() > MAX_SUGGESTIONS) {
                return new ArrayList<>(result.subList(0, MAX_SUGGESTIONS));
            }
            return result;
        } catch (IOException e) {
            // Best effort: a failed lookup yields no suggestions rather than an error.
            e.printStackTrace();
            return new ArrayList<>();
        }
    }
}

  

6.C#程式碼實現

C#使用的是NEST

 public partial class ElasticFactory
    {
        /// <summary>Maximum number of suggestions returned to the caller.</summary>
        private const int MaxSuggestItems = 5;

        /// <summary>
        /// Queries the four completion sub-fields of <c>kwsuggest</c> and merges the
        /// results in priority order: Chinese prefix, full pinyin, pinyin first
        /// letters, then fuzzy pinyin.
        /// </summary>
        /// <param name="request">The request; <c>request.q</c> is the text typed by the user.</param>
        /// <returns>
        /// A response with <c>code</c> 1 and the suggestions on success, <c>code</c> 0 and
        /// a message on failure, or an untouched response when <c>request.q</c> is empty.
        /// </returns>
        public ExternalServiceResponse<KeywordsSuggestResponseDataEntity> GetKeywordsSuggest(ElasticKeywordsSuggestRequest request)
        {
            var result = new ExternalServiceResponse<KeywordsSuggestResponseDataEntity>();

            try
            {
                if (string.IsNullOrEmpty(request.q)) return result;

                // The array must be created with its element; a zero-length array cannot be assigned into.
                var nodes = new[] { new Uri("http://127.0.0.1:9200") };
                var pool = new StaticConnectionPool(nodes);
                var settings = new ConnectionSettings(pool).DefaultIndex("suggest_tset");
                // NOTE(review): a client is built on every call; consider sharing one instance.
                var client = new ElasticClient(settings);

                // Priority order; the fuzzy suggester must stay last.
                string[] keys = new[] { "suggestText", "full_pinyin", "prefix_pinyin", "like_pinyin" };
                SearchDescriptor<object> search = new SearchDescriptor<object>();
                search
                    .Source(r => r
                        .Includes(f => f
                            // Must include the field read back in AddSuggestItems.
                            .Fields("kwsuggest")
                        )
                    )
                    .Suggest(s => s.Completion(keys[0], c => c.Field("kwsuggest.suggestText").Prefix(request.q).SkipDuplicates(true))
                        .Completion(keys[1], c => c.Field("kwsuggest.full_pinyin").Prefix(request.q).SkipDuplicates(true))
                        .Completion(keys[2], c => c.Field("kwsuggest.prefix_pinyin").Prefix(request.q).SkipDuplicates(true))
                        .Completion(keys[3], c => c.Field("kwsuggest.like_pinyin").Prefix(request.q).SkipDuplicates(true).Fuzzy(m => m.Fuzziness(Fuzziness.EditDistance(1)))))
                    ;
                var esResult = client.Search<dynamic>(s => search);
                // A failed call still returns a response object, so validity must be checked too.
                if (esResult != null && esResult.IsValid)
                {
                    result.code = 1;
                    result.data = new KeywordsSuggestResponseDataEntity();
                    // 1. Take Chinese prefix matches first.
                    // 2. If fewer than 5, add full-pinyin matches.
                    // 3. If still fewer than 5, add pinyin first-letter matches.
                    // 4. Use fuzzy matches only when none of the above matched.
                    if (esResult.Suggest != null)
                    {
                        result.data.items = new List<KeywordsSuggestResponseItemEntity>();
                        int index = 1;
                        foreach (var key in keys)
                        {
                            AddSuggestItems(esResult.Suggest, key, result.data.items);
                            // Enough suggestions collected: trim to the limit and stop.
                            if (result.data.items.Count >= MaxSuggestItems)
                            {
                                result.data.items = result.data.items.Take(MaxSuggestItems).ToList();
                                break;
                            }
                            // After the three exact suggesters, skip the fuzzy one when there is
                            // already a result, or when the input is longer than 3 characters
                            // (fuzzy matching is too imprecise for longer input).
                            if (index == 3 && (result.data.items.Count > 0 || request.q.Length > 3))
                            {
                                break;
                            }
                            index++;
                        }
                        result.data.num = result.data.items.Count;
                    }
                    else
                    {
                        result.data.num = 0;
                    }
                }
                else
                {
                    result.code = 0;
                    result.msg = "查詢失敗";
                }
            }
            catch (Exception ex)
            {
                result.code = 0;
                result.msg = ex.Message;
            }

            return result;
        }

        /// <summary>
        /// Appends the options of the first entry of suggester <paramref name="key"/> to
        /// <paramref name="items"/>, skipping keywords that are already present.
        /// </summary>
        /// <param name="suggest">The suggest section of the search response.</param>
        /// <param name="key">The suggester name used in the request.</param>
        /// <param name="items">The list that receives the new suggestions.</param>
        private void AddSuggestItems(ISuggestDictionary<dynamic> suggest, string key, List<KeywordsSuggestResponseItemEntity> items)
        {
            var entries = suggest[key];
            if (entries == null) return;

            // Guard against a suggester that returned no entry at all.
            var firstEntry = entries.FirstOrDefault();
            if (firstEntry == null || firstEntry.Options == null) return;

            foreach (var hit in firstEntry.Options)
            {
                if (hit.Source == null) continue;

                string kwSource = hit.Source["kwsuggest"];
                if (string.IsNullOrEmpty(kwSource)) continue;

                // Do not add a keyword that is already in the list.
                if (items.Any(m => m.kw == kwSource))
                {
                    continue;
                }
                items.Add(new KeywordsSuggestResponseItemEntity() { kw = kwSource });
            }
        }
    }