1. 程式人生 > 其它 >【Elasticsearch】之基本使用(二)

【Elasticsearch】之基本使用(二)

技術標籤:Elasticsearch 應用筆記、elasticsearch、elk

全文搜尋兩個最重要的方面是:(1)相關性(Relevance):它是評價查詢與其結果間的相關程度,並根據這種相關程度對結果排名的一種能力,這種計算方式可以是 TF/IDF 方法、地理位置鄰近、模糊相似,或其他的某些演算法。(2)分詞(Analysis):它是將文字塊轉換為有區別的、規範化的 token 的一個過程,目的是為了建立倒排索引以及查詢倒排索引。

構造資料

PUT http://172.21.188.234:9200/sfd

引數

{
    "settings": {
        "index": {
            "number_of_shards": "1",
            "number_of_replicas": "0"
        }
    },
    "mappings": {
        "person": {
            "properties": {
                "name": {
                    "type": "text"
                },
                "age": {
                    "type": "integer"
                },
                "mail": {
                    "type": "keyword"
                },
                "hobby": {
                    "type": "text",
                    "analyzer": "ik_max_word"
                }
            }
        }
    }
}

響應

{
    "acknowledged": true,
    "shards_acknowledged": true,
    "index": "sfd"
}

批量插入資料

POST http://172.21.188.234:9200/sfd/_bulk

引數

{"index":{"_index":"sfd","_type":"person"}}
{"name":"張三","age": 20,"mail": "[email protected]","hobby":"羽毛球、乒乓球、足球"}
{"index":{"_index":"sfd","_type":"person"}}
{"name":"李四","age": 21,"mail": "[email protected]","hobby":"羽毛球、乒乓球、足球、籃球"}
{"index":{"_index":"sfd","_type":"person"}}
{"name":"王五","age": 22,"mail": "[email protected]","hobby":"羽毛球、籃球、游泳、聽音樂"}
{"index":{"_index":"sfd","_type":"person"}}
{"name":"趙六","age": 23,"mail": "[email protected]","hobby":"跑步、游泳、籃球"}
{"index":{"_index":"sfd","_type":"person"}}
{"name":"孫七","age": 24,"mail": "[email protected]","hobby":"聽音樂、看電影、羽毛球"}

響應

{
    "took": 41,
    "errors": false,
    "items": [
        {
            "index": {
                "_index": "sfd",
                "_type": "person",
                "_id": "ilnyRXYB3olRGZUDu6ug",
                "_version": 1,
                "result": "created",
                "_shards": {
                    "total": 1,
                    "successful": 1,
                    "failed": 0
                },
                "_seq_no": 0,
                "_primary_term": 1,
                "status": 201
            }
        },
        {
            "index": {
                "_index": "sfd",
                "_type": "person",
                "_id": "i1nyRXYB3olRGZUDu6ug",
                "_version": 1,
                "result": "created",
                "_shards": {
                    "total": 1,
                    "successful": 1,
                    "failed": 0
                },
                "_seq_no": 1,
                "_primary_term": 1,
                "status": 201
            }
        },
        {
            "index": {
                "_index": "sfd",
                "_type": "person",
                "_id": "jFnyRXYB3olRGZUDu6ug",
                "_version": 1,
                "result": "created",
                "_shards": {
                    "total": 1,
                    "successful": 1,
                    "failed": 0
                },
                "_seq_no": 2,
                "_primary_term": 1,
                "status": 201
            }
        },
        {
            "index": {
                "_index": "sfd",
                "_type": "person",
                "_id": "jVnyRXYB3olRGZUDu6ug",
                "_version": 1,
                "result": "created",
                "_shards": {
                    "total": 1,
                    "successful": 1,
                    "failed": 0
                },
                "_seq_no": 3,
                "_primary_term": 1,
                "status": 201
            }
        },
        {
            "index": {
                "_index": "sfd",
                "_type": "person",
                "_id": "jlnyRXYB3olRGZUDu6ug",
                "_version": 1,
                "result": "created",
                "_shards": {
                    "total": 1,
                    "successful": 1,
                    "failed": 0
                },
                "_seq_no": 4,
                "_primary_term": 1,
                "status": 201
            }
        }
    ]
}

單詞搜尋

POST http://172.21.188.234:9200/sfd/person/_search

引數

{ 
    "query":{ 
        "match":{ 
            "hobby":"音樂" 
        } 
    },
    "highlight": { 
        "fields": { 
            "hobby": {} 
        } 
    } 
}

響應

{
    "took": 66,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": 2,
        "max_score": 0.81652206,
        "hits": [
            {
                "_index": "sfd",
                "_type": "person",
                "_id": "jFnyRXYB3olRGZUDu6ug",
                "_score": 0.81652206,
                "_source": {
                    "name": "王五",
                    "age": 22,
                    "mail": "[email protected]",
                    "hobby": "羽毛球、籃球、游泳、聽音樂"
                },
                "highlight": {
                    "hobby": [
                        "羽毛球、籃球、游泳、聽<em>音樂</em>"
                    ]
                }
            },
            {
                "_index": "sfd",
                "_type": "person",
                "_id": "jlnyRXYB3olRGZUDu6ug",
                "_score": 0.81652206,
                "_source": {
                    "name": "孫七",
                    "age": 24,
                    "mail": "[email protected]",
                    "hobby": "聽音樂、看電影、羽毛球"
                },
                "highlight": {
                    "hobby": [
                        "聽<em>音樂</em>、看電影、羽毛球"
                    ]
                }
            }
        ]
    }
}

過程說明:

  1. 檢查欄位型別:愛好 hobby 欄位是一個 text 型別(指定了 IK 分詞器),這意味著查詢字串本身也應該被分詞。
  2. 分析查詢字串:將查詢的字串 “音樂” 傳入 IK 分詞器中,輸出的結果是單個項 “音樂”。因為只有一個單詞項,所以 match 查詢執行的是單個底層 term 查詢。
  3. 查詢匹配文件:用 term 查詢在倒排索引中查詢 “音樂”,然後獲取一組包含該項的文件,本例的結果是文件 3、5。
  4. 為每個文件評分:用 term 查詢計算每個文件的相關度評分 _score,這是一種將詞頻(term frequency,即詞 “音樂” 在相關文件的 hobby 欄位中出現的頻率)、反向文件頻率(inverse document frequency,即詞 “音樂” 在所有文件的 hobby 欄位中出現的頻率)以及欄位的長度(即欄位越短相關度越高)相結合的計算方式。

多詞搜尋

查詢同時包含籃球和音樂

引數

{ 
    "query":{ 
        "match":{ 
            "hobby":"音樂 籃球"
        } 
    },
    "highlight": { 
        "fields": { 
            "hobby": {} 
        } 
    } 
}

響應

{
    "took": 7,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": 4,
        "max_score": 1.3192271,
        "hits": [
            {
                "_index": "sfd",
                "_type": "person",
                "_id": "jFnyRXYB3olRGZUDu6ug",
                "_score": 1.3192271,
                "_source": {
                    "name": "王五",
                    "age": 22,
                    "mail": "[email protected]",
                    "hobby": "羽毛球、籃球、游泳、聽音樂"
                },
                "highlight": {
                    "hobby": [
                        "羽毛球、<em>籃球</em>、游泳、聽<em>音樂</em>"
                    ]
                }
            },
            {
                "_index": "sfd",
                "_type": "person",
                "_id": "jlnyRXYB3olRGZUDu6ug",
                "_score": 0.81652206,
                "_source": {
                    "name": "孫七",
                    "age": 24,
                    "mail": "[email protected]",
                    "hobby": "聽音樂、看電影、羽毛球"
                },
                "highlight": {
                    "hobby": [
                        "聽<em>音樂</em>、看電影、羽毛球"
                    ]
                }
            },
            {
                "_index": "sfd",
                "_type": "person",
                "_id": "jVnyRXYB3olRGZUDu6ug",
                "_score": 0.6987338,
                "_source": {
                    "name": "趙六",
                    "age": 23,
                    "mail": "[email protected]",
                    "hobby": "跑步、游泳、籃球"
                },
                "highlight": {
                    "hobby": [
                        "跑步、游泳、<em>籃球</em>"
                    ]
                }
            },
            {
                "_index": "sfd",
                "_type": "person",
                "_id": "i1nyRXYB3olRGZUDu6ug",
                "_score": 0.50270504,
                "_source": {
                    "name": "李四",
                    "age": 21,
                    "mail": "[email protected]",
                    "hobby": "羽毛球、乒乓球、足球、籃球"
                },
                "highlight": {
                    "hobby": [
                        "羽毛球、乒乓球、足球、<em>籃球</em>"
                    ]
                }
            }
        ]
    }
}

以上顯然是或關係,下面我們改為與關係

引數

{
    "query":{
        "match":{
            "hobby":{
                "query":"音樂 籃球",
                "operator":"and"
            }
        }
    },
    "highlight": {
        "fields": {
            "hobby": {}
        }
    }
}

響應

{
    "took": 11,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": 1,
        "max_score": 1.3192271,
        "hits": [
            {
                "_index": "sfd",
                "_type": "person",
                "_id": "jFnyRXYB3olRGZUDu6ug",
                "_score": 1.3192271,
                "_source": {
                    "name": "王五",
                    "age": 22,
                    "mail": "[email protected]",
                    "hobby": "羽毛球、籃球、游泳、聽音樂"
                },
                "highlight": {
                    "hobby": [
                        "羽毛球、<em>籃球</em>、游泳、聽<em>音樂</em>"
                    ]
                }
            }
        ]
    }
}

通過minimum_should_match來指定匹配度

引數

{ 
    "query":{ 
        "match":{ 
            "hobby":{ 
                "query":"游泳 羽毛球", 
                "minimum_should_match":"80%" 
            } 
        } 
    },
    "highlight": { 
        "fields": { 
            "hobby": {} 
        } 
    } 
}

響應

{
    "took": 8,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": 4,
        "max_score": 1.621458,
        "hits": [
            {
                "_index": "sfd",
                "_type": "person",
                "_id": "jFnyRXYB3olRGZUDu6ug",
                "_score": 1.621458,
                "_source": {
                    "name": "王五",
                    "age": 22,
                    "mail": "[email protected]",
                    "hobby": "羽毛球、籃球、游泳、聽音樂"
                },
                "highlight": {
                    "hobby": [
                        "<em>羽毛</em><em>球</em>、籃球、<em>游泳</em>、聽音樂"
                    ]
                }
            },
            {
                "_index": "sfd",
                "_type": "person",
                "_id": "ilnyRXYB3olRGZUDu6ug",
                "_score": 0.9608413,
                "_source": {
                    "name": "張三",
                    "age": 20,
                    "mail": "[email protected]",
                    "hobby": "羽毛球、乒乓球、足球"
                },
                "highlight": {
                    "hobby": [
                        "<em>羽毛</em><em>球</em>、乒乓<em>球</em>、足球"
                    ]
                }
            },
            {
                "_index": "sfd",
                "_type": "person",
                "_id": "i1nyRXYB3olRGZUDu6ug",
                "_score": 0.91348255,
                "_source": {
                    "name": "李四",
                    "age": 21,
                    "mail": "[email protected]",
                    "hobby": "羽毛球、乒乓球、足球、籃球"
                },
                "highlight": {
                    "hobby": [
                        "<em>羽毛</em><em>球</em>、乒乓<em>球</em>、足球、籃球"
                    ]
                }
            },
            {
                "_index": "sfd",
                "_type": "person",
                "_id": "jlnyRXYB3olRGZUDu6ug",
                "_score": 0.80493605,
                "_source": {
                    "name": "孫七",
                    "age": 24,
                    "mail": "[email protected]",
                    "hobby": "聽音樂、看電影、羽毛球"
                },
                "highlight": {
                    "hobby": [
                        "聽音樂、看電影、<em>羽毛</em><em>球</em>"
                    ]
                }
            }
        ]
    }
}

相似度應該多少合適,需要在實際的需求中進行反覆測試,才可得到合理的值

組合搜尋

引數

{
    "query":{
        "bool":{
            "must":{
                "match":{
                    "hobby":"籃球"
                }
            },
            "must_not":{
                "match":{
                    "hobby":"音樂"
                }
            },
            "should":[
                {
                    "match":{
                        "hobby":"游泳"
                    }
                }
            ]
        }
    },
    "highlight":{
        "fields":{
            "hobby":{

            }
        }
    }
}

搜尋結果中必須包含籃球,不能包含音樂,如果包含了游泳,那麼它的相似度更高

響應

{
    "took": 9,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": 2,
        "max_score": 1.8336569,
        "hits": [
            {
                "_index": "sfd",
                "_type": "person",
                "_id": "jVnyRXYB3olRGZUDu6ug",
                "_score": 1.8336569,
                "_source": {
                    "name": "趙六",
                    "age": 23,
                    "mail": "[email protected]",
                    "hobby": "跑步、游泳、籃球"
                },
                "highlight": {
                    "hobby": [
                        "跑步、<em>游泳</em>、<em>籃球</em>"
                    ]
                }
            },
            {
                "_index": "sfd",
                "_type": "person",
                "_id": "i1nyRXYB3olRGZUDu6ug",
                "_score": 0.50270504,
                "_source": {
                    "name": "李四",
                    "age": 21,
                    "mail": "[email protected]",
                    "hobby": "羽毛球、乒乓球、足球、籃球"
                },
                "highlight": {
                    "hobby": [
                        "羽毛球、乒乓球、足球、<em>籃球</em>"
                    ]
                }
            }
        ]
    }
}

評分的計算規則:bool 查詢會為每個文件計算相關度評分 _score,即將所有匹配的 must 和 should 語句的分數 _score 求和。(在舊版本基於 TF/IDF 的評分中,求和後還會除以 must 和 should 語句的總數;本文的響應並沒有這一步,例如上面李四只匹配了 must 語句,得分 0.50270504 與前面多詞搜尋中只匹配“籃球”時的得分相同。)must_not 語句不會影響評分;它的作用只是將不相關的文件排除。

預設情況下,should 中的內容不是必須匹配的;但如果查詢語句中沒有 must,那麼就會至少匹配其中一個。當然了,也可以通過 minimum_should_match 引數進行控制,該值可以是數字,也可以是百分比。

引數

{
    "query":{
        "bool":{
            "should":[
                {
                    "match":{
                        "hobby":"游泳"
                    }
                },
                {
                    "match":{
                        "hobby":"籃球"
                    }
                },
                {
                    "match":{
                        "hobby":"音樂"
                    }
                }
            ],
            "minimum_should_match":2
        }
    },
    "highlight":{
        "fields":{
            "hobby":{

            }
        }
    }
}

響應

{
    "took": 5,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": 2,
        "max_score": 2.135749,
        "hits": [
            {
                "_index": "sfd",
                "_type": "person",
                "_id": "jFnyRXYB3olRGZUDu6ug",
                "_score": 2.135749,
                "_source": {
                    "name": "王五",
                    "age": 22,
                    "mail": "[email protected]",
                    "hobby": "羽毛球、籃球、游泳、聽音樂"
                },
                "highlight": {
                    "hobby": [
                        "羽毛球、<em>籃球</em>、<em>游泳</em>、聽<em>音樂</em>"
                    ]
                }
            },
            {
                "_index": "sfd",
                "_type": "person",
                "_id": "jVnyRXYB3olRGZUDu6ug",
                "_score": 1.8336569,
                "_source": {
                    "name": "趙六",
                    "age": 23,
                    "mail": "[email protected]",
                    "hobby": "跑步、游泳、籃球"
                },
                "highlight": {
                    "hobby": [
                        "跑步、<em>游泳</em>、<em>籃球</em>"
                    ]
                }
            }
        ]
    }
}

minimum_should_match為2,意思是should中的三個詞,至少要滿足2個。

權重

有些時候,我們可能需要對某些詞增加權重來影響該條資料的得分。如下:搜尋關鍵字為“游泳籃球”,如果結果中包含了“音樂”權重為10,包含了“跑步”權重為2。

引數

{
    "query":{
        "bool":{
            "must":{
                "match":{
                    "hobby":{
                        "query":"游泳籃球",
                        "operator":"and"
                    }
                }
            },
            "should":[
                {
                    "match":{
                        "hobby":{
                            "query":"音樂",
                            "boost":10
                        }
                    }
                },
                {
                    "match":{
                        "hobby":{
                            "query":"跑步",
                            "boost":2
                        }
                    }
                }
            ]
        }
    },
    "highlight":{
        "fields":{
            "hobby":{

            }
        }
    }
}

響應

{
    "took": 5,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": 2,
        "max_score": 9.484448,
        "hits": [
            {
                "_index": "sfd",
                "_type": "person",
                "_id": "jFnyRXYB3olRGZUDu6ug",
                "_score": 9.484448,
                "_source": {
                    "name": "王五",
                    "age": 22,
                    "mail": "[email protected]",
                    "hobby": "羽毛球、籃球、游泳、聽音樂"
                },
                "highlight": {
                    "hobby": [
                        "羽毛球、<em>籃球</em>、<em>游泳</em>、聽<em>音樂</em>"
                    ]
                }
            },
            {
                "_index": "sfd",
                "_type": "person",
                "_id": "jVnyRXYB3olRGZUDu6ug",
                "_score": 5.4279313,
                "_source": {
                    "name": "趙六",
                    "age": 23,
                    "mail": "[email protected]",
                    "hobby": "跑步、游泳、籃球"
                },
                "highlight": {
                    "hobby": [
                        "<em>跑步</em>、<em>游泳</em>、<em>籃球</em>"
                    ]
                }
            }
        ]
    }
}