1. 程式人生 > Lucene 多種查詢方式

Lucene 多種查詢方式

複製程式碼
package junit;

import java.io.File;
import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang.math.NumberUtils;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.queryParser.MultiFieldQueryParser; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.Filter; import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.NumericRangeFilter; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermRangeQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.search.highlight.Formatter; import org.apache.lucene.search.highlight.Fragmenter; import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.QueryScorer; import org.apache.lucene.search.highlight.SimpleFragmenter; import org.apache.lucene.search.highlight.SimpleHTMLFormatter; import org.apache.lucene.search.regex.RegexQuery; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.junit.Test; import org.wltea.analyzer.lucene.IKAnalyzer; import com.ljq.entity.Person; import com.ljq.utils.Consts; import com.ljq.utils.DateUtils; import com.ljq.utils.LuceneUtil; import com.ljq.utils.XMLPropertyConfig; /** * Lucene搜尋方式大合集<br/><br/> * * Lucene搜尋種類很多。這裡就介紹幾個最常用的。其中TermQuery\BooleanQuery\RegexQuery功能最強大,最為常用。 * * @author 林計欽 * @version 1.0 2013-6-7 上午09:34:08 */ public class IndexQueryTest { /** * 詞條搜尋(單個關鍵字查詢)<br/><br/> * * 主要物件是TermQuery,呼叫方式如下:<br/> * Term term=new Term(欄位名, 搜尋關鍵字);<br/> * Query query=new TermQuery(term);<br/> * Hits hits=searcher.search(query);<br/> * @throws Exception */ @Test public void termQuery() throws Exception { IndexReader reader = IndexReader.open(FSDirectory.open(new File(XMLPropertyConfig.getConfigXML().getString("index_path"))), true); IndexSearcher searcher = new IndexSearcher(reader); //Term term=new Term("ids", "1"); //Term term=new Term("ages", "20"); //Term term=new Term("birthdays", "2008-06-12"); //Term term=new Term("name", "張三"); Term term=new Term("city", "廈門"); Query query=new TermQuery(term); TopDocs topDocs=searcher.search(query, 1000); System.out.println("共檢索出 " + topDocs.totalHits + " 條記錄"); System.out.println(); ScoreDoc[] scoreDocs = topDocs.scoreDocs; for (ScoreDoc scDoc : scoreDocs) { Document document = searcher.doc(scDoc.doc); String id = 
document.get("id"); String name = document.get("name"); String age = document.get("age"); String city = document.get("city"); String birthday = document.get("birthday"); float score = scDoc.score; //相似度 System.out.println(String.format("id:%s, name:%s, age:%s, city:%s, birthday:%s, 相關度:%s.", id, name, age, city, DateUtils.longToString(Long.parseLong(birthday), Consts.FORMAT_SHORT), score)); } searcher.close(); reader.close(); } /** * 組合搜尋(允許多個關鍵字組合搜尋)<br/><br/> * * 主要物件是BooleanQuery,呼叫方式如下:<br/> * Term term1=new Term(欄位名, 搜尋關鍵字);<br/> * TermQuery query1=new TermQuery(term1);<br/><br/> * * Term term2=new Term(欄位名, 搜尋關鍵字);<br/> * TermQuery query2=new TermQuery(term2);<br/><br/> * * BooleanQuery booleanQuery=new BooleanQuery();<br/> * booleanQuery.add(query1, 引數);<br/> * booleanQuery.add(query2, 引數);<br/><br/> * * Hits hits=searcher.search(booleanquery);<br/> * 此方法中的核心在BooleanQuery的add方法上,其第二個引數有三個可選值,對應著邏輯上的與或非關係。<br/><br/> * * 引數如下:<br/> * BooleanClause.Occur.MUST:必須包含,類似於邏輯運算的與<br/> * BooleanClause.Occur.MUST_NOT:必須不包含,類似於邏輯運算的非<br/> * BooleanClause.Occur.SHOULD:可以包含,類似於邏輯運算的或<br/> * 這三者組合,妙用無窮。<br/> * @throws Exception */ @Test public void booleanQuery() throws Exception { IndexReader reader = IndexReader.open(FSDirectory.open(new File(XMLPropertyConfig.getConfigXML().getString("index_path"))), true); IndexSearcher searcher = new IndexSearcher(reader); //組合條件: //年齡(或):10、20、30、40 //名字(與): 四 //城市(非): 莆田 TermQuery ageQuery10=new TermQuery(new Term("ages", "10")); TermQuery ageQuery20=new TermQuery(new Term("ages", "20")); TermQuery ageQuery30=new TermQuery(new Term("ages", "30")); TermQuery ageQuery40=new TermQuery(new Term("ages", "40")); TermQuery nameQuery=new TermQuery(new Term("name", "四")); TermQuery cityQuery=new TermQuery(new Term("city", "莆田")); BooleanQuery booleanQuery=new BooleanQuery(); booleanQuery.add(ageQuery10, BooleanClause.Occur.SHOULD); booleanQuery.add(ageQuery20, BooleanClause.Occur.SHOULD); booleanQuery.add(ageQuery30, 
BooleanClause.Occur.SHOULD); booleanQuery.add(ageQuery40, BooleanClause.Occur.SHOULD); booleanQuery.add(nameQuery, BooleanClause.Occur.MUST); booleanQuery.add(cityQuery, BooleanClause.Occur.MUST_NOT); TopDocs topDocs=searcher.search(booleanQuery, 1000); System.out.println("共檢索出 " + topDocs.totalHits + " 條記錄"); System.out.println(); ScoreDoc[] scoreDocs = topDocs.scoreDocs; for (ScoreDoc scDoc : scoreDocs) { Document document = searcher.doc(scDoc.doc); String id = document.get("id"); String name = document.get("name"); String age = document.get("age"); String city = document.get("city"); String birthday = document.get("birthday"); float score = scDoc.score; //相似度 System.out.println(String.format("id:%s, name:%s, age:%s, city:%s, birthday:%s, 相關度:%s.", id, name, age, city, DateUtils.longToString(Long.parseLong(birthday), Consts.FORMAT_SHORT), score)); } searcher.close(); reader.close(); } /** * 範圍搜尋(允許搜尋指定範圍內的關鍵字結果)<br/><br/> * * 主要物件是TermRangeQuery,呼叫方式如下:<br/> * TermRangeQuery rangequery=new TermRangeQuery(欄位名, 起始值, 終止值, 起始值是否包含邊界, 終止值是否包含邊界); <br/><br/> * * Hits hits=searcher.search(rangequery);<br/> * 此方法中的引數是Boolean型別的,表示是否包含邊界 。<br/> * true 包含邊界<br/> * false不包含邊界<br/> * @throws Exception */ @Test public void rangeQuery() throws Exception { IndexReader reader = IndexReader.open(FSDirectory.open(new File(XMLPropertyConfig.getConfigXML().getString("index_path"))), true); IndexSearcher searcher = new IndexSearcher(reader); TermRangeQuery idQuery=new TermRangeQuery("ids", "1", "3", true, true); TermRangeQuery ageQuery=new TermRangeQuery("ages", "10", "30", true, true); TermRangeQuery timeQuery=new TermRangeQuery("birthdays", "2011-03-09", "2013-01-07", true, true); TopDocs topDocs=searcher.search(timeQuery, 1000); System.out.println("共檢索出 " + topDocs.totalHits + " 條記錄"); System.out.println(); ScoreDoc[] scoreDocs = topDocs.scoreDocs; for (ScoreDoc scDoc : scoreDocs) { Document document = searcher.doc(scDoc.doc); String id = document.get("id"); String name = 
document.get("name"); String age = document.get("age"); String city = document.get("city"); String birthday = document.get("birthday"); float score = scDoc.score; //相似度 System.out.println(String.format("id:%s, name:%s, age:%s, city:%s, birthday:%s, 相關度:%s.", id, name, age, city, DateUtils.longToString(Long.parseLong(birthday), Consts.FORMAT_SHORT), score)); } searcher.close(); reader.close(); } /** * 字首搜尋(搜尋起始位置符合要求的結果)<br/><br/> * * 主要物件是PrefixQuery,呼叫方式如下:<br/> * Term term=new Term(欄位名, 搜尋關鍵字);<br/> * PrefixQuery prefixquery=new PrefixQuery(term);<br/> * Hits hits=searcher.search(prefixquery);<br/> * * @throws Exception */ @Test public void prefixQuery() throws Exception { IndexReader reader = IndexReader.open(FSDirectory.open(new File(XMLPropertyConfig.getConfigXML().getString("index_path"))), true); IndexSearcher searcher = new IndexSearcher(reader); Term term=new Term("name", "王"); PrefixQuery prefixquery=new PrefixQuery(term); TopDocs topDocs=searcher.search(prefixquery, 1000); System.out.println("共檢索出 " + topDocs.totalHits + " 條記錄"); System.out.println(); ScoreDoc[] scoreDocs = topDocs.scoreDocs; for (ScoreDoc scDoc : scoreDocs) { Document document = searcher.doc(scDoc.doc); String id = document.get("id"); String name = document.get("name"); String age = document.get("age"); String city = document.get("city"); String birthday = document.get("birthday"); float score = scDoc.score; //相似度 System.out.println(String.format("id:%s, name:%s, age:%s, city:%s, birthday:%s, 相關度:%s.", id, name, age, city, DateUtils.longToString(Long.parseLong(birthday), Consts.FORMAT_SHORT), score)); } searcher.close(); reader.close(); } /** * 短語搜尋(根據零碎的短語組合成新的片語進行搜尋)<br/><br/> * * 主要物件是PhraseQuery,呼叫方式如下:<br/> * Term term1=new Term(欄位名, 搜尋關鍵字);<br/> * Term term2=new Term(欄位名, 搜尋關鍵字);<br/><br/> * * PhraseQuery phrasequery=new PhraseQuery();<br/> * phrasequery.setSlop(引數);<br/> * phrasequery.add(term1);<br/> * phrasequery.add(term2);<br/> * Hits hits=searcher.search(phrasequery);<br/> * 
其中setSlop的引數是設定兩個關鍵字之間允許間隔的最大值。<br/> * @throws Exception */ @Test public void phraseQuery() throws Exception { IndexReader reader = IndexReader.open(FSDirectory.open(new File(XMLPropertyConfig.getConfigXML().getString("index_path"))), true); IndexSearcher searcher = new IndexSearcher(reader); Term term1=new Term("name", "林"); Term term2=new Term("name", "欽"); PhraseQuery phrasequery=new PhraseQuery(); phrasequery.setSlop(100); phrasequery.add(term1); phrasequery.add(term2); TopDocs topDocs=searcher.search(phrasequery, 1000); System.out.println("共檢索出 " + topDocs.totalHits + " 條記錄"); System.out.println(); ScoreDoc[] scoreDocs = topDocs.scoreDocs; for (ScoreDoc scDoc : scoreDocs) { Document document = searcher.doc(scDoc.doc); String id = document.get("id"); String name = document.get("name"); String age = document.get("age"); String city = document.get("city"); String birthday = document.get("birthday"); float score = scDoc.score; //相似度 System.out.println(String.format("id:%s, name:%s, age:%s, city:%s, birthday:%s, 相關度:%s.", id, name, age, city, DateUtils.longToString(Long.parseLong(birthday), Consts.FORMAT_SHORT), score)); } searcher.close(); reader.close(); } /** * 多短語搜尋(先指定一個字首關鍵字,然後其他的關鍵字加在此關鍵字之後,組成詞語進行搜尋)<br/><br/> * * 主要物件是MultiPhraseQuery,呼叫方式如下:<br/> * * Term term=new Term(欄位名,前置關鍵字);<br/> * Term term1=new Term(欄位名,搜尋關鍵字);<br/> * Term term2=new Term(欄位名,搜尋關鍵字);<br/><br/> * * MultiPhraseQuery multiPhraseQuery=new MultiPhraseQuery();<br/><br/> * * multiPhraseQuery.add(term);<br/> * multiPhraseQuery.add(new Term[]{term1, term2});<br/><br/> * * Hits hits=searcher.search(multiPhraseQuery);<br/> * @throws Exception */ @Test public void multiPhraseQuery() throws Exception { IndexReader reader = IndexReader.open(FSDirectory.open(new File(XMLPropertyConfig.getConfigXML().getString("index_path"))), true); IndexSearcher searcher = new IndexSearcher(reader); //查詢“計張”、“計欽”組合的關鍵詞,先指定一個字首關鍵字,然後其他的關鍵字加在此關鍵字之後,組成詞語進行搜尋 Term term=new Term("name", "計"); //前置關鍵字 Term term1=new 
Term("name", "張"); //搜尋關鍵字 Term term2=new Term("name", "欽"); //搜尋關鍵字 MultiPhraseQuery multiPhraseQuery=new MultiPhraseQuery(); multiPhraseQuery.add(term); multiPhraseQuery.add(new Term[]{term1, term2}); TopDocs topDocs=searcher.search(multiPhraseQuery, 1000); System.out.println("共檢索出 " + topDocs.totalHits + " 條記錄"); System.out.println(); ScoreDoc[] scoreDocs = topDocs.scoreDocs; for (ScoreDoc scDoc : scoreDocs) { Document document = searcher.doc(scDoc.doc); String id = document.get("id"); String name = document.get("name"); String age = document.get("age"); String city = document.get("city"); String birthday = document.get("birthday"); float score = scDoc.score; //相似度 System.out.println(String.format("id:%s, name:%s, age:%s, city:%s, birthday:%s, 相關度:%s.", id, name, age, city, DateUtils.longToString(Long.parseLong(birthday), Consts.FORMAT_SHORT), score)); } searcher.close(); reader.close(); } /** * 模糊搜尋(顧名思義)<br/><br/> * * 主要物件是FuzzyQuery,呼叫方式如下:<br/><br/> * * Term term=new Term(欄位名, 搜尋關鍵字);<br/> * FuzzyQuery fuzzyquery=new FuzzyQuery(term,引數);<br/> * Hits hits=searcher.search(fuzzyquery);<br/> * 此中的引數是表示模糊度,是小於1的浮點小數,比如0.5f * @throws Exception */ @Test public void fuzzyQuery() throws Exception { IndexReader reader = IndexReader.open(FSDirectory.open(new File(XMLPropertyConfig.getConfigXML().getString("index_path"))), true); IndexSearcher searcher = new IndexSearcher(reader); Term term=new Term("name", "三張"); FuzzyQuery fuzzyquery=new FuzzyQuery(term, 0.5f); TopDocs topDocs=searcher.search(fuzzyquery, 1000); System.out.println("共檢索出 " + topDocs.totalHits + " 條記錄"); System.out.println(); ScoreDoc[] scoreDocs = topDocs.scoreDocs; for (ScoreDoc scDoc : scoreDocs) { Document document = searcher.doc(scDoc.doc); String id = document.get("id"); String name = document.get("name"); String age = document.get("age"); String city = document.get("city"); String birthday = document.get("birthday"); float score = scDoc.score; //相似度 System.out.println(String.format("id:%s, 
name:%s, age:%s, city:%s, birthday:%s, 相關度:%s.", id, name, age, city, DateUtils.longToString(Long.parseLong(birthday), Consts.FORMAT_SHORT), score)); } searcher.close(); reader.close(); } /** * 萬用字元搜尋(顧名思義)<br/><br/> * * 主要物件是:WildcardQuery,呼叫方式如下:<br/><br/> * * Term term=new Term(欄位名,搜尋關鍵字+萬用字元);<br/> * WildcardQuery wildcardquery=new WildcardQuery(term);<br/> * Hits hits=searcher.search(wildcardquery);<br/><br/> * * 其中的萬用字元分兩種,即*和?<br/> * * 表示任意多的自負<br/> * ?表示任意一個字元 * @throws Exception */ @Test public void wildcardQuery() throws Exception { IndexReader reader = IndexReader.open(FSDirectory.open(new File(XMLPropertyConfig.getConfigXML().getString("index_path"))), true); IndexSearcher searcher = new IndexSearcher(reader); Term term=new Term("name", "三?"); WildcardQuery wildcardQuery=new WildcardQuery(term); TopDocs topDocs=searcher.search(wildcardQuery, 1000); System.out.println("共檢索出 " + topDocs.totalHits + " 條記錄"); System.out.println(); ScoreDoc[] scoreDocs = topDocs.scoreDocs; for (ScoreDoc scDoc : scoreDocs) { Document document = searcher.doc(scDoc.doc); String id = document.get("id"); String name = document.get("name"); String age = document.get("age"); String city = document.get("city"); String birthday = document.get("birthday"); float score = scDoc.score; //相似度 System.out.println(String.format("id:%s, name:%s, age:%s, city:%s, birthday:%s, 相關度:%s.", id, name, age, city, DateUtils.longToString(Long.parseLong(birthday), Consts.FORMAT_SHORT), score)); } searcher.close(); reader.close(); } /** * 正則表示式搜尋(顧名思義,這個類引入lucene-queries-3.5.0.jar包)<br/><br/> * * 主要物件是:RegexQuery,呼叫方式如下 <br/> * String regex = ".*"; <br/> * Term term = new Term (search_field_name, regex); <br/> * RegexQuery query = new RegexQuery (term); <br/> * TopDocs hits = searcher.search (query, 100); <br/> * @throws Exception */ @Test public void regexQuery() throws Exception { IndexReader reader = IndexReader.open(FSDirectory.open(new File(XMLPropertyConfig.getConfigXML().getString("index_path"))), 
true); IndexSearcher searcher = new IndexSearcher(reader); String regex = "林*"; Term term=new Term("name", regex); RegexQuery query = new RegexQuery(term); TopDocs topDocs=searcher.search(query, 1000); System.out.println("共檢索出 " + topDocs.totalHits + " 條記錄"); System.out.println(); ScoreDoc[] scoreDocs = topDocs.scoreDocs; for (ScoreDoc scDoc : scoreDocs) { Document document = searcher.doc(scDoc.doc); String id = document.get("id"); String name = document.get("name"); String age = document.get("age"); String city = document.get("city"); String birthday = document.get("birthday"); float score = scDoc.score; //相似度 System.out.println(String.format("id:%s, name:%s, age:%s, city:%s, birthday:%s, 相關度:%s.", id, name, age, city, DateUtils.longToString(Long.parseLong(birthday), Consts.FORMAT_SHORT), score)); } searcher.close(); reader.close(); } /** * 數值範圍過濾器,如:int、long、float型別等 * * @throws Exception */ @Test public void numericFilter() throws Exception{ //CustomScoreQuery //Filter filter = NumericRangeFilter.newLongRange("id", 1l, 3l, true, true); Filter filter = NumericRangeFilter.newIntRange("age", 1, 39, true, true); List<Person> persons=search(filter, new String[]{"name","city"}, "廈門"); for(Person person : persons){ System.out.println(String.format("id:%s, name:%s, age:%s, city:%s, birthday:%s.", person.getId(), person.getName(), person.getAge(), person.getCity(), DateUtils.dateToString(person.getBirthday(), Consts.FORMAT_SHORT))); } } /** * 時間範圍過濾器 * @throws Exception */ @Test public void dateFilter() throws Exception{ //2008-06-12 long min=DateUtils.stringToDate("2008-06-12", Consts.FORMAT_SHORT).getTime(); //2013-01-07 long max=DateUtils.stringToDate("2013-01-07", Consts.FORMAT_SHORT).getTime(); Filter filter = NumericRangeFilter.newLongRange("birthday", min, max, true, true); List<Person> persons=search(filter, new String[]{"name","city"}, "廈門"); for(Person person : persons){ System.out.println(String.format("id:%s, name:%s, age:%s, city:%s, birthday:%s.", 
person.getId(), person.getName(), person.getAge(), person.getCity(), DateUtils.dateToString(person.getBirthday(), Consts.FORMAT_SHORT))); } } /** * 建立索引 * * @throws Exception */ @Test public void createIndex() throws Exception { List<Document> docs = new ArrayList<Document>(); for (Person person : getPersons()) { Document doc = new Document(); //宣告為NumericField的欄位,只能用NumericRangeFilter物件範圍查詢,不能用作關鍵字查詢。 //NumericField不推薦,統一用Field doc.add(new NumericField("id", Field.Store.YES, true).setLongValue(person.getId())); doc.add(new NumericField("age", Field.Store.YES, true).setIntValue(person.getAge())); doc.add(new NumericField("birthday", Field.Store.YES, true).setLongValue(person.getBirthday().getTime())); doc.add(new Field("ids", person.getId()+"", Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("ages", person.getAge()+"", Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("birthdays", DateUtils.dateToString(person.getBirthday(), Consts.FORMAT_SHORT), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("name", person.getName(), Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("city", person.getCity(), Field.Store.YES, Field.Index.ANALYZED)); docs.add(doc); } LuceneUtil.createIndex(docs); } private List<Person> search(Filter filter, String[] fields, String keyword) { List<Person> result = new ArrayList<Person>(); IndexSearcher indexSearcher = null; TopDocs topDocs = null; try { // 建立索引搜尋器,且只讀 IndexReader indexReader = IndexReader.open(FSDirectory.open(new File(XMLPropertyConfig.getConfigXML().getString("index_path"))), true); indexSearcher = new IndexSearcher(indexReader); MultiFieldQueryParser queryParser = new MultiFieldQueryParser(Version.LUCENE_35, fields, new IKAnalyzer()); Query query = queryParser.parse(keyword); // 返回前number條記錄 if(filter == null){ topDocs=indexSearcher.search(query, 100000); }else { topDocs=indexSearcher.search(query, filter, 100000); } // 資訊展示 int totalCount = topDocs.totalHits; 
System.out.println("共檢索出 " + totalCount + " 條記錄"); //高亮顯示 Formatter formatter = new SimpleHTMLFormatter("<font color='red'>", "</font>"); QueryScorer fragmentScorer = new QueryScorer(query); Highlighter highlighter = new Highlighter(formatter, fragmentScorer); Fragmenter fragmenter = new SimpleFragmenter(100); highlighter.setTextFragmenter(fragmenter); ScoreDoc[] scoreDocs = topDocs.scoreDocs; for (ScoreDoc scDoc : scoreDocs) { Document document = indexSearcher.doc(scDoc.doc); String id = document.get("id"); String name = document.get("name"); String age = document.get("age"); String city = document.get("city"); String birthday = document.get("birthday"); float score = scDoc.score; //相似度 System.out.println("相似度:"+score); String lighterName = highlighter.getBestFragment(new IKAnalyzer(), "name", name); if (null == lighterName) { lighterName = name; } String lighterAge = highlighter.getBestFragment(new IKAnalyzer(), "age", age); if (null == lighterAge) { lighterAge = age; } String lighterCity= highlighter.getBestFragment(new IKAnalyzer(), "city", city); if (null == lighterCity) { lighterCity = c