程式人生 > lucene4.8.0 + IKAnalyzer5.0.1 建立索引與查詢 demo

lucene4.8.0 + IKAnalyzer5.0.1 建立索引與查詢demo

主要程式碼:

建立索引:

/**
 * Rebuilds the demo index stored on the file system at {@code C:\myindex}.
 *
 * <p>Opens (or creates) the index, deletes every existing document, adds a
 * single sample document with the fields {@code id}, {@code title} and
 * {@code content}, and commits. Any failure is printed to standard error;
 * nothing is thrown to the caller.
 *
 * <p>The directory and analyzer are always released. The writer is closed
 * after a successful commit and rolled back otherwise, so a failure does not
 * leave the index write lock held or commit a half-finished rebuild.
 */
public void createIndex(){

	// Indexes can live on the file system or in memory; this demo uses the file system.
	try (Directory directory = new SimpleFSDirectory(new File("C:\\myindex"));
			// Instantiate the IKAnalyzer tokenizer.
			Analyzer analyzer = new IKAnalyzer(false)) {

		// Configure the IndexWriter.
		IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_48 , analyzer);
		indexWriterConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
		IndexWriter indexWriter = new IndexWriter(directory , indexWriterConfig);

		boolean committed = false;
		try {
			// Remove every document currently in the index.
			indexWriter.deleteAll();

			// Build the document to index.
			Document doc = new Document();
			doc.add(new StringField("id", "1", Store.YES));
			doc.add(new TextField("title", "IKAnalyzer的介紹", Store.YES));
			doc.add(new TextField("content", "IK Analyzer是一個結合詞典分詞和文法分詞的中文分詞開源工具包。它使用了全新的正向迭代最細粒度切分演算法。", Store.YES));

			// Add the new record to the IndexWriter.
			indexWriter.addDocument(doc);
			// Commit the changes.
			indexWriter.commit();
			committed = true;
		} finally {
			if (committed) {
				indexWriter.close();
			} else {
				// Discard uncommitted changes and release the writer (and its lock).
				indexWriter.rollback();
			}
		}
	} catch (Exception e) {
		e.printStackTrace();
	}
}

查詢+高亮:
/**
 * Searches the index at {@code C:\myindex} for the term {@code 演算法} in the
 * {@code content} field and prints each hit with the matched term highlighted.
 *
 * <p>At most 10 hits are retrieved. For each hit the internal document number
 * and the highlighted {@code content} value are written to standard output.
 * Any failure is printed to standard error; nothing is thrown to the caller.
 *
 * <p>The directory, index reader and analyzer are opened in a
 * try-with-resources statement so they are released even when the search or
 * highlighting fails.
 */
public void search(){
	// Indexes can live on the file system or in memory; this demo uses the file system.
	try (Directory directory = new SimpleFSDirectory(new File("C:\\myindex"));
			IndexReader reader = DirectoryReader.open(directory);
			// Instantiate the IKAnalyzer tokenizer (used to re-tokenize stored text for highlighting).
			Analyzer analyzer = new IKAnalyzer(false)) {

		IndexSearcher searcher = new IndexSearcher(reader);

		Query query = new TermQuery(new Term("content","演算法"));

		// Matched terms are wrapped in these tags.
		String preTag = "<font color='red'>";
		String postTag = "</font>";
		Formatter formatter = new SimpleHTMLFormatter(preTag, postTag);

		Scorer fragmentScorer = new QueryScorer(query);
		Highlighter highlighter = new Highlighter(formatter, fragmentScorer);
		// The fragment size normally matches the length of highlighted text you want
		// returned; Integer.MAX_VALUE is used here so the text is not cut into short fragments.
		highlighter.setTextFragmenter(new SimpleFragmenter(Integer.MAX_VALUE));

		TopDocs topDocs = searcher.search(query, 10);
		System.out.println("一共查到:" + topDocs.totalHits + "條記錄");

		ScoreDoc[] scoreDoc = topDocs.scoreDocs;
		for (int i = 0; i < scoreDoc.length; i++) {
			// Lucene-internal document number.
			int docId = scoreDoc[i].doc;
			System.out.println("內部編號:" + docId);
			// Load the stored document for that number.
			Document doc = searcher.doc(docId);

			// Only "content" is highlighted here; other stored fields (e.g. "title")
			// could be passed to getBestFragment the same way.
			String content = highlighter.getBestFragment(analyzer, "content", doc.get("content"));

			System.out.println("content:" + content);
		}
	} catch (Exception e) {
		e.printStackTrace();
	}
}

查詢結果:
IK Analyzer是一個結合詞典分詞和文法分詞的中文分詞開源工具包。它使用了全新的正向迭代最細粒度切分<font color='red'>演算法</font>。

索引可以用luke來檢視:

開啟cmd,進入luke所在目錄,輸入命令 java -jar lukeall-4.10.2.jar即可執行。



pom.xml中:

<!-- Lucene modules used by the demo. Every entry takes its version from the
     ${lucene} Maven property, which is not defined in this fragment and must be
     declared in the POM's <properties> section (presumably 4.8.0, per the
     article title; confirm). -->
<!-- NOTE(review): no IKAnalyzer artifact is declared in this fragment; it has
     to be supplied separately. -->
<dependency>
	<groupId>org.apache.lucene</groupId>
	<artifactId>lucene-core</artifactId>
	<version>${lucene}</version>
</dependency>
<dependency>
	<groupId>org.apache.lucene</groupId>
	<artifactId>lucene-highlighter</artifactId>
	<version>${lucene}</version>
</dependency>
<dependency>
	<groupId>org.apache.lucene</groupId>
	<artifactId>lucene-memory</artifactId>
	<version>${lucene}</version>
</dependency>
<dependency>
	<groupId>org.apache.lucene</groupId>
	<artifactId>lucene-queries</artifactId>
	<version>${lucene}</version>
</dependency>
<dependency>
	<groupId>org.apache.lucene</groupId>
	<artifactId>lucene-queryparser</artifactId>
	<version>${lucene}</version>
</dependency>
<dependency>
	<groupId>org.apache.lucene</groupId>
	<artifactId>lucene-analyzers-common</artifactId>
	<version>${lucene}</version>
</dependency>

IKAnalyzer.cfg.xml(在src/main/resources下):

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">  
<properties>  
	<comment>IK Analyzer 擴充套件配置</comment>
	<!-- Users can configure their own extension dictionary here; disabled example:
	<entry key="ext_dict">/mydict.dic;</entry> 
	-->	
	<!-- Active extension dictionary. NOTE(review): presumably resolved relative to
	     the classpath root, since the article places this file in
	     src/main/resources; confirm. -->
	<entry key="ext_dict">mydict.dic</entry> 
	<!-- Users can configure their own extension stop-word dictionary here (currently disabled):
	<entry key="ext_stopwords">ext_stopword.dic</entry>-->
	
</properties>