Lucene7.0與HanLP分詞器整合索引資料庫建立索引檔案

阿新 • • 發佈：2018-12-03

HanLP官網：http://hanlp.linrunsoft.com/

GitHub地址：https://github.com/hankcs/HanLP

HanLP外掛地址：https://github.com/hankcs/hanlp-lucene-plugin

需要以下jar包

類

package com.kyd.demo.hanLP;

import java.io.IOException;
import java.nio.file.Paths;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.junit.Test;

import com.hankcs.lucene.HanLPAnalyzer;
import com.hankcs.lucene.HanLPIndexAnalyzer;

/**

索引資料庫欄位建立索引檔案
@author zhengzhen

/
public class JdbcIndexDemo {
public static void main(String[] args) {
try {
Class.forName(“com.mysql.jdbc.Driver”);
String url = “jdbc:mysql://192.168.100.69:3306/xxxx?useUnicode=true&characterEncoding=utf8&autoReconnect=true&failOverReadOnly=false”;
String password =“root”;
String userName =“root”;
String sql =“select * from xxxx”;
try (
Connection conn = DriverManager.getConnection(url,userName,password);
PreparedStatement sta =conn.prepareStatement(sql);
ResultSet rs = sta.executeQuery();
){
/

*
* 1.設定索引檔案儲存路徑
/
Directory directory = FSDirectory.open(Paths.get(“xxxx_index”));
/*
* 2.建立分詞器
/
Analyzer analyzer = new HanLPIndexAnalyzer();
/*
* 3.分詞器配置
/
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
indexWriterConfig.setOpenMode(OpenMode.CREATE);
/*
* 4.建立索引輸出流
/
IndexWriter indexWriter = new IndexWriter(directory,indexWriterConfig);
/*
* 5.迴圈遍歷建立索引文件
/
while (rs.next()) {
/*
* 5.1.建立文件
/
Document document = new Document();
/*
* 5.2.新增欄位
*/
Long id =rs.getLong(“unitId”);
IndexableField unitIdField = new StringField(“unitId”, id+"",Store.YES);
document.add(unitIdField);

					String title = rs.getString("title");
					if( title != null) {
						IndexableField sectionNameField = new TextField("sectionName", title, Store.YES);
						document.add(sectionNameField);
					}
					
				
					
					String  unitName= rs.getString("unitName");
					if( unitName != null) {
						IndexableField unitNameField = new TextField("unitName", unitName, Store.YES);
						document.add(unitNameField);
					}
					
					
					String  courseName= rs.getString("courseName");
					if(courseName !=null) {
						IndexableField courseNameField = new TextField("courseName", courseName, Store.YES);
						document.add(courseNameField);
					}
					
					
					String  startPage= rs.getString("startPage");
					if(startPage !=null) {
						IndexableField startPageField = new StringField("startPage", startPage, Store.YES);
						document.add(startPageField);
					}
					
					
					String  endPage= rs.getString("startEndPage");
					if(endPage != null) {
						IndexableField endPageField = new StringField("endPage", endPage,Store.YES);
						document.add(endPageField);
					}
				
					
					indexWriter.addDocument(document);
					
				}
				indexWriter.commit();
		} catch (Exception e) {
			e.printStackTrace();
		}
	} catch (ClassNotFoundException e1) {
		
		e1.printStackTrace();
	}
	
}
/**
 * HanLPAnalyzer
 * 這個分詞器對於長詞不會切割 ，例如 “中華人民共和國” 是一個長詞會保留下來
 * @throws IOException
 */
@Test
public void hanLPAnalyzerTest() throws IOException {
	String text = "中華人民共和國很遼闊";
	for (int i = 0; i < text.length(); ++i)
	{
	    System.out.print(text.charAt(i) + "" + i + " ");
	}
	System.out.println();
	Analyzer analyzer = new HanLPAnalyzer();
	TokenStream tokenStream = analyzer.tokenStream("field", text);
	tokenStream.reset();
	while (tokenStream.incrementToken())
	{
	    CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
	    // 偏移量
	    OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
	    // 距離
	    PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
	    System.out.println(attribute + " " + offsetAtt.startOffset() + " " + offsetAtt.endOffset() + " " + positionAttr.getPositionIncrement());
	}
	/* 輸出：
	 * 中0 華1 人2 民3 共4 和5 國6 很7 遼8 闊9 
	 * 中華人民共和國 0 7 1
	 * 很 7 8 1
	 * 遼闊 8 10 1
	 */
}
/**
 * HanLPIndexAnalyzer
 * 這個分詞器會對長詞進行分割 “中華人民共和國” 會切分成“中華人民共和國” “中華” “人民”等等
 * @throws IOException
 */
@Test
public void hanLPIndexAnalyzerTest() throws IOException {
	String text = "中華人民共和國很遼闊";
	for (int i = 0; i < text.length(); ++i)
	{
	    System.out.print(text.charAt(i) + "" + i + " ");
	}
	System.out.println();
	Analyzer analyzer = new HanLPIndexAnalyzer();
	TokenStream tokenStream = analyzer.tokenStream("field", text);
	tokenStream.reset();
	while (tokenStream.incrementToken())
	{
	    CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
	    // 偏移量
	    OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
	    // 距離
	    PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
	    System.out.println(attribute + " " + offsetAtt.startOffset() + " " + offsetAtt.endOffset() + " " + positionAttr.getPositionIncrement());
	}
	/* 輸出：
	 * 中0 華1 人2 民3 共4 和5 國6 很7 遼8 闊9 
	 * 中華人民共和國 0 7 1
	 * 中華人民 0 4 1
	 * 中華 0 2 1
	 * 華人 1 3 1
	 * 人民共和國 2 7 1
	 * 人民 2 4 1
	 * 共和國 4 7 1
	 * 共和 4 6 1
	 * 很 7 8 1
	 * 遼闊 8 10 1
	 */
}

}

文章來源於雨夜星辰03的部落格

Lucene7.0與HanLP分詞器整合索引資料庫建立索引檔案

Lucene7.0與HanLP分詞器整合索引資料庫建立索引檔案

Elasticsearch整合HanLP分詞器

HanLP分詞器的使用方法

基於elasticsearch6.4.0 配置IK分詞器

HanLPTokenizer HanLP分詞器

ElasticSearch搜尋伺服器與IK分詞器

ElasticSearch6.0配置IK分詞器

Solr 5.0.0配置中文分詞器IK Analyzer

solr與中文分詞器的安裝配置

solr8.0 ik中文分詞器的簡單配置（二）

Ansj與hanlp分詞工具對比

中文分詞器（IK）的配置檔案

Solr6.6.0添加IK中文分詞器

Lucene.net(4.8.0) 學習問題記錄五: JIEba分詞和Lucene的結合，以及對分詞器的思考

Elasticsearch5.4.0叢集安裝IK分詞器

ElasticSearch6.5.0 【安裝IK分詞器】

solr 7+tomcat 8 + mysql實現solr 7基本使用(安裝、整合中文分詞器、定時同步資料庫資料以及專案整合)

Solr搜尋引擎之整合IKAnalyzer分詞器

學習筆記:從0開始學習大資料-29. solr增加ik中文分詞器並匯入doc，pdf文件全文檢索

elasticsearch-手動設定_mapping中欄位型別及分詞器-ES5.X與ES6.X區別

Lucene7.0與HanLP分詞器整合索引資料庫建立索引檔案

相關推薦