Lucene實現自定義中文同義詞分詞器
----------------------------------------------------------
lucene的分詞_中文分詞介紹
----------------------------------------------------------
Paoding:庖丁解牛分詞器。已經沒有更新了
mmseg:使用搜狗的詞庫
1.匯入包(有兩個包:1.帶dic的,2.不帶dic的)
如果使用不帶dic的,得自己指定詞庫位置
2.建立MMSegAnalyzer(指明詞庫所在的位置)
----------------------------------------------------------
----------------------------------------------------------
/**
 * Custom Chinese synonym analyzer built on the mmseg dictionary.
 */
public class MySameAnalyzer extends Analyzer {

    @Override
    public TokenStream tokenStream(String fieldName, Reader read) {
        // Obtain the mmseg dictionary from its directory on disk.
        Dictionary dictionary = Dictionary
                .getInstance("F:\\BaiduYunDownload\\Cache\\lucune\\chinesedic");
        // Segment the input with MaxWordSeg first ...
        MMSegTokenizer tokenizer = new MMSegTokenizer(new MaxWordSeg(dictionary), read);
        // ... then let the synonym filter decorate the resulting stream.
        return new MySameTokenFilter(tokenizer);
    }
}
---------------------------------------------------------
lucene的分詞_實現自定義同義詞分詞器_實現分詞器
---------------------------------------------------------
Reader ---->MMSegTokenizer(進行分詞)---->(新增同義詞)MySameTokenFilter(自定義分詞過濾器)---->獲取同義詞(在相同的位置儲存同義詞)
--->發現同義詞--->儲存當前狀態--->跳到下一個元素--->根據同義詞列表來儲存元素--->還原狀態--->在同一位置儲存元素
/**
 * Token filter that injects synonyms into a token stream.
 *
 * <p>Each token from the wrapped stream is passed through unchanged. When the
 * token has synonyms, its attribute state is captured and every synonym is
 * emitted afterwards as an extra token with a position increment of 0, i.e.
 * at the same position as the original term.
 */
public class MySameTokenFilter extends TokenFilter {
    // Synonym table, built once instead of on every token.
    private static final Map<String, String[]> SYNONYMS = new HashMap<String, String[]>();
    static {
        SYNONYMS.put("我", new String[] { "俺", "咱" });
        SYNONYMS.put("湖南", new String[] { "魚米之鄉", "湘" });
    }

    // Term text of the current token.
    private final CharTermAttribute cta;
    // Position increment of the current token.
    private final PositionIncrementAttribute pia;
    // Captured state of the last token that had synonyms.
    private AttributeSource.State current;
    // Synonyms still waiting to be emitted for that token (LIFO order).
    private final Stack<String> sames = new Stack<String>();

    /**
     * Wraps the given stream.
     *
     * @param input the token stream whose tokens are enriched with synonyms
     */
    protected MySameTokenFilter(TokenStream input) {
        super(input);
        cta = this.addAttribute(CharTermAttribute.class);
        pia = this.addAttribute(PositionIncrementAttribute.class);
    }

    /**
     * Advances to the next token: a pending synonym if there is one, otherwise
     * the next token of the wrapped stream.
     *
     * @return {@code false} once the wrapped stream is exhausted and no
     *         synonym is pending, {@code true} otherwise
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public boolean incrementToken() throws IOException {
        // At most one synonym is emitted per call, so a plain 'if' is enough.
        if (!sames.isEmpty()) {
            String synonym = sames.pop();
            // Start from the attributes of the original token ...
            restoreState(current);
            // ... and replace only the term text.
            cta.setEmpty();
            cta.append(synonym);
            // Increment 0 stacks the synonym on the original token's position.
            pia.setPositionIncrement(0);
            return true;
        }
        if (!this.input.incrementToken()) {
            return false;
        }
        if (getSameWords(cta.toString())) {
            // Remember this token so its synonyms can be emitted on later calls.
            current = captureState();
        }
        return true;
    }

    /**
     * Resets the wrapped stream and drops any synonyms left over from a
     * previous, partially consumed pass.
     *
     * @throws IOException if the wrapped stream fails to reset
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        sames.clear();
        current = null;
    }

    /**
     * Queues the synonyms of {@code name}, if any, on the pending stack.
     *
     * @param name term text to look up
     * @return {@code true} if the term has synonyms, {@code false} otherwise
     */
    private boolean getSameWords(String name) {
        String[] sws = SYNONYMS.get(name);
        if (sws == null) {
            return false;
        }
        for (String str : sws) {
            sames.push(str);
        }
        return true;
    }
}
----------------------------------------------------
lucene的分詞_實現自定義同義詞分詞器_實現分詞器(良好設計方案)
----------------------------------------------------
思路:針對介面程式設計才是王道
1.建立管理同義詞的介面
/**
 * Abstraction over a synonym store, so that the storage can be swapped
 * without touching the code that consumes the synonyms.
 */
public interface MySameContxt {

    /**
     * Looks up the synonyms of a term.
     *
     * @param name the term to look up
     * @return the synonyms of {@code name}; may be {@code null} when the term
     *         has none, so callers should check
     */
    String[] getSameWords(String name);
}
2.實現介面,新增同義詞庫
public class MySimpleSameContxt implements MySameContxt {
/*
* 實現同義詞介面
*/
Map<String, String[]> maps = new HashMap<String, String[]>();
public MySimpleSameContxt() {
maps.put("我", new String[] { "俺", "咱" });
maps.put("湖南", new String[] { "魚米之鄉", "湘" });
}
public String[] getSameWords(String name) {
return maps.get(name);
}
}
3.在自定義分詞器的過濾器(TokenFilter)中新增同義詞屬性
// Synonym store dedicated to managing synonyms, accessed through the MySameContxt interface
private MySameContxt sameContxt;
完整程式碼
/**
 * Token filter that injects synonyms into a token stream.
 *
 * <p>Each token from the wrapped stream is passed through unchanged. When the
 * synonym store returns synonyms for a token, the token's attribute state is
 * captured and every synonym is emitted afterwards as an extra token with a
 * position increment of 0, i.e. at the same position as the original term.
 */
public class MySameTokenFilter extends TokenFilter {
    // Term text of the current token.
    private final CharTermAttribute cta;
    // Position increment of the current token.
    private final PositionIncrementAttribute pia;
    // Captured state of the last token that had synonyms.
    private AttributeSource.State current;
    // Synonyms still waiting to be emitted for that token (LIFO order).
    private final Stack<String> sames = new Stack<String>();
    // Store that the synonyms are looked up in.
    private final MySameContxt sameContxt;

    /**
     * Wraps the given stream.
     *
     * @param input      the token stream whose tokens are enriched with synonyms
     * @param sameContxt the store used to look up the synonyms of each token
     */
    protected MySameTokenFilter(TokenStream input, MySameContxt sameContxt) {
        super(input);
        cta = this.addAttribute(CharTermAttribute.class);
        pia = this.addAttribute(PositionIncrementAttribute.class);
        this.sameContxt = sameContxt;
    }

    /**
     * Advances to the next token: a pending synonym if there is one, otherwise
     * the next token of the wrapped stream.
     *
     * @return {@code false} once the wrapped stream is exhausted and no
     *         synonym is pending, {@code true} otherwise
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public boolean incrementToken() throws IOException {
        // At most one synonym is emitted per call, so a plain 'if' is enough.
        if (!sames.isEmpty()) {
            String synonym = sames.pop();
            // Start from the attributes of the original token ...
            restoreState(current);
            // ... and replace only the term text.
            cta.setEmpty();
            cta.append(synonym);
            // Increment 0 stacks the synonym on the original token's position.
            pia.setPositionIncrement(0);
            return true;
        }
        if (!this.input.incrementToken()) {
            return false;
        }
        if (getSameWords(cta.toString())) {
            // Remember this token so its synonyms can be emitted on later calls.
            current = captureState();
        }
        return true;
    }

    /**
     * Resets the wrapped stream and drops any synonyms left over from a
     * previous, partially consumed pass.
     *
     * @throws IOException if the wrapped stream fails to reset
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        sames.clear();
        current = null;
    }

    /**
     * Queues the synonyms of {@code name}, if any, on the pending stack.
     *
     * @param name term text to look up
     * @return {@code true} if at least one synonym was queued
     */
    private boolean getSameWords(String name) {
        String[] sws = sameContxt.getSameWords(name);
        // Nothing to emit: skip the state capture in the caller.
        if (sws == null || sws.length == 0) {
            return false;
        }
        for (String str : sws) {
            sames.push(str);
        }
        return true;
    }
}
4.實現Analyzer(在tokenStream方法中把同義詞庫傳給過濾器)
/**
 * Custom Chinese synonym analyzer built on the mmseg dictionary; the synonyms
 * come from a store supplied by the caller.
 */
public class MySameAnalyzer extends Analyzer {
    // Synonym store passed on to every filter this analyzer creates.
    private MySameContxt sameContxt;

    /**
     * @param msc the synonym store to use
     */
    public MySameAnalyzer(MySameContxt msc) {
        sameContxt = msc;
    }

    @Override
    public TokenStream tokenStream(String fieldName, Reader read) {
        // Obtain the mmseg dictionary from its directory on disk.
        Dictionary dictionary = Dictionary
                .getInstance("F:\\BaiduYunDownload\\Cache\\lucune\\chinesedic");
        // Segment the input with MaxWordSeg first ...
        MMSegTokenizer tokenizer = new MMSegTokenizer(new MaxWordSeg(dictionary), read);
        // ... then wrap it in the synonym filter together with the synonym store.
        return new MySameTokenFilter(tokenizer, sameContxt);
    }
}
5.編寫索引測試
/**
 * Indexes one sentence with the synonym analyzer, then searches for a synonym
 * ("魚米之鄉") that does not appear in the text and prints the stored content
 * of every hit.
 *
 * <p>I/O failures are printed rather than rethrown, as before. The writer,
 * reader and searcher are now closed even when an operation fails.
 */
public void test05() {
    try {
        // The synonym store is handed to the analyzer, which passes it on to its filter.
        Analyzer a1 = new MySameAnalyzer(new MySimpleSameContxt());
        String txt = "我來自湖南邵陽";
        // Build the index.
        Directory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
                Version.LUCENE_35, a1));
        try {
            Document doc = new Document();
            doc.add(new Field("content", txt, Field.Store.YES,
                    Field.Index.ANALYZED));
            writer.addDocument(doc);
        } finally {
            // Close the writer even if addDocument fails.
            writer.close();
        }
        // Search the index.
        IndexReader reader = IndexReader.open(dir);
        try {
            IndexSearcher search = new IndexSearcher(reader);
            try {
                TopDocs tds = search.search(new TermQuery(new Term("content",
                        "魚米之鄉")), 10);
                for (ScoreDoc sdc : tds.scoreDocs) {
                    Document docc = search.doc(sdc.doc);
                    System.out.println(docc.get("content"));
                }
            } finally {
                search.close();
            }
        } finally {
            // The reader was opened here, so it is closed here as well.
            reader.close();
        }
        // new AnalyzerUtils().displayToken(txt, a1);
    } catch (IOException e) {
        // CorruptIndexException and LockObtainFailedException are both
        // IOExceptions, so one handler covers all three original cases.
        e.printStackTrace();
    }
}