經典的分詞方法實現(JAVA)
基於規則的自動分詞演算法
原理
(1) 事先人工建立好分詞詞典和分詞規則庫。
(2) 原理為基於字串匹配進行分詞,這樣就要求有足夠大的詞表為依據。
(3) 通過一定的演算法來實現,如正向最大匹配法、逆向最大匹配法、雙向匹配法等。
(4) 優缺點:當分詞詞典所收錄的詞較少時,顯然覆蓋度就有限,分詞的正確率就低。
正向最大匹配法
演算法描述
設MaxLen表示最大詞長,D為分詞詞典
(1) 從待切分語料中按正向取長度為MaxLen的字串str,令Len=MaxLen;
(2) 把str與D中的詞相匹配;
(3) 若匹配成功,則認為該字串為詞,指向待切分語料的指標向前移Len個漢字(位元組),返回到(1);
(4) 若不成功:如果Len>1,則將Len減1,從待切分語料中取長度為Len的字串str,返回到(2)。否則,得到長度為1的單字詞,指向待切分語料的指標向前移1個漢字,返回(1)。
演算法程式碼
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package nlp;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * Forward maximum matching (FMM) word segmenter.
 *
 * <p>Scanning left to right, the segmenter repeatedly takes the longest window
 * (at most {@code maxLen} characters) that is a dictionary word. When no window
 * of two or more characters matches, the single character at the current
 * position is emitted as a token of its own. Every token in the output is
 * followed by {@code "/ "}.
 *
 * <p>The result of the most recent run is kept in a field, so an instance must
 * not be shared between threads.
 *
 * @author quincy1994
 */
public class Nlp {

    /** Dictionary file read by {@link #loadFile()}, resolved against the working directory. */
    private static final String DICTIONARY_FILE = "dict.txt";

    /** Window length used by the no-argument constructor. */
    private static final int DEFAULT_MAX_LEN = 3;

    /** Text appended after every emitted token. */
    private static final String TOKEN_SUFFIX = "/ ";

    /** Output accumulated by {@link #MM(String, int, int)}. */
    private String result = "";

    /** Longest window, in characters, tried at each position. */
    private final int totalMaxLen;

    /** Known words; never null. */
    private final Set<String> dictionary;

    /**
     * Creates a segmenter whose dictionary is read from {@code dict.txt}.
     *
     * <p>If the file cannot be read the failure is logged and the segmenter
     * falls back to an empty dictionary (every character becomes its own token)
     * instead of failing later with a {@code NullPointerException}.
     *
     * @param maxLen longest window, in characters, to try at each position
     * @throws IllegalArgumentException if {@code maxLen} is smaller than 1
     */
    public Nlp(int maxLen) {
        this.totalMaxLen = requirePositive(maxLen);
        this.dictionary = loadDictionaryOrEmpty();
    }

    /** Creates a segmenter with a maximum window of three characters and the dictionary from {@code dict.txt}. */
    public Nlp() {
        this(DEFAULT_MAX_LEN);
    }

    /**
     * Creates a segmenter over an in-memory dictionary; no file is read.
     *
     * @param maxLen longest window, in characters, to try at each position
     * @param dictionary the known words; copied, so later changes to the argument have no effect
     * @throws IllegalArgumentException if {@code maxLen} is smaller than 1
     * @throws NullPointerException if {@code dictionary} is null
     */
    public Nlp(int maxLen, Set<String> dictionary) {
        if (dictionary == null) {
            throw new NullPointerException("dictionary");
        }
        this.totalMaxLen = requirePositive(maxLen);
        this.dictionary = new HashSet<String>(dictionary);
    }

    /**
     * Reads the dictionary file: one entry per line, the word being the text
     * before the first comma. Lines whose word part is empty are skipped.
     *
     * @return the set of words found in the file
     * @throws FileNotFoundException if {@code dict.txt} does not exist
     * @throws IOException if reading fails
     */
    public Set<String> loadFile() throws FileNotFoundException, IOException {
        Set<String> words = new HashSet<String>();
        // NOTE(review): FileReader decodes with the platform default charset; confirm
        // that this matches the encoding dict.txt is stored in.
        try (BufferedReader reader = new BufferedReader(new FileReader(DICTIONARY_FILE))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // A limit of 2 keeps the leading field even for lines such as ",",
                // where a plain split(",") would return an empty array.
                String word = line.split(",", 2)[0];
                if (!word.isEmpty()) {
                    words.add(word);
                }
            }
        }
        return words;
    }

    /**
     * Segments {@code source} with forward maximum matching.
     *
     * <p>Each call starts from a clean state, so the same instance can segment
     * any number of texts.
     *
     * @param source text to segment; {@code null} is treated as empty
     * @return the tokens in order, each followed by {@code "/ "}; empty for empty input
     */
    public String MMSegment(String source) {
        result = "";
        if (source == null || source.isEmpty()) {
            return result;
        }
        MM(source, totalMaxLen, 0);
        return result;
    }

    /**
     * Returns the part of {@code source} that starts at {@code beginIndex} and
     * spans {@code len} characters, shortened when it would run past the end.
     *
     * @param source text to cut from
     * @param beginIndex index of the first character
     * @param len requested number of characters
     * @return the (possibly shortened) substring
     */
    public String getSubString(String source, int beginIndex, int len) {
        int endIndex = Math.min(beginIndex + len, source.length());
        return source.substring(beginIndex, endIndex);
    }

    /**
     * Runs forward maximum matching over {@code source} from {@code frompos} to
     * the end and appends the tokens to the stored result.
     *
     * <p>The scan is a loop rather than a recursion, so the length of the text
     * is not limited by the thread's stack size.
     *
     * @param source text to segment; {@code null} is ignored
     * @param len longest window to try at each position (values below 1 are treated as 1)
     * @param frompos index at which scanning starts (negative values are treated as 0)
     */
    public void MM(String source, int len, int frompos) {
        if (source == null) {
            return;
        }
        int longest = Math.max(1, len);
        int end = source.length();
        int pos = Math.max(0, frompos);
        StringBuilder out = new StringBuilder(result);
        while (pos < end) {
            // Never ask for more characters than remain in the text.
            int window = Math.min(longest, end - pos);
            String token = getSubString(source, pos, window);
            while (window > 1 && !dictionary.contains(token)) {
                window--;
                token = getSubString(source, pos, window);
            }
            // Either a dictionary word or, at window == 1, a lone character.
            out.append(token).append(TOKEN_SUFFIX);
            pos += window;
        }
        result = out.toString();
    }

    /** Reads the dictionary, falling back to an empty one (after logging) when the file is unreadable. */
    private Set<String> loadDictionaryOrEmpty() {
        try {
            return loadFile();
        } catch (IOException ex) {
            Logger.getLogger(Nlp.class.getName()).log(Level.SEVERE, null, ex);
            return new HashSet<String>();
        }
    }

    /** Returns {@code maxLen} unchanged, or throws when it is not a usable window length. */
    private static int requirePositive(int maxLen) {
        if (maxLen < 1) {
            throw new IllegalArgumentException("maxLen must be at least 1: " + maxLen);
        }
        return maxLen;
    }

    /**
     * Segments a sample sentence and prints the result.
     *
     * @param args the command line arguments (unused)
     */
    public static void main(String[] args) {
        Nlp nlp = new Nlp();
        String source = "今天天氣不錯!";
        String result = nlp.MMSegment(source);
        System.out.println(result);
    }
}
逆向最大匹配法
演算法描述
與正向最大匹配法原理一樣,只是匹配的開始為句尾
程式碼實現
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package nlp;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * Reverse maximum matching (RMM) word segmenter.
 *
 * <p>Scanning right to left, the segmenter repeatedly takes the longest window
 * (at most {@code maxLen} characters) that ends at the current position and is
 * a dictionary word. When no window of two or more characters matches, the
 * single character before the current position is emitted as a token of its
 * own. The tokens are reported in reading order, each followed by {@code "/ "}.
 *
 * <p>The result of the most recent run is kept in a field, so an instance must
 * not be shared between threads.
 *
 * @author quincy1994
 */
public class RMM {

    /** Dictionary file read by {@link #loadFile()}, resolved against the working directory. */
    private static final String DICTIONARY_FILE = "dict.txt";

    /** Window length used by the no-argument constructor. */
    private static final int DEFAULT_MAX_LEN = 3;

    /** Text appended after every emitted token. */
    private static final String TOKEN_SUFFIX = "/ ";

    /** Output accumulated by {@link #rmm(String, int, int)}, already in reading order. */
    private String result = "";

    /** Longest window, in characters, tried at each position. */
    private final int totalMaxlen;

    /** Known words; never null. */
    private final Set<String> dictionary;

    /**
     * Creates a segmenter whose dictionary is read from {@code dict.txt}.
     *
     * <p>If the file cannot be read the failure is logged and the segmenter
     * falls back to an empty dictionary (every character becomes its own token)
     * instead of failing later with a {@code NullPointerException}.
     *
     * @param maxLen longest window, in characters, to try at each position
     * @throws IllegalArgumentException if {@code maxLen} is smaller than 1
     */
    public RMM(int maxLen) {
        this.totalMaxlen = requirePositive(maxLen);
        this.dictionary = loadDictionaryOrEmpty();
    }

    /** Creates a segmenter with a maximum window of three characters and the dictionary from {@code dict.txt}. */
    public RMM() {
        this(DEFAULT_MAX_LEN);
    }

    /**
     * Creates a segmenter over an in-memory dictionary; no file is read.
     *
     * @param maxLen longest window, in characters, to try at each position
     * @param dictionary the known words; copied, so later changes to the argument have no effect
     * @throws IllegalArgumentException if {@code maxLen} is smaller than 1
     * @throws NullPointerException if {@code dictionary} is null
     */
    public RMM(int maxLen, Set<String> dictionary) {
        if (dictionary == null) {
            throw new NullPointerException("dictionary");
        }
        this.totalMaxlen = requirePositive(maxLen);
        this.dictionary = new HashSet<String>(dictionary);
    }

    /**
     * Reads the dictionary file: one entry per line, the word being the text
     * before the first comma. Lines whose word part is empty are skipped.
     *
     * @return the set of words found in the file
     * @throws IOException if the file is missing or reading fails
     */
    public Set<String> loadFile() throws IOException {
        Set<String> words = new HashSet<String>();
        // NOTE(review): FileReader decodes with the platform default charset; confirm
        // that this matches the encoding dict.txt is stored in.
        try (BufferedReader reader = new BufferedReader(new FileReader(DICTIONARY_FILE))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // A limit of 2 keeps the leading field even for lines such as ",",
                // where a plain split(",") would return an empty array.
                String word = line.split(",", 2)[0];
                if (!word.isEmpty()) {
                    words.add(word);
                }
            }
        }
        return words;
    }

    /**
     * Segments {@code source} with reverse maximum matching.
     *
     * <p>Each call starts from a clean state, so the same instance can segment
     * any number of texts.
     *
     * @param source text to segment; {@code null} is treated as empty
     * @return the tokens in reading order, each followed by {@code "/ "}; empty for empty input
     */
    public String RMMSegment(String source) {
        result = "";
        if (source == null || source.isEmpty()) {
            return result;
        }
        rmm(source, totalMaxlen, source.length());
        return result;
    }

    /**
     * Returns the part of {@code source} that ends just before {@code endIndex}
     * and spans {@code len} characters, shortened when it would start before
     * the beginning of the text.
     *
     * @param source text to cut from
     * @param endIndex index just past the last character
     * @param len requested number of characters
     * @return the (possibly shortened) substring
     */
    public String getSubString(String source, int endIndex, int len) {
        int startIndex = Math.max(0, endIndex - len);
        return source.substring(startIndex, endIndex);
    }

    /**
     * Runs reverse maximum matching over {@code source[0, frompos)} and appends
     * the tokens, in reading order, to the stored result.
     *
     * <p>Token boundaries are recorded as offsets while scanning backwards and
     * the output is assembled from them afterwards. Nothing is re-parsed from a
     * delimiter-joined string, so text that itself contains {@code '/'} is
     * handled correctly. The scan is a loop rather than a recursion, so the
     * length of the text is not limited by the thread's stack size.
     *
     * @param source text to segment; {@code null} is ignored
     * @param len longest window to try at each position (values below 1 are treated as 1)
     * @param frompos index just past the last character to segment (clamped to the text bounds)
     */
    public void rmm(String source, int len, int frompos) {
        if (source == null) {
            return;
        }
        int longest = Math.max(1, len);
        int end = Math.min(Math.max(0, frompos), source.length());

        // starts[k] is the start offset of the k-th token found, counted from the right.
        int[] starts = new int[end];
        int count = 0;
        int cursor = end;
        while (cursor > 0) {
            // Never ask for more characters than lie to the left of the cursor.
            int window = Math.min(longest, cursor);
            while (window > 1 && !dictionary.contains(getSubString(source, cursor, window))) {
                window--;
            }
            cursor -= window;
            starts[count++] = cursor;
        }

        // Emit left to right: the last boundary found belongs to the first token.
        StringBuilder out = new StringBuilder(result);
        for (int k = count - 1; k >= 0; k--) {
            int tokenStart = starts[k];
            int tokenEnd = (k == 0) ? end : starts[k - 1];
            out.append(source.substring(tokenStart, tokenEnd)).append(TOKEN_SUFFIX);
        }
        result = out.toString();
    }

    /** Reads the dictionary, falling back to an empty one (after logging) when the file is unreadable. */
    private Set<String> loadDictionaryOrEmpty() {
        try {
            return loadFile();
        } catch (IOException ex) {
            Logger.getLogger(RMM.class.getName()).log(Level.SEVERE, null, ex);
            return new HashSet<String>();
        }
    }

    /** Returns {@code maxLen} unchanged, or throws when it is not a usable window length. */
    private static int requirePositive(int maxLen) {
        if (maxLen < 1) {
            throw new IllegalArgumentException("maxLen must be at least 1: " + maxLen);
        }
        return maxLen;
    }

    /**
     * Segments a sample sentence and prints the result.
     *
     * @param args the command line arguments (unused)
     */
    public static void main(String[] args) {
        RMM myRMM = new RMM();
        String source = "記錄最佳前候選詞列表";
        String result = myRMM.RMMSegment(source);
        System.out.println(result);
    }
}
基於統計的中文分詞演算法
基本思想
選擇概率最大的分詞路徑作為最優結果
利用動態規劃演算法來實現,即最優路徑中的第 $i$ 個詞 $w_i$ 的累計概率等於它的左相鄰詞 $w_{i-1}$ 的累計概率乘以 $w_i$ 自身的概率:$P'(w_i) = P'(w_{i-1}) \cdot P(w_i)$
具體演算法
(1)對一個待分詞的字串 $S$,按照從左到右的順序取出全部候選詞 $w_1, w_2, \ldots, w_i, \ldots, w_n$;
(2)計算每個候選詞的概率值 $P(w_i)$,記錄每個候選詞的全部左鄰詞;
(3)計算每個候選詞的累計概率,累計概率最大的候選詞為最佳左鄰詞;
如果當前詞 $w_n$ 是字串的尾詞,且累計概率 $P'(w_n)$ 最大,則 $w_n$ 是 $S$ 的終點詞;
(4)從 $w_n$ 開始,按照從右到左順序,依次將每個詞的最佳左鄰詞輸出,即 $S$ 的分詞結果。
字典樹
又稱單詞查詢樹,Trie樹,是一種樹形結構,是一種雜湊樹的變種。典型應用是用於統計,排序和儲存大量的字串(但不僅限於字串),所以經常被搜尋引擎系統用於文字詞頻統計。它的優點是:利用字串的公共字首來減少查詢時間,最大限度地減少無謂的字串比較,查詢效率比雜湊樹高。
字典樹的程式碼實現
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package nlp;
import java.util.HashMap;
import java.util.Map;
/**
*
* @author quincy1994
*/
public class TireNode {
private String character; // 單個漢字
private int frequency = -1; // 詞頻, -1來區別某條路徑上的字串是否是一個片語
private double antilog = -1; // 對數化的詞頻
private Map<String, TireNode> children; //下一個節點
public String getCharacter(){
return character;
}
public void setCharacter(String character){
this.character = character;
}
public int getFrequency(){
return frequency;
}
public void setFrequency(int frequency){
this.frequency = frequency;
}
public double getAntilog(){
return antilog;
}
public void setAntilog(double antilog){
this.antilog = antilog;
}
public void addChild(TireNode node){
if (children == null){
children = new HashMap<String, TireNode>();
}
if (!children.containsKey(node.getCharacter())){
children.put(node.getCharacter(), node);
}
}
public TireNode getChild(String ch){
if (children == null || ! children.containsKey(ch)){
return null;
}
return children.get(ch);
}
public void removeChildren(String ch){
if (children == null || !children.containsKey(ch)){
return;
}
children.remove(ch);
}
}
演算法實現
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package nlp;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Statistical word segmenter: among all ways of cutting a sentence into
 * candidate words it picks the one with the smallest total cost, where the
 * cost of a dictionary word is a log-scaled value derived from the frequency
 * file.
 *
 * <p>Call {@link #init()} once to build the dictionary tree, then
 * {@link #segment(String)} for each sentence.
 *
 * @author quincy1994
 */
public class ChnSeq {

    /** Word-frequency file read by {@link #loadFile()}, resolved against the working directory. */
    private static final String FREQUENCY_FILE = "wordFre.txt";

    /**
     * Cost of a single character that is not usable as a dictionary word. Such
     * characters are still offered as candidates so that every sentence can be
     * segmented from start to end.
     * NOTE(review): 100 is a heuristic meant to exceed the log-scaled dictionary
     * costs; verify against the real frequency data.
     */
    private static final double UNKNOWN_CHAR_COST = 100.0;

    /** Text appended after every word in the output. */
    private static final String TOKEN_SUFFIX = "/ ";

    /** Root of the dictionary tree; {@code null} until {@link #init()} has run. */
    private TireNode tire = null;

    /**
     * Reads every line of the word-frequency file.
     *
     * @return the lines of the file, in order
     * @throws FileNotFoundException if {@code wordFre.txt} does not exist
     * @throws IOException if reading fails
     */
    public List<String> loadFile() throws FileNotFoundException, IOException {
        List<String> lines = new ArrayList<String>();
        // try-with-resources closes the reader even when readLine() throws.
        try (BufferedReader br = new BufferedReader(new FileReader(FREQUENCY_FILE))) {
            String tmp;
            while ((tmp = br.readLine()) != null) {
                lines.add(tmp);
            }
        }
        return lines;
    }

    /**
     * Builds the dictionary tree from the word-frequency file. Each non-blank
     * line is expected to hold {@code word,frequency,percentage}; the
     * percentage may carry a trailing {@code %}.
     *
     * @throws IOException if the file cannot be read
     */
    public void init() throws IOException {
        List<String> lines = loadFile();
        tire = new TireNode();
        for (String line : lines) {
            if (line.trim().isEmpty()) {
                continue; // tolerate blank lines, e.g. at the end of the file
            }
            String[] tokens = line.split(",");
            String word = tokens[0];
            int freq = Integer.parseInt(tokens[1]);
            // ln(1 + 0.01 / percentage): the smaller the percentage, the larger the cost.
            double antilog = Math.log(1 + 0.01 / Double.parseDouble(tokens[2].replace("%", "")));
            // Walk down from the root, creating the nodes that are still missing.
            TireNode node = tire;
            for (int i = 0; i < word.length(); i++) {
                String c = String.valueOf(word.charAt(i));
                TireNode child = node.getChild(c);
                if (child == null) {
                    child = new TireNode();
                    child.setCharacter(c);
                    node.addChild(child);
                }
                node = child;
            }
            // The node reached by the last character carries the word's statistics.
            node.setFrequency(freq);
            node.setAntilog(antilog);
        }
    }

    /** @return the root of the dictionary tree, or {@code null} before {@link #init()} */
    public TireNode getTire() {
        return tire;
    }

    /**
     * Follows {@code word} character by character through the dictionary tree.
     *
     * @param word the character sequence to look up
     * @return the node reached by the last character, or {@code null} when the
     *         tree has no such path (or has not been built yet)
     */
    public TireNode getNodeByWord(String word) {
        TireNode node = tire;
        for (int i = 0; i < word.length() && node != null; i++) {
            node = node.getChild(String.valueOf(word.charAt(i)));
        }
        return node;
    }

    /** A candidate word together with where it sits in the sentence. */
    private static final class Segment {
        public String word;     // the candidate word (or a start/end marker)
        public String endChar;  // last character of the word
        public String lastChar; // character just before the word in the sentence
        public double cost;     // cost of choosing this word
        public int start;       // index of the word's first character in the sentence
        public int end;         // index just past the word's last character
        public final static String START_SIGN = "<< STARTING >>";
        public final static String END_SIGN = "<< ENDING >>";
    }

    /** Creates a fully populated candidate. */
    private static Segment newSegment(String word, String endChar, String lastChar,
            int start, int end, double cost) {
        Segment seg = new Segment();
        seg.word = word;
        seg.endChar = endChar;
        seg.lastChar = lastChar;
        seg.start = start;
        seg.end = end;
        seg.cost = cost;
        return seg;
    }

    /**
     * Collects the candidate words of {@code sentence}.
     *
     * <p>The list starts with a zero-length start marker and ends with a
     * zero-length end marker placed at the sentence length. In between come,
     * for every start position from left to right, all dictionary words that
     * begin there. Each is printed to standard output as
     * {@code word cost frequency}. If the single character at a position has no
     * dictionary entry with a finite cost, it is added as a fallback candidate
     * with {@link #UNKNOWN_CHAR_COST}, so that characters missing from the
     * dictionary cannot make the sentence unsegmentable.
     *
     * @param sentence the text to analyse; {@code null} is treated as empty
     * @return the candidates, each carrying its position in the sentence
     */
    public List<Segment> preSegment(String sentence) {
        String text = (sentence == null) ? "" : sentence;
        int length = text.length();
        List<Segment> segs = new ArrayList<Segment>();

        // Start-of-sentence marker.
        segs.add(newSegment(Segment.START_SIGN, Segment.START_SIGN, null, 0, 0, 0.0));

        for (int i = 0; i < length; i++) {
            String previous = (i == 0) ? Segment.START_SIGN : text.substring(i - 1, i);
            boolean singleCharUsable = false;
            // Descend one tree level per character instead of re-walking the
            // tree from the root for every candidate end position.
            TireNode node = tire;
            for (int j = i + 1; node != null && j <= length; j++) {
                node = node.getChild(text.substring(j - 1, j));
                if (node == null || node.getFrequency() <= 0) {
                    continue; // missing path ends the loop; a pure prefix is skipped
                }
                String word = text.substring(i, j);
                Segment seg = newSegment(word, text.substring(j - 1, j), previous, i, j,
                        node.getAntilog());
                System.out.println(word + " " + seg.cost +" " + node.getFrequency());
                segs.add(seg);
                if (j == i + 1 && !Double.isNaN(seg.cost) && !Double.isInfinite(seg.cost)) {
                    singleCharUsable = true;
                }
            }
            if (!singleCharUsable) {
                String single = text.substring(i, i + 1);
                segs.add(newSegment(single, single, previous, i, i + 1, UNKNOWN_CHAR_COST));
            }
        }

        // End-of-sentence marker.
        String beforeEnd = (length == 0) ? Segment.START_SIGN : text.substring(length - 1, length);
        segs.add(newSegment(Segment.END_SIGN, Segment.END_SIGN, beforeEnd, length, length, 0.0));
        return segs;
    }

    /**
     * Picks the cheapest chain of candidates that covers the sentence without
     * gaps or overlaps.
     *
     * <p>Two candidates are neighbours only when one ends exactly where the
     * other starts; adjacency is decided by position, not by comparing
     * characters, so repeated characters cannot make the path jump over text.
     * {@code best[p]} is the lowest cost of segmenting the first {@code p}
     * characters and {@code via[p]} is the last word on that cheapest path.
     *
     * @param segs candidates as produced by {@link #preSegment(String)}
     * @return the chosen words in order, each followed by {@code "/ "};
     *         {@code null} when {@code segs} is null or empty or when no chain
     *         of candidates reaches the end of the sentence
     */
    public String dynamicSegment(List<Segment> segs) {
        if (segs == null || segs.size() == 0) {
            System.out.println("找不到候選詞");
            return null;
        }

        // The furthest end offset is the length of the sentence.
        int length = 0;
        for (Segment seg : segs) {
            if (seg.end > length) {
                length = seg.end;
            }
        }

        // Group the real (non-empty) candidates by start position; the
        // zero-length start/end markers are left out.
        List<List<Segment>> byStart = new ArrayList<List<Segment>>(length + 1);
        for (int p = 0; p <= length; p++) {
            byStart.add(new ArrayList<Segment>());
        }
        for (Segment seg : segs) {
            if (seg.start >= 0 && seg.end > seg.start) {
                byStart.get(seg.start).add(seg);
            }
        }

        double[] best = new double[length + 1];
        Segment[] via = new Segment[length + 1];
        for (int p = 1; p <= length; p++) {
            best[p] = Double.POSITIVE_INFINITY;
        }

        // Every candidate ends to the right of where it starts, so one
        // left-to-right sweep sees best[p] in its final state.
        for (int p = 0; p < length; p++) {
            if (Double.isInfinite(best[p])) {
                continue; // no chain of candidates ends here
            }
            for (Segment seg : byStart.get(p)) {
                double candidate = best[p] + seg.cost;
                if (candidate < best[seg.end]) {
                    best[seg.end] = candidate;
                    via[seg.end] = seg;
                }
            }
        }

        if (length > 0 && via[length] == null) {
            System.out.println("找不到候選詞");
            return null;
        }

        // Walk back from the end, then emit in reading order.
        List<String> reversed = new ArrayList<String>();
        for (int p = length; p > 0; p = via[p].start) {
            reversed.add(via[p].word);
        }
        StringBuilder result = new StringBuilder();
        for (int i = reversed.size() - 1; i >= 0; i--) {
            result.append(reversed.get(i)).append(TOKEN_SUFFIX);
        }
        return result.toString();
    }

    /**
     * Segments a sentence.
     *
     * @param sentences the text to segment
     * @return the words in order, each followed by {@code "/ "}
     */
    public String segment(String sentences) {
        return dynamicSegment(preSegment(sentences));
    }

    /** Loads the dictionary, segments a sample paragraph and prints the result. */
    public static void main(String[] args) throws ClassNotFoundException, IOException {
        ChnSeq cs = new ChnSeq();
        cs.init();
        String sentence = "在這一年中,改革開放和現代化建設繼續向前邁進。經濟保持了“高增長、低通脹”的良好發展態勢。農業生產再次獲得好的收成,企業改革繼續深化,人民生活進一步改善。對外經濟技術合作與交流不斷擴大。";
        String segs = cs.segment(sentence);
        System.out.println(segs);
    }
}