經典的分詞方法實現（JAVA)

阿新 • • 發佈：2019-02-16

基於規則的自動分詞演算法

原理

(1) 事先人工建立好分詞詞典和分詞規則庫。
(2) 原理為基於字串匹配進行分詞,這樣就要求有足夠大的詞表為依據。
(3) 通過一定的演算法來實現,如正向最大匹配法、逆向最大匹配法、雙向匹配法等。
(4) 優缺點:當分詞詞典所收容的詞較少時,顯然覆蓋度就有限,分詞的正確率就低。

正向最大匹配法

演算法描述

設MaxLen表示最大詞長,D為分詞詞典
(1) 從待切分語料中按正向取長度為MaxLen的字串str,令Len=MaxLen;
(2) 把str與D中的詞相匹配;
(3) 若匹配成功,則認為該字串為詞,指向待切分語料的指標向前移Len個漢字(位元組),返回到(1);
(4) 若不成功:如果Len>1,則將Len減1,從待切分語料中取長度為Len的字串str,返回到(2)。否則,得到長度為1的單字詞,指向待切分語料的指標向前移1個漢字,返回(1)。

演算法程式碼

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package nlp;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import 
 java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * Dictionary-based word segmenter using forward maximum matching (FMM).
 *
 * <p>Scanning left to right, the longest dictionary word starting at the current position
 * (at most the configured maximum word length) is emitted; when nothing matches, a single
 * character is emitted instead. Every emitted token is followed by {@code "/ "}.
 *
 * <p>The scan result is kept in a field, so one instance must not be shared by
 * concurrent callers.
 *
 * @author quincy1994
 */
public class Nlp {

    private String m_sResult = "";   // output accumulated by MM(); reset by MMSegment()
    private int totalMaxLen;         // longest candidate length tried at each position (>= 1)
    private Set<String> dictionary;  // known words; never null after construction

    /**
     * Creates a segmenter whose dictionary is read from {@code dict.txt}.
     * If the file cannot be read the error is logged and an empty dictionary is used,
     * so segmentation degrades to single characters instead of failing later.
     *
     * @param maxLen maximum word length to try; values below 1 are treated as 1
     */
    public Nlp(int maxLen) {
        this.totalMaxLen = Math.max(1, maxLen);
        this.dictionary = loadDictionaryOrEmpty();
    }

    /** Creates a segmenter with a maximum word length of 3, reading {@code dict.txt}. */
    public Nlp() {
        this(3);
    }

    /**
     * Creates a segmenter over an in-memory dictionary (no file access).
     *
     * @param maxLen     maximum word length to try; values below 1 are treated as 1
     * @param dictionary known words; copied, {@code null} is treated as empty
     */
    public Nlp(int maxLen, Set<String> dictionary) {
        this.totalMaxLen = Math.max(1, maxLen);
        this.dictionary = (dictionary == null)
                ? new HashSet<String>()
                : new HashSet<String>(dictionary);
    }

    /** Loads the dictionary file, falling back to an empty dictionary on I/O failure. */
    private Set<String> loadDictionaryOrEmpty() {
        try {
            return loadFile();
        } catch (IOException ex) {
            Logger.getLogger(Nlp.class.getName()).log(Level.SEVERE,
                    "Could not read dict.txt; continuing with an empty dictionary", ex);
            return new HashSet<String>();
        }
    }

    /**
     * Reads the dictionary from {@code dict.txt} in the working directory.
     * The word is the text before the first comma on each line; lines whose word part
     * is empty are skipped.
     *
     * @return the set of dictionary words
     * @throws FileNotFoundException if {@code dict.txt} does not exist
     * @throws IOException           if reading fails
     */
    public Set<String> loadFile() throws FileNotFoundException, IOException {
        Set<String> words = new HashSet<String>();
        String filename = "dict.txt";
        // NOTE(review): FileReader decodes with the platform default charset; confirm it
        // matches the encoding of dict.txt.
        try (BufferedReader br = new BufferedReader(new FileReader(filename))) {
            String line;
            while ((line = br.readLine()) != null) {
                int comma = line.indexOf(',');
                String word = (comma < 0) ? line : line.substring(0, comma);
                if (!word.isEmpty()) {
                    words.add(word);
                }
            }
        }
        return words;
    }

    /**
     * Segments {@code source} from scratch using forward maximum matching.
     *
     * @param source text to segment
     * @return the tokens in order, each followed by {@code "/ "}; empty for empty input
     */
    public String MMSegment(String source) {
        m_sResult = "";  // discard output of any earlier call
        MM(source, totalMaxLen, 0);
        return m_sResult;
    }

    /**
     * Returns the substring of {@code source} that starts at {@code m_nPosIndex} and is
     * {@code len} characters long, truncated at the end of {@code source}.
     *
     * @param source      text to cut from
     * @param m_nPosIndex start index (inclusive)
     * @param len         requested length
     * @return the (possibly shortened) substring
     */
    public String getSubString(String source, int m_nPosIndex, int len) {
        int endIndex = Math.min(m_nPosIndex + len, source.length());
        return source.substring(m_nPosIndex, endIndex);
    }

    /**
     * Segments {@code source} from {@code frompos} to its end and appends the tokens to the
     * internal result buffer. Implemented as a loop so long inputs cannot overflow the stack.
     *
     * @param source  text to segment
     * @param len     maximum word length to try at each position; values below 1 act as 1
     * @param frompos index of the first character to segment; negative values act as 0
     */
    public void MM(String source, int len, int frompos) {
        int window = Math.max(1, len);
        int pos = Math.max(0, frompos);
        int total = source.length();
        StringBuilder out = new StringBuilder(m_sResult);

        while (pos < total) {
            // Start with the longest candidate that still fits, then shrink until a
            // dictionary word is found; a single character is always accepted.
            int size = Math.min(window, total - pos);
            while (size > 1 && !dictionary.contains(source.substring(pos, pos + size))) {
                size--;
            }
            out.append(source, pos, pos + size).append("/ ");
            pos += size;
        }
        m_sResult = out.toString();
    }

    /**
     * @param args the command line arguments
     */
    public static void main(String[] args) {
        Nlp nlp = new Nlp();
        String source = "今天天氣不錯！";
        String result = nlp.MMSegment(source);
        System.out.println(result);
    }
}

逆向最大匹配法

演算法描述

與正向最大匹配法原理一樣，只是匹配的開始為句尾

程式碼實現

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package nlp;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * Dictionary-based word segmenter using reverse maximum matching (RMM).
 *
 * <p>Scanning right to left, the longest dictionary word ending at the current position
 * (at most the configured maximum word length) is taken; when nothing matches, a single
 * character is taken instead. The tokens are returned in reading order, each followed
 * by {@code "/ "}.
 *
 * <p>The scan result is kept in a field, so one instance must not be shared by
 * concurrent callers.
 *
 * @author quincy1994
 */
public class RMM {
    private String m_sResult = "";       // output accumulated by rmm(); reset by RMMSegment()
    private int totalMaxlen;             // longest candidate length tried at each position (>= 1)
    private Set<String> dictionary;      // known words; never null after construction

    /**
     * Creates a segmenter whose dictionary is read from {@code dict.txt}.
     * If the file cannot be read the error is logged and an empty dictionary is used,
     * so segmentation degrades to single characters instead of failing later.
     *
     * @param maxLen maximum word length to try; values below 1 are treated as 1
     */
    public RMM(int maxLen) {
        this.totalMaxlen = Math.max(1, maxLen);
        this.dictionary = loadDictionaryOrEmpty();
    }

    /** Creates a segmenter with a maximum word length of 3, reading {@code dict.txt}. */
    public RMM() {
        this(3);
    }

    /**
     * Creates a segmenter over an in-memory dictionary (no file access).
     *
     * @param maxLen     maximum word length to try; values below 1 are treated as 1
     * @param dictionary known words; copied, {@code null} is treated as empty
     */
    public RMM(int maxLen, Set<String> dictionary) {
        this.totalMaxlen = Math.max(1, maxLen);
        this.dictionary = (dictionary == null)
                ? new HashSet<String>()
                : new HashSet<String>(dictionary);
    }

    /** Loads the dictionary file, falling back to an empty dictionary on I/O failure. */
    private Set<String> loadDictionaryOrEmpty() {
        try {
            return loadFile();
        } catch (IOException ex) {
            Logger.getLogger(RMM.class.getName()).log(Level.SEVERE,
                    "Could not read dict.txt; continuing with an empty dictionary", ex);
            return new HashSet<String>();
        }
    }

    /**
     * Reads the dictionary from {@code dict.txt} in the working directory.
     * The word is the text before the first comma on each line; lines whose word part
     * is empty are skipped.
     *
     * @return the set of dictionary words
     * @throws IOException if the file is missing or cannot be read
     */
    public Set<String> loadFile() throws IOException {
        Set<String> words = new HashSet<String>();
        String filename = "dict.txt";
        // NOTE(review): FileReader decodes with the platform default charset; confirm it
        // matches the encoding of dict.txt.
        try (BufferedReader br = new BufferedReader(new FileReader(filename))) {
            String line;
            while ((line = br.readLine()) != null) {
                int comma = line.indexOf(',');
                String word = (comma < 0) ? line : line.substring(0, comma);
                if (!word.isEmpty()) {
                    words.add(word);
                }
            }
        }
        return words;
    }

    /**
     * Segments {@code source} from scratch using reverse maximum matching.
     *
     * @param source text to segment
     * @return the tokens in reading order, each followed by {@code "/ "}; empty for empty input
     */
    public String RMMSegment(String source) {
        m_sResult = "";  // discard output of any earlier call
        rmm(source, totalMaxlen, source.length());
        return m_sResult;
    }

    /**
     * Returns the substring of {@code source} that ends at {@code m_nPosIndex} and is
     * {@code len} characters long, truncated at the start of {@code source}.
     *
     * @param source      text to cut from
     * @param m_nPosIndex end index (exclusive)
     * @param len         requested length
     * @return the (possibly shortened) substring
     */
    public String getSubString(String source, int m_nPosIndex, int len) {
        int startIndex = Math.max(0, m_nPosIndex - len);
        return source.substring(startIndex, m_nPosIndex);
    }

    /**
     * Segments {@code source[0, frompos)} from right to left and puts the tokens, in reading
     * order, in front of the internal result buffer. Implemented as a loop so long inputs
     * cannot overflow the stack; token boundaries are recorded as indices, so input that
     * itself contains {@code '/'} is handled correctly.
     *
     * @param source  text to segment
     * @param len     maximum word length to try at each position; values below 1 act as 1
     * @param frompos index just past the last character to segment; clamped to
     *                {@code [0, source.length()]}
     */
    public void rmm(String source, int len, int frompos) {
        int window = Math.max(1, len);
        int end = Math.min(Math.max(0, frompos), source.length());

        // starts[k] is the start index of the k-th token counted from the right.
        int[] starts = new int[end];
        int count = 0;
        int pos = end;
        while (pos > 0) {
            // Start with the longest candidate that still fits, then shrink until a
            // dictionary word is found; a single character is always accepted.
            int size = Math.min(window, pos);
            while (size > 1 && !dictionary.contains(source.substring(pos - size, pos))) {
                size--;
            }
            pos -= size;
            starts[count++] = pos;
        }

        // Emit left to right: token k spans [starts[k], starts[k - 1]), the rightmost ends at end.
        StringBuilder out = new StringBuilder();
        for (int k = count - 1; k >= 0; k--) {
            int tokenEnd = (k == 0) ? end : starts[k - 1];
            out.append(source, starts[k], tokenEnd).append("/ ");
        }
        m_sResult = out.append(m_sResult).toString();
    }

    public static void main(String[] args) {
        RMM myRMM = new RMM();
        String source = "記錄最佳前候選詞列表";
        String result = myRMM.RMMSegment(source);
        System.out.println(result);
    }
}

基於統計的中文分詞演算法

基本思想

選擇概率最大的分詞路徑作為最優結果
利用動態規劃演算法來實現,即最優路徑中的第 $i$ 個詞 $w_i$ 的累計概率等於它的左相鄰詞 $w_{i-1}$ 的累計概率乘以 $w_i$ 自身的概率

具體演算法

(1)對一個待分詞的字串 $S$,按照從左到右的順序取出全部候選詞 $w_1, w_2, \ldots, w_i, \ldots, w_n$;
(2)計算每個候選詞的概率值 $P(w_i)$,記錄每個候選詞的全部左鄰詞;
(3)計算每個候選詞的累計概率,累計概率最大的候選詞為最佳左鄰詞;
如果當前詞 $w_n$ 是字串的尾詞,且累計概率 $P'(w_n)$ 最大,則 $w_n$ 是 $S$ 的終點詞;
(4)從 $w_n$ 開始,按照從右到左順序,依次將每個詞的最佳左鄰詞輸出,即 $S$ 的分詞結果.

字典樹

又稱單詞查詢樹，Trie樹，是一種樹形結構，是一種雜湊樹的變種。典型應用是用於統計，排序和儲存大量的字串（但不僅限於字串），所以經常被搜尋引擎系統用於文字詞頻統計。它的優點是：利用字串的公共字首來減少查詢時間，最大限度地減少無謂的字串比較，查詢效率比雜湊樹高。

字典樹的程式碼實現

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package nlp;

import java.util.HashMap;
import java.util.Map;

/**
 * One node of the dictionary trie. A node stores a single character, optional word
 * statistics, and its child nodes keyed by their character.
 *
 * @author quincy1994
 */
public class TireNode {
    /** The single character held by this node. */
    private String character;
    /** Word frequency; stays -1 while the path to this node has not been marked as a word. */
    private int frequency = -1;
    /** Log-scaled weight for the word; stays -1 until assigned. */
    private double antilog = -1;
    /** Child nodes keyed by their character; created on the first {@link #addChild}. */
    private Map<String, TireNode> children;

    /** @return the character held by this node */
    public String getCharacter(){
        return this.character;
    }

    /** @param character the character this node should hold */
    public void setCharacter(String character){
        this.character = character;
    }

    /** @return the stored frequency, or -1 if none was set */
    public int getFrequency(){
        return this.frequency;
    }

    /** @param frequency the frequency to store */
    public void setFrequency(int frequency){
        this.frequency = frequency;
    }

    /** @return the stored log-scaled weight, or -1 if none was set */
    public double getAntilog(){
        return this.antilog;
    }

    /** @param antilog the log-scaled weight to store */
    public void setAntilog(double antilog){
        this.antilog = antilog;
    }

    /**
     * Registers {@code node} as a child under its own character. If a child with that
     * character already exists, the existing child is kept and {@code node} is ignored.
     *
     * @param node the child to register
     */
    public void addChild(TireNode node){
        if (this.children == null){
            this.children = new HashMap<String, TireNode>();
        }
        String key = node.getCharacter();
        if (this.children.containsKey(key)){
            return;  // the first child registered for a character wins
        }
        this.children.put(key, node);
    }

    /**
     * Looks up the child stored under {@code ch}.
     *
     * @param ch the character to look up
     * @return the matching child, or {@code null} if there is none
     */
    public TireNode getChild(String ch){
        return (this.children == null) ? null : this.children.get(ch);
    }

    /**
     * Removes the child stored under {@code ch}; does nothing if there is no such child.
     *
     * @param ch the character whose child should be removed
     */
    public void removeChildren(String ch){
        if (this.children != null){
            this.children.remove(ch);
        }
    }
}

演算法實現

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package nlp;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Statistical word segmenter: candidate words are looked up in a dictionary trie and the
 * cheapest chain of candidates from a start marker to an end marker is chosen.
 *
 * <p>Call {@link #init()} before segmenting; it reads {@code wordFre.txt}, whose lines have
 * the form {@code word,frequency,percentage} (the percentage may carry a {@code %} sign).
 *
 * @author quincy1994
 */
public class ChnSeq {

    private TireNode tire = null;  // root of the dictionary trie; null until init() runs

    /**
     * Reads all lines of {@code wordFre.txt} from the working directory.
     *
     * @return the lines in file order
     * @throws FileNotFoundException if the file does not exist
     * @throws IOException           if reading fails
     */
    public List<String> loadFile() throws FileNotFoundException, IOException {
        List<String> lines = new ArrayList<String>();
        String filename = "wordFre.txt";
        // NOTE(review): FileReader decodes with the platform default charset; confirm it
        // matches the encoding of wordFre.txt.
        try (BufferedReader br = new BufferedReader(new FileReader(filename))) {
            String tmp;
            while ((tmp = br.readLine()) != null) {
                lines.add(tmp);
            }
        }
        return lines;
    }

    /**
     * Builds the dictionary trie from {@link #loadFile()}. For every word the terminal node
     * receives the frequency and the cost {@code ln(1 + 0.01 / p)}, where {@code p} is the
     * third field with any {@code %} removed. Blank lines are skipped.
     *
     * @throws IOException if the dictionary file cannot be read
     */
    public void init() throws IOException {
        List<String> lines = loadFile();
        tire = new TireNode();

        for (String line : lines) {
            if (line.trim().isEmpty()) {
                continue;  // tolerate blank lines such as a trailing newline at end of file
            }
            String[] tokens = line.split(",");
            String word = tokens[0];
            int freq = Integer.parseInt(tokens[1]);
            double antilog = Math.log(1 + 0.01 / Double.parseDouble(tokens[2].replace("%", "")));

            // Walk down the trie one character at a time, creating missing nodes.
            TireNode root = tire;
            for (int i = 0; i < word.length(); i++) {
                String c = "" + word.charAt(i);
                TireNode node = root.getChild(c);
                if (node == null) {
                    node = new TireNode();
                    node.setCharacter(c);
                    root.addChild(node);
                }
                root = node;
            }
            root.setFrequency(freq);    // marks the path as a complete word
            root.setAntilog(antilog);   // cost used when choosing between candidates
        }
    }

    /** @return the trie root, or {@code null} if {@link #init()} has not been called */
    public TireNode getTire() {
        return tire;
    }

    /**
     * Follows {@code word} character by character from the trie root.
     *
     * @param word the character sequence to look up
     * @return the node reached, or {@code null} if the path does not exist
     */
    public TireNode getNodeByWord(String word) {
        TireNode node = tire;
        for (int i = 0; i < word.length() && node != null; i++) {
            node = node.getChild(word.charAt(i) + "");
        }
        return node;
    }

    /** A candidate word together with the characters used to chain it to its neighbours. */
    private static class Segment {

        public String word;      // the candidate word
        public String endChar;   // last character of the word
        public String lastChar;  // character immediately before the word in the sentence
        public double cost;

        public final static String START_SIGN = "<< STARTING >>";
        public final static String END_SIGN = "<< ENDING >>";
    }

    /**
     * Collects every substring of {@code sentence} that is a dictionary word with positive
     * frequency, framed by a start marker and an end marker.
     *
     * @param sentence text to segment; an empty string yields just the two markers
     * @return the candidates ordered by start index, then by length
     */
    public List<Segment> preSegment(String sentence) {
        List<Segment> segs = new ArrayList<Segment>();

        // Sentence start marker.
        Segment terminal = new Segment();
        terminal.word = Segment.START_SIGN;
        terminal.endChar = Segment.START_SIGN;
        terminal.lastChar = null;
        segs.add(terminal);

        int length = sentence.length();
        for (int i = 0; i < length; i++) {
            for (int j = i + 1; j <= length; j++) {
                String word = sentence.substring(i, j);
                TireNode tnode = this.getNodeByWord(word);
                if (tnode == null) {
                    break;      // no dictionary entry starts with this prefix
                }
                if (tnode.getFrequency() <= 0) {
                    continue;   // a prefix of longer words but not a word itself
                }

                Segment seg = new Segment();
                seg.word = word;
                seg.endChar = word.substring(word.length() - 1, word.length());
                if (i == 0) {
                    seg.lastChar = Segment.START_SIGN;
                } else {
                    seg.lastChar = sentence.substring(i - 1, i);
                }
                seg.cost = tnode.getAntilog();
                System.out.println(word + " " + seg.cost +" " + tnode.getFrequency());
                segs.add(seg);
            }
        }

        // Sentence end marker; for an empty sentence it follows the start marker directly.
        terminal = new Segment();
        terminal.word = Segment.END_SIGN;
        terminal.endChar = Segment.END_SIGN;
        terminal.lastChar = (length == 0)
                ? Segment.START_SIGN
                : sentence.substring(length - 1, length);
        segs.add(terminal);

        return segs;
    }

    /**
     * Chooses the cheapest chain of candidates from the first element (start marker) to
     * the last element (end marker) and joins the chosen words with {@code "/ "}.
     *
     * <p>NOTE(review): candidate {@code j} is treated as a possible successor of candidate
     * {@code i} when the character before {@code j} equals the last character of {@code i}
     * and {@code j - i < 4}. This compares characters, not sentence positions, and the
     * window of 4 can miss valid successors; confirm whether that is intended.
     *
     * @param segs candidates as produced by {@link #preSegment(String)}
     * @return the segmented text, or {@code null} if {@code segs} is empty or no chain
     *         reaches the end marker
     */
    public String dynamicSegment(List<Segment> segs) {

        final double INFINITE = 9999999;

        if (segs == null || segs.size() == 0) {
            System.out.println("找不到候選詞");
            return null;
        }

        int n = segs.size();    // number of candidates including both markers

        // costs[i][j]: cost of stepping from candidate i to candidate j; INFINITE if not linked.
        double[][] costs = new double[n][n];
        for (int i = 0; i < n - 1; i++) {
            String endChar = segs.get(i).endChar;
            for (int j = 0; j < n; j++) {
                if (j == i && endChar.equals(segs.get(j).word)) {
                    costs[i][j] = segs.get(j).cost;
                    continue;
                }
                costs[i][j] = INFINITE;
            }
        }

        // Link each candidate to the nearby later candidates that can follow it.
        for (int i = 0; i < n - 1; i++) {
            String endChar = segs.get(i).endChar;
            for (int j = i + 1; j < n; j++) {
                String lastChar = segs.get(j).lastChar;
                if (lastChar != null && lastChar.equals(endChar) && (j - i < 4)) {
                    costs[i][j] = segs.get(j).cost;
                }
            }
        }

        int sp = 0;        // start marker index
        int fp = n - 1;    // end marker index

        // dist[i]: cheapest known cost to reach candidate i.
        // sPaths[i]: candidates on that route before i, or null while i is unreachable.
        double[] dist = new double[n];
        List<List<Integer>> sPaths = new ArrayList<List<Integer>>();
        for (int i = 0; i < n; i++) {
            dist[i] = costs[sp][i];
            sPaths.add(dist[i] < INFINITE ? new ArrayList<Integer>() : null);
        }

        // Relax outgoing links of every candidate in index order (links only go forward).
        for (int cur = sp + 1; cur < n; cur++) {
            if (dist[cur] == INFINITE || sPaths.get(cur) == null) {
                continue;   // not reachable from the start marker
            }
            for (int i = cur + 1; i < n; i++) {
                double candidate = dist[cur] + costs[cur][i];
                if (dist[i] > candidate) {
                    dist[i] = candidate;
                    List<Integer> tmp = new ArrayList<Integer>(sPaths.get(cur));
                    tmp.add(cur);
                    sPaths.set(i, tmp);   // remember the best predecessors of i
                }
            }
        }

        List<Integer> best = sPaths.get(fp);
        if (best == null || dist[fp] >= INFINITE) {
            // No chain of linked candidates reaches the end marker, e.g. because a character
            // of the sentence is not covered by any dictionary word.
            System.out.println("找不到完整的切分路徑");
            return null;
        }

        StringBuilder result = new StringBuilder();
        for (Integer idx : best) {
            result.append(segs.get(idx).word).append("/ ");
        }
        return result.toString();
    }

    /**
     * Segments {@code sentences} by collecting candidates and choosing the cheapest chain.
     *
     * @param sentences text to segment
     * @return the segmented text, or {@code null} if no complete segmentation was found
     */
    public String segment(String sentences) {
        return dynamicSegment(preSegment(sentences));
    }

    public static void main(String[] args) throws ClassNotFoundException, IOException {
        ChnSeq cs = new ChnSeq();
        cs.init();
        String sentence = "在這一年中，改革開放和現代化建設繼續向前邁進。經濟保持了“高增長、低通脹”的良好發展態勢。農業生產再次獲得好的收成，企業改革繼續深化，人民生活進一步改善。對外經濟技術合作與交流不斷擴大。";
        String segs = cs.segment(sentence);
        System.out.println(segs);
    }
}

經典的分詞方法實現（JAVA)

基於規則的自動分詞演算法

原理

正向最大匹配法

演算法描述

演算法程式碼

逆向最大匹配法

演算法描述

程式碼實現

基於統計的中文分詞演算法

基本思想

具體演算法

字典樹

字典樹的程式碼實現

演算法實現

經典的分詞方法實現（JAVA)

jieba分詞/jieba-analysis（java版）

Java中的native方法實現（Java混用C/C++）

jieba分詞的應用（java）

java分頁的實現（後臺工具類和前臺jsp頁面）

SSM框架下分頁的實現（封裝page.java和List<?>）

Lucene筆記20-Lucene的分詞-實現自定義同義詞分詞器-實現分詞器（良好設計方案）

淺談分詞演算法（4）基於字的分詞方法（CRF）

python自然語言處理（NLP）1------中文分詞1，基於規則的中文分詞方法

SSM框架下分頁的實現（封裝page.java和List）

【原創】中文分詞系統 ICTCLAS2015 的JAVA封裝和多執行緒執行（附程式碼）

經典演算法總結——揹包問題（java實現）【已完結】

django 分頁效果實現（djangorestframework內建以及django內建方法）

自然語言處理基礎（1）--基本分詞方法

中文分詞系統NLPIR（2015版）的Java介面使用學習

springboot+redis實現熱詞搜尋推薦（Java）

資料庫分詞查詢的優缺點以及英文和中文各自的分詞方法（一）

工廠方法模式（Java與Kotlin版）

棧的數組和鏈表實現（Java實現）

淺談分詞算法（1）分詞中的基本問題

經典的分詞方法實現（JAVA)

基於規則的自動分詞演算法

原理

正向最大匹配法

演算法描述

演算法程式碼

逆向最大匹配法

演算法描述

程式碼實現

基於統計的中文分詞演算法

基本思想

具體演算法

字典樹

字典樹的程式碼實現

演算法實現

相關推薦