1. 程式人生 > >IK動態詞庫及禁用內建主詞庫

IK動態詞庫及禁用內建主詞庫

/**
 * Dictionary manager, implemented as a singleton.
 *
 * <p>Holds the main dictionary, the stop-word dictionary and the quantifier dictionary, and can
 * periodically reload the extension dictionaries from disk when the configuration supplies a
 * reload schedule.
 *
 * <p>NOTE(review): the periodic reload mutates the main and stop-word dictionaries from the
 * scheduler thread while other threads may be matching against them; whether
 * {@code DictSegment} tolerates that is not visible from this class - confirm.
 */
public class Dictionary {

    /**
     * Singleton instance. Declared {@code volatile} so that the double-checked locking in
     * {@link #initial(Configuration)} publishes a fully constructed object.
     */
    private static volatile Dictionary singleton;

    /** Main dictionary (built-in main dictionary plus extension dictionaries). */
    private static DictSegment _MainDict = null;

    /** Stop-word dictionary. */
    private static DictSegment _StopWordDict = null;

    /** Quantifier dictionary. */
    private DictSegment _QuantifierDict;

    /** Last-modified timestamp recorded for every dictionary file, keyed by file path. */
    private static Map<String, Long> dicLastModified = new HashMap<String, Long>();

    /** Raw lines already loaded into the main dictionary (used to report newly added words). */
    private static Set<String> dicExtSet = new HashSet<String>(10000);

    /** Raw lines already loaded into the stop-word dictionary. */
    private static Set<String> dicStopSet = new HashSet<String>(2000);

    /** Configuration object. */
    private static Configuration cfg;

    /** Single-threaded scheduler that periodically reloads the dictionaries. */
    private static ScheduledExecutorService scheduledThreadPool = Executors.newScheduledThreadPool(1);

    /** Whether the initial dictionary load has completed. */
    private static boolean hasAdd = false;

    /**
     * Timestamp format for console output. {@code SimpleDateFormat} is not thread-safe, so every
     * use goes through {@link #formatDate(Date)}, which synchronizes on this instance.
     */
    private static final java.text.SimpleDateFormat DATE_FORMAT = new java.text.SimpleDateFormat("yyyy-MM-dd HH:mm:ss.S");

    /**
     * Initializes the dictionaries.
     *
     * <p>The dictionaries are only loaded when this class is first used, which would otherwise
     * lengthen the first segmentation call; this method lets an application load them during
     * start-up instead. If the configuration returns a non-null reload schedule, the periodic
     * reload task is started as well.
     *
     * @param cfg configuration to load the dictionaries from
     * @return the singleton instance
     */
    public static Dictionary initial(Configuration cfg) {
        if (singleton == null) {
            synchronized (Dictionary.class) {
                if (singleton == null) {
                    singleton = new Dictionary(cfg);
                    Integer[] dicUpdateMin = cfg.getDicUpdateMin();
                    if (null != dicUpdateMin) {
                        print("loadDicFixedTime", "start");
                        loadDicFixedTime(dicUpdateMin);
                    }
                }
            }
        }
        return singleton;
    }

    /**
     * Schedules the periodic reload of the main and stop-word dictionaries.
     *
     * @param dicUpdateMin
     *            {@code [0]} is the initial delay and {@code [1]} the delay between runs, both in
     *            minutes
     */
    private static void loadDicFixedTime(Integer[] dicUpdateMin) {
        scheduledThreadPool.scheduleWithFixedDelay(new Runnable() {

            @Override
            public void run() {
                // An exception escaping run() would cancel all further executions, so log instead.
                try {
                    loadMainDict();
                    loadStopWordDict();
                } catch (Exception e) {
                    print(e);
                }
            }
        }, dicUpdateMin[0], dicUpdateMin[1], TimeUnit.MINUTES);
    }

    /**
     * Stores the configuration and performs the initial load of all dictionaries.
     *
     * @param cfg configuration object
     */
    private Dictionary(Configuration cfg) {
        Dictionary.cfg = cfg;
        loadMainDict();
        loadStopWordDict();
        loadQuantifierDict();
        hasAdd = true;
    }

    /**
     * Returns the singleton instance.
     *
     * @return the singleton instance
     * @throws IllegalStateException if {@link #initial(Configuration)} has not been called yet
     */
    public static Dictionary getSingleton() {
        if (singleton == null) {
            throw new IllegalStateException("詞典尚未初始化,請先呼叫initial方法");
        }
        return singleton;
    }

    /**
     * Adds a batch of words to the in-memory main dictionary.
     *
     * @param words
     *            words to add; a {@code null} collection and {@code null} elements are ignored
     */
    public void addWords(Collection<String> words) {
        if (words == null) {
            return;
        }
        for (String word : words) {
            if (word != null) {
                _MainDict.fillSegment(word.trim().toLowerCase().toCharArray());
            }
        }
    }

    /**
     * Disables (masks) a batch of words in the main dictionary.
     *
     * @param words
     *            words to disable; a {@code null} collection and {@code null} elements are ignored
     */
    public void disableWords(Collection<String> words) {
        if (words == null) {
            return;
        }
        for (String word : words) {
            if (word != null) {
                _MainDict.disableSegment(word.trim().toLowerCase().toCharArray());
            }
        }
    }

    /**
     * Matches a character array against the main dictionary.
     *
     * @param charArray characters to match
     * @return the match result
     */
    public Hit matchInMainDict(char[] charArray) {
        return _MainDict.match(charArray);
    }

    /**
     * Matches a slice of a character array against the main dictionary.
     *
     * @param charArray characters to match
     * @param begin start offset
     * @param length number of characters
     * @return the match result
     */
    public Hit matchInMainDict(char[] charArray, int begin, int length) {
        return _MainDict.match(charArray, begin, length);
    }

    /**
     * Matches a slice of a character array against the quantifier dictionary.
     *
     * @param charArray characters to match
     * @param begin start offset
     * @param length number of characters
     * @return the match result
     */
    public Hit matchInQuantifierDict(char[] charArray, int begin, int length) {
        return _QuantifierDict.match(charArray, begin, length);
    }

    /**
     * Continues matching one character further, starting from the dictionary segment stored in a
     * previous hit.
     *
     * @param charArray characters to match
     * @param currentIndex index of the character to match next
     * @param matchedHit the previous hit
     * @return the match result
     */
    public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) {
        DictSegment ds = matchedHit.getMatchedDictSegment();
        return ds.match(charArray, currentIndex, 1, matchedHit);
    }

    /**
     * Tells whether a slice of a character array is a stop word.
     *
     * @param charArray characters to test
     * @param begin start offset
     * @param length number of characters
     * @return {@code true} if the slice matches an entry of the stop-word dictionary
     */
    public boolean isStopWord(char[] charArray, int begin, int length) {
        return _StopWordDict.match(charArray, begin, length).isMatch();
    }

    /**
     * Loads the main dictionary (first call only, unless disabled by the configuration) and then
     * every configured extension dictionary.
     */
    private static void loadMainDict() {
        if (_MainDict == null) { // first load
            _MainDict = new DictSegment((char) 0);
            String mainDictionary = cfg.getMainDictionary();
            if (!cfg.isDicDisable()) {
                loadToMain(mainDictionary, 1);
            }
        }
        List<String> extDictFiles = cfg.getExtDictionarys();
        if (null != extDictFiles && !extDictFiles.isEmpty()) {
            for (String extFile : extDictFiles) {
                loadToMain(extFile, null);
            }
        }
    }

    /**
     * Loads one dictionary file into the main dictionary.
     *
     * @param mainDictionary
     *            classpath name of the dictionary
     * @param innerDic
     *            {@code 1} for the built-in dictionary (read as a classpath stream); any other
     *            value means an extension dictionary that is read from the file system
     */
    private static void loadToMain(String mainDictionary, Integer innerDic) {
        String path = null;
        InputStream is = null;
        File file = new File("");
        if (Objects.equals(1, innerDic)) {
            is = Dictionary.class.getClassLoader().getResourceAsStream(mainDictionary);
        } else {
            path = getFilePath(mainDictionary);
            file = new File(path);
            try {
                is = new FileInputStream(file);
            } catch (FileNotFoundException e) {
                print(e);
            }
        }
        if (is == null) {
            print("loadToMain:FileNotFoundException", path);
            return;
        }
        if (isUnchanged(path, file)) {
            // Already loaded and not modified since: release the stream instead of leaking it.
            close(is);
            return;
        }
        print("loadToMain_START", mainDictionary);
        StringBuilder updateDic = new StringBuilder();
        try {
            fillFromStream(is, _MainDict, dicExtSet, updateDic);
        } catch (IOException ioe) {
            print("loadToMain exception.");
            print(ioe);
        } finally {
            dicLastModified.put(path, file.lastModified());
            if (updateDic.length() != 0) {
                print("loadToMain_END", "FileLastModified:" + formatDate(new Date(file.lastModified())), updateDic.toString());
            }
            close(is);
        }
    }

    /**
     * Resolves a classpath dictionary name to a file-system path.
     *
     * @param dictionary
     *            classpath name of the dictionary
     * @return the file-system path of the dictionary
     * @throws NullPointerException
     *             if the dictionary is not on the classpath; this deliberately aborts loading so
     *             that a wrong configuration is noticed
     */
    private static String getFilePath(String dictionary) {
        URL resource = Dictionary.class.getClassLoader().getResource(dictionary);
        if (null == resource) {
            // Tell the user which configured dictionary is wrong before failing.
            print("NullPointerException", "getFilePath", dictionary);
            throw new NullPointerException("Dictionary resource not found on classpath: " + dictionary);
        }
        if ("file".equals(resource.getProtocol())) {
            // URL.getPath() is still percent-encoded (e.g. "%20" for a space); going through the
            // URI yields the decoded path that java.io.File can actually open.
            try {
                return new File(resource.toURI()).getPath();
            } catch (java.net.URISyntaxException | IllegalArgumentException e) {
                // Not convertible (e.g. raw illegal characters in the URL): use the raw path below.
                print(e);
            }
        }
        return resource.getPath();
    }

    /**
     * Loads every configured extension stop-word dictionary into the stop-word dictionary.
     * Files that cannot be opened, and files that were already loaded and have not been modified
     * since, are skipped.
     */
    private static void loadStopWordDict() {
        if (_StopWordDict == null) {
            _StopWordDict = new DictSegment((char) 0);
        }
        List<String> extStopWordDictFiles = cfg.getExtStopWordDictionarys();
        if (extStopWordDictFiles == null) {
            return;
        }
        for (String extStopWordDictName : extStopWordDictFiles) {
            String path = getFilePath(extStopWordDictName);
            File file = new File(path);
            // Declared per iteration so a failed open can never reuse the previous file's stream.
            InputStream is;
            try {
                is = new FileInputStream(file);
            } catch (FileNotFoundException e) {
                print("loadStopWordDict:FileNotFoundException", path);
                print(e);
                continue; // ignore extension dictionaries that cannot be found
            }
            if (isUnchanged(path, file)) {
                close(is);
                continue;
            }
            print("loadStopWordDict_START", extStopWordDictName);
            StringBuilder updateDic = new StringBuilder();
            try {
                fillFromStream(is, _StopWordDict, dicStopSet, updateDic);
            } catch (IOException ioe) {
                print("loadStopWordDict exception.");
                print(ioe);
            } finally {
                dicLastModified.put(path, file.lastModified());
                if (updateDic.length() != 0) {
                    print("loadStopWordDict_END", "FileLastModified:" + formatDate(new Date(file.lastModified())), updateDic.toString());
                }
                close(is);
            }
        }
    }

    /**
     * Loads the quantifier dictionary from the classpath.
     *
     * @throws RuntimeException if the quantifier dictionary resource cannot be found
     */
    private void loadQuantifierDict() {
        _QuantifierDict = new DictSegment((char) 0);
        InputStream is = Dictionary.class.getClassLoader().getResourceAsStream(cfg.getQuantifierDicionary());
        if (is == null) {
            throw new RuntimeException("Quantifier Dictionary not found!!!");
        }
        try {
            fillFromStream(is, _QuantifierDict, null, null);
        } catch (IOException ioe) {
            print("Quantifier Dictionary loading exception.");
            print(ioe);
        } finally {
            close(is);
        }
    }

    /**
     * Tells whether a dictionary file was already loaded and has not been modified since.
     *
     * @param path key under which the file's timestamp is recorded
     * @param file the dictionary file
     * @return {@code true} if the initial load is done and the file's last-modified time is not
     *         newer than the recorded one
     */
    private static boolean isUnchanged(String path, File file) {
        Long recorded = dicLastModified.get(path);
        return hasAdd && recorded != null && file.lastModified() <= recorded;
    }

    /**
     * Reads a UTF-8 stream line by line and adds every non-blank line (trimmed and lower-cased)
     * to the target dictionary. The stream is closed when reading ends.
     *
     * @param is stream to read
     * @param target dictionary segment receiving the words
     * @param seenWords set of raw lines loaded so far, updated with new lines; may be {@code null}
     * @param newWords receives {@code line + ";"} for each line not yet in {@code seenWords}, but
     *            only once the initial load has completed; may be {@code null}
     * @throws IOException if reading fails
     */
    private static void fillFromStream(InputStream is, DictSegment target, Set<String> seenWords, StringBuilder newWords)
            throws IOException {
        BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
        try {
            String line;
            while ((line = br.readLine()) != null) {
                if ("".equals(line.trim())) {
                    continue;
                }
                if (seenWords != null && seenWords.add(line) && hasAdd && newWords != null) {
                    newWords.append(line).append(";");
                }
                target.fillSegment(line.trim().toLowerCase().toCharArray());
            }
        } finally {
            close(br);
        }
    }

    /**
     * Closes several resources, logging (not propagating) any failure.
     *
     * @param closeables
     *            resources to close; {@code null} entries are skipped
     */
    private static void close(AutoCloseable... closeables) {
        if (null == closeables) {
            return;
        }
        for (AutoCloseable autoCloseable : closeables) {
            if (null != autoCloseable) {
                try {
                    autoCloseable.close();
                } catch (Exception e) {
                    print(e);
                }
            }
        }
    }

    /**
     * Formats a date with the shared formatter while holding its lock.
     *
     * @param date date to format
     * @return the formatted timestamp
     */
    private static String formatDate(Date date) {
        synchronized (DATE_FORMAT) {
            return DATE_FORMAT.format(date);
        }
    }

    /**
     * Prints a timestamped line to the console, each argument wrapped in square brackets.
     *
     * @param param
     *            values to print
     */
    public static void print(String... param) {
        StringBuilder builder = new StringBuilder();
        builder.append("[").append(formatDate(new Date())).append("]");
        for (String str : param) {
            builder.append("[").append(str).append("]");
        }
        System.out.println(builder.toString());
    }

    /**
     * Prints a timestamped exception message to the console, followed by the stack trace.
     *
     * @param e
     *            exception to report
     */
    public static void print(Exception e) {
        StringBuilder builder = new StringBuilder();
        builder.append("[").append(formatDate(new Date())).append("]").append(e.getMessage());
        System.out.println(builder.toString());
        e.printStackTrace();
    }
}

專案完整原始碼:https://github.com/zxiaofan/ik-analyzer-solr6