IK動態詞庫及禁用內建主詞庫
阿新 • • 發佈:2019-01-05
/** * 詞典管理類,單子模式 */ public class Dictionary { /* * 詞典單子例項 */ private static Dictionary singleton; /* * 主詞典物件 */ private static DictSegment _MainDict = null; /* * 停止詞詞典 */ private static DictSegment _StopWordDict = null; /* * 量詞詞典 */ private DictSegment _QuantifierDict; /** * 詞典上傳修改時間. */ private static Map<String, Long> dicLastModified = new HashMap<String, Long>(); /** * 擴充套件詞. */ private static Set<String> dicExtSet = new HashSet<String>(10000); /** * 停用詞. */ private static Set<String> dicStopSet = new HashSet<String>(2000); /** * 配置物件 */ private static Configuration cfg; /** * 執行緒池定時載入詞典. */ private static ScheduledExecutorService scheduledThreadPool = Executors.newScheduledThreadPool(1); /** * 是否已載入過詞典. */ private static boolean hasAdd = false; /** * SimpleDateFormat(程式邏輯不存在併發,不考慮執行緒不安全情況). */ private final static java.text.SimpleDateFormat DATE_FORMAT = new java.text.SimpleDateFormat("yyyy-MM-dd HH:mm:ss.S"); /** * 詞典初始化 * * 由於IK Analyzer的詞典採用Dictionary類的靜態方法進行詞典初始化 * * 只有當Dictionary類被實際呼叫時,才會開始載入詞典, 這將延長首次分詞操作的時間, * * 該方法提供了一個在應用載入階段就初始化字典的手段 * * @return Dictionary */ public static Dictionary initial(Configuration cfg) { if (singleton == null) { synchronized (Dictionary.class) { if (singleton == null) { singleton = new Dictionary(cfg); Integer[] dicUpdateMin = cfg.getDicUpdateMin(); if (null != dicUpdateMin) { print("loadDicFixedTime", "start"); loadDicFixedTime(dicUpdateMin); } return singleton; } } } return singleton; } /** * 定期載入配置檔案. * * @param dicUpdateMin * 載入間隔 */ private static void loadDicFixedTime(Integer[] dicUpdateMin) { scheduledThreadPool.scheduleWithFixedDelay(new Runnable() { public void run() { try { loadMainDict(); loadStopWordDict(); } catch (Exception e) { print(e); } } }, dicUpdateMin[0], dicUpdateMin[1], TimeUnit.MINUTES); } private Dictionary(Configuration cfg) { this.cfg = cfg; this.loadMainDict(); this.loadStopWordDict(); this.loadQuantifierDict(); hasAdd = true; } /** * 獲取詞典單子例項 * * @return Dictionary 單例物件 */ public static Dictionary getSingleton() { if (singleton == null) { throw new IllegalStateException("詞典尚未初始化,請先呼叫initial方法"); } return singleton; } /** * 批量載入新詞條 * * @param words * Collection<String>詞條列表 */ public void addWords(Collection<String> words) { if (words != null) { for (String word : words) { if (word != null) { // 批量載入詞條到主記憶體詞典中 singleton._MainDict.fillSegment(word.trim().toLowerCase().toCharArray()); } } } } /** * 批量移除(遮蔽)詞條 * * @param words */ public void disableWords(Collection<String> words) { if (words != null) { for (String word : words) { if (word != null) { // 批量遮蔽詞條 singleton._MainDict.disableSegment(word.trim().toLowerCase().toCharArray()); } } } } /** * 檢索匹配主詞典 * * @param charArray * @return Hit 匹配結果描述 */ public Hit matchInMainDict(char[] charArray) { return singleton._MainDict.match(charArray); } /** * 檢索匹配主詞典 * * @param charArray * @param begin * @param length * @return Hit 匹配結果描述 */ public Hit matchInMainDict(char[] charArray, int begin, int length) { return singleton._MainDict.match(charArray, begin, length); } /** * 檢索匹配量詞詞典 * * @param charArray * @param begin * @param length * @return Hit 匹配結果描述 */ public Hit matchInQuantifierDict(char[] charArray, int begin, int length) { return singleton._QuantifierDict.match(charArray, begin, length); } /** * 從已匹配的Hit中直接取出DictSegment,繼續向下匹配 * * @param charArray * @param currentIndex * @param matchedHit * @return Hit */ public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) { DictSegment ds = matchedHit.getMatchedDictSegment(); return ds.match(charArray, currentIndex, 1, matchedHit); } /** * 判斷是否是停止詞 * * @param charArray * @param begin * @param length * @return boolean */ public boolean isStopWord(char[] charArray, int begin, int length) { return singleton._StopWordDict.match(charArray, begin, length).isMatch(); } /** * 載入主詞典及擴充套件詞典 */ private static void loadMainDict() { // 建立一個主詞典例項 if (_MainDict == null) { // 首次載入 _MainDict = new DictSegment((char) 0); String mainDictionary = cfg.getMainDictionary(); // 讀取主詞典檔案 if (!cfg.isDicDisable()) { loadToMain(mainDictionary, 1); } } // 載入擴充套件詞典 List<String> extDictFiles = cfg.getExtDictionarys(); if (null != extDictFiles && !extDictFiles.isEmpty()) { for (String extFile : extDictFiles) { loadToMain(extFile, null); } } } /** * 將檔案載入到主庫. * * @param mainDictionary * mainDictionary * @param innerDic * 是否是內建詞典(1是) */ private static void loadToMain(String mainDictionary, Integer innerDic) { String path = null; InputStream is = null; File file = new File(""); if (Objects.equals(1, innerDic)) { is = Dictionary.class.getClassLoader().getResourceAsStream(mainDictionary); } else { path = getFilePath(mainDictionary); file = new File(path); try { is = new FileInputStream(file); } catch (FileNotFoundException e) { print(e); } } if (is == null) { print("loadToMain:FileNotFoundException", path); // throw new RuntimeException("Main Dictionary not found!!!"); return; } if (hasAdd && dicLastModified.containsKey(path) && file.lastModified() <= dicLastModified.get(path)) { return; // 非首次載入或詞典未修改 } print("loadToMain_START", mainDictionary); BufferedReader br = null; InputStreamReader inputStreamReader = null; StringBuilder updateDic = new StringBuilder(); try { inputStreamReader = new InputStreamReader(is, "UTF-8"); br = new BufferedReader(inputStreamReader, 512); String theWord = null; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { if (!dicExtSet.contains(theWord)) { dicExtSet.add(theWord); if (hasAdd) { updateDic.append(theWord).append(";"); } } _MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray()); } } while (theWord != null); } catch (IOException ioe) { print("loadToMain exception."); print(ioe); } finally { dicLastModified.put(path, file.lastModified()); if (updateDic.length() != 0) { print("loadToMain_END", "FileLastModified:" + DATE_FORMAT.format(new Date(file.lastModified())), updateDic.toString()); } close(is, inputStreamReader, br); } } /** * 獲取字典檔案實際路徑. * * @param dictionary * 字典名 * @return 字典路徑 */ private static String getFilePath(String dictionary) { URL resource = Dictionary.class.getClassLoader().getResource(dictionary); if (null == resource) { print("NullPointerException", "getFilePath", dictionary); // 提示使用者配置詞庫有誤,方便使用者定位異常 } return resource.getPath(); // 丟擲異常,終止IK } /** * 載入使用者擴充套件的停止詞詞典 */ private static void loadStopWordDict() { // 建立一個主詞典例項 if (_StopWordDict == null) { _StopWordDict = new DictSegment((char) 0); } // 載入擴充套件停止詞典 List<String> extStopWordDictFiles = cfg.getExtStopWordDictionarys(); if (extStopWordDictFiles != null) { InputStream is = null; for (String extStopWordDictName : extStopWordDictFiles) { // 讀取擴充套件詞典檔案 // is = Dictionary.class.getClassLoader().getResourceAsStream(extStopWordDictName); String path = getFilePath(extStopWordDictName); File file = new File(path); try { is = new FileInputStream(file); } catch (FileNotFoundException e) { print("loadStopWordDict:FileNotFoundException", path); print(e); } finally { close(is); } // 如果找不到擴充套件的字典,則忽略 if (is == null) { continue; } if (hasAdd && dicLastModified.containsKey(path) && file.lastModified() <= dicLastModified.get(path)) { continue; // 非首次載入或詞典未修改 } print("loadStopWordDict_START", extStopWordDictName); BufferedReader br = null; InputStreamReader inputStreamReader = null; StringBuilder updateDic = new StringBuilder(); try { inputStreamReader = new InputStreamReader(is, "UTF-8"); br = new BufferedReader(inputStreamReader, 512); String theWord = null; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { // System.out.println(theWord); // 載入擴充套件停止詞典資料到記憶體中 _StopWordDict.fillSegment(theWord.trim().toLowerCase().toCharArray()); if (!dicStopSet.contains(theWord)) { dicStopSet.add(theWord); if (hasAdd) { updateDic.append(theWord).append(";"); } } } } while (theWord != null); } catch (IOException ioe) { print("loadStopWordDict exception."); print(ioe); } finally { dicLastModified.put(path, file.lastModified()); if (updateDic.length() != 0) { print("loadStopWordDict_END", "FileLastModified:" + DATE_FORMAT.format(new Date(file.lastModified())), updateDic.toString()); } close(is, inputStreamReader, br); } } } } /** * 載入量詞詞典 */ private void loadQuantifierDict() { // 建立一個量詞典例項 _QuantifierDict = new DictSegment((char) 0); // 讀取量詞詞典檔案 InputStream is = this.getClass().getClassLoader().getResourceAsStream(cfg.getQuantifierDicionary()); if (is == null) { throw new RuntimeException("Quantifier Dictionary not found!!!"); } BufferedReader br = null; InputStreamReader inputStreamReader = null; try { inputStreamReader = new InputStreamReader(is, "UTF-8"); br = new BufferedReader(inputStreamReader, 512); String theWord = null; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { _QuantifierDict.fillSegment(theWord.trim().toLowerCase().toCharArray()); } } while (theWord != null); } catch (IOException ioe) { print("Quantifier Dictionary loading exception."); print(ioe); } finally { close(is, inputStreamReader, br); } } /** * 批量關閉檔案流. * * @param closeables * 檔案流集合 */ private static void close(AutoCloseable... closeables) { if (null != closeables && closeables.length > 0) { for (AutoCloseable autoCloseable : closeables) { if (null != autoCloseable) { try { autoCloseable.close(); } catch (Exception e) { print(e); } } } } } /** * 控制檯列印. * * @param param * 引數 */ public static void print(String... param) { StringBuilder builder = new StringBuilder(); builder.append("[").append(DATE_FORMAT.format(new Date())).append("]"); for (String str : param) { builder.append("[").append(str).append("]"); } System.out.println(builder.toString()); } /** * 控制檯列印. * * @param e * 異常資訊 */ public static void print(Exception e) { StringBuilder builder = new StringBuilder(); builder.append("[").append(DATE_FORMAT.format(new Date())).append("]").append(e.getMessage()); System.out.println(builder.toString()); e.printStackTrace(); } }
專案完整原始碼:https://github.com/zxiaofan/ik-analyzer-solr6