基於ansj_seg和nlp-lang的簡單nlp工具類
阿新 • • 發佈:2018-12-30
1、首先在pom中引入ansj_seg和nlp-lang的依賴包,
ansj_seg包的作用:
這是一個基於n-Gram+CRF+HMM的中文分詞的java實現;
分詞速度達到每秒鐘大約200萬字左右(mac air下測試),準確率能達到96%以上;
目前實現了.中文分詞. 中文姓名識別 . 使用者自定義詞典,關鍵字提取,自動摘要,關鍵字標記等功能;
可以應用到自然語言處理等方面,適用於對分詞效果要求高的各種專案;
nlp-lang包的作用(nlp常用工具和元件):
工具:詞語標準化、trie樹結構、雙陣列trie樹、文字斷句、html標籤清理、Viterbi演算法增加;
元件:漢字轉拼音、簡繁體轉換、bloomfilter、指紋去重、SimHash文章相似度計算、詞貢獻統計、基於記憶體的搜尋提示、WordWeight詞頻統計,詞idf統計,詞類別相關度統計;
Maven:
<!-- nlp-lang --> <dependency> <groupId>org.nlpcn</groupId> <artifactId>nlp-lang</artifactId> <version>1.7.2</version> </dependency> <!-- ansj_seg --> <dependency> <groupId>org.ansj</groupId> <artifactId>ansj_seg</artifactId> <version>5.1.2</version> </dependency>
2、建立WordUtil類,如下:
package com.mengyao.nlp.util;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.ansj.app.keyword.KeyWordComputer;
import org.ansj.app.keyword.Keyword;
import org.ansj.app.summary.SummaryComputer;
import org.ansj.app.summary.pojo.Summary;
import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.splitWord.analysis.IndexAnalysis;
import org.ansj.splitWord.analysis.NlpAnalysis;
import org.ansj.splitWord.analysis.ToAnalysis;
import org.apache.commons.lang3.StringUtils;
import org.nlpcn.commons.lang.jianfan.JianFan;
import org.nlpcn.commons.lang.pinyin.Pinyin;
import org.nlpcn.commons.lang.util.WordAlert;
import org.nlpcn.commons.lang.util.WordWeight;

/**
 * Simple NLP utility backed by ansj_seg (segmentation, keyword extraction,
 * summarization) and nlp-lang (pinyin conversion, simplified/traditional
 * Chinese conversion, word-frequency weighting).
 *
 * <p>All methods are static and stateless; thread-safety ultimately depends
 * on the underlying ansj_seg / nlp-lang implementations.
 *
 * @author mengyao
 */
public class WordUtil {

    /** Number of keywords requested from {@link KeyWordComputer}. */
    private static final int KEYWORD_LIMIT = 20;

    /** Utility class: prevent instantiation. */
    private WordUtil() {
    }

    public static void main(String[] args) {
        // Quick regex sanity checks for date formats (yyyy/MM/dd with a
        // consistent separator via back-reference \1, and plain yyyyMMdd).
        System.out.println("2016/06/25".matches("^\\d{4}(\\-|\\/|\\.)\\d{1,2}\\1\\d{1,2}$"));
        System.out.println("20160625".matches("^\\d{8}$"));
    }

    /**
     * Produces an automatic summary of an article.
     *
     * @param title   article title
     * @param content article body
     * @return the generated summary text
     */
    public static String getSummary(String title, String content) {
        SummaryComputer summaryComputer = new SummaryComputer(title, content);
        Summary summary = summaryComputer.toSummary();
        return summary.getSummary();
    }

    /**
     * Extracts keywords from an article that has a title.
     *
     * @param title   article title
     * @param content article body
     * @return up to {@value #KEYWORD_LIMIT} keywords ranked by TF-IDF
     */
    public static List<Keyword> getKeyWord(String title, String content) {
        KeyWordComputer<NlpAnalysis> kwc = new KeyWordComputer<NlpAnalysis>(KEYWORD_LIMIT);
        Collection<Keyword> result = kwc.computeArticleTfidf(title, content);
        // Copy-constructor replaces the original element-by-element loop.
        return new ArrayList<Keyword>(result);
    }

    /**
     * Extracts keywords from an article without a title.
     *
     * @param content article body
     * @return up to {@value #KEYWORD_LIMIT} keywords ranked by TF-IDF
     */
    public static List<Keyword> getKeyWord2(String content) {
        KeyWordComputer<NlpAnalysis> kwc = new KeyWordComputer<NlpAnalysis>(KEYWORD_LIMIT);
        return new ArrayList<Keyword>(kwc.computeArticleTfidf(content));
    }

    /**
     * Standard segmentation ({@link ToAnalysis}).
     *
     * @param text input text
     * @return terms whose surface form is non-null and non-blank
     */
    public static List<Term> getToSeg(String text) {
        return collectNonBlankTerms(ToAnalysis.parse(text));
    }

    /**
     * NLP segmentation ({@link NlpAnalysis}); slower but more accurate.
     *
     * @param text input text
     * @return terms whose surface form is non-null and non-blank
     */
    public static List<Term> getNlpSeg(String text) {
        return collectNonBlankTerms(NlpAnalysis.parse(text));
    }

    /**
     * Index-oriented segmentation ({@link IndexAnalysis}); favors recall.
     *
     * @param text input text
     * @return terms whose surface form is non-null and non-blank
     */
    public static List<Term> getIndexSeg(String text) {
        return collectNonBlankTerms(IndexAnalysis.parse(text));
    }

    /**
     * Converts simplified Chinese to traditional Chinese.
     *
     * @param text simplified-Chinese text
     * @return traditional-Chinese text
     */
    public static String jian2fan(String text) {
        return JianFan.j2f(text);
    }

    /**
     * Converts traditional Chinese to simplified Chinese.
     *
     * @param text traditional-Chinese text
     * @return simplified-Chinese text
     */
    public static String fan2jian(String text) {
        return JianFan.f2j(text);
    }

    /**
     * Pinyin without tone marks; syllables are space-separated (the result
     * keeps a trailing space, preserved for backward compatibility).
     *
     * @param text Chinese text
     * @return space-separated pinyin
     */
    public static String pinyin(String text) {
        return joinWithTrailingSpace(Pinyin.pinyin(text));
    }

    /**
     * Pinyin without tone marks, each syllable capitalized and concatenated
     * with no separator (e.g. suitable for building identifiers).
     *
     * @param text Chinese text
     * @return concatenated capitalized pinyin
     */
    public static String pinyinUp(String text) {
        StringBuilder builder = new StringBuilder();
        for (String pinyin : Pinyin.pinyin(text)) {
            if (StringUtils.isEmpty(pinyin)) {
                continue;
            }
            builder.append(pinyin.substring(0, 1).toUpperCase()).append(pinyin.substring(1));
        }
        return builder.toString();
    }

    /**
     * Pinyin with numeric tone suffixes; space-separated (trailing space
     * preserved for backward compatibility).
     *
     * @param text Chinese text
     * @return space-separated tone-numbered pinyin
     */
    public static String tonePinyin(String text) {
        return joinWithTrailingSpace(Pinyin.tonePinyin(text));
    }

    /**
     * Pinyin with diacritic tone marks; space-separated (trailing space
     * preserved for backward compatibility).
     *
     * @param text Chinese text
     * @return space-separated accented pinyin
     */
    public static String unicodePinyin(String text) {
        return joinWithTrailingSpace(Pinyin.unicodePinyin(text));
    }

    /**
     * Word-frequency weighting via {@link WordWeight}.
     *
     * @param words words to count
     * @return word → weight map as exported by WordWeight
     */
    public static Map<String, Double> wordCount(List<String> words) {
        WordWeight ww = new WordWeight();
        for (String word : words) {
            ww.add(word);
        }
        return ww.export();
    }

    /**
     * Word-frequency weighting rendered as {@code "word:weight"} strings.
     *
     * @param words words to count
     * @return one "word:weight" entry per distinct word
     */
    public static List<String> wordCount1(List<String> words) {
        List<String> wcs = new ArrayList<String>();
        // Delegate to wordCount instead of duplicating the accumulation loop.
        for (Entry<String, Double> entry : wordCount(words).entrySet()) {
            wcs.add(entry.getKey() + ":" + entry.getValue());
        }
        return wcs;
    }

    /**
     * Crude language detection: 1 for English, 0 otherwise (treated as
     * Chinese by callers).
     *
     * @param word word to classify
     * @return 1 if {@link WordAlert#isEnglish} reports English, else 0
     */
    public static int language(String word) {
        return WordAlert.isEnglish(word) ? 1 : 0;
    }

    /**
     * Collects terms from a parse result whose name is non-null and not
     * blank after trimming. Shared by the three segmentation methods.
     */
    private static List<Term> collectNonBlankTerms(Result parse) {
        List<Term> words = new ArrayList<Term>();
        for (Term term : parse) {
            String name = term.getName();
            if (name != null && !name.trim().isEmpty()) {
                words.add(term);
            }
        }
        return words;
    }

    /**
     * Joins non-null entries, appending a single space after each one —
     * matching the original output format exactly (trailing space included).
     */
    private static String joinWithTrailingSpace(List<String> pinyins) {
        StringBuilder builder = new StringBuilder();
        for (String pinyin : pinyins) {
            if (pinyin != null) {
                builder.append(pinyin).append(' ');
            }
        }
        return builder.toString();
    }
}