lingpipe: 文字分詞識別例子
阿新 • • 發佈:2019-01-08
1)什麼是lingpipe?
詳細見百度,簡而言之是自然語言處理軟體包(Natural Language Processing,NLP)。
lingpipe主要包含以下模組:
主題分類(Topic Classification)
命名實體識別(Named Entity Recognition,NER)(什麼是NER?繼續百度。。。簡而言之是人名、地名、機構名等文字識別)
詞性標註(Part-of-Speech Tagging)
句子檢測(Sentence Detection)
查詢拼寫檢查(Query Spell Checking)
興趣短語檢測(Interesting Phrase Detection)
聚類(Clustering)
字元語言建模(Character Language Modeling)
醫學文獻下載/解析/索引(MEDLINE Download, Parsing and Indexing)
資料庫文字挖掘(Database Text Mining)
中文分詞(Chinese Word Segmentation)
情感分析(Sentiment Analysis)
語言辨別(Language Identification)
Reference
lingpipe官方文件:http://alias-i.com/lingpipe/demos/tutorial/cluster/read-me.html
背後NLP模型:http://nlp.stanford.edu/software/corenlp.shtml
2)我做了個分詞的例子作為參考(應用到:命名實體識別、句子檢測,用到 lingpipe-4.1.0.jar),e.g.
import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; import com.aliasi.chunk.CharLmHmmChunker; import com.aliasi.chunk.Chunk; import com.aliasi.chunk.Chunker; import com.aliasi.chunk.Chunking; import com.aliasi.corpus.Parser; import com.aliasi.dict.DictionaryEntry; import com.aliasi.dict.MapDictionary; import com.aliasi.dict.ExactDictionaryChunker; import com.aliasi.hmm.HmmCharLmEstimator; import com.aliasi.sentences.IndoEuropeanSentenceModel; import com.aliasi.sentences.SentenceModel; import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory; import com.aliasi.tokenizer.Tokenizer; import com.aliasi.tokenizer.TokenizerFactory; import com.aliasi.util.AbstractExternalizable; public class TextAnalyzer { static final double CHUNK_SCORE = 1.0; static final TokenizerFactory TOKENIZER_FACTORY = IndoEuropeanTokenizerFactory.INSTANCE; static final SentenceModel SENTENCE_MODEL = new IndoEuropeanSentenceModel(); public static void main(String[] args) { // testChunkSentences(); // testChunkDictionary(); test(); } private static void test() { } // Sentences - Sentences Chunking(分句) private static void testChunkSentences() { String text = "50 Cent XYZ120 DVD Player 50 Cent lawyer. 
Person is john, he is a lawyer."; List<String> result = new ArrayList<String>(); List<String> tokenList = new ArrayList<String>(); List<String> whiteList = new ArrayList<String>(); Tokenizer tokenizer = TOKENIZER_FACTORY.tokenizer(text.toCharArray(), 0, text.length()); tokenizer.tokenize(tokenList, whiteList); String[] tokens = new String[tokenList.size()]; String[] whites = new String[whiteList.size()]; tokenList.toArray(tokens); whiteList.toArray(whites); int[] sentenceBoundaries = SENTENCE_MODEL.boundaryIndices(tokens, whites); int sentStartTok = 0; int sentEndTok = 0; for (int i = 0; i < sentenceBoundaries.length; ++i) { System.out.println("Sentense " + (i + 1) + ", sentense's length(from 0):" + (sentenceBoundaries[i])); StringBuilder sb = new StringBuilder(); sentEndTok = sentenceBoundaries[i]; for (int j = sentStartTok; j <= sentEndTok; j++) { sb.append(tokens[j]).append(whites[j + 1]); } sentStartTok = sentEndTok + 1; result.add(sb.toString()); } System.out.println("Final result:" + result); } // NER(named entity recognition) - Exact Dictionary-Based Chunking(分詞) private static void testChunkDictionary() { String[] args1 = {"50 Cent XYZ120 DVD Player 50 Cent lawyer.", "person is john, he is a lawyer."}; MapDictionary<String> dictionary = new MapDictionary<String>(); dictionary.addEntry(new DictionaryEntry<String>("50 Cent","PERSON",CHUNK_SCORE)); dictionary.addEntry(new DictionaryEntry<String>("XYZ120 DVD Player","DB_ID_1232",CHUNK_SCORE)); dictionary.addEntry(new DictionaryEntry<String>("cent","MONETARY_UNIT",CHUNK_SCORE)); dictionary.addEntry(new DictionaryEntry<String>("dvd player","PRODUCT",CHUNK_SCORE)); ExactDictionaryChunker dictionaryChunkerTT = new ExactDictionaryChunker(dictionary, IndoEuropeanTokenizerFactory.INSTANCE, true,true); ExactDictionaryChunker dictionaryChunkerTF = new ExactDictionaryChunker(dictionary, IndoEuropeanTokenizerFactory.INSTANCE, true,false); // returnAllMatches is false means bypassing the matched text from further matching 
process ExactDictionaryChunker dictionaryChunkerFT = new ExactDictionaryChunker(dictionary, IndoEuropeanTokenizerFactory.INSTANCE, false,true); ExactDictionaryChunker dictionaryChunkerFF = new ExactDictionaryChunker(dictionary, IndoEuropeanTokenizerFactory.INSTANCE, false,false); System.out.println("\nDICTIONARY\n" + dictionary); for (int i = 0; i < args1.length; ++i) { String text = args1[i]; System.out.println("\n\nTEXT=" + text); chunk(dictionaryChunkerTT,text); chunk(dictionaryChunkerTF,text); chunk(dictionaryChunkerFT,text); chunk(dictionaryChunkerFF,text); } } static void chunk(ExactDictionaryChunker chunker, String text) { System.out.println("\nChunker." + " All matches=" + chunker.returnAllMatches() + " Case sensitive=" + chunker.caseSensitive()); Chunking chunking = chunker.chunk(text); for (Chunk chunk : chunking.chunkSet()) { int start = chunk.start(); int end = chunk.end(); String type = chunk.type(); double score = chunk.score(); String phrase = text.substring(start,end); System.out.println(" phrase=|" + phrase + "|" + " start=" + start + " end=" + end + " type=" + type + " score=" + score); } } }