利用庖丁解牛工具進行中文分詞
阿新 • 發佈:2019-02-11
關於如何使用庖丁解牛分詞工具可參考:http://www.letiantian.me/2014-11-26-word-segmentation-paoding-analysis/
該工具可實現自定義詞典,對於有些特殊的詞,比如明星名字林心如霍建華等,可構建詞典以.dic為字尾,放入paoding-analysis-2.0.4-beta\dic目錄下,然後新建Java工程即可~
對於一些停用詞,也可以處理~
主要程式碼如下:
</pre><pre class="java" name="code">import java.io.IOException; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.util.*; public class fenciMain3 { //停用詞詞表 public static final String stopWordTable = "." + File.separator + "srcFile" + File.separator + "StopWordTable_all.txt"; public static void main(String[] args) throws IOException { String srcFile = "." + File.separator + "srcFile" + File.separator + "user_tag_query.txt"; //String srcFile = "." + File.separator + "srcFile" + File.separator + "test.txt"; String destFile = "." + File.separator + "destFile" + File.separator + "fileExcludeStopWord2.0.txt"; //String destFile = "." + File.separator + "destFile" + File.separator + "output.txt"; new fenciMain3().fenciMain3(srcFile, destFile); System.out.println("OVER DONE!!!!!!!!!!"); // String text = "秋刀魚的滋味,貓跟你都想了解"; //System.out.println(pd.fenci01(text)); } public void fenciMain3(String srcFile, String destFile){ try { //讀取原檔案和停用詞表 BufferedReader srcFileBr = new BufferedReader(new InputStreamReader(new FileInputStream(new File(srcFile)))); BufferedReader StopWordFileBr = new BufferedReader(new InputStreamReader(new FileInputStream(stopWordTable), "UTF-8")); //將分詞好的文字資訊存入輸出檔案 BufferedWriter destFileBw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(destFile)))); //用來存放停用詞的集合 Set<String> stopWordSet = new HashSet<String>(); //初始化停用詞集 String stopWord = null; for(; (stopWord = StopWordFileBr.readLine()) != null;){ // System.out.println(stopWord); stopWordSet.add(stopWord); } String paragraph = null; paodingfenci pd = new paodingfenci(); for(; (paragraph = srcFileBr.readLine()) != null; ){ //對讀入的文字進行分詞 //顯示結果 //System.out.println(pd.fenci01(paragraph)); String spiltResultStr = pd.fenci01(paragraph); String[] resultArray = 
spiltResultStr.split(" "); //過濾停用詞 for(int i = 4; i< resultArray.length; i++){ if(stopWordSet.contains(resultArray[i])){ resultArray[i] = null; } else{ for(int j = resultArray[i].length(); --j >= 0;){ char c = resultArray[i].charAt(j); if(Character.isDigit(c)){ resultArray[i] = null; break; } if((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')){ resultArray[i] = null; break; } } } } //把過濾後的字串陣列存入到一個字串中 StringBuffer finalStr = new StringBuffer(); for(int i = 0; i< resultArray.length; i++){ if(resultArray[i] != null){ finalStr = finalStr.append(resultArray[i]).append(" "); } } //輸出結果到指定檔案 destFileBw.write(finalStr.toString()); destFileBw.newLine(); } //關閉輸入流 destFileBw.close(); srcFileBr.close(); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch(Exception e){ e.printStackTrace(); } } }