Java 英文拼寫檢查並自動糾正
阿新 • • 發佈:2018-12-16
SpellCorrect原理:https://www.cnblogs.com/jianxinzhou/p/4740392.html
專案原始碼:
package com.xq.algorithm; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; public class SpellCorrect { public static final char[] c = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'}; static Map<String,Integer> trainMap=train(); public static void editDistance1Test(String word){ Set<String> set =editDistance1(word); for (String s : set) { System.out.println(s); } System.out.print(set.size()); } public static String correct(String word){ Set<String> set=new HashSet<String>(); String str=known(word, trainMap); if(!"".equals(str)){ return str; }else{ set.add(word); } set.addAll(known(editDistance1(word), trainMap)); set.addAll(editDistance2(word, trainMap)); //set.add(word); Map<String,Integer> wordsMap=new HashMap<String,Integer>(); for(String s: set){ wordsMap.put(s, trainMap.get(s)==null ? 0: trainMap.get(s)); } List<Map.Entry<String, Integer>> info = new ArrayList<Map.Entry<String, Integer>>(wordsMap.entrySet()); Collections.sort(info, new Comparator<Map.Entry<String, Integer>>() { public int compare(Map.Entry<String, Integer> obj1, Map.Entry<String, Integer> obj2) { return obj2.getValue() - obj1.getValue(); } }); //語料庫中沒有該單詞,則返回該單詞本身 return info.get(0).getValue()>0 ? 
info.get(0).getKey() : word; } /** * * @Title: words * @Description: 讀取語料庫檔案 * @param @return * @param @throws IOException * @return Map<String,Integer> * @throws */ public static Map<String,Integer> train(){ try { // String cnt = util.MyFileTool.readFile(util.Directory.GetAppPath("data") + "big.txt"); // System.out.println(cnt); } catch (Exception e1) { e1.printStackTrace(); } // InputStream is; // try { // is = new FileInputStream(new File(util.Directory.GetAppPath("data") + "big.txt")); // } catch (FileNotFoundException e1) { // System.out.println("error"); // e1.printStackTrace(); // } // InputStream is = new SpellCorrect().getClass().getClassLoader().getResourceAsStream(util.Directory.GetAppPath("data") + "big.txt"); // if(is == null){ // throw new RuntimeException("big.txt not found!!!"); // } Map<String,Integer> map = new HashMap<String,Integer>(); BufferedReader br = null; try { //讀取語料庫big.txt // BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512); br = util.MyFileTool.GetBufferReader(util.Directory.GetAppPath("data") + "big.txt"); String s=""; while ((s = br.readLine()) != null) { // 去掉文件中除字母外的所有符號 s = s.replaceAll("\\pP|\\pS|\\pM|\\pN|\\pC", ""); // 將文件轉成小寫,然後切分成單詞,存在list中 s = s.toLowerCase(); String[] splits = s.split(" "); for (int j = 0; j < splits.length; j++) { if (!" 
".equals(splits[j]) && !"".equals(splits[j]) && !splits[j].equals(null)){ if(map.containsKey(splits[j])){ Integer count=map.get(splits[j]); map.put(splits[j], count+1); }else{ map.put(splits[j], 1); } } } } } catch (FileNotFoundException e) { e.printStackTrace(); }catch(IOException e){ e.printStackTrace(); } finally{ try{ br.close(); }catch(Exception e){ e.printStackTrace(); } } return map; } /** * * @Title: editDistance2 * @Description: 編輯距離為2的集合.通過editDistance1函式得到編輯距離為1的集合,該集合單詞再通過editDistance1函式,就可以得到編輯距離為2的集合 * @param @param set * @param @param trainMap * @param @return * @return Set<String> * @throws */ public static Set<String> editDistance2(Set<String> set,HashMap<String,Integer> trainMap){ Set<String> editDistance2Set=new HashSet<String>(); Set<String> tempSet=new HashSet<String>(); Set<String> tmpSet=new HashSet<String>(); for(String s: set){ tempSet.addAll(editDistance1(s)); } for(String s: tempSet){ editDistance2Set.addAll(editDistance1(s)); } for(String s : editDistance2Set){ if(!trainMap.containsKey(s)){ tmpSet.add(s); } } return tmpSet; } /** * * @Title: editDistance2 * @Description: 得到一個word的編輯距離為2的集合 * @param @param word * @param @param trainMap * @param @return * @return Set<String> * @throws */ public static Set<String> editDistance2(String word,Map<String,Integer> trainMap){ Set<String> editDistance2Set=new HashSet<String>(); Set<String> tmpSet=new HashSet<String>(); Set<String> editDistance1Set=editDistance1(word); for(String s: editDistance1Set){ editDistance2Set.addAll(editDistance1(s)); } for(String s : editDistance2Set){ if(!trainMap.containsKey(s)){ tmpSet.add(s); } } return tmpSet; } /** * * @Title: known * @Description: 輸入的單詞集合是否在訓練語料庫中 * @param @param wordsSet * @param @param map * @param @return * @return Set<String> * @throws */ public static Set<String> known(Set<String> wordsSet, Map<String, Integer> map) { Set<String> set = new HashSet<String>(); for(String s : wordsSet){ if (map.containsKey(s)) { set.add(s); } } return set; } 
public static String known(String word, Map<String, Integer> map) { if(map.containsKey(word)){ return word; }else{ return ""; } } /** * * @Title: editDistance1 * @Description: 編輯距離為1的函式 * @param @param word * @param @return * @return Set<String> * @throws */ public static Set<String> editDistance1(String word) { String tempWord = ""; Set<String> set = new HashSet<String>(); int n = word.length(); // delete一個字母的情況 for (int i = 0; i < n; i++){ tempWord = word.substring(0, i) + word.substring(i + 1); set.add(tempWord); } //transposition for (int i = 0; i < n - 1; i++) { /*tempWord = word.substring(0, i) + word.substring(i + 1, i + 2) + word.substring(i, i + 1) + word.substring(i + 2, n);*/ tempWord = word.substring(0, i) + word.charAt(i+1)+word.charAt(i)+word.substring(i + 2, n); set.add(tempWord); } // alteration 26n for (int i = 0; i < n; i++) { for (int j = 0; j < 26; j++) { tempWord = word.substring(0, i) + c[j] + word.substring(i + 1, n); set.add(tempWord); } } // insertion 26n for (int i = 0; i < n+1; i++) { for (int j = 0; j < 26; j++) { tempWord = word.substring(0, i) + c[j] + word.substring(i, n); set.add(tempWord); } } // 將字母插入到最後 n for (int j = 0; j < 26; j++) { set.add(word + c[j]); } return set; } public static void main(String[] args) throws Exception { // String cnt = util.MyFileTool.readFile(util.Directory.GetAppPath("data") + "big.txt"); System.out.println(correct("speling")); System.out.println(correct("love")); System.out.println(correct("korrecter")); System.out.println(correct("korrect")); System.out.println(correct("qove")); //editDistance1Test("korrecter"); } }
maven專案下載地址:https://blog.csdn.net/ryo1060732496/article/details/80170797