【Java】聊天過濾 DFA演算法的Java實現
阿新 • • 發佈:2019-02-01
開心就好
Trie樹的原理不講了,直接上程式碼
ChatFilter.java 是核心的過濾器,它從 NoneWantToSee.list 檔案中讀取敏感詞。這個檔案每行放一個敏感詞,放在 src 目錄下就行。
過濾器實現資料載入和提供過濾服務,過濾服務是把敏感詞替換成**,可以自定義行為。
和一些例子不同,我在程式碼中處理了部分重疊狀態的識別,比如“絲襪” “絲襪網” 都作為敏感詞可以被識別出來並處理掉。
另外有一點,構造使用的Set是TreeSet,其中的元素長度從大到小排列,這樣在構造sensitiveMap的時候,重疊匹配處理起來方便一些。
import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.InputStreamReader; import java.util.Comparator; import java.util.HashMap; import java.util.TreeSet; /** * 聊天過濾器,DFA演算法 * @author yuantao * */ public class ChatFilter { private static HashMap<String, ChatFilterTreeNode> sensitiveMap = new HashMap<>(); static { File file = new File(ChatFilter.class.getResource("/").getPath()+"NoneWantToSee.list"); TreeSet<String> set = new TreeSet<>(new Comparator<String>() { @Override public int compare(String o1, String o2) { // TODO Auto-generated method stub return o1.length() > o2.length() ? -1 : 1; } }); try(BufferedReader bReader = new BufferedReader(new InputStreamReader(new FileInputStream(file)));) { String line = null; while ((line = bReader.readLine()) != null) { set.add(line); } initFilter(set); } catch (Exception e) { // TODO: handle exception e.printStackTrace(); } } /** * 只是用來載入靜態程式碼的 */ public static void initChatFilter(){} /** * 構造關鍵詞查詢器 * @param keySet 按長度倒敘排列的TreeSet */ private static void initFilter(TreeSet<String> keySet) { for (String oneKey : keySet) { HashMap<String, ChatFilterTreeNode> iterMap = sensitiveMap; for (int index = 0; index < oneKey.length(); ++index) { char keyChar = oneKey.charAt(index); ChatFilterTreeNode node = iterMap.get(String.valueOf(keyChar)); // 按一個字元查詢 if (node != null) { //如果存在嘗試下探 if (index < (oneKey.length()-1)) { node.setEnd(false); node.setOverLapEnd(false); } else { //部分匹配 if (!node.getNextNodeMap().isEmpty()) { node.setEnd(false); node.setOverLapEnd(true); } } iterMap = node.getNextNodeMap(); } else { //不存在就構造 ChatFilterTreeNode nextNewNode = new ChatFilterTreeNode(); if (index < (oneKey.length()-1)) { nextNewNode.setEnd(false); } iterMap.put(String.valueOf(keyChar), nextNewNode); iterMap = nextNewNode.getNextNodeMap(); } } } } public String filte(String targetStr) { HashMap<String, ChatFilterTreeNode> iterMap = sensitiveMap; StringBuilder sb = new StringBuilder(); boolean 
needProcessOverlap = false; int sensitivityIndex = 0; // 標記敏感詞起始位置 int normalStartIndex = 0; // 標記正常詞起始位置 int normalEndIndex = 0; // 標記正常詞結束位置 for (int index = 0; index < targetStr.length(); index++) { char inputChar = targetStr.charAt(index); ChatFilterTreeNode node = iterMap.get(String.valueOf(inputChar)); if (node != null) { iterMap = node.getNextNodeMap(); if (node.isEnd()) { //匹配上了先替換敏感詞再調整索引值 if (normalEndIndex > normalStartIndex) { //先擷取前面的非敏感詞部分 sb.append(targetStr.substring(normalStartIndex, normalEndIndex)); } sb.append("**"); normalStartIndex = index + 1; sensitivityIndex = index + 1; normalEndIndex = index + 1; iterMap = sensitiveMap; needProcessOverlap = false; } else if (node.isOverLapEnd()) { needProcessOverlap = true; } } else { if (needProcessOverlap) { //處理重疊匹配的狀態 if (normalEndIndex > normalStartIndex) { sb.append(targetStr.substring(normalStartIndex, normalEndIndex)); } sb.append("**"); needProcessOverlap = false; normalStartIndex = index; sensitivityIndex = index; normalEndIndex = index; } //這裡要嘗試去匹配一下, 如果匹配了一半退出了,需要重新去匹配 iterMap = sensitiveMap; node = iterMap.get(String.valueOf(inputChar)); if (node != null) { normalEndIndex = index; sensitivityIndex = index; iterMap = node.getNextNodeMap(); if (node.isEnd()) { //匹配上了先替換敏感詞再調整索引值 if (normalEndIndex > normalStartIndex) { //先擷取前面的非敏感詞部分 sb.append(targetStr.substring(normalStartIndex, normalEndIndex)); } sb.append("**"); normalStartIndex = index + 1; sensitivityIndex = index + 1; normalEndIndex = index + 1; iterMap = sensitiveMap; } } else { //1.正常start=正常end=敏感start,讓正常end=index, if (normalEndIndex == normalStartIndex && normalEndIndex == sensitivityIndex) { iterMap = sensitiveMap; sensitivityIndex = normalStartIndex; } normalEndIndex = index + 1; } } } if (needProcessOverlap) { if (normalStartIndex < normalEndIndex) { sb.append(targetStr.substring(normalStartIndex, normalEndIndex)); } sb.append("**"); normalStartIndex = targetStr.length(); sensitivityIndex = targetStr.length(); normalEndIndex = 
targetStr.length(); } if (normalStartIndex < targetStr.length()) { sb.append(targetStr.substring(normalStartIndex)); } return sb.toString(); } }
資料結構 ChatFilterTreeNode.java 有兩個標記:isEnd 標記葉子節點(敏感詞在此結束,且沒有更長的敏感詞經過此節點);isOverLapEnd 標記被覆蓋的葉子節點(較短的敏感詞在此結束,但還有更長的敏感詞繼續往下)。
import java.util.HashMap;

/**
 * One node of the chat filter's lookup tree.
 *
 * <p>Every character of a key maps to a node. A freshly created node is
 * flagged as an end node, is not flagged as an overlapped end, and has no
 * child map until one is requested or assigned.
 *
 * @author yuantao
 */
public class ChatFilterTreeNode {

    /** Children keyed by the next character; allocated on first access. */
    private HashMap<String, ChatFilterTreeNode> children;

    /** End-of-key flag; starts out {@code true}. */
    private boolean end;

    /** Overlapped-end flag; starts out {@code false}. */
    private boolean overlapEnd;

    /** Creates a node in its default state (end node, no overlap, no children). */
    public ChatFilterTreeNode() {
        this.end = true;
        this.overlapEnd = false;
        this.children = null;
    }

    public boolean isEnd() {
        return this.end;
    }

    public void setEnd(boolean isEnd) {
        this.end = isEnd;
    }

    public boolean isOverLapEnd() {
        return this.overlapEnd;
    }

    public void setOverLapEnd(boolean isOverLapEnd) {
        this.overlapEnd = isOverLapEnd;
    }

    /**
     * Returns the child map, creating an empty one first if none is set.
     *
     * @return the child map, never {@code null}
     */
    public HashMap<String, ChatFilterTreeNode> getNextNodeMap() {
        if (this.children != null) {
            return this.children;
        }
        this.children = new HashMap<>();
        return this.children;
    }

    /**
     * Replaces the child map. Passing {@code null} clears it; the next call to
     * {@link #getNextNodeMap()} then creates a new empty map.
     *
     * @param nextNodeMap the new child map, or {@code null}
     */
    public void setNextNodeMap(HashMap<String, ChatFilterTreeNode> nextNodeMap) {
        this.children = nextNodeMap;
    }
}
用法很簡單
// Sample input mixing ordinary text with the sensitive words listed below.
String input = "啊日本人絲襪敏網啊日本人敏網絲襪網我日本絲襪日本";
ChatFilter chatFilter = new ChatFilter();
// Print the text before and after filtering.
System.out.println(input);
String filtered = chatFilter.filte(input);
System.out.println(filtered);
敏感詞是[絲襪, 絲襪網]
啊日本人絲襪敏網啊日本人敏網絲襪網我日本絲襪日本
啊日本人**敏網啊日本人敏網**我日本**日本