1. 程式人生 > >【Java】聊天過濾 DFA演算法的Java實現

【Java】聊天過濾 DFA演算法的Java實現

開心就好

Trie樹的原理不講了,直接上程式碼

ChatFilter.java 是核心的過濾器,它從 NoneWantToSee.list 檔案中讀取敏感詞;這個檔案中每行放一個敏感詞,檔案放在 src 目錄下即可。

過濾器實現資料載入和提供過濾服務,過濾服務是把敏感詞替換成**,可以自定義行為。

和一些例子不同,我在程式碼中處理了部分重疊狀態的識別,比如“絲襪” “絲襪網” 都作為敏感詞可以被識別出來並處理掉。

另外有一點,構造使用的Set是TreeSet,其中的元素長度從大到小排列,這樣在構造sensitiveMap的時候,重疊匹配處理起來方便一些。

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Comparator;
import java.util.HashMap;
import java.util.TreeSet;

/**
 * Chat filter that masks sensitive words using a character trie (DFA-style matching).
 *
 * <p>The word list is read once, in the static initializer, from the classpath
 * resource {@code /NoneWantToSee.list} (one word per line). Every occurrence of a
 * listed word in the text passed to {@link #filte(String)} is replaced with
 * {@code **}; when several words start at the same position the longest one wins.
 *
 * @author yuantao
 */
public class ChatFilter {
    /** Text that replaces every matched sensitive word. */
    private static final String REPLACEMENT = "**";

    /** Classpath resource holding the word list, one word per line. */
    private static final String WORD_LIST_RESOURCE = "/NoneWantToSee.list";

    /** Root level of the trie: first character of a word -> its node. */
    private static final HashMap<String, ChatFilterTreeNode> sensitiveMap = new HashMap<>();

    static {
        // Longest words first; ties are broken lexicographically so that the
        // comparator is a consistent total order (it must return 0 for equal
        // strings, otherwise the TreeSet keeps duplicates).
        TreeSet<String> set = new TreeSet<>(new Comparator<String>() {
            @Override
            public int compare(String o1, String o2) {
                int byLength = Integer.compare(o2.length(), o1.length());
                return byLength != 0 ? byLength : o1.compareTo(o2);
            }
        });

        // Resolve the list through the class loader instead of building a file
        // path from a URL: URL paths are percent-encoded and do not exist as
        // files when the classes are packaged in a jar.
        InputStream in = ChatFilter.class.getResourceAsStream(WORD_LIST_RESOURCE);
        if (in == null) {
            System.err.println("ChatFilter: word list " + WORD_LIST_RESOURCE
                    + " not found on the classpath; nothing will be filtered");
        } else {
            // The charset is explicit so the words do not depend on the platform default.
            try (BufferedReader bReader = new BufferedReader(
                    new InputStreamReader(in, StandardCharsets.UTF_8))) {
                String line;
                while ((line = bReader.readLine()) != null) {
                    String word = normalizeWord(line);
                    if (!word.isEmpty()) {
                        set.add(word);
                    }
                }
                initFilter(set);
            } catch (IOException e) {
                // Best effort: a broken word list must not prevent the class from
                // loading; the filter simply stays empty.
                e.printStackTrace();
            }
        }
    }

    /**
     * Does nothing by itself; calling it forces this class to be initialized,
     * which runs the static block that loads the word list.
     */
    public static void initChatFilter() {}

    /**
     * Cleans one raw line of the word list.
     *
     * @param line a line as returned by {@code BufferedReader.readLine()}
     * @return the line without a leading byte-order mark and without surrounding
     *         whitespace; empty if the line carries no word
     */
    private static String normalizeWord(String line) {
        String word = line;
        if (!word.isEmpty() && word.charAt(0) == '\uFEFF') {
            word = word.substring(1);
        }
        return word.trim();
    }

    /**
     * Builds the keyword trie in {@code sensitiveMap}.
     *
     * <p>Node flags after construction: a node where a word ends and that has no
     * children has {@code isEnd == true}; a node where a word ends but longer
     * words continue has {@code isOverLapEnd == true}; every other node has both
     * flags {@code false}. The result does not depend on the iteration order of
     * {@code keySet}.
     *
     * @param keySet the sensitive words (the static initializer passes them
     *               sorted by descending length)
     */
    private static void initFilter(TreeSet<String> keySet) {
        for (String oneKey : keySet) {
            HashMap<String, ChatFilterTreeNode> iterMap = sensitiveMap;
            int lastIndex = oneKey.length() - 1;
            for (int index = 0; index <= lastIndex; ++index) {
                String keyChar = String.valueOf(oneKey.charAt(index));
                boolean isLastChar = index == lastIndex;
                ChatFilterTreeNode node = iterMap.get(keyChar);
                if (node == null) {
                    // Not present yet: create it; only the last character of a word is an end.
                    node = new ChatFilterTreeNode();
                    node.setEnd(isLastChar);
                    iterMap.put(keyChar, node);
                } else if (isLastChar) {
                    // The word ends on an existing node. If longer words continue
                    // below it, it becomes an overlapped end; otherwise it is
                    // already a plain end (the same word seen again).
                    if (!node.getNextNodeMap().isEmpty()) {
                        node.setEnd(false);
                        node.setOverLapEnd(true);
                    }
                } else if (node.isEnd()) {
                    // A shorter word ended here and the current word continues
                    // past it: keep the shorter word as an overlapped end.
                    node.setEnd(false);
                    node.setOverLapEnd(true);
                }
                iterMap = node.getNextNodeMap();
            }
        }
    }

    /**
     * Replaces every sensitive word in the given text with {@code **}.
     *
     * <p>The text is scanned left to right. At each position the longest word
     * starting there is replaced and scanning resumes right after it; if no word
     * starts there the character is copied unchanged. Because every position is
     * tried as a possible word start, a failed partial match can neither hide a
     * word that begins inside it nor swallow the characters it had consumed.
     *
     * @param targetStr the text to filter; may be {@code null}
     * @return the filtered text, or {@code null} if {@code targetStr} is {@code null}
     */
    public String filte(String targetStr) {
        if (targetStr == null || targetStr.isEmpty()) {
            return targetStr;
        }
        StringBuilder sb = new StringBuilder(targetStr.length());
        int index = 0;
        while (index < targetStr.length()) {
            int matchEnd = longestMatchEnd(targetStr, index);
            if (matchEnd > index) {
                sb.append(REPLACEMENT);
                index = matchEnd;
            } else {
                sb.append(targetStr.charAt(index));
                index++;
            }
        }
        return sb.toString();
    }

    /**
     * Finds the longest sensitive word that starts exactly at {@code start}.
     *
     * @param text  the text being filtered
     * @param start index of the first character of the candidate word
     * @return the exclusive end index of the longest match, or {@code -1} if no
     *         word starts at {@code start}
     */
    private static int longestMatchEnd(String text, int start) {
        HashMap<String, ChatFilterTreeNode> level = sensitiveMap;
        int matchEnd = -1;
        for (int pos = start; pos < text.length(); pos++) {
            ChatFilterTreeNode node = level.get(String.valueOf(text.charAt(pos)));
            if (node == null) {
                break;
            }
            if (node.isEnd() || node.isOverLapEnd()) {
                // Remember this end but keep walking: a longer word may still match.
                matchEnd = pos + 1;
            }
            level = node.getNextNodeMap();
        }
        return matchEnd;
    }
}

資料結構 ChatFilterTreeNode.java 有兩個標記:isEnd 標記葉子節點(敏感詞的結尾),isOverLapEnd 標記被覆蓋的葉子節點,也就是某個敏感詞在此結束、但更長的敏感詞還會繼續往下延伸的節點。
import java.util.HashMap;
/**
 * A node of the chat filter's lookup tree.
 *
 * <p>Each character of a keyword maps to one node. The node of a keyword's last
 * character is expected to have {@code isEnd == true} and an empty child map.
 *
 * @author yuantao
 */
public class ChatFilterTreeNode {
    /** Whether a keyword ends at this node; new nodes start out as ends. */
    private boolean end = true;

    /** Whether a keyword ends here although the node also has children. */
    private boolean overlapEnd = false;

    /** Child nodes keyed by the next character; allocated on first access. */
    private HashMap<String, ChatFilterTreeNode> children = null;

    /**
     * Returns the child map, allocating an empty one on first access.
     *
     * @return the child map, never {@code null}
     */
    public HashMap<String, ChatFilterTreeNode> getNextNodeMap() {
        HashMap<String, ChatFilterTreeNode> current = children;
        if (current == null) {
            current = new HashMap<>();
            children = current;
        }
        return current;
    }

    /**
     * Replaces the child map.
     *
     * @param nextNodeMap the new child map; if {@code null}, the next call to
     *                    {@link #getNextNodeMap()} allocates a fresh empty map
     */
    public void setNextNodeMap(HashMap<String, ChatFilterTreeNode> nextNodeMap) {
        children = nextNodeMap;
    }

    /** @return the current value of the end flag */
    public boolean isEnd() {
        return end;
    }

    /** @param isEnd the new value of the end flag */
    public void setEnd(boolean isEnd) {
        end = isEnd;
    }

    /** @return the current value of the overlapped-end flag */
    public boolean isOverLapEnd() {
        return overlapEnd;
    }

    /** @param isOverLapEnd the new value of the overlapped-end flag */
    public void setOverLapEnd(boolean isOverLapEnd) {
        overlapEnd = isOverLapEnd;
    }
}

用法很簡單

    ChatFilter filter =new ChatFilter();

    String testStr = "啊日本人絲襪敏網啊日本人敏網絲襪網我日本絲襪日本";

    System.out.println(testStr);

    String result =filter.filte(testStr);

    System.out.println(result);

敏感詞是[絲襪, 絲襪網]

啊日本人絲襪敏網啊日本人敏網絲襪網我日本絲襪日本

啊日本人**敏網啊日本人敏網**我日本**日本