1. 程式人生 > 過濾敏感字DFA JAVA實現

過濾敏感字DFA JAVA實現

pen string 關鍵詞 arr util ber 添加節點 odi index

package com.member.schedule;

import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * @author 徐良永
 * @Date 2011-10-13 上午9:23:43
 */
public class DFA {

    private static
final Logger logger = LoggerFactory.getLogger(DFA.class); /** * 根節點 */ private TreeNode rootNode = new TreeNode(); /** * 關鍵詞緩存 */ private ByteBuffer keywordBuffer = ByteBuffer.allocate(1024); /** * 關鍵詞編碼 */ private String charset = "GBK"; /** * 創建DFA * *
@param keywordList * @throws UnsupportedEncodingException */ public void createKeywordTree(List<String> keywordList) throws UnsupportedEncodingException { for (String keyword : keywordList) { if (keyword == null) continue; keyword = keyword.trim();
byte[] bytes = keyword.getBytes(charset); TreeNode tempNode = rootNode; // 循環每個字節 for (int i = 0; i < bytes.length; i++) { int index = bytes[i] & 0xff; // 字符轉換成數字 TreeNode node = tempNode.getSubNode(index); if (node == null) { // 沒初始化 node = new TreeNode(); tempNode.setSubNode(index, node); } tempNode = node; if (i == bytes.length - 1) { tempNode.setKeywordEnd(true); // 關鍵詞結束, 設置結束標誌 logger.debug("DFA:{}", keyword); } } // end for } // end for } /** * 搜索關鍵字 */ public String searchKeyword(String text) throws UnsupportedEncodingException { return searchKeyword(text.getBytes(charset)); } /** * 搜索關鍵字 */ public String searchKeyword(byte[] bytes) { StringBuilder words = new StringBuilder(); if (bytes == null || bytes.length == 0) { return words.toString(); } TreeNode tempNode = rootNode; int rollback = 0; // 回滾數 int position = 0; // 當前比較的位置 while (position < bytes.length) { int index = bytes[position] & 0xFF; keywordBuffer.put(bytes[position]); // 寫關鍵詞緩存 tempNode = tempNode.getSubNode(index); // 當前位置的匹配結束 if (tempNode == null) { position = position - rollback; // 回退 並測試下一個字節 rollback = 0; tempNode = rootNode; // 狀態機復位 keywordBuffer.clear(); // 清空 } else if (tempNode.isKeywordEnd()) { // 是結束點 記錄關鍵詞 keywordBuffer.flip(); String keyword = Charset.forName(charset).decode(keywordBuffer).toString(); logger.debug("Find keyword:{}", keyword); keywordBuffer.limit(keywordBuffer.capacity()); if (words.length() == 0) words.append(keyword); else words.append(":").append(keyword); rollback = 1; // 遇到結束點 rollback 置為1 } else { rollback++; // 非結束點 回退數加1 } position++; } return words.toString(); } public void setCharset(String charset) { this.charset = charset; } }
package com.member.schedule;

import java.util.ArrayList;
import java.util.List;

/**
 * Tree node. Every node holds a list of 256 child slots, all of them
 * {@code null} right after construction.
 *
 * @author 徐良永
 * @Date 2011-10-12 3:11:24 AM
 */
public class TreeNode {

    private static final int NODE_LEN = 256;

    /** {@code true}: a keyword ends at this node; {@code false}: it does not. */
    private boolean keywordEnd;

    /** Child slots; filled with {@link #NODE_LEN} {@code null} entries by the constructor. */
    private final List<TreeNode> children;

    /** Creates a node with {@link #NODE_LEN} empty child slots and no keyword-end flag. */
    public TreeNode() {
        List<TreeNode> slots = new ArrayList<TreeNode>(NODE_LEN);
        while (slots.size() < NODE_LEN) {
            slots.add(null);
        }
        children = slots;
    }

    /**
     * Stores a child node in the given slot, replacing whatever was there.
     *
     * @param index slot to write
     * @param node  node to store in that slot
     */
    public void setSubNode(int index, TreeNode node) {
        children.set(index, node);
    }

    /**
     * Returns the child stored in the given slot.
     *
     * @param index slot to read
     * @return the node in that slot, or {@code null} when the slot is empty
     */
    public TreeNode getSubNode(int index) {
        return children.get(index);
    }

    /**
     * @return {@code true} when this node has been flagged as the end of a keyword
     */
    public boolean isKeywordEnd() {
        return keywordEnd;
    }

    /**
     * @param end {@code true} to flag this node as the end of a keyword
     */
    public void setKeywordEnd(boolean end) {
        keywordEnd = end;
    }

}

link:http://www.iteye.com/topic/1116520

過濾敏感字DFA JAVA實現