過濾敏感字DFA JAVA實現
阿新 • • 發佈:2017-08-13
pen string 關鍵詞 arr util ber 添加節點 odi index
package com.member.schedule;

import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Keyword (sensitive-word) matcher built as a byte-level trie.
 *
 * <p>Keywords are encoded with {@link #setCharset(String) the configured charset}
 * (default {@code "GBK"}) and inserted byte by byte into a tree of {@link TreeNode}s.
 * Searching walks the same tree over the bytes of the input text.
 *
 * <p>NOTE(review): matching is done on raw bytes and a new attempt may start at any
 * byte offset, so with a multi-byte charset a match could begin in the middle of a
 * character. Confirm this is acceptable for the keyword set in use.
 *
 * @author 徐良永
 * @Date 2011-10-13 9:23:43 AM
 */
public class DFA {

    private static final Logger logger = LoggerFactory.getLogger(DFA.class);

    /** Root node of the keyword tree. */
    private final TreeNode rootNode = new TreeNode();

    /** Charset name used to encode keywords and text, and to decode matched bytes. */
    private String charset = "GBK";

    /**
     * Adds every keyword of the list to the keyword tree.
     *
     * <p>{@code null} entries are skipped; each keyword is trimmed before it is encoded.
     * A keyword that is empty after trimming adds nothing to the tree.
     *
     * @param keywordList keywords to register; {@code null} is treated as an empty list
     * @throws UnsupportedEncodingException if the configured charset name is not supported
     */
    public void createKeywordTree(List<String> keywordList) throws UnsupportedEncodingException {
        if (keywordList == null) {
            return;
        }
        for (String keyword : keywordList) {
            if (keyword == null) {
                continue;
            }
            keyword = keyword.trim();
            byte[] bytes = keyword.getBytes(charset);

            TreeNode tempNode = rootNode;
            // Walk / extend the tree one byte at a time.
            for (int i = 0; i < bytes.length; i++) {
                int index = bytes[i] & 0xff; // unsigned byte value used as child slot
                TreeNode node = tempNode.getSubNode(index);
                if (node == null) {
                    // Slot not populated yet: create the child.
                    node = new TreeNode();
                    tempNode.setSubNode(index, node);
                }
                tempNode = node;

                if (i == bytes.length - 1) {
                    // Last byte of the keyword: flag the node as a keyword end.
                    tempNode.setKeywordEnd(true);
                    logger.debug("DFA:{}", keyword);
                }
            }
        }
    }

    /**
     * Searches the text for registered keywords.
     *
     * @param text text to scan; {@code null} yields an empty result
     * @return the keywords found, in order of discovery, joined with {@code ":"};
     *         an empty string when nothing matched
     * @throws UnsupportedEncodingException if the configured charset name is not supported
     */
    public String searchKeyword(String text) throws UnsupportedEncodingException {
        if (text == null) {
            return "";
        }
        return searchKeyword(text.getBytes(charset));
    }

    /**
     * Searches already-encoded text for registered keywords.
     *
     * <p>The scan keeps no state between calls: the bytes of the current candidate are
     * identified by their start offset inside {@code bytes} rather than being copied
     * into a shared buffer, so a partial match at the end of one call cannot leak into
     * the result of the next one and there is no fixed limit on keyword length.
     *
     * @param bytes text encoded with the configured charset; may be {@code null} or empty
     * @return the keywords found, in order of discovery, joined with {@code ":"};
     *         an empty string when nothing matched
     */
    public String searchKeyword(byte[] bytes) {
        StringBuilder words = new StringBuilder();
        if (bytes == null || bytes.length == 0) {
            return words.toString();
        }

        TreeNode tempNode = rootNode;
        int rollback = 0;   // how far to step back when the current candidate fails
        int matchStart = 0; // offset of the first byte of the current candidate
        int position = 0;   // offset of the byte being compared

        while (position < bytes.length) {
            int index = bytes[position] & 0xFF;
            tempNode = tempNode.getSubNode(index);

            if (tempNode == null) {
                // Candidate failed: step back, reset the state machine and let the
                // increment below move on to the next byte to test.
                position = position - rollback;
                rollback = 0;
                tempNode = rootNode;
                matchStart = position + 1;
            } else if (tempNode.isKeywordEnd()) {
                // Reached a keyword end: record bytes[matchStart..position].
                String keyword = decodeKeyword(bytes, matchStart, position - matchStart + 1);
                logger.debug("Find keyword:{}", keyword);
                if (words.length() > 0) {
                    words.append(":");
                }
                words.append(keyword);
                // After a hit, a later failure only re-tests the failing byte from the root.
                rollback = 1;
            } else {
                rollback++; // still inside a candidate, not at a keyword end
            }
            position++;
        }
        return words.toString();
    }

    /**
     * Sets the charset name used for encoding keywords/text and decoding matches.
     *
     * <p>The tree stores keyword bytes as encoded at the time
     * {@link #createKeywordTree(List)} ran, so set the charset before building the tree.
     *
     * @param charset charset name, e.g. {@code "GBK"}
     */
    public void setCharset(String charset) {
        this.charset = charset;
    }

    /** Decodes {@code length} bytes of {@code bytes} starting at {@code offset} with the configured charset. */
    private String decodeKeyword(byte[] bytes, int offset, int length) {
        return Charset.forName(charset).decode(ByteBuffer.wrap(bytes, offset, length)).toString();
    }
}
package com.member.schedule;

import java.util.ArrayList;
import java.util.List;

/**
 * Tree node holding a fixed table of 256 child slots (indices 0 to 255) and a flag
 * telling whether a keyword ends at this node.
 *
 * @author 徐良永
 * @Date 2011-10-12 3:11:24 AM
 */
public class TreeNode {

    /** Number of child slots allocated for every node. */
    private static final int SLOT_COUNT = 256;

    /** Child table; a {@code null} entry means no child has been set for that index. */
    private final List<TreeNode> children;

    /** {@code true} when a keyword terminates at this node, {@code false} otherwise. */
    private boolean keywordEnd;

    /** Creates a node whose child slots are all empty and which is not a keyword end. */
    public TreeNode() {
        this.keywordEnd = false;
        this.children = new ArrayList<TreeNode>(SLOT_COUNT);
        // Pre-fill the table so every index in [0, SLOT_COUNT) can be read or overwritten.
        while (this.children.size() < SLOT_COUNT) {
            this.children.add(null);
        }
    }

    /**
     * Stores a child node in the given slot, replacing whatever was there.
     *
     * @param index slot index
     * @param node  child node to store
     */
    public void setSubNode(int index, TreeNode node) {
        this.children.set(index, node);
    }

    /**
     * Returns the child stored in the given slot.
     *
     * @param index slot index
     * @return the child node, or {@code null} if the slot is empty
     */
    public TreeNode getSubNode(int index) {
        return this.children.get(index);
    }

    /**
     * @return {@code true} if a keyword ends at this node
     */
    public boolean isKeywordEnd() {
        return this.keywordEnd;
    }

    /**
     * Marks or unmarks this node as the end of a keyword.
     *
     * @param end {@code true} to mark this node as a keyword end
     */
    public void setKeywordEnd(boolean end) {
        this.keywordEnd = end;
    }
}
link:http://www.iteye.com/topic/1116520
過濾敏感字DFA JAVA實現