1. 程式人生 > >java DFA 敏感詞過濾

java DFA 敏感詞過濾

/**
 * Sensitive-word detection backed by a character trie (a simple DFA).
 *
 * <p>The trie is built once, during class initialisation, from a fixed word set. Every node is a
 * {@code Map<String, Object>} whose single-character keys map to child nodes and whose
 * {@code "isEnd"} key holds {@code "1"} when a word ends at that node and {@code "0"} otherwise.
 *
 * <p>Matching is "shortest match per start position": scanning from a given index stops at the
 * first node flagged as a word end. The scan then advances by one character, so overlapping
 * words are each reported.
 */
public class SensitiveWordUtils
{
    /**
     * Match rule: return as soon as the first sensitive word is found.
     */
    public static final int MATCHTYPE_MIN = 1 << 0;

    /**
     * Match rule: scan the whole text and report every sensitive word found.
     */
    public static final int MATCHTYPE_ALL = 1 << 1;

    /**
     * Key under which each trie node stores its end-of-word flag. It is longer than one
     * character, so it can never collide with a single-character child key.
     */
    private static final String IS_END_KEY = "isEnd";

    /**
     * Flag value: a word ends at this node.
     */
    private static final String END = "1";

    /**
     * Flag value: no word ends at this node.
     */
    private static final String END_NOT = "0";

    /**
     * The words to detect.
     */
    private static final Set<String> sensitiveWordSets = new HashSet<>();

    /**
     * Root node of the trie; it carries no end-of-word flag of its own.
     */
    private static final Map<String, Object> sensitiveWordMap = new HashMap<>();

    static
    {
        sensitiveWordSets.add("日本人");
        sensitiveWordSets.add("本來啊");

        for (String keyword : sensitiveWordSets)
        {
            addWord(keyword);
        }
    }

    /**
     * Inserts one word into the trie, creating missing nodes along the way.
     *
     * @param keyword the word to insert; an empty word is ignored
     */
    private static void addWord(String keyword)
    {
        if (keyword.isEmpty())
        {
            return;
        }
        Map<String, Object> node = sensitiveWordMap;
        for (int i = 0; i < keyword.length(); i++)
        {
            String key = String.valueOf(keyword.charAt(i));
            Map<String, Object> child = childOf(node, key);
            if (child == null)
            {
                child = new HashMap<>();
                child.put(IS_END_KEY, END_NOT);
                node.put(key, child);
            }
            node = child;
        }
        // Only the node of the last character is flagged; flags of shorter words that are
        // prefixes of this one are left untouched.
        node.put(IS_END_KEY, END);
    }

    /**
     * Looks up the child node reached from {@code node} by the single-character {@code key}.
     *
     * @param node the current trie node
     * @param key a one-character string
     * @return the child node, or {@code null} when there is no such transition
     */
    @SuppressWarnings("unchecked")
    private static Map<String, Object> childOf(Map<String, Object> node, String key)
    {
        // Safe cast: single-character keys are only ever stored with a node (map) as value.
        return (Map<String, Object>) node.get(key);
    }

    /**
     * Returns the length of the shortest sensitive word that starts exactly at {@code start}.
     *
     * @param str the text being scanned (not {@code null})
     * @param start index of the first character to examine (non-negative)
     * @return the matched word length, or {@code 0} when no word starts at {@code start}
     */
    private static int matchLengthAt(String str, int start)
    {
        Map<String, Object> node = sensitiveWordMap;
        for (int j = start; j < str.length(); j++)
        {
            node = childOf(node, String.valueOf(str.charAt(j)));
            if (node == null)
            {
                return 0;
            }
            if (END.equals(node.get(IS_END_KEY)))
            {
                return j - start + 1;
            }
        }
        return 0;
    }

    /**
     * Counts the sensitive-word occurrences in a text.
     *
     * @param str the text to scan; {@code null} is treated as containing no sensitive words
     * @param beginIndex index at which scanning starts; negative values are treated as
     *        {@code 0}, values at or beyond the text length yield no matches
     * @param matchType {@link #MATCHTYPE_MIN} to stop at the first occurrence; any other value
     *        (normally {@link #MATCHTYPE_ALL}) counts every occurrence
     * @return the number of occurrences found (at most {@code 1} for {@link #MATCHTYPE_MIN})
     */
    public static int checkSensitiveWord(String str, int beginIndex, int matchType)
    {
        if (str == null)
        {
            return 0;
        }
        int count = 0;
        for (int i = Math.max(beginIndex, 0); i < str.length(); i++)
        {
            if (matchLengthAt(str, i) > 0)
            {
                count++;
                if (matchType == MATCHTYPE_MIN)
                {
                    return count;
                }
            }
        }
        return count;
    }

    /**
     * Collects the distinct sensitive words that occur in a text.
     *
     * @param str the text to scan; {@code null} is treated as containing no sensitive words
     * @param beginIndex index at which scanning starts; negative values are treated as
     *        {@code 0}, values at or beyond the text length yield no matches
     * @param matchType {@link #MATCHTYPE_MIN} to stop at the first word found; any other value
     *        (normally {@link #MATCHTYPE_ALL}) collects every word found
     * @return a new, mutable set of the matched words; empty when nothing matched
     */
    public static Set<String> firstSensitiveWord(String str, int beginIndex, int matchType)
    {
        Set<String> found = new HashSet<>();
        if (str == null)
        {
            return found;
        }
        for (int i = Math.max(beginIndex, 0); i < str.length(); i++)
        {
            int length = matchLengthAt(str, i);
            if (length > 0)
            {
                found.add(str.substring(i, i + length));
                if (matchType == MATCHTYPE_MIN)
                {
                    return found;
                }
            }
        }
        return found;
    }

    public static void main(String[] args)
    {
        System.out.println(checkSensitiveWord("日本來啊日本人", 0, MATCHTYPE_ALL));
    }
}

sensitiveWordMap 的結構示例(此處以敏感詞「日本人」、「日本男人」、「法輪功」為例,與上方程式碼中的詞庫不同):{日={本={男={人={isEnd=1}, isEnd=0}, 人={isEnd=1}, isEnd=0}, isEnd=0}, 法={isEnd=0, 輪={isEnd=0, 功={isEnd=1}}}}