java DFA 敏感詞過濾
阿新 • • 發佈:2019-02-17
/**
 * Sensitive-word filter backed by a character trie (DFA) stored as nested maps.
 *
 * <p>Every trie node is a {@code Map<String, Object>}. A single-character key maps
 * to the child node for that character, and the reserved key {@code "isEnd"} maps
 * to {@code "1"} when a sensitive word ends on that node and to {@code "0"}
 * otherwise. The trie is built once in the static initializer from
 * {@code sensitiveWordSets}; the lookup methods in this class only read it.
 *
 * <p>Matching is "shortest match": scanning from a given start position stops at
 * the first node flagged as a word end, so a longer word that has another
 * sensitive word as a prefix is never reported from that position.
 */
public final class SensitiveWordUtils
{
    /**
     * Match mode: return as soon as the first sensitive word is found.
     */
    public static final int MATCHTYPE_MIN = 1 << 0;

    /**
     * Match mode: scan the whole text and report every sensitive word.
     */
    public static final int MATCHTYPE_ALL = 1 << 1;

    /**
     * Reserved node key holding the end-of-word flag. Child keys are always
     * single-character strings, so this multi-character key cannot collide with them.
     */
    private static final String END_KEY = "isEnd";

    /** Flag value: a sensitive word ends on this node. */
    private static final String END = "1";

    /** Flag value: no sensitive word ends on this node. */
    private static final String END_NOT = "0";

    /** Root node of the trie. */
    private static final Map<String, Object> sensitiveWordMap = new HashMap<>();

    /** The sensitive words that are loaded into the trie. */
    private static final Set<String> sensitiveWordSets = new HashSet<>();

    static
    {
        sensitiveWordSets.add("日本人");
        sensitiveWordSets.add("本來啊");
        for (String keyword : sensitiveWordSets)
        {
            addWord(keyword);
        }
    }

    /** Static utility class; not meant to be instantiated. */
    private SensitiveWordUtils()
    {
    }

    /**
     * Inserts one word into the trie, creating missing nodes along the way and
     * flagging the node of the last character as a word end.
     *
     * @param keyword the word to insert; {@code null} or empty words are ignored
     */
    private static void addWord(String keyword)
    {
        if (keyword == null || keyword.isEmpty())
        {
            return;
        }
        Map<String, Object> node = sensitiveWordMap;
        for (int i = 0; i < keyword.length(); i++)
        {
            char c = keyword.charAt(i);
            Map<String, Object> child = childOf(node, c);
            if (child == null)
            {
                child = new HashMap<>();
                child.put(END_KEY, END_NOT); // not a word end until proven otherwise
                node.put(String.valueOf(c), child);
            }
            node = child;
        }
        // Overwrites END_NOT, and also marks an existing inner node when this word
        // is a prefix of a word inserted earlier.
        node.put(END_KEY, END);
    }

    /**
     * Returns the child node reached from {@code node} via character {@code c}.
     *
     * @param node the current trie node
     * @param c the character to follow
     * @return the child node, or {@code null} if there is no such transition
     */
    @SuppressWarnings("unchecked") // single-character keys are only ever mapped to child maps by addWord
    private static Map<String, Object> childOf(Map<String, Object> node, char c)
    {
        return (Map<String, Object>)node.get(String.valueOf(c));
    }

    /**
     * Walks the trie along {@code str} starting at {@code start} and stops at the
     * first node flagged as a word end.
     *
     * @param str the text being scanned, not {@code null}
     * @param start index of the first character to consume
     * @return the exclusive end index of the shortest sensitive word beginning at
     *         {@code start}, or {@code -1} if none begins there
     */
    private static int matchEnd(String str, int start)
    {
        Map<String, Object> node = sensitiveWordMap;
        for (int j = start; j < str.length(); j++)
        {
            node = childOf(node, str.charAt(j));
            if (node == null)
            {
                return -1;
            }
            if (END.equals(node.get(END_KEY)))
            {
                return j + 1;
            }
        }
        return -1; // ran out of text in the middle of a word
    }

    /**
     * Counts the sensitive words found in {@code str}.
     *
     * <p>Every position from {@code beginIndex} onwards is tried as a possible word
     * start, so overlapping occurrences are each counted.
     *
     * @param str the text to scan; {@code null} is treated as containing no matches
     * @param beginIndex index at which scanning starts; a value at or beyond the
     *        length of {@code str} yields 0
     * @param matchType {@link #MATCHTYPE_MIN} to return right after the first match;
     *        any other value (normally {@link #MATCHTYPE_ALL}) counts every match
     * @return the number of matches found (at most 1 for {@link #MATCHTYPE_MIN})
     * @throws StringIndexOutOfBoundsException if {@code beginIndex} is negative and
     *         {@code str} is not empty
     */
    public static int checkSensitiveWord(String str, int beginIndex, int matchType)
    {
        if (str == null)
        {
            return 0;
        }
        int count = 0;
        for (int i = beginIndex; i < str.length(); i++)
        {
            if (matchEnd(str, i) < 0)
            {
                continue;
            }
            count++;
            if (matchType == MATCHTYPE_MIN)
            {
                return count;
            }
        }
        return count;
    }

    /**
     * Collects the distinct sensitive words found in {@code str}.
     *
     * @param str the text to scan; {@code null} is treated as containing no matches
     * @param beginIndex index at which scanning starts; a value at or beyond the
     *        length of {@code str} yields an empty set
     * @param matchType {@link #MATCHTYPE_MIN} to return right after the first match;
     *        any other value (normally {@link #MATCHTYPE_ALL}) collects every match
     * @return a new mutable set with the matched words; empty if there are none
     * @throws StringIndexOutOfBoundsException if {@code beginIndex} is negative and
     *         {@code str} is not empty
     */
    public static Set<String> firstSensitiveWord(String str, int beginIndex, int matchType)
    {
        Set<String> found = new HashSet<>();
        if (str == null)
        {
            return found;
        }
        for (int i = beginIndex; i < str.length(); i++)
        {
            int end = matchEnd(str, i);
            if (end < 0)
            {
                continue;
            }
            found.add(str.substring(i, end));
            if (matchType == MATCHTYPE_MIN)
            {
                return found;
            }
        }
        return found;
    }

    /**
     * Small demo: prints the number of sensitive words found in a sample string.
     *
     * @param args unused
     */
    public static void main(String[] args)
    {
        System.out.println(checkSensitiveWord("日本來啊日本人", 0, MATCHTYPE_ALL));
    }
}
以敏感詞「日本男人」、「日本人」、「法輪功」為例,程式碼中 sensitiveWordMap 的結構為 {日={本={男={人={isEnd=1}, isEnd=0}, 人={isEnd=1}, isEnd=0}, isEnd=0}, 法={isEnd=0, 輪={isEnd=0, 功={isEnd=1}}}}