字首樹實現過濾敏感詞
阿新 • • 發佈:2022-03-18
原文:
https://blog.csdn.net/weixin_42700635/article/details/105637764
import org.apache.commons.lang3.CharUtils; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.stereotype.Component; import javax.annotation.PostConstruct; importjava.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.HashMap; import java.util.Map; @Component public class SensitiveFilter { private static final Logger logger = LoggerFactory.getLogger(SensitiveFilter.class);//替換符 private static String REPLACEMENT = "***"; //根節點 private TrieNode root = new TrieNode(); @PostConstruct public void init() { try (InputStream is = this.getClass().getClassLoader().getResourceAsStream("sensitive-words.txt"); BufferedReader reader=new BufferedReader(newInputStreamReader(is)); ) { String keyword; while ((keyword=reader.readLine())!=null){ //新增到字首樹 this.addKeyword(keyword); } } catch (IOException e) { logger.error("載入敏感詞檔案失敗:"+e.getMessage()); } } //將敏感詞新增到字首樹當中 private void addKeyword(String keyword){ TrieNode tempNode=root; for (int i=0;i<keyword.length();i++){ char c=keyword.charAt(i); TrieNode subNode=tempNode.getSubNode(c); if (subNode==null){ //初始化子節點 subNode=new TrieNode(); tempNode.addSubNode(c,subNode); } //指向子節點,進入下一輪迴圈 tempNode=subNode; //設定結束標識 if (i==keyword.length()-1){ tempNode.setKeywordEnd(true); } } } /** * 過濾敏感詞 * @param text 待過濾文字 * @return 過濾後的文字 */ public String filter(String text){ if (StringUtils.isBlank(text)){ return null; } //指標1 TrieNode tempNode=root; //指標2 int begin=0; //指標3 int position=0; //結果 StringBuilder sb=new StringBuilder(); while (position<text.length()){ char c=text.charAt(position); //跳過符號 if (isSymbol(c)){ //若指標1處於根節點,將此符號計入結果,讓指標2向下走一步 if (tempNode==root){ sb.append(c); begin++; } //無論符號在開頭或中間,指標3都向下走一步 position++; continue; } //檢查下級節點 tempNode=tempNode.getSubNode(c); if (tempNode==null){ //以begin開頭的字元不是敏感詞 sb.append(text.charAt(begin)); //進入下一個位置 position=++begin; //重新指向根節點 tempNode=root; }else if (tempNode.isKeywordEnd()){ //發現敏感詞,將begin-position字串替換掉 sb.append(REPLACEMENT); begin=++position; //重新指向根節點 tempNode=root; }else { //檢查下一個字元 position++; } } //將最後一批字元計入結果 sb.append(text.substring(begin)); return sb.toString(); } //判斷是否為符號 private boolean isSymbol(Character c){ // 0x2E80-0x9FFF是東亞文字範圍 return !CharUtils.isAsciiAlphanumeric(c) && (c<0x2E80||c>0x9FFF); } //字首樹 private class TrieNode { //關鍵詞結束標識 private boolean isKeywordEnd = false; //子節點(key是下級字元,value是下級節點) private Map<Character, TrieNode> subNodes = new HashMap<>(); public boolean isKeywordEnd() { return isKeywordEnd; } public void setKeywordEnd(boolean keywordEnd) { isKeywordEnd = keywordEnd; } //新增子節點方法 public void addSubNode(Character key, TrieNode value) { subNodes.put(key, value); } //獲取子節點方法 public TrieNode getSubNode(Character key) { return subNodes.get(key); } } }