Java過濾敏感詞彙演算法(字典樹)
阿新 • • 發佈:2018-12-17
定義節點
import java.util.ArrayList; import java.util.List; public class Node { private char root; private List<Node> childList; private boolean isLeaf; public Node(char root){ this.root=root; childList=new ArrayList<Node>(); isLeaf=false; } public Node subNode(char c){ if(childList != null){ for(Node eachChild : childList){ if(eachChild.root == c){ return eachChild; } } } return null; } public boolean isLeaf() { return isLeaf; } public void setLeaf(boolean leaf) { isLeaf = leaf; } public void addChild(Node node){ this.childList.add(node); } }
定義字典樹
public class WordsTree { private Node head; public WordsTree() { this.head = new Node(' ');//頭結點 } public void insert(String word) { if(search(word).equals(word)) return;//檢測樹中是否存在此詞 Node node = head; for (int i = 0; i < word.length(); i++) { Node child = node.subNode(word.charAt(i)); if (child != null) { node = child; } else { node.addChild(new Node(word.charAt(i))); node = node.subNode(word.charAt(i)); } } node.setLeaf(true); } public String search(String word) { Node node = this.head; String str = ""; for (int i = 0; i < word.length(); i++) { if (node.isLeaf()) return str; //在這新增防止文章中的敏感詞彙被空格(可以加其他字元)隔開,無法識別 if (word.charAt(i) == ' ') { str += word.charAt(i); continue; } if (node.subNode(word.charAt(i)) == null) return ""; node = node.subNode(word.charAt(i)); str += word.charAt(i); } if (node.isLeaf() == true) return str; else return ""; } public String searchComment(String comment) { String str = ""; String comment1 = comment; for (int i = 0; i < comment.length(); i++) { str = this.search(comment.substring(i)); if (!str.equals("")) { comment1 = comment1.replaceAll(str, "\\*\\*"); } i += str.length(); } return comment1; } }
呼叫
/**
 * Demo: registers two sensitive words, filters a sample sentence and prints
 * the filtered text to standard output.
 */
public class main {
    public static void main(String[] args) {
        WordsTree filter = new WordsTree();
        for (String word : new String[] {"鯉魚", "他家"}) {
            filter.insert(word);
        }

        String text = "紅鯉魚家有頭小綠驢叫李屢屢,綠鯉 魚家有頭小紅驢叫呂裡裡,紅鯉1魚說他家的李屢屢要比綠鯉魚家的呂裡裡綠,綠鯉魚說他家的呂裡裡要比紅鯉魚家的李屢屢紅,是紅鯉魚比綠鯉魚的驢綠,還是綠鯉魚比紅鯉魚的驢紅。";
        String filtered = filter.searchComment(text);
        System.out.println(filtered);
    }
}
結果:
紅**家有頭小綠驢叫李屢屢,綠**家有頭小紅驢叫呂裡裡,紅鯉1魚說**的李屢屢要比綠**家的呂裡裡綠,綠**說**的呂裡裡要比紅**家的李屢屢紅,是紅**比綠**的驢綠,還是綠**比紅**的驢紅。
可以看出,用空格隔開的“鯉魚”二字仍然被識別出來了,而用“1”隔開的“鯉魚”二字沒有被識別。網上聊天的敏感詞過濾通常也是這樣識別的:當一些詞語無法傳送時,用過濾器沒有處理的常見字元把它隔開,就可以傳送了。另外,“他家”兩個字也可以檢測出來。