【python 走進NLP】英文敏感詞過濾演算法改進版本
阿新 • • 發佈:2018-12-18
中文DFA演算法過濾敏感詞改進版本
# Improved DFA (trie-based) sensitive-word detector for Chinese text.
class Chinese_DFAFilter():
    """Detect sensitive keywords in text using a character trie.

    Keywords are stored in ``keyword_chains`` as nested dicts, one level per
    character. A node that completes a keyword carries the ``delimit`` key.
    All matching is case-insensitive because both keywords and messages are
    lower-cased before use.
    """

    def __init__(self):
        # Root of the trie: {char: {char: {... {delimit: 0}}}}
        self.keyword_chains = {}
        # Marker key stored in a node to flag "a keyword ends here".
        self.delimit = '\x00'

    def add(self, keyword):
        """Insert one keyword into the trie.

        The keyword is lower-cased and stripped of surrounding whitespace;
        an empty result is ignored.

        :param keyword: the sensitive word to register
        """
        chars = keyword.lower().strip()
        if not chars:
            return
        node = self.keyword_chains
        for char in chars:
            # Reuse the existing branch or create a new one.
            node = node.setdefault(char, {})
        node[self.delimit] = 0

    def parse(self, data):
        """Load keywords from ``data['lable']``.

        Each item is converted with ``str`` and stripped before being added.

        :param data: a mapping-like object whose ``'lable'`` entry is an
            iterable of keywords
        """
        for keyword in data['lable']:
            self.add(str(keyword).strip())

    def filter(self, message, repl="*"):
        """Return the sensitive keywords found in ``message``.

        The message is lower-cased and scanned left to right. From each start
        position the trie is followed until the first node that completes a
        keyword, so the shortest keyword starting there is the one reported.
        Scanning resumes right after a reported match, so matches never
        overlap.

        :param message: the text to scan
        :param repl: kept for backward compatibility; it does not influence
            the returned value
        :return: list of matched substrings (lower-cased), in order of
            appearance
        """
        message = message.lower()
        length = len(message)
        hit_word = []
        start = 0
        while start < length:
            node = self.keyword_chains
            for end in range(start, length):
                child = node.get(message[end])
                if child is None:
                    # No keyword continues with this character.
                    break
                if self.delimit in child:
                    hit_word.append(message[start:end + 1])
                    # Jump to the end of the match; the increment below
                    # moves on to the character after it.
                    start = end
                    break
                node = child
            start += 1
        return hit_word
英文DFA演算法過濾敏感詞改進版本
# DFA (trie-based) sensitive-word detector for English text.
class English_DFAFilter():
    """Detect sensitive keywords in space-separated English text.

    Keywords are stored in ``keyword_chains`` as nested dicts, one level per
    character. A node that completes a keyword carries the ``delimit`` key.
    Matching is case-insensitive. A match is reported only when it ends on
    the last character of a word, where words are separated by a single
    space character (``' '``).
    """

    def __init__(self):
        # Root of the trie: {char: {char: {... {delimit: 0}}}}
        self.keyword_chains = {}
        # Marker key stored in a node to flag "a keyword ends here".
        self.delimit = '\x00'

    def find_english_word_last_index(self, message):
        """Return the index of the last character of every word.

        For each space at index ``i`` the value ``i - 1`` is recorded, and
        ``len(message) - 1`` is always appended for the final word.

        :param message: the English sentence
        :return: list of indices, in ascending order of appearance
        """
        last_index_list = [
            index - 1 for index, char in enumerate(message) if char == ' '
        ]
        last_index_list.append(len(message) - 1)
        return last_index_list

    def add(self, keyword):
        """Insert one keyword into the trie.

        The keyword is lower-cased and stripped of surrounding whitespace;
        an empty result is ignored.

        :param keyword: the sensitive word to register
        """
        chars = keyword.lower().strip()
        if not chars:
            return
        node = self.keyword_chains
        for char in chars:
            # Reuse the existing branch or create a new one.
            node = node.setdefault(char, {})
        node[self.delimit] = 0

    def parse(self, data):
        """Load keywords from ``data['lable']``.

        Each item is converted with ``str`` and stripped before being added.

        :param data: a mapping-like object whose ``'lable'`` entry is an
            iterable of keywords
        """
        for keyword in data['lable']:
            self.add(str(keyword).strip())

    def filter(self, message, repl="*"):
        """Return the sensitive keywords found in ``message``.

        The message is lower-cased and scanned left to right. From each start
        position the trie is followed until the first node that completes a
        keyword. That match is reported only if it ends on a word's last
        character; either way, scanning resumes right after the matched span.

        NOTE: only the end of a match is checked against word boundaries;
        the start is not, so a keyword that is the tail of a longer word is
        still reported.

        :param message: the text to scan
        :param repl: kept for backward compatibility; it does not influence
            the returned value
        :return: list of matched substrings (lower-cased), in order of
            appearance
        """
        message = message.lower()
        length = len(message)
        # Word-end positions depend only on the message, so compute them once.
        word_ends = set(self.find_english_word_last_index(message))
        hit_word = []
        start = 0
        while start < length:
            node = self.keyword_chains
            for end in range(start, length):
                child = node.get(message[end])
                if child is None:
                    # No keyword continues with this character.
                    break
                if self.delimit in child:
                    if end in word_ends:
                        hit_word.append(message[start:end + 1])
                    # Jump to the end of the matched span; the increment
                    # below moves on to the character after it.
                    start = end
                    break
                node = child
            start += 1
        return hit_word