AC自動機1——適用於utf-8編碼的Trie樹
阿新 • • 發佈:2019-02-07
最近需要用到文字的拼音相似度計算,看了hankcs大神的hanlp裡面通過ac自動機實現拼音的儲存,想把它轉成python版本的。開始啃AC自動機吧。
AC自動機建立在Trie樹和KMP字串匹配演算法的基礎之上,所以首先啃Trie樹。
我所要做的是把utf-8編碼的中文詞和拼音對應起來。UTF-8編碼通常將一個漢字編碼成3個byte,每個byte的取值範圍是0~255(即16進位制的0x00~0xFF)。鑑於這種情況,需要構造一個256叉的Trie,即每個節點最多可能有256個子節點。
看了幾個程式後,集眾人智慧,寫了一個自己的。
# coding:utf-8
"""Byte-level (256-way) trie keyed by the UTF-8 bytes of a word.

Each edge of the trie is labelled with one byte, so a node can have up to
256 children.  The script entry point loads ``word=pinyin`` lines from a
dictionary file, stores them in the trie and looks one word up.

The code runs on both Python 2 and Python 3.
"""
import io
import sys

if sys.version_info[0] == 2:
    # Python 2 only: the original script forced the process-wide default
    # encoding to UTF-8 so implicit str<->unicode conversions work.  Kept for
    # backward compatibility; ``reload`` is not a builtin on Python 3, hence
    # the version guard.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding("utf-8")


class TrieNode(object):
    """A single trie node; every outgoing edge is labelled with one byte."""

    def __init__(self):
        # Children keyed by byte value (ints when keys come from a bytearray).
        self.one_byte = {}
        # Payload of the word ending at this node; stays None until one is set.
        self.value = None
        # True only when a stored word ends exactly at this node.
        self.is_word = False


class Trie256(object):
    """Trie whose keys are byte sequences and whose values are arbitrary."""

    def __init__(self):
        self.root = TrieNode()

    def getUtf8String(self, string):
        """Return the UTF-8 bytes of *string* as a ``bytearray``.

        Text is encoded as UTF-8.  Byte strings are taken as they are (they
        are assumed to be UTF-8 already), so the result no longer depends on
        the interpreter's default encoding.
        """
        if isinstance(string, (bytes, bytearray)):
            return bytearray(string)
        return bytearray(string.encode("utf-8"))

    def insert(self, bytes_array, str):
        """Store *str* as the value of the word spelled by *bytes_array*.

        Missing nodes are created; an existing value is overwritten.
        """
        node = self.root
        for byte in bytes_array:
            child = node.one_byte.get(byte)
            if child is None:
                child = node.one_byte[byte] = TrieNode()
            node = child
        node.is_word = True
        node.value = str

    def find(self, bytes_array):
        """Return the value stored for *bytes_array*, or None if absent.

        A diagnostic message is printed when the path does not exist or when
        it exists only as a prefix of longer words.
        """
        node = self.root
        for byte in bytes_array:
            node = node.one_byte.get(byte)
            if node is None:
                print("No this word in this Trie.")
                return None
        if not node.is_word:
            print("It is not a word.")
            return None
        return node.value

    def modify(self, bytes_array, str):
        """Set the value of *bytes_array* to *str*, inserting it if needed.

        A message is printed for every node that has to be created and for
        the final outcome (newly marked as a word vs. value replaced).
        """
        node = self.root
        for byte in bytes_array:
            child = node.one_byte.get(byte)
            if child is None:
                print("This word is not in this Trie, we will insert it.")
                child = node.one_byte[byte] = TrieNode()
            node = child
        if node.is_word:
            print("modify this word...")
        else:
            print("This word is not a word in this Trie, we will make it a word.")
            node.is_word = True
        node.value = str

    def delete(self, bytes_array):
        """Remove the word spelled by *bytes_array* from the trie.

        Nothing is changed (a message is printed) when the word is not
        stored.  After removal, nodes that no longer lead to any word are
        pruned so the trie does not keep dead branches.
        """
        node = self.root
        path = []  # (parent, byte) for every edge walked, root first
        for byte in bytes_array:
            child = node.one_byte.get(byte)
            if child is None:
                print("This word is not in this Trie.")
                # Stop here: falling through would act on the last node
                # reached and could un-mark an unrelated, shorter word.
                return
            path.append((node, byte))
            node = child
        if not node.is_word:
            print("It is not a word.")
            return
        node.is_word = False
        node.value = None
        # Walk back towards the root dropping nodes that are now useless:
        # no children and not the end of another word.
        for parent, byte in reversed(path):
            child = parent.one_byte[byte]
            if child.one_byte or child.is_word:
                break
            del parent.one_byte[byte]

    def print_item(self, p, indent=0):
        """Print the subtree rooted at *p*, one edge per line.

        Each line shows the byte labelling the edge followed by ``{``, and is
        indented with one tab per level of depth.
        """
        if not p:
            return
        ind = '\t' * indent
        for key in p.one_byte:
            label = "'%s' : " % key
            print(ind + label + '{')
            self.print_item(p.one_byte[key], indent + 1)


def main(dict_path="dictionary/pinyin.txt"):
    """Load ``word=pinyin`` lines from *dict_path* and run a sample lookup."""
    trie = Trie256()
    # io.open gives the same decoded-text behaviour on Python 2 and 3.
    with io.open(dict_path, 'r', encoding="utf-8") as fd:
        for line in fd:
            line_split = line.split('=')
            if len(line_split) < 2:
                # Blank or malformed line: nothing to store.
                continue
            word = line_split[0]
            pinyin = line_split[1].strip()
            key_bytes = trie.getUtf8String(word)
            print(''.join('x' + str(byte) for byte in key_bytes))
            trie.insert(key_bytes, pinyin)
    trie.print_item(trie.root)
    key_bytes = trie.getUtf8String(u"一分鐘")
    for byte in key_bytes:
        print(byte)
    print(trie.find(key_bytes))


if __name__ == "__main__":
    main()