1. 程式人生 > >AC自動機1——適用於utf-8編碼的Trie樹

AC自動機1——適用於utf-8編碼的Trie樹

最近需要用到文字的拼音相似度計算,看了hankcs大神的HanLP,裡面通過AC自動機實現拼音的儲存,想把它轉成Python版本。開始啃AC自動機吧。

AC自動機建立在Trie樹和KMP字串匹配演算法的基礎之上,所以首先啃Trie樹。

我所要做的是把utf-8編碼的中文詞和拼音對應起來。UTF-8編碼通常將一個漢字編碼成3個byte(少數罕用字為4個byte),每個byte的取值範圍是0x00~0xFF,共256種可能。鑑於這種情況,需要構造一個256叉的Trie,即每個節點最多可能有256個子節點。

看了幾個程式後,集眾人智慧,寫了一個自己的。

# coding:utf-8

import sys

# NOTE(review): Python 2-only idiom -- `reload` is not a builtin on Python 3.
# It switches the interpreter-wide default codec to UTF-8, presumably so that
# implicit str <-> unicode conversions elsewhere in this file do not fail on
# non-ASCII text. Confirm before porting; explicit encode/decode calls at the
# I/O boundary are the usual replacement.
reload(sys)
sys.setdefaultencoding("utf-8")

class TrieNode(object):
    """A single node of the byte-keyed trie."""

    def __init__(self):
        # A node starts out as a plain intermediate node: it terminates no
        # word and therefore carries no payload.
        self.is_word, self.value = False, None
        # Children, keyed by the next byte of the key.
        self.one_byte = dict()


class Trie256(object):
    """Byte-keyed trie that maps UTF-8 encoded words to a stored value.

    Keys are iterables of byte values (typically the ``bytearray`` returned
    by :meth:`getUtf8String`), so a node can have up to 256 children.

    Diagnostics are written with single-argument ``print(...)`` calls, which
    behave the same on Python 2 and Python 3.
    """

    def __init__(self):
        self.root = TrieNode()

    def getUtf8String(self, string):
        """Return the UTF-8 bytes of *string* as a ``bytearray``.

        Text is encoded as UTF-8. Input that is already a byte string or a
        ``bytearray`` is copied as-is instead of being pushed through an
        implicit decode/encode round trip.
        """
        # `type(b"")` is used rather than the name `bytes` so that a
        # module-level variable called `bytes` cannot shadow this check.
        if isinstance(string, (type(b""), bytearray)):
            return bytearray(string)
        return bytearray(string.encode("utf-8"))

    def insert(self, bytes_array, str):
        """Store *str* as the value of the key *bytes_array*.

        Missing nodes along the path are created; an existing value for the
        same key is overwritten.
        """
        node = self.root
        for byte in bytes_array:
            child = node.one_byte.get(byte)
            if child is None:
                child = node.one_byte[byte] = TrieNode()
            node = child
        node.is_word = True
        node.value = str

    def find(self, bytes_array):
        """Return the value stored for *bytes_array*, or ``None``.

        A message is printed when the path does not exist or when it exists
        only as a prefix of longer keys.
        """
        node = self.root
        for byte in bytes_array:
            child = node.one_byte.get(byte)
            if child is None:
                print("No this word in this Trie.")
                return None
            node = child
        if not node.is_word:
            print("It is not a word.")
            return None
        return node.value

    def modify(self, bytes_array, str):
        """Set the value of *bytes_array* to *str*, inserting it if needed.

        Behaves like :meth:`insert` but prints what had to be done: one
        message per node that had to be created, then whether the key was
        newly marked as a word or an existing value was replaced.
        """
        node = self.root
        for byte in bytes_array:
            child = node.one_byte.get(byte)
            if child is None:
                print("This word is not in this Trie, we will insert it.")
                child = node.one_byte[byte] = TrieNode()
            node = child
        if not node.is_word:
            print("This word is not a word in this Trie, we will make it a word.")
            node.is_word = True
        else:
            print("modify this word...")
        node.value = str

    def delete(self, bytes_array):
        """Remove the key *bytes_array* from the trie, if it is stored.

        Nothing is modified when the key is absent. After a successful
        removal, nodes that no longer lead to any word are pruned.
        """
        node = self.root
        # (parent, byte) pairs for every edge walked, used for pruning.
        path = []
        for byte in bytes_array:
            child = node.one_byte.get(byte)
            if child is None:
                print("This word is not in this Trie.")
                # Stop here: falling through would act on the longest
                # existing prefix and could delete a different word.
                return
            path.append((node, byte))
            node = child
        if not node.is_word:
            print("It is not a word.")
            return
        node.is_word = False
        node.value = None
        # Walk back towards the root, dropping nodes that have no children
        # and terminate no word. The root itself is never removed.
        while path and not node.one_byte and not node.is_word:
            parent, byte = path.pop()
            del parent.one_byte[byte]
            node = parent

    def print_item(self, p, indent=0):
        """Print the subtree below node *p*, one key per line.

        Each line is indented with *indent* tabs plus one tab per level of
        depth and has the form ``'<key>' : {``. No closing brace is printed.
        """
        if not p:
            return
        ind = '\t' * indent
        for key in p.one_byte:
            label = "'%s' : " % key
            print(ind + label + '{')
            self.print_item(p.one_byte[key], indent + 1)


if __name__ == "__main__":
    # Demo: load "word=pinyin" pairs into the trie, dump the trie, then look
    # up one word.
    # io.open decodes the dictionary as UTF-8 explicitly on both Python 2
    # and Python 3, instead of depending on the platform/default codec.
    import io

    trie = Trie256()

    with io.open("dictionary/pinyin.txt", 'r', encoding="utf-8") as fd:
        for line in fd:
            # Everything after the first '=' is the value.
            word, sep, pinyin = line.partition('=')
            if not sep:
                # Blank or malformed line (no '='): nothing to insert.
                continue
            word_bytes = trie.getUtf8String(word)
            # Debug dump of the key: decimal byte values, each prefixed 'x'.
            print(''.join('x' + str(byte) for byte in word_bytes))
            trie.insert(word_bytes, pinyin.strip())

    trie.print_item(trie.root)

    query = trie.getUtf8String(u"一分鐘")
    for byte in query:
        print(byte)
    print(trie.find(query))