程式人生 > Trie樹實現詞頻統計與查詢

Trie樹實現詞頻統計與查詢

#encoding:utf-8
from collections import defaultdict
import sys
# Python 2-only hack: make 'utf8' the interpreter-wide default codec used for
# implicit str <-> unicode conversions (the script below relies on it when it
# writes unicode text to a plain file object).
# `sys` is reloaded first because, in Python 2, `setdefaultencoding` is removed
# from the module at startup and only a fresh reload exposes it again.
# NOTE(review): neither the `reload` builtin nor `sys.setdefaultencoding`
# exists in Python 3 -- these two lines must be dropped when porting.
reload(sys) 
sys.setdefaultencoding('utf8') 
class LBTrie(object):
    """A simple dict-based trie (prefix tree).

    Every node is a ``dict`` mapping a single character to its child node.
    A node that terminates a stored word additionally carries the key ``''``
    (with value ``''``) as an end-of-word marker.

    Attributes:
        trie: the root node.
        size: number of distinct words currently stored.
    """

    def __init__(self):
        self.trie = {}
        self.size = 0

    def add(self, word):
        """Insert *word* into the trie.

        Leading and trailing whitespace is stripped before insertion.

        Args:
            word: the text to insert.

        Returns:
            True if the word was not stored before and has now been added;
            False if it was already present or is empty after stripping.
        """
        word = word.strip()
        if not word:
            # Nothing to insert; never put an end marker on the root.
            return False

        node = self.trie
        for ch in word:
            node = node.setdefault(ch, {})

        if '' in node:
            # End marker already set: this exact word was added earlier.
            return False

        # The '' key marks "a word ends here"; it cannot clash with a real
        # character because iterating a string never yields ''.
        node[''] = ''
        self.size += 1
        return True

    def search(self, word):
        """Return True if *word* was previously added as a whole word.

        The query is stripped the same way ``add`` strips its input, so
        surrounding whitespace never causes a false miss.  A mere prefix of
        a stored word does not count as a match.
        """
        node = self.trie
        for ch in word.strip():
            if ch not in node:
                return False
            node = node[ch]
        # Only a node carrying the end marker terminates a stored word.
        return '' in node

    def output(self):
        """Print the whole trie to stdout, one nested level per tab stop."""
        self.__print_item(self.trie)

    def __print_item(self, p, indent=0):
        """Recursively print node *p*, indented by *indent* tab characters."""
        if not p:
            # Empty dict, or the '' value stored under an end marker.
            return
        ind = '\t' * indent
        for key, child in p.items():
            label = "'%s' : " % key
            print(ind + label + '{')
            self.__print_item(child, indent + 1)
            # Align the closing brace under the opening one.
            print(ind + ' ' * len(label) + '}')


def codeutil(strs):
    """Return *strs* as text restricted to characters representable in GBK.

    Byte strings are decoded as UTF-8 first (undecodable bytes are dropped);
    text is used as is.  Characters that cannot be encoded in GBK are then
    silently removed by a GBK encode/decode round trip.
    """
    if isinstance(strs, bytes):
        strs = strs.decode('utf8', 'ignore')
    return strs.encode('GBK', 'ignore').decode('GBK', 'ignore')


if __name__ == '__main__':
    trie_obj = LBTrie()
    countdic = defaultdict(int)

    # Build the trie and count every occurrence of each word in the corpus.
    with open('content.txt', 'r') as corpus:
        for record in corpus:
            for word in record.split(' '):
                # Normalise exactly like the trie does so that "word" and
                # "word\n" are counted under the same key.
                word = codeutil(word).strip()
                if not word:
                    continue
                trie_obj.add(word)
                countdic[word] += 1

    # Most frequent words first.
    resortedcountdic = sorted(countdic.items(),
                              key=lambda item: item[1],
                              reverse=True)

    # Output format is kept as in the original script: a single line of
    # tab-separated "word<TAB>count<TAB>" pairs.
    with open('tree.txt', 'w+') as tree:
        for word, count in resortedcountdic:
            tree.write(word + '\t' + str(count) + '\t')

    # Look up a sample word.
    if trie_obj.search(codeutil('氨基酸')):
        print('Yes')
    else:
        print('No')