Trie樹實現詞頻統計與查詢
阿新 • • 發佈:2018-12-29
#encoding:utf-8
from collections import defaultdict
import sys
# Python 2 only: re-import ``sys`` so that ``setdefaultencoding`` can be called,
# then make UTF-8 the process-wide default for implicit str <-> unicode
# conversions performed later in this script.
# NOTE(review): ``reload`` is not a builtin on Python 3 and
# ``sys.setdefaultencoding`` does not exist there -- confirm the target
# interpreter before reusing this file.
reload(sys)
sys.setdefaultencoding('utf8')
class LBTrie:
    """Simple trie (prefix tree) built from nested dictionaries.

    Every node is a ``dict`` that maps one character to its child node.
    A node that terminates a stored word additionally carries the key ``''``
    (empty string) as an end-of-word marker.

    Attributes:
        trie: root node of the tree.
        size: number of distinct words currently stored.
    """

    def __init__(self):
        self.trie = {}
        self.size = 0

    def add(self, word):
        """Insert ``word`` after stripping surrounding whitespace.

        Returns:
            True if the word was not stored before and has now been added;
            False if it was already present or is empty after stripping.
        """
        word = word.strip()
        if not word:
            # Nothing to store; never put the end marker on the root node.
            return False
        node = self.trie
        for ch in word:
            node = node.setdefault(ch, {})
        if '' in node:
            # End-of-word marker already present: the word is a duplicate.
            return False
        # The key '' marks "a word ends at this node".
        node[''] = ''
        self.size += 1
        return True

    def search(self, word):
        """Return True if ``word`` (whitespace-stripped) was added as a whole word.

        The query is normalised with ``strip()`` exactly like ``add`` does, so
        the same raw token always maps to the same stored entry.
        """
        node = self.trie
        for ch in word.strip():
            if ch not in node:
                return False
            node = node[ch]
        # A mere prefix of a stored word has no end-of-word marker.
        return '' in node

    def output(self):
        """Print the whole trie to standard output once; returns None."""
        self.__print_item(self.trie)

    def __print_item(self, p, indent=0):
        """Recursively print node ``p``; ``indent`` is the depth in tabs."""
        if p:
            ind = '\t' * indent
            for key in p:
                label = "'%s' : " % key
                print(ind + label + '{')
                self.__print_item(p[key], indent + 1)
                # Align the closing brace under the opening one.
                print(ind + ' ' * len(label) + '}')
def codeutil(strs):
    """Return ``strs`` as text restricted to characters representable in GBK.

    Byte strings are decoded as UTF-8 (undecodable bytes are dropped); text
    input is used as is. The text is then round-tripped through GBK with the
    ``'ignore'`` error handler, which silently removes every character that
    GBK cannot encode.

    Args:
        strs: UTF-8 encoded byte string, or an already decoded text string.

    Returns:
        The decoded text without characters outside the GBK repertoire.
    """
    # Only decode real byte strings: calling .decode() on text would depend
    # on the interpreter's implicit default encoding (and does not exist on
    # Python 3 ``str``).
    if isinstance(strs, bytes):
        strs = strs.decode('utf8', 'ignore')
    return strs.encode('GBK', 'ignore').decode('GBK', 'ignore')
if __name__ == '__main__':
    # Script entry point: index every space-separated token of content.txt in
    # a trie, write the token frequencies (most frequent first) to tree.txt,
    # then look up one sample word.
    trie_obj = LBTrie()
    countdic = defaultdict(int)

    # Add the words. Iterating the file object streams it line by line, and
    # the ``with`` block guarantees the handle is closed.
    with open('content.txt', 'r') as corpus:
        for record in corpus:
            for word in record.split(' '):
                # Normalise the token so that "foo" and "foo\n" share one key.
                word = word.strip()
                if not word:
                    continue
                trie_obj.add(codeutil(word))
                # Count every occurrence; the trie only knows whether a word
                # exists, not how often it was seen.
                countdic[word] += 1

    resortedcountdic = sorted(countdic.items(), key=lambda item: item[1], reverse=True)

    # Output format: word<TAB>count<TAB> repeated for each word.
    with open('tree.txt', 'w+') as tree:
        for tup in resortedcountdic:
            tree.write(codeutil(tup[0]) + '\t' + str(tup[1]) + '\t')

    # Look up a word.
    if trie_obj.search(codeutil('氨基酸')):
        print('Yes')
    else:
        print('No')