機械匹配詞表最大化分詞
阿新 • • 發佈:2018-12-23
2017-05-18
分詞程式碼
# -*- coding:utf-8 -*-
# Simple forward-maximum-matching mechanical word segmenter with Chinese support.
import string
# Word index filled by load_dict(): maps a first character to the list of
# dictionary words that start with it (sorted longest first by load_dict()).
__dict = {}
def load_dict(dict_file='words.dic', encoding='utf-8'):
    """Load a word list into the module-level first-character index.

    The first whitespace-separated field of every non-blank line is taken
    as a word and filed in ``__dict`` under its first character. After
    loading, each bucket is de-duplicated and ordered longest word first,
    so scanning a bucket from the front yields the maximum match.

    Calling this function repeatedly (or with several files) merges the
    words into the same index without piling up duplicates.

    Args:
        dict_file: Path of the word list, one entry per line.
        encoding: Text encoding of the word list.
    """
    # ``with`` guarantees the file handle is released even on errors.
    with open(dict_file, encoding=encoding) as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                # Blank or whitespace-only line: nothing to index.
                continue
            word = fields[0]
            __dict.setdefault(word[0], []).append(word)
    # Order every bucket by descending word length. dict.fromkeys drops
    # repeated words while keeping first-seen order, and sorted() is stable,
    # so equal-length words keep their original relative order.
    for first_char, bucket in __dict.items():
        unique_words = list(dict.fromkeys(bucket))
        __dict[first_char] = sorted(unique_words, key=len, reverse=True)
def __match_ascii(i, input):
    """Return the run of printable ASCII characters starting at index ``i``.

    English letters, digits, punctuation and ASCII whitespace are not
    segmented any further: the whole consecutive run is collected and
    returned as one piece, with surrounding whitespace stripped.

    Note that because of the strip, the returned string can be shorter
    than the run that was scanned (and is empty for an all-whitespace run).

    Args:
        i: Index in ``input`` at which scanning starts.
        input: The text being scanned.

    Returns:
        The stripped run; ``''`` when no printable character is at ``i``.
    """
    collected = []
    for position in range(i, len(input)):
        current = input[position]
        if current not in string.printable:
            # First non-ASCII-printable character ends the run.
            break
        collected.append(current)
    return ''.join(collected).strip()
def __match_word(first_char, i, input):
    """Return the token that starts at position ``i`` of ``input``.

    Characters that have dictionary entries are matched against the
    longest-first word list for that character. Characters without
    entries are handled as follows: ASCII whitespace is returned as a
    single-character token, other printable ASCII is read as one
    consecutive run via ``__match_ascii``, and anything else is returned
    as a single-character token.

    The returned token is never empty and always begins at ``input[i]``,
    so a caller may safely advance by ``len(token)``.

    Args:
        first_char: The character at ``input[i]``.
        i: Index of ``first_char`` in ``input``.
        input: The text being tokenized.

    Returns:
        The matched token.
    """
    candidates = __dict.get(first_char)
    if not candidates:
        if first_char in string.whitespace:
            # __match_ascii strips whitespace, so starting a run on a
            # whitespace character would yield a token that is empty or
            # that does not start at ``i``; emit the character itself.
            return first_char
        if first_char in string.printable:
            return __match_ascii(i, input)
        return first_char
    # Candidates are ordered longest first, so the first hit is the
    # maximum match.
    for word in candidates:
        if input.startswith(word, i):
            return word
    return first_char
def tokenize(input):
    """Split ``input`` into tokens by forward maximum matching.

    The text is scanned left to right. ASCII whitespace between tokens is
    skipped; at every other position the matcher is asked for the token
    starting there and the cursor advances past it.

    Args:
        input: The text to segment.

    Returns:
        A list of token strings; ``[]`` for empty input.
    """
    if not input:
        return []
    tokens = []
    i = 0
    length = len(input)
    while i < length:
        first_char = input[i]
        if first_char in string.whitespace:
            # The ASCII matcher strips whitespace from its result, so a
            # token begun on whitespace would be shorter than the text
            # consumed (or empty), misaligning or stalling the cursor.
            i += 1
            continue
        matched_word = __match_word(first_char, i, input)
        if matched_word:
            tokens.append(matched_word)
        # Always move forward, even if the matcher returned nothing.
        i += max(len(matched_word), 1)
    return tokens
if __name__ == '__main__':
def get_test_text():
    """Download a sample web page and return its body decoded as UTF-8."""
    import requests
    url = "http://www.zhb.gov.cn/xxgk/gzdt/201703/t20170321_408538.shtml"
    # The second positional argument of requests.get is ``params`` (the
    # query string), not an encoding, so 'utf8' must not be passed there.
    # A timeout keeps the call from hanging forever on a dead server.
    response = requests.get(url, timeout=10)
    return response.content.decode('utf8')
def load_dict_test():
    """Load the default dictionary and print its first eleven buckets."""
    load_dict()
    for index, (first_char, words) in enumerate(__dict.items()):
        if index > 10:
            # Only indices 0..10 are shown.
            break
        print('%d. %s:%s' % (index, first_char, ' '.join(words)))
def tokenize_test(text):
    """Load the default dictionary, tokenize ``text`` and print each token
    on its own line."""
    load_dict()
    for piece in tokenize(text):
        print(piece)
#load_dict_test()
tokenize_test('美麗的花園裡有各種各樣的小動物')
tokenize_test('他購買了一盒Rosetta Stone品牌的SHA-PA型號24/6的訂書釘,總價¥24.3元.')
tokenize_test('1949年10月1日,毛主席站在天安門城樓上莊嚴宣佈:中華人民共和國中央人民政府成立了!');
tokenize_test('A Happy New Yeear and a Merry Christmas