1. 程式人生 > 其它 >word2vec訓練模型實現文字轉換詞向量

word2vec訓練模型實現文字轉換詞向量

利用 Word2Vec 將文字分詞後轉換成詞向量:先以 jieba 對原始文字分詞,再用 gensim 的 Word2Vec 訓練模型。

 1 import re
 2 import jieba
 3 from gensim.models import Word2Vec, word2vec
 4 
 5 
 6 def tokenize():
 7     """
 8     分詞
 9     :return:
10     """
11     f_input = open('166893.txt', 'r', encoding='utf-8')
12     f_output = open('yttlj.txt', 'w', encoding='utf-8')
13     line = f_input.readline()
14 while line: 15 newline = jieba.cut(line, cut_all=False) 16 newline = ' '.join(newline) 17 fileters = ['', '', '', '!', '', '"', '#', '$', '%', '&', '\(', '\)', '\*', '\+', ',', '-', '\.', '/', ':', ';', '<', '=', '>', '\?', '@' 18 , '\[', '\\', '
\]', '^', '_', '`', '\{', '\|', '\}', '~', '', '', ''] 19 newline = re.sub("<.*?>", " ", newline, flags=re.S) 20 newline = re.sub("|".join(fileters), " ", newline, flags=re.S) 21 f_output.write(newline) 22 print(newline) 23 line = f_input.readline() 24
f_input.close() 25 f_output.close() 26 27 28 def train_model(): 29 """ 30 訓練模型 31 :return: 32 """ 33 model_file_name = 'model_yt.txt' 34 sentences = word2vec.LineSentence('yttlj.txt') 35 model = word2vec.Word2Vec(sentences, window=5, min_count=5, workers=4, vector_size=300) 36 model.save(model_file_name) 37 38 39 def test(): 40 """ 41 測試 42 :return: 43 """ 44 model = Word2Vec.load('model_yt.txt') 45 print(model.wv.similarity('趙敏', '趙敏')) 46 print(model.wv.similarity('趙敏', '周芷若')) 47 for k in model.wv.most_similar('趙敏', topn=10): 48 print(k[0], k[1]) 49 50 51 if __name__ == '__main__': 52 test()
View Code