gensim 英文文字相似度
阿新 • • 發佈:2019-01-24
# -*- coding: utf-8 -*-
# __jiahuiyu__
"""
對英文的處理
"""
import logging
from gensim import models, similarities, corpora
from collections import defaultdict
import os
# Emit log records with timestamp and level at INFO verbosity.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Stop words that are dropped from every document.
stoplist = set('for a of the and to in' .split())
# Stand-alone English punctuation / whitespace tokens that are dropped as well.
punctions = [' ', '\n', '\t', ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']

# Read the corpus file, one document per line. The `with` block closes the
# file handle even if reading fails (it was previously left open).
with open('E:/descfile/en_test/en_text.txt', 'r') as documents:
    lines = documents.readlines()
print(lines)

# Lower-case and whitespace-tokenise every line, keeping a token only when it
# is neither a stop word nor a punctuation token.
# The earlier condition `word not in stoplist and punctions` merely tested the
# truthiness of the (non-empty) punctuation list, so nothing was ever filtered
# on punctuation.
# NOTE: punctuation glued to a word (e.g. "self.") stays part of that token;
# only tokens that consist solely of one listed symbol are removed.
texts = [[word for word in document.lower().split()
          if word not in stoplist and word not in punctions]
         for document in lines]
print(texts)
# Token statistics: count how many times each token occurs over all documents.
frequency = defaultdict(int)
for tok in (t for doc in texts for t in doc):
    frequency[tok] += 1

# Per document, keep only the tokens that occur more than once in the corpus.
texts1 = []
for doc in texts:
    texts1.append([tok for tok in doc if frequency[tok] > 1])
print(texts1)
# Build the gensim dictionary from the tokenised documents and persist it.
# NOTE(review): this consumes the unfiltered `texts`; the frequency-filtered
# `texts1` is not used anywhere in the visible script -- confirm which one was
# intended before relying on the results.
dictionary = corpora.Dictionary(texts)
dictionary.save('e:/descfile/en_test/desc_en.dict')

# Convert every document to a bag-of-words vector and persist the corpus in
# Matrix Market format.
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('e:/descfile/en_test/desc_en.mm', corpus)
print(corpus)

# Reload the persisted dictionary and corpus.
# Both files are required here; previously only the .dict file was checked
# although the .mm file is loaded as well. When either is missing, the
# in-memory `dictionary` and `corpus` built above stay in effect.
if (os.path.exists('e:/descfile/en_test/desc_en.dict')
        and os.path.exists('e:/descfile/en_test/desc_en.mm')):
    dictionary = corpora.Dictionary.load('e:/descfile/en_test/desc_en.dict')
    corpus = corpora.MmCorpus('e:/descfile/en_test/desc_en.mm')
    print('used english files generated')
else:
    print('please generate the files again!')
# Build the models: TF-IDF weighting on top of the bag-of-words corpus.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# Train LSI on the TF-IDF corpus and project the corpus into LSI space.
lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
corpus_lsi = lsi_model[corpus_tfidf]

# Index the documents in the space the LSI model was trained in
# (BoW -> TF-IDF -> LSI). The index used to be built from `lsi_model[corpus]`,
# i.e. raw bag-of-words counts were fed to a model fitted on TF-IDF weights.
index = similarities.MatrixSimilarity(corpus_lsi)

# To inspect the projected corpus:
#     for c in corpus_lsi:
#         print(c)

# Sample English query string.
en_str = 'There is nothing noble in being superior to some other man.The true nobility is being supior to your previous self.'
# Tokens that are not in the dictionary are ignored by doc2bow.
en_str_vec = dictionary.doc2bow(en_str.lower().split())
print(en_str_vec)

# Send the query through the same BoW -> TF-IDF -> LSI chain as the indexed
# documents so that both live in the same vector space.
lsi_str_vec1 = lsi_model[tfidf[en_str_vec]]
print(lsi_str_vec1)

# Similarity of the query against every indexed document.
sims = index[lsi_str_vec1]
print(list(enumerate(sims)))

# (document position, similarity) pairs ordered from most to least similar.
simsorted = sorted(enumerate(sims), key=lambda item: -item[1])
print(simsorted)