TF-IDF比較文字相似度
阿新 • • 發佈:2019-01-27
文字相似度
TF-IDF 演算法
如果某個詞在給定的文件集合(語料庫)中很少出現,但在其中某一篇文章裡出現的次數很多,
那麼該詞在很大程度上反映了該文章的特性,我們稱該詞為這篇文章的關鍵字。
參考連結:http://www.ruanyifeng.com/blog/2013/03/tf-idf.html
餘弦相似性
測試案例
程式碼部分
# -*- coding: UTF-8 -*-
# import codecs
import jieba.posseg as pseg
from gensim import corpora, models, similarities
# from hotelmatcher.constant import *
class Tfidf:
    """Compare text similarity with a TF-IDF model.

    Texts are segmented with jieba's part-of-speech tagger, filtered against
    ``stop_words`` and ``stop_flag``, turned into gensim TF-IDF vectors and
    scored with a gensim ``MatrixSimilarity`` index.
    """

    # Stop words ('hotel', 'inn'): tokens dropped after segmentation.
    stop_words = ['酒店', '旅館']

    # jieba part-of-speech flags whose tokens are dropped after segmentation,
    # in order: punctuation, conjunction, particle, adverb, preposition,
    # time morpheme, the particle '的', numeral, locality word, pronoun.
    stop_flag = ['x', 'c', 'u', 'd', 'p', 't', 'uj', 'm', 'f', 'r']

    def __init__(self):
        """Create a comparer; all configuration lives in class attributes."""

    def text2words(self, text: str) -> list:
        """Segment ``text`` and drop stop words and stop parts of speech.

        :param text: raw text to segment
        :return: the remaining tokens, in segmentation order
        """
        # Build the lookup sets once per call instead of scanning the
        # attribute lists for every token.
        banned_words = set(self.stop_words)
        banned_flags = set(self.stop_flag)
        return [
            word
            for word, flag in pseg.cut(text)
            if word not in banned_words and flag not in banned_flags
        ]

    def similarity_compare(self, compare_doc: str, refer_doc: list) -> tuple:
        """Find the reference document most similar to ``compare_doc``.

        :param compare_doc: the document to match
        :param refer_doc: reference documents (strings) forming the corpus
        :return: ``(index, similarity)`` where ``index`` is the position in
            ``refer_doc`` of the best match; on ties the earliest one wins
        :raises ValueError: if ``refer_doc`` is empty
        """
        if not refer_doc:
            raise ValueError('refer_doc must contain at least one document')

        # Tokenise the corpus. Every reference document keeps its slot so
        # that result indices line up with positions in ``refer_doc``.
        refer_words = []
        placeholder_count = 0
        for refer_text in refer_doc:
            words = self.text2words(refer_text)
            if not words:
                # Nothing survived filtering: stand in a synthetic token.
                # It is built directly instead of being run through the
                # segmenter, so it cannot be split or filtered away.
                placeholder_count += 1
                words = ['placeholder' + str(placeholder_count)]
            refer_words.append(words)

        # Bag-of-words representation of the corpus.
        dictionary = corpora.Dictionary(refer_words)
        doc_vectors = [dictionary.doc2bow(words) for words in refer_words]

        # TF-IDF model of the corpus and a similarity index over it.
        tf_idf = models.TfidfModel(doc_vectors)
        index = similarities.MatrixSimilarity(
            tf_idf[doc_vectors], num_features=len(dictionary))

        # The query must live in the same TF-IDF space as the indexed
        # documents, so weight its bag-of-words with the same model before
        # querying the index.
        compare_bow = dictionary.doc2bow(self.text2words(compare_doc))
        sims = index[tf_idf[compare_bow]]

        # Only the best match is returned, so a single pass with max() is
        # enough; like a stable descending sort it keeps the first of any
        # equally scored documents.
        return max(enumerate(sims), key=lambda pair: pair[1])
if __name__ == '__main__':
    # Demo: report which reference hotel name is closest to the query name.
    query_name = '月亮海灘旅館'

    # Reference hotel name -> list of (hotel id, hotel name) records.
    hotels_by_name = {
        '普吉島斷點酒店': [(1, '普吉島斷點酒店')],
        '月亮海灘酒店': [(10386, '月亮海灘酒店')],
        '月亮海酒店': [(1564, '月亮海酒店')],
        '清萊海灘酒店': [(3467, '清萊艾美度假酒店')]
    }
    candidate_names = list(hotels_by_name)

    matcher = Tfidf()
    best_index, best_score = matcher.similarity_compare(query_name, candidate_names)

    # Resolve the winning position back to its name and its first record's id.
    best_name = candidate_names[best_index]
    hotel_id = hotels_by_name[best_name][0][0]

    template = "測試酒店 '%s' 和參照酒店中的 '%s' 最相似,相似度為 %f,對應酒店ID為:%s"
    print(template % (query_name, best_name, best_score, hotel_id))