
NLP (23): Building Sentence Vectors with tf-idf and Computing Similarity

I. Based on gensim
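The gensim route works like this: segment each sentence with jieba, build a corpora.Dictionary over the tokens, turn every sentence into a bag-of-words vector with doc2bow, re-weight those vectors with models.TfidfModel, and index the weighted corpus with similarities.MatrixSimilarity. A query sentence is then vectorized the same way and scored against every indexed sentence by cosine similarity (see the toy sketch after the model class below).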

1. The model class

import os
import jieba
from gensim import corpora, models, similarities
import utils.word_process as word_process
from root_path import root
from pathlib import Path
import heapq

class TfIdf(object):
    """Compute sentence similarity with a tf-idf model."""

    def __init__(self):
        root_path = os.path.join(root, "confusion_detection", "checkpoints", "tf_idf")
        if not Path(root_path).is_dir():
            os.mkdir(root_path)
        self.dic_path = os.path.join(root_path, "bow.model")
        self.tfidf_model_path = os.path.join(root_path, "tfidf_model.model")
        self.tfidf_index_path = os.path.join(root_path, "tfidf_index.model")
        self.stop_list = word_process.get_stop_list()
        self.data_path = os.path.join(root, "confusion_detection", "data", "raw_data", "all.txt")

    def del_stopwords(self, words):
        """Remove stop words from one sentence."""
        word_list = []
        for word in words:
            if word not in self.stop_list:
                word_list.append(word)
        return word_list

    def _seg_word(self, words_list, jieba_flag=True, del_stopword=True):
        """Segment each sentence into words (jieba) or single characters."""
        word_list = []
        if jieba_flag:
            if del_stopword:
                for words in words_list:
                    word_list.append(self.del_stopwords(list(jieba.cut(words))))
            else:
                for words in words_list:
                    word_list.append(list(jieba.cut(words)))
        else:
            if del_stopword:
                for words in words_list:
                    word_list.append(self.del_stopwords(words))
            else:
                for words in words_list:
                    word_list.append([word for word in words])
        return word_list

    def train(self, sentence_list):
        """Train and persist the dictionary, tf-idf model, and similarity index."""
        # Build and save the corpus dictionary
        word_list = self._seg_word(sentence_list)
        dic = corpora.Dictionary(word_list, prune_at=2000000)
        dic.save(self.dic_path)
        # Build the tf-idf model from the bag-of-words corpus
        corpus_model = [dic.doc2bow(word) for word in word_list]
        tfidf_model = models.TfidfModel(corpus_model)
        tfidf_model.save(self.tfidf_model_path)
        # Build the retrieval index over the tf-idf-weighted corpus
        corpus_tfidf = tfidf_model[corpus_model]
        tfidf_index = similarities.MatrixSimilarity(corpus_tfidf)
        tfidf_index.save(self.tfidf_index_path)

    def predict(self, sentence):
        """Vectorize one sentence and return its similarity scores against the
        indexed corpus (retrieval is based on word_list)."""
        dic = corpora.Dictionary.load(self.dic_path)
        word_bow = dic.doc2bow(self._seg_word([sentence])[0])
        word_tfidf = models.TfidfModel.load(self.tfidf_model_path)[word_bow]
        tfidf_index = similarities.MatrixSimilarity.load(self.tfidf_index_path)
        score = tfidf_index[word_tfidf]
        return score

    def get_train_data(self):
        """Read the label array and sentence array from the training file."""
        labels = []
        sentences = []
        with open(self.data_path, "r", encoding="utf8") as f:
            for line in f.readlines():
                data_tuple = line.split(" ")
                labels.append(data_tuple[0])
                sentences.append(data_tuple[1].replace("\n", "").replace("\r", ""))
        return labels, sentences

    def main(self):
        labels, sentences = self.get_train_data()
        print(sentences)
        self.train(sentences)
        score_list = self.predict("我有困難還不了")
        # Indices of the 30 highest scores, e.g. [4, 5, 2]
        print(heapq.nlargest(30, range(len(score_list)), score_list.__getitem__))
        # Values of the 30 highest scores, e.g. [9, 9, 6]
        print(heapq.nlargest(30, score_list))

if __name__ == '__main__':
    TfIdf().main()
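To see the pipeline without the project's checkpoint paths and training file, here is a minimal, self-contained sketch of the same Dictionary → doc2bow → TfidfModel → MatrixSimilarity flow. The toy sentences and query are invented for illustration; gensim and jieba are assumed installed.

import jieba
from gensim import corpora, models, similarities

corpus_sentences = ["我有困難還不了", "我最近手頭緊還不上錢", "今天天氣很好"]
word_list = [list(jieba.cut(s)) for s in corpus_sentences]

dic = corpora.Dictionary(word_list)
bow_corpus = [dic.doc2bow(words) for words in word_list]
tfidf_model = models.TfidfModel(bow_corpus)  # learns idf weights from the corpus
index = similarities.MatrixSimilarity(tfidf_model[bow_corpus], num_features=len(dic))

# Vectorize a query the same way and score it against every indexed sentence.
query_tfidf = tfidf_model[dic.doc2bow(list(jieba.cut("我還不上錢")))]
scores = index[query_tfidf]  # one cosine-similarity score per indexed sentence
for sentence, score in sorted(zip(corpus_sentences, scores), key=lambda t: -t[1]):
    print(round(float(score), 3), sentence)

Because tf-idf weights are non-negative, the cosine scores fall in [0, 1]; the heapq.nlargest calls in main() above simply pick the top-k entries of exactly this kind of score array.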

2. The utility class

import os
from root_path import root
import tqdm


stop = os.path.join(root, "confusion_detection", "data", "raw_data", "ChineseStopWords.txt")

def get_stop_list():
    """Return the stop word list."""
    stop_word_list = []
    with open(stop, "r", encoding="utf8") as f:
        data_lines = tqdm.tqdm(f.readlines(), smoothing=0, mininterval=0.1)
        data_lines.set_description('Processing stop words...')
        for line in data_lines:
            line = line.replace(" ", "").replace("\n", "").replace("\r", "")
            # Keep only single-character stop words
            if len(line) == 1:
                stop_word_list.append(line)
    return stop_word_list
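A hypothetical usage of get_stop_list together with jieba (the sample sentence is invented, and the project's ChineseStopWords.txt must be in place for the import to work). One design note: converting the returned list to a set makes each membership test O(1) instead of the O(n) scan that del_stopwords pays per word with a plain list.

import jieba
from utils.word_process import get_stop_list

stop_words = set(get_stop_list())  # set lookup is O(1); list lookup is O(n)
tokens = list(jieba.cut("我有困難還不了"))
filtered = [t for t in tokens if t not in stop_words]
print(tokens)    # all segments
print(filtered)  # segments minus single-character stop words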