
NLP (23): Building Sentence Vectors with tf-idf and Computing Similarity

I. Based on gensim
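The gensim route works like this: segment each sentence with jieba, build a corpora.Dictionary over the tokens, turn every sentence into a bag-of-words vector with doc2bow, re-weight those vectors with models.TfidfModel, and index the weighted corpus with similarities.MatrixSimilarity. A query sentence is then vectorized the same way and scored against every indexed sentence by cosine similarity (see the toy sketch after the model class below).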

1. The model class

import os
import jieba
from gensim import corpora, models, similarities
import utils.word_process as word_process
from root_path import root
from pathlib import Path
import heapq

class TfIdf(object):
    """Compute sentence similarity with a tf-idf model."""

    def __init__(self):
        root_path = os.path.join(root, "confusion_detection", "checkpoints", "tf_idf")
        if not Path(root_path).is_dir():
            os.mkdir(root_path)
        self.dic_path = os.path.join(root_path, "bow.model")
        self.tfidf_model_path = os.path.join(root_path, "tfidf_model.model")
        self.tfidf_index_path = os.path.join(root_path, "tfidf_index.model")
        self.stop_list = word_process.get_stop_list()
        self.data_path = os.path.join(root, "confusion_detection", "data", "raw_data", "all.txt")

    def del_stopwords(self, words):
        """Remove stop words from one sentence."""
        word_list = []
        for word in words:
            if word not in self.stop_list:
                word_list.append(word)
        return word_list

    def _seg_word(self, words_list, jieba_flag=True, del_stopword=True):
        """Segment each sentence into words (jieba) or single characters."""
        word_list = []
        if jieba_flag:
            if del_stopword:
                for words in words_list:
                    word_list.append(self.del_stopwords(list(jieba.cut(words))))
            else:
                for words in words_list:
                    word_list.append(list(jieba.cut(words)))
        else:
            if del_stopword:
                for words in words_list:
                    word_list.append(self.del_stopwords(words))
            else:
                for words in words_list:
                    word_list.append([word for word in words])
        return word_list

    def train(self, sentence_list):
        """Train and persist the dictionary, tf-idf model, and similarity index."""
        # Build and save the corpus dictionary
        word_list = self._seg_word(sentence_list)
        dic = corpora.Dictionary(word_list, prune_at=2000000)
        dic.save(self.dic_path)
        # Build the tf-idf model from the bag-of-words corpus
        corpus_model = [dic.doc2bow(word) for word in word_list]
        tfidf_model = models.TfidfModel(corpus_model)
        tfidf_model.save(self.tfidf_model_path)
        # Build the retrieval index over the tf-idf-weighted corpus
        corpus_tfidf = tfidf_model[corpus_model]
        tfidf_index = similarities.MatrixSimilarity(corpus_tfidf)
        tfidf_index.save(self.tfidf_index_path)

    def predict(self, sentence):
        """Vectorize one sentence and return its similarity scores against the
        indexed corpus (retrieval is based on word_list)."""
        dic = corpora.Dictionary.load(self.dic_path)
        word_bow = dic.doc2bow(self._seg_word([sentence])[0])
        word_tfidf = models.TfidfModel.load(self.tfidf_model_path)[word_bow]
        tfidf_index = similarities.MatrixSimilarity.load(self.tfidf_index_path)
        score = tfidf_index[word_tfidf]
        return score

    def get_train_data(self):
        """Read the label array and sentence array from the training file."""
        labels = []
        sentences = []
        with open(self.data_path, "r", encoding="utf8") as f:
            for line in f.readlines():
                data_tuple = line.split(" ")
                labels.append(data_tuple[0])
                sentences.append(data_tuple[1].replace("\n", "").replace("\r", ""))
        return labels, sentences

    def main(self):
        labels, sentences = self.get_train_data()
        print(sentences)
        self.train(sentences)
        score_list = self.predict("我有困難還不了")
        # Indices of the 30 highest scores, e.g. [4, 5, 2]
        print(heapq.nlargest(30, range(len(score_list)), score_list.__getitem__))
        # Values of the 30 highest scores, e.g. [9, 9, 6]
        print(heapq.nlargest(30, score_list))

if __name__ == '__main__':
    TfIdf().main()
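To see the pipeline without the project's checkpoint paths and training file, here is a minimal, self-contained sketch of the same Dictionary → doc2bow → TfidfModel → MatrixSimilarity flow. The toy sentences and query are invented for illustration; gensim and jieba are assumed installed.

import jieba
from gensim import corpora, models, similarities

corpus_sentences = ["我有困難還不了", "我最近手頭緊還不上錢", "今天天氣很好"]
word_list = [list(jieba.cut(s)) for s in corpus_sentences]

dic = corpora.Dictionary(word_list)
bow_corpus = [dic.doc2bow(words) for words in word_list]
tfidf_model = models.TfidfModel(bow_corpus)  # learns idf weights from the corpus
index = similarities.MatrixSimilarity(tfidf_model[bow_corpus], num_features=len(dic))

# Vectorize a query the same way and score it against every indexed sentence.
query_tfidf = tfidf_model[dic.doc2bow(list(jieba.cut("我還不上錢")))]
scores = index[query_tfidf]  # one cosine-similarity score per indexed sentence
for sentence, score in sorted(zip(corpus_sentences, scores), key=lambda t: -t[1]):
    print(round(float(score), 3), sentence)

Because tf-idf weights are non-negative, the cosine scores fall in [0, 1]; the heapq.nlargest calls in main() above simply pick the top-k entries of exactly this kind of score array.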

2. The utility class

import os
from root_path import root
import tqdm


stop = os.path.join(root, "confusion_detection", "data", "raw_data", "ChineseStopWords.txt")

def get_stop_list():
    """Return the stop word list."""
    stop_word_list = []
    with open(stop, "r", encoding="utf8") as f:
        data_lines = tqdm.tqdm(f.readlines(), smoothing=0, mininterval=0.1)
        data_lines.set_description('Processing stop words...')
        for line in data_lines:
            line = line.replace(" ", "").replace("\n", "").replace("\r", "")
            # Keep only single-character stop words
            if len(line) == 1:
                stop_word_list.append(line)
    return stop_word_list
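A hypothetical usage of get_stop_list together with jieba (the sample sentence is invented, and the project's ChineseStopWords.txt must be in place for the import to work). One design note: converting the returned list to a set makes each membership test O(1) instead of the O(n) scan that del_stopwords pays per word with a plain list.

import jieba
from utils.word_process import get_stop_list

stop_words = set(get_stop_list())  # set lookup is O(1); list lookup is O(n)
tokens = list(jieba.cut("我有困難還不了"))
filtered = [t for t in tokens if t not in stop_words]
print(tokens)    # all segments
print(filtered)  # segments minus single-character stop words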