
One Way to Turn Short Texts into Vectors


Preface

The implementation below is fairly rough; there is a lot of room for improvement, really a lot! And fair warning: this post does not explain the underlying theory at all, unreasonable as that may be…

Approach

  1. Tokenization. jieba is still the best choice for Chinese word segmentation; gensim is used for word2vec training.
  2. Train a base word2vec model on a large general-purpose corpus.
  3. Fine-tune the model on a domain-specific (business-oriented) corpus so that it learns specialized vocabulary.
  4. After tokenizing a text, use AVG-W2V to obtain the short-text vector, i.e., average all of its word vectors; the dimensionality equals the word2vec vector size (see the sketch after this list).
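The averaging in step 4 boils down to a few lines. Here is a minimal sketch of the AVG-W2V idea, assuming a trained gensim word2vec model with the pre-4.0 model[word] / word in model interface used throughout this post:

import numpy as np

def avg_w2v(model, words):
    # Keep only words that made it into the vocabulary
    vectors = [model[w] for w in words if w in model]
    if not vectors:
        return None
    # The sentence vector is the element-wise mean of the word vectors
    return np.mean(vectors, axis=0)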

word2vec configuration

  • w2v.properties
# Some practical tips
# Architecture (sg): skip-gram (slower, better for rare words) vs CBOW (faster)
# Training algorithm (hs): hierarchical softmax (better for rare words) vs negative sampling (better for frequent words and low-dimensional vectors)
# Subsampling frequent words (sample): can improve both accuracy and speed (useful range: 1e-3 to 1e-5)
# Window size (window): usually around 10 for skip-gram, around 5 for CBOW
# For large corpora, consider raising min_count and lowering iter

# Training architecture: 0 for CBOW, 1 for skip-gram; default is 0
sg=1
# Dimensionality of the feature vectors
size=300
# Context window size
window=5
# Minimum word frequency
min_count=5
# Initial learning rate
alpha=0.025
# 0 for negative sampling, 1 for hierarchical softmax; default is 0
hs=1
# Number of training iterations (epochs)
iter=10
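The properties above are read through a small PropertiesUtil helper, imported as prop in the code below. Its implementation is not included in this post, so the following is only a hypothetical sketch of what it needs to provide: get_config_dict and get_config_value over plain key=value lines.

# -*- coding:utf-8 -*-
# Hypothetical sketch of util/PropertiesUtil.py (not shown in the original post)


class PropertiesUtil(object):
    @staticmethod
    def get_config_dict(path):
        # Parse a .properties file into a {key: value} dict, skipping blanks and comments
        config = {}
        with open(path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                key, _, value = line.partition("=")
                config[key.strip()] = value.strip()
        return config

    @staticmethod
    def get_config_value(path, key):
        # Return a single value from a .properties file
        return PropertiesUtil.get_config_dict(path)[key]


prop = PropertiesUtil()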

Code

  • Code for base training on a large corpus
# -*- coding:utf-8 -*-
"""
Description: word2vec model trained on a large Baidu Baike corpus

@author: WangLeAi
@date: 2018/9/18
"""
import os
from util.DBUtil import DbPoolUtil
from util.JiebaUtil import jieba_util
from util.PropertiesUtil import prop
from gensim.models import word2vec


class OriginModel(object):
    def __init__(self):
        self.params = prop.get_config_dict("config/w2v.properties")
        self.db_pool_util = DbPoolUtil(db_type="mysql")
        self.train_data_path = "gen/ori_train_data.txt"
        self.model_path = "model/oriw2v.model"

    @staticmethod
    def text_process(sentence):
        """
        Text preprocessing
        :param sentence:
        :return:
        """
        # Filter out anything that is not Chinese, English or digits
        # regex = re.compile(u'[^\u4e00-\u9fa50-9a-zA-Z\-·]+')
        # sentence = regex.sub('', sentence)
        words = jieba_util.jieba_cut(sentence)
        return words

    def get_train_data(self):
        """
        Fetch the training data. Adapt this to your own data source, and
        prefer writing to a file over pulling everything into memory!
        :return:
        """
        print("Building training data for the base corpus")
        sql = """ """
        sentences = self.db_pool_util.loop_row(origin_model, "text_process", sql)
        with open(self.train_data_path, "w", encoding="utf-8") as f:
            for sentence in sentences:
                f.write(" ".join(sentence) + "\n")

    def train_model(self):
        """
        Train the model
        :return:
        """
        if not os.path.exists(self.train_data_path):
            self.get_train_data()
        print("Training the base model")
        sentences = word2vec.LineSentence(self.train_data_path)
        model = word2vec.Word2Vec(sentences=sentences, sg=int(self.params["sg"]),
                                  size=int(self.params["size"]), window=int(self.params["window"]),
                                  min_count=int(self.params["min_count"]),
                                  alpha=float(self.params["alpha"]), hs=int(self.params["hs"]),
                                  workers=6, iter=int(self.params["iter"]))
        model.save(self.model_path)
        print("Base model trained and saved")


origin_model = OriginModel()
  • Fine-tuning on an extra corpus
# -*- coding:utf-8 -*-
"""
Description: word2vec fine-tuning
Fine-tune on an extra corpus of the relevant domain

@author: WangLeAi
@date: 2018/9/11
"""
import os
from util.DBUtil import DbPoolUtil
from util.JiebaUtil import jieba_util
from util.PropertiesUtil import prop
from gensim.models import word2vec
from algorithms.OriginModel import origin_model


class Word2VecModel(object):
    def __init__(self):
        self.db_pool_util = DbPoolUtil(db_type="mysql")
        self.train_data_path = "gen/train_data.txt"
        self.origin_model_path = "model/oriw2v.model"
        self.model_path = "model/w2v.model"
        self.model = None
        # An out-of-vocabulary word must appear at least min_count times before it enters the vocabulary
        self.min_count = int(prop.get_config_value("config/w2v.properties", "min_count"))

    @staticmethod
    def text_process(sentence):
        """
        Text preprocessing
        :param sentence:
        :return:
        """
        # Filter out anything that is not Chinese, English or digits
        # regex = re.compile(u'[^\u4e00-\u9fa50-9a-zA-Z\-·]+')
        # sentence = regex.sub('', sentence)
        words = jieba_util.jieba_cut(sentence)
        return words

    def get_train_data(self):
        """
        Fetch the training data. Adapt this to your own data source, and
        prefer writing to a file over pulling everything into memory!
        :return:
        """
        print("建立額外語料訓練資料")
        sql = """ """
        sentences = self.db_pool_util.loop_row(w2v_model, "text_process", sql)
        with open(self.train_data_path, "a", encoding="utf-8") as f:
            for sentence in sentences:
                f.write(" ".join(sentence) + "\n")

    def train_model(self):
        """
        Train the model
        :return:
        """
        if not os.path.exists(self.origin_model_path):
            print("無初始模型,進行初始模型訓練")
            origin_model.train_model()
        model = word2vec.Word2Vec.load(self.origin_model_path)
        print("初始模型載入完畢")
        if not os.path.exists(self.train_data_path):
            self.get_train_data()
        print("額外語料訓練")
        extra_sentences = word2vec.LineSentence(self.train_data_path)
        model.build_vocab(extra_sentences, update=True)
        model.train(extra_sentences, total_examples=model.corpus_count, epochs=model.iter)
        model.save(self.model_path)
        print("額外語料訓練完畢")

    def load_model(self):
        """
        Load the model
        :return:
        """
        print("載入詞嵌入模型")
        if not os.path.exists(self.model_path):
            print("無詞嵌入模型,進行訓練")
            self.train_model()
        self.model = word2vec.Word2Vec.load(self.model_path)
        print("詞嵌入模型載入完畢")

    def get_word_vector(self, words, extra=0):
        """
        Get a word vector. The model must be loaded first.
        :param words:
        :param extra: whether to handle out-of-vocabulary words, 0 = no, 1 = yes
        :return:
        """
        if extra:
            if words not in self.model:
                more_sentences = [[words, ] for i in range(self.min_count)]
                self.model.build_vocab(more_sentences, update=True)
                self.model.train(more_sentences, total_examples=self.model.corpus_count, epochs=self.model.iter)
                self.model.save(self.model_path)
        rst = None
        if words in self.model:
            rst = self.model[words]
        return rst

    def get_sentence_vector(self, sentence, extra=0):
        """
        Get a text vector. The model must be loaded first.
        :param sentence:
        :param extra: whether to handle out-of-vocabulary words, 0 = no, 1 = yes
        :return:
        """
        words = jieba_util.jieba_cut_flag(sentence)
        if not words:
            words = jieba_util.jieba_cut(sentence)
        if not words:
            print("存在無法切出有效詞的句子:" + sentence)
            # raise Exception("存在無法切出有效詞的句子:" + sentence)
        if extra:
            for item in words:
                if item not in self.model:
                    more_sentences = [words for i in range(self.min_count)]
                    self.model.build_vocab(more_sentences, update=True)
                    self.model.train(more_sentences, total_examples=self.model.corpus_count, epochs=self.model.iter)
                    self.model.save(self.model_path)
                    break
        return self.get_sentence_embedding(words)

    def get_sentence_embedding(self, words):
        """
        Get a short-text vector; recommended for short texts only.
        The text vector is the mean of all word vectors in the sentence. This is unsuitable for
        long texts because frequent words dominate the average; for long texts use gensim's doc2vec.
        :param words:
        :return:
        """
        count = 0
        vector = None
        for item in words:
            if item in self.model:
                count += 1
                if vector is not None:
                    vector = vector + self.model[item]
                else:
                    vector = self.model[item]
        if vector is not None:
            vector = vector / count
        return vector


w2v_model = Word2VecModel()
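Once the model is loaded, sentence vectors can be compared directly. The following usage sketch (not part of the original post) scores two short texts by the cosine similarity of their AVG-W2V vectors; the two example sentences are arbitrary:

import numpy as np
from algorithms.Word2VecModel import w2v_model


def cosine_similarity(v1, v2):
    # Cosine of the angle between the two sentence vectors
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))


w2v_model.load_model()
v1 = w2v_model.get_sentence_vector("今天天氣很好")
v2 = w2v_model.get_sentence_vector("今天天氣不錯")
if v1 is not None and v2 is not None:
    print(cosine_similarity(v1, v2))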

  • Testing
# -*- coding:utf-8 -*-
"""
Description:

@author: WangLeAi
@date: 2018/9/18
"""
import os
from algorithms.Word2VecModel import w2v_model


def main():
    root_path = os.path.split(os.path.realpath(__file__))[0]
    if not os.path.exists(root_path + "/model"):
        os.mkdir(root_path + "/model")
    w2v_model.load_model()
    print(w2v_model.get_sentence_vector("不知不覺間我已經忘記了愛"))


if __name__ == "__main__":
    main()
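With size=300 in w2v.properties, the printed result is a 300-dimensional numpy array. A quick sanity check (a sketch, assuming the model has been loaded as in main() above):

vec = w2v_model.get_sentence_vector("不知不覺間我已經忘記了愛")
if vec is not None:
    print(vec.shape)  # (300,) with the size=300 configuration above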

Further reading

  1. Resources on text-similarity algorithms (highly recommended!): click here
  2. For the DbPoolUtil / DBUtils details, see my earlier post (with a few small changes): click here

Full code

Download link