1. 程式人生 > >【Tensorflow】文字自編碼器

【Tensorflow】文字自編碼器

使用文字序列的word2vec詞向量作為seq2seq模型中encoder和decoder的輸入,decoder的輸出經過線性層對映為字典中各字的分類結果,用來重建原始字序列;訓練後得到的中間層(encoder的最終狀態)可作為文字特徵表示,進一步用於分類任務等。encoder和decoder都使用LSTM。

import tensorflow as tf
import numpy as np
import re
from gensim.models import Word2Vec
import pandas as pd
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings("ignore")

# Load the saved Word2Vec model that supplies the character vectors.
model = Word2Vec.load('model/daixia_w2c_char_100.model')

# Hyperparameters
num_units = 256  # size passed to both the encoder and the decoder LSTMCell
input_size = 100  # length of each character vector fed to the encoder/decoder
batch_size = 5  # fixed batch dimension of the encoder/decoder input placeholders
vocab_size = 946  # number of output classes of the decoder projection; same count as the dictionary entries read by get_dict

# Read the dictionary: every character in the medical records plus the EOS end marker.
def get_dict(path='data/char_dict.txt', size=946):
    """Load the character vocabulary from a one-entry-per-line text file.

    The entry on line ``i`` of the file (0-based) is assigned id ``i``.

    Args:
        path: Location of the UTF-8 encoded dictionary file.
        size: Number of lines to read, i.e. how many ids are assigned.

    Returns:
        A ``(dict_char, dict_id)`` tuple where ``dict_char`` maps
        entry -> id and ``dict_id`` maps id -> entry.
    """
    dict_char = {}
    dict_id = {}
    # The context manager guarantees the file is closed, even on error.
    with open(path, 'r', encoding='utf-8') as f:
        for i in range(size):
            # readline() returns '' once the file is exhausted, so a short
            # file still produces ``size`` ids (the missing ones map to '').
            word = f.readline().rstrip('\n')
            dict_char[word] = i
            dict_id[i] = word

    return dict_char, dict_id


# Load the records (space-separated character sequences) and generate training batches.
def get_batches(filename, dict_char, batch_size):
    """Yield ``(texts, targets)`` batches from a CSV file, cycling forever.

    Args:
        filename: Path of a comma-separated, UTF-8 encoded CSV file with a
            ``text`` column holding space-separated tokens.
        dict_char: Mapping from token to integer id.
        batch_size: Number of rows per batch.

    Yields:
        A ``(texts, targets)`` pair of equally long lists: ``texts`` holds
        the token lists of the rows in the batch and ``targets`` the
        matching lists of ids looked up in ``dict_char``.

    Raises:
        KeyError: If a token of the ``text`` column is not in ``dict_char``.
    """
    data = pd.read_csv(filename, delimiter=',', encoding='utf-8')
    # One token list per row, and the matching list of dictionary ids.
    texts = [row.split(' ') for row in data['text']]
    targets = [[dict_char[char] for char in text] for text in texts]

    start = 0
    while True:
        yield texts[start:start + batch_size], targets[start:start + batch_size]
        start += batch_size
        # Restart from the beginning when fewer than batch_size rows remain,
        # so a trailing partial batch is skipped. A dataset smaller than
        # batch_size still yields all of its rows as one short batch.
        if start + batch_size > len(texts):
            start = 0

# Convert a batch of sequences into a zero-padded, time-major array.
def make_batch(texts, isTargets=False, max_sequence_length=None):
    """Pad a batch of sequences and return it in time-major layout.

    Args:
        texts: Batch of sequences. When ``isTargets`` is false each element
            is a sequence of tokens to look up in the Word2Vec model; when
            true each element is a sequence of integer ids.
        isTargets: Select id mode (true) or word-vector mode (false).
        max_sequence_length: Length every sequence is padded to. Defaults
            to the length of the longest sequence in ``texts``.

    Returns:
        In word-vector mode a float32 array of shape
        ``[max_sequence_length, len(texts), input_size]``; in id mode an
        int32 array of shape ``[max_sequence_length, len(texts)]``.
        Positions past the end of a sequence are zero.

    Raises:
        IndexError: If a sequence is longer than ``max_sequence_length``.
    """
    num_sequences = len(texts)
    if max_sequence_length is None:
        # default=0 lets an empty batch produce an empty array instead of
        # raising from max() on an empty sequence.
        max_sequence_length = max((len(text) for text in texts), default=0)
    if isTargets:
        inputs_batch_major = np.zeros(shape=[num_sequences, max_sequence_length], dtype=np.int32)
        for i, target in enumerate(texts):
            for j, t in enumerate(target):
                inputs_batch_major[i, j] = t
    else:
        inputs_batch_major = np.zeros(shape=[num_sequences, max_sequence_length, input_size], dtype=np.float32)
        for i, text in enumerate(texts):
            for j, char in enumerate(text):
                # Vectors live on the model's KeyedVectors (`wv`); indexing
                # the Word2Vec object itself is deprecated in gensim 3.x
                # and removed in gensim 4.0.
                inputs_batch_major[i, j] = model.wv[char]
    # [batch, time, ...] -> [time, batch, ...]
    inputs_time_major = inputs_batch_major.swapaxes(0, 1)
    return inputs_time_major

# Build the training graph.
train_graph = tf.Graph()
with train_graph.as_default():
    # Inputs are time-major ([time, batch, feature]). The batch dimension is
    # fixed to batch_size, so every fed batch must hold exactly that many rows.
    encoder_inputs = tf.placeholder(shape=[None, batch_size, input_size], dtype=tf.float32, name='encoder_inputs')
    decoder_inputs = tf.placeholder(shape=[None, batch_size, input_size], dtype=tf.float32, name='decoder_inputs')
    # Integer class ids; must line up with the first two dimensions of the logits.
    decoder_targets = tf.placeholder(shape=(None, None), dtype=tf.int32, name='decoder_targets')
    # LSTM encoder: its final state initialises the decoder below.
    encoder_cell = tf.contrib.rnn.LSTMCell(num_units)
    encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
        encoder_cell, encoder_inputs,
        dtype=tf.float32, time_major=True,
    )
    # LSTM decoder, started from the encoder's final state.
    decoder_cell = tf.contrib.rnn.LSTMCell(num_units)
    decoder_outputs, decoder_final_state = tf.nn.dynamic_rnn(
        decoder_cell, decoder_inputs,
        initial_state=encoder_final_state,
        dtype=tf.float32, time_major=True, scope="plain_decoder",
    )
    # Classification layer: project every decoder output onto the vocabulary.
    decoder_logits = tf.contrib.layers.linear(decoder_outputs, vocab_size)
    decoder_prediction = tf.argmax(decoder_logits, 2)
    # The sparse op takes the integer ids directly, so no dense one-hot
    # tensor of depth vocab_size has to be materialised.
    stepwise_cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=decoder_targets,
        logits=decoder_logits,
    )
    # Accuracy: compare predicted ids with the target ids. tf.argmax returns
    # int64, hence the cast of the int32 targets.
    correct_prediction = tf.equal(decoder_prediction, tf.cast(decoder_targets, tf.int64))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    # Cross-entropy loss, averaged over every time step and batch entry
    # (padded positions are included; no mask is applied).
    loss = tf.reduce_mean(stepwise_cross_entropy)
    # Optimiser
    train_op = tf.train.AdamOptimizer().minimize(loss)
    # Saver used to store the trained model.
    saver = tf.train.Saver()

if __name__ == '__main__':
    import os

    # Loss of every training step, plotted once training has finished.
    loss_track = []
    # Number of training steps; each step consumes one batch from the generator.
    epochs = 10001
    checkpoint_path = 'model/dl/model.ckpt'
    # Saver.save may fail when the parent directory is missing; create it up
    # front so a long run is not lost at the final save.
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
    # Load the dictionary.
    dict_char, dict_id = get_dict()
    # Batch generator.
    gen_batches = get_batches('data/data_char.csv', dict_char, batch_size)
    # Open the session.
    with tf.Session(graph=train_graph) as sess:
        sess.run(tf.global_variables_initializer())
        time_start = time.time()
        for epoch in range(epochs):
            texts, targets = next(gen_batches)
            # EOS is the end-of-sentence marker and has id 0 in the dictionary.
            # Decoder inputs start with EOS; decoder targets end with EOS.
            encoder_inputs_ = make_batch(texts)
            decoder_inputs_ = make_batch([['EOS'] + text for text in texts])
            decoder_targets_ = make_batch([target + [0] for target in targets], True, None)

            feed_dict = {encoder_inputs: encoder_inputs_, decoder_inputs: decoder_inputs_,
                         decoder_targets: decoder_targets_,
                         }
            _, step_loss = sess.run([train_op, loss], feed_dict)
            loss_track.append(step_loss)
            # Show prediction quality every 10 steps.
            if epoch % 10 == 0:
                # A single forward pass after the update fetches everything
                # that is reported, instead of one session run per value.
                loss_now, acc_now, predict_ = sess.run([loss, accuracy, decoder_prediction], feed_dict)
                print('loss: {}'.format(loss_now))
                print('acc: {}'.format(acc_now))
                # predict_ is time-major; transpose to iterate per sequence.
                # Only the first three samples of the batch are shown.
                for inp, pred in zip(texts[:3], predict_.T[:3]):
                    print('input > {}'.format(inp))
                    print('predicted > {}'.format([dict_id[idx] for idx in pred]))

        time_span = time.time() - time_start
        print('訓練花費了{}'.format(time_span))
        saver.save(sess, checkpoint_path)

    plt.plot(loss_track)
    plt.show()