【TensorFlow】Text Autoencoder
阿新 • Published: 2019-02-20
The word2vec character vectors of each text sequence are used as both the input and the reconstruction target of a seq2seq model; both the encoder and the decoder are LSTMs. After training, the middle layer (the encoder's final state) provides a fixed-length feature representation of the text, which can then be used for downstream tasks such as classification.
import tensorflow as tf
import numpy as np
import re
from gensim.models import Word2Vec
import pandas as pd
import matplotlib.pyplot as plt
import time
import warnings

warnings.filterwarnings("ignore")

# Load the pre-trained word2vec character-vector model (100-dimensional vectors)
model = Word2Vec.load('model/daixia_w2c_char_100.model')

# Hyperparameters
num_units = 256    # LSTM hidden units
input_size = 100   # word2vec vector dimension
batch_size = 5
vocab_size = 946   # number of characters in the dictionary


# Read the dictionary: every character appearing in the records, plus the end-of-sequence symbol EOS
def get_dict():
    f = open('data/char_dict.txt', 'r', encoding='utf-8')
    dict_char = dict()
    dict_id = dict()
    for i in range(0, 946):
        word = f.readline()
        dict_char[re.sub('\n', '', word)] = i
        dict_id[i] = re.sub('\n', '', word)
    f.close()
    return dict_char, dict_id


# Read the data (character sequences of the records) and generate training batches
def get_batches(filename, dict_char, batch_size):
    # texts, class labels, dictionary indices, text lengths
    texts = []
    label = []
    targets = []
    length = []
    # Record the texts, labels and dictionary indices
    data = pd.read_csv(filename, delimiter=',', encoding='utf-8')
    for i in range(data.shape[0]):
        char_list = re.split(' ', data['text'].loc[i])
        texts.append(char_list)
        label.append(data['label'].loc[i])
        target = [dict_char[char] for char in char_list]
        targets.append(target)
    # Record the length of each text
    for t in texts:
        length.append(len(t))
    length = np.array(length, dtype=np.int32)
    # # Return the whole dataset instead of batches
    # return texts, targets
    # Batch generator
    i = 0
    while True:
        yield texts[i:i + batch_size], targets[i:i + batch_size]
        i = i + batch_size
        if i + batch_size > len(texts):
            i = 0


# Convert string sequences into word vectors (or index targets) and transpose to time-major format
def make_batch(texts, isTargets=False, max_sequence_length=None):
    sequence_lengths = [len(text) for text in texts]
    batch_size = len(texts)
    if max_sequence_length is None:
        max_sequence_length = max(sequence_lengths)
    if isTargets is False:
        inputs_batch_major = np.zeros(shape=[batch_size, max_sequence_length, input_size], dtype=np.float32)
        for i, text in enumerate(texts):
            for j, char in enumerate(text):
                inputs_batch_major[i, j] = model.wv[char]  # look up the word2vec vector (model[char] in older gensim)
    else:
        inputs_batch_major = np.zeros(shape=[batch_size, max_sequence_length], dtype=np.int32)
        for i, target in enumerate(texts):
            for j, t in enumerate(target):
                inputs_batch_major[i, j] = t
    inputs_time_major = inputs_batch_major.swapaxes(0, 1)  # [batch, time, ...] -> [time, batch, ...]
    return inputs_time_major


# Build the training graph
train_graph = tf.Graph()
with train_graph.as_default():
    encoder_inputs = tf.placeholder(shape=[None, batch_size, input_size], dtype=tf.float32, name='encoder_inputs')
    decoder_inputs = tf.placeholder(shape=[None, batch_size, input_size], dtype=tf.float32, name='decoder_inputs')
    decoder_targets = tf.placeholder(shape=(None, None), dtype=tf.int32, name='decoder_targets')

    # LSTM encoder
    encoder_cell = tf.contrib.rnn.LSTMCell(num_units)
    encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
        encoder_cell, encoder_inputs,
        dtype=tf.float32, time_major=True,
    )

    # LSTM decoder, initialised with the encoder's final state
    decoder_cell = tf.contrib.rnn.LSTMCell(num_units)
    decoder_outputs, decoder_final_state = tf.nn.dynamic_rnn(
        decoder_cell, decoder_inputs,
        initial_state=encoder_final_state,
        dtype=tf.float32, time_major=True,
        scope="plain_decoder",
    )

    # Projection layer over the vocabulary
    decoder_logits = tf.contrib.layers.linear(decoder_outputs, vocab_size)
    decoder_prediction = tf.argmax(decoder_logits, 2)
    stepwise_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.one_hot(decoder_targets, depth=vocab_size, dtype=tf.float32),
        logits=decoder_logits,
    )

    # Token-level accuracy (note: padding positions are counted as well)
    correct_prediction = tf.equal(decoder_prediction,
                                  tf.argmax(tf.one_hot(decoder_targets, depth=vocab_size, dtype=tf.float32), 2))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # Cross-entropy loss
    loss = tf.reduce_mean(stepwise_cross_entropy)

    # Optimiser
    train_op = tf.train.AdamOptimizer().minimize(loss)

    # Saver for checkpointing the model
    saver = tf.train.Saver()


if __name__ == '__main__':
    # Track the loss
    loss_track = []
    epochs = 10001
    # Read the dictionary
    dict_char, dict_id = get_dict()
    # Batch generator
    gen_batches = get_batches('data/data_char.csv', dict_char, batch_size)
    # Open the session
    with tf.Session(graph=train_graph) as sess:
        sess.run(tf.global_variables_initializer())
        time_start = time.time()
        for epoch in range(epochs):
            batch = next(gen_batches)
            texts = batch[0]
            targets = batch[1]
            # EOS is the end-of-sequence symbol (index 0 in the dictionary);
            # the decoder input starts with EOS and the decoder targets end with EOS
            encoder_inputs_ = make_batch(texts)
            decoder_inputs_ = make_batch([['EOS'] + text for text in texts])
            decoder_targets_ = make_batch([target + [0] for target in targets], True, None)
            feed_dict = {encoder_inputs: encoder_inputs_,
                         decoder_inputs: decoder_inputs_,
                         decoder_targets: decoder_targets_,
                         }
            _, l, acc = sess.run([train_op, loss, accuracy], feed_dict)
            loss_track.append(l)
            # Show the current predictions
            if epoch == 0 or epoch % 10 == 0:
                print('loss: {}'.format(sess.run(loss, feed_dict)))
                print('acc: {}'.format(sess.run(accuracy, feed_dict)))
                predict_ = sess.run(decoder_prediction, feed_dict)
                for i, (inp, pred) in enumerate(zip(texts, predict_.T)):
                    print('input > {}'.format(inp))
                    print('predicted > {}'.format([dict_id[id] for id in pred]))
                    if i >= 2:
                        break
        time_span = time.time() - time_start
        print('Training took {} s'.format(time_span))
        saver.save(sess, 'model/dl/model.ckpt')
    plt.plot(loss_track)
    plt.show()
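The script above only trains the autoencoder and saves a checkpoint; it does not show how to obtain the middle-layer text feature mentioned at the beginning. Below is a minimal sketch of that step. It assumes the graph objects (train_graph, encoder_inputs, encoder_final_state, saver) and the helper functions (get_dict, get_batches, make_batch) defined above are in scope, and that the checkpoint at 'model/dl/model.ckpt' is the one saved during training; the function name encode_texts is illustrative, not part of the original code.

def encode_texts(sess, texts):
    # Embed the character sequences with word2vec, time-major, exactly as during training.
    # The placeholder was defined with a fixed batch dimension, so texts must contain
    # exactly batch_size (here 5) sequences.
    encoder_inputs_ = make_batch(texts)
    # encoder_final_state is an LSTMStateTuple; its .h member is the final hidden state,
    # i.e. a [batch_size, num_units] matrix used here as the text feature
    return sess.run(encoder_final_state.h, {encoder_inputs: encoder_inputs_})

with tf.Session(graph=train_graph) as sess:
    saver.restore(sess, 'model/dl/model.ckpt')
    dict_char, dict_id = get_dict()
    gen_batches = get_batches('data/data_char.csv', dict_char, batch_size)
    texts, _ = next(gen_batches)
    features = encode_texts(sess, texts)
    print(features.shape)  # (5, 256): one 256-dimensional feature vector per text

These fixed-length vectors can then be fed into any downstream classifier, for example one trained on the label column that get_batches currently reads but does not return.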