LSTM-CNNs-CRF for NER and other NLP tasks
阿新 · Published 2018-12-10
After a careful read of the paper and the related code implementations, the idea works roughly as follows:
The model takes both word-level and char-level representations as input:
Word level: one sequence of words, input_word = tf.placeholder([None, seqlen]), i.e. the sentence after word segmentation, such as "我 在 吃飯" ("I am eating").
Char level: input_char = tf.placeholder([None, seqlen, maxchar_perword]). This records which characters each word in the word-level sequence is composed of. An English word can easily run to seven or eight characters; after Chinese word segmentation, most words have at most 4 characters, with the occasional 5. The corresponding input looks like
[[我], [在], [吃, 飯]], except that for training it is best to pad every example in a batch to the same length, as in the sketch below.
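To make the padding concrete, here is a minimal sketch; the helper pad_batch, the pad id 0, and the example ids are my own illustration, not from the original post:

import numpy as np

# Hypothetical helper: pad every sentence in a batch to seq_len words,
# and every word to maxchar_perword characters, using 0 as the pad id.
def pad_batch(batch_words, batch_chars, seq_len, maxchar_perword, pad_id=0):
    word_ids = np.full((len(batch_words), seq_len), pad_id, dtype=np.int32)
    char_ids = np.full((len(batch_words), seq_len, maxchar_perword),
                       pad_id, dtype=np.int32)
    for i, (words, chars) in enumerate(zip(batch_words, batch_chars)):
        for j, (w, cs) in enumerate(zip(words[:seq_len], chars[:seq_len])):
            word_ids[i, j] = w
            cs = cs[:maxchar_perword]
            char_ids[i, j, :len(cs)] = cs
    return word_ids, char_ids

# e.g. "我 在 吃飯" -> word ids [5, 9, 23], char ids [[5], [9], [23, 24]]
# pad_batch([[5, 9, 23]], [[[5], [9], [23, 24]]], seq_len=4, maxchar_perword=4)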
Both the word-level and char-level inputs then go through an embedding layer. After embedding, input_char becomes a 4-D tensor, which passes through a 2-D convolution, a ReLU, and a max-pool; the result is concatenated with the embedded word-level tensor, fed into an LSTM, and finally into a CRF layer. That is essentially the whole idea, and it is fairly simple to implement in TensorFlow.
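As a rough sketch of that char branch (TF 1.x style, to match the code below; the sizes seqlen, maxchar_perword, char_vocab_size, char_dim, and num_filters are illustrative guesses, not values from the paper):

import tensorflow as tf

seqlen, maxchar_perword = 50, 4
char_vocab_size, char_dim, num_filters = 5000, 30, 30

input_char = tf.placeholder(tf.int32, [None, seqlen, maxchar_perword])
char_table = tf.Variable(
    tf.random_uniform([char_vocab_size, char_dim], -1.0, 1.0))
# embedding lookup yields the 4-D tensor: [batch, seqlen, maxchar_perword, char_dim]
char_emb = tf.nn.embedding_lookup(char_table, input_char)

# fold batch and seqlen together so each word's characters form one "image"
x = tf.reshape(char_emb, [-1, maxchar_perword, char_dim, 1])
# 2-D convolution over a window of 2 chars, spanning the full char_dim
f = tf.get_variable("char_filter", [2, char_dim, 1, num_filters])
conv = tf.nn.relu(tf.nn.conv2d(x, f, strides=[1, 1, 1, 1], padding="VALID"))
# max-pool over the remaining char positions -> one feature vector per word
pool = tf.nn.max_pool(conv, ksize=[1, maxchar_perword - 1, 1, 1],
                      strides=[1, 1, 1, 1], padding="VALID")
char_feat = tf.reshape(pool, [-1, seqlen, num_filters])
# char_feat is then concatenated with the word embeddings before the LSTM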
I have also come across some other approaches online that likewise combine a CNN, an LSTM, and a CRF. The idea is roughly as follows:
Only the word-level input is used, i.e. input_word = tf.placeholder([None, seqlen]). It is fed through a BiLSTM and a CNN in parallel, the two outputs are concatenated, and the result goes into a CRF layer. I wrote a quick implementation of this variant; here is the code:
import tensorflow as tf
import numpy as np
from tensorflow.contrib import rnn


class BiLstmCnnCRF(object):
    def __init__(self, input_x, input_y, batch_size, num_tags,
                 word_vocab_size, word_embedd_dim, grad_clip, dropout,
                 regularization, seq_len, n_hidden_LSTM=200):
        self.word_vocab_size = word_vocab_size
        self.word_embedd_dim = word_embedd_dim
        self.input_x = input_x
        self.input_y = input_y
        self.batch_size = batch_size
        self.regularization = regularization
        self.dropout_keep_prob = dropout
        self.seq_len = seq_len
        # every sequence in the batch is assumed padded to seq_len
        self.max_sequence_in_batch = tf.constant(value=self.seq_len, dtype=tf.int32)
        self.sequence_lengths = tf.convert_to_tensor(
            self.batch_size * [self.max_sequence_in_batch], dtype=tf.int32)

        with tf.name_scope("word_embedding"):
            self.w_word = tf.Variable(
                tf.random_uniform([self.word_vocab_size, self.word_embedd_dim], -1, 1),
                trainable=True, name="w_word")
            self.embedded_words = tf.nn.embedding_lookup(
                self.w_word, self.input_x, name="embedded_words")

        with tf.name_scope("cnn"):
            # [batch_size, seq_len(80), word_embedd_dim(200), 1]
            cnn_input = tf.reshape(self.embedded_words,
                                   [-1, self.seq_len, self.word_embedd_dim, 1])
            cnn_filter = tf.get_variable(
                name="filter", shape=[1, 1, 2, 30],
                initializer=tf.random_uniform_initializer(-0.01, 0.01),
                dtype=tf.float32)
            cnn_bias = tf.get_variable(
                name="cnn_bias", shape=[30],
                initializer=tf.random_uniform_initializer(-0.01, 0.01),
                dtype=tf.float32)
            # stride 2 along the embedding axis: [batch_size, 80, 100, 30]
            # (the stride and pool size below hard-code word_embedd_dim == 200)
            cnn_network = tf.add(
                tf.nn.conv2d(cnn_input, cnn_filter, strides=[1, 1, 2, 1],
                             padding="VALID", name="conv"),
                cnn_bias)
            relu_applied = tf.nn.relu(cnn_network)
            # pool away the remaining embedding axis: [batch_size, 80, 1, 30]
            max_pool = tf.nn.max_pool(relu_applied, ksize=[1, 1, 100, 1],
                                      strides=[1, 1, 1, 1], padding="VALID")
            self.cnn_output = tf.reshape(max_pool, [-1, self.seq_len, 30])

        with tf.name_scope("biLSTM"):
            # forward and backward LSTM cells
            lstm_fw_cell = rnn.BasicLSTMCell(n_hidden_LSTM, state_is_tuple=True)
            lstm_bw_cell = rnn.BasicLSTMCell(n_hidden_LSTM, state_is_tuple=True)
            (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(
                lstm_fw_cell, lstm_bw_cell, self.embedded_words, dtype=tf.float32)
            # [batch_size, timesteps, 2 * n_hidden_LSTM]
            self.biLstm = tf.concat([output_fw, output_bw], axis=-1, name="biLstm")
            self.biLstm_clip = tf.clip_by_value(self.biLstm, -grad_clip, grad_clip)
            self.biLstm_dropout = tf.nn.dropout(self.biLstm_clip, self.dropout_keep_prob)

        with tf.name_scope("concat"):
            # join the CNN and BiLSTM features along the feature axis
            self.output_concat = tf.concat(
                [self.cnn_output, self.biLstm_dropout], axis=-1)

        with tf.name_scope("output"):
            W_out = tf.get_variable(
                "W_out", shape=[2 * n_hidden_LSTM + 30, num_tags],
                initializer=tf.contrib.layers.xavier_initializer())
            b_out = tf.Variable(tf.constant(0.0, shape=[num_tags]), name="b_out")
            # [batch_size * timesteps, 2 * n_hidden_LSTM + 30]
            self.biLstm_reshaped = tf.reshape(self.output_concat,
                                              [-1, 2 * n_hidden_LSTM + 30])
            # [batch_size * timesteps, 2*n_hidden_LSTM+30] x [2*n_hidden_LSTM+30, num_tags]
            # -> [batch_size * timesteps, num_tags]
            self.predictions = tf.nn.xw_plus_b(self.biLstm_reshaped, W_out, b_out,
                                               name="predictions")
            # [batch_size, max_seq_len, num_tags]
            self.logits = tf.reshape(self.predictions,
                                     [self.batch_size, -1, num_tags], name="logits")
            labels_softmax_argmax = tf.argmax(self.logits, axis=-1)
            self.pred = tf.cast(labels_softmax_argmax, tf.int32, name="pred")

        with tf.name_scope("l2loss"):
            self.tv = tf.trainable_variables()
            self.regularization_cost = self.regularization * tf.reduce_sum(
                [tf.nn.l2_loss(v) for v in self.tv])

        with tf.name_scope("loss"):
            log_likelihood, self.transition_params = tf.contrib.crf.crf_log_likelihood(
                self.logits, self.input_y, self.sequence_lengths)
            self.loss = tf.reduce_mean(-log_likelihood, name="loss") + self.regularization_cost
            self.train_op = tf.train.AdamOptimizer().minimize(self.loss)

        with tf.name_scope("crf_pred"):
            self.viterbi_sequence, viterbi_score = tf.contrib.crf.crf_decode(
                self.logits, self.transition_params, self.sequence_lengths)
This is just a quick sketch of one way to implement it.
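For completeness, a hypothetical driver for the class above could look like this; the shapes, vocabulary size, and random feed data are made up for illustration:

import numpy as np
import tensorflow as tf

# seq_len=80 and word_embedd_dim=200 match the shapes hard-coded in the CNN block
batch_size, seq_len, num_tags = 32, 80, 7
input_x = tf.placeholder(tf.int32, [batch_size, seq_len])
input_y = tf.placeholder(tf.int32, [batch_size, seq_len])

model = BiLstmCnnCRF(input_x, input_y, batch_size=batch_size, num_tags=num_tags,
                     word_vocab_size=20000, word_embedd_dim=200,
                     grad_clip=5.0, dropout=0.5, regularization=1e-4,
                     seq_len=seq_len)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # random ids stand in for a real padded batch
    feed = {input_x: np.random.randint(0, 20000, (batch_size, seq_len)),
            input_y: np.random.randint(0, num_tags, (batch_size, seq_len))}
    _, loss, tags = sess.run(
        [model.train_op, model.loss, model.viterbi_sequence], feed)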