1. 程式人生 > 實用技巧 >命名實體識別之bert+bilstm(基於tensorflow)

命名實體識別之bert+bilstm(基於tensorflow)

接下來我們繼續對官方基於bert的模型進行擴充套件,之前的可參考:

基於bert命名實體識別(一)資料處理

命名實體識別資料預處理

命名實體識別之建立訓練資料

命名實體識別之使用tensorflow的bert模型進行微調

命名實體識別之動態融合不同bert層的特徵(基於tensorflow)

直接看程式碼:

class MyModel:
  def __init__(self, config):
    self.config = config
    # 喂入模型的資料佔位符
    self.input_x_word = tf.placeholder(tf.int32, [None, None], name="
input_x_word") self.input_x_len = tf.placeholder(tf.int32, name='input_x_len') self.input_mask = tf.placeholder(tf.int32, [None, None], name='input_mask') self.input_relation = tf.placeholder(tf.int32, [None, None], name='input_relation') # 實體NER的真實標籤 self.keep_prob = tf.placeholder(tf.float32, name='
dropout_keep_prob') self.is_training = tf.placeholder(tf.bool, None, name='is_training') self.initializer = initializers.xavier_initializer() self.lstm_dim = self.config.lstm_dim self.relation_num = self.config.relation_num self.num_steps = tf.shape(self.input_x_word)[-1] print
("self.num_steps.shape[-1]:",tf.shape(self.input_x_word)[-1]) self.bert_embed(bert_init=True) def biLSTM_layer(self, lstm_inputs, lstm_dim, lengths, name=None): """ :param lstm_inputs: [batch_size, num_steps, emb_size] :return: [batch_size, num_steps, 2*lstm_dim] """ with tf.name_scope("char_BiLSTM" if not name else name): lstm_cell = {} for direction in ["forward", "backward"]: with tf.name_scope(direction): lstm_cell[direction] = rnn.CoupledInputForgetGateLSTMCell( lstm_dim, use_peepholes=True, initializer=self.initializer, state_is_tuple=True) outputs, final_states = tf.nn.bidirectional_dynamic_rnn( lstm_cell["forward"], lstm_cell["backward"], lstm_inputs, dtype=tf.float32, sequence_length=lengths) return tf.concat(outputs, axis=2) def project_layer(self, lstm_outputs, name=None): """ hidden layer between lstm layer and logits :param lstm_outputs: [batch_size, num_steps, emb_size] :return: [batch_size, num_steps, num_tags] """ with tf.name_scope("project" if not name else name): with tf.name_scope("hidden"): W = tf.get_variable("HW", shape=[self.lstm_dim * 2, self.lstm_dim], dtype=tf.float32, initializer=self.initializer) b = tf.get_variable("Hb", shape=[self.lstm_dim], dtype=tf.float32, initializer=tf.zeros_initializer()) output = tf.reshape(lstm_outputs, shape=[-1, self.lstm_dim * 2]) hidden = tf.tanh(tf.nn.xw_plus_b(output, W, b)) # project to score of tags with tf.name_scope("logits"): W = tf.get_variable("LW", shape=[self.lstm_dim, self.relation_num], dtype=tf.float32, initializer=self.initializer) b = tf.get_variable("Lb", shape=[self.relation_num], dtype=tf.float32, initializer=tf.zeros_initializer()) pred = tf.nn.xw_plus_b(hidden, W, b) return tf.reshape(pred, [-1, self.num_steps, self.relation_num], name='pred_logits') def loss_without_crf(self, output_layer, num_labels, bert_init=True): with tf.variable_scope("loss"): self.logits = output_layer self.probabilities = tf.nn.softmax(self.logits, axis=-1) log_probs = tf.nn.log_softmax(self.logits, axis=-1) # [?,11] print("log_probs.shape:",log_probs.shape) self.predictions = tf.argmax(self.logits, axis=-1, name="predictions") one_hot_labels = tf.one_hot(self.input_relation, depth=num_labels, dtype=tf.float32) # [?,512,11] #print(one_hot_labels) #print("one_hot_labels.shape:",one_hot_labels.shape) self.per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) #print("self.per_example_loss.shape:",self.per_example_loss.shape) self.loss = tf.reduce_mean(self.per_example_loss) print(self.loss) #print("self.loss.shape:",self.loss.shape) tvars = tf.trainable_variables() init_checkpoint = self.config.bert_file assignment_map, initialized_variable_names = get_assignment_map_from_checkpoint(tvars, init_checkpoint) if bert_init: tf.train.init_from_checkpoint(init_checkpoint, assignment_map) tf.logging.info("**** Trainable Variables ****") for var in tvars: init_string = "" if var.name in initialized_variable_names: init_string = ", *INIT_FROM_CKPT*" print(" name = {}, shape = {}{}".format(var.name, var.shape, init_string)) print('init bert from checkpoint: {}'.format(init_checkpoint)) #return self.loss, self.per_example_loss, self.logits, self.probabilities def bert_embed(self, bert_init=True): """ 讀取BERT的TF模型 :param bert_init: :return: """ num_labels = self.config.relation_num bert_config_file = self.config.bert_config_file bert_config = BertConfig.from_json_file(bert_config_file) model = BertModel( config=bert_config, is_training=self.is_training, # 微調 input_ids=self.input_x_word, input_mask=self.input_mask, token_type_ids=None, use_one_hot_embeddings=False) # If you want to use the token-level output, use model.get_sequence_output() # output_layer = model.get_pooled_output() # [?,768] # print("output_layer.shape:",output_layer) used = tf.sign(tf.abs(self.input_x_word)) length = tf.reduce_sum(used, reduction_indices=1) self.lengths = tf.cast(length, tf.int32) output_layer = model.get_sequence_output() lstm_inputs = tf.nn.dropout(output_layer, 0.9) output_layer = self.biLSTM_layer(lstm_inputs, self.lstm_dim, self.lengths) output_layer = self.project_layer(output_layer) print("output_layer.shape:", output_layer.shape) self.loss_without_crf(output_layer, num_labels) import sys sys.exit(0)

結果:

WARNING:tensorflow:From /content/drive/My Drive/Deep-Learning-With-Python/chapter8/CCF_ner/optimization.py:155: The name tf.train.AdamOptimizer is deprecated. Please use tf.compat.v1.train.AdamOptimizer instead.

WARNING:tensorflow:From /content/drive/My Drive/Deep-Learning-With-Python/chapter8/CCF_ner/bert/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.

{'', '', '', '', 'Q', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'Z', 'U', '', 'X', '', '', 'F', 'L', '', '', '', '', '', '', '', '', '', '', '', '', 'I', '', '', '', '', '', '', 'T', '', '', '', '', '', 'G', '', 'O', 'V', '', '', '', '`', '', '', '', '', '', 'W', 'S', 'N', '', '', '', '', 'K', '', '', '', '', 'Y', 'R', '', '', '', '', '', '', '', '', 'D', '', '', '', '', '', '', '', 'E', '', '', 'B', 'J', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '\ue40a', '', '', '', '', '', '', '', '', 'C', '', '', '', '', '', '', '', '', '', '', 'P', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'M', 'A', 'H', '', '', '', '', ''}
8012
{'', 'C', 'T', '', 'Q', 'D', 'G', '', '', 'P', '', '', '', 'E', 'O', 'V', '', 'Z', 'J', 'B', 'U', 'X', '', 'F', 'L', '', 'W', '', 'N', 'S', 'K', '', '', 'A', 'H', 'Y', 'M', 'R', 'I'}
1105
WARNING:tensorflow:From test_bert.py:388: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.

2020-12-13 14:07:47.209770: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-12-13 14:07:47.265991: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-12-13 14:07:47.266613: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Found device 0 with properties: 
name: Tesla T4 major: 7 minor: 5 memoryClockRate(GHz): 1.59
pciBusID: 0000:00:04.0
2020-12-13 14:07:47.266923: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-12-13 14:07:47.493085: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2020-12-13 14:07:47.621614: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10
2020-12-13 14:07:47.641392: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcurand.so.10
2020-12-13 14:07:47.925153: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusolver.so.10
2020-12-13 14:07:47.943921: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusparse.so.10
2020-12-13 14:07:48.468415: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2020-12-13 14:07:48.468625: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-12-13 14:07:48.469411: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-12-13 14:07:48.470004: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1767] Adding visible gpu devices: 0
2020-12-13 14:07:48.525931: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2200000000 Hz
2020-12-13 14:07:48.526210: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x270ef40 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2020-12-13 14:07:48.526244: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2020-12-13 14:07:48.677879: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-12-13 14:07:48.678754: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x270f100 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2020-12-13 14:07:48.678790: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
2020-12-13 14:07:48.679588: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-12-13 14:07:48.680198: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Found device 0 with properties: 
name: Tesla T4 major: 7 minor: 5 memoryClockRate(GHz): 1.59
pciBusID: 0000:00:04.0
2020-12-13 14:07:48.680265: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-12-13 14:07:48.680295: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2020-12-13 14:07:48.680319: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10
2020-12-13 14:07:48.680346: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcurand.so.10
2020-12-13 14:07:48.680371: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusolver.so.10
2020-12-13 14:07:48.680393: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusparse.so.10
2020-12-13 14:07:48.680416: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2020-12-13 14:07:48.680497: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-12-13 14:07:48.681158: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-12-13 14:07:48.681699: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1767] Adding visible gpu devices: 0
2020-12-13 14:07:48.684658: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-12-13 14:07:48.686073: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1180] Device interconnect StreamExecutor with strength 1 edge matrix:
2020-12-13 14:07:48.686103: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1186]      0 
2020-12-13 14:07:48.686114: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1199] 0:   N 
2020-12-13 14:07:48.687110: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-12-13 14:07:48.687768: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-12-13 14:07:48.688359: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2020-12-13 14:07:48.688404: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1325] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 14221 MB memory) -> physical GPU (device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5)
WARNING:tensorflow:From test_bert.py:176: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

self.num_steps.shape[-1]: Tensor("strided_slice_1:0", shape=(), dtype=int32)
WARNING:tensorflow:From /content/drive/My Drive/Deep-Learning-With-Python/chapter8/CCF_ner/tf_utils/bert_modeling.py:175: The name tf.variable_scope is deprecated. Please use tf.compat.v1.variable_scope instead.

WARNING:tensorflow:From /content/drive/My Drive/Deep-Learning-With-Python/chapter8/CCF_ner/tf_utils/bert_modeling.py:416: The name tf.get_variable is deprecated. Please use tf.compat.v1.get_variable instead.

WARNING:tensorflow:From /content/drive/My Drive/Deep-Learning-With-Python/chapter8/CCF_ner/tf_utils/bert_modeling.py:497: The name tf.assert_less_equal is deprecated. Please use tf.compat.v1.assert_less_equal instead.

WARNING:tensorflow:
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

WARNING:tensorflow:From /content/drive/My Drive/Deep-Learning-With-Python/chapter8/CCF_ner/tf_utils/bert_modeling.py:364: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
WARNING:tensorflow:From /content/drive/My Drive/Deep-Learning-With-Python/chapter8/CCF_ner/tf_utils/bert_modeling.py:874: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.Dense instead.
WARNING:tensorflow:From /tensorflow-1.15.2/python3.6/tensorflow_core/python/layers/core.py:187: Layer.apply (from tensorflow.python.keras.engine.base_layer) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `layer.__call__` method instead.
WARNING:tensorflow:From /content/drive/My Drive/Deep-Learning-With-Python/chapter8/CCF_ner/tf_utils/bert_modeling.py:282: The name tf.erf is deprecated. Please use tf.math.erf instead.

WARNING:tensorflow:From test_bert.py:209: bidirectional_dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
WARNING:tensorflow:From /tensorflow-1.15.2/python3.6/tensorflow_core/python/ops/rnn.py:464: dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
WARNING:tensorflow:Entity <bound method CoupledInputForgetGateLSTMCell.call of <tf_utils.rnncell.CoupledInputForgetGateLSTMCell object at 0x7f297e2d2eb8>> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: module 'gast' has no attribute 'Num'
WARNING:tensorflow:From /tensorflow-1.15.2/python3.6/tensorflow_core/python/ops/rnn.py:244: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
WARNING:tensorflow:Entity <bound method CoupledInputForgetGateLSTMCell.call of <tf_utils.rnncell.CoupledInputForgetGateLSTMCell object at 0x7f297e2d2fd0>> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: module 'gast' has no attribute 'Num'
WARNING:tensorflow:From test_bert.py:225: The name tf.nn.xw_plus_b is deprecated. Please use tf.compat.v1.nn.xw_plus_b instead.

output_layer.shape: (?, ?, 11)
log_probs.shape: (?, ?, 11)
self.per_example_loss.shape: (?, ?)
self.loss.shape: ()
WARNING:tensorflow:From test_bert.py:255: The name tf.trainable_variables is deprecated. Please use tf.compat.v1.trainable_variables instead.

WARNING:tensorflow:From test_bert.py:259: The name tf.train.init_from_checkpoint is deprecated. Please use tf.compat.v1.train.init_from_checkpoint instead.

WARNING:tensorflow:From test_bert.py:260: The name tf.logging.info is deprecated. Please use tf.compat.v1.logging.info instead.

  name = bert/embeddings/word_embeddings:0, shape = (21128, 768), *INIT_FROM_CKPT*
  name = bert/embeddings/token_type_embeddings:0, shape = (2, 768), *INIT_FROM_CKPT*
  name = bert/embeddings/position_embeddings:0, shape = (512, 768), *INIT_FROM_CKPT*
  name = bert/embeddings/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/embeddings/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_0/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_0/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_0/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_0/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_0/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_0/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_0/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_0/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_0/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_0/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_0/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
  name = bert/encoder/layer_0/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_0/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_0/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_0/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_0/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_1/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_1/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_1/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_1/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_1/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_1/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_1/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_1/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_1/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_1/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_1/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
  name = bert/encoder/layer_1/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_1/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_1/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_1/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_1/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_2/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_2/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_2/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_2/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_2/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_2/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_2/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_2/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_2/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_2/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_2/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
  name = bert/encoder/layer_2/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_2/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_2/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_2/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_2/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_3/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_3/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_3/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_3/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_3/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_3/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_3/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_3/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_3/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_3/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_3/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
  name = bert/encoder/layer_3/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_3/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_3/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_3/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_3/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_4/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_4/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_4/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_4/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_4/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_4/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_4/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_4/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_4/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_4/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_4/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
  name = bert/encoder/layer_4/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_4/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_4/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_4/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_4/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_5/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_5/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_5/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_5/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_5/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_5/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_5/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_5/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_5/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_5/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_5/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
  name = bert/encoder/layer_5/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_5/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_5/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_5/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_5/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_6/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_6/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_6/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_6/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_6/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_6/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_6/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_6/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_6/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_6/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_6/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
  name = bert/encoder/layer_6/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_6/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_6/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_6/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_6/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_7/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_7/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_7/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_7/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_7/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_7/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_7/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_7/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_7/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_7/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_7/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
  name = bert/encoder/layer_7/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_7/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_7/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_7/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_7/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_8/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_8/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_8/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_8/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_8/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_8/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_8/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_8/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_8/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_8/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_8/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
  name = bert/encoder/layer_8/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_8/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_8/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_8/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_8/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_9/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_9/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_9/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_9/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_9/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_9/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_9/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_9/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_9/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_9/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_9/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
  name = bert/encoder/layer_9/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_9/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_9/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_9/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_9/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_10/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_10/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_10/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_10/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_10/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_10/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_10/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_10/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_10/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_10/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_10/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
  name = bert/encoder/layer_10/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_10/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_10/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_10/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_10/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_11/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_11/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_11/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_11/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_11/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_11/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_11/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_11/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_11/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_11/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_11/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
  name = bert/encoder/layer_11/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_11/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_11/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_11/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_11/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/pooler/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/pooler/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_w_xi:0, shape = (768, 256)
  name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_w_hi:0, shape = (256, 256)
  name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_w_ci:0, shape = (256, 256)
  name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_w_xo:0, shape = (768, 256)
  name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_w_ho:0, shape = (256, 256)
  name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_w_co:0, shape = (256, 256)
  name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_w_xc:0, shape = (768, 256)
  name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_w_hc:0, shape = (256, 256)
  name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_b_i:0, shape = (256,)
  name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_b_c:0, shape = (256,)
  name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_b_o:0, shape = (256,)
  name = bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_w_xi:0, shape = (768, 256)
  name = bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_w_hi:0, shape = (256, 256)
  name = bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_w_ci:0, shape = (256, 256)
  name = bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_w_xo:0, shape = (768, 256)
  name = bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_w_ho:0, shape = (256, 256)
  name = bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_w_co:0, shape = (256, 256)
  name = bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_w_xc:0, shape = (768, 256)
  name = bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_w_hc:0, shape = (256, 256)
  name = bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_b_i:0, shape = (256,)
  name = bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_b_c:0, shape = (256,)
  name = bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_b_o:0, shape = (256,)
  name = HW:0, shape = (512, 256)
  name = Hb:0, shape = (256,)
  name = LW:0, shape = (256, 11)
  name = Lb:0, shape = (11,)
init bert from checkpoint: /content/drive/MyDrive/Deep-Learning-With-Python/chapter8/CCF_ner/bert_pretrained/bert_model.ckpt
WARNING:tensorflow:From test_bert.py:392: The name tf.train.exponential_decay is deprecated. Please use tf.compat.v1.train.exponential_decay instead.

bert train variable num: 199
normal train variable num: 26
word2vec trainable!!

說明:

我們可以直接呼叫官方的tensorflow的bert模型來使用bert,接下來,我們使用output_layer=model.get_sequence_output()來獲得最後一層的特徵,然後接下來在新增bilstm層,

對於bilstm的前向和反向的輸出進行拼接後,經過一個project_layer()函式計算logits,最後再經過一個損失層計算損失和其它的一些預測的值等。同時我們要將預訓練bert模型的引數匯入到bert中。

這裡面我們可以通過這種方式計算每個序列的長度:

used = tf.sign(tf.abs(self.input_x_word))
length
= tf.reduce_sum(used, reduction_indices=1) self.lengths = tf.cast(length, tf.int32)

當然,在喂入資料的時候,我們也已經傳入了長度了,可以酌情使用。

當bert+bilstm之後,一般而言bert微調的學習率和bilstm的學習率是要設定成不同的,比如一下程式碼:

# 超引數設定
            global_step = tf.Variable(0, name='step', trainable=False)
            learning_rate = tf.train.exponential_decay(config.learning_rate, global_step, config.decay_step,
                                                       config.decay_rate, staircase=True)

            normal_optimizer = tf.train.AdamOptimizer(learning_rate)  # 下接結構的學習率

            all_variables = graph.get_collection('trainable_variables')
            word2vec_var_list = [x for x in all_variables if 'bert' in x.name]  # BERT的引數
            normal_var_list = [x for x in all_variables if 'bert' not in x.name]  # 下接結構的引數
            print('bert train variable num: {}'.format(len(word2vec_var_list)))
            print('normal train variable num: {}'.format(len(normal_var_list)))
            normal_op = normal_optimizer.minimize(model.loss, global_step=global_step, var_list=normal_var_list)
            num_batch = int(train_iter.num_records / config.batch_size * config.train_epoch)
            embed_step = tf.Variable(0, name='step', trainable=False)
            if word2vec_var_list:  # 對BERT微調
                print('word2vec trainable!!')
                word2vec_op, embed_learning_rate, embed_step = create_optimizer(
                    model.loss, config.embed_learning_rate, num_train_steps=num_batch,
                    num_warmup_steps=int(num_batch * 0.05) , use_tpu=False ,  variable_list=word2vec_var_list
                )

                train_op = tf.group(normal_op, word2vec_op)  # 組裝BERT與下接結構引數
            else:
                train_op = normal_op

一般bert+bilstm之後還需要接一個crf(條件隨機場),我們下節繼續。