1. 程式人生 > >【TensorFlow】多GPU訓練:示例程式碼解析



兩個GPU比單個GPU加速了近一倍 :






  • CPU 做為引數伺服器
  • 多個GPU計算彙總更新


# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""A binary to train CIFAR-10 using multiple GPUs with synchronous updates.

Accuracy:
cifar10_multi_gpu_train.py achieves ~86% accuracy after 100K steps (256
epochs of data) as judged by cifar10_eval.py.

Speed: With batch_size 128.

Reference training times:

System        | Step Time (sec/batch)  |     Accuracy
--------------------------------------------------------------------
1 Tesla K20m  | 0.35-0.60              | ~86% at 60K steps  (5 hours)
1 Tesla K40m  | 0.25-0.35              | ~86% at 100K steps (4 hours)
2 Tesla K20m  | 0.13-0.20              | ~84% at 30K steps  (2.5 hours)
3 Tesla K20m  | 0.13-0.18              | ~84% at 30K steps
4 Tesla K20m  | ~0.10                  | ~84% at 30K steps
"""
# Python 2/3 compatibility shims.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from datetime import datetime  # timestamps for the progress log
import os.path  # used to build the checkpoint path
import re  # regular expressions, used to clean up summary names
import time

import numpy as np
# py2/py3-compatible range; every xrange in this file could also be replaced
# by range.
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

# Project-local cifar10.py: data input, model construction, loss and training
# helpers.
import cifar10


# Parsed command-line flags; the values defined below are read back through
# this object.
FLAGS = tf.app.flags.FLAGS

# Output directory for event logs and checkpoints.
tf.app.flags.DEFINE_string(
    'train_dir',
    './your/path/to/data/cifar10_train',
    'Directory where to write event logs and checkpoint.')

# Upper bound on the number of training steps.
tf.app.flags.DEFINE_integer(
    'max_steps',
    1000000,
    'Number of batches to run.')

# Number of GPU towers to build.
tf.app.flags.DEFINE_integer(
    'num_gpus',
    1,
    'How many GPUs to use.')

# Passed through to the session configuration.
tf.app.flags.DEFINE_boolean(
    'log_device_placement',
    False,
    'Whether to log device placement.')



def tower_loss(scope, images, labels):
  """Calculate the total loss on a single tower running the CIFAR model.

  Args:
    scope: name scope of the specific tower, e.g. 'tower_0'. Used to select
      only this tower's entries from the 'losses' collection.
    images: Images. 4D tensor of shape [batch_size, height, width, 3].
    labels: Labels. 1D tensor of shape [batch_size].

  Returns:
    Tensor of shape [] containing the total loss for one batch of data.
  """
  # Build the inference graph for this tower.
  logits = cifar10.inference(images)

  # Build the loss portion of the graph. The return value is discarded on
  # purpose: the individual loss terms are read back from the 'losses'
  # collection below (presumably cifar10.loss registers them there -- confirm
  # against cifar10.py).
  _ = cifar10.loss(logits, labels)

  # Gather the losses that belong to the current tower only.
  losses = tf.get_collection('losses', scope)

  # Sum them into the total loss for the current tower.
  total_loss = tf.add_n(losses, name='total_loss')

  # Attach a scalar summary to every individual loss and to the total loss.
  for loss_tensor in losses + [total_loss]:
    # Strip the 'tower_[0-9]/' prefix from the name so that the same loss from
    # different towers shares one clean tag in TensorBoard.
    loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '',
                       loss_tensor.op.name)
    tf.summary.scalar(loss_name, loss_tensor)

  return total_loss

tower_loss 在兩塊GPU上分別返回的 total_loss 張量如下:

    Tensor("tower_0/total_loss_1:0", shape=(), dtype=float32, device=/device:GPU:0)
    Tensor("tower_1/total_loss_1:0", shape=(), dtype=float32, device=/device:GPU:1)


- 首先讀入每個GPU(Tower)中的(梯度,變數),這些變數按照GPU分為多個子列表儲存:[[GPU0], ..., [GPUn]]
- 將不同GPU中的同一個變數及其梯度((grad0_gpu0, var0_gpu0), ..., (grad0_gpun, var0_gpun))抽取出來,在tower維度上對梯度求平均,得到該變數的平均梯度。

def average_gradients(tower_grads):
  """Calculate the average gradient for each shared variable across all towers.

  Note that this function provides a synchronization point across all towers.

  Args:
    tower_grads: List of lists of (gradient, variable) tuples. The outer list
      is over towers (one entry per GPU); each inner list holds the
      (gradient, variable) pairs of the whole model as computed on that tower:
        tower_grads = [tower0_grads, tower1_grads, ...]
        towerN_grads = [(grad0, variable0), ..., (gradM, variableM)]

  Returns:
    List of pairs of (gradient, variable) where the gradient has been averaged
    across all towers.
  """
  # Example with two GPUs (entries abbreviated):
  #   tower_grads = [
  #     [(<tf.Tensor 'tower_0/gradients/tower_0/conv1/Conv2D_grad/...'
  #         shape=(5, 5, 3, 64)>,
  #       <tf.Variable 'conv1/weights:0' shape=(5, 5, 3, 64)>),
  #      (<tf.Tensor 'tower_0/gradients/tower_0/conv1/BiasAdd_grad/...'
  #         shape=(64,)>,
  #       <tf.Variable 'conv1/biases:0' shape=(64,)>),
  #      ...,
  #      (<tf.Tensor 'tower_0/gradients/tower_0/softmax_linear/...'
  #         shape=(10,)>,
  #       <tf.Variable 'softmax_linear/biases:0' shape=(10,)>)],
  #     [(<tf.Tensor 'tower_1/gradients/tower_1/conv1/Conv2D_grad/...'
  #         shape=(5, 5, 3, 64)>,
  #       <tf.Variable 'conv1/weights:0' shape=(5, 5, 3, 64)>),
  #      ...,
  #      (<tf.Tensor 'tower_1/gradients/tower_1/softmax_linear/...'
  #         shape=(10,)>,
  #       <tf.Variable 'softmax_linear/biases:0' shape=(10,)>)],
  #   ]
  average_grads = []
  # zip(*tower_grads) regroups the pairs per variable, so each iteration sees
  #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
  # i.e. the same variable together with its gradient from every tower.
  for grad_and_vars in zip(*tower_grads):
    grads = []
    for g, _ in grad_and_vars:
      # Add a leading dimension to the gradient to represent the tower.
      expanded_g = tf.expand_dims(g, 0)

      # Collect along the 'tower' dimension which is averaged over below.
      grads.append(expanded_g)

    # Stack the per-tower gradients and average over the 'tower' dimension.
    grad = tf.concat(axis=0, values=grads)
    grad = tf.reduce_mean(grad, 0)

    # The variables are shared across towers and therefore redundant, so only
    # the first tower's reference to the variable is returned.
    v = grad_and_vars[0][1]
    grad_and_var = (grad, v)
    average_grads.append(grad_and_var)
  return average_grads
     >>> print(average_grads)
    [(<tf.Tensor 'Mean:0' shape=(5, 5, 3, 64) dtype=float32>, <tf.Variable 'conv1/weights:0' shape=(5, 5, 3, 64) dtype=float32_ref>),
	 (<tf.Tensor 'Mean_1:0' shape=(64,) dtype=float32>, <tf.Variable 'conv1/biases:0' shape=(64,) dtype=float32_ref>),
	 (<tf.Tensor 'Mean_2:0' shape=(5, 5, 64, 64) dtype=float32>, <tf.Variable 'conv2/weights:0' shape=(5, 5, 64, 64) dtype=float32_ref>),
	 (<tf.Tensor 'Mean_3:0' shape=(64,) dtype=float32>, <tf.Variable 'conv2/biases:0' shape=(64,) dtype=float32_ref>),
	 (<tf.Tensor 'Mean_4:0' shape=(2304, 384) dtype=float32>, <tf.Variable 'local3/weights:0' shape=(2304, 384) dtype=float32_ref>), 
	 (<tf.Tensor 'Mean_5:0' shape=(384,) dtype=float32>, <tf.Variable 'local3/biases:0' shape=(384,) dtype=float32_ref>), 
	 (<tf.Tensor 'Mean_6:0' shape=(384, 192) dtype=float32>, <tf.Variable 'local4/weights:0' shape=(384, 192) dtype=float32_ref>), 
	 (<tf.Tensor 'Mean_7:0' shape=(192,) dtype=float32>, <tf.Variable 'local4/biases:0' shape=(192,) dtype=float32_ref>), 
	 (<tf.Tensor 'Mean_8:0' shape=(192, 10) dtype=float32>, <tf.Variable 'softmax_linear/weights:0' shape=(192, 10) dtype=float32_ref>), 
	 (<tf.Tensor 'Mean_9:0' shape=(10,) dtype=float32>, <tf.Variable 'softmax_linear/biases:0' shape=(10,) dtype=float32_ref>)]



def train():
  """Train CIFAR-10 for a number of steps.

  Builds one model tower per GPU (FLAGS.num_gpus) with variables kept on the
  CPU, averages the per-tower gradients, applies them with gradient descent,
  and then runs FLAGS.max_steps training steps while periodically printing
  progress, writing summaries and saving checkpoints to FLAGS.train_dir.
  """
  with tf.Graph().as_default(), tf.device('/cpu:0'):
    # Create a variable to count the number of train() calls. This equals the
    # number of batches processed * FLAGS.num_gpus.
    global_step = tf.get_variable(
        'global_step', [],
        initializer=tf.constant_initializer(0), trainable=False)

    # Calculate the learning rate schedule.
    num_batches_per_epoch = (cifar10.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                             FLAGS.batch_size / FLAGS.num_gpus)
    decay_steps = int(num_batches_per_epoch * cifar10.NUM_EPOCHS_PER_DECAY)

    # Decay the learning rate exponentially based on the number of steps.
    # NOTE(review): the decay factor is taken from
    # cifar10.LEARNING_RATE_DECAY_FACTOR as in the upstream tutorial -- confirm
    # that the local cifar10.py defines it.
    lr = tf.train.exponential_decay(cifar10.INITIAL_LEARNING_RATE,
                                    global_step,
                                    decay_steps,
                                    cifar10.LEARNING_RATE_DECAY_FACTOR,
                                    staircase=True)

    # Create an optimizer that performs gradient descent.
    opt = tf.train.GradientDescentOptimizer(lr)

    # Batched image and label inputs, prefetched into a queue that holds up to
    # two batches per GPU.
    images, labels = cifar10.distorted_inputs()
    batch_queue = tf.contrib.slim.prefetch_queue.prefetch_queue(
        [images, labels], capacity=2 * FLAGS.num_gpus)

    # Calculate the gradients on every GPU and collect them in tower_grads.
    tower_grads = []
    with tf.variable_scope(tf.get_variable_scope()):
      for i in xrange(FLAGS.num_gpus):
        with tf.device('/gpu:%d' % i):
          with tf.name_scope('%s_%d' % (cifar10.TOWER_NAME, i)) as scope:
            # Dequeues one batch for the GPU.
            image_batch, label_batch = batch_queue.dequeue()
            # Calculate the loss for one tower of the CIFAR model. This
            # function constructs the entire CIFAR model but shares the
            # variables across all towers.
            loss = tower_loss(scope, image_batch, label_batch)

            # Reuse variables for the next tower.
            tf.get_variable_scope().reuse_variables()

            # Retain the summaries from the final tower (overwritten on every
            # iteration, so only the last tower's summaries survive).
            summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)

            # Calculate the gradients for the batch of data on this CIFAR
            # tower.
            grads = opt.compute_gradients(loss)

            # Keep track of the gradients across all towers.
            tower_grads.append(grads)

    # Calculate the mean of each gradient. Note that this is the
    # synchronization point across all towers.
    grads = average_gradients(tower_grads)

    # Show the learning rate in TensorBoard.
    summaries.append(tf.summary.scalar('learning_rate', lr))

    # Histogram summaries for the averaged gradients.
    for grad, var in grads:
      if grad is not None:
        summaries.append(
            tf.summary.histogram(var.op.name + '/gradients', grad))

    # Apply the averaged gradients to adjust the shared variables.
    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

    # Histogram summaries for the trainable variables.
    for var in tf.trainable_variables():
      summaries.append(tf.summary.histogram(var.op.name, var))

    # Track the moving averages of all trainable variables.
    variable_averages = tf.train.ExponentialMovingAverage(
        cifar10.MOVING_AVERAGE_DECAY, global_step)
    variables_averages_op = variable_averages.apply(tf.trainable_variables())

    # Group all updates into a single train op.
    train_op = tf.group(apply_gradient_op, variables_averages_op)

    # Create a saver for checkpoints.
    saver = tf.train.Saver(tf.global_variables())

    # Build the summary operation from the collected summaries.
    summary_op = tf.summary.merge(summaries)

    # Build an initialization operation to run below.
    init = tf.global_variables_initializer()

    # Start running operations on the Graph. allow_soft_placement must be set
    # to True to build towers on GPU, as some of the ops do not have GPU
    # implementations.
    sess = tf.Session(config=tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # Start the queue runners that feed the input pipeline.
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      # `loss` is the loss tensor of the last tower built above.
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      # ------- information reported at the different check steps -------
      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = duration / FLAGS.num_gpus

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print(format_str % (datetime.now(), step, loss_value,
                            examples_per_sec, sec_per_batch))

      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)

      # Save the model checkpoint periodically and on the final step.
      if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)


def main(argv=None):  # pylint: disable=unused-argument
  """Prepare the data set and the output directory, then start training.

  Args:
    argv: Unused; accepted because tf.app.run() passes the remaining
      command-line arguments to main.
  """
  # Download the data set if it is not present; defined in cifar10.py.
  cifar10.maybe_download_and_extract()
  # NOTE(review): restored from the upstream tutorial -- an existing train_dir
  # is wiped so every run starts with fresh event logs and checkpoints.
  if tf.gfile.Exists(FLAGS.train_dir):
    tf.gfile.DeleteRecursively(FLAGS.train_dir)
  tf.gfile.MakeDirs(FLAGS.train_dir)
  train()


if __name__ == '__main__':
  # Parses the command-line flags and then calls main().
  tf.app.run()

pic from pexels.com
