89. Parallel computation with GPUs in TensorFlow
阿新 · Published: 2019-02-20
'''
Created on May 25, 2017

@author: p0079482
'''
# Parallel training of a deep learning model on the multiple GPUs of a single machine.
from datetime import datetime
import os
import time

import tensorflow as tf
import mnist_inference

# Configuration used for training the network.
BATCH_SIZE = 100
LEARNING_RATE_BASE = 0.001
LEARNING_RATE_DECAY = 0.99
REGULARIZATION_RATE = 0.0001
TRAINING_STEPS = 1000
MOVING_AVERAGE_DECAY = 0.99
N_GPU = 4

# Paths for logs and model checkpoints.
MODEL_SAVE_PATH = "/path/to/logs_and_models/"
MODEL_NAME = "model.ckpt"

# Path of the training data. Because each GPU needs its own training batch, feeding the
# data through placeholders would require preparing several copies by hand. To simplify
# data loading, the input-queue approach from Chapter 7 is used to read the data from a
# TFRecord file, so the path below points to the MNIST training data after it has been
# converted to TFRecord format. Converting MNIST to TFRecords is covered in detail in
# Chapter 7 and is not repeated here.
DATA_PATH = "/path/to/output.tfrecords"

# Build the input queue that produces training batches; see Chapter 7 for details.
def get_input():
    filename_queue = tf.train.string_input_producer([DATA_PATH])
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)

    # Parsing specification for the serialized examples.
    features = tf.parse_single_example(
        serialized_example,
        features={
            'image_raw': tf.FixedLenFeature([], tf.string),
            'pixels': tf.FixedLenFeature([], tf.int64),
            'label': tf.FixedLenFeature([], tf.int64),
        })

    # Decode the image and the label.
    decoded_image = tf.decode_raw(features['image_raw'], tf.uint8)
    reshaped_image = tf.reshape(decoded_image, [784])
    retyped_image = tf.cast(reshaped_image, tf.float32)
    label = tf.cast(features['label'], tf.int32)

    # Define the input queue and return a shuffled batch.
    min_after_dequeue = 10000
    capacity = min_after_dequeue + 3 * BATCH_SIZE
    return tf.train.shuffle_batch(
        [retyped_image, label],
        batch_size=BATCH_SIZE,
        capacity=capacity,
        min_after_dequeue=min_after_dequeue)

# Loss function. Given the training data, a regularizer and a name scope, compute the
# total loss inside that scope. The scope is needed because every GPU adds its
# regularization losses to the collection named 'losses'; without filtering by scope,
# the regularization losses of all GPUs would be summed together.
def get_loss(x, y_, regularizer, scope, reuse_variables=None):
    # Reuse the forward-propagation function defined in Section 5.5.
    with tf.variable_scope(tf.get_variable_scope(), reuse=reuse_variables):
        y = mnist_inference.inference(x, regularizer)
    # Cross-entropy loss.
    cross_entropy = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y, labels=y_))
    # Regularization loss computed on the current GPU.
    regularization_loss = tf.add_n(tf.get_collection('losses', scope))
    # Total loss.
    loss = cross_entropy + regularization_loss
    return loss

# Average the gradients computed on the different GPUs.
def average_gradients(tower_grads):
    average_grads = []

    # Enumerate every variable together with the gradients the different GPUs
    # computed for it.
    for grad_and_vars in zip(*tower_grads):
        # Average this variable's gradient over all GPUs.
        grads = []
        for g, _ in grad_and_vars:
            expanded_g = tf.expand_dims(g, 0)
            grads.append(expanded_g)
        grad = tf.concat(grads, 0)
        grad = tf.reduce_mean(grad, 0)

        v = grad_and_vars[0][1]
        grad_and_var = (grad, v)
        # Pair the variable with its averaged gradient.
        average_grads.append(grad_and_var)

    # Return the averaged gradients; they will be used to update the variables.
    return average_grads

# Main training procedure.
def main(argv=None):
    # Keep the simple operations on the CPU; only the training of the network runs
    # on the GPUs.
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Get a training batch.
        x, y_ = get_input()
        regularizer = tf.contrib.layers.l2_regularizer(REGULARIZATION_RATE)

        # Training-step counter and exponentially decayed learning rate.
        global_step = tf.get_variable(
            'global_step', [], initializer=tf.constant_initializer(0), trainable=False)
        learning_rate = tf.train.exponential_decay(
            LEARNING_RATE_BASE, global_step, 60000 / BATCH_SIZE, LEARNING_RATE_DECAY)

        # Optimizer.
        opt = tf.train.GradientDescentOptimizer(learning_rate)

        tower_grads = []
        reuse_variables = False
        # Run the optimization on the different GPUs.
        for i in range(N_GPU):
            # Pin this tower's computation to one GPU.
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('GPU_%d' % i) as scope:
                    cur_loss = get_loss(x, y_, regularizer, scope, reuse_variables)
                    # After the variables have been created once, set the reuse flag to
                    # True so that all GPUs update the same set of parameters. Note that
                    # tf.name_scope does not affect the namespace used by tf.get_variable.
                    reuse_variables = True
                    # Compute the gradients of all variables on the current GPU.
                    grads = opt.compute_gradients(cur_loss)
                    tower_grads.append(grads)

        # Average the gradients over the GPUs and log them to TensorBoard.
        grads = average_gradients(tower_grads)
        for grad, var in grads:
            if grad is not None:
                tf.summary.histogram('gradients_on_average/%s' % var.op.name, grad)

        # Apply the averaged gradients to update the parameters.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
        for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)

        # Maintain moving averages of the variables.
        variable_averages = tf.train.ExponentialMovingAverage(
            MOVING_AVERAGE_DECAY, global_step)
        variables_to_average = (
            tf.trainable_variables() + tf.moving_average_variables())
        variable_averages_op = variable_averages.apply(variables_to_average)

        # Every training step updates the variables and refreshes their moving averages.
        train_op = tf.group(apply_gradient_op, variable_averages_op)

        saver = tf.train.Saver(tf.global_variables())
        summary_op = tf.summary.merge_all()
        init = tf.global_variables_initializer()

        # Training loop.
        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True, log_device_placement=True)) as sess:
            # Initialize all variables and start the input queues.
            init.run()
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            summary_writer = tf.summary.FileWriter(MODEL_SAVE_PATH, sess.graph)

            for step in range(TRAINING_STEPS):
                # Run one training step and time it.
                start_time = time.time()
                _, loss_value = sess.run([train_op, cur_loss])
                duration = time.time() - start_time

                # Periodically report progress and throughput.
                if step != 0 and step % 10 == 0 and duration != 0:
                    # Number of training examples consumed in this step: every GPU
                    # processes one batch, so it is batch size * number of GPUs.
                    num_examples_per_step = BATCH_SIZE * N_GPU

                    # duration is the wall time of this step, so the throughput is
                    # num_examples_per_step / duration examples per second.
                    examples_per_sec = num_examples_per_step / duration

                    # Each GPU processes one batch during the step, so the time spent
                    # on a single batch is duration / number of GPUs.
                    sec_per_batch = duration / N_GPU

                    # Print the training statistics.
                    format_str = ('step %d, loss = %.2f '
                                  '(%.1f examples/sec; %.3f sec/batch)')
                    print(format_str % (step, loss_value, examples_per_sec, sec_per_batch))

                    # Write the TensorBoard summaries.
                    summary = sess.run(summary_op)
                    summary_writer.add_summary(summary, step)

                # Periodically save the current model.
                if step % 1000 == 0 or (step + 1) == TRAINING_STEPS:
                    checkpoint_path = os.path.join(MODEL_SAVE_PATH, MODEL_NAME)
                    saver.save(sess, checkpoint_path, global_step=step)

            coord.request_stop()
            coord.join(threads)

if __name__ == '__main__':
    tf.app.run()
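The script imports mnist_inference, which is not included in this post. The sketch below is a minimal stand-in for that module, assuming the two-layer fully connected MNIST network from the book's earlier chapters; the layer sizes and variable names are illustrative assumptions, not the author's exact file. The key point is that each weight's regularization term is added to the 'losses' collection, which get_loss() later sums per GPU name scope.

# mnist_inference.py -- illustrative sketch; the original module is defined elsewhere
# in the book. Layer sizes and variable names here are assumptions.
import tensorflow as tf

INPUT_NODE = 784
OUTPUT_NODE = 10
LAYER1_NODE = 500

def get_weight_variable(shape, regularizer):
    weights = tf.get_variable(
        "weights", shape,
        initializer=tf.truncated_normal_initializer(stddev=0.1))
    # Add this weight's regularization term to the 'losses' collection so that
    # get_loss() can collect it under the current GPU's name scope.
    if regularizer is not None:
        tf.add_to_collection('losses', regularizer(weights))
    return weights

def inference(input_tensor, regularizer):
    # Two fully connected layers: 784 -> 500 -> 10.
    with tf.variable_scope('layer1'):
        weights = get_weight_variable([INPUT_NODE, LAYER1_NODE], regularizer)
        biases = tf.get_variable(
            "biases", [LAYER1_NODE], initializer=tf.constant_initializer(0.0))
        layer1 = tf.nn.relu(tf.matmul(input_tensor, weights) + biases)
    with tf.variable_scope('layer2'):
        weights = get_weight_variable([LAYER1_NODE, OUTPUT_NODE], regularizer)
        biases = tf.get_variable(
            "biases", [OUTPUT_NODE], initializer=tf.constant_initializer(0.0))
        layer2 = tf.matmul(layer1, weights) + biases
    return layer2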
Below is the output after training:
step 20, loss = 29.53 (10362.6 examples/sec; 0.010 sec/batch)
step 30, loss = 9.62 (12022.4 examples/sec; 0.008 sec/batch)
step 40, loss = 16.63 (10689.3 examples/sec; 0.009 sec/batch)
step 50, loss = 10.68 (11293.4 examples/sec; 0.009 sec/batch)
step 60, loss = 14.73 (10895.0 examples/sec; 0.009 sec/batch)
step 70, loss = 17.17 (11192.9 examples/sec; 0.009 sec/batch)
step 80, loss = 12.43 (11236.8 examples/sec; 0.009 sec/batch)
step 90, loss = 5.16 (11398.3 examples/sec; 0.009 sec/batch)
step 100, loss = 8.06 (12466.7 examples/sec; 0.008 sec/batch)
step 110, loss = 13.57 (11081.5 examples/sec; 0.009 sec/batch)
step 120, loss = 9.43 (11396.2 examples/sec; 0.009 sec/batch)
step 130, loss = 12.21 (13296.7 examples/sec; 0.008 sec/batch)
step 140, loss = 6.15 (11868.9 examples/sec; 0.008 sec/batch)
step 150, loss = 9.93 (12089.1 examples/sec; 0.008 sec/batch)
step 160, loss = 10.42 (11733.5 examples/sec; 0.009 sec/batch)
step 170, loss = 23.47 (11859.4 examples/sec; 0.008 sec/batch)
step 180, loss = 2.97 (11358.0 examples/sec; 0.009 sec/batch)
step 190, loss = 5.44 (11085.0 examples/sec; 0.009 sec/batch)
step 200, loss = 3.98 (13347.3 examples/sec; 0.007 sec/batch)
step 210, loss = 11.98 (10551.4 examples/sec; 0.009 sec/batch)
step 220, loss = 9.17 (11115.3 examples/sec; 0.009 sec/batch)
step 230, loss = 15.31 (12450.5 examples/sec; 0.008 sec/batch)
step 240, loss = 5.92 (11729.5 examples/sec; 0.009 sec/batch)
step 250, loss = 9.94 (10497.2 examples/sec; 0.010 sec/batch)
step 260, loss = 2.94 (11398.1 examples/sec; 0.009 sec/batch)
step 270, loss = 7.30 (10497.4 examples/sec; 0.010 sec/batch)
step 280, loss = 3.98 (11946.0 examples/sec; 0.008 sec/batch)
step 290, loss = 7.66 (11307.2 examples/sec; 0.009 sec/batch)
step 300, loss = 2.03 (11968.7 examples/sec; 0.008 sec/batch)
step 310, loss = 2.39 (8672.0 examples/sec; 0.012 sec/batch)
step 320, loss = 2.07 (3835.6 examples/sec; 0.026 sec/batch)
step 330, loss = 2.71 (12087.7 examples/sec; 0.008 sec/batch)
step 340, loss = 2.70 (11907.3 examples/sec; 0.008 sec/batch)
step 350, loss = 7.17 (7671.2 examples/sec; 0.013 sec/batch)
step 360, loss = 8.36 (11863.6 examples/sec; 0.008 sec/batch)
step 370, loss = 2.48 (11782.7 examples/sec; 0.008 sec/batch)
step 380, loss = 2.27 (11081.5 examples/sec; 0.009 sec/batch)
step 390, loss = 2.85 (11562.4 examples/sec; 0.009 sec/batch)
step 400, loss = 2.99 (12088.9 examples/sec; 0.008 sec/batch)
step 410, loss = 5.08 (12465.6 examples/sec; 0.008 sec/batch)
step 420, loss = 2.12 (12869.1 examples/sec; 0.008 sec/batch)
step 430, loss = 2.83 (13756.3 examples/sec; 0.007 sec/batch)
step 440, loss = 7.56 (13297.8 examples/sec; 0.008 sec/batch)
step 450, loss = 3.51 (12634.6 examples/sec; 0.008 sec/batch)
step 460, loss = 2.23 (13297.8 examples/sec; 0.008 sec/batch)
step 470, loss = 1.80 (12869.2 examples/sec; 0.008 sec/batch)
step 480, loss = 5.92 (9730.3 examples/sec; 0.010 sec/batch)
step 490, loss = 4.01 (12647.0 examples/sec; 0.008 sec/batch)
step 500, loss = 2.29 (12466.9 examples/sec; 0.008 sec/batch)
step 510, loss = 2.20 (13078.4 examples/sec; 0.008 sec/batch)
step 520, loss = 3.70 (13296.5 examples/sec; 0.008 sec/batch)
step 530, loss = 2.11 (13298.3 examples/sec; 0.008 sec/batch)
step 540, loss = 1.73 (13296.6 examples/sec; 0.008 sec/batch)
step 550, loss = 1.20 (12868.9 examples/sec; 0.008 sec/batch)
step 560, loss = 3.44 (13078.6 examples/sec; 0.008 sec/batch)
step 570, loss = 1.35 (11562.0 examples/sec; 0.009 sec/batch)
step 580, loss = 3.51 (13205.2 examples/sec; 0.008 sec/batch)
step 590, loss = 3.11 (12868.8 examples/sec; 0.008 sec/batch)
step 600, loss = 3.40 (12869.1 examples/sec; 0.008 sec/batch)
step 610, loss = 2.49 (13297.7 examples/sec; 0.008 sec/batch)
step 620, loss = 2.68 (12620.3 examples/sec; 0.008 sec/batch)
step 630, loss = 2.09 (11907.3 examples/sec; 0.008 sec/batch)
step 640, loss = 3.82 (8487.3 examples/sec; 0.012 sec/batch)
step 650, loss = 2.77 (11081.5 examples/sec; 0.009 sec/batch)
step 660, loss = 2.55 (12089.1 examples/sec; 0.008 sec/batch)
step 670, loss = 2.53 (10228.3 examples/sec; 0.010 sec/batch)
step 680, loss = 5.17 (9498.5 examples/sec; 0.011 sec/batch)
step 690, loss = 2.02 (10498.4 examples/sec; 0.010 sec/batch)
step 700, loss = 0.21 (12088.9 examples/sec; 0.008 sec/batch)
step 710, loss = 1.95 (12868.7 examples/sec; 0.008 sec/batch)
step 720, loss = 3.90 (13296.2 examples/sec; 0.008 sec/batch)
step 730, loss = 2.17 (9277.6 examples/sec; 0.011 sec/batch)
step 740, loss = 1.09 (9730.1 examples/sec; 0.010 sec/batch)
step 750, loss = 1.33 (12466.8 examples/sec; 0.008 sec/batch)
step 760, loss = 3.17 (9797.9 examples/sec; 0.010 sec/batch)
step 770, loss = 3.20 (13297.9 examples/sec; 0.008 sec/batch)
step 780, loss = 4.28 (13756.4 examples/sec; 0.007 sec/batch)
step 790, loss = 1.23 (12465.4 examples/sec; 0.008 sec/batch)
step 800, loss = 1.78 (12868.8 examples/sec; 0.008 sec/batch)
step 810, loss = 1.12 (12924.2 examples/sec; 0.008 sec/batch)
step 820, loss = 2.09 (13297.1 examples/sec; 0.008 sec/batch)
step 830, loss = 0.71 (11967.1 examples/sec; 0.008 sec/batch)
step 840, loss = 3.03 (12088.8 examples/sec; 0.008 sec/batch)
step 850, loss = 2.76 (12868.8 examples/sec; 0.008 sec/batch)
step 860, loss = 1.64 (12087.1 examples/sec; 0.008 sec/batch)
step 870, loss = 2.43 (9066.8 examples/sec; 0.011 sec/batch)
step 880, loss = 1.73 (11398.2 examples/sec; 0.009 sec/batch)
step 890, loss = 0.61 (12980.4 examples/sec; 0.008 sec/batch)
step 900, loss = 3.44 (12868.8 examples/sec; 0.008 sec/batch)
step 910, loss = 0.96 (11445.9 examples/sec; 0.009 sec/batch)
step 920, loss = 2.95 (13756.3 examples/sec; 0.007 sec/batch)
step 930, loss = 2.99 (12868.5 examples/sec; 0.008 sec/batch)
step 940, loss = 0.34 (13752.5 examples/sec; 0.007 sec/batch)
step 950, loss = 1.05 (13297.8 examples/sec; 0.008 sec/batch)
step 960, loss = 2.34 (13295.7 examples/sec; 0.008 sec/batch)
step 970, loss = 1.32 (13297.6 examples/sec; 0.008 sec/batch)
step 980, loss = 2.46 (12466.6 examples/sec; 0.008 sec/batch)
step 990, loss = 1.02 (13297.7 examples/sec; 0.008 sec/batch)
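Because the checkpoints saved under MODEL_SAVE_PATH were written while exponential moving averages were being maintained, evaluation code would normally restore the shadow (averaged) values rather than the raw variables. A minimal sketch of such a restore is given below, assuming the same mnist_inference module and paths as above; the placeholder name and the choice to pass no regularizer at evaluation time are assumptions, not part of the original post.

# eval_sketch.py -- hypothetical evaluation snippet, not part of the original post.
import tensorflow as tf
import mnist_inference

MOVING_AVERAGE_DECAY = 0.99
MODEL_SAVE_PATH = "/path/to/logs_and_models/"

with tf.Graph().as_default():
    x = tf.placeholder(tf.float32, [None, 784], name='x-input')
    y = mnist_inference.inference(x, None)  # no regularizer needed at evaluation time

    # Map each trainable variable to its shadow (moving-average) variable so that
    # the smoothed parameters are restored instead of the raw ones.
    variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY)
    saver = tf.train.Saver(variable_averages.variables_to_restore())

    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(MODEL_SAVE_PATH)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            # y now produces logits computed with the averaged parameters.

variables_to_restore() builds the name mapping from each variable to its shadow copy, so the Saver transparently loads the averaged parameters that the training script maintained with variable_averages_op.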