keras/examples/mnist_acgan.py: ACGAN Code Walkthrough
阿新 • Published 2018-11-22
# -*- coding: utf-8 -*-
"""
Train an Auxiliary Classifier Generative Adversarial Network (ACGAN) on the
MNIST dataset. See https://arxiv.org/abs/1610.09585 for more details.

You should start to see reasonable images after ~5 epochs, and good images
by ~15 epochs. You should use a GPU, as the convolution-heavy operations are
very slow on the CPU. Prefer the TensorFlow backend if you plan on iterating,
as the compilation time can be a blocker using Theano.

Timings:

Hardware           | Backend | Time / Epoch
-------------------------------------------
 CPU               | TF      | 3 hrs
 Titan X (maxwell) | TF      | 4 min
 Titan X (maxwell) | TH      | 7 min

Consult https://github.com/lukedeo/keras-acgan for more information and
example output
"""
from __future__ import print_function

from collections import defaultdict
try:
    import cPickle as pickle
except ImportError:
    import pickle
from PIL import Image

from six.moves import range

from keras.datasets import mnist
from keras import layers
from keras.layers import Input, Dense, Reshape, Flatten, Embedding, Dropout
from keras.layers import BatchNormalization
from keras.layers.advanced_activations import LeakyReLU
from keras.layers.convolutional import Conv2DTranspose, Conv2D
from keras.models import Sequential, Model
from keras.optimizers import Adam
from keras.utils.generic_utils import Progbar

import numpy as np

np.random.seed(1337)
num_classes = 10


def build_generator(latent_size):
    # we will map a pair of (z, L), where z is a latent vector and L is a
    # label drawn from P_c, to image space (..., 28, 28, 1)
    cnn = Sequential()
    # Sequential() creates a linear stack of layers to add to below,
    # effectively initializing the conv net.

    # Dense builds a fully connected layer; only the first layer of the
    # stack needs the input size (input_dim), later layers infer it.
    cnn.add(Dense(3 * 3 * 384, input_dim=latent_size, activation='relu'))
    cnn.add(Reshape((3, 3, 384)))

    # upsample to (7, 7, ...)
    cnn.add(Conv2DTranspose(192, 5, strides=1, padding='valid',
                            activation='relu',
                            kernel_initializer='glorot_normal'))
    cnn.add(BatchNormalization())

    # upsample to (14, 14, ...)
    cnn.add(Conv2DTranspose(96, 5, strides=2, padding='same',
                            activation='relu',
                            kernel_initializer='glorot_normal'))
    cnn.add(BatchNormalization())

    # upsample to (28, 28, ...)
    cnn.add(Conv2DTranspose(1, 5, strides=2, padding='same',
                            activation='tanh',
                            kernel_initializer='glorot_normal'))

    # this is the z space commonly referred to in GAN papers
    latent = Input(shape=(latent_size, ))
    # The prior z space: a 1-D noise (feature) vector of length latent_size.
    # Nothing is initialized here (we are only building the graph); the
    # actual vector is randomly sampled at training time.

    # This is a conditional GAN, so besides the noise vector we also need a
    # label input: a 1-D integer input of length 1.
    image_class = Input(shape=(1, ), dtype='int32')

    cls = Flatten()(Embedding(num_classes, latent_size,
                              embeddings_initializer='glorot_normal')(image_class))
    # For background on embedding layers, see:
    # http://frankchen.xyz/2017/12/18/How-to-Use-Word-Embedding-Layers-for-Deep-Learning-with-Keras/
    # In Embedding()(), the first pair of parentheses takes the layer's
    # arguments and the second takes its input. The input sequence has
    # length 1, and num_classes=10 means the largest valid value is 9: the
    # inputs are the ten integers 0-9, i.e. the image_class label created
    # above (a single integer, not one-hot). latent_size is the
    # dimensionality of the embedding, and the layer's output shape is
    # (input_length, output_dim): input_length is 1 here (set by
    # image_class) and output_dim=latent_size, so the result is a vector of
    # length latent_size.
    # The embedding weights are initialized with glorot_normal (a.k.a.
    # xavier_normal): a normal distribution whose scale is chosen
    # automatically from the number of input and output units, which keeps
    # the input and output distributions roughly similar. Other
    # initializers are listed at:
    # http://keras-cn.readthedocs.io/en/latest/other/initializations/
    # Flatten()() likewise takes arguments in the first parentheses and
    # input in the second; it squashes multi-dimensional input to 1-D, here
    # turning the (1, latent_size) embedding output into a flat
    # (latent_size,) vector.
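    # Below is a minimal standalone sketch (not part of this script; the
    # demo_* names are illustrative only) of what the Embedding + Flatten
    # pair above does to a single integer label:
    #
    #   from keras.layers import Input, Embedding, Flatten
    #   from keras.models import Model
    #   import numpy as np
    #
    #   demo_label = Input(shape=(1,), dtype='int32')
    #   demo_vec = Flatten()(Embedding(10, 100)(demo_label))
    #   demo_model = Model(demo_label, demo_vec)
    #   # the label 7 selects row 7 of a 10 x 100 lookup table:
    #   print(demo_model.predict(np.array([[7]])).shape)  # -> (1, 100)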
    # So, taken together, this line acts like a reversed fully connected
    # layer (analogous to the transposed convolutions above): it produces
    # what would be the output of the dense layer just before a normal conv
    # net's output layer, with shape (latent_size,).

    h = layers.multiply([latent, cls])
    # Element-wise product of the (latent_size,) feature vector and the
    # (latent_size,) label embedding. The result still has shape
    # (latent_size,) but now carries the label information, i.e. the
    # condition.

    fake_image = cnn(h)
    # h is the generator's actual input; the deconvolutional stack built
    # above turns it into a (fake) image.

    return Model([latent, image_class], fake_image)
    # A functional-API model: inputs [latent, image_class], output
    # fake_image.


def build_discriminator():
    # build a relatively standard conv net, with LeakyReLUs as suggested in
    # the reference paper
    # The discriminator is a plain forward conv net. It outputs both a
    # real/fake score for the input image (generated or real) and the
    # image's class (the auxiliary classifier; a vanilla GAN only needs
    # real/fake).
    cnn = Sequential()

    cnn.add(Conv2D(32, 3, padding='same', strides=2,
                   input_shape=(28, 28, 1)))
    cnn.add(LeakyReLU(0.2))
    cnn.add(Dropout(0.3))  # dropout is used here instead of pooling layers

    cnn.add(Conv2D(64, 3, padding='same', strides=1))
    cnn.add(LeakyReLU(0.2))
    cnn.add(Dropout(0.3))

    cnn.add(Conv2D(128, 3, padding='same', strides=2))
    cnn.add(LeakyReLU(0.2))
    cnn.add(Dropout(0.3))

    cnn.add(Conv2D(256, 3, padding='same', strides=1))
    cnn.add(LeakyReLU(0.2))
    cnn.add(Dropout(0.3))

    cnn.add(Flatten())

    image = Input(shape=(28, 28, 1))

    features = cnn(image)

    fake = Dense(1, activation='sigmoid', name='generation')(features)
    # 0 < fake < 1: the real/fake score.
    aux = Dense(num_classes, activation='softmax', name='auxiliary')(features)
    # aux: a length-10 probability distribution (it sums to 1), used to
    # classify the image.

    return Model(image, [fake, aux])
    # The model's input is image; its outputs are [fake, aux].


if __name__ == '__main__':

    # batch and latent size taken from the paper
    epochs = 100
    batch_size = 100
    latent_size = 100  # the latent (feature) vector has length 100

    # Adam parameters suggested in https://arxiv.org/abs/1511.06434
    # learning rate and hyper-parameter for the optimizer
    adam_lr = 0.0002
    adam_beta_1 = 0.5

    # build the discriminator
    print('Discriminator model:')
    discriminator = build_discriminator()  # construct the discriminator net
    # A Keras model gets its optimizer and loss functions via
    # Model.compile(). Here the binary real/fake output uses the
    # binary_crossentropy loss, and the 10-way classification of the digit
    # an image represents uses the sparse_categorical_crossentropy loss.
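    # Aside, a standalone sketch (not part of this script): with a
    # two-output model, Keras pairs the losses in the list with the outputs
    # by position. sparse_categorical_crossentropy takes integer class
    # labels directly, so y_train needs no one-hot encoding; with plain
    # categorical_crossentropy the labels would first have to be expanded:
    #
    #   import numpy as np
    #   from keras.utils import to_categorical
    #
    #   labels = np.array([3, 7])             # fine for sparse_categorical_crossentropy
    #   one_hot = to_categorical(labels, 10)  # needed for categorical_crossentropy
    #   # one_hot[0] -> [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]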
    discriminator.compile(
        optimizer=Adam(lr=adam_lr, beta_1=adam_beta_1),
        loss=['binary_crossentropy', 'sparse_categorical_crossentropy']
    )
    discriminator.summary()  # print a summary of the network

    # build the generator
    generator = build_generator(latent_size)

    latent = Input(shape=(latent_size, ))
    image_class = Input(shape=(1,), dtype='int32')

    # get a fake image
    fake = generator([latent, image_class])
    # All of this only builds the graph; nothing is computed yet.

    # we only want to be able to train generation for the combined model
    # Freeze the discriminator while updating the generator, so the
    # generator learns to produce images that fool this fixed discriminator.
    discriminator.trainable = False
    fake, aux = discriminator(fake)
    # The discriminator's real/fake score for the fake image (near 0 or 1)
    # and its class prediction.
    combined = Model([latent, image_class], [fake, aux])
    # Inputs are the [latent, image_class] created above (the image's
    # feature vector and label); outputs are the discriminator's
    # [fake, aux] on the generated image.
    print('Combined model:')
    combined.compile(
        optimizer=Adam(lr=adam_lr, beta_1=adam_beta_1),
        loss=['binary_crossentropy', 'sparse_categorical_crossentropy']
    )
    combined.summary()

    # get our mnist data, and force it to be of shape (..., 28, 28, 1) with
    # range [-1, 1]
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train = (x_train.astype(np.float32) - 127.5) / 127.5  # normalize pixels to [-1, 1]
    x_train = np.expand_dims(x_train, axis=-1)  # reshape MNIST to (..., 28, 28, 1)

    x_test = (x_test.astype(np.float32) - 127.5) / 127.5
    x_test = np.expand_dims(x_test, axis=-1)

    num_train, num_test = x_train.shape[0], x_test.shape[0]  # train / test sample counts

    train_history = defaultdict(list)
    test_history = defaultdict(list)

    for epoch in range(1, epochs + 1):
        print('Epoch {}/{}'.format(epoch, epochs))

        num_batches = int(x_train.shape[0] / batch_size)
        progress_bar = Progbar(target=num_batches)

        # we don't want the discriminator to also maximize the classification
        # accuracy of the auxiliary classifier on generated images, so we
        # don't train discriminator to produce class labels for generated
        # images (see https://openreview.net/forum?id=rJXTf9Bxg).
        # To preserve sum of sample weights for the auxiliary classifier,
        # we assign sample weight of 2 to the real images.
        disc_sample_weight = [np.ones(2 * batch_size),
                              np.concatenate((np.ones(batch_size) * 2,
                                              np.zeros(batch_size)))]

        epoch_gen_loss = []
        epoch_disc_loss = []

        for index in range(num_batches):
            # generate a new batch of noise
            noise = np.random.uniform(-1, 1, (batch_size, latent_size))
            # Noise sampled uniformly from [-1, 1), the same range the MNIST
            # pixels were scaled to; shape (batch_size, latent_size),
            # matching the feature vector in front of the generator's dense
            # layer.

            # get a batch of real images
            image_batch = x_train[index * batch_size:(index + 1) * batch_size]
            label_batch = y_train[index * batch_size:(index + 1) * batch_size]

            # sample some labels from p_c
            sampled_labels = np.random.randint(0, num_classes, batch_size)
            # Random labels for the noise: batch_size integers in [0, 10).

            # generate a batch of fake images, using the generated labels as a
            # conditioner. We reshape the sampled labels to be
            # (batch_size, 1) so that we can feed them into the embedding
            # layer as a length one sequence
            # The conditional generator produces the fake images.
            generated_images = generator.predict(
                [noise, sampled_labels.reshape((-1, 1))], verbose=0)

            x = np.concatenate((image_batch, generated_images))
            # Real and fake images stacked side by side in x: real images in
            # the first half, fake in the second.

            # use one-sided soft real/fake labels
            # Salimans et al., 2016
            # https://arxiv.org/pdf/1606.03498.pdf (Section 3.4)
            soft_zero, soft_one = 0, 0.95
            y = np.array([soft_one] * batch_size + [soft_zero] * batch_size)
            # Real/fake targets: the real first half all get 0.95 (close to
            # 1), the fake second half all get 0 (no slack for fakes).

            aux_y = np.concatenate((label_batch, sampled_labels), axis=0)
            # The corresponding class labels.

            # see if the discriminator can figure itself out...
            # sample_weight sets a per-sample loss weight. The discriminator
            # only has to tell real from fake; classifying the digit is just
            # an auxiliary task.
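            # A toy illustration (standalone; the numbers are hypothetical)
            # of what a per-sample weight of 0 does to a loss term:
            #
            #   import numpy as np
            #   per_sample_loss = np.array([0.8, 1.2, 0.5, 0.9])  # e.g. aux loss per image
            #   weights = np.array([2., 2., 0., 0.])              # real, real, fake, fake
            #   print(np.mean(per_sample_loss * weights))         # fakes contribute nothing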
            # So: y holds the real/fake targets for both half-batches, with
            # every loss weight set to 1; aux_y holds the class labels for
            # both half-batches, with loss weight 2 on the real samples and
            # 0 on the fake ones, because there is no point using the
            # auxiliary classification of fake images to update the
            # discriminator; improving its auxiliary classifier should rely
            # on real images only.
            epoch_disc_loss.append(discriminator.train_on_batch(
                x, [y, aux_y], sample_weight=disc_sample_weight))

            # make new noise. we generate 2 * batch size here such that we have
            # the generator optimize over an identical number of images as the
            # discriminator
            noise = np.random.uniform(-1, 1, (2 * batch_size, latent_size))
            sampled_labels = np.random.randint(0, num_classes, 2 * batch_size)

            # we want to train the generator to trick the discriminator
            # For the generator, we want all the {fake, not-fake} labels to say
            # not-fake
            trick = np.ones(2 * batch_size) * soft_one
            # sampled_labels appear on the input side as the condition; on
            # the output side every sampled label is treated as correct, so
            # the targets are trick and sampled_labels itself.

            epoch_gen_loss.append(combined.train_on_batch(
                [noise, sampled_labels.reshape((-1, 1))],
                [trick, sampled_labels]))

            progress_bar.update(index + 1)  # advance the per-batch progress bar

        print('Testing for epoch {}:'.format(epoch))

        # evaluate the testing loss here

        # generate a new batch of noise
        noise = np.random.uniform(-1, 1, (num_test, latent_size))

        # sample some labels from p_c and generate images from them
        sampled_labels = np.random.randint(0, num_classes, num_test)
        generated_images = generator.predict(
            [noise, sampled_labels.reshape((-1, 1))], verbose=False)

        x = np.concatenate((x_test, generated_images))
        y = np.array([1] * num_test + [0] * num_test)
        aux_y = np.concatenate((y_test, sampled_labels), axis=0)

        # see if the discriminator can figure itself out...
        discriminator_test_loss = discriminator.evaluate(
            x, [y, aux_y], verbose=False)

        discriminator_train_loss = np.mean(np.array(epoch_disc_loss), axis=0)

        # make new noise
        noise = np.random.uniform(-1, 1, (2 * num_test, latent_size))
        sampled_labels = np.random.randint(0, num_classes, 2 * num_test)
        trick = np.ones(2 * num_test)

        generator_test_loss = combined.evaluate(
            [noise, sampled_labels.reshape((-1, 1))],
            [trick, sampled_labels], verbose=False)

        generator_train_loss = np.mean(np.array(epoch_gen_loss), axis=0)

        # generate an epoch report on performance
        train_history['generator'].append(generator_train_loss)
        train_history['discriminator'].append(discriminator_train_loss)

        test_history['generator'].append(generator_test_loss)
        test_history['discriminator'].append(discriminator_test_loss)

        print('{0:<22s} | {1:4s} | {2:15s} | {3:5s}'.format(
            'component', *discriminator.metrics_names))
        print('-' * 65)

        ROW_FMT = '{0:<22s} | {1:<4.2f} | {2:<15.4f} | {3:<5.4f}'
        print(ROW_FMT.format('generator (train)',
                             *train_history['generator'][-1]))
        print(ROW_FMT.format('generator (test)',
                             *test_history['generator'][-1]))
        print(ROW_FMT.format('discriminator (train)',
                             *train_history['discriminator'][-1]))
        print(ROW_FMT.format('discriminator (test)',
                             *test_history['discriminator'][-1]))

        # save weights every epoch
        generator.save_weights(
            'params_generator_epoch_{0:03d}.hdf5'.format(epoch), True)
        discriminator.save_weights(
            'params_discriminator_epoch_{0:03d}.hdf5'.format(epoch), True)

        # generate some digits to display
        num_rows = 40
        noise = np.tile(np.random.uniform(-1, 1, (num_rows, latent_size)),
                        (num_classes, 1))

        sampled_labels = np.array([
            [i] * num_rows for i in range(num_classes)
        ]).reshape(-1, 1)

        # get a batch to display
        generated_images = generator.predict(
            [noise, sampled_labels], verbose=0)

        # prepare real images sorted by class label
        real_labels = y_train[(epoch - 1) * num_rows * num_classes:
                              epoch * num_rows * num_classes]
        indices = np.argsort(real_labels, axis=0)
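        # A tiny standalone example (hypothetical values) of the argsort +
        # fancy-indexing pattern used on the next line:
        #
        #   import numpy as np
        #   labels = np.array([2, 0, 1])
        #   order = np.argsort(labels)             # -> [1, 2, 0]
        #   images = np.array(['two', 'zero', 'one'])
        #   print(images[order])                   # -> ['zero' 'one' 'two']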
        real_images = x_train[(epoch - 1) * num_rows * num_classes:
                              epoch * num_rows * num_classes][indices]

        # display generated images, white separator, real images
        img = np.concatenate(
            (generated_images,
             np.repeat(np.ones_like(x_train[:1]), num_rows, axis=0),
             real_images))

        # arrange them into a grid
        img = (np.concatenate([r.reshape(-1, 28)
                               for r in np.split(img, 2 * num_classes + 1)
                               ], axis=-1) * 127.5 + 127.5).astype(np.uint8)

        Image.fromarray(img).save(
            'plot_epoch_{0:03d}_generated.png'.format(epoch))

    with open('acgan-history.pkl', 'wb') as f:
        pickle.dump({'train': train_history, 'test': test_history}, f)
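# After training, the pickled history can be read back to inspect the loss
# curves. A minimal sketch (not part of the original example; it assumes
# matplotlib is installed, which this script does not otherwise need):
#
#   import pickle
#   import numpy as np
#   import matplotlib.pyplot as plt
#
#   with open('acgan-history.pkl', 'rb') as f:
#       hist = pickle.load(f)
#   # each history entry is one loss array per epoch; index 0 is the total loss
#   gen_loss = [np.asarray(l)[0] for l in hist['train']['generator']]
#   plt.plot(gen_loss)
#   plt.xlabel('epoch')
#   plt.ylabel('generator train loss')
#   plt.savefig('acgan_generator_loss.png')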