《深度學習框架PyTorch入門與實踐》示例——AI藝術家:神經網路風格遷移
阿新 • • 發佈:2020-12-22
這是我在學習《深度學習框架PyTorch入門與實踐》第九章時的筆記。原書實現了Fast Neural Style,可以將輸入圖片轉換為指定風格圖片的風格。
該部落格有書中的全部內容,並且有全部程式碼和資料集的百度網盤連結。程式碼在PyCharm下測試可執行。
部分程式碼和註釋如下:
transformer_net.py
import torch
import torch.nn as nn
import numpy as np


class TransformerNet(nn.Module):
    """Image transformation network used for fast neural style transfer.

    The network has three stages: a convolutional downsampling stack,
    five residual blocks, and an upsampling stack that maps back to a
    3-channel image. For inputs whose height and width are divisible by 4
    the output has the same shape as the input.
    """

    def __init__(self):
        super(TransformerNet, self).__init__()

        # Downsampling convolution layers.
        # InstanceNorm2d differs from BatchNorm2d in that statistics are
        # computed per sample and per channel instead of across the batch.
        self.initial_layers = nn.Sequential(
            ConvLayer(3, 32, kernel_size=9, stride=1),
            nn.InstanceNorm2d(32, affine=True),
            # This activation had been swallowed by the preceding comment
            # when the source was collapsed onto one line; every other
            # normalisation in this stack is followed by a ReLU.
            nn.ReLU(True),
            ConvLayer(32, 64, kernel_size=3, stride=2),
            nn.InstanceNorm2d(64, affine=True),
            nn.ReLU(True),
            ConvLayer(64, 128, kernel_size=3, stride=2),
            nn.InstanceNorm2d(128, affine=True),
            nn.ReLU(True),
        )

        # Residual layers.
        self.res_layers = nn.Sequential(
            ResidualBlock(128),
            ResidualBlock(128),
            ResidualBlock(128),
            ResidualBlock(128),
            ResidualBlock(128),
        )

        # Upsampling layers (nearest-neighbour resize followed by a conv).
        self.upsample_layer = nn.Sequential(
            UpsampleConvLayer(128, 64, kernel_size=3, stride=1, upsample=2),
            nn.InstanceNorm2d(64, affine=True),
            nn.ReLU(True),
            UpsampleConvLayer(64, 32, kernel_size=3, stride=1, upsample=2),
            nn.InstanceNorm2d(32, affine=True),
            nn.ReLU(True),
            ConvLayer(32, 3, kernel_size=9, stride=1),
        )

    def forward(self, x):
        """Run ``x`` through the downsampling, residual and upsampling stages."""
        x = self.initial_layers(x)
        x = self.res_layers(x)
        x = self.upsample_layer(x)
        return x


class ConvLayer(nn.Module):
    """Convolution preceded by reflection padding.

    ``nn.Conv2d`` pads with zeros by default; here the border pixels are
    mirrored instead, using a pad width of ``floor(kernel_size / 2)``.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride):
        super(ConvLayer, self).__init__()
        reflection_padding = int(np.floor(kernel_size / 2))
        # ReflectionPad2d pads all four sides by reflecting edge pixels.
        self.reflection_pad = nn.ReflectionPad2d(reflection_padding)
        self.conv2d = nn.Conv2d(in_channels, out_channels, kernel_size, stride)

    def forward(self, x):
        """Reflect-pad ``x`` and apply the convolution."""
        out = self.reflection_pad(x)
        out = self.conv2d(out)
        return out


class ResidualBlock(nn.Module):
    """Two conv + instance-norm layers with an identity skip connection."""

    def __init__(self, channels):
        super(ResidualBlock, self).__init__()
        self.res_block = nn.Sequential(
            ConvLayer(channels, channels, kernel_size=3, stride=1),
            nn.InstanceNorm2d(channels, affine=True),
            nn.ReLU(True),
            ConvLayer(channels, channels, kernel_size=3, stride=1),
            nn.InstanceNorm2d(channels, affine=True),
        )

    def forward(self, x):
        """Return ``res_block(x) + x``."""
        residual = x
        out = self.res_block(x)
        out = out + residual
        return out


class UpsampleConvLayer(nn.Module):
    """Optional nearest-neighbour upsampling followed by a padded conv.

    When ``upsample`` is truthy the input is first resized by that scale
    factor; the result is then reflection-padded and convolved.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride,
                 upsample=None):
        super(UpsampleConvLayer, self).__init__()
        self.upsample = upsample
        reflection_padding = kernel_size // 2
        self.reflection_pad = nn.ReflectionPad2d(reflection_padding)
        self.conv2d = nn.Conv2d(in_channels, out_channels, kernel_size, stride)

    def forward(self, x):
        """Upsample (if configured), reflect-pad and convolve ``x``."""
        x_in = x
        if self.upsample:
            x_in = nn.functional.interpolate(
                x_in, mode='nearest', scale_factor=self.upsample)
        out = self.reflection_pad(x_in)
        out = self.conv2d(out)
        return out
utils.py
import torch as t
import torch.nn as nn
import torchvision as tv
from torchvision.models import vgg16
from collections import namedtuple

# Result type of Vgg16.forward. Built once at import time rather than on
# every forward call.
VggOutputs = namedtuple("VggOutputs",
                        ['relu1_2', 'relu2_2', 'relu3_3', 'relu4_3'])


class Vgg16(nn.Module):
    """Loss network: the first 23 layers of a pretrained VGG16.

    ``forward`` returns the activations at layer indices 3, 8, 15 and 22
    of ``vgg16().features`` (relu1_2, relu2_2, relu3_3, relu4_3).
    """

    # Indices in ``features`` whose outputs are collected.
    _TAP_INDICES = frozenset({3, 8, 15, 22})

    def __init__(self):
        super(Vgg16, self).__init__()
        # Keep only the first 23 layers of the feature extractor. See
        # https://pytorch.org/docs/stable/_modules/torchvision/models/vgg.html
        # for the layer layout.
        features = list(vgg16(pretrained=True).features)[:23]
        # ModuleList registers the layers like Sequential does, but lets
        # forward() iterate them manually to capture intermediate outputs.
        self.features = nn.ModuleList(features).eval()

    def forward(self, x):
        """Return a ``VggOutputs`` tuple of the four tapped activations."""
        results = []
        for ii, layer in enumerate(self.features):
            x = layer(x)  # x always holds the output of the latest layer
            if ii in self._TAP_INDICES:
                results.append(x)
        return VggOutputs(*results)


IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]


def gram_matrix(y):
    """Compute the normalised Gram matrix of a feature batch.

    :param y: tensor of shape (b, c, h, w) - batch size, channels,
        height, width
    :return: tensor of shape (b, c, c), divided by ``c * h * w``
    """
    (b, ch, h, w) = y.size()
    features = y.view(b, ch, w * h)
    # Swap axes 1 and 2 (0-based) so that bmm yields a (b, ch, ch) result.
    features_t = features.transpose(1, 2)
    gram = features.bmm(features_t) / (ch * h * w)
    return gram


def get_style_data(path):
    """Load the style image and normalise it with the ImageNet statistics.

    :param path: path of the style image
    :return: tensor of shape (1, c, h, w)
    """
    style_transform = tv.transforms.Compose([
        tv.transforms.ToTensor(),
        tv.transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])
    style_image = tv.datasets.folder.default_loader(path)
    style_tensor = style_transform(style_image)
    return style_tensor.unsqueeze(0)


def normalize_batch(batch):
    """Scale a 0-255 image batch to 0-1 and apply ImageNet normalisation.

    :param batch: tensor of shape (b, ch, h, w) with values in 0~255
    :return: tensor of the same shape, ``(batch / 255 - mean) / std``
    """
    # new_tensor keeps the dtype/device of ``batch``; the (1, ch, 1, 1)
    # shape broadcasts over batch, height and width.
    mean = batch.new_tensor(IMAGENET_MEAN).view(1, -1, 1, 1)
    std = batch.new_tensor(IMAGENET_STD).view(1, -1, 1, 1)
    return (batch / 255.0 - mean) / std
main.py
import torch as t
import torchvision as tv
import torchnet as tnt
from torch.utils import data
from transformer_net import TransformerNet
import utils
# NOTE(review): Vgg16 is imported from a PackedVGG module that is not shown
# here - confirm it exists in the project.
from PackedVGG import Vgg16
from torch.nn import functional as F
import tqdm
import os
import ipdb

mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]


class Config(object):
    """Default options; any attribute can be overridden via keyword args."""

    image_size = 256  # side length images are resized / cropped to
    batch_size = 8
    data_root = 'data/'  # dataset root, e.g. data/coco/a.jpg
    num_workers = 4  # number of data-loading workers
    use_gpu = True  # run on CUDA
    style_path = 'style.jpg'  # path of the style image
    lr = 1e-3  # learning rate
    env = 'neural-style'  # visdom env
    plot_every = 10  # visualise every 10 batches
    epoches = 2  # number of training epochs
    content_weight = 1e5  # weight of content_loss
    style_weight = 1e10  # weight of style_loss
    model_path = None  # path of a pretrained model
    debug_file = 'debug/debug.txt'  # `touch $debug_file` to enter debug mode
    content_path = 'input.png'  # image to be stylised
    result_path = 'output.png'  # where the stylised result is saved


def train(**kwargs):
    """Train the transformer network against a single style image.

    Keyword arguments override the attributes of ``Config``. After every
    epoch the visdom environment is saved and the model weights are
    written to ``checkpoints/<epoch>_style.pth``.
    """
    opt = Config()
    for k_, v_ in kwargs.items():
        setattr(opt, k_, v_)

    device = t.device('cuda') if opt.use_gpu else t.device('cpu')
    vis = utils.Visualizer(opt.env)

    # Data loading: images are kept in the 0-255 range.
    transforms = tv.transforms.Compose([
        tv.transforms.Resize(opt.image_size),
        tv.transforms.CenterCrop(opt.image_size),
        tv.transforms.ToTensor(),
        tv.transforms.Lambda(lambda x: x * 255)
    ])
    dataset = tv.datasets.ImageFolder(opt.data_root, transforms)
    # NOTE(review): opt.num_workers is not passed here, so loading is
    # single-process. Wiring it in may require replacing the lambda above
    # with a module-level function on platforms that spawn workers.
    dataloader = data.DataLoader(dataset, opt.batch_size)

    # Transformation network.
    transformer = TransformerNet()
    if opt.model_path:
        transformer.load_state_dict(
            t.load(opt.model_path, map_location=lambda _s, _: _s))
    transformer.to(device)

    # Loss network (frozen).
    vgg = Vgg16().eval()
    vgg.to(device)
    for param in vgg.parameters():
        param.requires_grad = False

    # Optimizer.
    optimizer = t.optim.Adam(transformer.parameters(), opt.lr)

    # Style image data.
    style = utils.get_style_data(opt.style_path)
    vis.img('style', (style.data[0] * 0.225 + 0.45).clamp(min=0, max=1))
    style = style.to(device)

    # Gram matrices of the style image, computed once.
    with t.no_grad():
        features_style = vgg(style)
        gram_style = [utils.gram_matrix(y) for y in features_style]

    # Running loss averages.
    style_meter = tnt.meter.AverageValueMeter()
    content_meter = tnt.meter.AverageValueMeter()

    # Make sure the checkpoint directory exists before spending an epoch
    # of training only to fail on the first save.
    os.makedirs('checkpoints', exist_ok=True)

    for epoch in range(opt.epoches):
        content_meter.reset()
        style_meter.reset()

        # Wrapping the dataloader (not the enumerate object) lets tqdm
        # show the total number of batches.
        for ii, (x, _) in enumerate(tqdm.tqdm(dataloader)):
            # Training step.
            optimizer.zero_grad()
            x = x.to(device)
            y = transformer(x)
            y = utils.normalize_batch(y)
            x = utils.normalize_batch(x)
            features_y = vgg(y)
            features_x = vgg(x)

            # Content loss.
            content_loss = opt.content_weight * F.mse_loss(
                features_y.relu2_2, features_x.relu2_2)

            # Style loss.
            style_loss = 0.
            for ft_y, gm_s in zip(features_y, gram_style):
                gram_y = utils.gram_matrix(ft_y)
                style_loss += F.mse_loss(gram_y, gm_s.expand_as(gram_y))
            style_loss *= opt.style_weight

            total_loss = content_loss + style_loss
            total_loss.backward()
            optimizer.step()

            # Loss smoothing.
            content_meter.add(content_loss.item())
            style_meter.add(style_loss.item())

            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # Visualisation.
                vis.plot('content_loss', content_meter.value()[0])
                vis.plot('style_loss', style_meter.value()[0])
                # x and y were normalised by utils.normalize_batch, so
                # (approximately) undo that before displaying them.
                vis.img('output',
                        (y.data.cpu()[0] * 0.225 + 0.45).clamp(min=0, max=1))
                vis.img('input',
                        (x.data.cpu()[0] * 0.225 + 0.45).clamp(min=0, max=1))

        # Save the visdom environment and the model.
        vis.save([opt.env])
        t.save(transformer.state_dict(), 'checkpoints/%s_style.pth' % epoch)


@t.no_grad()
def stylize(**kwargs):
    """Apply a trained transformer network to ``content_path``.

    Keyword arguments override the attributes of ``Config``. The result is
    written to ``result_path``.

    :raises ValueError: if ``model_path`` is not set
    """
    opt = Config()
    for k_, v_ in kwargs.items():
        setattr(opt, k_, v_)

    if not opt.model_path:
        # Config.model_path defaults to None; fail with a clear message
        # instead of an obscure error from t.load(None).
        raise ValueError('stylize requires model_path to be set')

    device = t.device('cuda') if opt.use_gpu else t.device('cpu')

    # Image preprocessing (values kept in the 0-255 range).
    content_image = tv.datasets.folder.default_loader(opt.content_path)
    content_transform = tv.transforms.Compose([
        tv.transforms.ToTensor(),
        tv.transforms.Lambda(lambda x: x.mul(255))
    ])
    content_image = content_transform(content_image)
    content_image = content_image.unsqueeze(0).to(device).detach()

    # Model.
    style_model = TransformerNet().eval()
    style_model.load_state_dict(
        t.load(opt.model_path, map_location=lambda _s, _: _s))
    style_model.to(device)

    # Style transfer and saving.
    output = style_model(content_image)
    output_data = output.cpu().data[0]
    tv.utils.save_image((output_data / 255).clamp(min=0, max=1),
                        opt.result_path)


if __name__ == '__main__':
    import sys

    import fire

    if len(sys.argv) > 1:
        # A command was given (e.g. `python main.py stylize --model_path=...`):
        # let Fire dispatch it and do nothing else.
        fire.Fire()
    else:
        # No arguments (e.g. started from an IDE): train with the defaults.
        train()
訓練過程中的結果如下圖所示: