An Example from 《深度學習框架PyTorch入門與實踐》: AI Artist, Neural Style Transfer

These are my notes from studying Chapter 9 of 《深度學習框架PyTorch入門與實踐》. The book implements Fast Neural Style, which trains a feed-forward network to re-render an input image in the style of a given style image.

I strongly recommend this reference: https://blog.csdn.net/u011436316/article/details/102472530?ops_request_misc=%25257B%252522request%25255Fid%252522%25253A%252522160851630416780273321992%252522%25252C%252522scm%252522%25253A%25252220140713.130102334.pc%25255Fblog.%252522%25257D&request_id=160851630416780273321992&biz_id=0&utm_medium=distribute.pc_search_result.none-task-blog-2~blog~first_rank_v1~rank_blog_v1-1-102472530.pc_v1_rank_blog_v1&utm_term=%E7%AC%AC%E5%85%AB%E7%AB%A0

That blog has the complete content from the book, along with Baidu Netdisk links to all of the code and the dataset. The code was tested and runs under PyCharm.

Part of the code, with comments, is shown below:

transformer_net.py

import torch
import torch.nn as nn
import numpy as np

class TransformerNet(nn.Module):
    def __init__(self):
        super(TransformerNet, self).__init__()

        # Downsampling convolution layers
        self.initial_layers = nn.Sequential(
            ConvLayer(3, 32, kernel_size=9, stride=1),
            nn.InstanceNorm2d(32, affine=True),
            # torch.nn.InstanceNorm2d(num_features, eps=1e-05, momentum=0.1,
            #                         affine=False, track_running_stats=False)
            # Unlike Batch Normalization, the statistics are computed per sample
            # (and per channel) instead of across the whole batch.
            nn.ReLU(True),
            ConvLayer(32, 64, kernel_size=3, stride=2),
            nn.InstanceNorm2d(64, affine=True),
            nn.ReLU(True),
            ConvLayer(64, 128, kernel_size=3, stride=2),
            nn.InstanceNorm2d(128, affine=True),
            nn.ReLU(True)
        )

        # Residual layers
        self.res_layers = nn.Sequential(
            ResidualBlock(128),
            ResidualBlock(128),
            ResidualBlock(128),
            ResidualBlock(128),
            ResidualBlock(128)
        )

        # Upsampling layers
        self.upsample_layer = nn.Sequential(
            UpsampleConvLayer(128, 64, kernel_size=3, stride=1, upsample=2),
            nn.InstanceNorm2d(64, affine=True),
            nn.ReLU(True),
            UpsampleConvLayer(64, 32, kernel_size=3, stride=1, upsample=2),
            nn.InstanceNorm2d(32, affine=True),
            nn.ReLU(True),
            ConvLayer(32, 3, kernel_size=9, stride=1)
        )

    def forward(self, x):
        x = self.initial_layers(x)
        x = self.res_layers(x)
        x = self.upsample_layer(x)
        return x


class ConvLayer(nn.Module):
    """Convolution with reflection padding (Conv2d's default padding pads the border with zeros)."""

    def __init__(self, in_channels, out_channels, kernel_size, stride):
        super(ConvLayer, self).__init__()
        reflection_padding = int(np.floor(kernel_size / 2))
        self.reflection_pad = nn.ReflectionPad2d(reflection_padding)
        # torch.nn.ReflectionPad2d(padding) pads by reflecting the border pixels
        # on all four sides.
        self.conv2d = nn.Conv2d(in_channels, out_channels, kernel_size, stride)

    def forward(self, x):
        out = self.reflection_pad(x)
        out = self.conv2d(out)
        return out


class ResidualBlock(nn.Module):
    def __init__(self, channels):
        super(ResidualBlock, self).__init__()
        self.res_block = nn.Sequential(
            ConvLayer(channels, channels, kernel_size=3, stride=1),
            nn.InstanceNorm2d(channels, affine=True),
            nn.ReLU(True),
            ConvLayer(channels, channels, kernel_size=3, stride=1),
            nn.InstanceNorm2d(channels, affine=True),
        )

    def forward(self, x):
        residual = x
        out = self.res_block(x)
        out = out + residual
        return out


class UpsampleConvLayer(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, upsample=None):
        super(UpsampleConvLayer, self).__init__()
        self.upsample = upsample
        reflection_padding = kernel_size // 2
        self.reflection_pad = nn.ReflectionPad2d(reflection_padding)
        self.conv2d = nn.Conv2d(in_channels, out_channels, kernel_size, stride)

    def forward(self, x):
        x_in = x
        if self.upsample:
            x_in = nn.functional.interpolate(x_in, mode='nearest', scale_factor=self.upsample)
        out = self.reflection_pad(x_in)
        out = self.conv2d(out)
        return out
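
TransformerNet is fully convolutional: the two stride-2 convolutions shrink the feature maps by a factor of 4 and the two upsampling layers scale them back up, so for inputs whose sides are multiples of 4 the output has the same shape as the input. A minimal shape check (my own sketch, not from the book):

import torch
from transformer_net import TransformerNet

# A random 256x256 "image" should come back with the same shape.
net = TransformerNet().eval()
with torch.no_grad():
    out = net(torch.randn(1, 3, 256, 256))
print(out.shape)  # torch.Size([1, 3, 256, 256])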

PackedVGG.py

import torch.nn as nn
from torchvision.models import vgg16
from collections import namedtuple


class Vgg16(nn.Module):
    def __init__(self):
        super(Vgg16, self).__init__()
        features = list(vgg16(pretrained=True).features)[:23]   # the first 23 layers (indices 0-22) of vgg16.features
        # vgg16 is one of the VGG variants; its definition and structure can be found in the torchvision source:
        # https://pytorch.org/docs/stable/_modules/torchvision/models/vgg.html#vgg11
        self.features = nn.ModuleList(features).eval()
        # ModuleList is similar to Sequential for holding layers, but it has no forward of its own

    def forward(self, x):
        results = []
        # layers 3, 8, 15, 22 of features are relu1_2, relu2_2, relu3_3, relu4_3 respectively
        for ii, model in enumerate(self.features):
            x = model(x)                # iterate over the sub-modules; x holds the current layer's output
            if ii in {3, 8, 15, 22}:
                results.append(x)       # keep the outputs of the four chosen ReLU layers

        vgg_outputs = namedtuple("VggOutputs", ['relu1_2', 'relu2_2', 'relu3_3', 'relu4_3'])
        return vgg_outputs(*results)        # * unpacks the list into the namedtuple fields defined above
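
To see what the loss network returns, a minimal check (my own sketch; it assumes the file layout above and downloads the pretrained VGG-16 weights on first run):

import torch
from PackedVGG import Vgg16

vgg = Vgg16().eval()
with torch.no_grad():
    feats = vgg(torch.randn(1, 3, 256, 256))
# feats is a namedtuple with fields relu1_2, relu2_2, relu3_3, relu4_3
print(feats.relu2_2.shape)          # torch.Size([1, 128, 128, 128])
print([f.shape[1] for f in feats])  # channel counts: [64, 128, 256, 512]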


utils.py

import torch as t
import torchvision as tv

IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

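# gram_matrix computes, for every sample in the batch, the c*c matrix of inner products
# between the flattened channel feature maps. This covariance-like statistic discards the
# spatial layout, which is why matching Gram matrices matches style/texture rather than content.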
def gram_matrix(y):
    """
    Input shape: (b, c, h, w); output shape: (b, c, c), where
    b = batch size, c = channels, h = height, w = width.
    """
    (b, ch, h, w) = y.size()
    features = y.view(b, ch, w * h)
    features_t = features.transpose(1, 2)
    # transpose swaps axes 1 and 2 (axes are counted from 0)
    gram = features.bmm(features_t) / (ch * h * w)
    # the result has shape b*ch*ch
    return gram

def get_style_data(path):
    """
    Load the style image.
    :param path: path to the image file
    :return: Tensor of shape 1*c*h*w (ImageNet-normalized)
    """
    style_transform = tv.transforms.Compose([
        tv.transforms.ToTensor(),
        tv.transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])
    style_image = tv.datasets.folder.default_loader(path)
    style_tensor = style_transform(style_image)
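    # unsqueeze(0) adds a batch dimension: c*h*w -> 1*c*h*w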
    return style_tensor.unsqueeze(0)

def normalize_batch(batch):
    """
    :param batch: b, ch, h, w, values in 0~255
    :return: b, ch, h, w, ImageNet-normalized, values roughly in -2.1~2.6
    """
    mean = batch.data.new(IMAGENET_MEAN).view(1, -1, 1, 1)
    std = batch.data.new(IMAGENET_STD).view(1, -1, 1, 1)
    mean = t.autograd.Variable(mean.expand_as(batch.data))
    std = t.autograd.Variable(std.expand_as(batch.data))
    return (batch / 255.0 - mean) / std
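
Two quick checks of the helpers above (my own sketch; gram_matrix and normalize_batch are imported from the utils.py shown here):

import torch
from utils import gram_matrix, normalize_batch

# gram_matrix: (b, c, h, w) -> (b, c, c), and each Gram matrix is symmetric
feat = torch.randn(2, 8, 16, 16)
g = gram_matrix(feat)
print(g.shape, torch.allclose(g, g.transpose(1, 2)))  # torch.Size([2, 8, 8]) True

# normalize_batch: 0~255 inputs end up roughly in the -2.1~2.6 range
batch = torch.full((1, 3, 4, 4), 128.0)               # a mid-gray "image"
out = normalize_batch(batch)
print(out.min().item(), out.max().item())             # about 0.07 and 0.43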

main.py

import torch as t
import torchvision as tv
import torchnet as tnt

from torch.utils import data
from transformer_net import TransformerNet
import utils
from PackedVGG import Vgg16
from torch.nn import functional as F
import tqdm
import os
import ipdb

mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]


class Config(object):
    image_size = 256  # image size
    batch_size = 8
    data_root = 'data/'  # dataset root, e.g. data/coco/a.jpg
    num_workers = 4  # number of worker processes for data loading
    use_gpu = True  # use the GPU

    style_path = 'style.jpg'  # path to the style image
    lr = 1e-3  # learning rate

    env = 'neural-style'  # visdom env
    plot_every = 10  # visualize every 10 batches

    epoches = 2  # number of training epochs

    content_weight = 1e5  # weight of the content loss
    style_weight = 1e10  # weight of the style loss

    model_path = None  # path to a pretrained model
    debug_file = 'debug/debug.txt'  # touch $debug_file to enter debug mode

    content_path = 'input.png'  # image to apply style transfer to
    result_path = 'output.png'  # where to save the stylized result


def train(**kwargs):
    opt = Config()
    for k_, v_ in kwargs.items():
        setattr(opt, k_, v_)
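    # keyword arguments (e.g. command-line flags forwarded by fire) override the Config defaults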
    
    device = t.device('cuda') if opt.use_gpu else t.device('cpu')
    vis = utils.Visualizer(opt.env)

    # Data loading
    transforms = tv.transforms.Compose([
        tv.transforms.Resize(opt.image_size),
        tv.transforms.CenterCrop(opt.image_size),
        tv.transforms.ToTensor(),
        tv.transforms.Lambda(lambda x: x * 255)
    ])
    dataset = tv.datasets.ImageFolder(opt.data_root, transforms)
    dataloader = data.DataLoader(dataset, opt.batch_size)

    # Image transformation network
    transformer = TransformerNet()
    if opt.model_path:
        transformer.load_state_dict(t.load(opt.model_path, map_location=lambda _s, _: _s))
    transformer.to(device)

    # Loss network: Vgg16
    vgg = Vgg16().eval()
    vgg.to(device)
    for param in vgg.parameters():
        param.requires_grad = False

    # Optimizer
    optimizer = t.optim.Adam(transformer.parameters(), opt.lr)

    # Load the style image
    style = utils.get_style_data(opt.style_path)
    vis.img('style', (style.data[0] * 0.225 + 0.45).clamp(min=0, max=1))
    style = style.to(device)


    # Gram matrices of the style image's features
    with t.no_grad():
        features_style = vgg(style)
        gram_style = [utils.gram_matrix(y) for y in features_style]

    # Loss statistics (running averages)
    style_meter = tnt.meter.AverageValueMeter()
    content_meter = tnt.meter.AverageValueMeter()

    for epoch in range(opt.epoches):
        content_meter.reset()
        style_meter.reset()

        for ii, (x, _) in tqdm.tqdm(enumerate(dataloader)):

            # Training step
            optimizer.zero_grad()
            x = x.to(device)
            y = transformer(x)
            y = utils.normalize_batch(y)
            x = utils.normalize_batch(x)
            features_y = vgg(y)
            features_x = vgg(x)

            # content loss
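            # MSE between the relu2_2 features of the stylized output y and the
            # content image x, scaled by content_weight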
            content_loss = opt.content_weight * F.mse_loss(features_y.relu2_2, features_x.relu2_2)

            # style loss
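            # gram_style comes from a single style image (batch of 1), so each (1, c, c)
            # Gram matrix is expanded along the batch dimension to match gram_y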
            style_loss = 0.
            for ft_y, gm_s in zip(features_y, gram_style):
                gram_y = utils.gram_matrix(ft_y)
                style_loss += F.mse_loss(gram_y, gm_s.expand_as(gram_y))
            style_loss *= opt.style_weight

            total_loss = content_loss + style_loss
            total_loss.backward()
            optimizer.step()

            # Loss smoothing (running averages)
            content_meter.add(content_loss.item())
            style_meter.add(style_loss.item())

            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # Visualization
                vis.plot('content_loss', content_meter.value()[0])
                vis.plot('style_loss', style_meter.value()[0])
                # x and y were normalized by utils.normalize_batch, so denormalize them for
                # display (0.45 and 0.225 roughly approximate the ImageNet mean and std)
                vis.img('output', (y.data.cpu()[0] * 0.225 + 0.45).clamp(min=0, max=1))
                vis.img('input', (x.data.cpu()[0] * 0.225 + 0.45).clamp(min=0, max=1))

        # Save the visdom state and the model checkpoint
        vis.save([opt.env])
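        # note: the checkpoints/ directory must already exist, otherwise t.save will fail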
        t.save(transformer.state_dict(), 'checkpoints/%s_style.pth' % epoch)

@t.no_grad()
def stylize(**kwargs):
    opt = Config()

    for k_, v_ in kwargs.items():
        setattr(opt, k_, v_)
    device = t.device('cuda') if opt.use_gpu else t.device('cpu')
    
    # Image preprocessing
    content_image = tv.datasets.folder.default_loader(opt.content_path)
    content_transform = tv.transforms.Compose([
        tv.transforms.ToTensor(),
        tv.transforms.Lambda(lambda x: x.mul(255))
    ])
    content_image = content_transform(content_image)
    content_image = content_image.unsqueeze(0).to(device).detach()

    # Model
    style_model = TransformerNet().eval()
    style_model.load_state_dict(t.load(opt.model_path, map_location=lambda _s, _: _s))
    style_model.to(device)

    # Style transfer and saving the result
    output = style_model(content_image)
    output_data = output.cpu().data[0]
    tv.utils.save_image(((output_data / 255)).clamp(min=0, max=1), opt.result_path)


if __name__ == '__main__':
    import fire

    fire.Fire()
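
With fire.Fire(), train and stylize become subcommands, and any Config attribute can be overridden with a flag. Typical invocations look roughly like this (the paths are illustrative, and a visdom server should be running for training):

python main.py train --use_gpu=True --data_root=data/
python main.py stylize --model_path=checkpoints/1_style.pth --content_path=input.png --result_path=output.png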

The results produced during training are shown in the figure below: