PyTorch YOLOv3 多 GPU 訓練
阿新 • • 發佈:2019-02-07
pytorch 多gpu訓練:
# -*- coding:utf-8 -*- from __future__ import division import datetime import torch import torch.nn as nn import torch.nn.functional as F from torch.autograd import Variable import numpy as np from PIL import Image from utils.parse_config import * from utils.utils import build_targets fromcollections import defaultdict def create_modules(module_defs): """ Constructs module list of layer blocks from module configuration in module_defs """ #根據cfg檔案建立yolov3網路結構 hyperparams = module_defs.pop(0) output_filters = [int(hyperparams['channels'])] module_list = nn.ModuleList() fori, module_def in enumerate(module_defs): modules = nn.Sequential() if module_def['type'] == 'convolutional': bn = int(module_def['batch_normalize']) filters = int(module_def['filters']) kernel_size = int(module_def['size']) pad = (kernel_size - 1) // 2 if int(module_def['pad']) else 0 modules.add_module('conv_%d' % i, nn.Conv2d(in_channels=output_filters[-1], out_channels=filters, kernel_size=kernel_size, stride=int(module_def['stride']), padding=pad, bias=not bn)) if bn: modules.add_module('batch_norm_%d' % i, nn.BatchNorm2d(filters)) if module_def['activation'] == 'leaky': modules.add_module('leaky_%d' % i, nn.LeakyReLU(0.1)) elif module_def['type'] == 'upsample': upsample = nn.Upsample( scale_factor=int(module_def['stride']), mode='nearest') modules.add_module('upsample_%d' % i, upsample) elif module_def['type'] == 'route': layers = [int(x) for x in module_def["layers"].split(',')] filters = sum([output_filters[layer_i] for layer_i in layers]) modules.add_module('route_%d' % i, EmptyLayer()) elif module_def['type'] == 'shortcut': filters = output_filters[int(module_def['from'])] modules.add_module("shortcut_%d" % i, EmptyLayer()) elif module_def["type"] == "yolo": anchor_idxs = [int(x) for x in module_def["mask"].split(",")] # Extract anchors anchors = module_def["anchors"] anchors = [anchors[i] for i in anchor_idxs] num_classes = int(module_def['classes']) img_height = int(hyperparams['height']) 
# Define detection layer yolo_layer = YOLOLayer(anchors, num_classes, img_height) modules.add_module('yolo_%d' % i, yolo_layer) # Register module list and number of output filters module_list.append(modules) output_filters.append(filters) return hyperparams, module_list class EmptyLayer(nn.Module): """Placeholder for 'route' and 'shortcut' layers""" def __init__(self): super(EmptyLayer, self).__init__() class YOLOLayer(nn.Module): """Detection layer""" def __init__(self, anchors, num_classes, image_dim): super(YOLOLayer, self).__init__() self.anchors = anchors self.scaled_anchors = None self.num_anchors = len(anchors) self.num_classes = num_classes self.bbox_attrs = 5 + num_classes self.image_dim = image_dim self.ignore_thres = 0.5 self.coord_scale = 1 self.noobject_scale = 1 self.object_scale = 5 self.class_scale = 1 self.seen = 0 self.mse_loss = nn.MSELoss() self.bce_loss = nn.BCELoss() # self.bce_logits_loss = nn.BCEWithLogitsLoss() def forward(self, x, targets=None): bs = x.size(0) g_dim = x.size(2) stride = self.image_dim / g_dim # Tensors for cuda support FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor prediction = x.view(bs, self.num_anchors, self.bbox_attrs, g_dim, g_dim).permute(0, 1, 3, 4, 2).contiguous() # Get outputs x = torch.sigmoid(prediction[..., 0]) # Center x y = torch.sigmoid(prediction[..., 1]) # Center y w = prediction[..., 2] # Width h = prediction[..., 3] # Height conf = torch.sigmoid(prediction[..., 4]) # Conf pred_cls = torch.sigmoid(prediction[..., 5:]) # Cls pred. 
# Calculate offsets for each grid grid_x = torch.linspace(0, g_dim-1, g_dim).repeat(g_dim,1).repeat(bs*self.num_anchors, 1, 1).view(x.shape).type(FloatTensor) grid_y = torch.linspace(0, g_dim-1, g_dim).repeat(g_dim,1).t().repeat(bs*self.num_anchors, 1, 1).view(y.shape).type(FloatTensor) scaled_anchors = [(a_w / stride, a_h / stride) for a_w, a_h in self.anchors] anchor_w = FloatTensor(scaled_anchors).index_select(1, LongTensor([0])) anchor_h = FloatTensor(scaled_anchors).index_select(1, LongTensor([1])) anchor_w = anchor_w.repeat(bs, 1).repeat(1, 1, g_dim*g_dim).view(w.shape) anchor_h = anchor_h.repeat(bs, 1).repeat(1, 1, g_dim*g_dim).view(h.shape) # Add offset and scale with anchors pred_boxes = FloatTensor(prediction[..., :4].shape) pred_boxes[..., 0] = x.data + grid_x pred_boxes[..., 1] = y.data + grid_y pred_boxes[..., 2] = torch.exp(w.data) * anchor_w pred_boxes[..., 3] = torch.exp(h.data) * anchor_h self.seen += prediction.size(0) # Training if targets is not None: if x.is_cuda: self.mse_loss = self.mse_loss.cuda() self.bce_loss = self.bce_loss.cuda() nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls = build_targets(pred_boxes.cpu().data, targets.cpu().data, scaled_anchors, self.num_anchors, self.num_classes, g_dim, self.ignore_thres) # nProposals = int((conf > 0.25).sum().item()) recall = float(nCorrect / nGT) if nGT else 1 tx = Variable(tx.type(FloatTensor), requires_grad=False) ty = Variable(ty.type(FloatTensor), requires_grad=False) tw = Variable(tw.type(FloatTensor), requires_grad=False) th = Variable(th.type(FloatTensor), requires_grad=False) tconf = Variable(tconf.type(FloatTensor), requires_grad=False) tcls = Variable(tcls[cls_mask == 1].type(FloatTensor), requires_grad=False) coord_mask = Variable(coord_mask.type(FloatTensor), requires_grad=False) conf_mask = Variable(conf_mask.type(FloatTensor), requires_grad=False) loss_x = self.coord_scale * self.mse_loss(x[coord_mask == 1], tx[coord_mask == 1]) / 2 loss_y = 
self.coord_scale * self.mse_loss(y[coord_mask == 1], ty[coord_mask == 1]) / 2 loss_w = self.coord_scale * self.mse_loss(w[coord_mask == 1], tw[coord_mask == 1]) / 2 loss_h = self.coord_scale * self.mse_loss(h[coord_mask == 1], th[coord_mask == 1]) / 2 loss_conf = self.bce_loss(conf[conf_mask == 1], tconf[conf_mask == 1]) loss_cls = self.class_scale * self.bce_loss(pred_cls[cls_mask == 1], tcls) loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls return loss, loss_x.item(), loss_y.item(), loss_w.item(), loss_h.item(), loss_conf.item(), loss_cls.item(),recall else: # If not in training phase return predictions output = torch.cat((pred_boxes.view(bs, -1, 4) * stride, conf.view(bs, -1, 1), pred_cls.view(bs, -1, self.num_classes)), -1) return output.data class Darknet(nn.Module): """YOLOv3 object detection model""" def __init__(self, module_defs, img_size=416): super(Darknet, self).__init__() self.module_defs = module_defs self.hyperparams, self.module_list = create_modules(self.module_defs)#根據config檔案建立yolov3網路模型,返回網路引數和torch版神經網路 # print("module",self.module_list) self.img_size = img_size self.loss_names = ['x', 'y', 'w', 'h', 'conf', 'cls', 'recall'] self.losses = defaultdict(float) def forward(self, x, targets=None): is_training = targets is not None output = [] for name in self.loss_names: self.losses[name] =0 layer_outputs = [] for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)): if module_def['type'] in ['convolutional', 'upsample']: x = module(x) elif module_def['type'] == 'route': layer_i = [int(x) for x in module_def['layers'].split(',')] x = torch.cat([layer_outputs[i] for i in layer_i], 1) elif module_def['type'] == 'shortcut': layer_i = int(module_def['from']) x = layer_outputs[-1] + layer_outputs[layer_i] elif module_def['type'] == 'yolo': # Train phase: get loss if is_training: x, *losses = module[0](x, targets) for name, loss in zip(self.loss_names, losses): self.losses[name] += loss # Test phase: Get detections 
else: x = module(x) output.append(x) layer_outputs.append(x) self.losses['recall'] /= 3 if is_training: return sum(output).view(-1, ) else: return torch.cat(output, 1) # return sum(output) if is_training else torch.cat(output, 1) def load_weights(self, weights_path,is_training = False): """Parses and loads the weights stored in 'weights_path'""" #Open the weights file fp = open(weights_path, "rb") header = np.fromfile(fp, dtype=np.int32, count=5) # First five are header values # Needed to write header when saving weights self.header_info = header self.seen = header[3] weights = np.fromfile(fp, dtype=np.float32) # The rest are weights fp.close() ptr = 0 for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)): if module_def['type'] == 'convolutional': conv_layer = module[0] if module_def['batch_normalize']: # Load BN bias, weights, running mean and running variance bn_layer = module[1] num_b = bn_layer.bias.numel() # Number of biases # Bias bn_b = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(bn_layer.bias) bn_layer.bias.data.copy_(bn_b) ptr += num_b # Weight bn_w = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(bn_layer.weight) bn_layer.weight.data.copy_(bn_w) ptr += num_b # Running Mean bn_rm = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(bn_layer.running_mean) bn_layer.running_mean.data.copy_(bn_rm) ptr += num_b # Running Var bn_rv = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(bn_layer.running_var) bn_layer.running_var.data.copy_(bn_rv) ptr += num_b else: # Load conv. bias num_b = conv_layer.bias.numel() conv_b = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(conv_layer.bias) conv_layer.bias.data.copy_(conv_b) ptr += num_b # Load conv. 
weights num_w = conv_layer.weight.numel() conv_w = torch.from_numpy(weights[ptr:ptr + num_w]).view_as(conv_layer.weight)#權重引數賦值 conv_layer.weight.data.copy_(conv_w) ptr += num_w """ @:param path - path of the new weights file @:param cutoff - save layers between 0 and cutoff (cutoff = -1 -> all are saved) """ def save_weights(self, path, cutoff=-1): fp = open(path, 'wb') self.header_info[3] = self.seen self.header_info.tofile(fp) # Iterate through layers for i, (module_def, module) in enumerate(zip(self.module_defs[:cutoff], self.module_list[:cutoff])): if module_def['type'] == 'convolutional': conv_layer = module[0] # If batch norm, load bn first if module_def['batch_normalize']: bn_layer = module[1] bn_layer.bias.data.cpu().numpy().tofile(fp) bn_layer.weight.data.cpu().numpy().tofile(fp) bn_layer.running_mean.data.cpu().numpy().tofile(fp) bn_layer.running_var.data.cpu().numpy().tofile(fp) # Load conv bias else: conv_layer.bias.data.cpu().numpy().tofile(fp) # Load conv weights conv_layer.weight.data.cpu().numpy().tofile(fp) fp.close()
訓練程式碼中的關鍵語句:
optimizer.module.zero_grad()
model.module.save_weights
loss = model(imgs, targets)
torch.sum(loss).backward()
optimizer.module.step()
# Training-loop excerpt. `opt`, `dataloader`, `model`, `optimizer`, `Tensor`,
# `last_total_loss` and `adjust_learning_rate` come from the surrounding
# training script.
#
# Resolve the underlying objects once so the loop works both when they are
# wrapped (exposing `.module`) and when they are used directly.
net = getattr(model, 'module', model)
optim_impl = getattr(optimizer, 'module', optimizer)

for epoch in range(opt.epochs):
    for batch_i, (_, imgs, targets) in enumerate(dataloader):
        imgs = Variable(imgs.type(Tensor))
        targets = Variable(targets.type(Tensor), requires_grad=False)

        optim_impl.zero_grad()
        loss = model(imgs, targets)
        # Under DataParallel `loss` holds one value per replica; reduce it
        # to a scalar before back-propagating.
        batch_loss = torch.sum(loss)
        batch_loss.backward()
        optim_impl.step()
        # Keep a plain float for bookkeeping so the running total does not
        # hold on to autograd tensors across iterations.
        batch_loss_value = batch_loss.item()

        now = datetime.datetime.now()
        strftime = now.strftime("%H:%M:%S")
        print(strftime, epoch, opt.epochs, batch_i, len(dataloader), loss)

        # Every 40 batches compare the accumulated loss with the previous
        # window and decay the learning rate if it grew by more than 1%.
        if batch_i % 40 == 39:
            if last_total_loss > 0 and total_loss > last_total_loss * 1.01:
                print("total_loss", total_loss)
                adjust_learning_rate(optimizer)
            else:
                print("total_loss", total_loss, last_total_loss)
                last_total_loss = total_loss
            total_loss = batch_loss_value  # start a new window
        elif batch_i == 0:
            total_loss = batch_loss_value
        else:
            total_loss += batch_loss_value

        print('%s [Epoch %d/%d, Batch %d/%d Losses: x %f, y %f, w %f, h %f, conf %f, cls %f, total %f, recall: %.5f]' %
              (strftime, epoch, opt.epochs, batch_i, len(dataloader),
               net.losses['x'], net.losses['y'], net.losses['w'],
               net.losses['h'], net.losses['conf'], net.losses['cls'],
               batch_loss_value, net.losses['recall']))

    if epoch % opt.checkpoint_interval == 0:
        net.save_weights('%s/%d.weights' % (opt.checkpoint_dir, epoch))
# -*- coding:utf-8 -*- from __future__ import division from models import * from utils.utils import * from utils.datasets import * from utils.parse_config import * from logger import Logger import os import sys import time import datetime import argparse import torch from torch.utils.data import DataLoader from torch.autograd import Variable import torch.optim as optim parser = argparse.ArgumentParser() parser.add_argument('--epochs', type=int, default=2001, help='number of epochs') parser.add_argument('--image_folder', type=str, default='data/samples', help='path to dataset') parser.add_argument('--batch_size', type=int, default=4, help='size of each image batch') parser.add_argument('--learning_rate', type=float, default=0.01, help='learning_rate') parser.add_argument('--train_dir', type=str, default=r'E:\team-CV\dataset\tiny_data\VOC2007/',help='train_dir') parser.add_argument('--model_config_path', type=str, default='config/yolov3_2cls.cfg', help='path to model config file') parser.add_argument('--data_config_path', type=str, default='config/coco.data', help='path to data config file') parser.add_argument('--weights_path', type=str, default='weights/yolov3.weights', help='path to weights file') # parser.add_argument('--weights_path', type=str, default='checkpoints/40.weights', help='path to weights file') parser.add_argument('--class_path', type=str, default='data/coco_2cls.names', help='path to class label file') parser.add_argument('--conf_thres', type=float, default=0.8, help='object confidence threshold') parser.add_argument('--nms_thres', type=float, default=0.4, help='iou thresshold for non-maximum suppression') parser.add_argument('--n_cpu', type=int, default=0, help='number of cpu threads to use during batch generation') parser.add_argument('--img_size', type=int, default=416, help='size of each image dimension') parser.add_argument('--checkpoint_interval', type=int, default=4, help='interval between saving model weights') 
parser.add_argument('--checkpoint_dir', type=str, default='checkpoints', help='directory where model checkpoints are saved') opt = parser.parse_args() print(opt) os.makedirs('output', exist_ok=True) os.makedirs('checkpoints', exist_ok=True) def adjust_learning_rate(optimizer, decay_rate=0.5): for param_group in optimizer.module.param_groups: if(param_group['lr']>1e-8): param_group['lr'] = param_group['lr'] * decay_rate print(optimizer.module) cuda = True if torch.cuda.is_available else False classes = load_classes(opt.class_path) module_defs=parse_model_config(opt.model_config_path) hyperparams = module_defs[0] anchors=hyperparams["anchors"] anchors = [int(x) for x in anchors.split(",")] anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)] module_defs[83]["anchors"]=anchors module_defs[95]["anchors"]=anchors module_defs[107]["anchors"]=anchors batch_size = opt.batch_size# int(hyperparams['batch']) subdivisions = int(hyperparams['subdivisions']) sub_batch = batch_size // subdivisions learning_rate = opt.learning_rate momentum = float(hyperparams['momentum']) decay = float(hyperparams['decay']) burn_in = int(hyperparams['burn_in']) hyperparams['height']=hyperparams['width']=opt.img_size if __name__ == '__main__': dataloader = torch.utils.data.DataLoader( ListDataset(opt.train_dir,img_size=opt.img_size,is_training = 1,data_size=10000), batch_size=batch_size, shuffle=1, num_workers=opt.n_cpu) model = Darknet(module_defs,img_size=opt.img_size) model.load_weights(opt.weights_path,is_training=True) #model.apply(weights_init_normal) ngpus = 4 if ngpus >= 1: device = torch.device("cuda") else: device = torch.device("cpu") if cuda: if ngpus > 1: model = torch.nn.DataParallel(model).to(device) # model = nn.parallel.DataParallel(model,device_ids=_DEVICE_ID).cuda() else: model = model.to(device) model.train() Tensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor # optimizer = optim.SGD(model.parameters(), lr=learning_rate/batch_size, 
momentum=momentum, dampening=0, weight_decay=decay*batch_size) optimizer = optim.Adam(model.parameters(), lr=learning_rate/batch_size, weight_decay=decay*batch_size) optimizer = torch.nn.DataParallel(optimizer).to(device) print("subdivisions",subdivisions) logger = Logger('./logs') total_loss=0 last_total_loss=0 for epoch in range(opt.epochs): for batch_i, (_, imgs, targets) in enumerate(dataloader): imgs = Variable(imgs.type(Tensor)) targets = Variable(targets.type(Tensor), requires_grad=False) optimizer.module.zero_grad() loss = model(imgs, targets) # loss.backward() # optimizer.step() torch.sum(loss).backward() optimizer.module.step() strftime = datetime.datetime.now().strftime("%H:%M:%S") # print(strftime, epoch, opt.epochs, batch_i, len(dataloader), loss) if batch_i % 40 == 39: if last_total_loss > 0 and total_loss > last_total_loss * 1.01: print("total_loss", total_loss) adjust_learning_rate(optimizer) else: last_total_loss = total_loss total_loss = torch.sum(loss) elif batch_i == 0: total_loss = torch.sum(loss) else: total_loss += torch.sum(loss) # if epoch > 0 and batch_i == 0: # if torch.sum(loss) > mean_loss / batch_size : # print("mean_loss", mean_loss) # adjust_learning_rate(optimizer) # mean_loss = torch.sum(loss) # else: # mean_loss += torch.sum(loss) # info = {'loss': loss.item(), 'cls': model.losses['cls'], 'conf': model.losses['conf']} # for tag, value in info.items(): # logger.scalar_summary(tag, value, epoch) print('%s [Epoch %d/%d, Batch %d/%d Losses: x %f, y %f, w %f, h %f, conf %f, cls %f, total %f, recall: %.5f]' % (strftime, epoch, opt.epochs, batch_i, len(dataloader), model.module.losses['x'], model.module.losses['y'], model.module.losses['w'], model.module.losses['h'], model.module.losses['conf'], model.module.losses['cls'], torch.sum(loss), model.module.losses['recall'])) if epoch % opt.checkpoint_interval == 0: model.module.save_weights('%s/%d.weights' % (opt.checkpoint_dir, epoch))