PyTorch程式碼學習-ImageNET訓練
阿新 • • 發佈:2018-10-31
PyTorch程式碼學習-ImageNET訓練
文章說明:本人學習pytorch/examples/ImageNET/main()理解(待續)
# -*- coding: utf-8 -*-
import argparse # 命令列直譯器相關程式,命令列直譯器
import os # 作業系統檔案相關
import shutil # 檔案高階操作
import time # 呼叫時間模組
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn # gpu 使用
import torch.distributed as dist # 分散式(pytorch 0.2)
import torch.optim # 優化器
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
# Collect every entry of torchvision.models whose name is lowercase, does not
# start with "__" and is callable, and keep the names in ascending order.
model_names = sorted(
    key for key, entry in models.__dict__.items()
    if key.islower() and not key.startswith("__") and callable(entry)
)

# Command-line interface of the training script.
parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')

# Positional argument: root directory of the dataset.
parser.add_argument('data', metavar='DIR', help='path to dataset')

# Model selection.
parser.add_argument(
    '--arch', '-a',
    metavar='ARCH',
    default='resnet18',
    choices=model_names,
    help='model architecture: ' + ' | '.join(model_names) + ' (default: resnet18)',
)

# Data loading and schedule.
parser.add_argument(
    '-j', '--workers',
    metavar='N', type=int, default=4,
    help='number of data loading workers (default: 4)',
)
parser.add_argument(
    '--epochs',
    metavar='N', type=int, default=90,
    help='number of total epochs to run',
)
parser.add_argument(
    '--start-epoch',
    metavar='N', type=int, default=0,
    help='manual epoch number (useful on restarts)',
)
parser.add_argument(
    '-b', '--batch-size',
    metavar='N', type=int, default=256,
    help='mini-batch size (default: 256)',
)

# Optimizer hyper-parameters.
parser.add_argument(
    '--lr', '--learning-rate',
    metavar='LR', type=float, default=0.1,
    help='initial learning rate',
)
parser.add_argument(
    '--momentum',
    metavar='M', type=float, default=0.9,
    help='momentum',
)
parser.add_argument(
    '--weight-decay', '--wd',
    metavar='W', type=float, default=1e-4,
    help='weight decay (default: 1e-4)',
)

# Logging, checkpointing and run mode.
parser.add_argument(
    '--print-freq', '-p',
    metavar='N', type=int, default=10,
    help='print frequency (default: 10)',
)
parser.add_argument(
    '--resume',
    metavar='PATH', type=str, default='',
    help='path to latest checkpoint (default: none)',
)
parser.add_argument(
    '-e', '--evaluate',
    dest='evaluate', action='store_true',
    help='evaluate model on validation set',
)
parser.add_argument(
    '--pretrained',
    dest='pretrained', action='store_true',
    help='use pre-trained model',
)

# Distributed training.
parser.add_argument(
    '--world-size',
    type=int, default=1,
    help='number of distributed processes',
)
parser.add_argument(
    '--dist-url',
    type=str, default='tcp://224.66.41.62:23456',
    help='url used to set up distributed training',
)
parser.add_argument(
    '--dist-backend',
    type=str, default='gloo',
    help='distributed backend',
)

# Best top-1 precision seen so far; updated by main().
best_prec1 = 0
# 定義主函式main()
def main():
    """Parse the command line, build model and data loaders, then evaluate or train.

    Side effects: sets the module-level ``args`` and ``best_prec1``, may
    initialise the distributed process group, and writes a checkpoint after
    every training epoch via ``save_checkpoint``.
    """
    global args, best_prec1
    # parse_args() reads sys.argv[1:] by default and returns a Namespace whose
    # attributes are the parsed options (e.g. args.lr).
    args = parser.parse_args()

    # Distributed mode is used whenever more than one process is requested.
    args.distributed = args.world_size > 1
    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    # ---- step 1: create the model and move it to the GPU ----
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if not args.distributed:
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            # For AlexNet/VGG only the ``features`` sub-module is wrapped in
            # DataParallel; the rest of the model is moved to the GPU as-is.
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
    else:
        # Wrap model in DistributedDataParallel (CUDA only for the moment)
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)

    # ---- step 2: define loss function (criterion) and optimizer ----
    criterion = nn.CrossEntropyLoss().cuda()
    # SGD with momentum (default 0.9) and weight decay (default 1e-4).
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # ---- step 3: optionally resume from a checkpoint ----
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            # The checkpoint stores ``epoch + 1`` (see the save_checkpoint
            # call below), so training continues with the next epoch.
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # ---- step 4: data loading ----
    # The dataset root is expected to contain 'train' and 'val' sub-folders.
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    # Per-channel normalisation: (x - mean) / std.
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            # RandomResizedCrop is the current name of the deprecated
            # RandomSizedCrop transform.
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        # Use a DistributedSampler to restrict each process to a distinct
        # subset of the dataset.
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    # Shuffling is only requested from the DataLoader when no sampler is used.
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            # Resize is the current name of the deprecated Scale transform.
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    # ---- step 5: evaluation-only mode ----
    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    # ---- step 6: training loop ----
    for epoch in range(args.start_epoch, args.epochs):
        # Use .set_epoch() to reshuffle the dataset partition at every epoch.
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer': optimizer.state_dict(),
        }, is_best)
# 定義相關函式
# def train 函式
def train(train_loader, model, criterion, optimizer, epoch):
    """Train ``model`` for one pass over ``train_loader`` and print progress.

    Args:
        train_loader: iterable yielding ``(input, target)`` batches.
        model: network called on each input batch; switched to train mode.
        criterion: loss function called as ``criterion(output, target)``.
        optimizer: optimizer whose ``zero_grad()``/``step()`` are invoked once
            per batch.
        epoch: current epoch index, used only in the progress output.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    for i, (images, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        # ``async`` became a reserved word in Python 3.7; ``non_blocking`` is
        # the replacement keyword for the asynchronous host-to-GPU copy.
        # Only the target is moved explicitly; the input batch is handed to
        # the model unchanged.
        target = target.cuda(non_blocking=True)
        batch_size = images.size(0)

        # compute output (tensors no longer need a Variable wrapper)
        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss; loss.item() replaces loss.data[0],
        # which fails on 0-dim loss tensors
        prec1, prec5 = accuracy(output.detach(), target, topk=(1, 5))
        losses.update(loss.item(), batch_size)
        top1.update(prec1[0], batch_size)
        top5.update(prec5[0], batch_size)

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # report every ``args.print_freq`` batches (default 10)
        if i % args.print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                      epoch, i, len(train_loader), batch_time=batch_time,
                      data_time=data_time, loss=losses, top1=top1, top5=top5))
def validate(val_loader, model, criterion):
    """Evaluate ``model`` on ``val_loader`` and return the average top-1 precision.

    Args:
        val_loader: iterable yielding ``(input, target)`` batches.
        model: network called on each input batch; switched to eval mode.
        criterion: loss function called as ``criterion(output, target)``.

    Returns:
        The running average stored in the top-1 meter (``top1.avg``).
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    # ``Variable(..., volatile=True)`` no longer disables autograd; the
    # supported way to run inference without recording history is no_grad().
    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(val_loader):
            # ``async`` is a reserved word since Python 3.7; ``non_blocking``
            # is the replacement keyword.
            target = target.cuda(non_blocking=True)
            batch_size = images.size(0)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss; loss.item() replaces
            # loss.data[0], which fails on 0-dim loss tensors
            prec1, prec5 = accuracy(output.detach(), target, topk=(1, 5))
            losses.update(loss.item(), batch_size)
            top1.update(prec1[0], batch_size)
            top5.update(prec5[0], batch_size)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.print_freq == 0:
                print('Test: [{0}/{1}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                      'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                          i, len(val_loader), batch_time=batch_time, loss=losses,
                          top1=top1, top5=top5))

    print(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}'
          .format(top1=top1, top5=top5))

    return top1.avg
# 儲存當前節點
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Serialize ``state`` to ``filename``; also keep a copy of the best one.

    When ``is_best`` is true, the file just written is additionally copied to
    'model_best.pth.tar'.
    """
    torch.save(state, filename)
    if not is_best:
        return
    shutil.copyfile(filename, 'model_best.pth.tar')
# 計算並存儲引數當前值或平均值
class AverageMeter(object):
    """Computes and stores the average and current value.

    Attributes:
        val: the value passed to the most recent ``update`` call.
        sum: running total of ``val * n`` over all updates.
        count: running total of the weights ``n``.
        avg: ``sum / count``, i.e. the weighted mean of all values seen.

    Typical use::

        batch_time = AverageMeter()
        batch_time.update(time.time() - end)    # weight defaults to 1
        losses.update(loss_value, batch_size)   # value weighted by batch size
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all statistics back to zero."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as the current value, counted ``n`` times in the average."""
        self.val = val
        self.sum += val * n
        self.count += n
        # Guard against a zero total weight (e.g. an empty batch with n == 0),
        # which would otherwise raise ZeroDivisionError.
        self.avg = self.sum / self.count if self.count else 0
# 更新 learning_rate :每30步,學習率降至前的10分之1
def adjust_learning_rate(optimizer, epoch):
    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
    # Number of completed 30-epoch periods determines the power of 0.1 applied
    # to the initial learning rate taken from the command line (args.lr).
    completed_periods = epoch // 30
    new_lr = args.lr * (0.1 ** completed_periods)
    # Write the new rate into every parameter group of the optimizer.
    for group in optimizer.param_groups:
        group['lr'] = new_lr
# 計算準確度
def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k.

    Args:
        output: score tensor; ``topk`` is taken along dimension 1
            (assumes shape (batch, num_classes) -- confirm against caller).
        target: tensor of ground-truth class indices, one per sample.
        topk: tuple of k values to evaluate.

    Returns:
        A list with one single-element tensor per entry of ``topk``, each
        holding the percentage of samples whose target is among the k
        highest-scoring classes.

    Example:
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
    """
    # Pure metric computation: no autograd history is needed.
    with torch.no_grad():
        maxk = max(topk)
        # size(0) is the length of the first dimension, i.e. the sample count.
        batch_size = target.size(0)

        # Indices of the maxk largest scores per sample, sorted descending.
        _, pred = output.topk(maxk, 1, True, True)
        # Transpose so that row r holds the rank-(r + 1) prediction of every
        # sample; this lets ``correct[:k]`` select the top-k predictions.
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            # reshape() instead of view(): the transposed tensor is not
            # contiguous, and view(-1) raises on it in recent PyTorch.
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res
# Run the training script only when executed directly, not when imported.
if __name__ == '__main__':
    main()
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
- 74
- 75
- 76
- 77
- 78
- 79
- 80
- 81
- 82
- 83
- 84
- 85
- 86
- 87
- 88
- 89
- 90
- 91
- 92
- 93
- 94
- 95
- 96
- 97
- 98
- 99
- 100
- 101
- 102
- 103
- 104
- 105
- 106
- 107
- 108
- 109
- 110
- 111
- 112
- 113
- 114
- 115
- 116
- 117
- 118
- 119
- 120
- 121
- 122
- 123
- 124
- 125
- 126
- 127
- 128
- 129
- 130
- 131
- 132
- 133
- 134
- 135
- 136
- 137
- 138
- 139
- 140
- 141
- 142
- 143
- 144
- 145
- 146
- 147
- 148
- 149
- 150
- 151
- 152
- 153
- 154
- 155
- 156
- 157
- 158
- 159
- 160
- 161
- 162
- 163
- 164
- 165
- 166
- 167
- 168
- 169
- 170
- 171
- 172
- 173
- 174
- 175
- 176
- 177
- 178
- 179
- 180
- 181
- 182
- 183
- 184
- 185
- 186
- 187
- 188
- 189
- 190
- 191
- 192
- 193
- 194
- 195
- 196
- 197
- 198
- 199
- 200
- 201
- 202
- 203
- 204
- 205
- 206
- 207
- 208
- 209
- 210
- 211
- 212
- 213
- 214
- 215
- 216
- 217
- 218
- 219
- 220
- 221
- 222
- 223
- 224
- 225
- 226
- 227
- 228
- 229
- 230
- 231
- 232
- 233
- 234
- 235
- 236
- 237
- 238
- 239
- 240
- 241
- 242
- 243
- 244
- 245
- 246
- 247
- 248
- 249
- 250
- 251
- 252
- 253
- 254
- 255
- 256
- 257
- 258
- 259
- 260
- 261
- 262
- 263
- 264
- 265
- 266
- 267
- 268
- 269
- 270
- 271
- 272
- 273
- 274
- 275
- 276
- 277
- 278
- 279
- 280
- 281
- 282
- 283
- 284
- 285
- 286
- 287
- 288
- 289
- 290
- 291
- 292
- 293
- 294
- 295
- 296
- 297
- 298
- 299
- 300
- 301
- 302
- 303
- 304
- 305
- 306
- 307
- 308
- 309
- 310
- 311
- 312
- 313
- 314
- 315
- 316
- 317
- 318
- 319
- 320
- 321
- 322
- 323
- 324
- 325
- 326
- 327
- 328
- 329
- 330
- 331
- 332
- 333
- 334
- 335
- 336
- 337
- 338
- 339
- 340
- 341
- 342
- 343
- 344
- 345
- 346
- 347
- 348
- 349
- 350
- 351
- 352
- 353
- 354
- 355
- 356
- 357
- 358
- 359
- 360
- 361
- 362
- 363
- 364
- 365
- 366
- 367
- 368
- 369
- 370
- 371
- 372
- 373
- 374
- 375
- 376
- 377
- 378
- 379
- 380
- 381
- 382
- 383
- 384
- 385
- 386
- 387
- 388
- 389
文章目錄 [隱藏]
在模型完成訓練後,我們需要將訓練好的模型儲存為一個檔案供測試使用,或者因為一些原因我們需要繼續之前的狀態訓練之前儲存的模型,那麼如何在PyTorch中儲存和恢復模型呢?
參考PyTorch官方的這份repo,我們知道有兩種方法可以實現我們想要的效果。
方法一(推薦):
第一種方法也是官方推薦的方法,只儲存和恢復模型中的引數。
儲存
torch.save(the_model.state_dict(), PATH)
恢復
the_model = TheModelClass(*args, **kwargs)
the_model.load_state_dict(torch.load(PATH))
使用這種方法,我們需要自己匯入模型的結構資訊。
方法二:
使用這種方法,將會儲存模型的引數和結構資訊。
儲存
torch.save(the_model, PATH)
恢復
the_model = torch.load(PATH)
一個相對完整的例子
saving
torch.save({
    'epoch': epoch + 1,
    'arch': args.arch,
    'state_dict': model.state_dict(),
    'best_prec1': best_prec1,
}, 'checkpoint.tar')