PyTorch Code Study: ImageNet Training

About this post: these are my notes from working through main() in the pytorch/examples ImageNet example (to be continued).

# -*- coding: utf-8 -*-
import argparse  # command-line argument parsing
import os        # operating-system / file utilities
import shutil    # high-level file operations
import time      # timing

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn  # cuDNN (GPU) settings
import torch.distributed as dist      # distributed training (PyTorch 0.2)
import torch.optim                    # optimizers
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models

# Collect every name in models.__dict__ that is lowercase, does not start
# with '__', and is callable, sorted in ascending order.
# callable() checks whether an object can be called (i.e. acts as a function).
model_names = sorted(name for name in models.__dict__
                     if name.islower() and not name.startswith("__")
                     and callable(models.__dict__[name]))

# Create the argparse.ArgumentParser object
parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')

# Add the command-line arguments
parser.add_argument('data', metavar='DIR',
                    help='path to dataset')
parser.add_argument('--arch', '-a', metavar='ARCH', default='resnet18',
                    choices=model_names,
                    help='model architecture: ' + ' | '.join(model_names) +
                         ' (default: resnet18)')
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
                    help='number of data loading workers (default: 4)')
parser.add_argument('--epochs', default=90, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch-size', default=256, type=int, metavar='N',
                    help='mini-batch size (default: 256)')
parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
                    metavar='LR', help='initial learning rate')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')
parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: 1e-4)')
parser.add_argument('--print-freq', '-p', default=10, type=int, metavar='N',
                    help='print frequency (default: 10)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                    help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                    help='use pre-trained model')
parser.add_argument('--world-size', default=1, type=int,
                    help='number of distributed processes')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='gloo', type=str,
                    help='distributed backend')

# global best prec@1 seen so far
best_prec1 = 0


def main():
    global args, best_prec1
    # parse_args() reads sys.argv[1:] by default and returns a Namespace
    # holding the arguments as attributes, e.g. args.myoption.
    args = parser.parse_args()

    # Initialize distributed training via the multicast address above
    args.distributed = args.world_size > 1
    if args.distributed:
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size)

    ##### step 1: create the model and set up the GPU(s)
    # Load a pretrained model or build one from scratch
    if args.pretrained:
        # str.format() fills in the architecture name; the default arch is resnet18
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if not args.distributed:
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            # DataParallel splits each batch across multiple GPUs
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
    else:
        # Wrap model in DistributedDataParallel (CUDA only for the moment)
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)

    ##### step 2: define the loss function (criterion) and optimizer
    # cross-entropy loss
    criterion = nn.CrossEntropyLoss().cuda()

    # SGD with momentum (default 0.9) and weight decay (default 1e-4)
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    ##### step 3: optionally resume from a checkpoint
    # (see 'Saving and Restoring Models in PyTorch' below)
    if args.resume:
        if os.path.isfile(args.resume):  # check that the path points at a file
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)  # load a previously saved object
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])  # restore the weights
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    ##### step 4: data loading (assumes the dataset is already downloaded)
    # read the data from the train/ and val/ directories
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    # preprocessing: normalize each channel as (x - mean) / std
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    # ImageFolder is a generic data loader
    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([                    # chain several transforms together
            transforms.RandomSizedCrop(224),    # random crop, then resize to the given size
            transforms.RandomHorizontalFlip(),  # horizontal flip with probability 0.5
            transforms.ToTensor(),              # convert a [0, 255] (H, W, C) numpy.ndarray
                                                # to a [0, 1.0] (C, H, W) torch.FloatTensor
            normalize,
        ]))

    if args.distributed:
        # Use a DistributedSampler to restrict each process to a distinct
        # subset of the dataset.
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    # load and preprocess the training data
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Scale(256),       # resize so the shorter side is 256; if
                                         # height > width: (256 * height / width, 256)
            transforms.CenterCrop(224),  # center-crop to the given size
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)  # default workers = 4

    ##### step 5: evaluation only
    if args.evaluate:
        validate(val_loader, model, criterion)  # validate() is defined below
        return

    ##### step 6: train the model
    for epoch in range(args.start_epoch, args.epochs):
        # set_epoch() reshuffles the dataset partition at every epoch
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)  # defined below

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on the validation set
        prec1 = validate(val_loader, model, criterion)

        # remember the best prec@1 and save a checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer': optimizer.state_dict(),
        }, is_best)


def train(train_loader, model, criterion, optimizer, epoch):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        target = target.cuda(async=True)
        input_var = torch.autograd.Variable(input)
        target_var = torch.autograd.Variable(target)

        # compute the output; criterion is the loss function defined above
        output = model(input_var)
        loss = criterion(output, target_var)

        # measure accuracy and record the loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
        losses.update(loss.data[0], input.size(0))
        top1.update(prec1[0], input.size(0))
        top5.update(prec5[0], input.size(0))

        # compute the gradient and take an SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # print every args.print_freq steps (default: 10)
        if i % args.print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                      epoch, i, len(train_loader), batch_time=batch_time,
                      data_time=data_time, loss=losses, top1=top1, top5=top5))


def validate(val_loader, model, criterion):
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    end = time.time()
    for i, (input, target) in enumerate(val_loader):
        target = target.cuda(async=True)
        # Variable wraps a tensor and records the operations applied to it.
        """
        Attributes:
            data: the wrapped tensor, of any type.
            grad: holds the gradient, matching data in type and location;
                this attribute is lazily allocated and cannot be reassigned.
            requires_grad: bool marking whether the variable was created by a
                subgraph that requires gradients. Can only be changed on leaf
                variables.
            volatile: bool marking whether the variable should be used in
                inference mode, i.e. without saving history. Can only be
                changed on leaf variables.
            is_leaf: bool marking whether the variable is a graph leaf
                (i.e. created by the user).
            grad_fn: gradient function graph trace.
        Parameters:
            data (any tensor class): the tensor to wrap.
            requires_grad (bool): **keyword only.**
            volatile (bool): **keyword only.**
        """
        input_var = torch.autograd.Variable(input, volatile=True)
        target_var = torch.autograd.Variable(target, volatile=True)

        # compute output
        output = model(input_var)
        loss = criterion(output, target_var)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
        losses.update(loss.data[0], input.size(0))
        top1.update(prec1[0], input.size(0))
        top5.update(prec5[0], input.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            print('Test: [{0}/{1}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                      i, len(val_loader), batch_time=batch_time, loss=losses,
                      top1=top1, top5=top5))

    print(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}'
          .format(top1=top1, top5=top5))

    return top1.avg


# save the current checkpoint
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')


class AverageMeter(object):
    """Computes and stores the current value and the running average.

    For example, batch_time = AverageMeter() creates an object with the
    reset() and update() methods. A single-argument call such as
    batch_time.update(time.time() - end) just stores the value (n defaults
    to 1), while a two-argument call such as
    losses.update(loss.data[0], input.size(0)) treats the first argument as
    an average over n samples: update() accumulates val * n into sum and n
    into count, and keeps the running mean in avg = sum / count.
    """
    def __init__(self):
        self.reset()  # __init__() just resets the statistics

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def adjust_learning_rate(optimizer, epoch):
    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
    lr = args.lr * (0.1 ** (epoch // 30))  # args.lr defaults to 0.1
    for param_group in optimizer.param_groups:
        # push the updated lr into the optimizer for the next steps
        param_group['lr'] = lr


def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k,
    e.g. prec1, prec5 = accuracy(output.data, target, topk=(1, 5)).
    """
    maxk = max(topk)
    batch_size = target.size(0)  # size(0) is the number of rows, i.e. the batch size

    # topk() selects the indices of the maxk largest entries along dim 1
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()  # t() transposes, so pred becomes maxk x batch_size
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res


if __name__ == '__main__':
    main()
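My earlier note flagged pred.t() as unclear, so here is a minimal standalone sketch of what topk(), t(), and eq() do inside accuracy(). The logits and targets are made up for illustration: a toy batch of 3 samples over 4 classes.

import torch

output = torch.Tensor([[0.1, 0.6, 0.2, 0.1],    # sample 0: class 1 scores highest
                       [0.8, 0.1, 0.05, 0.05],  # sample 1: class 0 scores highest
                       [0.3, 0.1, 0.4, 0.2]])   # sample 2: class 2, then class 0
target = torch.LongTensor([1, 0, 0])

# indices of the 2 largest scores per row: [[1, 2], [0, 1], [2, 0]]
_, pred = output.topk(2, 1, True, True)
# transpose to 2 x 3: row 0 holds the top-1 guesses, row 1 the second guesses
pred = pred.t()
# broadcast the targets across both rows and compare elementwise:
# [[1, 1, 0], [0, 0, 1]] -- sample 2 is correct only at top-2
correct = pred.eq(target.view(1, -1).expand_as(pred))
print(correct)

The final mul_(100.0 / batch_size) in accuracy() then turns the count of correct predictions within the top k rows into a percentage.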

Saving and Restoring Models in PyTorch

Once a model has finished training, we need to save it to a file so it can be used for testing; or, for one reason or another, we may want to resume training from a previously saved state. So how do we save and restore a model in PyTorch?

Referring to the official PyTorch repo, there are two ways to achieve this.

Method 1 (recommended):

The first method, which is also the one officially recommended, saves and restores only the model's parameters.

Saving

torch.save(the_model.state_dict(), PATH)

Restoring

the_model = TheModelClass(*args, **kwargs)
the_model.load_state_dict(torch.load(PATH))

With this method, we have to construct the model's architecture ourselves before loading the parameters, as in the sketch below.
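For example, to restore the resnet18 architecture used in the training script above from a state_dict file (a minimal sketch; the file name here is hypothetical):

import torch
import torchvision.models as models

the_model = models.resnet18()  # rebuild the architecture first
the_model.load_state_dict(torch.load('resnet18_params.pth'))  # then load the saved parameters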

Method 2:

With this method, both the model's parameters and its structure are saved. Because the entire model object is serialized with pickle, the saved file is tied to the exact class definitions and source layout that existed at save time, which is why the first method is the recommended one.

Saving

torch.save(the_model, PATH)

Restoring

the_model = torch.load(PATH)

A more complete example

Saving

torch.save({
    'epoch': epoch + 1,
    'arch': args.arch,
    'state_dict': model.state_dict(),
    'best_prec1': best_prec1,
}, 'checkpoint.tar')
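And the matching restore, mirroring the resume branch in main() above (a minimal sketch; it assumes the 'checkpoint.tar' written above and an already-constructed model):

checkpoint = torch.load('checkpoint.tar')
start_epoch = checkpoint['epoch']              # epoch to resume from
best_prec1 = checkpoint['best_prec1']          # best validation prec@1 so far
model.load_state_dict(checkpoint['state_dict'])  # restore the weights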