Notes on understanding PyTorch model parameters
Model structure
import math
from functools import partial

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable


def downsample_basic_block(x, planes, stride):
    # Type-A shortcut: average-pool spatially, then zero-pad the extra channels.
    out = F.avg_pool3d(x, kernel_size=1, stride=stride)
    zero_pads = torch.Tensor(
        out.size(0), planes - out.size(1), out.size(2), out.size(3),
        out.size(4)).zero_()
    if isinstance(out.data, torch.cuda.FloatTensor):
        zero_pads = zero_pads.cuda()
    out = Variable(torch.cat([out.data, zero_pads], dim=1))
    return out
class ResNeXtBottleneck(nn.Module):
    expansion = 2

    def __init__(self, inplanes, planes, cardinality, stride=1,
                 downsample=None, conv3d_bias=True):
        super(ResNeXtBottleneck, self).__init__()
        mid_planes = cardinality * int(planes / 32)
        self.conv1 = nn.Conv3d(inplanes, mid_planes, kernel_size=1, bias=conv3d_bias)
        self.bn1 = nn.BatchNorm3d(mid_planes)
        # Grouped 3x3x3 convolution: groups=cardinality splits the channels into
        # `cardinality` parallel paths (the ResNeXt aggregated transform).
        self.conv2 = nn.Conv3d(
            mid_planes,
            mid_planes,
            kernel_size=3,
            stride=stride,
            padding=1,
            groups=cardinality,
            bias=conv3d_bias)
        self.bn2 = nn.BatchNorm3d(mid_planes)
        self.conv3 = nn.Conv3d(
            mid_planes, planes * self.expansion, kernel_size=1, bias=conv3d_bias)
        self.bn3 = nn.BatchNorm3d(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)
        out = self.conv3(out)
        out = self.bn3(out)
        if self.downsample is not None:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out
class ResNeXt(nn.Module):

    def __init__(self,
                 block,
                 layers,
                 sample_size,
                 sample_duration,
                 shortcut_type='B',
                 cardinality=32,
                 num_classes=400,
                 conv3d_bias=True):
        self.conv3d_bias = conv3d_bias
        self.inplanes = 64
        super(ResNeXt, self).__init__()
        self.conv1 = nn.Conv3d(
            3,
            64,
            kernel_size=7,
            stride=2,
            padding=3,
            bias=conv3d_bias)
        self.bn1 = nn.BatchNorm3d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1)
        self.layer1 = self._make_layer(block, 128, layers[0], shortcut_type,
                                       cardinality)
        self.layer2 = self._make_layer(
            block, 256, layers[1], shortcut_type, cardinality, stride=2)
        self.layer3 = self._make_layer(
            block, 512, layers[2], shortcut_type, cardinality, stride=2)
        self.layer4 = self._make_layer(
            block, 1024, layers[3], shortcut_type, cardinality, stride=2)
        last_duration = int(math.ceil(sample_duration / 16))
        last_size = int(math.ceil(sample_size / 32))
        self.avgpool = nn.AvgPool3d(
            (last_duration, last_size, last_size), stride=1)
        self.fc = nn.Linear(cardinality * 32 * block.expansion, num_classes)
        # Weight initialisation: Kaiming-normal for convolutions, unit scale /
        # zero shift for BatchNorm layers.
        for m in self.modules():
            if isinstance(m, nn.Conv3d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out')
            elif isinstance(m, nn.BatchNorm3d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self,
                    block,
                    planes,
                    blocks,
                    shortcut_type,
                    cardinality,
                    stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            if shortcut_type == 'A':
                # Type-A shortcut: parameter-free pooling plus zero padding.
                downsample = partial(
                    downsample_basic_block,
                    planes=planes * block.expansion,
                    stride=stride)
            else:
                # Type-B shortcut: 1x1x1 projection convolution.
                downsample = nn.Sequential(
                    nn.Conv3d(
                        self.inplanes,
                        planes * block.expansion,
                        kernel_size=1,
                        stride=stride,
                        bias=self.conv3d_bias),
                    nn.BatchNorm3d(planes * block.expansion))
        layers = []
        layers.append(
            block(self.inplanes, planes, cardinality, stride, downsample,
                  conv3d_bias=self.conv3d_bias))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes, cardinality,
                                conv3d_bias=self.conv3d_bias))
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x
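For reference, a minimal sketch of how this network could be instantiated and run on a dummy clip; the block counts [3, 4, 23, 3] correspond to a ResNeXt-101-style depth, and the sample_size / sample_duration values here are only illustrative:

# Hypothetical instantiation; 112x112 crops and 16-frame clips are example values.
model = ResNeXt(
    ResNeXtBottleneck,
    [3, 4, 23, 3],            # ResNeXt-101-style block counts
    sample_size=112,          # spatial size of each input frame
    sample_duration=16,       # number of frames per clip
    shortcut_type='B',
    cardinality=32,
    num_classes=400)
clip = torch.randn(1, 3, 16, 112, 112)  # (batch, channels, frames, height, width)
print(model(clip).shape)                # torch.Size([1, 400])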
pretrain = torch.load(opt.pretrain_path)
assert opt.arch == pretrain['arch']

from collections import OrderedDict
new_state_dict = OrderedDict()
for k, v in pretrain['state_dict'].items():
    print(k)
    name = k[7:]  # remove `module.`
    print(name)
    new_state_dict[name] = v
Program output: the loop prints each key first with and then without the leading `module.` prefix.
The reason pretrain['state_dict'] is needed here, rather than simply calling model.load_state_dict(torch.load(opt.pretrain_path)), is that when the model was saved, not only the parameters but also the epoch, architecture, and other information were stored:
states = {
    'epoch': epoch + 1,
    'arch': opt.arch,
    'state_dict': model.state_dict(),
    'optimizer': optimizer.state_dict(),
}
torch.save(states, save_file_path)
name = k[7:]
strips the first seven characters from each parameter name: the downloaded pretrained model was trained with torch.nn.DataParallel (multi-GPU), whereas I only have a single GPU, so the leading `module.` has to be removed from every parameter name before loading.
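To make the snippet self-contained, here is a small illustration of what the renaming does (the key name below is just an example of the DataParallel naming scheme), followed by the final loading step, assuming the single-GPU model has already been constructed:

k = 'module.conv1.weight'              # how a DataParallel checkpoint names the parameter
print(k[7:])                           # -> 'conv1.weight', the name the single-GPU model expects

model.load_state_dict(new_state_dict)  # load the renamed parameters into the model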
optimizer.state_dict() has two keys, state and param_groups; the contents of param_groups are sketched below.
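A minimal inspection sketch, assuming the optimizer was created with torch.optim.SGD:

sd = optimizer.state_dict()
print(list(sd.keys()))               # ['state', 'param_groups']
print(sd['param_groups'][0].keys())
# For SGD each group records options such as 'lr', 'momentum', 'dampening',
# 'weight_decay' and 'nesterov', plus a 'params' list of parameter ids.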
for k, v in model.named_parameters():
    print(k)

for i in model.parameters():
    print(i)

print(model.fc)

returns

Linear(in_features=2048, out_features=90, bias=True)
print(model.layer4)
Summary: first build the computation graph, which returns model. If you only want a single, uniform learning rate, just pass model.parameters() as the first argument to the optimizer; model.parameters() yields the parameters in a fixed (registration) order, and in that case len(optimizer.param_groups) equals 1. If you want to set the learning rate (or other options) per parameter group, you can do it as shown below. Because model.parameters() and model.named_parameters() follow the same order, the {'params': v, 'lr': 0.0} dicts appended to the list do not need parameter names; any option left unspecified in a group, such as momentum, is filled in from the optimizer's default settings for every group.
ft_module_names = []  # names of the modules to fine-tune
for i in range(opt.ft_begin_index, 5):
    ft_module_names.append('layer{}'.format(i))
ft_module_names.append('fc')

parameters = []
for k, v in model.named_parameters():  # named_parameters() gives the name k needed below
    print(k)
    for ft_module in ft_module_names:
        if ft_module in k:
            parameters.append({'params': v})
            break
    else:
        # parameter does not belong to a fine-tuned module: freeze it with lr 0
        parameters.append({'params': v, 'lr': 0.0})
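A hedged sketch of how this parameters list would then be handed to the optimizer (the lr, momentum and weight_decay values below are only illustrative):

# Groups appended with 'lr': 0.0 keep that frozen rate; the remaining groups use
# the default lr, and options not given per group (e.g. momentum) are copied from
# the defaults into every group.
optimizer = torch.optim.SGD(parameters, lr=0.1, momentum=0.9, weight_decay=1e-3)
print(len(optimizer.param_groups))   # one group per dict appended above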