PyTorch + visdom: CIFAR-10 image classification
Environment
OS: Windows 10
CPU: i7-6700HQ
GPU: GTX 965M
Python: 3.6
PyTorch: 0.3
The dataset
CIFAR-10 consists of 60,000 32x32 RGB color images in 10 classes: 50,000 for training and 10,000 for testing (validation). What sets this dataset apart is that recognition is applied to everyday objects, in a multi-class setting.
PyTorch can download the data automatically, but the download is painfully slow, so it is better to grab it from the official site (Xunlei works well for this); the link is here.
Once the download finishes, extract it, create a data folder in the project root, and drop the extracted files in there.
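All the snippets in this post share a common setup. Here is a minimal sketch of the imports and constants they assume (BATCH_SIZE, LR, EPOCHS, gpu_status and viz are the names used later on; the values match the ones stated in the training section, and the visdom server must already be running):

import time

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import visdom

# hyperparameters used throughout the post: 10 epochs, batch size 20, initial lr 0.005
BATCH_SIZE = 20
LR = 0.005
EPOCHS = 10

# run on the GPU when one is available
gpu_status = torch.cuda.is_available()

# connect to the visdom server started with `python -m visdom.server`
viz = visdom.Visdom()

If you would rather let torchvision fetch the data itself, pass download=True to datasets.CIFAR10 (slow, as noted above) instead of unpacking the archive manually.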
Data preprocessing:
transform_train = transforms.Compose([
    # random horizontal flip for augmentation
    transforms.RandomHorizontalFlip(),
    # convert to tensor: (0, 255) >> (0, 1)
    transforms.ToTensor(),
    # normalize each channel: (0, 1) >> (-1, 1)
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
# the train flag selects the training or the test split
train_dataset = datasets.CIFAR10('./data', train=True, transform=transform_train)
test_dataset = datasets.CIFAR10('./data', train=False, transform=transform_test)
# wrap the datasets in loaders and split them into batches
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
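As a quick sanity check (a sketch, assuming the setup above), a single batch from the loader should come out as BATCH_SIZE normalized 3x32x32 tensors:

# fetch one batch and inspect its shape
images, labels = next(iter(train_loader))
print(images.size())   # torch.Size([20, 3, 32, 32]), values in (-1, 1) after Normalize
print(labels.size())   # torch.Size([20]), integer class indices 0-9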
Let's visualize a few samples:
classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
dataiter = iter(test_loader)
image, label = next(dataiter)
# show the first 10 test images in visdom (un-normalize back to (0, 1) for display)
image = viz.images(image[:10] / 2 + 0.5, nrow=10, padding=3, env='cifar10')
text = viz.text('||'.join('%6s' % classes[label[j]] for j in range(10)))
Each image is only 32x32, so they look quite blurry.
Building the CNN
The network uses two convolution + pooling stages:
class CNN(nn.Module):
    def __init__(self, in_dim, n_class):
        super(CNN, self).__init__()
        # convolutional part
        self.cnn = nn.Sequential(
            nn.Conv2d(in_dim, 16, 5, 1, 2),  # keeps (32,32)
            nn.ReLU(True),
            nn.MaxPool2d(2),  # (32,32) >> (16,16)
            nn.ReLU(True),
            nn.Conv2d(16, 32, 3, 1, 1),
            nn.ReLU(True),
            nn.MaxPool2d(2),  # (16,16) >> (8,8)
        )
        # fully connected part
        self.fc = nn.Sequential(
            nn.Linear(32*8*8, 120),
            nn.ReLU(True),
            nn.Linear(120, 50),
            nn.ReLU(True),
            nn.Linear(50, n_class),
        )

    def forward(self, x):
        out = self.cnn(x)
        # flatten the feature maps with view before the fully connected layers
        out = self.fc(out.view(-1, 32*8*8))
        return out

# RGB input has 3 channels; CIFAR-10 has 10 classes
net = CNN(3, 10)
if gpu_status:
    # move the model to the GPU so it matches the .cuda() inputs used below
    net = net.cuda()
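To double-check the 32*8*8 flatten size used in forward, here is a quick shape probe with a dummy input (a sketch; wrapping the tensor in Variable matches the PyTorch 0.3 style used here):

# push a dummy 1x3x32x32 image through the convolutional part only
dummy = Variable(torch.randn(1, 3, 32, 32))
if gpu_status:
    dummy = dummy.cuda()
feat = net.cnn(dummy)
print(feat.size())   # torch.Size([1, 32, 8, 8]) -> 32*8*8 = 2048 features feed the fc part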
Loss and optimizer:
# cross-entropy loss
loss_f = nn.CrossEntropyLoss()
# SGD with momentum to speed up optimization
optimizer = optim.SGD(net.parameters(), lr=LR, momentum=0.9)
# learning-rate schedule: multiply the lr by gamma every step_size epochs
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)
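For reference, StepLR multiplies the learning rate by gamma every step_size epochs, so with these settings the schedule over 10 epochs looks like this (illustration only, no optimizer needed):

# StepLR is equivalent to lr = LR * gamma ** (epoch // step_size)
for epoch in range(10):
    print(epoch, 0.005 * 0.1 ** (epoch // 7))
# epochs 0-6 run at 0.005, epochs 7-9 at 0.0005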
Before training, set a few things up: the values we want to record for visualization, timing, and a record of the network's best state:
# print statistics 5 times per epoch (every this many batches)
tr_num = len(train_dataset) // BATCH_SIZE // 5
ts_num = len(test_dataset) // BATCH_SIZE // 5
# start time
start_time = time.time()
# create a visdom line window (a placeholder that is updated during training)
line = viz.line(Y=np.arange(10), env="cifar10")
# lists for the curves we want to plot
tr_loss, ts_loss, tr_acc, ts_acc, step = [], [], [], [], []
# keep track of the network's best state
best_acc = 0.
best_state = net.state_dict()
The training code: 10 epochs, batch size 20, initial learning rate 0.005:
for epoch in range(EPOCHS):
    # running loss / accuracy within this epoch
    running_loss, running_acc = 0.0, 0.
    scheduler.step()
    # switch to training mode
    net.train()
    for i, (img, label) in enumerate(train_loader, 1):
        if gpu_status:
            img, label = img.cuda(), label.cuda()
        img, label = Variable(img), Variable(label)
        out = net(img)
        loss = loss_f(out, label)
        pred = torch.max(out, 1)[1]
        running_acc += sum(pred == label).data[0]
        running_loss += loss.data[0] * len(label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # print progress a few times per epoch
        if i % tr_num == 0:
            print("TRAIN : [{}/{}] | loss: {:.4f} | r_acc: {:.4f} ".format(
                epoch + 1, EPOCHS, running_loss / (i * BATCH_SIZE), running_acc / (i * BATCH_SIZE)))
            # collect training loss/acc for visualization
            tr_loss.append(running_loss / (i * BATCH_SIZE))
            tr_acc.append(running_acc / (i * BATCH_SIZE))
To keep a better eye on how training is going, run a test pass after every epoch:
    # still inside the epoch loop: evaluate on the test set
    net.eval()
    eval_loss, eval_acc = 0., 0.
    for i, (img, label) in enumerate(test_loader, 1):
        if gpu_status:
            img, label = img.cuda(), label.cuda()
        # volatile=True: inference only, no gradients are kept (PyTorch 0.3 API)
        img, label = Variable(img, volatile=True), Variable(label, volatile=True)
        out = net(img)
        loss = loss_f(out, label)
        pred = torch.max(out, 1)[1]
        eval_acc += sum(pred == label).data[0]
        eval_loss += loss.data[0] * len(label)
        if i % ts_num == 0:
            print("test : [{}/{}] | loss: {:.4f} | r_acc: {:.4f} ".format(
                epoch + 1, EPOCHS, eval_loss / (i * BATCH_SIZE), eval_acc / (i * BATCH_SIZE)))
            ts_loss.append(eval_loss / (i * BATCH_SIZE))
            ts_acc.append(eval_acc / (i * BATCH_SIZE))
    # update the visdom line plot with all four curves
    viz.line(Y=np.column_stack((np.array(tr_loss), np.array(tr_acc), np.array(ts_loss), np.array(ts_acc))),
             win=line,
             opts=dict(legend=["tr_loss", "tr_acc", "ts_loss", "ts_acc"],
                       title="cifar10"),
             env="cifar10")
    # keep the best state seen so far
    if eval_acc / len(test_dataset) > best_acc:
        best_acc = eval_acc / len(test_dataset)
        best_state = net.state_dict()
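One caveat: state_dict() returns references to the live parameter tensors, so best_state as written will end up tracking the final weights rather than the best ones. Here is a sketch of a safer variant, plus saving the snapshot to disk (the filename is only illustrative):

import copy

# inside the epoch loop, replace the best-state bookkeeping with a detached copy:
if eval_acc / len(test_dataset) > best_acc:
    best_acc = eval_acc / len(test_dataset)
    best_state = copy.deepcopy(net.state_dict())

# after training, persist the best snapshot and restore it when needed
torch.save(best_state, 'cifar10_best.pth')
# net.load_state_dict(torch.load('cifar10_best.pth'))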
Training & tuning
Now start training. We need visdom for visualization, so launch the visdom server from the command line:
python -m visdom.server
Create the cifar10 env in visdom, then run the project:
TRAIN : [10/10] | loss: 0.4257 | r_acc: 0.8526
TRAIN : [10/10] | loss: 0.4270 | r_acc: 0.8516
TRAIN : [10/10] | loss: 0.4383 | r_acc: 0.8522
test : [10/10] | loss: 0.7466 | r_acc: 0.7485
test : [10/10] | loss: 0.8148 | r_acc: 0.7415
After 10 epochs the test accuracy is about 0.75, and the curves show overfitting setting in around epoch 5:
To keep the overfitting from leading training astray, add batch normalization and modify the net as follows:
class CNN(nn.Module):
    def __init__(self, in_dim, n_class):
        super(CNN, self).__init__()
        # convolutional part
        self.cnn = nn.Sequential(
            nn.BatchNorm2d(in_dim),
            nn.ReLU(True),
            nn.Conv2d(in_dim, 16, 5, 1, 2),  # keeps (32,32)
            nn.BatchNorm2d(16),
            nn.ReLU(True),
            nn.MaxPool2d(2, 2),  # (32,32) >> (16,16)
            nn.ReLU(True),
            nn.Conv2d(16, 32, 3, 1, 1),
            nn.BatchNorm2d(32),
            nn.ReLU(True),
            nn.MaxPool2d(2, 2),  # (16,16) >> (8,8)
        )
        # fully connected part (the input here is already flattened, so BatchNorm1d is the right choice)
        self.fc = nn.Sequential(
            nn.BatchNorm1d(32*8*8),
            nn.ReLU(True),
            nn.Linear(32*8*8, 120),
            nn.BatchNorm1d(120),
            nn.ReLU(True),
            nn.Linear(120, 50),
            nn.BatchNorm1d(50),
            nn.ReLU(True),
            nn.Linear(50, n_class),
        )
Run it again:
The overfitting is indeed suppressed: without normalization the training accuracy reached 0.85 by epoch 10, whereas with normalization it is only 0.77 (much closer to the test accuracy), and the test accuracy itself also improves slightly.
Still, you will notice that 76% accuracy seems rather low for a classifier. There are many possible reasons (number of epochs, learning rate, optimizer, a network that is too shallow, and so on), but the most obvious is insufficient training: 10 epochs is not much, so this time train for 20 epochs:
With 10 extra epochs the accuracy only climbs by 1% to 77%, so too few epochs is not the root cause after all. Perhaps the network is too shallow; let's add another convolution layer and see:
self.cnn = nn.Sequential(
    nn.BatchNorm2d(in_dim),
    nn.ReLU(True),
    nn.Conv2d(in_dim, 16, 5, 1, 2),  # keeps (32,32)
    nn.BatchNorm2d(16),
    nn.ReLU(True),
    nn.MaxPool2d(2, 2),  # (32,32) >> (16,16)
    nn.ReLU(True),
    nn.Conv2d(16, 32, 3, 1, 1),
    nn.BatchNorm2d(32),
    nn.ReLU(True),
    # the added convolution layer
    nn.Conv2d(32, 32, 3, 1, 1),
    nn.BatchNorm2d(32),
    nn.ReLU(True),
    nn.MaxPool2d(2, 2),  # (16,16) >> (8,8)
)
Train for 10 epochs and look at the result:
Accuracy reaches 78%, a 2% gain. You could of course add epochs and layers at the same time, but I won't try that here.
The models bundled with torchvision
Note: don't try this on a CPU.
The torchvision models module ships with several ready-made networks that can be used directly. Let's look at the relatively simple resnet18. Because CIFAR-10 images are so small, using resnet18 as-is raises an error, so copy the models folder into the project root and modify it.
In the ResNet class, change the kernel sizes of the two pooling layers.
Import the net:
from models import resnet18
# net = CNN(3, 10)  # comment out the original net
net = resnet18()
num_ftrs = net.fc.in_features
# replace the final layer so it outputs 10 classes
net.fc = nn.Linear(num_ftrs, 10)
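Equivalently (a sketch based on the printed architecture shown below), the two pools can be overridden on the constructed net instead of editing the copied source:

# shrink the pooling layers so that 32x32 inputs still reach the final Linear layer
net.maxpool = nn.MaxPool2d(kernel_size=2, stride=2, padding=1)
net.avgpool = nn.AvgPool2d(kernel_size=2, stride=1)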
Train for 10 epochs and see how it does... it is slow... be patient...:
9000 | loss : 0.4682 | acc : 0.8581|time:1263.9
10000 | loss : 0.4657 | acc : 0.8572|time:1271.9
That run took 21 minutes; my own net above averaged about 5 minutes.
The results are much better, though: 75% after only 2 epochs, and 85% accuracy after 10 epochs:
Let's print the net to see why it beats my hand-written network by such a margin:
ResNet(
(conv1): Conv2d (3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
(relu): ReLU(inplace)
(maxpool): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=(1, 1), dilation=(1, 1))
(layer1): Sequential(
(0): BasicBlock(
(conv1): Conv2d (64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
(relu): ReLU(inplace)
(conv2): Conv2d (64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
)
(1): BasicBlock(
(conv1): Conv2d (64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
(relu): ReLU(inplace)
(conv2): Conv2d (64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
)
)
(layer2): Sequential(
(0): BasicBlock(
(conv1): Conv2d (64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True)
(relu): ReLU(inplace)
(conv2): Conv2d (128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True)
(downsample): Sequential(
(0): Conv2d (64, 128, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True)
)
)
(1): BasicBlock(
(conv1): Conv2d (128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True)
(relu): ReLU(inplace)
(conv2): Conv2d (128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True)
)
)
(layer3): Sequential(
(0): BasicBlock(
(conv1): Conv2d (128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True)
(relu): ReLU(inplace)
(conv2): Conv2d (256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True)
(downsample): Sequential(
(0): Conv2d (128, 256, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True)
)
)
(1): BasicBlock(
(conv1): Conv2d (256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True)
(relu): ReLU(inplace)
(conv2): Conv2d (256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True)
)
)
(layer4): Sequential(
(0): BasicBlock(
(conv1): Conv2d (256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True)
(relu): ReLU(inplace)
(conv2): Conv2d (512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True)
(downsample): Sequential(
(0): Conv2d (256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True)
)
)
(1): BasicBlock(
(conv1): Conv2d (512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True)
(relu): ReLU(inplace)
(conv2): Conv2d (512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True)
)
)
(avgpool): AvgPool2d(kernel_size=2, stride=1, padding=0, ceil_mode=False, count_include_pad=True)
(fc): Linear(in_features=512, out_features=10)
)
Because it has so many more layers, right?
Not quite. Adding layers in moderation helps, but more is not always better: experiments have shown that naively stacking a large number of layers leads to vanishing gradients and actually degrades training. ResNet's residual structure is what makes very deep networks buildable; see here for details.
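The core idea is the skip connection: each block learns a residual on top of its input, so gradients can flow through the identity path even in very deep stacks. A minimal sketch of a BasicBlock-style residual block (simplified, not torchvision's exact code):

class ResidualBlock(nn.Module):
    # out = relu(F(x) + shortcut(x)); the addition is the skip connection
    def __init__(self, in_ch, out_ch, stride=1):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_ch, out_ch, 3, stride, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_ch)
        self.conv2 = nn.Conv2d(out_ch, out_ch, 3, 1, 1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_ch)
        self.relu = nn.ReLU(True)
        # 1x1 conv on the shortcut only when the shape changes, identity otherwise
        if stride != 1 or in_ch != out_ch:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_ch, out_ch, 1, stride, bias=False),
                nn.BatchNorm2d(out_ch))
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out = out + self.shortcut(x)
        return self.relu(out)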
Visualization
Run one full pass over the test set, record every prediction, and draw a heatmap of the results to see which classes get confused with which:
net.eval()
eval_loss, eval_acc = 0., 0.
match = [0] * 100
# batch_size is set to 1 because we record the result of every single image
test_loader = DataLoader(test_dataset, 1, False)
for i, (img, label) in enumerate(test_loader, 1):
    if gpu_status:
        img, label = img.cuda(), label.cuda()
    img, label = Variable(img, volatile=True), Variable(label, volatile=True)
    out = net(img)
    loss = loss_f(out, label)
    pred = torch.max(out, 1)[1]
    eval_acc += sum(pred == label).data[0]
    eval_loss += loss.data[0]
    # tally (label, pred) pairs; note that the test set is balanced (equal images per class),
    # otherwise you would have to plot proportions instead of raw counts
    number = int(label.data.cpu().numpy()[0] * 10 + pred.data.cpu().numpy()[0])
    match[number] = match[number] + 1
    if i % 1000 == 0:
        print("{} | loss : {:.4f} | acc : {:.4f}|time:{:.1f}".format(
            i, eval_loss / i, eval_acc / i, time.time() - start_time))
count = np.array(match).reshape(10, 10)
viz.heatmap(X=count, opts=dict(
    columnnames=classes,   # class names along both axes
    rownames=classes,
    colormap='Jet',        # the color gradient encodes the counts
    title="ACC: {:.4f}".format(eval_acc / len(test_dataset)),
    xlabel="pred",
    ylabel="label"),
    env="cifar10")
The heatmap shows that cats and dogs are confused with each other most often; overall the animal classes are separated less cleanly than the vehicle classes.
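As noted in the code comment, raw counts work here because the CIFAR-10 test set has the same number of images per class; for an imbalanced dataset a row-normalized heatmap would be more informative (a sketch reusing the count matrix above):

# convert counts to per-true-label proportions before plotting
count_norm = count.astype(np.float64) / count.sum(axis=1, keepdims=True)
viz.heatmap(X=count_norm, opts=dict(columnnames=classes, rownames=classes,
                                    colormap='Jet', title="per-class proportions",
                                    xlabel="pred", ylabel="label"),
            env="cifar10")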