pytorch實現yolov3(5) 實現端到端的目標檢測
torch實現yolov3(1)
torch實現yolov3(2)
torch實現yolov3(3)
torch實現yolov3(4)
前面4篇已經實現了network的forward,並且將network的output已經轉換成了易於操作的detection prediction格式.
本篇把前面四篇實現的功能組織起來,實現端到端的推理過程.
整體流程如下
- 讀取圖片,對圖片前處理,把圖片調整到模型的input size及輸入順序(rgb c x h x w).
- 載入模型,讀取模型權重檔案.
- 將第一步讀到的矩陣送給模型.進行forward運算.得到prediction
- 後處理,我們得到的box座標是相對於調整後的圖片的.要處理成原圖上的座標.
detector.py 實現完整的端到端的圖片檢測. 用法python detect.py --images dog-cycle-car.png --det det
from __future__ import division import time import torch import torch.nn as nn from torch.autograd import Variable import numpy as np import cv2 from util import * import argparse import os import os.path as osp from darknet import Darknet import pickle as pkl import pandas as pd import random def arg_parse(): """ Parse arguements to the detect module """ parser = argparse.ArgumentParser(description='YOLO v3 Detection Module') parser.add_argument("--images", dest = 'images', help = "Image / Directory containing images to perform detection upon", default = "imgs", type = str) parser.add_argument("--det", dest = 'det', help = "Image / Directory to store detections to", default = "det", type = str) parser.add_argument("--bs", dest = "bs", help = "Batch size", default = 1) parser.add_argument("--confidence", dest = "confidence", help = "Object Confidence to filter predictions", default = 0.5) parser.add_argument("--nms_thresh", dest = "nms_thresh", help = "NMS Threshhold", default = 0.4) parser.add_argument("--cfg", dest = 'cfgfile', help = "Config file", default = "cfg/yolov3.cfg", type = str) parser.add_argument("--weights", dest = 'weightsfile', help = "weightsfile", default = "yolov3.weights", type = str) parser.add_argument("--reso", dest = 'reso', help = "Input resolution of the network. Increase to increase accuracy. Decrease to increase speed", default = "416", type = str) return parser.parse_args() args = arg_parse() images = args.images batch_size = int(args.bs) confidence = float(args.confidence) nms_thesh = float(args.nms_thresh) start = 0 CUDA = torch.cuda.is_available() num_classes = 80 classes = load_classes("data/coco.names") #Set up the neural network print("Loading network.....") model = Darknet(args.cfgfile) model.load_weights(args.weightsfile) print("Network successfully loaded") model.net_info["height"] = args.reso inp_dim = int(model.net_info["height"]) assert inp_dim % 32 == 0 assert inp_dim > 32 #If there's a GPU availible, put the model on GPU if CUDA: model.cuda() #Set the model in evaluation mode model.eval() read_dir = time.time() #Detection phase try: imlist = [osp.join(osp.realpath('.'), images, img) for img in os.listdir(images)] except NotADirectoryError: imlist = [] imlist.append(osp.join(osp.realpath('.'), images)) except FileNotFoundError: print ("No file or directory with the name {}".format(images)) exit() if not os.path.exists(args.det): os.makedirs(args.det) load_batch = time.time() loaded_ims = [cv2.imread(x) for x in imlist] im_batches = list(map(prep_image, loaded_ims, [inp_dim for x in range(len(imlist))])) im_dim_list = [(x.shape[1], x.shape[0]) for x in loaded_ims] im_dim_list = torch.FloatTensor(im_dim_list).repeat(1,2) leftover = 0 if (len(im_dim_list) % batch_size): leftover = 1 if batch_size != 1: num_batches = len(imlist) // batch_size + leftover im_batches = [torch.cat((im_batches[i*batch_size : min((i + 1)*batch_size, len(im_batches))])) for i in range(num_batches)] write = 0 if CUDA: im_dim_list = im_dim_list.cuda() start_det_loop = time.time() for i, batch in enumerate(im_batches): #load the image start = time.time() if CUDA: batch = batch.cuda() with torch.no_grad(): prediction = model(Variable(batch), CUDA) #類呼叫,相當於呼叫類的__call__()函式 prediction = write_results(prediction, confidence, num_classes, nms_conf = nms_thesh) end = time.time() if type(prediction) == int: for im_num, image in enumerate(imlist[i*batch_size: min((i + 1)*batch_size, len(imlist))]): im_id = i*batch_size + im_num print("{0:20s} predicted in {1:6.3f} seconds".format(image.split("/")[-1], (end - start)/batch_size)) print("{0:20s} {1:s}".format("Objects Detected:", "")) print("----------------------------------------------------------") continue prediction[:,0] += i*batch_size #transform the atribute from index in batch to index in imlist if not write: #If we have't initialised output output = prediction write = 1 else: output = torch.cat((output,prediction)) for im_num, image in enumerate(imlist[i*batch_size: min((i + 1)*batch_size, len(imlist))]): im_id = i*batch_size + im_num objs = [classes[int(x[-1])] for x in output if int(x[0]) == im_id] print("{0:20s} predicted in {1:6.3f} seconds".format(image.split("/")[-1], (end - start)/batch_size)) print("{0:20s} {1:s}".format("Objects Detected:", " ".join(objs))) print("----------------------------------------------------------") if CUDA: torch.cuda.synchronize() try: output except NameError: print ("No detections were made") exit() im_dim_list = torch.index_select(im_dim_list, 0, output[:,0].long()) scaling_factor = torch.min(416/im_dim_list,1)[0].view(-1,1) output[:,[1,3]] -= (inp_dim - scaling_factor*im_dim_list[:,0].view(-1,1))/2 output[:,[2,4]] -= (inp_dim - scaling_factor*im_dim_list[:,1].view(-1,1))/2 output[:,1:5] /= scaling_factor for i in range(output.shape[0]): output[i, [1,3]] = torch.clamp(output[i, [1,3]], 0.0, im_dim_list[i,0]) output[i, [2,4]] = torch.clamp(output[i, [2,4]], 0.0, im_dim_list[i,1]) output_recast = time.time() class_load = time.time() colors = pkl.load(open("pallete", "rb")) draw = time.time() def write(x, results): c1 = tuple(x[1:3].int()) c2 = tuple(x[3:5].int()) img = results[int(x[0])] cls = int(x[-1]) color = random.choice(colors) label = "{0}".format(classes[cls]) cv2.rectangle(img, c1, c2,color, 1) t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1 , 1)[0] c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4 cv2.rectangle(img, c1, c2,color, -1) cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225,255,255], 1); return img list(map(lambda x: write(x, loaded_ims), output)) det_names = pd.Series(imlist).apply(lambda x: "{}/det_{}".format(args.det,x.split("/")[-1])) list(map(cv2.imwrite, det_names, loaded_ims)) end = time.time() print("SUMMARY") print("----------------------------------------------------------") print("{:25s}: {}".format("Task", "Time Taken (in seconds)")) print() print("{:25s}: {:2.3f}".format("Reading addresses", load_batch - read_dir)) print("{:25s}: {:2.3f}".format("Loading batch", start_det_loop - load_batch)) print("{:25s}: {:2.3f}".format("Detection (" + str(len(imlist)) + " images)", output_recast - start_det_loop)) print("{:25s}: {:2.3f}".format("Output Processing", class_load - output_recast)) print("{:25s}: {:2.3f}".format("Drawing Boxes", end - draw)) print("{:25s}: {:2.3f}".format("Average time_per_img", (end - load_batch)/len(imlist))) print("----------------------------------------------------------") torch.cuda.empty_cache()
第一段沒啥好說的,我們希望可以通過命令列傳參,所以用ArgParse模組來實現引數解析.
第二段 模型載入
#Set up the neural network
print("Loading network.....")
model = Darknet(args.cfgfile)
model.load_weights(args.weightsfile)
print("Network successfully loaded")
第三段 影象預處理
對任意一個圖片,要先做預處理,把尺寸處理到model的input size.
read_dir = time.time() #Detection phase try: imlist = [osp.join(osp.realpath('.'), images, img) for img in os.listdir(images)] except NotADirectoryError: imlist = [] imlist.append(osp.join(osp.realpath('.'), images)) except FileNotFoundError: print ("No file or directory with the name {}".format(images)) exit() if not os.path.exists(args.det): os.makedirs(args.det) load_batch = time.time() loaded_ims = [cv2.imread(x) for x in imlist] im_batches = list(map(prep_image, loaded_ims, [inp_dim for x in range(len(imlist))])) im_dim_list = [(x.shape[1], x.shape[0]) for x in loaded_ims] im_dim_list = torch.FloatTensor(im_dim_list).repeat(1,2) leftover = 0 if (len(im_dim_list) % batch_size): leftover = 1 if batch_size != 1: num_batches = len(imlist) // batch_size + leftover im_batches = [torch.cat((im_batches[i*batch_size : min((i + 1)*batch_size, len(im_batches))])) for i in range(num_batches)]
從某個目錄讀入n多個圖片.假設模型每個batch處理5個圖片.圖片為320 x 320 x 3. 則每次輸入模型的矩陣為(320*5) x 320 x 3.即
im_batches = [torch.cat((im_batches[i*batch_size : min((i + 1)*batch_size,
len(im_batches))])) for i in range(num_batches)]
所做的事情.
圖片的前處理所用到的一些工具函式如下.
def letterbox_image(img, inp_dim):
'''resize image with unchanged aspect ratio using padding'''
img_w, img_h = img.shape[1], img.shape[0]
w, h = inp_dim
new_w = int(img_w * min(w/img_w, h/img_h))
new_h = int(img_h * min(w/img_w, h/img_h))
resized_image = cv2.resize(img, (new_w,new_h), interpolation = cv2.INTER_CUBIC)
canvas = np.full((inp_dim[1], inp_dim[0], 3), 128)
canvas[(h-new_h)//2:(h-new_h)//2 + new_h,(w-new_w)//2:(w-new_w)//2 + new_w, :] = resized_image
return canvas
保證原有圖片的寬高比,其餘位置灰度值填充.
cv讀進來的bgr格式,我們轉成rgb的.然後transpose 把h x w x c的轉成c x h x w的.
def prep_image(img, inp_dim):
"""
Prepare image for inputting to the neural network.
Returns a Variable
"""
img = cv2.resize(img, (inp_dim, inp_dim
img = img[:,:,::-1].transpose((2,0,1)).copy()
img = torch.from_numpy(img).float().div(255.0).unsqueeze(0)
return img
參考https://www.cnblogs.com/sdu20112013/p/11216322.html
4.將矩陣餵給模型,進行forward
for i, batch in enumerate(im_batches):
#load the image
start = time.time()
if CUDA:
batch = batch.cuda()
with torch.no_grad():
prediction = model(Variable(batch), CUDA) #類呼叫,相當於呼叫類的__call__()函式
prediction = write_results(prediction, confidence, num_classes, nms_conf = nms_thesh)
end = time.time()
if type(prediction) == int:
for im_num, image in enumerate(imlist[i*batch_size: min((i + 1)*batch_size, len(imlist))]):
im_id = i*batch_size + im_num
print("{0:20s} predicted in {1:6.3f} seconds".format(image.split("/")[-1], (end - start)/batch_size))
print("{0:20s} {1:s}".format("Objects Detected:", ""))
print("----------------------------------------------------------")
continue
prediction[:,0] += i*batch_size #transform the atribute from index in batch to index in imlist
if not write: #If we have't initialised output
output = prediction
write = 1
else:
output = torch.cat((output,prediction))
for im_num, image in enumerate(imlist[i*batch_size: min((i + 1)*batch_size, len(imlist))]):
im_id = i*batch_size + im_num
objs = [classes[int(x[-1])] for x in output if int(x[0]) == im_id]
print("{0:20s} predicted in {1:6.3f} seconds".format(image.split("/")[-1], (end - start)/batch_size))
print("{0:20s} {1:s}".format("Objects Detected:", " ".join(objs)))
print("----------------------------------------------------------")
其中重點就是
prediction = model(Variable(batch), CUDA) #類呼叫,相當於呼叫類的__call__()函式,
prediction = write_results(prediction, confidence, num_classes, nms_conf = nms_thesh)
涉及到一個python語法,類例項呼叫.其實就相當於呼叫__call__().基類nn.module的__call__()裡呼叫了forward().所以這一句實際上就相當於呼叫model.forward(batch).
5.後處理
im_dim_list = torch.index_select(im_dim_list, 0, output[:,0].long())
scaling_factor = torch.min(416/im_dim_list,1)[0].view(-1,1)
output[:,[1,3]] -= (inp_dim - scaling_factor*im_dim_list[:,0].view(-1,1))/2
output[:,[2,4]] -= (inp_dim - scaling_factor*im_dim_list[:,1].view(-1,1))/2
output[:,1:5] /= scaling_factor
for i in range(output.shape[0]):
output[i, [1,3]] = torch.clamp(output[i, [1,3]], 0.0, im_dim_list[i,0])
output[i, [2,4]] = torch.clamp(output[i, [2,4]], 0.0, im_dim_list[i,1])
output中的box座標是相對於模型的輸入圖片的,將其對映到相對於原始圖片的位置.
圖片繪製,涉及python基礎語法參考https://www.cnblogs.com/sdu20112013/p/11216584.html
list(map(lambda x: write(x, loaded_ims), output))
det_names = pd.Series(imlist).apply(lambda x: "{}/det_{}".format(args.det,x.split("/")[-1]))
list(map(cv2.imwrite, det_names, loaded_ims))