Following up on the previous article, CenterNet (Part 1): Paper Walkthrough, let's look at how the authors actually implemented it in code.
The code can be downloaded here: Github CenterNet
First, a rough look at the directory layout:
1. Training Code
Let's start from the main function used to train the model:
CenterNet/src/main.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import _init_paths
import os
import torch
import torch.utils.data
from opts import opts
from models.model import create_model, load_model, save_model
from models.data_parallel import DataParallel
from logger import Logger
from datasets.dataset_factory import get_dataset
from trains.train_factory import train_factory
def main(opt):
  torch.manual_seed(opt.seed)
  torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
  Dataset = get_dataset(opt.dataset, opt.task)
  opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
  print(opt)

  logger = Logger(opt)

  os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
  opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

  print('Creating model...')
  model = create_model(opt.arch, opt.heads, opt.head_conv)
  optimizer = torch.optim.Adam(model.parameters(), opt.lr)
  start_epoch = 0
  if opt.load_model != '':
    model, optimizer, start_epoch = load_model(
      model, opt.load_model, optimizer, opt.resume, opt.lr, opt.lr_step)

  Trainer = train_factory[opt.task]
  trainer = Trainer(opt, model, optimizer)
  trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

  print('Setting up data...')
  val_loader = torch.utils.data.DataLoader(
      Dataset(opt, 'val'),
      batch_size=1,
      shuffle=False,
      num_workers=1,
      pin_memory=True
  )

  if opt.test:
    _, preds = trainer.val(0, val_loader)
    val_loader.dataset.run_eval(preds, opt.save_dir)
    return

  train_loader = torch.utils.data.DataLoader(
      Dataset(opt, 'train'),
      batch_size=opt.batch_size,
      shuffle=True,
      num_workers=opt.num_workers,
      pin_memory=True,
      drop_last=True
  )

  print('Starting training...')
  best = 1e10
  for epoch in range(start_epoch + 1, opt.num_epochs + 1):
    mark = epoch if opt.save_all else 'last'
    log_dict_train, _ = trainer.train(epoch, train_loader)
    logger.write('epoch: {} |'.format(epoch))
    for k, v in log_dict_train.items():
      logger.scalar_summary('train_{}'.format(k), v, epoch)
      logger.write('{} {:8f} | '.format(k, v))
    if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
      save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                 epoch, model, optimizer)
      with torch.no_grad():
        log_dict_val, preds = trainer.val(epoch, val_loader)
      for k, v in log_dict_val.items():
        logger.scalar_summary('val_{}'.format(k), v, epoch)
        logger.write('{} {:8f} | '.format(k, v))
      if log_dict_val[opt.metric] < best:
        best = log_dict_val[opt.metric]
        save_model(os.path.join(opt.save_dir, 'model_best.pth'),
                   epoch, model)
    else:
      save_model(os.path.join(opt.save_dir, 'model_last.pth'),
                 epoch, model, optimizer)
    logger.write('\n')
    if epoch in opt.lr_step:
      save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                 epoch, model, optimizer)
      lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
      print('Drop LR to', lr)
      for param_group in optimizer.param_groups:
        param_group['lr'] = lr
  logger.close()

if __name__ == '__main__':
  opt = opts().parse()
  """
  ctdet --exp_id coco_dla --batch_size 64 --master_batch 32 --lr 1.25e-4 --gpu 1,2,3 --num_workers 32
  """
  main(opt)
Let's walk through main() step by step.
a. torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
Setting benchmark = True lets cuDNN auto-tune and pick the most efficient algorithms for the current configuration, improving runtime efficiency.
b. Dataset = get_dataset(opt.dataset, opt.task)
Builds the dataset class needed to train the model for the given task.
c. opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
Updates dataset-related configuration and sets the model's output heads. For the bounding-box detection task (ctdet), we need three output heads: hm, wh, and reg.
elif opt.task == 'ctdet':
  # assert opt.dataset in ['pascal', 'coco']
  opt.heads = {'hm': opt.num_classes,
               'wh': 2 if not opt.cat_spec_wh else 2 * opt.num_classes}
  if opt.reg_offset:
    opt.heads.update({'reg': 2})
For COCO this yields opt.heads = {'hm': 80, 'wh': 2, 'reg': 2}. The opt.cat_spec_wh option was only used in the author's experiments and did not bring any improvement. In the author's words: "We never used cat_spec_wh in the experiments. I have tried once this on Pascal VOC but it doesn't give improvement. Feel free to try it on COCO yourself."
The two most important calls here are Dataset = get_dataset(opt.dataset, opt.task) and model = create_model(opt.arch, opt.heads, opt.head_conv). Let's analyze each of them in detail.
- CenterNet/src/lib/datasets/dataset_factory.py and CenterNet/src/lib/datasets/dataset/coco.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from .sample.ddd import DddDataset
from .sample.exdet import EXDetDataset
from .sample.ctdet import CTDetDataset
from .sample.multi_pose import MultiPoseDataset
from .dataset.coco import COCO
from .dataset.pascal import PascalVOC
from .dataset.kitti import KITTI
from .dataset.coco_hp import COCOHP
from .dataset.clothing import Clothing
dataset_factory = {
  'coco': COCO,
  'pascal': PascalVOC,
  'kitti': KITTI,
  'coco_hp': COCOHP,
  'clothing': Clothing
}

_sample_factory = {
  'exdet': EXDetDataset,
  'ctdet': CTDetDataset,
  'ddd': DddDataset,
  'multi_pose': MultiPoseDataset
}
def get_dataset(dataset, task):
  class Dataset(dataset_factory[dataset], _sample_factory[task]):
    pass
  return Dataset
As you can see, Dataset inherits from both a dataset_factory class and a _sample_factory class. We'll take the COCO data as the example (i.e. dataset='coco'), so based on the code above we start from CenterNet/src/lib/datasets/dataset/coco.py together with CenterNet/src/lib/datasets/sample/ctdet.py. A minimal sketch of what the factory produces follows.
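To make this mixin pattern concrete, here is a minimal self-contained sketch (the class and attribute names are illustrative, not from the repo): the dataset class contributes data loading and __len__, while the sample class contributes __getitem__.

import torch.utils.data as data

class CocoStyleDataset(data.Dataset):      # plays the role of datasets/dataset/coco.py
    def __init__(self, opt, split):
        self.samples = ['img_0.jpg', 'img_1.jpg']  # stand-in for COCO annotations
    def __len__(self):
        return len(self.samples)

class CtdetSampler(data.Dataset):          # plays the role of datasets/sample/ctdet.py
    def __getitem__(self, index):
        return {'input': self.samples[index]}      # stand-in for the real target dict

def get_dataset():
    class Dataset(CocoStyleDataset, CtdetSampler):  # same trick as the repo's get_dataset
        pass
    return Dataset

Dataset = get_dataset()
ds = Dataset(opt=None, split='train')
print(ds[0])  # {'input': 'img_0.jpg'}: __getitem__ from the sampler, data from the dataset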
First, let's look at coco.py:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import pycocotools.coco as coco
from pycocotools.cocoeval import COCOeval
import numpy as np
import json
import os

import torch.utils.data as data

class COCO(data.Dataset):
  num_classes = 80
  default_resolution = [512, 512]
  mean = np.array([0.40789654, 0.44719302, 0.47026115],
                  dtype=np.float32).reshape(1, 1, 3)
  std = np.array([0.28863828, 0.27408164, 0.27809835],
                 dtype=np.float32).reshape(1, 1, 3)

  def __init__(self, opt, split):
    super(COCO, self).__init__()
    self.data_dir = os.path.join(opt.data_dir, 'coco')  # dataset root directory
    self.img_dir = os.path.join(self.data_dir, '{}2017'.format(split))  # image directory for this split
    if split == 'test':
      self.annot_path = os.path.join(
        self.data_dir, 'annotations',
        'image_info_test-dev2017.json').format(split)
    else:
      if opt.task == 'exdet':
        self.annot_path = os.path.join(
          self.data_dir, 'annotations',
          'instances_extreme_{}2017.json').format(split)
      else:
        self.annot_path = os.path.join(
          self.data_dir, 'annotations',
          'instances_{}2017.json').format(split)
    self.max_objs = 100
    self.class_name = [
      '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane',
      'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
      'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse',
      'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack',
      'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis',
      'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
      'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass',
      'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich',
      'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake',
      'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv',
      'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
      'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
      'scissors', 'teddy bear', 'hair drier', 'toothbrush']
    self._valid_ids = [
      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13,
      14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
      24, 25, 27, 28, 31, 32, 33, 34, 35, 36,
      37, 38, 39, 40, 41, 42, 43, 44, 46, 47,
      48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
      58, 59, 60, 61, 62, 63, 64, 65, 67, 70,
      72, 73, 74, 75, 76, 77, 78, 79, 80, 81,
      82, 84, 85, 86, 87, 88, 89, 90]
    self.cat_ids = {v: i for i, v in enumerate(self._valid_ids)}  # map COCO category ids to contiguous class indices
    self.voc_color = [(v // 32 * 64 + 64, (v // 8) % 4 * 64, v % 8 * 32) \
                      for v in range(1, self.num_classes + 1)]
    self._data_rng = np.random.RandomState(123)
    self._eig_val = np.array([0.2141788, 0.01817699, 0.00341571],
                             dtype=np.float32)
    # _eig_val / _eig_vec parameterize the PCA color augmentation applied later
    self._eig_vec = np.array([
      [-0.58752847, -0.69563484, 0.41340352],
      [-0.5832747, 0.00994535, -0.81221408],
      [-0.56089297, 0.71832671, 0.41158938]
    ], dtype=np.float32)
    # self.mean = np.array([0.485, 0.456, 0.406], np.float32).reshape(1, 1, 3)
    # self.std = np.array([0.229, 0.224, 0.225], np.float32).reshape(1, 1, 3)
    self.split = split
    self.opt = opt

    print('==> initializing coco 2017 {} data.'.format(split))
    self.coco = coco.COCO(self.annot_path)
    self.images = self.coco.getImgIds()
    self.num_samples = len(self.images)

    print('Loaded {} {} samples'.format(split, self.num_samples))

  def _to_float(self, x):
    return float("{:.2f}".format(x))

  # convert the detections of every image into the COCO evaluation format; used when dumping results
  def convert_eval_format(self, all_bboxes):
    # import pdb; pdb.set_trace()
    detections = []
    for image_id in all_bboxes:
      for cls_ind in all_bboxes[image_id]:
        category_id = self._valid_ids[cls_ind - 1]
        for bbox in all_bboxes[image_id][cls_ind]:
          bbox[2] -= bbox[0]
          bbox[3] -= bbox[1]
          score = bbox[4]
          bbox_out = list(map(self._to_float, bbox[0:4]))
          detection = {
            "image_id": int(image_id),
            "category_id": int(category_id),
            "bbox": bbox_out,
            "score": float("{:.2f}".format(score))
          }
          if len(bbox) > 5:
            extreme_points = list(map(self._to_float, bbox[5:13]))
            detection["extreme_points"] = extreme_points
          detections.append(detection)
    return detections

  def __len__(self):
    return self.num_samples

  def save_results(self, results, save_dir):
    json.dump(self.convert_eval_format(results),
              open('{}/results.json'.format(save_dir), 'w'))

  def run_eval(self, results, save_dir):
    # result_json = os.path.join(save_dir, "results.json")
    # detections = self.convert_eval_format(results)
    # json.dump(detections, open(result_json, "w"))
    self.save_results(results, save_dir)
    coco_dets = self.coco.loadRes('{}/results.json'.format(save_dir))
    coco_eval = COCOeval(self.coco, coco_dets, "bbox")
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()
Next, let's look at ctdet.py.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import torch.utils.data as data
import numpy as np
import torch
import json
import cv2
import os
from utils.image import flip, color_aug
from utils.image import get_affine_transform, affine_transform
from utils.image import gaussian_radius, draw_umich_gaussian, draw_msra_gaussian
from utils.image import draw_dense_reg
import math
class CTDetDataset(data.Dataset):
  def _coco_box_to_bbox(self, box):
    # convert COCO [x, y, w, h] to [x1, y1, x2, y2]
    bbox = np.array([box[0], box[1], box[0] + box[2], box[1] + box[3]],
                    dtype=np.float32)
    return bbox

  def _get_border(self, border, size):
    # border is 128; size is the image width or height
    i = 1
    while size - border // i <= border // i:
      # if the image side is shorter than border * 2, keep doubling i;
      # normally returns 128, or 64 for images shorter than 256
      i *= 2
    return border // i

  def __getitem__(self, index):
    img_id = self.images[index]
    file_name = self.coco.loadImgs(ids=[img_id])[0]['file_name']
    img_path = os.path.join(self.img_dir, file_name)
    ann_ids = self.coco.getAnnIds(imgIds=[img_id])
    anns = self.coco.loadAnns(ids=ann_ids)
    num_objs = min(len(anns), self.max_objs)  # number of objects, capped at 100

    img = cv2.imread(img_path)
    try:
      height, width = img.shape[0], img.shape[1]
    except:
      print(img_path)

    c = np.array([img.shape[1] / 2., img.shape[0] / 2.], dtype=np.float32)  # image center
    if self.opt.keep_res:  # False by default
      input_h = (height | self.opt.pad) + 1
      input_w = (width | self.opt.pad) + 1
      s = np.array([input_w, input_h], dtype=np.float32)
    else:
      s = max(img.shape[0], img.shape[1]) * 1.0  # s is the longest side
      input_h, input_w = self.opt.input_h, self.opt.input_w  # 512, 512

    flipped = False
    if self.split == 'train':
      if not self.opt.not_rand_crop:
        s = s * np.random.choice(np.arange(0.6, 1.4, 0.1))  # random scale
        w_border = self._get_border(128, img.shape[1])
        h_border = self._get_border(128, img.shape[0])
        c[0] = np.random.randint(low=w_border, high=img.shape[1] - w_border)
        c[1] = np.random.randint(low=h_border, high=img.shape[0] - h_border)
      else:
        sf = self.opt.scale
        cf = self.opt.shift
        c[0] += s * np.clip(np.random.randn()*cf, -2*cf, 2*cf)
        c[1] += s * np.clip(np.random.randn()*cf, -2*cf, 2*cf)
        s = s * np.clip(np.random.randn()*sf + 1, 1 - sf, 1 + sf)
      if np.random.random() < self.opt.flip:
        flipped = True
        img = img[:, ::-1, :]
        c[0] = width - c[0] - 1  # random horizontal flip

    trans_input = get_affine_transform(
      c, s, 0, [input_w, input_h])
    inp = cv2.warpAffine(img, trans_input,
                         (input_w, input_h),
                         flags=cv2.INTER_LINEAR)  # affine transform
    inp = (inp.astype(np.float32) / 255.)
    if self.split == 'train' and not self.opt.no_color_aug:
      color_aug(self._data_rng, inp, self._eig_val, self._eig_vec)
    # normalization
    inp = (inp - self.mean) / self.std
    inp = inp.transpose(2, 0, 1)

    output_h = input_h // self.opt.down_ratio  # 512 // 4 = 128
    output_w = input_w // self.opt.down_ratio
    num_classes = self.num_classes
    trans_output = get_affine_transform(c, s, 0, [output_w, output_h])

    hm = np.zeros((num_classes, output_h, output_w), dtype=np.float32)  # heatmap (80, 128, 128)
    wh = np.zeros((self.max_objs, 2), dtype=np.float32)  # box width/height per object (100, 2)
    dense_wh = np.zeros((2, output_h, output_w), dtype=np.float32)  # (2, 128, 128)
    reg = np.zeros((self.max_objs, 2), dtype=np.float32)  # sub-pixel offsets lost to downsampling, (100, 2) floats
    ind = np.zeros((self.max_objs), dtype=np.int64)  # 100 flattened center indices
    reg_mask = np.zeros((self.max_objs), dtype=np.uint8)  # 100 regression masks
    cat_spec_wh = np.zeros((self.max_objs, num_classes * 2), dtype=np.float32)  # (100, 80 * 2)
    cat_spec_mask = np.zeros((self.max_objs, num_classes * 2), dtype=np.uint8)  # (100, 80 * 2)

    draw_gaussian = draw_msra_gaussian if self.opt.mse_loss else \
                    draw_umich_gaussian

    gt_det = []
    for k in range(num_objs):
      ann = anns[k]
      bbox = self._coco_box_to_bbox(ann['bbox'])
      cls_id = int(self.cat_ids[ann['category_id']])
      if flipped:
        bbox[[0, 2]] = width - bbox[[2, 0]] - 1
      bbox[:2] = affine_transform(bbox[:2], trans_output)
      bbox[2:] = affine_transform(bbox[2:], trans_output)
      bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, output_w - 1)
      bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, output_h - 1)
      h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
      if h > 0 and w > 0:
        radius = gaussian_radius((math.ceil(h), math.ceil(w)))
        radius = max(0, int(radius))
        radius = self.opt.hm_gauss if self.opt.mse_loss else radius
        ct = np.array(
          [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], dtype=np.float32)
        ct_int = ct.astype(np.int32)
        draw_gaussian(hm[cls_id], ct_int, radius)
        #cv2.imwrite("/data/humaocheng/CenterNet-master/single_heatmap.jpg", hm[0]*255)
        wh[k] = 1. * w, 1. * h  # ground-truth box width/height, used by the size loss
        ind[k] = ct_int[1] * output_w + ct_int[0]  # index of the object center in the 128x128 feature map
        reg[k] = ct - ct_int  # offset loss target: float center ct minus integer center; k is the k-th object
        # e.g. [98.97667, 2.3566666] - [98, 2] = [0.97667, 0.3566666]
        reg_mask[k] = 1  # mask marking slots that contain a real object
        cat_spec_wh[k, cls_id * 2: cls_id * 2 + 2] = wh[k]
        cat_spec_mask[k, cls_id * 2: cls_id * 2 + 2] = 1
        if self.opt.dense_wh:
          draw_dense_reg(dense_wh, hm.max(axis=0), ct_int, wh[k], radius)
        gt_det.append([ct[0] - w / 2, ct[1] - h / 2,
                       ct[0] + w / 2, ct[1] + h / 2, 1, cls_id])
    # cv2.imwrite("/data/humaocheng/CenterNet-master/heatmap.jpg", hm[0]*255)

    ret = {'input': inp, 'hm': hm, 'reg_mask': reg_mask, 'ind': ind, 'wh': wh}
    if self.opt.dense_wh:
      hm_a = hm.max(axis=0, keepdims=True)
      dense_wh_mask = np.concatenate([hm_a, hm_a], axis=0)
      ret.update({'dense_wh': dense_wh, 'dense_wh_mask': dense_wh_mask})
      del ret['wh']
    elif self.opt.cat_spec_wh:
      ret.update({'cat_spec_wh': cat_spec_wh, 'cat_spec_mask': cat_spec_mask})
      del ret['wh']
    if self.opt.reg_offset:
      ret.update({'reg': reg})
    if self.opt.debug > 0 or not self.split == 'train':
      gt_det = np.array(gt_det, dtype=np.float32) if len(gt_det) > 0 else \
               np.zeros((1, 6), dtype=np.float32)
      meta = {'c': c, 's': s, 'gt_det': gt_det, 'img_id': img_id}
      ret['meta'] = meta
    return ret
Our entry point is again the __getitem__(self, index) function. Let's go through the inputs and targets it produces, one at a time.
img_id = self.images[index]
file_name = self.coco.loadImgs(ids=[img_id])[0]['file_name']
img_path = os.path.join(self.img_dir, file_name)
ann_ids = self.coco.getAnnIds(imgIds=[img_id])
anns = self.coco.loadAnns(ids=ann_ids)
num_objs = min(len(anns), self.max_objs)  # number of objects, capped at 100
img = cv2.imread(img_path)
We fetch the image's img_id, use it to look up the image file name and path, then use img_id again to fetch the annotation ids ann_ids, which in turn give us the corresponding labels. One thing worth stressing: num_objs is the number of top center points we keep per image (playing a role somewhat like NMS's top-k cap). Think of it as a hyperparameter; the default here is 100.
We also compute the image center: c = np.array([img.shape[1] / 2., img.shape[0] / 2.], dtype=np.float32).
Next we record the longest side of the image and the input resolution (512, 512):
if self.opt.keep_res:  # False by default
  input_h = (height | self.opt.pad) + 1
  input_w = (width | self.opt.pad) + 1
  s = np.array([input_w, input_h], dtype=np.float32)
else:
  s = max(img.shape[0], img.shape[1]) * 1.0  # s is the longest side
  input_h, input_w = self.opt.input_h, self.opt.input_w  # 512, 512
The remaining code applies a series of transformations for better generalization (data augmentation). The final result is the first thing we need: the network input image.
flipped = False
if self.split == 'train':
  if not self.opt.not_rand_crop:
    s = s * np.random.choice(np.arange(0.6, 1.4, 0.1))  # random scale
    w_border = self._get_border(128, img.shape[1])
    h_border = self._get_border(128, img.shape[0])
    c[0] = np.random.randint(low=w_border, high=img.shape[1] - w_border)
    c[1] = np.random.randint(low=h_border, high=img.shape[0] - h_border)
  else:
    sf = self.opt.scale
    cf = self.opt.shift
    c[0] += s * np.clip(np.random.randn()*cf, -2*cf, 2*cf)
    c[1] += s * np.clip(np.random.randn()*cf, -2*cf, 2*cf)
    s = s * np.clip(np.random.randn()*sf + 1, 1 - sf, 1 + sf)
  if np.random.random() < self.opt.flip:
    flipped = True
    img = img[:, ::-1, :]
    c[0] = width - c[0] - 1  # random horizontal flip

trans_input = get_affine_transform(
  c, s, 0, [input_w, input_h])
inp = cv2.warpAffine(img, trans_input,
                     (input_w, input_h),
                     flags=cv2.INTER_LINEAR)  # affine transform
inp = (inp.astype(np.float32) / 255.)
if self.split == 'train' and not self.opt.no_color_aug:
  color_aug(self._data_rng, inp, self._eig_val, self._eig_vec)
# normalization
inp = (inp - self.mean) / self.std
inp = inp.transpose(2, 0, 1)
Next we need to generate the heatmap targets.
output_h = input_h // self.opt.down_ratio  # 512 // 4 = 128
output_w = input_w // self.opt.down_ratio
num_classes = self.num_classes  # num_classes = 80
trans_output = get_affine_transform(c, s, 0, [output_w, output_h])

hm = np.zeros((num_classes, output_h, output_w), dtype=np.float32)  # heatmap (80, 128, 128)
wh = np.zeros((self.max_objs, 2), dtype=np.float32)  # box width/height per object (100, 2)
dense_wh = np.zeros((2, output_h, output_w), dtype=np.float32)  # (2, 128, 128)
reg = np.zeros((self.max_objs, 2), dtype=np.float32)  # downsampling offsets, (100, 2) floats
ind = np.zeros((self.max_objs), dtype=np.int64)  # 100 flattened center indices
reg_mask = np.zeros((self.max_objs), dtype=np.uint8)  # 100 regression masks
cat_spec_wh = np.zeros((self.max_objs, num_classes * 2), dtype=np.float32)  # (100, 80 * 2)
cat_spec_mask = np.zeros((self.max_objs, num_classes * 2), dtype=np.uint8)  # (100, 80 * 2)

draw_gaussian = draw_msra_gaussian if self.opt.mse_loss else \
                draw_umich_gaussian
Since mse_loss is False here, we only need to look at the draw_umich_gaussian function.
gt_det = []
for k in range(num_objs):
  ann = anns[k]
  bbox = self._coco_box_to_bbox(ann['bbox'])
  cls_id = int(self.cat_ids[ann['category_id']])
  if flipped:
    bbox[[0, 2]] = width - bbox[[2, 0]] - 1
  bbox[:2] = affine_transform(bbox[:2], trans_output)
  bbox[2:] = affine_transform(bbox[2:], trans_output)
  bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, output_w - 1)
  bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, output_h - 1)
  h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
  if h > 0 and w > 0:
    radius = gaussian_radius((math.ceil(h), math.ceil(w)))
    radius = max(0, int(radius))
    radius = self.opt.hm_gauss if self.opt.mse_loss else radius
    ct = np.array(
      [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], dtype=np.float32)
    ct_int = ct.astype(np.int32)
    draw_gaussian(hm[cls_id], ct_int, radius)
    wh[k] = 1. * w, 1. * h  # ground-truth box width/height, used by the size loss
    ind[k] = ct_int[1] * output_w + ct_int[0]  # index of the center in the 128x128 feature map
    reg[k] = ct - ct_int  # offset loss target: float center minus integer center
    # e.g. [98.97667, 2.3566666] - [98, 2] = [0.97667, 0.3566666]
    reg_mask[k] = 1  # mask marking slots that contain a real object
    cat_spec_wh[k, cls_id * 2: cls_id * 2 + 2] = wh[k]
    cat_spec_mask[k, cls_id * 2: cls_id * 2 + 2] = 1
    if self.opt.dense_wh:
      draw_dense_reg(dense_wh, hm.max(axis=0), ct_int, wh[k], radius)
    gt_det.append([ct[0] - w / 2, ct[1] - h / 2,
                   ct[0] + w / 2, ct[1] + h / 2, 1, cls_id])
Let's look at how this code actually draws the heatmap. The key point is computing the Gaussian radius. The function below solves for it; it is copied directly from CornerNet, and CornerNet's radius derivation is a good reference. The radius is chosen so that any pair of points within it would still form a box with at least some threshold IoU against the ground-truth box. CornerNet explains it like this:
"We determine the radius by the size of an object by ensuring that a pair of points within the radius would generate a bounding box with at least t IoU with the ground-truth annotation (we set t to 0.3 in all experiments)."
Note that in CenterNet the function is called with its default min_overlap=0.7. A worked example follows the code.
def gaussian_radius(det_size, min_overlap=0.7):
  height, width = det_size

  a1 = 1
  b1 = (height + width)
  c1 = width * height * (1 - min_overlap) / (1 + min_overlap)
  sq1 = np.sqrt(b1 ** 2 - 4 * a1 * c1)
  r1 = (b1 + sq1) / 2

  a2 = 4
  b2 = 2 * (height + width)
  c2 = (1 - min_overlap) * width * height
  sq2 = np.sqrt(b2 ** 2 - 4 * a2 * c2)
  r2 = (b2 + sq2) / 2

  a3 = 4 * min_overlap
  b3 = -2 * min_overlap * (height + width)
  c3 = (min_overlap - 1) * width * height
  sq3 = np.sqrt(b3 ** 2 - 4 * a3 * c3)
  r3 = (b3 + sq3) / 2
  return min(r1, r2, r3)
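As a quick sanity check (the numbers are my own, not from the repo): for a 24×24 box the three candidate radii come out to roughly r1 ≈ 45.8, r2 ≈ 88.2 and r3 ≈ 6.6, so the radius actually used is min(r1, r2, r3) ≈ 6.6, truncated to 6 before drawing.

# hypothetical check, reusing gaussian_radius defined above
r = gaussian_radius((24, 24))   # ~6.56 with the default min_overlap = 0.7
radius = max(0, int(r))         # 6, the integer radius drawn into the heatmap
print(r, radius)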
With the radius in hand, draw_umich_gaussian draws the Gaussian peak we want onto the heatmap.
def draw_umich_gaussian(heatmap, center, radius, k=1):
  diameter = 2 * radius + 1
  gaussian = gaussian2D((diameter, diameter), sigma=diameter / 6)

  x, y = int(center[0]), int(center[1])

  height, width = heatmap.shape[0:2]

  left, right = min(x, radius), min(width - x, radius + 1)
  top, bottom = min(y, radius), min(height - y, radius + 1)

  masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right]
  masked_gaussian = gaussian[radius - top:radius + bottom, radius - left:radius + right]
  if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0:  # TODO debug
    np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap)
  return heatmap
Here center is actually the bounding-box center point. np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap) keeps stamping keypoint Gaussians on top of the existing heatmap, so boxes of the same class keep accumulating on that class's channel. Through the outer for loop over objects, every target gets painted onto the heatmap, giving us our second output.
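draw_umich_gaussian calls gaussian2D, which isn't shown above. For reference, here is a minimal version consistent with how it is used (an unnormalized 2-D Gaussian that peaks at 1; this matches utils/image.py as far as I recall, so treat it as a sketch):

import numpy as np

def gaussian2D(shape, sigma=1):
    # shape = (diameter, diameter); returns a kernel whose center value is 1
    m, n = [(ss - 1.) / 2. for ss in shape]
    y, x = np.ogrid[-m:m + 1, -n:n + 1]
    h = np.exp(-(x * x + y * y) / (2 * sigma * sigma))
    h[h < np.finfo(h.dtype).eps * h.max()] = 0  # zero out negligible tails
    return h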
The code above has introduced the outputs input and hm. The remaining three targets, wh, ind and reg, can be seen in this part of the code:
wh[k] = 1. * w, 1. * h  # ground-truth box width/height, used by the size loss
ind[k] = ct_int[1] * output_w + ct_int[0]  # index of the center in the 128x128 feature map
reg[k] = ct - ct_int  # offset loss target: float center minus integer center
# e.g. [98.97667, 2.3566666] - [98, 2] = [0.97667, 0.3566666]
reg_mask[k] = 1  # mask marking slots that contain a real object
cat_spec_wh[k, cls_id * 2: cls_id * 2 + 2] = wh[k]
cat_spec_mask[k, cls_id * 2: cls_id * 2 + 2] = 1
if self.opt.dense_wh:
  draw_dense_reg(dense_wh, hm.max(axis=0), ct_int, wh[k], radius)
gt_det.append([ct[0] - w / 2, ct[1] - h / 2,
               ct[0] + w / 2, ct[1] + h / 2, 1, cls_id])
Here w and h are the width and height of the target box, and ind is the flattened index of the target center on the output feature map.
reg[k] is the offset: it exists because casting the float center ct to int loses the fractional part, e.g. [98.97667, 2.3566666] - [98, 2] = [0.97667, 0.3566666]. The offset loss is exactly what recovers this error.
reg_mask records the (up to) 100 object slots: it marks which slots of an image actually contain a target, with the corresponding index set to 1 and everything else 0. With that, every key in
ret = {'input': inp, 'hm': hm, 'reg_mask': reg_mask, 'ind': ind, 'wh': wh}
has been covered. Finally, ret.update({'reg': reg}) adds the reg item to the ret dict. A small round-trip example for ind follows.
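Since ind packs the 2-D center into one integer, it may help to see the round trip explicitly (a toy sketch; output_w = 128 as above):

import numpy as np

output_w = 128
ct = np.array([98.97667, 2.3566666], dtype=np.float32)  # float center (x, y)
ct_int = ct.astype(np.int32)                            # [98, 2]

ind = ct_int[1] * output_w + ct_int[0]                  # 2 * 128 + 98 = 354
reg = ct - ct_int                                       # [0.97667, 0.3566666]

# recover the integer center from ind, then refine it with reg
x, y = ind % output_w, ind // output_w                  # (98, 2)
print(np.array([x, y]) + reg)                           # back to [98.97667, 2.3566666]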
- Next, let's look at model.py.
With the data preprocessing covered, we move on to the model code.
from .networks.msra_resnet import get_pose_net
from .networks.dlav0 import get_pose_net as get_dlav0
from .networks.pose_dla_dcn import get_pose_net as get_dla_dcn
from .networks.resnet_dcn import get_pose_net as get_pose_net_dcn
from .networks.large_hourglass import get_large_hourglass_net
_model_factory = {
  'res': get_pose_net,  # default Resnet with deconv
  'dlav0': get_dlav0,  # default DLAup
  'dla': get_dla_dcn,
  'resdcn': get_pose_net_dcn,
  'hourglass': get_large_hourglass_net,
}
def create_model(arch, heads, head_conv):
  num_layers = int(arch[arch.find('_') + 1:]) if '_' in arch else 0
  arch = arch[:arch.find('_')] if '_' in arch else arch
  get_model = _model_factory[arch]
  model = get_model(num_layers=num_layers, heads=heads, head_conv=head_conv)
  return model
Here arch defaults to dla_34; the 34 denotes the depth of the DLA feature extractor.
As the code shows, we end up with get_pose_net, which lives in CenterNet/src/lib/models/networks/pose_dla_dcn.py. I won't go into the feature-extraction backbone here; this link helps in understanding the model: cvpr2018 Deep Layer Aggregation (DLANet).
Let's look at what the DLA model's output end looks like.
for head in self.heads:
  classes = self.heads[head]
  if head_conv > 0:
    fc = nn.Sequential(
      nn.Conv2d(channels[self.first_level], head_conv,
                kernel_size=3, padding=1, bias=True),
      nn.ReLU(inplace=True),
      nn.Conv2d(head_conv, classes,
                kernel_size=final_kernel, stride=1,
                padding=final_kernel // 2, bias=True))
    # ... bias initialization and the head_conv == 0 branch omitted ...
    self.__setattr__(head, fc)
Take heads = {'hm': 3, 'wh': 2, 'reg': 2} as an example (a 3-class detection task). Each head takes the DLA output feature map, first passes it through a 3×3 convolution that outputs head_conv (256) channels, then a ReLU, and finally a 1×1 convolution whose output is 3 channels for hm, 2 channels for wh, and 2 channels for reg.
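A minimal standalone sketch of these heads (the 64 input channels, head_conv = 256 and final_kernel = 1 are assumptions matching the dla_34 defaults, not taken from this snippet):

import torch
import torch.nn as nn

in_channels, head_conv, final_kernel = 64, 256, 1  # assumed dla_34 defaults
heads = {'hm': 3, 'wh': 2, 'reg': 2}               # 3-class example from the text

head_layers = {
    name: nn.Sequential(
        nn.Conv2d(in_channels, head_conv, kernel_size=3, padding=1, bias=True),
        nn.ReLU(inplace=True),
        nn.Conv2d(head_conv, classes, kernel_size=final_kernel, stride=1,
                  padding=final_kernel // 2, bias=True))
    for name, classes in heads.items()
}

feat = torch.randn(1, in_channels, 128, 128)  # stand-in for the DLA feature map
out = {name: layer(feat) for name, layer in head_layers.items()}
print({name: tuple(t.shape) for name, t in out.items()})
# {'hm': (1, 3, 128, 128), 'wh': (1, 2, 128, 128), 'reg': (1, 2, 128, 128)}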
- Finally, let's look at the loss definition. See CenterNet/src/lib/trains/ctdet.py:
def forward(self, outputs, batch):
  opt = self.opt
  hm_loss, wh_loss, off_loss = 0, 0, 0
  for s in range(opt.num_stacks):  # num_stacks = 1
    output = outputs[s]
    if not opt.mse_loss:
      output['hm'] = _sigmoid(output['hm'])

    if opt.eval_oracle_hm:
      output['hm'] = batch['hm']
    if opt.eval_oracle_wh:
      output['wh'] = torch.from_numpy(gen_oracle_map(
        batch['wh'].detach().cpu().numpy(),
        batch['ind'].detach().cpu().numpy(),
        output['wh'].shape[3], output['wh'].shape[2])).to(opt.device)
    if opt.eval_oracle_offset:
      output['reg'] = torch.from_numpy(gen_oracle_map(
        batch['reg'].detach().cpu().numpy(),
        batch['ind'].detach().cpu().numpy(),
        output['reg'].shape[3], output['reg'].shape[2])).to(opt.device)

    hm_loss += self.crit(output['hm'], batch['hm']) / opt.num_stacks
    if opt.wh_weight > 0:
      if opt.dense_wh:
        mask_weight = batch['dense_wh_mask'].sum() + 1e-4
        wh_loss += (
          self.crit_wh(output['wh'] * batch['dense_wh_mask'],
                       batch['dense_wh'] * batch['dense_wh_mask']) /
          mask_weight) / opt.num_stacks
      elif opt.cat_spec_wh:
        wh_loss += self.crit_wh(
          output['wh'], batch['cat_spec_mask'],
          batch['ind'], batch['cat_spec_wh']) / opt.num_stacks
      else:
        wh_loss += self.crit_reg(
          output['wh'], batch['reg_mask'],
          batch['ind'], batch['wh']) / opt.num_stacks

    if opt.reg_offset and opt.off_weight > 0:
      off_loss += self.crit_reg(output['reg'], batch['reg_mask'],
                                batch['ind'], batch['reg']) / opt.num_stacks

  loss = opt.hm_weight * hm_loss + opt.wh_weight * wh_loss + \
         opt.off_weight * off_loss
  loss_stats = {'loss': loss, 'hm_loss': hm_loss,
                'wh_loss': wh_loss, 'off_loss': off_loss}
  return loss, loss_stats
output['hm'] = _sigmoid(output['hm'])
hm_loss += self.crit(output['hm'], batch['hm']) / opt.num_stacks
My own reading is that the sigmoid normalizes the heatmap into (0, 1) before the heatmap loss is computed, which speeds up convergence. A note on _sigmoid follows.
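For reference, _sigmoid in the repo is a clamped sigmoid; the clamp bounds below are what I remember from models/utils.py, so treat them as an assumption:

import torch

def _sigmoid(x):
  # clamp away from exactly 0 and 1 so that log(pred) and log(1 - pred)
  # in the focal loss stay finite
  return torch.clamp(x.sigmoid_(), min=1e-4, max=1 - 1e-4)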
elif opt.cat_spec_wh:
  wh_loss += self.crit_wh(
    output['wh'], batch['cat_spec_mask'],
    batch['ind'], batch['cat_spec_wh']) / opt.num_stacks

def forward(self, output, mask, ind, target):
  pred = _tranpose_and_gather_feat(output, ind)
  mask = mask.unsqueeze(2).expand_as(pred).float()
  # loss = F.l1_loss(pred * mask, target * mask, reduction='elementwise_mean')
  loss = F.l1_loss(pred * mask, target * mask, size_average=False)
  loss = loss / (mask.sum() + 1e-4)
  return loss
def _gather_feat(feat, ind, mask=None):
  dim = feat.size(2)
  ind = ind.unsqueeze(2).expand(ind.size(0), ind.size(1), dim)
  feat = feat.gather(1, ind)
  if mask is not None:
    mask = mask.unsqueeze(2).expand_as(feat)
    feat = feat[mask]
    feat = feat.view(-1, dim)
  return feat

def _tranpose_and_gather_feat(feat, ind):
  feat = feat.permute(0, 2, 3, 1).contiguous()
  feat = feat.view(feat.size(0), -1, feat.size(3))
  feat = _gather_feat(feat, ind)
  return feat
We have effectively recorded, in ind, each target's flattened address on the heatmap; _tranpose_and_gather_feat together with _gather_feat then pull the predicted widths/heights out of the feature map at exactly those addresses. In short, _gather_feat extracts the elements of feat addressed by ind.
_gather_feat also erases the distinction between channels: the resulting indices refer to all channels at once. Inputs (assuming topk_inds and topk_ind are passed in, as in the decode step later): feat (topk_inds): batch × (cat·K) × 1; ind (topk_ind): batch × K. First ind gains an extra axis, becoming batch × K × 1, and gather then extracts the values ind addresses. What comes back is an index tensor feat: batch × K × 1 with values in [0, cat·K - 1].
The general case: feat: A × B × C, ind: A × D. ind gains an extra axis and is expanded to A × D × C, where for any i, j all elements of ind[i, j, :] are identical, equal to the original A × D ind[i, j]. gather then extracts the addressed values, giving feat: A × D × C. A concrete toy run is shown below.
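A toy run of _gather_feat (defined above) with A = 1, B = 6, C = 2, D = 3; the numbers are arbitrary:

import torch

feat = torch.arange(12, dtype=torch.float32).view(1, 6, 2)  # A x B x C
ind = torch.tensor([[0, 2, 5]])                             # A x D

print(_gather_feat(feat, ind))
# tensor([[[ 0.,  1.],
#          [ 4.,  5.],
#          [10., 11.]]])  -> rows 0, 2 and 5 of feat; shape A x D x C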
off_loss += self.crit_reg(output['reg'], batch['reg_mask'], batch['ind'], batch['reg']) / opt.num_stacks
The offset loss follows exactly the same principle and computation as the wh loss above.
The total loss is then: loss = opt.hm_weight * hm_loss + opt.wh_weight * wh_loss + opt.off_weight * off_loss
2. Testing Code
A good entry point here is the demo.py function.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import _init_paths
import os
import cv2
from opts import opts
from detectors.detector_factory import detector_factory
image_ext = ['jpg', 'jpeg', 'png', 'webp']
video_ext = ['mp4', 'mov', 'avi', 'mkv']
time_stats = ['tot', 'load', 'pre', 'net', 'dec', 'post', 'merge']
def demo(opt):
  os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
  opt.debug = max(opt.debug, 1)
  Detector = detector_factory[opt.task]
  detector = Detector(opt)

  if opt.demo == 'webcam' or \
    opt.demo[opt.demo.rfind('.') + 1:].lower() in video_ext:
    cam = cv2.VideoCapture(0 if opt.demo == 'webcam' else opt.demo)
    detector.pause = False
    while True:
      _, img = cam.read()
      cv2.imshow('input', img)
      ret = detector.run(img)
      time_str = ''
      for stat in time_stats:
        time_str = time_str + '{} {:.3f}s |'.format(stat, ret[stat])
      print(time_str)
      if cv2.waitKey(1) == 27:
        return  # esc to quit
  else:
    if os.path.isdir(opt.demo):
      image_names = []
      ls = os.listdir(opt.demo)
      for file_name in sorted(ls):
        ext = file_name[file_name.rfind('.') + 1:].lower()
        if ext in image_ext:
          image_names.append(os.path.join(opt.demo, file_name))
    else:
      image_names = [opt.demo]
    for (image_name) in image_names:
      ret = detector.run(image_name)
      time_str = ''
      for stat in time_stats:
        time_str = time_str + '{} {:.3f}s |'.format(stat, ret[stat])
      print(time_str)

if __name__ == '__main__':
  opt = opts().init()
  demo(opt)
Following
Detector = detector_factory[opt.task]
detector = Detector(opt)
the file we mainly need to study is CenterNet/src/lib/detectors/ctdet.py.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import cv2
import numpy as np
from progress.bar import Bar
import time
import torch
try:
  from external.nms import soft_nms
except:
  print('NMS not imported! If you need it,'
        ' do \n cd $CenterNet_ROOT/src/lib/external \n make')
from models.decode import ctdet_decode
from models.utils import flip_tensor
from utils.image import get_affine_transform
from utils.post_process import ctdet_post_process
from utils.debugger import Debugger
from .base_detector import BaseDetector
class CtdetDetector(BaseDetector):
  def __init__(self, opt):
    super(CtdetDetector, self).__init__(opt)

  def process(self, images, return_time=False):
    with torch.no_grad():
      output = self.model(images)[-1]
      hm = output['hm'].sigmoid_()
      wh = output['wh']
      reg = output['reg'] if self.opt.reg_offset else None
      if self.opt.flip_test:
        hm = (hm[0:1] + flip_tensor(hm[1:2])) / 2
        wh = (wh[0:1] + flip_tensor(wh[1:2])) / 2
        reg = reg[0:1] if reg is not None else None
      torch.cuda.synchronize()
      forward_time = time.time()
      dets = ctdet_decode(hm, wh, reg=reg, cat_spec_wh=self.opt.cat_spec_wh, K=self.opt.K)

    if return_time:
      return output, dets, forward_time
    else:
      return output, dets

  def post_process(self, dets, meta, scale=1):
    dets = dets.detach().cpu().numpy()
    dets = dets.reshape(1, -1, dets.shape[2])
    dets = ctdet_post_process(
      dets.copy(), [meta['c']], [meta['s']],
      meta['out_height'], meta['out_width'], self.opt.num_classes)
    for j in range(1, self.num_classes + 1):
      dets[0][j] = np.array(dets[0][j], dtype=np.float32).reshape(-1, 5)
      dets[0][j][:, :4] /= scale
    return dets[0]

  def merge_outputs(self, detections):
    results = {}
    for j in range(1, self.num_classes + 1):
      results[j] = np.concatenate(
        [detection[j] for detection in detections], axis=0).astype(np.float32)
      if len(self.scales) > 1 or self.opt.nms:
        soft_nms(results[j], Nt=0.5, method=2)
    scores = np.hstack(
      [results[j][:, 4] for j in range(1, self.num_classes + 1)])
    if len(scores) > self.max_per_image:
      kth = len(scores) - self.max_per_image
      thresh = np.partition(scores, kth)[kth]
      for j in range(1, self.num_classes + 1):
        keep_inds = (results[j][:, 4] >= thresh)
        results[j] = results[j][keep_inds]
    return results

  def debug(self, debugger, images, dets, output, scale=1):
    detection = dets.detach().cpu().numpy().copy()
    detection[:, :, :4] *= self.opt.down_ratio
    for i in range(1):
      img = images[i].detach().cpu().numpy().transpose(1, 2, 0)
      img = ((img * self.std + self.mean) * 255).astype(np.uint8)
      pred = debugger.gen_colormap(output['hm'][i].detach().cpu().numpy())
      debugger.add_blend_img(img, pred, 'pred_hm_{:.1f}'.format(scale))
      debugger.add_img(img, img_id='out_pred_{:.1f}'.format(scale))
      for k in range(len(dets[i])):
        if detection[i, k, 4] > self.opt.center_thresh:
          debugger.add_coco_bbox(detection[i, k, :4], detection[i, k, -1],
                                 detection[i, k, 4],
                                 img_id='out_pred_{:.1f}'.format(scale))

  def show_results(self, debugger, image, results):
    debugger.add_img(image, img_id='ctdet')
    for j in range(1, self.num_classes + 1):
      for bbox in results[j]:
        if bbox[4] > self.opt.vis_thresh:
          debugger.add_coco_bbox(bbox[:4], j - 1, bbox[4], img_id='ctdet')
    debugger.show_all_imgs(pause=self.pause)
Let's start with def process(self, images, return_time=False).
def process(self, images, return_time=False):
  with torch.no_grad():
    output = self.model(images)[-1]
    hm = output['hm'].sigmoid_()
    wh = output['wh']
    reg = output['reg'] if self.opt.reg_offset else None
    if self.opt.flip_test:  # False
      hm = (hm[0:1] + flip_tensor(hm[1:2])) / 2
      wh = (wh[0:1] + flip_tensor(wh[1:2])) / 2
      reg = reg[0:1] if reg is not None else None
    torch.cuda.synchronize()
    forward_time = time.time()
    dets = ctdet_decode(hm, wh, reg=reg, cat_spec_wh=self.opt.cat_spec_wh, K=self.opt.K)
First we take the predicted heatmap hm; since we applied a sigmoid during training, we apply the same sigmoid at inference time. Then we enter dets = ctdet_decode(hm, wh, reg=reg, cat_spec_wh=self.opt.cat_spec_wh, K=self.opt.K), whose main job is to convert the heatmap into bboxes.
- First we enter the _nms function:
def _nms(heat, kernel=3):
  pad = (kernel - 1) // 2
  hmax = nn.functional.max_pool2d(
    heat, (kernel, kernel), stride=1, padding=pad)
  keep = (hmax == heat).float()
  return heat * keep
hmax uses max pooling to find, for each position, the maximum over its 8-neighborhood; keep marks the positions where the value equals that local maximum; heat * keep then keeps only the local peaks at their original values and zeroes everything else. A toy run follows.
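A toy run (the tensor values are made up), reusing the _nms defined above, showing that only the local peak survives:

import torch

heat = torch.tensor([[[[0.1, 0.2, 0.1],
                       [0.2, 0.9, 0.2],
                       [0.1, 0.2, 0.1]]]])  # 1 x 1 x 3 x 3 heatmap with one peak

print(_nms(heat).squeeze())
# tensor([[0.0000, 0.0000, 0.0000],
#         [0.0000, 0.9000, 0.0000],
#         [0.0000, 0.0000, 0.0000]])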
- Next we enter the _topk function:
def _topk(scores, K=40):
  batch, cat, height, width = scores.size()

  topk_scores, topk_inds = torch.topk(scores.view(batch, cat, -1), K)

  topk_inds = topk_inds % (height * width)
  topk_ys = (topk_inds / width).int().float()
  topk_xs = (topk_inds % width).int().float()

  topk_score, topk_ind = torch.topk(topk_scores.view(batch, -1), K)
  topk_clses = (topk_ind / K).int()
  topk_inds = _gather_feat(
    topk_inds.view(batch, -1, 1), topk_ind).view(batch, K)
  topk_ys = _gather_feat(topk_ys.view(batch, -1, 1), topk_ind).view(batch, K)
  topk_xs = _gather_feat(topk_xs.view(batch, -1, 1), topk_ind).view(batch, K)

  return topk_score, topk_inds, topk_clses, topk_ys, topk_xs
topk_scores: batch × cat × K, where batch is the batch size, cat the number of classes, and K the number of maxima kept. topk_inds: batch × cat × K, with index values in [0, W·H - 1]. So topk_scores and topk_inds hold, per batch element and per heatmap (per class), the top-K scores and their ids. Taking the remainder and quotient of topk_inds by the width then yields the coordinates topk_ys and topk_xs. Next, the top-K scores and ids are taken across all heatmaps of each batch element: topk_score: batch × K, topk_ind: batch × K, with index values now in [0, cat·K - 1]. Finally topk_inds (after a view) and topk_ind are passed to the _gather_feat function in the utils file:

_gather_feat
def _gather_feat(feat, ind, mask=None):
  dim = feat.size(2)
  ind = ind.unsqueeze(2).expand(ind.size(0), ind.size(1), dim)
  feat = feat.gather(1, ind)
  if mask is not None:
    mask = mask.unsqueeze(2).expand_as(feat)
    feat = feat[mask]
    feat = feat.view(-1, dim)
  return feat

The shapes are exactly as described earlier: feat (topk_inds): batch × (cat·K) × 1 with ind (topk_ind): batch × K gives back batch × K × 1 index values in [0, cat·K - 1]; in the general case, feat: A × B × C with ind: A × D gives A × D × C.
scores, inds, clses, ys, xs = _topk(heat, K=K)
The final return has five values: topk_score, topk_inds, topk_clses, topk_ys, topk_xs. topk_score: batch × K, the K largest values per image. topk_inds: batch × K, the indices of those K values, each in [0, W·H - 1]. The last three are analogous (class ids and the y/x coordinates).
- reg = _tranpose_and_gather_feat(reg, inds) and wh = _tranpose_and_gather_feat(wh, inds) both use inds to gather the reg and wh predictions at the peak locations; the boxes are then assembled from them, as sketched below.
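Inside ctdet_decode, the gathered reg refines the integer peak coordinates and wh spans the box around them. A runnable sketch consistent with the decode logic described here (all input values are made up):

import torch

batch, K = 1, 2
xs = torch.tensor([[98., 40.]])                 # peak x coords from _topk (batch x K)
ys = torch.tensor([[2., 60.]])                  # peak y coords from _topk
reg = torch.rand(batch, K, 2) * 0.99            # gathered sub-pixel offsets
wh = torch.tensor([[[24., 32.], [10., 16.]]])   # gathered box sizes

xs = xs.view(batch, K, 1) + reg[:, :, 0:1]      # refine x by the predicted offset
ys = ys.view(batch, K, 1) + reg[:, :, 1:2]      # refine y by the predicted offset
bboxes = torch.cat([xs - wh[..., 0:1] / 2,
                    ys - wh[..., 1:2] / 2,
                    xs + wh[..., 0:1] / 2,
                    ys + wh[..., 1:2] / 2], dim=2)  # batch x K x 4 boxes on the 128x128 map
print(bboxes.shape)  # torch.Size([1, 2, 4])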
- Afterwards come the scale transformations and other post-processing, followed by soft_nms (soft-NMS removes redundant bboxes; based on the scores, the top 100 bboxes are kept for detection). Finally the scores are filtered to drop everything beyond the per-image maximum number of detections (100).
def post_process(self, dets, meta, scale=1):
  dets = dets.detach().cpu().numpy()
  dets = dets.reshape(1, -1, dets.shape[2])
  dets = ctdet_post_process(
    dets.copy(), [meta['c']], [meta['s']],
    meta['out_height'], meta['out_width'], self.opt.num_classes)
  for j in range(1, self.num_classes + 1):
    dets[0][j] = np.array(dets[0][j], dtype=np.float32).reshape(-1, 5)
    dets[0][j][:, :4] /= scale
  return dets[0]

def merge_outputs(self, detections):
  results = {}
  for j in range(1, self.num_classes + 1):
    results[j] = np.concatenate(
      [detection[j] for detection in detections], axis=0).astype(np.float32)
    if len(self.scales) > 1 or self.opt.nms:
      soft_nms(results[j], Nt=0.5, method=2)
  scores = np.hstack(
    [results[j][:, 4] for j in range(1, self.num_classes + 1)])
  if len(scores) > self.max_per_image:
    kth = len(scores) - self.max_per_image
    thresh = np.partition(scores, kth)[kth]
    for j in range(1, self.num_classes + 1):
      keep_inds = (results[j][:, 4] >= thresh)
      results[j] = results[j][keep_inds]
  return results

def run(self, image_or_path_or_tensor, meta=None):
  load_time, pre_time, net_time, dec_time, post_time = 0, 0, 0, 0, 0
  merge_time, tot_time = 0, 0
  debugger = Debugger(dataset=self.opt.dataset, ipynb=(self.opt.debug==3),
                      theme=self.opt.debugger_theme)
  start_time = time.time()
  pre_processed = False
  if isinstance(image_or_path_or_tensor, np.ndarray):
    image = image_or_path_or_tensor
  elif type(image_or_path_or_tensor) == type (''):
    image = cv2.imread(image_or_path_or_tensor)
  else:
    image = image_or_path_or_tensor['image'][0].numpy()
    pre_processed_images = image_or_path_or_tensor
    pre_processed = True

  loaded_time = time.time()
  load_time += (loaded_time - start_time)

  detections = []
  for scale in self.scales:
    scale_start_time = time.time()
    if not pre_processed:
      images, meta = self.pre_process(image, scale, meta)
    else:
      # import pdb; pdb.set_trace()
      images = pre_processed_images['images'][scale][0]
      meta = pre_processed_images['meta'][scale]
      meta = {k: v.numpy()[0] for k, v in meta.items()}
    images = images.to(self.opt.device)
    torch.cuda.synchronize()
    pre_process_time = time.time()
    pre_time += pre_process_time - scale_start_time

    output, dets, forward_time = self.process(images, return_time=True)

    torch.cuda.synchronize()
    net_time += forward_time - pre_process_time
    decode_time = time.time()
    dec_time += decode_time - forward_time

    if self.opt.debug >= 2:
      self.debug(debugger, images, dets, output, scale)

    dets = self.post_process(dets, meta, scale)
    torch.cuda.synchronize()
    post_process_time = time.time()
    post_time += post_process_time - decode_time
    detections.append(dets)

  results = self.merge_outputs(detections)
  torch.cuda.synchronize()
  end_time = time.time()
  merge_time += end_time - post_process_time
  tot_time += end_time - start_time

  if self.opt.debug >= 1:
    self.show_results(debugger, image, results)

  return {'results': results, 'tot': tot_time, 'load': load_time,
          'pre': pre_time, 'net': net_time, 'dec': dec_time,
          'post': post_time, 'merge': merge_time}
For soft-NMS, this article is a good reference: Paper Reading: Soft-NMS.
The traditional NMS procedure:
1. Sort the candidate boxes by classification score; say there are 4 boxes with confidences A > B > C > D.
2. Mark the highest-confidence box A as kept.
3. Starting from A, compute the IoU (intersection over union) of B, C and D with A; if, say, D's overlap with A exceeds a set threshold (0.5), discard D.
4. From the remaining boxes B and C, pick the higher-confidence one, B, mark it as kept, then check C against B and discard it if their overlap exceeds the threshold.
5. Repeat until every box is either kept or discarded.
Drawbacks of traditional NMS (soft-NMS's fix is sketched after this list):
1. The biggest problem is that NMS forcibly zeroes the scores of neighboring boxes (it removes every box whose overlap exceeds the threshold Nt). If a real object sits in the overlap region, its detection fails, lowering the average precision (AP).
2. The NMS threshold is hard to choose: set too low it causes false removals, set too high it increases false positives.
3. NMS is generally computed on the CPU and cannot use the GPU.
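Soft-NMS addresses point 1 by decaying scores instead of zeroing them. A minimal sketch of the score update used by the Gaussian variant (method=2 in the soft_nms call above); the sigma value here is an assumption:

import numpy as np

def soft_nms_decay(scores, ious, sigma=0.5):
    # Gaussian soft-NMS: instead of zeroing boxes with IoU > Nt,
    # decay every overlapping box's score by exp(-iou^2 / sigma)
    return scores * np.exp(-(ious ** 2) / sigma)

scores = np.array([0.9, 0.8])   # two boxes overlapping the current top box
ious = np.array([0.7, 0.1])     # their IoU with the top box
print(soft_nms_decay(scores, ious))  # [0.337..., 0.784...]: heavy overlap is suppressed, not removed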
I wrote this post as notes on understanding the model code, to make my own later review easier. If you spot problems, you are welcome to point them out.