2019.11.17 Update Notes
- Some issues with reproducing this project
To everyone who asked me via private message or in the comments: first of all, my apologies for not checking Jianshu and replying in time. This project is something I tried to reproduce at the beginning of this year; the author has since kept extending it and making it more robust, and the concrete steps are written up fairly clearly on the author's GitHub. What I have here is only a rough, simplified description. The last time I debugged this project, in May, the author had already integrated the mAP evaluation into the Python test code (although it still calls MATLAB), which was already much more complete than the version I had debugged earlier.
- On the final mAP test at that time.
threshold=0.500000
Elapsed time is 4.047161 seconds.
AP:0.131 at overlap 0.5 for BaseballPitch
AP:0.443 at overlap 0.5 for BasketballDunk
AP:0.095 at overlap 0.5 for Billiards
AP:0.431 at overlap 0.5 for CleanAndJerk
AP:0.527 at overlap 0.5 for CliffDiving
AP:0.298 at overlap 0.5 for CricketBowling
AP:0.124 at overlap 0.5 for CricketShot
AP:0.408 at overlap 0.5 for Diving
AP:0.037 at overlap 0.5 for FrisbeeCatch
AP:0.080 at overlap 0.5 for GolfSwing
AP:0.545 at overlap 0.5 for HammerThrow
AP:0.173 at overlap 0.5 for HighJump
AP:0.361 at overlap 0.5 for JavelinThrow
AP:0.599 at overlap 0.5 for LongJump
AP:0.476 at overlap 0.5 for PoleVault
AP:0.131 at overlap 0.5 for Shotput
AP:0.049 at overlap 0.5 for SoccerPenalty
AP:0.122 at overlap 0.5 for TennisSwing
AP:0.224 at overlap 0.5 for ThrowDiscus
AP:0.058 at overlap 0.5 for VolleyballSpiking
MAP: 0.265574
The above is the final result I reproduced using res18; comparing with the author's README, it does not differ much from his numbers. Because I later moved on to other work this project was shelved for a long time, and this final prediction result is all that is left.
- On using the mAP computation code
The tool the author uses here is still an interface called from MATLAB; the catch is that to get the mAP result you need to keep the log produced during testing.
#!/bin/bash
GPU_ID=0,1
EX_DIR=thumos14
export PYTHONUNBUFFERED=true
LOG="test_log.txt"
time python test_net.py --cuda --net i3d-res50 \
${EXTRA_ARGS} \
2>&1 | tee $LOG
The above is a script I wrote at the time; it tees the output into test_log.txt, and you only need to substitute your own net.
Then use thumos14_log_analysis.py under evaluation/thumos14:
import sys, os, errno
import numpy as np
import csv
import json
import copy
import argparse
import subprocess
THIS_DIR = os.path.dirname(os.path.abspath(__file__))
FRAME_DIR = '/home/simon/THUMOS14'
META_DIR = os.path.join(FRAME_DIR, 'annotation_')
def nms(dets, thresh=0.4):
"""Pure Python NMS baseline."""
if len(dets) == 0: return []
x1 = dets[:, 0]
x2 = dets[:, 1]
scores = dets[:, 2]
lengths = x2 - x1
order = scores.argsort()[::-1]
keep = []
while order.size > 0:
i = order[0]
keep.append(i)
xx1 = np.maximum(x1[i], x1[order[1:]])
xx2 = np.minimum(x2[i], x2[order[1:]])
inter = np.maximum(0.0, xx2 - xx1)
ovr = inter / (lengths[i] + lengths[order[1:]] - inter)
inds = np.where(ovr <= thresh)[0]
order = order[inds + 1]
return keep
def generate_classes(meta_dir, split, use_ambiguous=False):
class_id = {0: 'Background'}
with open(os.path.join(meta_dir, 'class-index-detection.txt'), 'r') as f:
lines = f.readlines()
for l in lines:
cname = l.strip().split()[-1]
cid = int(l.strip().split()[0])
class_id[cid] = cname
if use_ambiguous:
class_id[21] = 'Ambiguous'
return class_id
'''
def get_segments(data, thresh, framerate):
segments = []
vid = 'Background'
find_next = False
tmp = {'label' : 0, 'score': 0, 'segment': [0, 0]}
for l in data:
# video name and sliding window length
if "fg_name :" in l:
vid = l.split('/')[-1]
# frame index, time, confident score
elif "frames :" in l:
start_frame=int(l.split()[4])
end_frame=int(l.split()[5])
stride = int(l.split()[6].split(']')[0])
elif "activity:" in l:
label = int(l.split()[1])
tmp['label'] = label
find_next = True
elif "im_detect" in l:
return vid, segments
elif find_next:
try:
left_frame = float(l.split()[0].split('[')[-1])*stride + start_frame
right_frame = float(l.split()[1])*stride + start_frame
except:
left_frame = float(l.split()[1])*stride + start_frame
right_frame = float(l.split()[2])*stride + start_frame
if (left_frame < end_frame) and (right_frame <= end_frame):
left = left_frame / 25.0
right = right_frame / 25.0
try:
score = float(l.split()[-1].split(']')[0])
except:
score = float(l.split()[-2])
if score > thresh:
tmp1 = copy.deepcopy(tmp)
tmp1['score'] = score
tmp1['segment'] = [left, right]
segments.append(tmp1)
elif (left_frame < end_frame) and (right_frame > end_frame):
if (end_frame-left_frame)*1.0/(right_frame-left_frame)>=0:
right_frame = end_frame
left = left_frame / 25.0
right = right_frame / 25.0
try:
score = float(l.split()[-1].split(']')[0])
except:
score = float(l.split()[-2])
if score > thresh:
tmp1 = copy.deepcopy(tmp)
tmp1['score'] = score
tmp1['segment'] = [left, right]
segments.append(tmp1)
'''
def get_segments(data, thresh, framerate):
segments = []
vid = 'Background'
find_next = False
tmp = {'label' : 0, 'score': 0, 'segment': [0, 0]}
for l in data:
# video name and sliding window length
if "fg_name:" in l:
vid = l.split('/')[-1]
# frame index, time, confident score
elif "frames:" in l:
start_frame=int(l.split()[3])
end_frame=int(l.split()[4])
stride = int(l.split()[5].split(']')[0])
elif "activity:" in l:
label = int(l.split()[1])
tmp['label'] = label
find_next = True
elif "im_detect" in l:
return vid, segments
elif find_next:
try:
left_frame = float(l.split()[0].split('[')[-1])*stride + start_frame
right_frame = float(l.split()[1])*stride + start_frame
except:
left_frame = float(l.split()[1])*stride + start_frame
right_frame = float(l.split()[2])*stride + start_frame
try:
score = float(l.split()[-1].split(']')[0])
except:
score = float(l.split()[-2])
if (left_frame >= right_frame):
print("???", l)
continue
if right_frame > end_frame:
#print("right out", right_frame, end_frame)
right_frame = end_frame
left = left_frame / framerate
right = right_frame / framerate
if score > thresh:
tmp1 = copy.deepcopy(tmp)
tmp1['score'] = score
tmp1['segment'] = [left, right]
segments.append(tmp1)
def analysis_log(logfile, thresh, framerate):
with open(logfile, 'r') as f:
lines = f.read().splitlines()
predict_data = []
res = {}
for l in lines:
if "frames:" in l:
predict_data = []
predict_data.append(l)
if "im_detect:" in l:
vid, segments = get_segments(predict_data, thresh, framerate)
if vid not in res:
res[vid] = []
res[vid] += segments
return res
def select_top(segmentations, nms_thresh=0.99999, num_cls=0, topk=0):
res = {}
for vid, vinfo in segmentations.items():
# select most likely classes
if num_cls > 0:
ave_scores = np.zeros(21)
            for i in range(1, 21):  # range, not xrange, so this also runs under Python 3
ave_scores[i] = np.sum([d['score'] for d in vinfo if d['label']==i])
labels = list(ave_scores.argsort()[::-1][:num_cls])
else:
labels = list(set([d['label'] for d in vinfo]))
# NMS
res_nms = []
for lab in labels:
nms_in = [d['segment'] + [d['score']] for d in vinfo if d['label'] == lab]
keep = nms(np.array(nms_in), nms_thresh)
for i in keep:
# tmp = {'label':classes[lab], 'score':nms_in[i][2], 'segment': nms_in[i][0:2]}
tmp = {'label': lab, 'score':nms_in[i][2], 'segment': nms_in[i][0:2]}
res_nms.append(tmp)
# select topk
scores = [d['score'] for d in res_nms]
sortid = np.argsort(scores)[-topk:]
res[vid] = [res_nms[id] for id in sortid]
return res
parser = argparse.ArgumentParser(description="log analysis.py")
parser.add_argument('log_file', type=str, help="test log file path")
parser.add_argument('--framerate', type=int, help="frame rate used when extracting frames with ffmpeg")  # framerate is required here, otherwise the script errors out; it accommodates the different frame rates we may have used when splitting videos into frames
parser.add_argument('--thresh', type=float, default=0.005, help="filter dets whose score is lower than the thresh, default=0.005")
parser.add_argument('--nms_thresh', type=float, default=0.4, help="nms thresh, default=0.4")
parser.add_argument('--topk', type=int, default=200, help="select topk dets, default=200")
parser.add_argument('--num_cls', type=int, default=0, help="select most likely classes, default=0")
args = parser.parse_args()
classes = generate_classes(META_DIR+'test', 'test', use_ambiguous=False)
segmentations = analysis_log(args.log_file, thresh = args.thresh, framerate=args.framerate)
segmentations = select_top(segmentations, nms_thresh=args.nms_thresh, num_cls=args.num_cls, topk=args.topk)
res = {'version': 'VERSION 1.3',
'external_data': {'used': True, 'details': 'C3D pre-trained on activity-1.3 training set'},
'results': {}}
for vid, vinfo in segmentations.items():
res['results'][vid] = vinfo
#with open('results.json', 'w') as outfile:
# json.dump(res, outfile)
with open('tmp.txt', 'w') as outfile:
for vid, vinfo in segmentations.items():
for seg in vinfo:
outfile.write("{} {} {} {} {}\n".format(vid, seg['segment'][0], seg['segment'][1], int(seg['label']) ,seg['score']))
def matlab_eval():
print('Computing results with the official Matlab eval code')
path = os.path.join(THIS_DIR, 'Evaluation')
cmd = 'cp tmp.txt {} && '.format(path)
cmd += 'cd {} && '.format(path)
cmd += 'matlab -nodisplay -nodesktop '
cmd += '-r "dbstop if error; '
cmd += 'eval_thumos14(); quit;"'
    print('Running: \n {}'.format(cmd))
status = subprocess.call(cmd, shell=True)
matlab_eval()
The file as I used it at the time is shown above, with the relevant notes marked in the comments; see the figure for an example run.
- That is as far as this work went for now. Take whatever information is useful for your own needs; if it helps even a little I will be glad. Also, Facebook's SlowFast has recently been open-sourced and feels more practically valuable, so have a look at it if you are interested.
Starting new work on temporal action detection
A new piece of work I started recently, beginning from R-C3D.pytorch as the baseline (it appears to have been ported by someone from UESTC, which really saved me here, many thanks); see the link for the project. The method combines the C3D framework with the Faster R-CNN approach, i.e. a combination of two existing works. It has to be said that in this area, once a reasonably good approach appears, it is often just two existing methods glued together and presented as a new one... which feels very true to life. Our idea is likewise to port the earlier ma-i3d network over to replace the C3D network here and see whether it helps improve temporal detection at all.
Difficulties of temporal action detection
Based on my understanding of temporal action detection over the past couple of days, the plan is to build a new framework on top of the R-C3D work. Two difficulties stand out. One is that detected temporal segments need precise boundaries: when exactly does an action start and when does it end? Compared with drawing bounding boxes around a static object this is harder (even computing on extracted frames, you still need to be accurate down to the start frame). The other is that recognition has to exploit temporal information. Looking at the THUMOS2014 dataset, both training and validation always contain a feature folder and a video folder: the feature folder holds some pre-extracted information and the video folder holds the raw videos. If we simply cut frames ourselves there will be some issues. Whether it is R-C3D or SCNN, proposals have to be found first, and the more accurate they are, the more they help the later classification.
The datasets I currently want to work with are THUMOS2014, ActivityNet_v1.3, AVA (depending on circumstances) and Charades. The THUMOS2014 training set is actually just the UCF101 dataset; download it and take a look and you will see. In practice, existing datasets all have some issues: watermarked videos, or clips so short that even humans struggle to understand them, let alone machines... fortunately machines do not get tired the way people do.
Getting the R-C3D baseline running
If you understand C3D and have some exposure to approaches like Faster R-CNN, you will see this is simply two pieces of work combined (proposal + classification). Its network structure is easy to understand, and this article explains it in more detail than I do. There are essentially two sub-networks; the first does temporal action detection, finding the start and end frames and judging whether the span is a complete action, and the training set does give the specified start-time and end-time for this.
Issues encountered with R-C3D.pytorch and the eventual fixes
Let's walk in detail through the debugging of this project and the concrete problems that came up.
First I took a general look at how the work is implemented; I have not studied the concrete metrics yet and simply debugged until results came out. The files contained in the project are shown in Figure 1.
Because the dataset lives on a remote server and is rather large, I downloaded only two or three videos for testing. There was one problem I could not solve at first because the author's write-up is quite unclear; anyone using this dataset for the first time may misunderstand it, and I paid for that by getting stuck at the data reading and preprocessing stage.
Here we first need to talk about the THUMOS2014 dataset. We already know its training set is the UCF101 dataset mentioned above, but the most important point is that this dataset covers two tasks, recognition and detection, and it was spun off from a competition; in other words, UCF101 was never meant to be used for the detection part. Figure 2 below shows the THUMOS2014 dataset on our server: 1-training is the UCF101 dataset, with 101 action classes and 13320 trimmed video clips in total. The THUMOS2014 validation and test sets contain 1010 and 1574 untrimmed videos respectively, which are what the 3-validation and 6-testing folders hold. For the temporal action detection task, only the untrimmed videos of 20 action classes carry temporal segment annotations: 200 validation videos (with 3007 action segments) and 213 test videos (with 3358 action segments). These annotated untrimmed videos can be used to train and test temporal action detection models. The point is that this is the data meant for detection, i.e. the training and test data we actually need. At first I did not grasp this and kept wondering how to fit UCF101 in as training data, which wasted quite a bit of time; it really should not have happened. It is all written in the dataset paper, which I should have read first; reading only the R-C3D method did not get me very far. Figure 3 below shows some of the files in 3-validation, and Figure 4 the files in 6-testing.
Once the task is understood, following the complete pipeline for it, the first step is to see what the official README says; Figure 5 lays out the prerequisite preparation fairly clearly.
Then the input videos have to be split into frames; this part is mandatory. It was hard to understand at first: there are 1010 untrimmed videos in validation for training, so why can't all of them be used? Because what we are doing here is detection, and as mentioned above only the untrimmed videos of 20 action classes have temporal segment annotations. So if we want temporally annotated action information, this little data is all our machine gets to learn from. It is like having only two books to revise from and nothing else: read them or not, if you don't you learn nothing, and if you do you at least make some progress. Let's look at some concrete files in Figures 7-9.
The files in Figure 7 all contain temporal annotation information, i.e. the times at which an action can be detected. Figure 8 shows, for a specific action, which videos it appears in; the two numbers after the video name are the start and end times of that occurrence (not a guess, this has been verified). Figure 9 is a frame I captured that visibly shows the action, and the timeline at the bottom matches the start and end times in Figure 8 exactly. The times are given in seconds, from one second mark to another, with sub-second decimal precision (fairly coarse, but acceptable; finer precision would not change much anyway, since this is a temporal action and one extra static frame does not necessarily help).
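To make the annotation format concrete, here is a minimal sketch of reading one line of such a file, mirroring what dataset_label_parser() in util.py (shown later) does; the video name and times are made-up placeholders.
# Each line of e.g. BaseballPitch_val.txt is "<video_name> <start_sec> <end_sec>".
line = "video_validation_0000051 66.6 71.8"   # made-up example line
vid_name = line.strip().split()[0]
start_t = float(line.strip().split()[1])
end_t = float(line.strip().split()[2])
print(vid_name, start_t, end_t)               # video_validation_0000051 66.6 71.8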
Now let's follow along and read generate_frames.py, which lives under the R-C3D.pytorch/process/thumos2014/ folder; the version with my modifications is given here:
#coding=utf-8
# --------------------------------------------------------
# R-C3D
# Copyright (c) 2017 Boston University
# Licensed under The MIT License [see LICENSE for details]
# Written by Huijuan Xu
# --------------------------------------------------------
import os
from util import *
import json
import glob
fps = 25
ext = '.mp4'
VIDEO_DIR = '/home/simon/ApplyEyeMakeup'
FRAME_DIR = '/home/simon/THUMOS14'
META_DIR = os.path.join(FRAME_DIR, 'annotation_')
def generate_frame(split):
SUB_FRAME_DIR = os.path.join(FRAME_DIR, split)
mkdir(SUB_FRAME_DIR)
segment = dataset_label_parser(META_DIR+split, split, use_ambiguous=True)
video_list = segment.keys()
for vid in video_list:
filename = os.path.join(VIDEO_DIR, vid+ext)
outpath = os.path.join(FRAME_DIR, split, vid)
outfile = os.path.join(outpath, "image_%5d.jpg")
mkdir(outpath)
ffmpeg(filename, outfile, fps)
for framename in os.listdir(outpath):
resize(os.path.join(outpath, framename))
frame_size = len(os.listdir(outpath))
print (filename, fps, frame_size)
generate_frame('val')
#generate_frame('test')
#generate_frame('testing')
Here VIDEO_DIR holds the raw videos we have, and FRAME_DIR is where the generated frames are stored. This file alone is not enough, because it relies on the dataset_label_parser() method defined in util.py in the same folder, so util.py is pasted below as well:
# --------------------------------------------------------
# R-C3D
# Copyright (c) 2017 Boston University
# Licensed under The MIT License [see LICENSE for details]
# Written by Huijuan Xu
# --------------------------------------------------------
import subprocess
#import shutil
import os, errno
import cv2
from collections import defaultdict
import shutil
import matplotlib
import numpy as np
def dataset_label_parser(meta_dir, split, use_ambiguous=False):
class_id = defaultdict(int)
with open(os.path.join(meta_dir, 'class-index-detection.txt'), 'r') as f:
lines = f.readlines()
for l in lines:
            cname = l.strip().split()[-1]  # class name
            #print(cname)
            cid = int(l.strip().split()[0])  # class id
class_id[cname] = cid
if use_ambiguous:
class_id['Ambiguous'] = 21
segment = {}
#video_instance = set()
for cname in class_id.keys():
tmp = '{}_{}.txt'.format(cname, split)
with open(os.path.join(meta_dir, tmp)) as f:
lines = f.readlines()
for l in lines:
vid_name = l.strip().split()[0]
start_t = float(l.strip().split()[1])
end_t = float(l.strip().split()[2])
#video_instance.add(vid_name)
# initionalize at the first time
if not vid_name in segment.keys():
segment[vid_name] = [[start_t, end_t, class_id[cname]]]
else:
segment[vid_name].append([start_t, end_t, class_id[cname]])
# sort segments by start_time
for vid in segment:
segment[vid].sort(key=lambda x: x[0])
if True:
keys = list(segment.keys())
keys.sort()
with open('segment.txt', 'w') as f:
for k in keys:
f.write("{}\n{}\n\n".format(k,segment[k]))
return segment
def get_segment_len(segment):
segment_len = []
for vid_seg in segment.values():
for seg in vid_seg:
l = seg[1] - seg[0]
assert l > 0
segment_len.append(l)
return segment_len
def mkdir(path):
try:
os.makedirs(path)
except OSError as e:
if e.errno != errno.EEXIST:
raise
def rm(path):
try:
shutil.rmtree(path)
except OSError as e:
if e.errno != errno.ENOENT:
raise
def ffmpeg(filename, outfile, fps):
command = ["ffmpeg", "-i", filename, "-q:v", "1", "-r", str(fps), outfile]
pipe = subprocess.Popen(command, stdout = subprocess.PIPE, stderr = subprocess.STDOUT)
pipe.communicate()
def resize(filename, size = (171, 128)):
img = cv2.imread(filename, 100)
img2 = cv2.resize(img, size, interpolation=cv2.INTER_LINEAR)
cv2.imwrite(filename, img2, [100])
# get segs_len from segments by: segs_len = [ s[1]-s[0] for v in segments.values() for s in v ]
def kmeans(segs_len, K=5, vis=False):
    from sklearn.cluster import KMeans   # this import was missing in the original file
    import matplotlib.pyplot as plt      # the original called the bare matplotlib module, which has no scatter/title/show
    X = np.array(segs_len).reshape(-1, 1)
    cls = KMeans(K).fit(X)
    print("the cluster centers are: ")
    print(cls.cluster_centers_)
    if vis:
        markers = ['^', 'x', 'o', '*', '+']
        for i in range(K):
            members = cls.labels_ == i
            plt.scatter(X[members, 0], X[members, 0], s=60, marker=markers[min(i, K-1)], c='b', alpha=0.5)
        plt.title(' ')
        plt.show()
Let's first look directly at the 'class-index-detection.txt' used in util.py; this file stores the indices and names of the 20 annotated classes we have:
7 BaseballPitch
9 BasketballDunk
12 Billiards
21 CleanAndJerk
22 CliffDiving
23 CricketBowling
24 CricketShot
26 Diving
31 FrisbeeCatch
33 GolfSwing
36 HammerThrow
40 HighJump
45 JavelinThrow
51 LongJump
68 PoleVault
79 Shotput
85 SoccerPenalty
92 TennisSwing
93 ThrowDiscus
97 VolleyballSpiking
Note that Ambiguous is not in this list; presumably it only marks something that looks like an action without a usable class, and since it is not among the categories we cannot learn which specific class it is.
Then, back to generate_frames.py: the THUMOS14 folder here is one we create ourselves, and it contains the annotation_val files; anyone with a bit of programming background can see that, so I will not belabor it. The annotation folder contains the annotation information from Figure 7 above, which locates the start and end times of these actions.
OK, reading further into dataset_label_parser() in util.py: it also writes out its own segment.txt, which holds not only the action annotations we store but also a trailing number, which (from the code) is the class id. The sampling rate has already been set to 25 above. Frames are then extracted according to the segments and stored into the corresponding folders, which is not a problem.
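Because the annotations are in seconds while the roidb later works in frame indices, the conversion is just a multiplication by the sampling rate, exactly what generate_roidb_training.py below does with db[:, :2] * FPS; a tiny illustration (the times are made up):
FPS = 25
start_t, end_t = 66.6, 71.8                   # made-up annotated times in seconds
start_frame, end_frame = start_t * FPS, end_t * FPS
print(start_frame, end_frame)                 # 1665.0 1795.0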
The next step is to look at generate_roidb_training.py.
#coding=utf-8
# --------------------------------------------------------
# R-C3D
# Copyright (c) 2017 Boston University
# Licensed under The MIT License [see LICENSE for details]
# Written by Huijuan Xu
# --------------------------------------------------------
import os
import copy
import json
import pickle
import subprocess
import numpy as np
import cv2
from util import *
import glob
FPS = 25
ext = '.mp4'
LENGTH = 768
min_length = 3
overlap_thresh = 0.7
STEP = LENGTH / 4
WINS = [LENGTH * 1]
FRAME_DIR = '/home/simon/THUMOS14'## it can be changed
META_DIR = os.path.join(FRAME_DIR, 'annotation_')
print ('Generate Training Segments')
train_segment = dataset_label_parser(META_DIR+'val', 'val', use_ambiguous=False)
def generate_roi(rois, video, start, end, stride, split):
tmp = {}
tmp['wins'] = ( rois[:,:2] - start ) / stride
tmp['durations'] = tmp['wins'][:,1] - tmp['wins'][:,0]
tmp['gt_classes'] = rois[:,2]
tmp['max_classes'] = rois[:,2]
tmp['max_overlaps'] = np.ones(len(rois))
tmp['flipped'] = False
tmp['frames'] = np.array([[0, start, end, stride]])
tmp['bg_name'] = os.path.join(FRAME_DIR, split, video)
tmp['fg_name'] = os.path.join(FRAME_DIR, split, video)
if not os.path.isfile(os.path.join(FRAME_DIR, split, video, 'image_' + str(end-1).zfill(5) + '.jpg')):
print (os.path.join(FRAME_DIR, split, video, 'image_' + str(end-1).zfill(5) + '.jpg'))
raise
return tmp
def generate_roidb(split, segment):
VIDEO_PATH = os.path.join(FRAME_DIR, split)
video_list = set(os.listdir(VIDEO_PATH))
duration = []
roidb = []
for vid in segment:
if vid in video_list:
length = len(os.listdir(os.path.join(VIDEO_PATH, vid)))
db = np.array(segment[vid])
if len(db) == 0:
continue
db[:,:2] = db[:,:2] * FPS
for win in WINS:
# inner of windows
stride = int(win / LENGTH)
# Outer of windows
step = int(stride * STEP)
# Forward Direction
for start in range(0, max(1, length - win + 1), step):
end = min(start + win, length)
assert end <= length
rois = db[np.logical_not(np.logical_or(db[:,0] >= end, db[:,1] <= start))]
# Remove duration less than min_length
if len(rois) > 0:
duration = rois[:,1] - rois[:,0]
rois = rois[duration >= min_length]
# Remove overlap less than overlap_thresh
if len(rois) > 0:
time_in_wins = (np.minimum(end, rois[:,1]) - np.maximum(start, rois[:,0]))*1.0
overlap = time_in_wins / (rois[:,1] - rois[:,0])
assert min(overlap) >= 0
assert max(overlap) <= 1
rois = rois[overlap >= overlap_thresh]
# Append data
if len(rois) > 0:
rois[:,0] = np.maximum(start, rois[:,0])
rois[:,1] = np.minimum(end, rois[:,1])
tmp = generate_roi(rois, vid, start, end, stride, split)
roidb.append(tmp)
if USE_FLIPPED:
flipped_tmp = copy.deepcopy(tmp)
flipped_tmp['flipped'] = True
roidb.append(flipped_tmp)
# Backward Direction
for end in range(length, win-1, - step):
start = end - win
assert start >= 0
rois = db[np.logical_not(np.logical_or(db[:,0] >= end, db[:,1] <= start))]
# Remove duration less than min_length
if len(rois) > 0:
duration = rois[:,1] - rois[:,0]
rois = rois[duration > min_length]
# Remove overlap less than overlap_thresh
if len(rois) > 0:
time_in_wins = (np.minimum(end, rois[:,1]) - np.maximum(start, rois[:,0]))*1.0
overlap = time_in_wins / (rois[:,1] - rois[:,0])
assert min(overlap) >= 0
assert max(overlap) <= 1
rois = rois[overlap > overlap_thresh]
# Append data
if len(rois) > 0:
rois[:,0] = np.maximum(start, rois[:,0])
rois[:,1] = np.minimum(end, rois[:,1])
tmp = generate_roi(rois, vid, start, end, stride, split)
roidb.append(tmp)
if USE_FLIPPED:
flipped_tmp = copy.deepcopy(tmp)
flipped_tmp['flipped'] = True
roidb.append(flipped_tmp)
return roidb
if __name__ == '__main__':
USE_FLIPPED = True
train_roidb = generate_roidb('val', train_segment)
print (len(train_roidb))
print ("Save dictionary")
pickle.dump(train_roidb, open('train_data_25fps_flipped.pkl','wb'), pickle.HIGHEST_PROTOCOL)
This generates the roidb input data we need. I looked it up: roidb is the data format Faster R-CNN reads, and that article explains it in more detail, which is where I got a rough idea. In short it is just a data-loading format; even without knowing exactly how it works, the loading step is done.
Next, some problems that came up along the way. The first was an error saying torch was installed without CUDA; we brute-forced it by copying everything from the cuda-9.0 directory and making hard links.
Then local testing hit this problem: ImportError: No module named 'numpy.core._multiarray_umath'
Updating numpy to the latest version with pip install -U numpy in the terminal fixes it.
Then run
python train_net.py
Let's go straight to the results; the problems above all surfaced while running this training step.
Finally, training runs and model checkpoints are saved.
Here we can swap in our own network directly: the author only provides four networks (c3d, i3d, res34, res50), and the rest we can add ourselves to see how they do.
A remaining issue is that training uses a lot of GPU memory: with the default batch size of 1, my single local GTX 1070 already uses about 6000 MB; tuning that is left for later.
In any case, the first step went reasonably well; this work will continue and updates will follow.
Because the dataset is large, after discussing with my senior labmate we took 5 classes for a trial training run, picking 5 videos per class, plus one Ambiguous class, for 6 classes in total.
From Figure 11 I learned that in train_net.py the final loss is the sum of four losses, because there are two sub-networks, the proposal subnet and the classification subnet, and each subnet contributes two losses that constrain the final result.
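The four losses can be read off the return values of tdcnn.py's forward() shown later; a minimal sketch of how a training step would combine them (the variable names follow tdcnn.py, while `model` and the .mean() reduction are my assumptions about train_net.py rather than a copy of it):
# Sketch only: summing the two proposal-subnet losses and the two classification-subnet losses.
rois, cls_prob, twin_pred, rpn_loss_cls, rpn_loss_twin, \
    RCNN_loss_cls, RCNN_loss_twin, rois_label = model(video_data, gt_twins)
loss = rpn_loss_cls.mean() + rpn_loss_twin.mean() \
       + RCNN_loss_cls.mean() + RCNN_loss_twin.mean()
loss.backward()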
A CUDA error (59) appeared; it turned out to be a label problem, fixed by numbering the labels from 1 up to the last class.
Evaluation is not finished yet; it needs to pull the required pieces out of the log, so I need to figure out how test.log should be saved and in what format (definitely .txt).
Then we simply dump the output with the script and generate the json from the resulting log. This PyTorch version of R-C3D is still a bit rough and incomplete. A quick look at the log_analysis file shows it picks out keywords from the log to generate the json file we want, so it should not be a big problem; after waiting half an hour for the log to print we can evaluate. For evaluating the json we can just use the Python version of the evaluation file (the original uses the MATLAB version); this should be doable this afternoon.
With the problems above resolved, training on the THUMOS2014 dataset has now officially started. Let's first see whether the results can be reproduced; if so there is no big problem. The training results are expected in roughly two and a half days.
The preprocessing pipeline
I went over the data preprocessing again. Since training reads data in the roidb format, I studied it, taking hints from the Faster R-CNN counterpart, to work out what each key means; a sketch of the keys is given right below.
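Reading generate_roi() in generate_roidb_training.py above, the keys of a single roidb entry and their meanings are roughly as follows (the numeric values and the video name are illustrative only):
import numpy as np

roidb_entry = {
    'wins':         np.array([[10., 140.]]),    # segment boundaries relative to the window start, divided by the stride
    'durations':    np.array([130.]),           # wins[:, 1] - wins[:, 0]
    'gt_classes':   np.array([7]),              # ground-truth class id of each segment
    'max_classes':  np.array([7]),              # same as gt_classes for ground-truth rois
    'max_overlaps': np.array([1.]),             # ground-truth rois get overlap 1.0
    'flipped':      False,                      # the flipped copy (USE_FLIPPED) sets this to True
    'frames':       np.array([[0, 0, 768, 1]]), # [flag, window start frame, window end frame, stride]
    'bg_name':      '/home/simon/THUMOS14/val/video_validation_0000051',  # made-up frame folder
    'fg_name':      '/home/simon/THUMOS14/val/video_validation_0000051',
}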
The K for the anchors here is set to 4, though it can be changed freely; the paper says it provides the 10 choices 2, 4, 5, 6, 8, 9, 10, 12, 14, 16, but in practice not all of them are used.
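K here is simply the number of anchor scales; in rpn.py (shown later) the sizes of the two RPN output convolutions follow directly from it. A small sketch, assuming the scales are configured via cfg.ANCHOR_SCALES:
ANCHOR_SCALES = [2, 4, 5, 6, 8, 9, 10, 12, 14, 16]   # the ten choices listed in the paper
K = len(ANCHOR_SCALES)
nc_score_out = K * 2   # 2 (bg/fg) scores per anchor scale -> channels of RPN_cls_score
nc_twin_out  = K * 2   # 2 boundary offsets per anchor scale -> channels of RPN_twin_pred
print(nc_score_out, nc_twin_out)                      # 20 20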
Current difficulties and parts that could be optimized
Going through everything from the start, the main difficulty is whether different datasets can share the same processing. In this version, generate_frames handles the three datasets in essentially the same way apart from the sampling rate, basically following the ActivityNet-style processing. By the reasoning that the broader case constrains the narrower one, this should be fine; we can wait for the evaluation results and revise the processing if it turns out to be inadequate.
For resnet50, I used the Kinetics-pretrained model from the earlier 3D-ResNet50-Pytorch work, and to my surprise it slotted right in; presumably only the weights of each resnet50 layer are taken, which avoids the loss coming out as NaN (as happens when no weights are loaded).
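That "only taking each layer's weights" is the selective state_dict loading pattern also used in c3d.py's _init_modules() below; a minimal sketch, where the model class and checkpoint path are placeholders:
import torch

model = resnet50_3d()                                  # placeholder: some 3D-ResNet50 instance
state_dict = torch.load('kinetics_resnet50_3d.pth')    # placeholder checkpoint path
# keep only the pretrained weights whose keys exist in our model, drop the rest
model.load_state_dict({k: v for k, v in state_dict.items() if k in model.state_dict()})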
2019.4.22.
The task was actually restarted a week ago. The base project turns out to be quite polished; it is just that the numbers are not great, but the richness of the engineering is worth learning from.
I still have not read the Faster R-CNN paper, but the fundamentals here follow it, so I will read it carefully to understand it better.
Understanding the details from the code
We now dig into the details of each part of the network, starting with the C3D part.
First, the code of c3d.py:
import torch.nn as nn
import torch
from model.tdcnn.tdcnn import _TDCNN
import math
def make_layers(cfg, batch_norm=False):
layers = []
in_channels = 3
maxpool_count = 0
for v in cfg:
if v == 'M':
maxpool_count += 1
if maxpool_count==1:
layers += [nn.MaxPool3d(kernel_size=(1,2,2), stride=(1,2,2))]
elif maxpool_count==5:
layers += [nn.MaxPool3d(kernel_size=(2,2,2), stride=(2,2,2), padding=(0,1,1))]
else:
layers += [nn.MaxPool3d(kernel_size=(2,2,2), stride=(2,2,2))]
else:
conv3d = nn.Conv3d(in_channels, v, kernel_size=(3,3,3), padding=(1,1,1))
if batch_norm:
layers += [conv3d, nn.BatchNorm3d(v), nn.ReLU(inplace=True)]
else:
layers += [conv3d, nn.ReLU(inplace=True)]
in_channels = v
return nn.Sequential(*layers)
cfg = {
'A': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
}  # The C3D config: make_layers above walks over the entries and inserts a MaxPool3d wherever the 'M' marker appears; the first and the last pooling layers get special kernel/stride/padding settings.
class C3D(nn.Module):  # a standard C3D model assembled with make_layers above; presumably adapted from an existing implementation
"""
The C3D network as described in [1].
References
----------
[1] Tran, Du, et al. "Learning spatiotemporal features with 3d convolutional networks."
Proceedings of the IEEE international conference on computer vision. 2015.
"""
def _initialize_weights(self):
for m in self.modules():
if isinstance(m, nn.Conv3d):
n = m.kernel_size[0] * m.kernel_size[1] * m.kernel_size[2] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
if m.bias is not None:
m.bias.data.zero_()
elif isinstance(m, nn.BatchNorm3d):
m.weight.data.fill_(1)
m.bias.data.zero_()
elif isinstance(m, nn.Linear):
m.weight.data.normal_(0, 0.01)
m.bias.data.zero_()
def __init__(self):
super(C3D, self).__init__()
self.features = make_layers(cfg['A'], batch_norm=False)
self.classifier = nn.Sequential(
nn.Linear(512*1*4*4, 4096),
nn.ReLU(True),
nn.Dropout(inplace=False),
nn.Linear(4096, 4096),
nn.ReLU(True),
nn.Dropout(inplace=False),
            nn.Linear(4096, 487),  # 487 can be changed; ideally it would come from a num_classes argument
)
self._initialize_weights()
def forward(self, x):
        x = self.features(x)    # feature-extraction layers built by make_layers above
        x = x.view(x.size(0), -1)
        x = self.classifier(x)  # classification head; this is only the backbone, the detection num_classes is supplied by the tdcnn part below via the cfg file
return x
class c3d_tdcnn(_TDCNN):
def __init__(self, pretrained=False):
self.model_path = 'data/pretrained_model/activitynet_iter_30000_3fps-caffe.pth' #ucf101-caffe.pth' #c3d_sports1M.pth' #activitynet_iter_30000_3fps-caffe.pth
        self.dout_base_model = 512  # after the C3D trunk the feature map is 512 x L/8 x H/16 x W/16, so 512 channels are passed on to the RPN
self.pretrained = pretrained
_TDCNN.__init__(self)
def _init_modules(self):
        c3d = C3D()  # build the C3D network first; it is used as the feature extractor
        if self.pretrained:  # reading the code together with the paper: the pretrained weights are loaded straight into this C3D network
print("Loading pretrained weights from %s" %(self.model_path))
state_dict = torch.load(self.model_path)
c3d.load_state_dict({k:v for k,v in state_dict.items() if k in c3d.state_dict()})
        # Using conv1 -> conv5b, not using the last maxpool (the matching module values are chained together below)
        self.RCNN_base = nn.Sequential(*list(c3d.features._modules.values())[:-1])  # take every matching layer, dropping the final maxpool
        # Using fc6
        self.RCNN_top = nn.Sequential(*list(c3d.classifier._modules.values())[:-4])  # keep only nn.Linear(512*4*4, 4096) with its ReLU and Dropout as the classification head
# Fix the layers before pool2:
for layer in range(6):
            for p in self.RCNN_base[layer].parameters(): p.requires_grad = False  # freeze these early layers; no gradients are computed for them
        # not using the last maxpool layer
        self.RCNN_cls_score = nn.Linear(4096, self.n_classes)  # the two detection heads of the tdcnn part
        self.RCNN_twin_pred = nn.Linear(4096, 2 * self.n_classes)
    def _head_to_tail(self, pool5):  # from pooled RoI features to the fc head
        pool5_flat = pool5.view(pool5.size(0), -1)  # these pieces all serve the tdcnn part
fc6 = self.RCNN_top(pool5_flat)
return fc6
Below is the code of tdcnn.py, i.e. the part carried over from Faster R-CNN; as I understand it, this tdcnn is what extracts the proposals.
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torchvision.models as models
from torch.autograd import Variable
import numpy as np
from model.utils.config import cfg
from model.rpn.rpn import _RPN
from model.roi_temporal_pooling.modules.roi_temporal_pool import _RoITemporalPooling
from model.rpn.proposal_target_layer_cascade import _ProposalTargetLayer
import time
import pdb
from model.utils.net_utils import _smooth_l1_loss
from model.utils.non_local_dot_product import NONLocalBlock3D
DEBUG = False
class _TDCNN(nn.Module):
""" faster RCNN """
def __init__(self):
super(_TDCNN, self).__init__()
#self.classes = classes
self.n_classes = cfg.NUM_CLASSES
# loss
self.RCNN_loss_cls = 0
self.RCNN_loss_twin = 0
# define rpn
        self.RCNN_rpn = _RPN(self.dout_base_model)  # the base feature map is fed into the RPN
        self.RCNN_proposal_target = _ProposalTargetLayer(self.n_classes)
        self.RCNN_roi_temporal_pool = _RoITemporalPooling(cfg.POOLING_LENGTH, cfg.POOLING_HEIGHT, cfg.POOLING_WIDTH, cfg.DEDUP_TWINS)
        if cfg.USE_ATTENTION:  # optionally plug in the non-local block
self.RCNN_attention = NONLocalBlock3D(self.dout_base_model, inter_channels=self.dout_base_model)
def prepare_data(self, video_data):
return video_data
def forward(self, video_data, gt_twins):
batch_size = video_data.size(0)
gt_twins = gt_twins.data
# prepare data
video_data = self.prepare_data(video_data)
# feed image data to base model to obtain base feature map
base_feat = self.RCNN_base(video_data)
# feed base feature map tp RPN to obtain rois
# rois, [rois_score], rpn_cls_prob, rpn_twin_pred, self.rpn_loss_cls, self.rpn_loss_twin, self.rpn_label, self.rpn_loss_mask
rois, _, _, rpn_loss_cls, rpn_loss_twin, _, _ = self.RCNN_rpn(base_feat, gt_twins)
# if it is training phase, then use ground truth twins for refining
if self.training:
roi_data = self.RCNN_proposal_target(rois, gt_twins)
rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data
rois_label = Variable(rois_label.view(-1).long())
rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2)))
rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2)))
else:
rois_label = None
rois_target = None
rois_inside_ws = None
rois_outside_ws = None
rpn_loss_cls = 0
rpn_loss_twin = 0
rois = Variable(rois)
# do roi pooling based on predicted rois
if cfg.POOLING_MODE == 'pool':
pooled_feat = self.RCNN_roi_temporal_pool(base_feat, rois.view(-1,3))
if cfg.USE_ATTENTION:
pooled_feat = self.RCNN_attention(pooled_feat)
# feed pooled features to top model
pooled_feat = self._head_to_tail(pooled_feat)
# compute twin offset, twin_pred will be (128, 402)
twin_pred = self.RCNN_twin_pred(pooled_feat)
if self.training:
# select the corresponding columns according to roi labels, twin_pred will be (128, 2)
twin_pred_view = twin_pred.view(twin_pred.size(0), int(twin_pred.size(1) / 2), 2)
twin_pred_select = torch.gather(twin_pred_view, 1, rois_label.view(rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 2))
twin_pred = twin_pred_select.squeeze(1)
# compute object classification probability
cls_score = self.RCNN_cls_score(pooled_feat)
cls_prob = F.softmax(cls_score, dim=1)
if DEBUG:
print("tdcnn.py--base_feat.shape {}".format(base_feat.shape))
print("tdcnn.py--rois.shape {}".format(rois.shape))
print("tdcnn.py--tdcnn_tail.shape {}".format(pooled_feat.shape))
print("tdcnn.py--cls_score.shape {}".format(cls_score.shape))
print("tdcnn.py--twin_pred.shape {}".format(twin_pred.shape))
RCNN_loss_cls = 0
RCNN_loss_twin = 0
if self.training:
# classification loss
RCNN_loss_cls = F.cross_entropy(cls_score, rois_label)
# bounding box regression L1 loss
RCNN_loss_twin = _smooth_l1_loss(twin_pred, rois_target, rois_inside_ws, rois_outside_ws)
# RuntimeError caused by mGPUs and higher pytorch version: https://github.com/jwyang/faster-rcnn.pytorch/issues/226
rpn_loss_cls = torch.unsqueeze(rpn_loss_cls, 0)
rpn_loss_twin = torch.unsqueeze(rpn_loss_twin, 0)
RCNN_loss_cls = torch.unsqueeze(RCNN_loss_cls, 0)
RCNN_loss_twin = torch.unsqueeze(RCNN_loss_twin, 0)
cls_prob = cls_prob.view(batch_size, rois.size(1), -1)
twin_pred = twin_pred.view(batch_size, rois.size(1), -1)
if self.training:
return rois, cls_prob, twin_pred, rpn_loss_cls, rpn_loss_twin, RCNN_loss_cls, RCNN_loss_twin, rois_label
else:
return rois, cls_prob, twin_pred
def _init_weights(self):
def normal_init(m, mean, stddev, truncated=False):
"""
weight initalizer: truncated normal and random normal.
"""
# x is a parameter
if truncated:
m.weight.data.normal_().fmod_(2).mul_(stddev).add_(mean) # not a perfect approximation
else:
m.weight.data.normal_(mean, stddev)
m.bias.data.zero_()
self.RCNN_rpn.init_weights()
normal_init(self.RCNN_cls_score, 0, 0.01, cfg.TRAIN.TRUNCATED)
normal_init(self.RCNN_twin_pred, 0, 0.001, cfg.TRAIN.TRUNCATED)
def create_architecture(self):
self._init_modules()
self._init_weights()
The code above is really the combination of the two parts: the C3D base network extracts the features, and the TDCNN part learns on top of them, with the matching weights written into tdcnn for proposal extraction. Looking further down, the RPN has to be run through first.
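Condensing the forward pass of _TDCNN above into a short sketch (a paraphrase of forward() for reading, inference path only; `net` stands for a constructed c3d_tdcnn model):
import torch.nn.functional as F

def tdcnn_forward_sketch(net, video_data, gt_twins):
    """Condensed paraphrase of _TDCNN.forward() above (inference path, losses omitted)."""
    base_feat = net.RCNN_base(video_data)                                   # C3D trunk extracts the feature map
    rois = net.RCNN_rpn(base_feat, gt_twins)[0]                             # RPN turns the features into temporal proposals
    pooled_feat = net.RCNN_roi_temporal_pool(base_feat, rois.view(-1, 3))   # temporal RoI pooling over each proposal
    pooled_feat = net._head_to_tail(pooled_feat)                            # backbone fc head (fc6)
    cls_prob = F.softmax(net.RCNN_cls_score(pooled_feat), dim=1)            # activity classification per proposal
    twin_pred = net.RCNN_twin_pred(pooled_feat)                             # start/end boundary regression
    return rois, cls_prob, twin_pred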
The code of rpn.py is as follows.
from __future__ import absolute_import
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from model.utils.config import cfg
from .proposal_layer import _ProposalLayer
from .anchor_target_layer import _AnchorTargetLayer
from model.utils.net_utils import _smooth_l1_loss, mask_rpn_losses
import numpy as np
import math
import pdb
import time
DEBUG=False
class _RPN(nn.Module):
""" region proposal network """
    def __init__(self, din, out_scores=False):  # the RPN is a self-contained sub-network, a natural entry point for later modifications
super(_RPN, self).__init__()
self.din = din # get depth of input feature map, e.g., 512
        self.anchor_scales = cfg.ANCHOR_SCALES
        self.feat_stride = cfg.FEAT_STRIDE[0]
        self.out_scores = out_scores  # whether to also return proposal scores
        self.mask_upsample_rate = 1
        # define the convrelu layers processing input feature map
        self.RPN_Conv1 = nn.Conv3d(self.din, 512, kernel_size=(3, 3, 3), stride=(1, 2, 2), padding=(1, 1, 1), bias=True)  # for C3D one conv plus the pooling below suffices; the second conv is meant for deeper backbones such as the ResNets
self.RPN_Conv2 = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 2, 2), padding=(1, 1, 1), bias=True)
self.RPN_output_pool = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2))
# define bg/fg classifcation score layer
        self.nc_score_out = len(self.anchor_scales) * 2 # 2(bg/fg) * 10 (anchors): a background/foreground score per anchor
        self.RPN_cls_score = nn.Conv3d(512, self.nc_score_out, 1, 1, 0)  # 1x1x1 conv that separates background from foreground
# define anchor twin offset prediction layer
self.nc_twin_out = len(self.anchor_scales) * 2 # 2(coords) * 10 (anchors)
self.RPN_twin_pred = nn.Conv3d(512, self.nc_twin_out, 1, 1, 0)
# define proposal layer
        self.RPN_proposal = _ProposalLayer(self.feat_stride, self.anchor_scales, self.out_scores)  # proposal generation: each proposal later gets a classification and a boundary regression, roughly as the paper describes
        # define anchor target layer
        self.RPN_anchor_target = _AnchorTargetLayer(self.feat_stride, self.anchor_scales)  # assigns fg/bg labels and regression targets to anchors; see forward() below
self.rpn_loss_cls = 0
self.rpn_loss_twin = 0
self.rpn_loss_mask = 0
@staticmethod
def reshape(x, d):##reshape
input_shape = x.size()
x = x.view(
input_shape[0],
int(d),
int(float(input_shape[1] * input_shape[2]) / float(d)),
input_shape[3],
input_shape[4]
)
return x
def forward(self, base_feat, gt_twins):
batch_size = base_feat.size(0)
# return feature map after convrelu layer
rpn_conv1 = F.relu(self.RPN_Conv1(base_feat), inplace=True)
rpn_conv2 = F.relu(self.RPN_Conv2(rpn_conv1), inplace=True)
rpn_output_pool = self.RPN_output_pool(rpn_conv2) # (1,512,96,1,1)
# get rpn classification score
rpn_cls_score = self.RPN_cls_score(rpn_output_pool)
        rpn_cls_score_reshape = self.reshape(rpn_cls_score, 2)  # uses the reshape helper defined above
#print("rpn_cls_score_reshape: {}".format(rpn_cls_score_reshape.shape))
rpn_cls_prob_reshape = F.softmax(rpn_cls_score_reshape, dim=1)
rpn_cls_prob = self.reshape(rpn_cls_prob_reshape, self.nc_score_out)
#print("rpn_cls_prob: {}".format(rpn_cls_prob.shape))
# get rpn offsets to the anchor twins
rpn_twin_pred = self.RPN_twin_pred(rpn_output_pool)
#print("rpn_twin_pred: {}".format(rpn_twin_pred.shape))
# proposal layer
cfg_key = 'TRAIN' if self.training else 'TEST'
#rois = self.RPN_proposal((rpn_cls_prob.data, rpn_twin_pred.data, cfg_key))
if self.out_scores:
rois, rois_score = self.RPN_proposal((rpn_cls_prob.data, rpn_twin_pred.data, cfg_key))
else:
rois = self.RPN_proposal((rpn_cls_prob.data, rpn_twin_pred.data, cfg_key))
self.rpn_loss_cls = 0
self.rpn_loss_twin = 0
self.rpn_loss_mask = 0
self.rpn_label = None
# generating training labels and build the rpn loss
if self.training:
assert gt_twins is not None
# rpn_data = [label_targets, twin_targets, twin_inside_weights, twin_outside_weights]
# label_targets: (batch_size, 1, A * length, height, width)
# twin_targets: (batch_size, A*2, length, height, width), the same as twin_inside_weights and twin_outside_weights
rpn_data = self.RPN_anchor_target((rpn_cls_score.data, gt_twins))
# compute classification loss
rpn_cls_score = rpn_cls_score_reshape.permute(0, 2, 3, 4, 1).contiguous().view(batch_size, -1, 2)
self.rpn_label = rpn_data[0].view(batch_size, -1)
rpn_keep = Variable(self.rpn_label.view(-1).ne(-1).nonzero().view(-1))
rpn_cls_score = torch.index_select(rpn_cls_score.view(-1,2), 0, rpn_keep)
self.rpn_label = torch.index_select(self.rpn_label.view(-1), 0, rpn_keep.data)
self.rpn_label = Variable(self.rpn_label.long())
self.rpn_loss_cls = F.cross_entropy(rpn_cls_score, self.rpn_label)
fg_cnt = torch.sum(self.rpn_label.data.ne(0))
rpn_twin_targets, rpn_twin_inside_weights, rpn_twin_outside_weights = rpn_data[1:]
# compute twin regression loss
rpn_twin_inside_weights = Variable(rpn_twin_inside_weights)
rpn_twin_outside_weights = Variable(rpn_twin_outside_weights)
rpn_twin_targets = Variable(rpn_twin_targets)
self.rpn_loss_twin = _smooth_l1_loss(rpn_twin_pred, rpn_twin_targets, rpn_twin_inside_weights,
rpn_twin_outside_weights, sigma=3, dim=[1,2,3,4])
if self.out_scores:
return rois, rois_score, rpn_cls_prob, rpn_twin_pred, self.rpn_loss_cls, self.rpn_loss_twin, self.rpn_label, self.rpn_loss_mask
else:
return rois, rpn_cls_prob, rpn_twin_pred, self.rpn_loss_cls, self.rpn_loss_twin, self.rpn_label, self.rpn_loss_mask
def init_weights(self):
def normal_init(m, mean, stddev, truncated=False):
"""
weight initalizer: truncated normal and random normal.
"""
# x is a parameter
if truncated:
m.weight.data.normal_().fmod_(2).mul_(stddev).add_(mean) # not a perfect approximation
else:
m.weight.data.normal_(mean, stddev)
m.bias.data.zero_()
normal_init(self.RPN_Conv1, 0, 0.01, cfg.TRAIN.TRUNCATED)
normal_init(self.RPN_Conv2, 0, 0.01, cfg.TRAIN.TRUNCATED)
normal_init(self.RPN_cls_score, 0, 0.01, cfg.TRAIN.TRUNCATED)
normal_init(self.RPN_twin_pred, 0, 0.01, cfg.TRAIN.TRUNCATED)
def create_architecture(self):
self._init_modules()
self.init_weights()
def generate_mask_label(self, gt_twins, feat_len):
"""
gt_twins will be (batch_size, n, 3), where each gt will be (x1, x2, class_id)
# feat_len is the length of mask-task features, self.feat_stride * feat_len = video_len
# according: self.feat_stride, and upsample_rate
# mask will be (batch_size, feat_len), -1 -- ignore, 1 -- fg, 0 -- bg
"""
batch_size = gt_twins.size(0)
mask_label = torch.zeros(batch_size, feat_len).type_as(gt_twins)
for b in range(batch_size):
single_gt_twins = gt_twins[b]
single_gt_twins[:, :2] = (single_gt_twins[:, :2] / self.feat_stride).int()
twins_start = single_gt_twins[:, 0]
_, indices = torch.sort(twins_start)
single_gt_twins = torch.index_select(single_gt_twins, 0, indices).long().cpu().numpy()
starts = np.minimum(np.maximum(0, single_gt_twins[:,0]), feat_len-1)
ends = np.minimum(np.maximum(0, single_gt_twins[:,1]), feat_len)
for x in zip(starts, ends):
mask_label[b, x[0]:x[1]+1] = 1
return mask_label
The proposal layer involved here also needs a look; its overall job is to generate proposals. What exactly is a proposal? For now, think of it as a temporal segment with a certain length.
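Before the code, a concrete picture of what a "proposal" is here: as the comments in _ProposalLayer below state, each RoI is a 3-tuple (batch index, x1, x2) describing a temporal window in frame units. An illustrative tensor (values made up):
import torch

# The proposal layer below outputs rois of shape (batch_size, post_nms_topN, 3);
# each row is (batch_index, x1, x2), i.e. a temporal segment in frame coordinates.
rois = torch.tensor([[[0., 128., 640.],    # clip 0, a candidate action spanning frames 128-640
                      [0., 256., 384.]]])  # a second, shorter candidate in the same clip
print(rois.shape)                          # torch.Size([1, 2, 3])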
from __future__ import absolute_import
# --------------------------------------------------------
# R-C3D
# Copyright (c) 2017 Boston University
# Licensed under The MIT License [see LICENSE for details]
# Written by Huijuan Xu
# --------------------------------------------------------
# --------------------------------------------------------
# Reorganized and modified by Shiguang Wang
# --------------------------------------------------------
import torch
import torch.nn as nn
import numpy as np
import math
import yaml
from model.utils.config import cfg
from .generate_anchors import generate_anchors
from .twin_transform import twin_transform_inv, clip_twins
from model.nms.nms_wrapper import nms
import pdb
DEBUG = False
class _ProposalLayer(nn.Module):
"""
Outputs object detection proposals by applying estimated bounding-box
transformations to a set of regular twins (called "anchors").
"""
def __init__(self, feat_stride, scales, out_scores=False):
super(_ProposalLayer, self).__init__()
self._feat_stride = feat_stride
        self._anchors = torch.from_numpy(generate_anchors(base_size=feat_stride, scales=np.array(scales))).float()  # generate the base anchors
self._num_anchors = self._anchors.size(0)
self._out_scores = out_scores
# TODO: add scale_ratio for video_len ??
# rois blob: holds R regions of interest, each is a 3-tuple
# (n, x1, x2) specifying an video batch index n and a
# rectangle (x1, x2)
# top[0].reshape(1, 3)
#
# # scores blob: holds scores for R regions of interest
# if len(top) > 1:
# top[1].reshape(1, 1, 1, 1)
def forward(self, input):
# Algorithm:
#
# for each (H, W) location i
# generate A anchor twins centered on cell i
# apply predicted twin deltas at cell i to each of the A anchors
# clip predicted twins to video
# remove predicted twins with either height or width < threshold
# sort all (proposal, score) pairs by score from highest to lowest
# take top pre_nms_topN proposals before NMS
# apply NMS with threshold 0.7 to remaining proposals
# take after_nms_topN proposals after NMS
# return the top proposals (-> RoIs top, scores top)
# the first set of _num_anchors channels are bg probs
# the second set are the fg probs
scores = input[0][:, self._num_anchors:, :, :, :]
twin_deltas = input[1]
cfg_key = input[2]
pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N
post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
nms_thresh = cfg[cfg_key].RPN_NMS_THRESH
min_size = cfg[cfg_key].RPN_MIN_SIZE
# 1. Generate proposals from twin deltas and shifted anchors
length, height, width = scores.shape[-3:]
if DEBUG:
print( 'score map size: {}'.format(scores.shape))
batch_size = twin_deltas.size(0)
# Enumerate all shifts
shifts = np.arange(0, length) * self._feat_stride
shifts = torch.from_numpy(shifts.astype(float))
shifts = shifts.contiguous().type_as(scores)
# Enumerate all shifted anchors:
#
# add A anchors (1, A, 2) to
# cell K shifts (K, 1, 1) to get
# shift anchors (K, A, 2)
# reshape to (1, K*A, 2) shifted anchors
# expand to (batch_size, K*A, 2)
A = self._num_anchors
K = shifts.shape[0]
self._anchors = self._anchors.type_as(scores)
anchors = self._anchors.view(1, A, 2) + shifts.view(K, 1, 1)
anchors = anchors.view(1, K * A, 2).expand(batch_size, K * A, 2)
# Transpose and reshape predicted twin transformations to get them
# into the same order as the anchors:
#
# twin deltas will be (batch_size, 2 * A, L, H, W) format
# transpose to (batch_size, L, H, W, 2 * A)
# reshape to (batch_size, L * H * W * A, 2) where rows are ordered by (l, h, w, a)
# in slowest to fastest order
twin_deltas = twin_deltas.permute(0, 2, 3, 4, 1).contiguous()
twin_deltas = twin_deltas.view(batch_size, -1, 2)
# Same story for the scores:
#
# scores are (batch_size, A, L, H, W) format
# transpose to (batch_size, L, H, W, A)
# reshape to (batch_size, L * H * W * A) where rows are ordered by (l, h, w, a)
scores = scores.permute(0, 2, 3, 4, 1).contiguous()
scores = scores.view(batch_size, -1)
# Convert anchors into proposals via twin transformations
proposals = twin_transform_inv(anchors, twin_deltas, batch_size)
# 2. clip predicted wins to video
proposals = clip_twins(proposals, length * self._feat_stride, batch_size)
# 3. remove predicted twins with either length < threshold
# assign the score to 0 if it's non keep.
no_keep = self._filter_twins_reverse(proposals, min_size)
scores[no_keep] = 0
scores_keep = scores
proposals_keep = proposals
# sorted in descending order
_, order = torch.sort(scores_keep, 1, True)
#print ("scores_keep {}".format(scores_keep.shape))
#print ("proposals_keep {}".format(proposals_keep.shape))
#print ("order {}".format(order.shape))
output = scores.new(batch_size, post_nms_topN, 3).zero_()
if self._out_scores:
output_score = scores.new(batch_size, post_nms_topN, 2).zero_()
for i in range(batch_size):
proposals_single = proposals_keep[i]
scores_single = scores_keep[i]
# 4. sort all (proposal, score) pairs by score from highest to lowest
# 5. take top pre_nms_topN (e.g. 6000)
order_single = order[i]
if pre_nms_topN > 0 and pre_nms_topN < scores_keep.numel():
order_single = order_single[:pre_nms_topN]
proposals_single = proposals_single[order_single, :]
scores_single = scores_single[order_single].view(-1,1)
# 6. apply nms (e.g. threshold = 0.7)
# 7. take after_nms_topN (e.g. 300)
# 8. return the top proposals (-> RoIs top)
keep_idx_i = nms(torch.cat((proposals_single, scores_single), 1), nms_thresh, force_cpu=not cfg.USE_GPU_NMS)
keep_idx_i = keep_idx_i.long().view(-1)
if post_nms_topN > 0:
keep_idx_i = keep_idx_i[:post_nms_topN]
proposals_single = proposals_single[keep_idx_i, :]
scores_single = scores_single[keep_idx_i, :]
# padding 0 at the end.
num_proposal = proposals_single.size(0)
#print ("num_proposal: ", num_proposal)
output[i,:,0] = i
output[i,:num_proposal,1:] = proposals_single
if self._out_scores:
output_score[i, :, 0] = i
output_score[i, :num_proposal, 1] = scores_single
if self._out_scores:
return output, output_score
else:
return output
def backward(self, top, propagate_down, bottom):
"""This layer does not propagate gradients."""
pass
def reshape(self, bottom, top):
"""Reshaping happens during the call to forward."""
pass
def _filter_twins_reverse(self, twins, min_size):
"""get the keep index of all twins with length smaller than min_size.
twins will be (batch_size, C, 2), keep will be (batch_size, C)"""
ls = twins[:, :, 1] - twins[:, :, 0] + 1
no_keep = (ls < min_size)
return no_keep
The generate_anchors used inside the proposal layer also deserves a close look.
# --------------------------------------------------------
# R-C3D
# Copyright (c) 2017 Boston University
# Licensed under The MIT License [see LICENSE for details]
# Written by Huijuan Xu
# --------------------------------------------------------
import numpy as np
import pdb
def generate_anchors(base_size=8, scales=2**np.arange(3, 6)):
    # Build a base sliding window of length base_size and enumerate its scaled versions,
    # giving the candidate anchors picked from later. (np.arange returns a fixed-step
    # sequence with a start and an end, e.g. [1, 2, 3, 4, 5] with start 1, end 5, step 1;
    # with two arguments the first is the start, the second the end, and the step defaults to 1.)
"""
Generate anchor (reference) windows by enumerating aspect
scales wrt a reference (0, 7) window.
"""
    base_anchor = np.array([1, base_size]) - 1  # base_size is 8 here, so the base window covers 8 frames (indices 0-7)
    #print('base_anchor = ',base_anchor)
    anchors = _scale_enum(base_anchor, scales)  # one anchor window is generated per scale
return anchors
def _whctrs(anchor):
"""
Return width, height, x center, and y center for an anchor (window).
"""
l = anchor[1] - anchor[0] + 1
x_ctr = anchor[0] + 0.5 * (l - 1)
return l, x_ctr
def _mkanchors(ls, x_ctr):
"""
    Given a vector of lengths (ls) around a center
(x_ctr), output a set of anchors (windows).
"""
    ls = ls[:, np.newaxis]  # np.newaxis inserts a new axis, turning the length vector into a column (e.g. 10x1); the lengths have already been multiplied by the scales, one candidate duration per scale
    anchors = np.hstack((x_ctr - 0.5 * (ls - 1),
                         x_ctr + 0.5 * (ls - 1)))  # np.hstack() stacks horizontally, producing an (N, 2) array of [start, end] windows
return anchors
def _scale_enum(anchor, scales):
"""
Enumerate a set of anchors for each scale wrt an anchor.
"""
    l, x_ctr = _whctrs(anchor)  # the incoming base anchor spans indices 0-7, i.e. length 8, with center x_ctr = 3.5 ((0+7)/2)
ls = l * scales
anchors = _mkanchors(ls, x_ctr)
return anchors
if __name__ == '__main__':
import time
t = time.time()
a = generate_anchors(scales=np.array([2, 4, 5, 6, 8, 9, 10, 12, 14, 16]))
print (time.time() - t)
print (a)
from IPython import embed; embed()
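A quick worked example of what these functions return (the import path is an assumption; in the repo this file sits alongside the RPN code):
import numpy as np
from generate_anchors import generate_anchors   # assuming the file above is importable like this

# base window is [0, 7] (length 8, center 3.5); scale 2 -> length 16, scale 4 -> length 32
print(generate_anchors(base_size=8, scales=np.array([2, 4])))
# [[ -4.  11.]
#  [-12.  19.]]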
The targets for the classification network are generated by the anchor_target_layer:
from __future__ import absolute_import
# --------------------------------------------------------
# R-C3D
# Copyright (c) 2017 Boston University
# Licensed under The MIT License [see LICENSE for details]
# Written by Huijuan Xu
# --------------------------------------------------------
# --------------------------------------------------------
# Reorganized and modified by Shiguang Wang
# --------------------------------------------------------
import torch
import torch.nn as nn
import numpy as np
import numpy.random as npr
from model.utils.config import cfg
from .generate_anchors import generate_anchors
from .twin_transform import clip_twins, twins_overlaps_batch, twin_transform_batch
import pdb
DEBUG = False
try:
long # Python 2
except NameError:
    long = int  # Python 3; keeps the interface working across Python versions
class _AnchorTargetLayer(nn.Module):
"""
Assign anchors to ground-truth targets. Produces anchor classification
labels and bounding-box regression targets.
"""
def __init__(self, feat_stride, scales):
super(_AnchorTargetLayer, self).__init__()
        self._feat_stride = feat_stride  # the feature stride, 8 here
        self._anchors = torch.from_numpy(generate_anchors(base_size=feat_stride, scales=np.array(scales))).float()  # judging from the code this really does start from the generated anchors, so the description in the paper seems off
        self._num_anchors = self._anchors.size(0)
        # allow boxes to sit over the edge by a small amount
        self._allowed_border = 0  # default is 0; how far anchors may stick out over the clip boundary
def forward(self, input):
# Algorithm:
#
# for each (H, W) location i
# generate 9 anchor boxes centered on cell i
# apply predicted twin deltas at cell i to each of the 9 anchors
# filter out-of-video anchors
# measure GT overlap
        rpn_cls_score = input[0]  # the predicted classification scores
# GT boxes (batch_size, n, 3), each row of gt box contains (x1, x2, label)
gt_twins = input[1]
#im_info = input[2]
#num_boxes = input[2]
batch_size = gt_twins.size(0)
# map of shape (..., L, H, W)
length, height, width = rpn_cls_score.shape[-3:]
# Enumerate all shifts
shifts = np.arange(0, length) * self._feat_stride
shifts = torch.from_numpy(shifts.astype(float))
shifts = shifts.contiguous().type_as(rpn_cls_score)
# Enumerate all shifted anchors:
#
# add A anchors (1, A, 2) to
# cell K shifts (K, 1, 1) to get
# shift anchors (K, A, 2)
# reshape to (K*A, 2) shifted anchors
A = self._num_anchors
K = shifts.shape[0]
self._anchors = self._anchors.type_as(rpn_cls_score) # move to specific context
all_anchors = self._anchors.view((1, A, 2)) + shifts.view(K, 1, 1)
all_anchors = all_anchors.view(K * A, 2)
total_anchors = int(K * A)
keep = ((all_anchors[:, 0] >= -self._allowed_border) &
(all_anchors[:, 1] < long(length * self._feat_stride) + self._allowed_border))
inds_inside = torch.nonzero(keep).view(-1)
# keep only inside anchors
anchors = all_anchors[inds_inside, :]
# label: 1 is positive, 0 is negative, -1 is dont care
labels = gt_twins.new(batch_size, inds_inside.size(0)).fill_(-1)
twin_inside_weights = gt_twins.new(batch_size, inds_inside.size(0)).zero_()
twin_outside_weights = gt_twins.new(batch_size, inds_inside.size(0)).zero_()
#print("anchors {}".format(anchors.shape)) #(876, 2)
#print("gt_twins {}".format(gt_twins.shape)) #(1, 6, 3)
# assume anchors(batch_size, N, 2) and gt_wins(batch_size, K, 2), respectively, overlaps will be (batch_size, N, K)
overlaps = twins_overlaps_batch(anchors, gt_twins)
# find max_overlaps for each dt: (batch_size, N)
max_overlaps, argmax_overlaps = torch.max(overlaps, 2)
# find max_overlaps for each gt: (batch_size, K)
gt_max_overlaps, _ = torch.max(overlaps, 1)
if not cfg.TRAIN.RPN_CLOBBER_POSITIVES:
labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0
gt_max_overlaps[gt_max_overlaps==0] = 1e-5
keep = torch.sum(overlaps.eq(gt_max_overlaps.view(batch_size,1,-1).expand_as(overlaps)), 2)
if torch.sum(keep) > 0:
labels[keep>0] = 1
# fg label: above threshold IOU
labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1
if cfg.TRAIN.RPN_CLOBBER_POSITIVES:
labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0
num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE)
sum_fg = torch.sum((labels == 1).int(), 1)
sum_bg = torch.sum((labels == 0).int(), 1)
for i in range(batch_size):
# subsample positive labels if we have too many
if sum_fg[i] > num_fg:
fg_inds = torch.nonzero(labels[i] == 1).view(-1)
# torch.randperm seems has a bug on multi-gpu setting that cause the segfault.
# See https://github.com/pytorch/pytorch/issues/1868 for more details.
# use numpy instead.
#rand_num = torch.randperm(fg_inds.size(0)).type_as(gt_twins).long()
rand_num = torch.from_numpy(np.random.permutation(fg_inds.size(0))).type_as(gt_twins).long()
disable_inds = fg_inds[rand_num[:fg_inds.size(0)-num_fg]]
labels[i][disable_inds] = -1
# num_bg = cfg.TRAIN.RPN_BATCHSIZE - sum_fg[i]
num_bg = cfg.TRAIN.RPN_BATCHSIZE - torch.sum((labels == 1).int(), 1)[i]
# subsample negative labels if we have too many
if sum_bg[i] > num_bg:
bg_inds = torch.nonzero(labels[i] == 0).view(-1)
#rand_num = torch.randperm(bg_inds.size(0)).type_as(gt_twins).long()
rand_num = torch.from_numpy(np.random.permutation(bg_inds.size(0))).type_as(gt_twins).long()
disable_inds = bg_inds[rand_num[:bg_inds.size(0)-num_bg]]
labels[i][disable_inds] = -1
offset = torch.arange(0, batch_size)*gt_twins.size(1)
argmax_overlaps = argmax_overlaps + offset.view(batch_size, 1).type_as(argmax_overlaps)
twin_targets = _compute_targets_batch(anchors, gt_twins.view(-1,3)[argmax_overlaps.view(-1), :].view(batch_size, -1, 3))
# use a single value instead of 2 values for easy index.
twin_inside_weights[labels==1] = cfg.TRAIN.RPN_TWIN_INSIDE_WEIGHTS[0]
if cfg.TRAIN.RPN_POSITIVE_WEIGHT < 0:
num_examples = torch.sum(labels[i] >= 0)
positive_weights = 1.0 / num_examples.float()
negative_weights = 1.0 / num_examples.float()
else:
assert ((cfg.TRAIN.RPN_POSITIVE_WEIGHT > 0) &
(cfg.TRAIN.RPN_POSITIVE_WEIGHT < 1))
positive_weights = cfg.TRAIN.RPN_POSITIVE_WEIGHT
negative_weights = 1 - positive_weights
twin_outside_weights[labels == 1] = positive_weights
twin_outside_weights[labels == 0] = negative_weights
labels = _unmap(labels, total_anchors, inds_inside, batch_size, fill=-1)
twin_targets = _unmap(twin_targets, total_anchors, inds_inside, batch_size, fill=0)
twin_inside_weights = _unmap(twin_inside_weights, total_anchors, inds_inside, batch_size, fill=0)
twin_outside_weights = _unmap(twin_outside_weights, total_anchors, inds_inside, batch_size, fill=0)
outputs = []
labels = labels.view(batch_size, length, height, width, A).permute(0,4,1,2,3).contiguous()
labels = labels.view(batch_size, 1, A * length, height, width)
outputs.append(labels)
twin_targets = twin_targets.view(batch_size, length, height, width, A*2).permute(0,4,1,2,3).contiguous()
outputs.append(twin_targets)
anchors_count = twin_inside_weights.size(1)
twin_inside_weights = twin_inside_weights.view(batch_size,anchors_count,1).expand(batch_size, anchors_count, 2)
twin_inside_weights = twin_inside_weights.contiguous().view(batch_size, length, height, width, 2*A)\
.permute(0,4,1,2,3).contiguous()
outputs.append(twin_inside_weights)
twin_outside_weights = twin_outside_weights.view(batch_size,anchors_count,1).expand(batch_size, anchors_count, 2)
twin_outside_weights = twin_outside_weights.contiguous().view(batch_size, length, height, width, 2*A)\
.permute(0,4,1,2,3).contiguous()
outputs.append(twin_outside_weights)
return outputs
def backward(self, top, propagate_down, bottom):
"""This layer does not propagate gradients."""
pass
def reshape(self, bottom, top):
"""Reshaping happens during the call to forward."""
pass
def _unmap(data, count, inds, batch_size, fill=0):
""" Unmap a subset of item (data) back to the original set of items (of
size count) """
# for labels, twin_inside_weights and twin_outside_weights
if data.dim() == 2:
ret = data.new(batch_size, count).fill_(fill)
ret[:, inds] = data
# for twin_targets
else:
ret = data.new(batch_size, count, data.size(2)).fill_(fill)
ret[:, inds,:] = data
return ret
def _compute_targets_batch(ex_rois, gt_rois):
"""Compute bounding-box regression targets for an video."""
return twin_transform_batch(ex_rois, gt_rois[:, :, :2])##twin_transform
Overall, all of these methods really are carried over directly from the ideas of Faster R-CNN.
On porting our i3d network
The problem showed up in conv1, probably because the convolution was not configured quite right; with conv1 set to the 3D-ResNet50 conv1 it runs normally, and the other parameters are also set the same as 3D-ResNet50. I'll let it run like this and come back after class tomorrow to see how it goes.
The first epoch has produced results and validation is running; we'll see how the test results turn out, though being only the first epoch I don't expect much; this is just a rough check.
For the current issue I changed the normalization setting, putting BN_TRACK_STATS = False (the default is True). The explanation of this flag, via Google Translate, is:
a boolean value; when set to True, this module tracks the running mean and variance; when set to False, it does not track such statistics and always uses batch statistics, in both training and evaluation modes. Default: True
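That description matches the track_running_stats flag on PyTorch's BatchNorm modules; a minimal sketch of the two settings (mapping this project's BN_TRACK_STATS config onto that flag is my reading, not something stated in the repo):
import torch.nn as nn

bn_default     = nn.BatchNorm3d(64)                             # track_running_stats=True: running mean/var are tracked and used at eval time
bn_batch_stats = nn.BatchNorm3d(64, track_running_stats=False)  # always use per-batch statistics, in train and eval alike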
In any case, with this change the loss is much smaller than before and looks normal, so this switch does help. The concrete results depend on how tomorrow's run turns out.