视频分析中常用的I3D代码细节
目前写得比较好的两个 video understanding 代码库(codebase)
这两个代码中对 I3D 的实现方式分别如下
# mmaction
# configs
# Backbone settings for the 'ResNet_I3D' model type.
# NOTE(review): the trailing comma after the closing parenthesis suggests this
# fragment was cut out of an enclosing dict(...) call -- confirm before reuse.
backbone=dict(
type='ResNet_I3D',
pretrained='modelzoo://resnet50',
depth=50,
num_stages=4,
out_indices=[3],
frozen_stages=-1,
# Four tuples of 0/1 flags with lengths 3, 4, 6 and 3.
# Presumably one tuple per stage and one flag per Bottleneck, where 1 enables
# if_inflate for that block -- verify against the ResNet_I3D layer builder.
inflate_freq=((1,1,1), (1,0,1,0), (1,0,1,0,1,0), (0,1,0)),
# Bottleneck.__init__ asserts this is either '3x1x1' or '3x3x3'.
inflate_style='3x1x1',
# Presumably the temporal kernel size / stride of the stem conv1 and pool1
# layers; the code consuming these four keys is not shown here -- TODO confirm.
conv1_kernel_t=5,
conv1_stride_t=2,
pool1_kernel_t=1,
pool1_stride_t=2,
bn_eval=False,
partial_bn=False,
# Bottleneck.__init__ asserts style is 'pytorch' or 'caffe'; it selects which
# convolution of the block carries the stride.
style='pytorch'),
# code
# ......
# bottle neck 这部分代码展示了 I3D 的一些细节
# if_inflate 控制 bottleneck 中是否使用时序卷积。
# True
# 一种风格('3x1x1')是 Kernel (3,1,1)-->(1,3,3)-->(1,1,1),正常 I3D 中的 bottleneck 就是这样子的。
# 另一种风格('3x3x3')是直接用 (3,3,3) 卷积核:Kernel (1,1,1)-->(3,3,3)-->(1,1,1)
# False
# 不使用时序卷积:Kernel (1,1,1)-->(1,3,3)-->(1,1,1)
class Bottleneck(nn.Module):
    """Residual bottleneck block built from 3D convolutions.

    Submodules are registered in this order: ``conv1``, ``conv2``, ``bn1``,
    ``bn2``, ``conv3``, ``bn3``, ``relu``. ``conv3`` is always a 1x1x1
    convolution mapping ``planes`` to ``planes * expansion`` channels.

    Kernel sizes of ``conv1 -> conv2``, written as (temporal, spatial, spatial):

    * ``if_inflate`` with ``inflate_style='3x1x1'``: (3,1,1) -> (1,3,3)
    * ``if_inflate`` with ``inflate_style='3x3x3'``: (1,1,1) -> (3,3,3)
    * ``if_inflate`` false:                          (1,1,1) -> (1,3,3)
    """

    # Channel multiplier applied by conv3 / bn3.
    expansion = 4

    def __init__(self,
                 inplanes,
                 planes,
                 spatial_stride=1,
                 temporal_stride=1,
                 dilation=1,
                 downsample=None,
                 style='pytorch',
                 if_inflate=True,
                 inflate_style='3x1x1',
                 if_nonlocal=True,
                 nonlocal_cfg=None,
                 with_cp=False):
        """Build the layers of the bottleneck block.

        If ``style`` is "pytorch", the strided layer is conv2 (the one with
        the 3x3 spatial kernel); if it is "caffe", the strided layer is conv1.

        Args:
            inplanes: Number of input channels of conv1.
            planes: Number of channels produced by conv1 and conv2.
            spatial_stride: Stride on the two spatial axes of the strided conv.
            temporal_stride: Stride on the temporal axis of the strided conv.
                Only applied when ``if_inflate`` is true.
            dilation: Spatial dilation (and matching spatial padding) of conv2.
            downsample: Stored unchanged on ``self.downsample``; presumably the
                module applied to the shortcut branch in ``forward`` (not
                visible here).
            style: 'pytorch' or 'caffe'; selects which conv carries the stride.
            if_inflate: Whether the block uses a temporal kernel of size 3.
            inflate_style: '3x1x1' puts the temporal kernel in conv1, '3x3x3'
                puts it in conv2. Ignored when ``if_inflate`` is false.
            if_nonlocal: A non-local block is built only when this is true
                AND ``nonlocal_cfg`` is not None.
            nonlocal_cfg: Config mapping; it is copied, its 'in_channels'
                entry is set to ``planes * expansion``, and the copy is passed
                to ``build_nonlocal_block``.
            with_cp: Stored unchanged on ``self.with_cp``; presumably toggles
                gradient checkpointing in ``forward`` -- TODO confirm.

        Raises:
            AssertionError: If ``style`` or ``inflate_style`` is not one of
                the supported values.
        """
        super(Bottleneck, self).__init__()
        assert style in ['pytorch', 'caffe']
        assert inflate_style in ['3x1x1', '3x3x3']

        self.inplanes = inplanes
        self.planes = planes

        # Decide which of the first two convolutions carries the strides.
        if style == 'pytorch':
            self.conv1_stride = 1
            self.conv2_stride = spatial_stride
            self.conv1_stride_t = 1
            self.conv2_stride_t = temporal_stride
        else:
            self.conv1_stride = spatial_stride
            self.conv2_stride = 1
            self.conv1_stride_t = temporal_stride
            self.conv2_stride_t = 1

        # Temporal kernel size and temporal stride of conv1 / conv2.
        if if_inflate:
            conv1_stride_t = self.conv1_stride_t
            conv2_stride_t = self.conv2_stride_t
            if inflate_style == '3x1x1':
                conv1_kernel_t, conv2_kernel_t = 3, 1
            else:
                conv1_kernel_t, conv2_kernel_t = 1, 3
        else:
            # NOTE(review): the non-inflated block never strides in time, even
            # when temporal_stride != 1. If `downsample` strides temporally,
            # the two branches would disagree in shape -- confirm with callers.
            conv1_stride_t = conv2_stride_t = 1
            conv1_kernel_t = conv2_kernel_t = 1

        # A temporal kernel of 3 needs padding 1 to keep the temporal length
        # (at stride 1); a temporal kernel of 1 needs none.
        self.conv1 = nn.Conv3d(
            inplanes,
            planes,
            kernel_size=(conv1_kernel_t, 1, 1),
            stride=(conv1_stride_t, self.conv1_stride, self.conv1_stride),
            padding=(conv1_kernel_t // 2, 0, 0),
            bias=False)
        self.conv2 = nn.Conv3d(
            planes,
            planes,
            kernel_size=(conv2_kernel_t, 3, 3),
            stride=(conv2_stride_t, self.conv2_stride, self.conv2_stride),
            padding=(conv2_kernel_t // 2, dilation, dilation),
            dilation=(1, dilation, dilation),
            bias=False)

        self.bn1 = nn.BatchNorm3d(planes)
        self.bn2 = nn.BatchNorm3d(planes)
        self.conv3 = nn.Conv3d(
            planes, planes * self.expansion, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm3d(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample

        self.spatial_stride = spatial_stride
        self.temporal_stride = temporal_stride
        # Misspelled legacy names, kept so code that already reads them
        # continues to work.
        self.spatial_tride = spatial_stride
        self.temporal_tride = temporal_stride

        self.dilation = dilation
        self.with_cp = with_cp

        if if_nonlocal and nonlocal_cfg is not None:
            # Work on a copy so the caller's config is not mutated.
            nonlocal_cfg_ = nonlocal_cfg.copy()
            nonlocal_cfg_['in_channels'] = planes * self.expansion
            self.nonlocal_block = build_nonlocal_block(nonlocal_cfg_)
        else:
            self.nonlocal_block = None
def forward(self, x)