【视频动作识别】I3D网络的代码细节_i3d的resnet实现代码-CSDN博客

本文链接：https://blog.csdn.net/Tzu_ming/article/details/103675324
视频分析中常用的I3D代码细节

目前写的比较好的两个video understanding base code
这两个代码中对 I3D 的实现方式分别如下：
# mmaction

# configs 
# Backbone settings for the I3D ResNet. NOTE(review): the trailing comma after
# the closing parenthesis suggests this is an excerpt from inside a larger
# model config call - confirm against the full config file.
backbone=dict(
        type='ResNet_I3D',
        pretrained='modelzoo://resnet50',
        depth=50,
        num_stages=4,
        out_indices=[3],
        frozen_stages=-1,
        # Four tuples of lengths 3, 4, 6 and 3; presumably one tuple per stage
        # with one 0/1 flag per bottleneck block that is passed on as
        # `if_inflate` (1 = use a temporal convolution) - TODO confirm.
        inflate_freq=((1,1,1), (1,0,1,0), (1,0,1,0,1,0), (0,1,0)),
        # Must be one of the values accepted by Bottleneck: '3x1x1' or '3x3x3'.
        inflate_style='3x1x1',
        # NOTE(review): the *_t suffix presumably denotes the temporal
        # dimension of the stem conv / pooling layers - confirm.
        conv1_kernel_t=5,
        conv1_stride_t=2,
        pool1_kernel_t=1,
        pool1_stride_t=2,
        bn_eval=False,
        partial_bn=False,
        # 'pytorch' puts the stride-two layer on the 3x3 conv of each
        # bottleneck; 'caffe' puts it on the first 1x1 conv.
        style='pytorch'),

# code 
# ......
# bottle neck 这部分代码展示了 I3D 的一些细节
# if_inflate 控制 bottle neck 中是否使用时序卷积。
# True
# 一种风格是 Kernel （3，1，1）--》（1，3，3）--》（1，1，1）， 正常I3D中的bottle neck就是这样子的。
# 另一个风格是直接用（3，3，3）卷积核 Kernel (1,1,1)-->(3,3,3)---》（1，1，1）

# False
# 不使用时序卷积  (1,1,1)--->(1,3,3)--->(1,1,1)
class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self,
                 inplanes,
                 planes,
                 spatial_stride=1,
                 temporal_stride=1,
                 dilation=1,
                 downsample=None,
                 style='pytorch',
                 if_inflate=True,
                 inflate_style='3x1x1',
                 if_nonlocal=True,
                 nonlocal_cfg=None,
                 with_cp=False):
        """Build a 3D bottleneck block (conv1 -> conv2 -> conv3, each with BN).

        If style is "pytorch", the stride-two layer is the 3x3 conv layer,
        if it is "caffe", the stride-two layer is the first 1x1 conv layer.

        Kernel layouts (temporal, height, width) of conv1 -> conv2 -> conv3:

        * ``if_inflate=True``, ``inflate_style='3x1x1'``:
          (3,1,1) -> (1,3,3) -> (1,1,1)
        * ``if_inflate=True``, ``inflate_style='3x3x3'``:
          (1,1,1) -> (3,3,3) -> (1,1,1)
        * ``if_inflate=False``:
          (1,1,1) -> (1,3,3) -> (1,1,1), and the temporal stride of both
          convs is fixed to 1 (``temporal_stride`` is not applied).

        Args:
            inplanes: Number of input channels of conv1.
            planes: Number of channels produced by conv1 and conv2; conv3
                outputs ``planes * expansion`` channels.
            spatial_stride: Stride applied to both spatial dimensions of the
                stride-carrying conv (selected by ``style``).
            temporal_stride: Temporal stride of the stride-carrying conv when
                ``if_inflate`` is true.
            dilation: Spatial dilation (and matching spatial padding) of
                conv2; the temporal dilation is always 1.
            downsample: Stored unchanged on ``self.downsample``.
            style: ``'pytorch'`` or ``'caffe'``; see above.
            if_inflate: Whether the block uses a temporal (size-3) kernel.
            inflate_style: ``'3x1x1'`` or ``'3x3x3'``; see above.
            if_nonlocal: When true and ``nonlocal_cfg`` is given, a non-local
                block is built with ``build_nonlocal_block``.
            nonlocal_cfg: Config mapping for the non-local block. It is
                copied before ``in_channels`` is set, so the caller's
                mapping is not modified.
            with_cp: Stored unchanged on ``self.with_cp``.
                NOTE(review): presumably toggles gradient checkpointing in
                ``forward`` - confirm.
        """
        super(Bottleneck, self).__init__()
        assert style in ['pytorch', 'caffe'], \
            "style must be 'pytorch' or 'caffe', got {!r}".format(style)
        assert inflate_style in ['3x1x1', '3x3x3'], \
            "inflate_style must be '3x1x1' or '3x3x3', got {!r}".format(
                inflate_style)
        self.inplanes = inplanes
        self.planes = planes

        # Decide which of the first two convs carries the strides.
        if style == 'pytorch':
            self.conv1_stride = 1
            self.conv2_stride = spatial_stride
            self.conv1_stride_t = 1
            self.conv2_stride_t = temporal_stride
        else:
            self.conv1_stride = spatial_stride
            self.conv2_stride = 1
            self.conv1_stride_t = temporal_stride
            self.conv2_stride_t = 1

        # Resolve kernel / padding / temporal stride for conv1 and conv2.
        if if_inflate:
            conv1_stride_t = self.conv1_stride_t
            conv2_stride_t = self.conv2_stride_t
            if inflate_style == '3x1x1':
                # Temporal kernel lives in conv1, spatial kernel in conv2.
                conv1_kernel, conv1_padding = (3, 1, 1), (1, 0, 0)
                conv2_kernel, conv2_padding = (1, 3, 3), (0, dilation, dilation)
            else:
                # Full 3x3x3 kernel in conv2, pointwise conv1.
                conv1_kernel, conv1_padding = 1, 0
                conv2_kernel, conv2_padding = 3, (1, dilation, dilation)
        else:
            # No temporal convolution: both temporal strides are pinned to 1.
            conv1_stride_t = 1
            conv2_stride_t = 1
            conv1_kernel, conv1_padding = 1, 0
            conv2_kernel, conv2_padding = (1, 3, 3), (0, dilation, dilation)

        # Submodules are created in the same order as before (conv1, conv2,
        # bn1, bn2, conv3, bn3, relu) so parameter registration order and
        # random-init order are unchanged.
        self.conv1 = nn.Conv3d(
            inplanes,
            planes,
            kernel_size=conv1_kernel,
            stride=(conv1_stride_t, self.conv1_stride, self.conv1_stride),
            padding=conv1_padding,
            bias=False)
        self.conv2 = nn.Conv3d(
            planes,
            planes,
            kernel_size=conv2_kernel,
            stride=(conv2_stride_t, self.conv2_stride, self.conv2_stride),
            padding=conv2_padding,
            dilation=(1, dilation, dilation),
            bias=False)

        self.bn1 = nn.BatchNorm3d(planes)
        self.bn2 = nn.BatchNorm3d(planes)
        self.conv3 = nn.Conv3d(
            planes, planes * self.expansion, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm3d(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        # Correctly spelled attributes; the misspelled ``*_tride`` names are
        # kept as aliases so existing code reading them keeps working.
        self.spatial_stride = spatial_stride
        self.temporal_stride = temporal_stride
        self.spatial_tride = spatial_stride
        self.temporal_tride = temporal_stride
        self.dilation = dilation
        self.with_cp = with_cp

        if if_nonlocal and nonlocal_cfg is not None:
            # Work on a copy so the caller's config is left untouched.
            nonlocal_cfg_ = nonlocal_cfg.copy()
            nonlocal_cfg_['in_channels'] = planes * self.expansion
            self.nonlocal_block = build_nonlocal_block(nonlocal_cfg_)
        else:
            self.nonlocal_block = None

    def forward(self, x)