Dite-HRNet: Dynamic Lightweight High-Resolution Network for Human Pose Estimation

This post covers two context modeling modules from the paper: DCM (Dense Context Modeling) and GCM (Global Context Modeling). DCM captures context by aggregating features across branches of different resolutions, while GCM attends over the entire feature map globally. Both are instances of the ACM (Adaptive Context Modeling) framework and enhance feature representations through different attention mechanisms. Dynamic convolution and dynamic kernel aggregation further strengthen the model's capacity by giving each input feature map several distinct sets of convolution kernel parameters.


Some notes from reading this paper:

The paper first proposes two variants of ACM: DCM and GCM.

1. The components of ACM:

① Adaptive context pooling: a 1×1 convolution and a softmax, plus a transpose operation.

② Context shifting: the pooled context features are passed through two 1×1 convolutions with non-linear activation (1×1 conv + BN + ReLU, then a second 1×1 conv) followed by a sigmoid function.

③ The resulting channel weights are multiplied element-wise with the input feature maps.

That is the overall ACM pipeline; DCM and GCM differ only in the first step, adaptive context pooling.

2. DCM

DCM aggregates the different resolutions of the different branches, so its adaptive context pooling step works differently. Each input feature map of size H × W is reshaped so that its pixels are divided into H_min × W_min groups of (H × W)/(H_min × W_min) pixels each; every group is pooled into a single value per channel using the learned context mask, giving each branch an H_min × W_min map. The pooled maps of all branches are then concatenated along the channel dimension.
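
As a concrete shape check (hypothetical sizes): take a branch of 32 × 24 and a smallest branch of 8 × 6. Each channel's 768 pixels are divided into 48 groups of 16 pixels, which is exactly the reshape performed in global_spatial_pool below:

import torch

N, C = 1, 16
H, W = 32, 24   # this branch's resolution (hypothetical)
MH, MW = 8, 6   # smallest branch's resolution (hypothetical)

x = torch.randn(N, C, H, W)
# [N, MH * MW, C, (H * W) / (MH * MW)] -> 48 groups of 16 pixels per channel
x_m = x.view(N, C, H * W).view(N, MH * MW, C, (H * W) // (MH * MW))
print(x_m.shape)  # torch.Size([1, 48, 16, 16])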

The full code is as follows:

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as cp


class DenseContextModeling(nn.Module):

    def __init__(self, channels, reduction):
        super().__init__()

        num_branches = len(channels)
        self.reduction = reduction[num_branches-2]

        self.channels = channels
        total_channel = sum(channels)
        mid_channels = total_channel // self.reduction

        ## The adaptive context pooling part of ACM: a context mask, i.e. a 1x1 convolution plus a softmax
        self.conv_mask = nn.ModuleList([
            nn.Conv2d(channels[i], 1, kernel_size=1, stride=1, padding=0, bias=True)
            for i in range(len(channels))
        ])
        self.softmax = nn.Softmax(dim=2)
            
        ## The context shifting part: two 1x1 convolutions and a sigmoid
        self.channel_attention = nn.Sequential(
            nn.Conv2d(total_channel, mid_channels, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(mid_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(mid_channels, total_channel, kernel_size=1, stride=1, padding=0, bias=True),
            nn.Sigmoid()
        )

    ## The actual adaptive context pooling implementation
    def global_spatial_pool(self, x, mini_size, i):
        batch, channel, height, width = x.size()
        mini_height, mini_width = mini_size

        # [N, C, H, W]
        x_m = x
        # [N, C, H * W]
        x_m = x_m.view(batch, channel, height * width)
        # [N, MH * MW, C, (H * W) / (MH * MW)]
        x_m = x_m.view(batch, mini_height * mini_width, channel, (height * width) // (mini_height * mini_width))
        # [N, 1, H, W]
        mask = self.conv_mask[i](x)
        # [N, 1, H * W]
        mask = mask.view(batch, 1, height * width)
        # [N, 1, H * W]
        mask = self.softmax(mask)
        # [N, MH * MW, (H * W) / (MH * MW)]
        mask = mask.view(batch, mini_height * mini_width, (height * width) // (mini_height * mini_width))
        # [N, MH * MW, (H * W) / (MH * MW), 1]
        mask = mask.unsqueeze(-1)
        # [N, MH * MW, C, 1]
        x = torch.matmul(x_m, mask)
        # [N, C, MH * MW, 1]
        x = x.permute(0, 2, 1, 3)
        # [N, C, MH, MW]
        x = x.view(batch, channel, mini_height, mini_width)

        return x

    def forward(self, x):
        mini_size = x[-1].size()[-2:]
        out = [self.global_spatial_pool(s, mini_size, i) for s, i in zip(x[:-1], range(len(x)))] + [x[-1]]
        out = torch.cat(out, dim=1)

        out = self.channel_attention(out)

        out = torch.split(out, self.channels, dim=1)
        out = [s * F.interpolate(a, size=s.size()[-2:], mode='nearest') for s, a in zip(x, out)]

        return out
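
A usage sketch with hypothetical branch channels, reductions, and input sizes (note that the constructor indexes reduction with num_branches - 2, so the list must be long enough, and every branch's H × W must be divisible by the smallest branch's):

# Hypothetical three-branch configuration; shapes chosen so H*W divides evenly.
dcm = DenseContextModeling(channels=[32, 64, 128], reduction=[4, 8, 8, 8])
x = [torch.randn(2, 32, 64, 48),   # high-resolution branch
     torch.randn(2, 64, 32, 24),
     torch.randn(2, 128, 16, 12)]  # lowest-resolution branch
out = dcm(x)
print([o.shape for o in out])  # same shapes as the inputs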

3. GCM

The code:

class GlobalContextModeling(nn.Module):

    def __init__(self, channels, num_branch, reduction, with_cp=False):
        super().__init__()

        self.with_cp = with_cp

        self.reduction = reduction[num_branch]

        mid_channels = channels // self.reduction

        self.conv_mask = nn.Conv2d(channels, 1, kernel_size=1, stride=1, padding=0, bias=True)
        self.softmax = nn.Softmax(dim=2)

        self.channel_attention = nn.Sequential(
            nn.Conv2d(channels, mid_channels, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(mid_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(mid_channels, channels, kernel_size=1, stride=1, padding=0, bias=True),
            nn.Sigmoid()
        )

        self.bn = nn.BatchNorm2d(channels)

    def global_spatial_pool(self, x):
        batch, channel, height, width = x.size()

        # [N, C, H, W]
        x_m = x
        # [N, C, H * W]
        x_m = x_m.view(batch, channel, height * width)
        # [N, 1, C, H * W]
        x_m = x_m.unsqueeze(1)
        # [N, 1, H, W]
        mask = self.conv_mask(x)
        # [N, 1, H * W]
        mask = mask.view(batch, 1, height * width)
        # [N, 1, H * W]
        mask = self.softmax(mask)
        # [N, 1, H * W, 1]
        mask = mask.unsqueeze(-1)
        # [N, 1, C, 1]
        x = torch.matmul(x_m, mask)
        # [N, C, 1, 1]
        x = x.permute(0, 2, 1, 3)

        return x

    def forward(self, x):

        def _inner_forward(x):
            identity = x

            x = self.global_spatial_pool(x)
            x = self.channel_attention(x)
            x = self.bn(identity * x)

            return x

        if self.with_cp and x.requires_grad:
            x = cp.checkpoint(_inner_forward, x)
        else:
            x = _inner_forward(x)

        return x
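
A similar usage sketch for GCM (hypothetical arguments; here reduction is indexed directly by num_branch):

gcm = GlobalContextModeling(channels=64, num_branch=1, reduction=[4, 8, 8, 8])
x = torch.randn(2, 64, 32, 24)
print(gcm(x).shape)  # torch.Size([2, 64, 32, 24]) -- shape is preserved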

4. DSC (Dynamic Split Convolution) code:

class DynamicSplitConvolution(nn.Module):

    def __init__(self, channels, stride, num_branch, num_groups, num_kernels, with_cp=False):
        super().__init__()

        self.with_cp = with_cp

        self.num_groups = num_groups[num_branch]
        self.num_kernels = num_kernels[num_branch]

        self.split_channels = _split_channels(channels, self.num_groups)

        self.conv = nn.ModuleList([
            ConvBN(
                self.split_channels[i],
                self.split_channels[i],
                kernel_size=i * 2 + 3,
                stride=stride,
                padding=i + 1,
                groups=self.split_channels[i],
                num_kernels=self.num_kernels)
            for i in range(self.num_groups)
        ])

    def forward(self, x):

        def _inner_forward(x):
            if self.num_groups == 1:
                x = self.conv[0](x)
            else:
                x_split = torch.split(x, self.split_channels, dim=1)
                x = [conv(t) for conv, t in zip(self.conv, x_split)]
                x = torch.cat(x, dim=1)
                x = channel_shuffle(x, self.num_groups)

            return x

        if self.with_cp and x.requires_grad:
            x = cp.checkpoint(_inner_forward, x)
        else:
            x = _inner_forward(x)

        return x
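
Note: the DSC code above calls three helpers that are not shown in this post: _split_channels, channel_shuffle, and ConvBN. Below is a minimal sketch consistent with how they are called. The first two are the standard MixConv-style channel split and ShuffleNet-style channel shuffle; the ConvBN sketch is an assumption, wrapping the dynamic convolution (DynamicKernelAggregation, defined further below) with a BatchNorm.

def _split_channels(channels, num_groups):
    # Split channels as evenly as possible; the first group absorbs the remainder.
    split = [channels // num_groups] * num_groups
    split[0] += channels - sum(split)
    return split


def channel_shuffle(x, groups):
    # ShuffleNet-style channel shuffle to mix information across groups.
    batch, channels, height, width = x.size()
    x = x.view(batch, groups, channels // groups, height, width)
    x = x.transpose(1, 2).contiguous()
    return x.view(batch, channels, height, width)


class ConvBN(nn.Module):
    # Assumed wrapper: dynamic convolution (DynamicKernelAggregation, below) + BN.
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, groups, num_kernels):
        super().__init__()
        self.conv = DynamicKernelAggregation(
            in_channels, out_channels, kernel_size=kernel_size, stride=stride,
            padding=padding, groups=groups, bias=False, num_kernels=num_kernels)
        self.bn = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        return self.bn(self.conv(x))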

Here, all of the convolution kernels are generated by dynamic convolution.

Dynamic convolution gives every input feature map K different convolution kernels (same kernel size, different parameter values), which are aggregated into a single kernel as an attention-weighted sum: W(x) = Σ_{k=1}^{K} π_k(x) · W_k, where the per-sample attention weights π_k(x) come from an SE-style (SENet) branch. PyTorch's F.conv2d applies the same weights to every sample in a batch, so these per-sample aggregated kernels cannot be used directly. The workaround is to reshape the input from [batch_size, in_channels, H, W] to [1, batch_size × in_channels, H, W] and set groups = batch_size × (original groups): each group then corresponds to one sample's channels, so every feature map is convolved with its own set of K aggregated kernel parameters.
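
To make this concrete, here is a minimal, self-contained demonstration of the trick (toy shapes; all names here are hypothetical and independent of the classes below). It verifies that folding the batch into the channel dimension with groups = batch_size reproduces per-sample convolutions:

import torch
import torch.nn.functional as F

batch_size, in_channels, out_channels = 2, 4, 4
H, W, k = 8, 8, 3

x = torch.randn(batch_size, in_channels, H, W)
# One aggregated kernel per sample: [N, C_out, C_in, k, k]
per_sample_weight = torch.randn(batch_size, out_channels, in_channels, k, k)

# Fold the batch into the channel dimension and separate samples via groups.
x_folded = x.view(1, batch_size * in_channels, H, W)
w_folded = per_sample_weight.view(batch_size * out_channels, in_channels, k, k)
out = F.conv2d(x_folded, w_folded, padding=1, groups=batch_size)
out = out.view(batch_size, out_channels, H, W)

# Reference: convolve each sample with its own kernel.
ref = torch.stack([F.conv2d(x[i:i + 1], per_sample_weight[i], padding=1)[0]
                   for i in range(batch_size)])
assert torch.allclose(out, ref, atol=1e-5)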

class KernelAttention(nn.Module):
    def __init__(self, channels, reduction=4, num_kernels=4, init_weight=True):
        super().__init__()

        if channels != 3:
            mid_channels = channels // reduction
        else:
            mid_channels = num_kernels

        self.avg_pool = nn.AdaptiveAvgPool2d(1)

        self.conv1 = nn.Conv2d(channels, mid_channels, kernel_size=1, bias=False)
        self.bn = nn.BatchNorm2d(mid_channels)
        self.relu = nn.ReLU(inplace=True)

        self.conv2 = nn.Conv2d(mid_channels, num_kernels, kernel_size=1, bias=True)
        self.sigmoid = nn.Sigmoid()

        if init_weight:
            self._initialize_weights()

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            if isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        x = self.avg_pool(x)

        x = self.conv1(x)
        x = self.bn(x)
        x = self.relu(x)

        x = self.conv2(x).view(x.shape[0], -1)
        x = self.sigmoid(x)

        return x
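
A quick shape check for KernelAttention (hypothetical sizes): it squeezes each sample's feature map into K attention weights, one per candidate kernel.

att = KernelAttention(channels=16, num_kernels=4)
print(att(torch.randn(2, 16, 8, 8)).shape)  # torch.Size([2, 4])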


class KernelAggregation(nn.Module):

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias, num_kernels,
                 init_weight=True):
        super().__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.groups = groups
        self.bias = bias
        self.num_kernels = num_kernels

        self.weight = nn.Parameter(
            torch.randn(num_kernels, out_channels, in_channels // groups, kernel_size, kernel_size),
            requires_grad=True)
        if bias:
            self.bias = nn.Parameter(
                torch.zeros(num_kernels, out_channels))
        else:
            self.bias = None

        if init_weight:
            self._initialize_weights()

    def _initialize_weights(self):
        for i in range(self.num_kernels):
            nn.init.kaiming_uniform_(self.weight[i])

    def forward(self, x, attention):
        batch_size, in_channels, height, width = x.size()

        # Fold the batch into the channel dimension: [1, N * C_in, H, W].
        x = x.contiguous().view(1, batch_size * self.in_channels, height, width)

        # Aggregate the K candidate kernels with the per-sample attention weights:
        # [N, K] @ [K, C_out * (C_in / groups) * k * k], reshaped so that each
        # sample gets its own aggregated kernel.
        weight = self.weight.contiguous().view(self.num_kernels, -1)
        weight = torch.mm(attention, weight).contiguous().view(
            batch_size * self.out_channels,
            self.in_channels // self.groups,
            self.kernel_size,
            self.kernel_size)

        # groups = original groups * batch size, so each sample's channels are
        # convolved only with that sample's aggregated kernel.
        if self.bias is not None:
            bias = torch.mm(attention, self.bias).contiguous().view(-1)
            x = F.conv2d(
                x,
                weight=weight,
                bias=bias,
                stride=self.stride,
                padding=self.padding,
                dilation=self.dilation,
                groups=self.groups * batch_size)
        else:
            x = F.conv2d(
                x,
                weight=weight,
                bias=None,
                stride=self.stride,
                padding=self.padding,
                dilation=self.dilation,
                groups=self.groups * batch_size)

        x = x.contiguous().view(batch_size, self.out_channels, x.shape[-2], x.shape[-1])

        return x


class DynamicKernelAggregation(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True,
                 num_kernels=4):
        super().__init__()
        assert in_channels % groups == 0

        self.attention = KernelAttention(
            in_channels,
            num_kernels=num_kernels)
        self.aggregation = KernelAggregation(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
            num_kernels=num_kernels)

    def forward(self, x):
        attention = x

        attention = self.attention(attention)
        x = self.aggregation(x, attention)

        return x
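
Finally, a usage sketch (hypothetical sizes): DynamicKernelAggregation is a drop-in replacement for nn.Conv2d with one extra num_kernels argument.

conv = DynamicKernelAggregation(16, 32, kernel_size=3, stride=1, padding=1, num_kernels=4)
print(conv(torch.randn(2, 16, 8, 8)).shape)  # torch.Size([2, 32, 8, 8])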

