# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

from paddleseg.cvlibs import manager
from paddleseg.models import layers
from paddleseg.utils import utils


@manager.MODELS.add_component
class BiseNetV1(nn.Layer):
    """
    The BiSeNetV1 implementation based on PaddlePaddle.

    The original article refers to
    Yu, Changqian, et al. "BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation"
    (https://paperswithcode.com/paper/bisenet-bilateral-segmentation-network-for)

    Args:
        num_classes (int): The unique number of target classes.
        backbone (paddle.nn.Layer): Backbone network, currently support Resnet18_vd/Resnet34_vd/Resnet50_vd/Resnet101_vd.
        conv_channel (int, optional): The number of channels used by the attention refinement modules, refine blocks and global context branch. Default: 128.
        pretrained (str, optional): The path or url of pretrained model. Default: None.
    """

    def __init__(self, num_classes, backbone, conv_channel=128,
                 pretrained=None):
        super().__init__()
        self.backbone = backbone
        # Spatial path: keeps high-resolution detail at 1/8 of the input size.
        self.spatial_path = SpatialPath(3, 128)
        # Global average pooling branch on the deepest backbone feature.
        self.global_context = nn.Sequential(
            nn.AdaptiveAvgPool2D(1),
            layers.ConvBNReLU(512, conv_channel, 1, bias_attr=False),
        )

        # Attention refinement modules for the two deepest backbone features.
        self.arms = nn.LayerList([
            AttentionRefinement(512, conv_channel),
            AttentionRefinement(256, conv_channel),
        ])
        self.refines = nn.LayerList([
            layers.ConvBNReLU(
                conv_channel,
                conv_channel,
                3,
                stride=1,
                padding=1,
                bias_attr=False),
            layers.ConvBNReLU(
                conv_channel,
                conv_channel,
                3,
                stride=1,
                padding=1,
                bias_attr=False),
        ])

        # The first two heads supervise the context path during training; the
        # last head produces the final prediction from the fused features.
        self.heads = nn.LayerList([
            BiSeNetHead(conv_channel, num_classes, 8, True),
            BiSeNetHead(conv_channel, num_classes, 8, True),
            BiSeNetHead(conv_channel * 2, num_classes, 8, False),
        ])

        self.ffm = FeatureFusion(conv_channel * 2, conv_channel * 2, 1)

        self.pretrained = pretrained
        self.init_weight()

    def init_weight(self):
        if self.pretrained is not None:
            utils.load_entire_model(self, self.pretrained)

    def forward(self, x):
        spatial_out = self.spatial_path(x)
        context_blocks = self.backbone(x)
        # Reverse so that the deepest (lowest-resolution) feature comes first.
        context_blocks.reverse()

        # Pool the deepest feature to 1x1, then broadcast it back to the
        # deepest feature's size so it can be added in the loop below.
        global_context = self.global_context(context_blocks[0])
        global_context = F.interpolate(
            global_context,
            size=paddle.shape(context_blocks[0])[2:],
            mode='bilinear',
            align_corners=True)
        last_fm = global_context
        pred_out = []

        # Context path: refine the two deepest features with the ARMs, add the
        # running context, then upsample to the next (shallower) feature size.
        for i, (fm, arm, refine) in enumerate(
                zip(context_blocks[:2], self.arms, self.refines)):
            fm = arm(fm)
            fm += last_fm
            last_fm = F.interpolate(
                fm,
                size=paddle.shape(context_blocks[i + 1])[2:],
                mode='bilinear',
                align_corners=True)
            last_fm = refine(last_fm)
            pred_out.append(last_fm)
        context_out = last_fm

        # Fuse spatial-path and context-path features.
        concate_fm = self.ffm(spatial_out, context_out)
        pred_out.append(concate_fm)

        output = []
        if self.training:
            # Auxiliary heads are only used to compute training losses.
            for i, head in enumerate(self.heads):
                out = head(pred_out[i])
                output.append(out)
        else:
            out = self.heads[-1](pred_out[-1])
            output.append(out)
        return output


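# A rough usage sketch, kept as comments because constructing the backbone is
# outside this file. It assumes a backbone from `paddleseg.models.backbones`
# (e.g. ResNet18_vd) that returns multi-scale features with 512 channels at
# the deepest stage, as the context path above expects:
#
#   from paddleseg.models import backbones
#   backbone = backbones.ResNet18_vd()
#   model = BiseNetV1(num_classes=19, backbone=backbone)
#   logit_list = model(paddle.randn([1, 3, 512, 512]))

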
class SpatialPath(nn.Layer):
    """
    SpatialPath module of BiseNetV1 model

    Args:
        in_channels (int): The number of input channels in spatial path module.
        out_channels (int): The number of output channels in spatial path module.
        inner_channel (int, optional): The number of channels of the intermediate convolutions. Default: 64.
    """

    def __init__(self, in_channels, out_channels, inner_channel=64):
        super().__init__()
        self.conv_7x7 = layers.ConvBNReLU(
            in_channels, inner_channel, 7, stride=2, padding=3, bias_attr=False)
        self.conv_3x3_1 = layers.ConvBNReLU(
            inner_channel,
            inner_channel,
            3,
            stride=2,
            padding=1,
            bias_attr=False)
        self.conv_3x3_2 = layers.ConvBNReLU(
            inner_channel,
            inner_channel,
            3,
            stride=2,
            padding=1,
            bias_attr=False)
        self.conv_1x1 = layers.ConvBNReLU(
            inner_channel, out_channels, 1, bias_attr=False)

    def forward(self, x):
        x = self.conv_7x7(x)
        x = self.conv_3x3_1(x)
        x = self.conv_3x3_2(x)
        x = self.conv_1x1(x)
        return x


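# Note: the 7x7 stride-2 convolution followed by two 3x3 stride-2 convolutions
# gives the spatial path an output stride of 8, which matches the scale=8
# upsampling performed by BiSeNetHead below.

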
class BiSeNetHead(nn.Layer):
    """
    BiSeNet head of BiseNetV1 model

    Args:
        in_channels (int): The number of input channels of the head.
        out_channels (int): The number of output channels of the head, usually equal to num_classes.
        scale (int, float): The scale factor of interpolation.
        is_aux (bool, optional): Whether the head is an auxiliary head with a wider hidden layer. Default: False.
    """

    def __init__(self, in_channels, out_channels, scale, is_aux=False):
        super().__init__()
        inner_channel = 128 if is_aux else 64
        self.conv_3x3 = layers.ConvBNReLU(
            in_channels, inner_channel, 3, stride=1, padding=1, bias_attr=False)
        self.conv_1x1 = nn.Conv2D(inner_channel, out_channels, 1)
        self.scale = scale

    def forward(self, x):
        x = self.conv_3x3(x)
        x = self.conv_1x1(x)
        if self.scale > 1:
            x = F.interpolate(
                x, scale_factor=self.scale, mode='bilinear', align_corners=True)
        return x


class AttentionRefinement(nn.Layer):
    """
    AttentionRefinement module of BiseNetV1 model

    Args:
        in_channels (int): The number of input channels of the attention refinement module.
        out_channels (int): The number of output channels of the attention refinement module.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv_3x3 = layers.ConvBNReLU(
            in_channels, out_channels, 3, stride=1, padding=1, bias_attr=False)
        self.channel_attention = nn.Sequential(
            nn.AdaptiveAvgPool2D(1),
            layers.ConvBNReLU(out_channels, out_channels, 1, bias_attr=False),
            nn.Sigmoid(),
        )

    def forward(self, x):
        x = self.conv_3x3(x)
        # Re-weight channels with a global-context attention vector.
        se = self.channel_attention(x)
        x = x * se
        return x


class FeatureFusion(nn.Layer):
    """
    FeatureFusion module of BiseNetV1 model

    Args:
        in_channels (int): The number of input channels of the feature fusion module.
        out_channels (int): The number of output channels of the feature fusion module.
        reduction (int, optional): A factor that shrinks the channels of the attention branch. Default: 1.
    """

    def __init__(self, in_channels, out_channels, reduction=1):
        super().__init__()
        self.conv_1x1 = layers.ConvBNReLU(
            in_channels, out_channels, 1, bias_attr=False)
        self.channel_attention = nn.Sequential(
            nn.AdaptiveAvgPool2D(1),
            layers.ConvBNReLU(
                out_channels, out_channels // reduction, 1, bias_attr=False),
            layers.ConvBNReLU(
                out_channels // reduction, out_channels, 1, bias_attr=False),
            nn.Sigmoid(),
        )

    def forward(self, x1, x2):
        # Concatenate spatial-path and context-path features, then re-weight
        # the fused feature map with channel attention.
        fm = paddle.concat([x1, x2], axis=1)
        fm = self.conv_1x1(fm)
        fm_se = self.channel_attention(fm)
        output = fm + fm * fm_se
        return output
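

if __name__ == '__main__':
    # A minimal smoke-test sketch for the standalone modules defined above,
    # using random tensors and assumed example sizes (a 2x3x256x256 input and
    # 19 classes); it relies only on the PaddlePaddle and PaddleSeg imports
    # already present in this module.
    x = paddle.randn([2, 3, 256, 256])

    sp = SpatialPath(3, 128)
    sp_out = sp(x)  # 1/8 of the input resolution: [2, 128, 32, 32]

    arm = AttentionRefinement(512, 128)
    arm_out = arm(paddle.randn([2, 512, 8, 8]))  # [2, 128, 8, 8]

    ffm = FeatureFusion(256, 256, 1)
    ffm_out = ffm(sp_out, paddle.randn([2, 128, 32, 32]))  # [2, 256, 32, 32]

    head = BiSeNetHead(256, 19, 8, False)
    logits = head(ffm_out)  # upsampled by scale=8 back to [2, 19, 256, 256]

    print(sp_out.shape, arm_out.shape, ffm_out.shape, logits.shape)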