# songyw / mindcv-main
"""
MindSpore implementation of `poolformer`.
Refer to PoolFormer: MetaFormer Is Actually What You Need for Vision.
"""

import collections.abc
from itertools import repeat

import numpy as np

import mindspore
import mindspore.common.initializer as init
from mindspore import Tensor, nn, ops

from .layers import DropPath, Identity
from .registry import register_model
from .utils import load_pretrained

__all__ = [
    "PoolFormer",
    "poolformer_s12",
    "poolformer_s24",
    "poolformer_s36",
    "poolformer_m36",
    "poolformer_m48",
]


def _cfg(url="", **kwargs):
    return {
        "url": url,
        "num_classes": 1000,
        "first_conv": "",
        "classifier": "",
        **kwargs,
    }


# Per-variant configuration, keyed by the model factory name. Only
# poolformer_s12 carries a checkpoint URL; the other variants have an empty one.
default_cfgs = {
    "poolformer_s12": _cfg(
        url="https://download.mindspore.cn/toolkits/mindcv/poolformer/poolformer_s12-5be5c4e4.ckpt",
        crop_pct=0.9,
    ),
    "poolformer_s24": _cfg(url="", crop_pct=0.9),
    "poolformer_s36": _cfg(url="", crop_pct=0.9),
    "poolformer_m36": _cfg(url="", crop_pct=0.95),
    "poolformer_m48": _cfg(url="", crop_pct=0.95),
}


def _ntuple(n):
    def parse(x):
        if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
            return x
        return tuple(repeat(x, n))

    return parse


to_2tuple = _ntuple(2)


class ConvMlp(nn.Cell):
    """MLP using 1x1 convs that keeps spatial dims.

    The forward pass is ``fc1 -> norm -> act -> drop -> fc2 -> drop``.

    Args:
        in_features: number of input channels.
        hidden_features: output channels of the first 1x1 conv. Falls back to
            ``in_features`` when falsy.
        out_features: output channels of the second 1x1 conv. Falls back to
            ``in_features`` when falsy.
        act_layer: activation class. It is instantiated as
            ``act_layer(approximate=False)``, so it must accept that keyword
            (the default ``nn.GELU`` is used this way).
        norm_layer: optional callable invoked as ``norm_layer(hidden_features)``
            to build the normalization applied right after ``fc1``. When falsy
            an ``Identity`` is used instead.
        bias: either a single flag used for both convs, or an iterable whose
            first two items are the ``has_bias`` flags of ``fc1`` and ``fc2``.
        drop: dropout rate. ``nn.Dropout`` is constructed with ``1 - drop``,
            i.e. the value is handed over as a keep probability
            (NOTE(review): confirm against the installed MindSpore version).
    """

    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        act_layer=nn.GELU,
        norm_layer=None,
        bias=True,
        drop=0.0,
    ):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        bias = to_2tuple(bias)

        self.fc1 = nn.Conv2d(in_features, hidden_features, kernel_size=1, has_bias=bias[0])
        self.norm = norm_layer(hidden_features) if norm_layer else Identity()
        self.act = act_layer(approximate=False)
        self.drop = nn.Dropout(1 - drop)
        self.fc2 = nn.Conv2d(hidden_features, out_features, kernel_size=1, has_bias=bias[1])
        self.cls_init_weights()

    def cls_init_weights(self):
        """Initialize weights for cells.

        Every ``nn.Conv2d`` found through ``cells_and_names`` gets a truncated
        normal weight (sigma 0.02) and, when it has a bias, a zero bias.
        """
        for _, m in self.cells_and_names():
            if isinstance(m, nn.Conv2d):
                m.weight.set_data(
                    init.initializer(init.TruncatedNormal(sigma=.02), m.weight.shape, m.weight.dtype))
                if m.bias is not None:
                    m.bias.set_data(
                        init.initializer(init.Constant(0), m.bias.shape, m.bias.dtype))

    def construct(self, x):
        x = self.fc1(x)
        # Apply the configured normalization; previously ``self.norm`` was
        # created but never called, so ``norm_layer`` had no effect.
        x = self.norm(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


class PatchEmbed(nn.Cell):
    """Patch embedding realised as one convolution plus an optional norm.

    Input: tensor in shape [B, C, H, W]
    Output: tensor in shape [B, C, H/stride, W/stride]
    (shapes as stated by the original author; the output channel count is the
    conv's ``embed_dim``).

    Args:
        in_chs: number of input channels of the conv.
        embed_dim: number of output channels of the conv.
        patch_size: conv kernel size; a scalar is expanded to a 2-tuple.
        stride: conv stride; a scalar is expanded to a 2-tuple.
        padding: forwarded unchanged to ``nn.Conv2d`` (used with
            ``pad_mode="pad"``).
        norm_layer: optional callable invoked as ``norm_layer(embed_dim)``;
            an ``Identity`` is used when falsy.
    """

    def __init__(self, in_chs=3, embed_dim=768, patch_size=16, stride=16, padding=0, norm_layer=None):
        super().__init__()
        kernel = to_2tuple(patch_size)
        step = to_2tuple(stride)
        # ``padding`` is deliberately passed through as given rather than
        # being expanded with ``to_2tuple``.
        self.proj = nn.Conv2d(
            in_chs,
            embed_dim,
            kernel_size=kernel,
            stride=step,
            padding=padding,
            pad_mode="pad",
            has_bias=True,
        )
        if norm_layer:
            self.norm = norm_layer(embed_dim)
        else:
            self.norm = Identity()

    def construct(self, x):
        projected = self.proj(x)
        return self.norm(projected)


class Pooling(nn.Cell):
    """Token mixer that returns the average-pooled input minus the input.

    Args:
        pool_size: kernel size of the stride-1, ``"same"``-padded
            ``nn.AvgPool2d``. Default: 3
    """

    def __init__(self, pool_size=3):
        super().__init__()
        self.pool = nn.AvgPool2d(kernel_size=pool_size, stride=1, pad_mode="same")

    def construct(self, x):
        pooled = self.pool(x)
        return pooled - x


class PoolFormerBlock(nn.Cell):
    """One PoolFormer block: two residual branches, pooling then MLP.

    Each branch computes ``x + drop_path(scale * f(norm(x)))`` where ``f`` is
    the pooling token mixer for the first branch and the conv MLP for the
    second. The ``scale`` factor is only present when layer scale is enabled.

    Args:
        dim: number of channels of the block.
        pool_size: kernel size handed to ``Pooling``. Default: 3
        mlp_ratio: the MLP hidden width is ``int(dim * mlp_ratio)``. Default: 4.0
        act_layer: activation class forwarded to ``ConvMlp``. Default: nn.GELU
        norm_layer: normalization class, invoked as ``norm_layer(1, dim)``.
            Default: nn.GroupNorm
        drop: dropout rate forwarded to ``ConvMlp``. Default: 0.0
        drop_path: rate for ``DropPath``; an ``Identity`` is used when it is
            not greater than 0. Default: 0.0
        layer_scale_init_value: initial value of the two per-channel scale
            parameters; a falsy value disables layer scale. Default: 1e-5
    """

    def __init__(
        self,
        dim,
        pool_size=3,
        mlp_ratio=4.0,
        act_layer=nn.GELU,
        norm_layer=nn.GroupNorm,
        drop=0.0,
        drop_path=0.0,
        layer_scale_init_value=1e-5,
    ):
        super().__init__()
        self.norm1 = norm_layer(1, dim)
        self.token_mixer = Pooling(pool_size=pool_size)
        if drop_path > 0.0:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = Identity()
        self.norm2 = norm_layer(1, dim)
        hidden_dim = int(dim * mlp_ratio)
        self.mlp = ConvMlp(dim, hidden_features=hidden_dim, act_layer=act_layer, drop=drop)

        if not layer_scale_init_value:
            # Layer scale disabled: construct() takes the unscaled path.
            self.layer_scale_1 = None
            self.layer_scale_2 = None
        else:
            scale_init = Tensor(layer_scale_init_value * np.ones([dim]).astype(np.float32))
            self.layer_scale_1 = mindspore.Parameter(scale_init)
            self.layer_scale_2 = mindspore.Parameter(scale_init)
        self.expand_dims = ops.ExpandDims()

    def construct(self, x):
        if self.layer_scale_1 is None:
            mixed = self.token_mixer(self.norm1(x))
            x = x + self.drop_path(mixed)
            fed = self.mlp(self.norm2(x))
            x = x + self.drop_path(fed)
        else:
            # Append two trailing axes to each [dim] scale vector so it
            # broadcasts against the branch output.
            scale_1 = self.expand_dims(self.expand_dims(self.layer_scale_1, -1), -1)
            mixed = self.token_mixer(self.norm1(x))
            x = x + self.drop_path(scale_1 * mixed)
            scale_2 = self.expand_dims(self.expand_dims(self.layer_scale_2, -1), -1)
            fed = self.mlp(self.norm2(x))
            x = x + self.drop_path(scale_2 * fed)
        return x


def basic_blocks(
    dim,
    index,
    layers,
    pool_size=3,
    mlp_ratio=4.0,
    act_layer=nn.GELU,
    norm_layer=nn.GroupNorm,
    drop_rate=0.0,
    drop_path_rate=0.0,
    layer_scale_init_value=1e-5,
):
    """Generate the PoolFormer blocks of one stage.

    The drop-path rate of a block is
    ``drop_path_rate * global_block_index / (total_blocks - 1)``, where the
    global index counts blocks across all stages listed in ``layers``.

    Args:
        dim: channel count of every block in this stage.
        index: position of this stage inside ``layers``.
        layers: number of blocks per stage for the whole network.
        pool_size: pooling kernel size forwarded to each block. Default: 3
        mlp_ratio: MLP expansion ratio forwarded to each block. Default: 4.0
        act_layer: activation class forwarded to each block. Default: nn.GELU
        norm_layer: normalization class forwarded to each block.
            Default: nn.GroupNorm
        drop_rate: dropout rate forwarded to each block. Default: 0.0
        drop_path_rate: drop-path rate scaled per block as described above.
            Default: 0.0
        layer_scale_init_value: layer-scale init forwarded to each block.
            Default: 1e-5

    Returns:
        ``nn.SequentialCell`` holding ``layers[index]`` ``PoolFormerBlock`` cells.
    """
    # Both sums are loop-invariant, so compute them once.
    blocks_before = sum(layers[:index])
    # A network with a single block would make ``sum(layers) - 1`` zero and
    # raise ZeroDivisionError; clamp so that lone block simply gets rate 0.
    denominator = max(sum(layers) - 1, 1)

    blocks = [
        PoolFormerBlock(
            dim, pool_size=pool_size, mlp_ratio=mlp_ratio,
            act_layer=act_layer, norm_layer=norm_layer,
            drop=drop_rate,
            drop_path=drop_path_rate * (block_idx + blocks_before) / denominator,
            layer_scale_init_value=layer_scale_init_value,
        )
        for block_idx in range(layers[index])
    ]
    return nn.SequentialCell(*blocks)


class PoolFormer(nn.Cell):
    r"""PoolFormer network, based on
    `"MetaFormer Is Actually What You Need for Vision" <https://arxiv.org/pdf/2111.11418v3.pdf>`_

    The model is a stem ``PatchEmbed``, a sequence of stages (each built by
    ``basic_blocks``) with optional ``PatchEmbed`` downsampling between them,
    a final norm and a dense classification head.

    Args:
        layers: number of blocks for each stage.
        embed_dims: the embedding dims for the stages. Default: (64, 128, 320, 512)
        mlp_ratios: mlp ratios for the stages. Default: (4, 4, 4, 4)
        downsamples: flags to apply downsampling after a stage. A downsampling
            layer is also inserted whenever two consecutive embedding dims
            differ. Default: (True, True, True, True)
        pool_size: the pooling size used in every stage. Default: 3
        in_chans: number of input channels. Default: 3
        num_classes: number of classes for the image classification; a value
            not greater than 0 replaces the head by ``Identity``. Default: 1000
        global_pool: stored on the instance; ``forward_head`` always averages
            over the last two axes regardless of this value. Default: avg
        norm_layer: normalization class, invoked as ``norm_layer(1, channels)``.
            Default: nn.GroupNorm
        act_layer: activation class. Default: nn.GELU
        in_patch_size: kernel size of the stem patch embedding. Default: 7
        in_stride: stride of the stem patch embedding. Default: 4.
        in_pad: padding of the stem patch embedding. Default: 2.
        down_patch_size: kernel size of the between-stage patch embedding. Default: 3.
        down_stride: stride of the between-stage patch embedding. Default: 2.
        down_pad: padding of the between-stage patch embedding. Default: 1.
        drop_rate: dropout rate forwarded to the blocks of every stage. Default: 0.
        drop_path_rate: Stochastic Depth rate forwarded to every stage. Default: 0.
        layer_scale_init_value: LayerScale init value. Default: 1e-5.
        fork_feat: when True, ``forward_features`` returns the output of the
            last stage without applying the final norm, and ``num_classes`` is
            not stored on the instance. Default: False.
    """

    def __init__(
        self,
        layers,
        embed_dims=(64, 128, 320, 512),
        mlp_ratios=(4, 4, 4, 4),
        downsamples=(True, True, True, True),
        pool_size=3,
        in_chans=3,
        num_classes=1000,
        global_pool="avg",
        norm_layer=nn.GroupNorm,
        act_layer=nn.GELU,
        in_patch_size=7,
        in_stride=4,
        in_pad=2,
        down_patch_size=3,
        down_stride=2,
        down_pad=1,
        drop_rate=0.0,
        drop_path_rate=0.0,
        layer_scale_init_value=1e-5,
        fork_feat=False,
    ):
        super().__init__()

        if not fork_feat:
            self.num_classes = num_classes
        self.fork_feat = fork_feat

        self.global_pool = global_pool
        self.num_features = embed_dims[-1]
        self.grad_checkpointing = False

        # Stem: embeds the input image into the first stage's channel width.
        self.patch_embed = PatchEmbed(
            in_chs=in_chans,
            embed_dim=embed_dims[0],
            patch_size=in_patch_size,
            stride=in_stride,
            padding=in_pad,
        )

        # Assemble the stages, interleaved with downsampling layers.
        stages = []
        last_stage = len(layers) - 1
        for stage_idx in range(len(layers)):
            stage = basic_blocks(
                embed_dims[stage_idx],
                stage_idx,
                layers,
                pool_size=pool_size,
                mlp_ratio=mlp_ratios[stage_idx],
                act_layer=act_layer,
                norm_layer=norm_layer,
                drop_rate=drop_rate,
                drop_path_rate=drop_path_rate,
                layer_scale_init_value=layer_scale_init_value,
            )
            stages.append(stage)
            if stage_idx >= last_stage:
                # Nothing follows the final stage.
                continue
            if downsamples[stage_idx] or embed_dims[stage_idx] != embed_dims[stage_idx + 1]:
                # Downsampling between stages.
                stages.append(
                    PatchEmbed(
                        in_chs=embed_dims[stage_idx],
                        embed_dim=embed_dims[stage_idx + 1],
                        patch_size=down_patch_size,
                        stride=down_stride,
                        padding=down_pad,
                    )
                )

        self.network = nn.SequentialCell(*stages)
        self.norm = norm_layer(1, embed_dims[-1])
        if num_classes > 0:
            self.head = nn.Dense(embed_dims[-1], num_classes, has_bias=True)
        else:
            self.head = Identity()
        self.cls_init_weights()

    def cls_init_weights(self):
        """Initialize every ``nn.Dense`` cell.

        Weights get a truncated normal (sigma 0.02); a bias, when present, is
        set to zero.
        """
        for _, cell in self.cells_and_names():
            if not isinstance(cell, nn.Dense):
                continue
            cell.weight.set_data(
                init.initializer(init.TruncatedNormal(sigma=.02), cell.weight.shape, cell.weight.dtype))
            if cell.bias is not None:
                cell.bias.set_data(
                    init.initializer(init.Constant(0), cell.bias.shape, cell.bias.dtype))

    def reset_classifier(self, num_classes, global_pool=None):
        """Replace the classification head.

        Args:
            num_classes: size of the new head; a value not greater than 0
                installs an ``Identity`` instead of a dense layer.
            global_pool: when not None, overwrites ``self.global_pool``.
        """
        self.num_classes = num_classes
        if global_pool is not None:
            self.global_pool = global_pool
        if num_classes > 0:
            self.head = nn.Dense(self.num_features, num_classes)
        else:
            self.head = Identity()

    def forward_features(self, x: Tensor) -> Tensor:
        """Run the stem and all stages; apply the final norm unless ``fork_feat``."""
        feats = self.network(self.patch_embed(x))
        if self.fork_feat:
            # Output features for dense prediction (returned without the final norm).
            return feats
        return self.norm(feats)

    def forward_head(self, x: Tensor) -> Tensor:
        """Average over the last two axes and feed the result to the head."""
        pooled = x.mean([-2, -1])
        return self.head(pooled)

    def construct(self, x: Tensor) -> Tensor:
        feats = self.forward_features(x)
        return self.forward_head(feats)


@register_model
def poolformer_s12(pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs) -> PoolFormer:
    """Build the poolformer_s12 model (stage depths 2, 2, 6, 2).

    Args:
        pretrained: when True, ``load_pretrained`` is called with the
            ``poolformer_s12`` entry of ``default_cfgs``.
        num_classes: size of the classification head.
        in_channels: number of input image channels.
        **kwargs: forwarded to ``PoolFormer``; see that class for details.

    Returns:
        The constructed ``PoolFormer``.
    """
    cfg = default_cfgs["poolformer_s12"]
    stage_depths = (2, 2, 6, 2)
    net = PoolFormer(layers=stage_depths, num_classes=num_classes, in_chans=in_channels, **kwargs)
    if not pretrained:
        return net
    load_pretrained(net, cfg, num_classes=num_classes, in_channels=in_channels)
    return net


@register_model
def poolformer_s24(pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs) -> PoolFormer:
    """Build the poolformer_s24 model (stage depths 4, 4, 12, 4).

    Args:
        pretrained: when True, ``load_pretrained`` is called with the
            ``poolformer_s24`` entry of ``default_cfgs``.
        num_classes: size of the classification head.
        in_channels: number of input image channels.
        **kwargs: forwarded to ``PoolFormer``; see that class for details.

    Returns:
        The constructed ``PoolFormer``.
    """
    cfg = default_cfgs["poolformer_s24"]
    stage_depths = (4, 4, 12, 4)
    net = PoolFormer(layers=stage_depths, num_classes=num_classes, in_chans=in_channels, **kwargs)
    if not pretrained:
        return net
    load_pretrained(net, cfg, num_classes=num_classes, in_channels=in_channels)
    return net


@register_model
def poolformer_s36(pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs) -> PoolFormer:
    """Build the poolformer_s36 model (stage depths 6, 6, 18, 6).

    Layer scale is initialised to 1e-6 for this variant.

    Args:
        pretrained: when True, ``load_pretrained`` is called with the
            ``poolformer_s36`` entry of ``default_cfgs``.
        num_classes: size of the classification head.
        in_channels: number of input image channels.
        **kwargs: forwarded to ``PoolFormer``; see that class for details.

    Returns:
        The constructed ``PoolFormer``.
    """
    cfg = default_cfgs["poolformer_s36"]
    stage_depths = (6, 6, 18, 6)
    net = PoolFormer(
        layers=stage_depths,
        layer_scale_init_value=1e-6,
        num_classes=num_classes,
        in_chans=in_channels,
        **kwargs
    )
    if not pretrained:
        return net
    load_pretrained(net, cfg, num_classes=num_classes, in_channels=in_channels)
    return net


@register_model
def poolformer_m36(pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs) -> PoolFormer:
    """Build the poolformer_m36 model.

    Uses stage depths (6, 6, 18, 6), embedding dims (96, 192, 384, 768) and a
    layer-scale init of 1e-6.

    Args:
        pretrained: when True, ``load_pretrained`` is called with the
            ``poolformer_m36`` entry of ``default_cfgs``.
        num_classes: size of the classification head.
        in_channels: number of input image channels.
        **kwargs: forwarded to ``PoolFormer``; see that class for details.

    Returns:
        The constructed ``PoolFormer``.
    """
    cfg = default_cfgs["poolformer_m36"]
    net = PoolFormer(
        layers=(6, 6, 18, 6),
        embed_dims=(96, 192, 384, 768),
        layer_scale_init_value=1e-6,
        num_classes=num_classes,
        in_chans=in_channels,
        **kwargs
    )
    if not pretrained:
        return net
    load_pretrained(net, cfg, num_classes=num_classes, in_channels=in_channels)
    return net


@register_model
def poolformer_m48(pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs) -> PoolFormer:
    """Build the poolformer_m48 model.

    Uses stage depths (8, 8, 24, 8), embedding dims (96, 192, 384, 768) and a
    layer-scale init of 1e-6.

    Args:
        pretrained: when True, ``load_pretrained`` is called with the
            ``poolformer_m48`` entry of ``default_cfgs``.
        num_classes: size of the classification head.
        in_channels: number of input image channels.
        **kwargs: forwarded to ``PoolFormer``; see that class for details.

    Returns:
        The constructed ``PoolFormer``.
    """
    cfg = default_cfgs["poolformer_m48"]
    net = PoolFormer(
        layers=(8, 8, 24, 8),
        embed_dims=(96, 192, 384, 768),
        layer_scale_init_value=1e-6,
        num_classes=num_classes,
        in_chans=in_channels,
        **kwargs
    )
    if not pretrained:
        return net
    load_pretrained(net, cfg, num_classes=num_classes, in_channels=in_channels)
    return net