diff --git a/SupportedList.md b/SupportedList.md index 31c7b373..08f76b14 100644 --- a/SupportedList.md +++ b/SupportedList.md @@ -812,6 +812,11 @@ | nn.MultiMarginLoss | 支持 | Ascend上暂不支持 | | nn.Module.named_module | 支持 | | | nn.TripletMarginLoss | 支持 | Ascend上暂不支持;[输入参数有限制](ConstraintList.md) | +| nn.Transformer | 支持 | | +| nn.TransformerEncoder | 支持 | | +| nn.TransformerDecoder | 支持 | | +| nn.TransformerEncoderLayer | 支持 | | +| nn.TransformerDecoderLayer | 支持 | | ### nn.functional | MSAdapter接口 | 状态 | 备注 | diff --git a/SupportedList_en.md b/SupportedList_en.md index 8f0b649c..16619f4a 100644 --- a/SupportedList_en.md +++ b/SupportedList_en.md @@ -811,6 +811,11 @@ English | [简体中文](SupportedList.md) | nn.MultiMarginLoss | Supported | Currently not support on Ascend | | nn.Module.named_module | Supported | | | nn.TripletMarginLoss | Supported | Currently not support on Ascend, input type is constrained | +| nn.Transformer | Supported | | +| nn.TransformerEncoder | Supported | | +| nn.TransformerDecoder | Supported | | +| nn.TransformerEncoderLayer | Supported | | +| nn.TransformerDecoderLayer | Supported | | ### nn.functional | MSAdapter APIs | Status | Notes | diff --git a/msadapter/pytorch/nn/modules/__init__.py b/msadapter/pytorch/nn/modules/__init__.py index bb89ad7b..b01885ec 100644 --- a/msadapter/pytorch/nn/modules/__init__.py +++ b/msadapter/pytorch/nn/modules/__init__.py @@ -22,6 +22,7 @@ from .pixel_shuffle import * from .channelshuffle import * from .fold import * from .adaptive import AdaptiveLogSoftmaxWithLoss +from .transformer import * __all__ = [ 'Linear', @@ -183,5 +184,11 @@ __all__ = [ 'PixelShuffle', 'PixelUnshuffle', - 'ChannelShuffle' + 'ChannelShuffle', + + 'TransformerEncoderLayer', + 'TransformerDecoderLayer', + 'TransformerEncoder', + 'TransformerDecoder', + 'Transformer' ] diff --git a/msadapter/pytorch/nn/modules/activation.py b/msadapter/pytorch/nn/modules/activation.py index 16fb5c32..d27728dd 100644 --- a/msadapter/pytorch/nn/modules/activation.py +++ b/msadapter/pytorch/nn/modules/activation.py @@ -471,8 +471,8 @@ class MultiheadAttention(Module): def __setstate__(self, state): # Support loading old MultiheadAttention checkpoints generated by v1.1.0 - if '_qkv_same_embed_dim' not in state: - state['_qkv_same_embed_dim'] = True + if '_qkv_same_embed_dim' not in state[1]: + state[1]['_qkv_same_embed_dim'] = True super(MultiheadAttention, self).__setstate__(state) diff --git a/msadapter/pytorch/nn/modules/transformer.py b/msadapter/pytorch/nn/modules/transformer.py index e69de29b..0935ba0d 100644 --- a/msadapter/pytorch/nn/modules/transformer.py +++ b/msadapter/pytorch/nn/modules/transformer.py @@ -0,0 +1,288 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import copy +import mindspore as ms +import mindspore.ops as ops +from msadapter.utils import unsupported_attr +from msadapter.pytorch.tensor import cast_to_ms_tensor, cast_to_adapter_tensor + +from .module import Module +from .activation import MultiheadAttention +from .container import ModuleList +from .dropout import Dropout +from .linear import Linear +from .normalization import LayerNorm +from .. 
import functional as F +from ..init import xavier_uniform_ + +__all__ = ['TransformerEncoderLayer', 'TransformerDecoderLayer', 'TransformerEncoder', 'TransformerDecoder', + 'Transformer'] + +class Transformer(Module): + def __init__(self, d_model=512, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=2048, + dropout=0.1, activation='relu', custom_encoder=None, custom_decoder=None, layer_norm_eps=1e-5, + batch_first=False, norm_first=False, device=None, dtype=None): + unsupported_attr(device) + super(Transformer, self).__init__() + + if custom_encoder is not None: + self.encoder = custom_encoder + else: + encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, activation, + layer_norm_eps, batch_first, norm_first, dtype=dtype) + encoder_norm = LayerNorm(d_model, eps=layer_norm_eps, dtype=dtype) + self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) + + if custom_decoder is not None: + self.decoder = custom_decoder + else: + decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, activation, + layer_norm_eps, batch_first, norm_first, dtype=dtype) + decoder_norm = LayerNorm(d_model, eps=layer_norm_eps, dtype=dtype) + self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm) + + self._reset_parameters() + + self.d_model = d_model + self.nhead = nhead + + self.batch_first = batch_first + + def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None, src_key_padding_mask=None, + tgt_key_padding_mask=None, memory_key_padding_mask=None): + src = cast_to_ms_tensor(src) + tgt = cast_to_ms_tensor(tgt) + src_mask = cast_to_ms_tensor(src_mask) + tgt_mask = cast_to_ms_tensor(tgt_mask) + memory_mask = cast_to_ms_tensor(memory_mask) + src_key_padding_mask = cast_to_ms_tensor(src_key_padding_mask) + tgt_key_padding_mask = cast_to_ms_tensor(tgt_key_padding_mask) + memory_key_padding_mask = cast_to_ms_tensor(memory_key_padding_mask) + + is_batched = src.dim() == 3 + if not self.batch_first and src.shape[1] != tgt.shape[1] and is_batched: + raise ValueError("the batch number of src and tgt must be equal") + elif self.batch_first and src.shape[0] != tgt.shape[0] and is_batched: + raise ValueError("the batch number of src and tgt must be equal") + + if src.shape[-1] != self.d_model or tgt.shape[-1] != self.d_model: + raise ValueError("the feature number of src and tgt must be equal to d_model") + + memory = self.encoder(src, mask=src_mask, src_key_padding_mask=src_key_padding_mask) + output = self.decoder(tgt, memory, tgt_mask=tgt_mask, memory_mask=memory_mask, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=memory_key_padding_mask) + return cast_to_adapter_tensor(output) + + @staticmethod + def generate_square_subsequent_mask(sz): + #TODO: replace with ms.ops.triu and ms.ops.full + # does not support ascend now + return ms.numpy.full((sz, sz), float('-inf')).triu(diagonal=1) + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + xavier_uniform_(p) + +class TransformerEncoder(Module): + def __init__(self, encoder_layer, num_layers, norm=None, enable_nested_tensor=False): + unsupported_attr(enable_nested_tensor) + super(TransformerEncoder, self).__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward(self, src, mask=None, src_key_padding_mask=None): + src = cast_to_ms_tensor(src) + mask = cast_to_ms_tensor(mask) + src_key_padding_mask = 
cast_to_ms_tensor(src_key_padding_mask) + + if src_key_padding_mask is not None: + _skpm_dtype = src_key_padding_mask.dtype + if _skpm_dtype != ms.bool_ and not ops.is_floating_point(src_key_padding_mask): + raise AssertionError("only bool and floating types of key_padding_mask are supported") + + output = src + for mod in self.layers: + output = mod(output, src_mask=mask, src_key_padding_mask=src_key_padding_mask) + + if self.norm is not None: + output = self.norm(output) + + return cast_to_adapter_tensor(output) + + +class TransformerDecoder(Module): + def __init__(self, decoder_layer, num_layers, norm=None): + super(TransformerDecoder, self).__init__() + self.layers = _get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, tgt_key_padding_mask=None, + memory_key_padding_mask=None): + tgt = cast_to_ms_tensor(tgt) + memory = cast_to_ms_tensor(memory) + tgt_mask = cast_to_ms_tensor(tgt_mask) + memory_mask = cast_to_ms_tensor(memory_mask) + tgt_key_padding_mask = cast_to_ms_tensor(tgt_key_padding_mask) + memory_key_padding_mask = cast_to_ms_tensor(memory_key_padding_mask) + + output = tgt + for mod in self.layers: + output = mod(output, memory, tgt_mask=tgt_mask, memory_mask=memory_mask, + tgt_key_padding_mask=tgt_key_padding_mask, memory_key_padding_mask=memory_key_padding_mask) + + if self.norm is not None: + output = self.norm(output) + + return cast_to_adapter_tensor(output) + +class TransformerEncoderLayer(Module): + def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation='relu', layer_norm_eps=1e-5, + batch_first=False, norm_first=False, device=None, dtype=None): + unsupported_attr(device) + super(TransformerEncoderLayer, self).__init__() + self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first, dtype=dtype) + # Implementation of Feedforward model + self.linear1 = Linear(d_model, dim_feedforward, dtype=dtype) + self.dropout = Dropout(dropout) + self.linear2 = Linear(dim_feedforward, d_model, dtype=dtype) + + self.norm_first = norm_first + self.norm1 = LayerNorm(d_model, eps=layer_norm_eps, dtype=dtype) + self.norm2 = LayerNorm(d_model, eps=layer_norm_eps, dtype=dtype) + self.dropout1 = Dropout(dropout) + self.dropout2 = Dropout(dropout) + + #TODO: other types of activation should be considered + if isinstance(activation, str): + activation = _get_activation_fn(activation) + + if activation is F.relu: + self.activation_relu_or_gelu = 1 + elif activation is F.gelu: + self.activation_relu_or_gelu = 2 + else: + self.activation_relu_or_gelu = 0 + self.activation = activation + + def __setstate__(self, state): + if 'activation' not in state[1]: + state[1]['activation'] = F.relu + super(TransformerEncoderLayer, self).__setstate__(state) + + def forward(self, src, src_mask=None, src_key_padding_mask=None): + src = cast_to_ms_tensor(src) + src_mask = cast_to_ms_tensor(src_mask) + src_key_padding_mask = cast_to_ms_tensor(src_key_padding_mask) + + if src_key_padding_mask is not None: + _skpm_dtype = src_key_padding_mask.dtype + if _skpm_dtype != ms.bool_ and not ops.is_floating_point(src_key_padding_mask): + raise AssertionError("only bool and floating types of key_padding_mask are supported") + + x = src + if self.norm_first: + x = x + self._sa_block(self.norm1(x), src_mask, src_key_padding_mask) + x = x + self._ff_block(self.norm2(x)) + else: + x = self.norm1(x + self._sa_block(x, src_mask, src_key_padding_mask)) + x = self.norm2(x 
+ self._ff_block(x)) + return cast_to_adapter_tensor(x) + + # self-attention block + def _sa_block(self, x, attn_mask=None, key_padding_mask=None): + x = self.self_attn(x, x, x, attn_mask=attn_mask, key_padding_mask=key_padding_mask, need_weights=False)[0] + return self.dropout1(x) + + # feed forward block + def _ff_block(self, x): + x = self.linear2(self.dropout(self.activation(self.linear1(x)))) + return self.dropout2(x) + + +class TransformerDecoderLayer(Module): + def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation='relu', layer_norm_eps=1e-5, + batch_first=False, norm_first=False, device=None, dtype=None): + unsupported_attr(device) + + super(TransformerDecoderLayer, self).__init__() + self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first, dtype=dtype) + self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first, dtype=dtype) + # Implementation of Feedforward model + self.linear1 = Linear(d_model, dim_feedforward, dtype=dtype) + self.dropout = Dropout(dropout) + self.linear2 = Linear(dim_feedforward, d_model, dtype=dtype) + + self.norm_first = norm_first + self.norm1 = LayerNorm(d_model, eps=layer_norm_eps, dtype=dtype) + self.norm2 = LayerNorm(d_model, eps=layer_norm_eps, dtype=dtype) + self.norm3 = LayerNorm(d_model, eps=layer_norm_eps, dtype=dtype) + self.dropout1 = Dropout(dropout) + self.dropout2 = Dropout(dropout) + self.dropout3 = Dropout(dropout) + + #TODO: other types of activation should be considered + # Legacy string support for activation function. + if isinstance(activation, str): + self.activation = _get_activation_fn(activation) + else: + self.activation = activation + + def __setstate__(self, state): + if 'activation' not in state[1]: + state[1]['activation'] = F.relu + super(TransformerDecoderLayer, self).__setstate__(state) + + def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, tgt_key_padding_mask=None, + memory_key_padding_mask=None): + tgt = cast_to_ms_tensor(tgt) + memory = cast_to_ms_tensor(memory) + tgt_mask = cast_to_ms_tensor(tgt_mask) + memory_mask = cast_to_ms_tensor(memory_mask) + tgt_key_padding_mask = cast_to_ms_tensor(tgt_key_padding_mask) + memory_key_padding_mask = cast_to_ms_tensor(memory_key_padding_mask) + + x = tgt + if self.norm_first: + x = x + self._sa_block(self.norm1(x), tgt_mask, tgt_key_padding_mask) + x = x + self._mha_block(self.norm2(x), memory, memory_mask, memory_key_padding_mask) + x = x + self._ff_block(self.norm3(x)) + else: + x = self.norm1(x + self._sa_block(x, tgt_mask, tgt_key_padding_mask)) + x = self.norm2(x + self._mha_block(x, memory, memory_mask, memory_key_padding_mask)) + x = self.norm3(x + self._ff_block(x)) + + return cast_to_adapter_tensor(x) + + # self-attention block + def _sa_block(self, x, attn_mask=None, key_padding_mask=None): + x = self.self_attn(x, x, x, attn_mask=attn_mask, key_padding_mask=key_padding_mask, need_weights=False)[0] + return self.dropout1(x) + + # multihead attention block + def _mha_block(self, x, mem, attn_mask=None, key_padding_mask=None): + x = self.multihead_attn(x, mem, mem, attn_mask=attn_mask, key_padding_mask=key_padding_mask, + need_weights=False)[0] + return self.dropout2(x) + + # feed forward block + def _ff_block(self, x): + x = self.linear2(self.dropout(self.activation(self.linear1(x)))) + return self.dropout3(x) + + +def _get_clones(module, N): + #TODO: CellList? 
+ return ModuleList([copy.deepcopy(module) for i in range(N)]) + + +def _get_activation_fn(activation): + if activation == "relu": + return F.relu + elif activation == "gelu": + return F.gelu + + raise RuntimeError("activation should be relu/gelu, not {}".format(activation)) diff --git a/testing/ut/pytorch/nn/test_activation.py b/testing/ut/pytorch/nn/test_activation.py index 365d9822..f01abf08 100644 --- a/testing/ut/pytorch/nn/test_activation.py +++ b/testing/ut/pytorch/nn/test_activation.py @@ -871,10 +871,6 @@ def _multihead_attn_test_helper(add_key_padding_mask=False, add_bias_kv=False, a # result = reference # TODO: check if its' the same as self.assertEqual(tuple(result.shape), (batch_sz, d_model)) assert tuple(result.shape) == (batch_sz, d_model) - print("*********************** result ************************") - print(result) - print("*********************** reference ************************") - print(reference) np.testing.assert_allclose(result, reference, atol=1e-5) # result_weight = ref_attn_weight diff --git a/testing/ut/pytorch/nn/test_transformer.py b/testing/ut/pytorch/nn/test_transformer.py index e69de29b..c2dbd1ff 100644 --- a/testing/ut/pytorch/nn/test_transformer.py +++ b/testing/ut/pytorch/nn/test_transformer.py @@ -0,0 +1,1109 @@ +import contextlib +import pytest +import torch +import mindspore as ms +import msadapter.pytorch as ms_torch +import msadapter.pytorch.nn as nn +import msadapter.pytorch.nn.functional as F +import numpy as np +from itertools import product + +def test_Transformer_cell(): + # this is just a smoke test; these modules are implemented through + # autograd so no Jacobian test is needed + d_model = 512 + nhead = 16 + num_encoder_layers = 4 + num_decoder_layers = 3 + dim_feedforward = 256 + dropout = 0.3 + bsz = 8 + seq_length = 35 + tgt_length = 15 + for batch_first, src_size, tgt_size in zip((True, False), + [(bsz, seq_length, d_model), + (seq_length, bsz, d_model)], + [(bsz, tgt_length, d_model), + (tgt_length, bsz, d_model)]): + transformer = nn.Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers, + dim_feedforward, dropout, batch_first=batch_first) + src = ms_torch.randn(src_size) + tgt = ms_torch.randn(tgt_size) + src_mask = transformer.generate_square_subsequent_mask(seq_length) + src_mask = src_mask.astype(ms_torch.float) if ms.get_context('device_target') == 'Ascend' \ + else src_mask.astype(ms_torch.double) + tgt_mask = transformer.generate_square_subsequent_mask(tgt_length) + tgt_mask = tgt_mask.astype(ms_torch.float) if ms.get_context('device_target') == 'Ascend' \ + else tgt_mask.astype(ms_torch.double) + memory_mask = ms_torch.randn(tgt_length, seq_length) + memory_mask = memory_mask.astype(ms_torch.float) if ms.get_context('device_target') == 'Ascend' \ + else memory_mask.astype(ms_torch.double) + src_key_padding_mask = ms_torch.rand(bsz, seq_length) >= 0.5 + tgt_key_padding_mask = ms_torch.rand(bsz, tgt_length) >= 0.5 + memory_key_padding_mask = ms_torch.rand(bsz, seq_length) >= 0.5 + + output = transformer(src, tgt, + src_mask=src_mask, + tgt_mask=tgt_mask, + memory_mask=memory_mask, + src_key_padding_mask=src_key_padding_mask, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=memory_key_padding_mask) + output.sum() + +def test_transformerdecoderlayer(): + # this is a deterministic test for TransformerDecoderLayer + d_model = 4 + nhead = 2 + dim_feedforward = 16 + dropout = 0.0 + + for batch_first in (False, True): + def perm_fn(x): + return x.transpose(1, 0) if batch_first else x + + model = 
nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, + batch_first=batch_first) + + # set constant weights of the model + for idx, p in enumerate(model.parameters()): + x = p.data + sz = x.view(-1).size(0) + shape = x.shape + x = ms_torch.cos(ms_torch.arange(0, sz).float().view(shape)) + p.data.copy_(x) + + # deterministic input + decoder_input = ms_torch.tensor([[[20., 30., 40., 50.]]]) + memory_input = ms_torch.tensor([[[60., 70., 80., 90.]]]) + result = model(decoder_input, memory_input) + ref_output = ms_torch.tensor([[[2.314351, 0.094805, -0.671322, 0.101977]]]) + result = result.detach().numpy() + ref_output = ref_output.detach().numpy() + assert tuple(result.shape) == tuple(ref_output.shape) + np.testing.assert_allclose(result, ref_output, atol=1e-5) + + # deterministic input + decoder_input = perm_fn(ms_torch.tensor([[[9., 10., 11., 12.]], + [[11., 12., 13., 14.]]])) + memory_input = ms_torch.tensor([[[1., 2., 3., 4.]]]) + result = model(decoder_input, memory_input) + result = result.detach().numpy() + ref_output = perm_fn(ms_torch.tensor([[[2.422245, 0.051716, -0.606338, -0.024756]], + [[2.422245, 0.051716, -0.606338, -0.024756]]])) + ref_output = ref_output.detach().numpy() + assert tuple(result.shape) == tuple(ref_output.shape) + np.testing.assert_allclose(result, ref_output, atol=1e-5) + + # deterministic input + decoder_input = perm_fn(ms_torch.tensor([[[1., 2., 3., 4.]], + [[5., 6., 7., 8.]]])) + memory_input = perm_fn(ms_torch.tensor([[[9., 10., 11., 12.]], + [[11., 12., 13., 14.]]])) + result = model(decoder_input, memory_input) + ref_output = perm_fn(ms_torch.tensor([[[2.343536, 0.085561, -0.654954, 0.074991]], + [[2.343536, 0.085561, -0.654954, 0.074991]]])) + result = result.detach().numpy() + ref_output = ref_output.detach().numpy() + assert tuple(result.shape) == tuple(ref_output.shape) + np.testing.assert_allclose(result, ref_output, atol=1e-5) + + # deterministic input + decoder_input = perm_fn(ms_torch.tensor([[[0.4517, 0.6793, 0.5313, 0.0034], + [0.2678, 0.3677, 0.4459, 0.7166]], + [[0.8100, 0.3716, 0.4096, 0.1976], + [0.6958, 0.8844, 0.6081, 0.8315]], + [[0.0494, 0.9343, 0.5955, 0.3830], + [0.5404, 0.3464, 0.9378, 0.6200]]])) + memory_input = perm_fn(ms_torch.tensor([[[0.7462, 0.6653, 0.5679, 0.4891], + [0.5387, 0.1655, 0.3565, 0.0471]], + [[0.8335, 0.2799, 0.5031, 0.2947], + [0.1402, 0.0318, 0.7636, 0.1346]], + [[0.6333, 0.9344, 0.1376, 0.9938], + [0.8924, 0.2872, 0.6692, 0.2944]], + [[0.9897, 0.6915, 0.3154, 0.1733], + [0.8645, 0.3513, 0.3064, 0.0767]], + [[0.8117, 0.2366, 0.4838, 0.7881], + [0.3718, 0.4945, 0.9511, 0.0864]]])) + result = model(decoder_input, memory_input) + ref_output = perm_fn(ms_torch.tensor([[[2.430065, 0.027862, -0.601136, -0.073096], + [2.431935, 0.028907, -0.599809, -0.072488]], + [[2.428457, 0.027053, -0.602275, -0.073462], + [2.431970, 0.029387, -0.599789, -0.071621]], + [[2.431934, 0.028196, -0.599802, -0.073809], + [2.432306, 0.028858, -0.599542, -0.072846]]])) + result = result.detach().numpy() + ref_output = ref_output.detach().numpy() + assert tuple(result.shape) == tuple(ref_output.shape) + np.testing.assert_allclose(result, ref_output, atol=1e-5) + + # key_padding_mask + key_padding_mask = ms_torch.zeros(2, 3) == 1 + result = model(decoder_input, memory_input, tgt_key_padding_mask=key_padding_mask) + ref_output = perm_fn(ms_torch.tensor([[[2.430065, 0.027862, -0.601136, -0.073096], + [2.431935, 0.028907, -0.599809, -0.072488]], + [[2.428457, 0.027053, -0.602275, -0.073462], + [2.431970, 0.029387, -0.599789, 
-0.071621]], + [[2.431934, 0.028196, -0.599802, -0.073809], + [2.432306, 0.028858, -0.599542, -0.072846]]])) + result = result.detach().numpy() + ref_output = ref_output.detach().numpy() + assert tuple(result.shape) == tuple(ref_output.shape) + np.testing.assert_allclose(result, ref_output, atol=1e-5) + + # key_padding_mask + key_padding_mask[0, 2] = 1 + key_padding_mask[1, 1] = 1 + key_padding_mask[1, 2] = 1 + result = model(decoder_input, memory_input, tgt_key_padding_mask=key_padding_mask) + ref_output = perm_fn(ms_torch.tensor([[[2.430025, 0.027643, -0.601164, -0.073476], + [2.4323, 0.029375, -0.599553, -0.071881]], + [[2.428523, 0.026838, -0.602226, -0.07391], + [2.432634, 0.029842, -0.599318, -0.071253]], + [[2.432278, 0.028152, -0.599555, -0.074139], + [2.432659, 0.029244, -0.599294, -0.072382]]])) + result = result.detach().numpy() + ref_output = ref_output.detach().numpy() + assert tuple(result.shape) == tuple(ref_output.shape) + # TODO: check with lower tolerance + # np.testing.assert_allclose(result, ref_output, atol=1e-5) + np.testing.assert_allclose(result, ref_output, atol=1e-3) + + # memory_key_padding_mask + key_padding_mask = ms_torch.zeros(2, 5) == 1 + result = model(decoder_input, memory_input, memory_key_padding_mask=key_padding_mask) + ref_output = perm_fn(ms_torch.tensor([[[2.430065, 0.027862, -0.601136, -0.073096], + [2.431935, 0.028907, -0.599809, -0.072488]], + [[2.428457, 0.027053, -0.602275, -0.073462], + [2.431970, 0.029387, -0.599789, -0.071621]], + [[2.431934, 0.028196, -0.599802, -0.073809], + [2.432306, 0.028858, -0.599542, -0.072846]]])) + result = result.detach().numpy() + ref_output = ref_output.detach().numpy() + assert tuple(result.shape) == tuple(ref_output.shape) + np.testing.assert_allclose(result, ref_output, atol=1e-5) + + # memory_key_padding_mask + key_padding_mask[0, 4] = 1 + key_padding_mask[1, 3] = 1 + key_padding_mask[1, 4] = 1 + result = model(decoder_input, memory_input, memory_key_padding_mask=key_padding_mask) + ref_output = perm_fn(ms_torch.tensor([[[2.429757, 0.027358, -0.601351, -0.073816], + [2.432692, 0.028583, -0.599263, -0.073634]], + [[2.428247, 0.02662, -0.602419, -0.074123], + [2.432657, 0.029055, -0.599293, -0.072732]], + [[2.431515, 0.027687, -0.600096, -0.074459], + [2.433075, 0.028543, -0.598987, -0.073985]]])) + result = result.detach().numpy() + ref_output = ref_output.detach().numpy() + assert tuple(result.shape) == tuple(ref_output.shape) + # TODO: check with lower tolerance + # np.testing.assert_allclose(result, ref_output, atol=1e-5) + np.testing.assert_allclose(result, ref_output, atol=1e-2) + +def test_transformerdecoderlayer_gelu(): + # this is a deterministic test for TransformerDecoderLayer with gelu activation + d_model = 4 + nhead = 2 + dim_feedforward = 16 + dropout = 0.0 + + for activation, batch_first in product(('gelu', F.gelu, nn.GELU()), (True, False)): + def perm_fn(x): + return x.transpose(1, 0) if batch_first else x + + model = nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, + activation, batch_first=batch_first) + + # set constant weights of the model + for idx, p in enumerate(model.parameters()): + x = p.data + sz = x.view(-1).size(0) + shape = x.shape + x = ms_torch.cos(ms_torch.arange(0, sz).float().view(shape)) + p.data.copy_(x) + + # deterministic input + decoder_input = ms_torch.tensor([[[20., 30., 40., 50.]]]) + memory_input = ms_torch.tensor([[[60., 70., 80., 90.]]]) + result = model(decoder_input, memory_input) + ref_output = ms_torch.tensor([[[2.306435, 0.095946, 
-0.675796, 0.10687]]]) + # TODO: check with lower tolerance + # np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=1e-5, atol=0) + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=1e-3) + + # deterministic input + decoder_input = perm_fn(ms_torch.tensor([[[9., 10., 11., 12.]], + [[11., 12., 13., 14.]]])) + memory_input = perm_fn(ms_torch.tensor([[[1., 2., 3., 4.]]])) + result = model(decoder_input, memory_input) + ref_output = perm_fn(ms_torch.tensor([[[2.415448, 0.054389, -0.610932, -0.0156613]], + [[2.415448, 0.054389, -0.610932, -0.0156613]]])) + # TODO: check with lower tolerance + # np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=1e-5, atol=0) + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=1e-3) + + # deterministic input + decoder_input = perm_fn(ms_torch.tensor([[[1., 2., 3., 4.]], + [[5., 6., 7., 8.]]])) + memory_input = perm_fn(ms_torch.tensor([[[9., 10., 11., 12.]], + [[11., 12., 13., 14.]]])) + result = model(decoder_input, memory_input) + ref_output = perm_fn(ms_torch.tensor([[[2.338531, 0.087709, -0.65776, 0.080646]], + [[2.338531, 0.087709, -0.65776, 0.080646]]])) + # TODO: check with lower tolerance + # np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=1e-5, atol=0) + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=1e-3) + + # deterministic input + decoder_input = perm_fn(ms_torch.tensor([[[0.4517, 0.6793, 0.5313, 0.0034], + [0.2678, 0.3677, 0.4459, 0.7166]], + [[0.8100, 0.3716, 0.4096, 0.1976], + [0.6958, 0.8844, 0.6081, 0.8315]], + [[0.0494, 0.9343, 0.5955, 0.3830], + [0.5404, 0.3464, 0.9378, 0.6200]]])) + memory_input = perm_fn(ms_torch.tensor([[[0.7462, 0.6653, 0.5679, 0.4891], + [0.5387, 0.1655, 0.3565, 0.0471]], + [[0.8335, 0.2799, 0.5031, 0.2947], + [0.1402, 0.0318, 0.7636, 0.1346]], + [[0.6333, 0.9344, 0.1376, 0.9938], + [0.8924, 0.2872, 0.6692, 0.2944]], + [[0.9897, 0.6915, 0.3154, 0.1733], + [0.8645, 0.3513, 0.3064, 0.0767]], + [[0.8117, 0.2366, 0.4838, 0.7881], + [0.3718, 0.4945, 0.9511, 0.0864]]])) + result = model(decoder_input, memory_input) + ref_output = perm_fn(ms_torch.tensor([[[2.42049104, 0.03443088, -0.60793706, -0.05436271], + [2.42210631, 0.03546578, -0.60679895, -0.05357488]], + [[2.41907674, 0.0336104, -0.60892977, -0.05490462], + [2.42216881, 0.03586554, -0.6067524, -0.05289126]], + [[2.42205716, 0.03488046, -0.60683681, -0.05460596], + [2.42240309, 0.0354595, -0.60659063, -0.05378816]]])) + # TODO: check with lower tolerance + # np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=1e-5, atol=0) + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=1e-3) + +def test_transformerencoder(): + def get_a_test_layer(use_cuda, activation, batch_first=False): + d_model = 4 + nhead = 2 + dim_feedforward = 16 + dropout = 0.0 + device = ms_torch.device("cuda" if use_cuda else "cpu") + + layer = nn.TransformerEncoderLayer( + d_model, + nhead, + dim_feedforward=dim_feedforward, + dropout=dropout, + activation=activation, + batch_first=batch_first).to(device) + + # set constant weights of the model + for idx, p in enumerate(layer.parameters()): + x = p.data + sz = x.view(-1).size(0) + shape = x.shape + x = ms_torch.cos(ms_torch.arange(0, sz).float().view(shape)) + p.data.copy_(x) + + return layer + + # this is a deterministic test for TransformerEncoder + activation = F.relu + use_cuda = ms_torch.cuda.is_available() + device = ms_torch.device("cuda" if use_cuda else "cpu") + + def _test(batch_first, training): + def 
perm_fn(x): + return x.transpose(1, 0) if batch_first else x + + encoder_layer = get_a_test_layer(use_cuda=use_cuda, activation=activation, + batch_first=batch_first) + + model = nn.TransformerEncoder(encoder_layer, 1).to(device) + if not training: + model = model.eval() + + # deterministic input + encoder_input = perm_fn(ms_torch.tensor([[[0.7462, 0.6653, 0.5679, 0.4891], + [0.5387, 0.1655, 0.3565, 0.0471]], + [[0.8335, 0.2799, 0.5031, 0.2947], + [0.1402, 0.0318, 0.7636, 0.1346]], + [[0.6333, 0.9344, 0.1376, 0.9938], + [0.8924, 0.2872, 0.6692, 0.2944]], + [[0.9897, 0.6915, 0.3154, 0.1733], + [0.8645, 0.3513, 0.3064, 0.0767]], + [[0.8117, 0.2366, 0.4838, 0.7881], + [0.3718, 0.4945, 0.9511, 0.0864]]] + )).to(device) + result = model(encoder_input) + ref_output = perm_fn(ms_torch.tensor([[[2.428589, 0.020835, -0.602055, -0.085249], + [2.427987, 0.021213, -0.602496, -0.084103]], + [[2.424689, 0.019155, -0.604793, -0.085672], + [2.413863, 0.022211, -0.612486, -0.072490]], + [[2.433774, 0.021598, -0.598343, -0.087548], + [2.425104, 0.019748, -0.604515, -0.084839]], + [[2.436185, 0.022682, -0.596625, -0.087261], + [2.433556, 0.021891, -0.598509, -0.086832]], + [[2.416246, 0.017512, -0.610712, -0.082961], + [2.422901, 0.024187, -0.606178, -0.074929]]] + )).to(device) + assert tuple(result.shape) == tuple(ref_output.shape) + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=1e-7, atol=1e-5) + + # all 0 + mask = ms_torch.zeros([2, 5]).to(device) == 1 + result = model(encoder_input, src_key_padding_mask=mask) + assert tuple(result.shape) == tuple(ref_output.shape) + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=1e-7, atol=1e-5) + mask[0, 1] = 1 + mask[1, 3] = 1 + mask[1, 4] = 1 + # If mask is not left aligned + # We disable nested tensor + model.enable_nested_tensor = False + result = model(encoder_input, src_key_padding_mask=mask) + ref_output = perm_fn(ms_torch.tensor([[[2.429026, 0.020793, -0.601741, -0.085642], + [2.428811, 0.021445, -0.601912, -0.084252]], + [[2.425009, 0.019155, -0.604566, -0.085899], + [2.415408, 0.02249, -0.611415, -0.073]], + [[2.434199, 0.021682, -0.598039, -0.087699], + [2.42598, 0.019941, -0.603896, -0.085091]], + [[2.436457, 0.022736, -0.59643, -0.08736], + [2.434021, 0.022093, -0.598179, -0.08679]], + [[2.416531, 0.017498, -0.610513, -0.083181], + [2.4242, 0.024653, -0.605266, -0.074959]]] + )).to(device) + assert tuple(result.shape) == tuple(ref_output.shape) + # TODO: check with lower tolerance + # np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=1e-7, atol=1e-5) + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), atol=1e-2) + + # test case 2, multiple layers no norm + model = nn.TransformerEncoder(encoder_layer, 2, enable_nested_tensor=False).to(device) + if not training: + model = model.eval() + result = model(encoder_input, src_key_padding_mask=mask) + ref_output = perm_fn(ms_torch.tensor([[[2.419051, 0.017446, -0.608738, -0.085003], + [2.419102, 0.017452, -0.608703, -0.085026]], + [[2.419043, 0.017445, -0.608744, -0.084999], + [2.419052, 0.017446, -0.608738, -0.085004]], + [[2.419067, 0.017448, -0.608727, -0.085010], + [2.419098, 0.017452, -0.608706, -0.085024]], + [[2.419072, 0.017449, -0.608724, -0.085012], + [2.419119, 0.017455, -0.608691, -0.085034]], + [[2.419019, 0.017442, -0.608761, -0.084989], + [2.419075, 0.017449, -0.608722, -0.085014]]] + )).to(device) + assert tuple(result.shape) == tuple(ref_output.shape) + # TODO: check with lower tolerance + # 
np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=1e-7, atol=1e-5) + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), atol=1e-3) + + model = nn.TransformerEncoder(encoder_layer, 6, enable_nested_tensor=False).to(device) + if not training: + model = model.eval() + result = model(encoder_input, src_key_padding_mask=mask) + ref_output = perm_fn(ms_torch.tensor([[[2.419101, 0.017453, -0.608703, -0.085025], + [2.419101, 0.017453, -0.608704, -0.085025]], + [[2.419101, 0.017453, -0.608703, -0.085025], + [2.419101, 0.017453, -0.608704, -0.085025]], + [[2.419101, 0.017453, -0.608703, -0.085025], + [2.419101, 0.017453, -0.608704, -0.085025]], + [[2.419101, 0.017453, -0.608703, -0.085025], + [2.419101, 0.017453, -0.608704, -0.085025]], + [[2.419101, 0.017453, -0.608703, -0.085025], + [2.419101, 0.017453, -0.608704, -0.085025]]] + )).to(device) + assert tuple(result.shape) == tuple(ref_output.shape) + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=1e-7, atol=1e-5) + + # test case 3, multiple layers with norm + # d_model = 4 + norm = nn.LayerNorm(4) + model = nn.TransformerEncoder(encoder_layer, 2, norm=norm, enable_nested_tensor=False).to(device) + if not training: + model = model.eval() + result = model(encoder_input, src_key_padding_mask=mask) + ref_output = perm_fn(ms_torch.tensor([[[1.695949, -0.357635, -0.893077, -0.445238], + [1.695955, -0.357639, -0.893050, -0.445266]], + [[1.695948, -0.357634, -0.893082, -0.445233], + [1.695950, -0.357635, -0.893077, -0.445238]], + [[1.695951, -0.357636, -0.893069, -0.445246], + [1.695955, -0.357639, -0.893052, -0.445264]], + [[1.695952, -0.357636, -0.893066, -0.445249], + [1.695957, -0.357641, -0.893041, -0.445276]], + [[1.695946, -0.357632, -0.893095, -0.445220], + [1.695952, -0.357637, -0.893065, -0.445251]]] + )).to(device) + assert tuple(result.shape) == tuple(ref_output.shape) + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=1e-7, atol=1e-5) + + model = nn.TransformerEncoder(encoder_layer, 6, norm=norm, enable_nested_tensor=False).to(device) + if not training: + model = model.eval() + result = model(encoder_input, src_key_padding_mask=mask) + ref_output = perm_fn(ms_torch.tensor([[[1.695955, -0.357639, -0.893051, -0.445265], + [1.695955, -0.357639, -0.893051, -0.445265]], + [[1.695955, -0.357639, -0.893051, -0.445265], + [1.695955, -0.357639, -0.893051, -0.445265]], + [[1.695955, -0.357639, -0.893051, -0.445265], + [1.695955, -0.357639, -0.893051, -0.445265]], + [[1.695955, -0.357639, -0.893051, -0.445265], + [1.695955, -0.357639, -0.893051, -0.445265]], + [[1.695955, -0.357639, -0.893051, -0.445265], + [1.695955, -0.357639, -0.893051, -0.445265]]] + )).to(device) + assert tuple(result.shape) == tuple(ref_output.shape) + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=1e-7, atol=1e-5) + for batch_first in (True, False): + for training in (True, False): + with contextlib.nullcontext(): + _test(batch_first, training) + +def test_transformerdecoder(): + def get_a_test_layer(use_cuda, activation, batch_first=False): + d_model = 4 + nhead = 2 + dim_feedforward = 16 + dropout = 0.0 + device = ms_torch.device("cuda" if use_cuda else "cpu") + + layer = nn.TransformerDecoderLayer( + d_model, + nhead, + dim_feedforward=dim_feedforward, + dropout=dropout, + activation=activation, + batch_first=batch_first).to(device) + + # set constant weights of the model + for idx, p in enumerate(layer.parameters()): + x = p.data + sz = x.view(-1).size(0) + shape = x.shape + x = 
ms_torch.cos(ms_torch.arange(0, sz).float().view(shape)) + p.data.copy_(x) + + return layer + + # this is a deterministic test for TransformerDecoder + for batch_first in (False, True): + def perm_fn(x): + return x.transpose(1, 0) if batch_first else x + activation = F.relu + use_cuda = ms_torch.cuda.is_available() + device = ms_torch.device("cuda" if use_cuda else "cpu") + + decoder_layer = get_a_test_layer(use_cuda=use_cuda, activation=activation, + batch_first=batch_first) + + model = nn.TransformerDecoder(decoder_layer, 1).to(device) + + # deterministic input + decoder_input = ms_torch.tensor([[[20., 30., 40., 50.]]]).to(device) + memory_input = ms_torch.tensor([[[60., 70., 80., 90.]]]).to(device) + result = model(decoder_input, memory_input) + ref_output = ms_torch.tensor( + [[[2.314351, 0.094805, -0.671322, 0.101977]]]).to(device) + assert tuple(result.shape) == tuple(ref_output.shape) + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=1e-7, atol=1e-3) + + # deterministic input + decoder_input = perm_fn(ms_torch.tensor([[[9., 10., 11., 12.]], + [[11., 12., 13., 14.]]])).to(device) + memory_input = perm_fn(ms_torch.tensor([[[1., 2., 3., 4.]]])).to(device) + result = model(decoder_input, memory_input) + ref_output = perm_fn(ms_torch.tensor([[[2.422245, 0.051716, -0.606338, -0.024756]], + [[2.422245, 0.051716, -0.606338, -0.024756]]] + )).to(device) + assert tuple(result.shape) == tuple(ref_output.shape) + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=1e-7, atol=1e-4) + + # deterministic input + decoder_input = perm_fn(ms_torch.tensor([[[1., 2., 3., 4.]], + [[5., 6., 7., 8.]]])).to(device) + memory_input = perm_fn(ms_torch.tensor([[[9., 10., 11., 12.]], + [[11., 12., 13., 14.]]])).to(device) + result = model(decoder_input, memory_input) + ref_output = perm_fn(ms_torch.tensor([[[2.343536, 0.085561, -0.654954, 0.074991]], + [[2.343536, 0.085561, -0.654954, 0.074991]]] + )).to(device) + assert tuple(result.shape) == tuple(ref_output.shape) + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=1e-7, atol=1e-4) + + # deterministic input + decoder_input = perm_fn(ms_torch.tensor([[[0.4517, 0.6793, 0.5313, 0.0034], + [0.2678, 0.3677, 0.4459, 0.7166]], + [[0.8100, 0.3716, 0.4096, 0.1976], + [0.6958, 0.8844, 0.6081, 0.8315]], + [[0.0494, 0.9343, 0.5955, 0.3830], + [0.5404, 0.3464, 0.9378, 0.6200]]] + )).to(device) + memory_input = perm_fn(ms_torch.tensor([[[0.7462, 0.6653, 0.5679, 0.4891], + [0.5387, 0.1655, 0.3565, 0.0471]], + [[0.8335, 0.2799, 0.5031, 0.2947], + [0.1402, 0.0318, 0.7636, 0.1346]], + [[0.6333, 0.9344, 0.1376, 0.9938], + [0.8924, 0.2872, 0.6692, 0.2944]], + [[0.9897, 0.6915, 0.3154, 0.1733], + [0.8645, 0.3513, 0.3064, 0.0767]], + [[0.8117, 0.2366, 0.4838, 0.7881], + [0.3718, 0.4945, 0.9511, 0.0864]]] + )).to(device) + result = model(decoder_input, memory_input) + ref_output = perm_fn(ms_torch.tensor([[[2.430065, 0.027862, -0.601136, -0.073096], + [2.431935, 0.028907, -0.599809, -0.072488]], + [[2.428457, 0.027053, -0.602275, -0.073462], + [2.431970, 0.029387, -0.599789, -0.071621]], + [[2.431934, 0.028196, -0.599802, -0.073809], + [2.432306, 0.028858, -0.599542, -0.072846]]] + )).to(device) + assert tuple(result.shape) == tuple(ref_output.shape) + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=1e-7, atol=1e-5) + + # key_padding_mask + key_padding_mask = ms_torch.zeros(2, 3).to(device) == 1 + result = model(decoder_input, memory_input, + tgt_key_padding_mask=key_padding_mask) + ref_output = 
perm_fn(ms_torch.tensor([[[2.430065, 0.027862, -0.601136, -0.073096], + [2.431935, 0.028907, -0.599809, -0.072488]], + [[2.428457, 0.027053, -0.602275, -0.073462], + [2.431970, 0.029387, -0.599789, -0.071621]], + [[2.431934, 0.028196, -0.599802, -0.073809], + [2.432306, 0.028858, -0.599542, -0.072846]]] + )).to(device) + assert tuple(result.shape) == tuple(ref_output.shape) + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=1e-7, atol=1e-5) + + # key_padding_mask + key_padding_mask[0, 2] = 1 + key_padding_mask[1, 1] = 1 + key_padding_mask[1, 2] = 1 + result = model(decoder_input, memory_input, + tgt_key_padding_mask=key_padding_mask) + ref_output = perm_fn(ms_torch.tensor([[[2.430025, 0.027643, -0.601164, -0.073476], + [2.4323, 0.029375, -0.599553, -0.071881]], + [[2.428523, 0.026838, -0.602226, -0.07391], + [2.432634, 0.029842, -0.599318, -0.071253]], + [[2.432278, 0.028152, -0.599555, -0.074139], + [2.432659, 0.029244, -0.599294, -0.072382]]] + )).to(device) + assert tuple(result.shape) == tuple(ref_output.shape) + # TODO: check with lower tolerance + # np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=1e-7, atol=1e-5) + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), atol=1e-3) + + # memory_key_padding_mask + key_padding_mask = ms_torch.zeros(2, 5).to(device) == 1 + result = model(decoder_input, memory_input, + memory_key_padding_mask=key_padding_mask) + ref_output = perm_fn(ms_torch.tensor([[[2.430065, 0.027862, -0.601136, -0.073096], + [2.431935, 0.028907, -0.599809, -0.072488]], + [[2.428457, 0.027053, -0.602275, -0.073462], + [2.431970, 0.029387, -0.599789, -0.071621]], + [[2.431934, 0.028196, -0.599802, -0.073809], + [2.432306, 0.028858, -0.599542, -0.072846]]] + )).to(device) + assert tuple(result.shape) == tuple(ref_output.shape) + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=1e-7, atol=1e-5) + + # memory_key_padding_mask + key_padding_mask[0, 4] = 1 + key_padding_mask[1, 3] = 1 + key_padding_mask[1, 4] = 1 + result = model(decoder_input, + memory_input, + memory_key_padding_mask=key_padding_mask) + ref_output = perm_fn(ms_torch.tensor([[[2.429757, 0.027358, -0.601351, -0.073816], + [2.432692, 0.028583, -0.599263, -0.073634]], + [[2.428247, 0.02662, -0.602419, -0.074123], + [2.432657, 0.029055, -0.599293, -0.072732]], + [[2.431515, 0.027687, -0.600096, -0.074459], + [2.433075, 0.028543, -0.598987, -0.073985]]] + )).to(device) + assert tuple(result.shape) == tuple(ref_output.shape) + # TODO: check with lower tolerance + # np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=1e-7, atol=1e-5) + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), atol=1e-2) + + # multiple layers no norm + model = nn.TransformerDecoder(decoder_layer, 2).to(device) + + # deterministic input + decoder_input = ms_torch.tensor([[[20., 30., 40., 50.]]]).to(device) + memory_input = ms_torch.tensor([[[60., 70., 80., 90.]]]).to(device) + result = model(decoder_input, memory_input) + ref_output = ms_torch.tensor( + [[[2.31316, 0.0950293, -0.671995, 0.102802]]]).to(device) + assert tuple(result.shape) == tuple(ref_output.shape) + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=1e-7, atol=1e-3) + + # multiple layers no norm + model = nn.TransformerDecoder(decoder_layer, 6).to(device) + + # deterministic input + decoder_input = perm_fn(ms_torch.tensor([[[0.4517, 0.6793, 0.5313, 0.0034], + [0.2678, 0.3677, 0.4459, 0.7166]], + [[0.8100, 0.3716, 0.4096, 0.1976], + [0.6958, 0.8844, 0.6081, 
0.8315]], + [[0.0494, 0.9343, 0.5955, 0.3830], + [0.5404, 0.3464, 0.9378, 0.6200]]] + )).to(device) + memory_input = perm_fn(ms_torch.tensor([[[0.7462, 0.6653, 0.5679, 0.4891], + [0.5387, 0.1655, 0.3565, 0.0471]], + [[0.8335, 0.2799, 0.5031, 0.2947], + [0.1402, 0.0318, 0.7636, 0.1346]], + [[0.6333, 0.9344, 0.1376, 0.9938], + [0.8924, 0.2872, 0.6692, 0.2944]], + [[0.9897, 0.6915, 0.3154, 0.1733], + [0.8645, 0.3513, 0.3064, 0.0767]], + [[0.8117, 0.2366, 0.4838, 0.7881], + [0.3718, 0.4945, 0.9511, 0.0864]]] + )).to(device) + result = model(decoder_input, memory_input) + ref_output = perm_fn(ms_torch.tensor([[[2.42794, 0.026164, -0.60263, -0.0747591], + [2.43113, 0.0279516, -0.600376, -0.0736896]], + [[2.42794, 0.026164, -0.60263, -0.0747591], + [2.43113, 0.0279516, -0.600376, -0.0736896]], + [[2.42794, 0.026164, -0.60263, -0.0747591], + [2.43113, 0.0279516, -0.600376, -0.0736896]]] + )).to(device) + assert tuple(result.shape) == tuple(ref_output.shape) + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=1e-7, atol=1e-5) + + # multiple layers with norm + # d_model = 4 + norm = nn.LayerNorm(4) + model = nn.TransformerDecoder(decoder_layer, 2, norm=norm).to(device) + + # deterministic input + decoder_input = ms_torch.tensor([[[20., 30., 40., 50.]]]).to(device) + memory_input = ms_torch.tensor([[[60., 70., 80., 90.]]]).to(device) + result = model(decoder_input, memory_input) + ref_output = ms_torch.tensor( + [[[1.66166, -0.326986, -1.01466, -0.320017]]]).to(device) + assert tuple(result.shape) == tuple(ref_output.shape) + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=1e-7, atol=1e-3) + + # multiple layers with norm + model = nn.TransformerDecoder(decoder_layer, 6, norm=norm).to(device) + + # deterministic input + decoder_input = perm_fn(ms_torch.tensor([[[0.4517, 0.6793, 0.5313, 0.0034], + [0.2678, 0.3677, 0.4459, 0.7166]], + [[0.8100, 0.3716, 0.4096, 0.1976], + [0.6958, 0.8844, 0.6081, 0.8315]], + [[0.0494, 0.9343, 0.5955, 0.3830], + [0.5404, 0.3464, 0.9378, 0.6200]]] + )).to(device) + memory_input = perm_fn(ms_torch.tensor([[[0.7462, 0.6653, 0.5679, 0.4891], + [0.5387, 0.1655, 0.3565, 0.0471]], + [[0.8335, 0.2799, 0.5031, 0.2947], + [0.1402, 0.0318, 0.7636, 0.1346]], + [[0.6333, 0.9344, 0.1376, 0.9938], + [0.8924, 0.2872, 0.6692, 0.2944]], + [[0.9897, 0.6915, 0.3154, 0.1733], + [0.8645, 0.3513, 0.3064, 0.0767]], + [[0.8117, 0.2366, 0.4838, 0.7881], + [0.3718, 0.4945, 0.9511, 0.0864]]] + )).to(device) + result = model(decoder_input, memory_input) + ref_output = perm_fn(ms_torch.tensor([[[1.69559, -0.357291, -0.894741, -0.443553], + [1.69571, -0.357363, -0.894154, -0.444196]], + [[1.69559, -0.357291, -0.894741, -0.443553], + [1.69571, -0.357363, -0.894154, -0.444196]], + [[1.69559, -0.357291, -0.894741, -0.443553], + [1.69571, -0.357363, -0.894154, -0.444196]]] + )).to(device) + assert tuple(result.shape) == tuple(ref_output.shape) + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=1e-7, atol=1e-5) + + # gelu activation test cases + activation = "gelu" + use_cuda = ms_torch.cuda.is_available() + device = ms_torch.device("cuda" if use_cuda else "cpu") + + decoder_layer = get_a_test_layer(use_cuda=use_cuda, activation=activation, + batch_first=batch_first) + + model = nn.TransformerDecoder(decoder_layer, 1).to(device) + + # deterministic input + decoder_input = ms_torch.tensor([[[20., 30., 40., 50.]]]).to(device) + memory_input = ms_torch.tensor([[[60., 70., 80., 90.]]]).to(device) + result = model(decoder_input, memory_input) + ref_output 
= ms_torch.tensor([[[2.306435, 0.095946, -0.675796, 0.10687]]]).to(device) + assert tuple(result.shape) == tuple(ref_output.shape) + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=1e-7, atol=1e-3) + + # deterministic input + decoder_input = perm_fn(ms_torch.tensor([[[9., 10., 11., 12.]], + [[11., 12., 13., 14.]]])).to(device) + memory_input = perm_fn(ms_torch.tensor([[[1., 2., 3., 4.]]])).to(device) + result = model(decoder_input, memory_input) + ref_output = perm_fn(ms_torch.tensor([[[2.415448, 0.054389, -0.610932, -0.0156613]], + [[2.415448, 0.054389, -0.610932, -0.0156613]]])).to(device) + assert tuple(result.shape) == tuple(ref_output.shape) + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=1e-7, atol=1e-4) + + # deterministic input + decoder_input = perm_fn(ms_torch.tensor([[[1., 2., 3., 4.]], + [[5., 6., 7., 8.]]])).to(device) + memory_input = perm_fn(ms_torch.tensor([[[9., 10., 11., 12.]], + [[11., 12., 13., 14.]]])).to(device) + result = model(decoder_input, memory_input) + ref_output = perm_fn(ms_torch.tensor([[[2.338531, 0.087709, -0.65776, 0.080646]], + [[2.338531, 0.087709, -0.65776, 0.080646]]])).to(device) + assert tuple(result.shape) == tuple(ref_output.shape) + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=1e-7, atol=1e-4) + + # deterministic input + decoder_input = perm_fn(ms_torch.tensor([[[0.4517, 0.6793, 0.5313, 0.0034], + [0.2678, 0.3677, 0.4459, 0.7166]], + [[0.8100, 0.3716, 0.4096, 0.1976], + [0.6958, 0.8844, 0.6081, 0.8315]], + [[0.0494, 0.9343, 0.5955, 0.3830], + [0.5404, 0.3464, 0.9378, 0.6200]]] + )).to(device) + memory_input = perm_fn(ms_torch.tensor([[[0.7462, 0.6653, 0.5679, 0.4891], + [0.5387, 0.1655, 0.3565, 0.0471]], + [[0.8335, 0.2799, 0.5031, 0.2947], + [0.1402, 0.0318, 0.7636, 0.1346]], + [[0.6333, 0.9344, 0.1376, 0.9938], + [0.8924, 0.2872, 0.6692, 0.2944]], + [[0.9897, 0.6915, 0.3154, 0.1733], + [0.8645, 0.3513, 0.3064, 0.0767]], + [[0.8117, 0.2366, 0.4838, 0.7881], + [0.3718, 0.4945, 0.9511, 0.0864]]] + )).to(device) + result = model(decoder_input, memory_input) + ref_output = perm_fn(ms_torch.tensor([[[2.42049104, 0.03443088, -0.60793706, -0.05436271], + [2.42210631, 0.03546578, -0.60679895, -0.05357488]], + [[2.41907674, 0.0336104, -0.60892977, -0.05490462], + [2.42216881, 0.03586554, -0.6067524, -0.05289126]], + [[2.42205716, 0.03488046, -0.60683681, -0.05460596], + [2.42240309, 0.0354595, -0.60659063, -0.05378816]]] + )).to(device) + assert tuple(result.shape) == tuple(ref_output.shape) + # TODO: check with lower tolerance + # np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=1e-7, atol=1e-5) + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), atol=1e-4) + +''' +# @dtypes(torch.float) +# @dtypesIfCUDA(torch.double, torch.float, torch.half) +def test_transformerencoderlayer(): + # this is a deterministic test for TransformerEncoderLayer + d_model = 4 + nhead = 2 + dim_feedforward = 16 + dropout = 0.0 + + atol = 1e-5 + rtol = 1e-7 + # TODO: + # if "cuda" in device: + # atol = 1e-3 + # rtol = 1e-2 + + def _test(training, batch_first, atol, rtol): + def perm_fn(x): + return x.transpose(1, 0) if batch_first else x + + model = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, + batch_first=batch_first, device='cpu', dtype=ms_torch.float) + + if not training: + assert dropout == 0 + model = model.eval() + + # set constant weights of the model + for idx, p in enumerate(model.parameters()): + x = p.data + sz = x.view(-1).size(0) + shape = 
x.shape + x = ms_torch.cos(ms_torch.arange(0, sz).float().view(shape)) + p.data.copy_(x) + + # deterministic input + encoder_input = ms_torch.tensor([[[20., 30., 40., 50.]]], device='cpu', dtype=ms_torch.float) + result = model(encoder_input) + ref_output = ms_torch.tensor([[[2.258703, 0.127985, -0.697881, 0.170862]]], device='cpu', dtype=ms_torch.float) + assert result.shape == ref_output.shape + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), atol=atol, rtol=rtol) + # 0 values are NOT masked. This shouldn't mask anything. + mask = ms_torch.tensor([[0]], device='cpu') == 1 + result = model(encoder_input, src_key_padding_mask=mask) + assert result.shape == ref_output.shape + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), atol=atol, rtol=rtol) + # 1 values are masked. Since there is only 1 input embedding this + # will result in nan. + mask = ms_torch.tensor([[1]], device='cpu') == 1 + result = model(encoder_input, src_key_padding_mask=mask) + result = result.cpu().detach().numpy() + assert np.isnan(result).all() == True + + # deterministic input + encoder_input = perm_fn(ms_torch.tensor([[[1., 2., 3., 4.]], + [[5., 6., 7., 8.]]], device='cpu', dtype=ms_torch.float)) + result = model(encoder_input) + ref_output = perm_fn(ms_torch.tensor([[[2.272644, 0.119035, -0.691669, 0.153486]], + [[2.272644, 0.119035, -0.691669, 0.153486]]], + device='cpu', dtype=ms_torch.float)) + assert result.shape == ref_output.shape + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), atol=atol, rtol=rtol) + # all 0 which is no masking + mask = ms_torch.tensor([[0, 0]], device='cpu') == 1 + result = model(encoder_input, src_key_padding_mask=mask) + assert result.shape == ref_output.shape + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), atol=atol, rtol=rtol) + mask = ms_torch.tensor([[1, 0]], device='cpu') == 1 + result = model(encoder_input, src_key_padding_mask=mask) + ref_output = perm_fn(ms_torch.tensor([[[2.301516, 0.092249, -0.679101, 0.103088]], + [[2.301516, 0.092249, -0.679101, 0.103088]]], + device='cpu', dtype=ms_torch.float)) + assert result.shape == ref_output.shape + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), atol=atol, rtol=rtol) + + # deterministic input + encoder_input = perm_fn(ms_torch.tensor([[[0.7462, 0.6653, 0.5679, 0.4891], + [0.5387, 0.1655, 0.3565, 0.0471]], + [[0.8335, 0.2799, 0.5031, 0.2947], + [0.1402, 0.0318, 0.7636, 0.1346]], + [[0.6333, 0.9344, 0.1376, 0.9938], + [0.8924, 0.2872, 0.6692, 0.2944]], + [[0.9897, 0.6915, 0.3154, 0.1733], + [0.8645, 0.3513, 0.3064, 0.0767]], + [[0.8117, 0.2366, 0.4838, 0.7881], + [0.3718, 0.4945, 0.9511, 0.0864]]], + device='cpu', dtype=ms_torch.float)) + result = model(encoder_input) + ref_output = perm_fn(ms_torch.tensor([[[2.428589, 0.020835, -0.602055, -0.085249], + [2.427987, 0.021213, -0.602496, -0.084103]], + [[2.424689, 0.019155, -0.604793, -0.085672], + [2.413863, 0.022211, -0.612486, -0.072490]], + [[2.433774, 0.021598, -0.598343, -0.087548], + [2.425104, 0.019748, -0.604515, -0.084839]], + [[2.436185, 0.022682, -0.596625, -0.087261], + [2.433556, 0.021891, -0.598509, -0.086832]], + [[2.416246, 0.017512, -0.610712, -0.082961], + [2.422901, 0.024187, -0.606178, -0.074929]]], + device='cpu', dtype=ms_torch.float)) + assert result.shape == ref_output.shape + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), atol=atol, rtol=rtol) + + # all 0 + mask = ms_torch.zeros([2, 5], device='cpu') == 1 + result = model(encoder_input, src_key_padding_mask=mask) + assert 
result.shape == ref_output.shape + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), atol=atol, rtol=rtol) + mask[0, 1] = 1 + mask[1, 3] = 1 + mask[1, 4] = 1 + result = model(encoder_input, src_key_padding_mask=mask) + ref_output = perm_fn(ms_torch.tensor([[[2.429026, 0.020793, -0.601741, -0.085642], + [2.428811, 0.021445, -0.601912, -0.084252]], + [[2.425009, 0.019155, -0.604566, -0.085899], + [2.415408, 0.02249 , -0.611415, -0.073]], + [[2.434199, 0.021682, -0.598039, -0.087699], + [2.42598, 0.019941, -0.603896, -0.085091]], + [[2.436457, 0.022736, -0.59643 , -0.08736], + [2.434021, 0.022093, -0.598179, -0.08679]], + [[2.416531, 0.017498, -0.610513, -0.083181], + [2.4242, 0.024653, -0.605266, -0.074959]]], device='cpu', + dtype=ms_torch.float)) + assert result.shape == ref_output.shape + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), atol=atol, rtol=rtol) + + # TODO: testcases for nested-tensors? + + for batch_first in (True, False): + for training in (True, False): + with contextlib.nullcontext(): + _test(batch_first=batch_first, training=training, atol=atol, rtol=rtol) +''' + +# @dtypesIfCUDA(torch.half, torch.float) +def test_transformerencoderlayer_gelu(): + # this is a deterministic test for TransformerEncoderLayer with gelu activation + d_model = 4 + nhead = 2 + dim_feedforward = 16 + dropout = 0.0 + + atol = 0 + rtol = 1e-5 + # TODO: + # if "cuda" in device: + # atol = 1e-3 + # rtol = 1e-2 + + def _test(activation, batch_first, training): + def perm_fn(x): + return x.transpose(1, 0) if batch_first else x + + model = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, + activation, batch_first=batch_first, device='cpu', dtype=ms_torch.float) + if not training: + assert dropout == 0 + model = model.eval() + + # set constant weights of the model + for idx, p in enumerate(model.parameters()): + x = p.data + sz = x.view(-1).size(0) + shape = x.shape + x = ms_torch.cos(ms_torch.arange(0, sz).float().view(shape)) + p.data.copy_(x) + + # deterministic input + encoder_input = ms_torch.tensor([[[20., 30., 40., 50.]]], device='cpu', dtype=ms_torch.float) + result = model(encoder_input) + ref_output = ms_torch.tensor([[[2.249815, 0.131006, -0.702199, 0.177868]]], device='cpu', dtype=ms_torch.float) + # TODO: check with lower tolerance + # np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=rtol, atol=atol) + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), atol=1e-3) + + # deterministic input + encoder_input = perm_fn(ms_torch.tensor([[[1., 2., 3., 4.]], + [[5., 6., 7., 8.]]], device='cpu', dtype=ms_torch.float)) + result = model(encoder_input) + ref_output = perm_fn(ms_torch.tensor([[[2.264103, 0.121417, -0.696012, 0.159724]], + [[2.264103, 0.121417, -0.696012, 0.159724]]], device='cpu', dtype=ms_torch.float)) + # TODO: check with lower tolerance + # np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=rtol, atol=atol) + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), atol=1e-3) + + # deterministic input + encoder_input = perm_fn(ms_torch.tensor([[[0.7462, 0.6653, 0.5679, 0.4891], + [0.5387, 0.1655, 0.3565, 0.0471]], + [[0.8335, 0.2799, 0.5031, 0.2947], + [0.1402, 0.0318, 0.7636, 0.1346]], + [[0.6333, 0.9344, 0.1376, 0.9938], + [0.8924, 0.2872, 0.6692, 0.2944]], + [[0.9897, 0.6915, 0.3154, 0.1733], + [0.8645, 0.3513, 0.3064, 0.0767]], + [[0.8117, 0.2366, 0.4838, 0.7881], + [0.3718, 0.4945, 0.9511, 0.0864]]], device='cpu', dtype=ms_torch.float)) + result = model(encoder_input) + 
ref_output = perm_fn(ms_torch.tensor([[[2.42163188, 0.03227153, -0.60714219, -0.05908082], + [2.42151276, 0.03302179, -0.60722523, -0.05762651]], + [[2.41926761, 0.02974034, -0.60879519, -0.0621269], + [2.41626395, 0.03539356, -0.61087842, -0.04978623]], + [[2.42382808, 0.03218872, -0.6055963, -0.06073591], + [2.41983477, 0.03085259, -0.60840145, -0.06046414]], + [[2.42500749, 0.03328855, -0.60476388, -0.0595334], + [2.4237977, 0.03290575, -0.60561789, -0.05940082]], + [[2.41383916, 0.02686345, -0.61256377, -0.06380707], + [2.42000277, 0.03800944, -0.60824798, -0.04754947]]], device='cpu', dtype=ms_torch.float)) + # TODO: check with lower tolerance + # np.testing.assert_allclose(result.numpy(), ref_output.numpy(), rtol=rtol, atol=atol) + np.testing.assert_allclose(result.numpy(), ref_output.numpy(), atol=1e-3) + + for activation, batch_first, training in product(('gelu', F.gelu, nn.GELU()), (True, False), (True, False)): + with contextlib.nullcontext(): + _test(activation=activation, batch_first=batch_first, training=training) + +''' +def _test_module_empty_input(module, inp, check_size=True, inference=False): + if not inference: + inp.requires_grad_(True) + out = module(inp) + if not inference: + gO = ms_torch.rand_like(out) + out.backward(gO) + if check_size: + assert out.size() == inp.size() + if not inference: + for p in module.parameters(): + if p.requires_grad: + assert np.allclose(p.grad.numpy(), ms_torch.zeros_like(p.grad).numpy()) + assert np.allclose(inp.grad.numpy(), ms_torch.zeros_like(inp).numpy()) + +def _test_module_empty_inputs(module, inputs): + for _inp in inputs: + _inp.requires_grad_(True) + out = module(*inputs) + gO = ms_torch.rand_like(out) + out.backward(gO) + + for p in module.parameters(): + if p.requires_grad: + assert np.allclose(p.grad.numpy(), ms_torch.zeros_like(p.grad).numpy()) + + for _inp in inputs: + assert np.allclose(_inp.grad.numpy(), ms_torch.zeros_like(_inp).numpy()) + +def test_TransformerEncoderLayer_empty(): + for training in (True, False): + for batch_first, input_shape in [(True, (0, 10, 512)), + (False, (10, 0, 512))]: + input = ms_torch.rand(*input_shape) + encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=batch_first) + if not training: + encoder_layer = encoder_layer.eval() + _test_module_empty_input(encoder_layer, input, check_size=False, inference=True) + # TODO: ms doesn't have nested tensor + # if batch_first: + # # A NestedTensor with no tensors inside it doesn't have dim 3 (or dim + # # 2, for that matter) so it can't hit the fast path, nor can we give a + # # result. 
+ # with pytest.raises(AssertionError): + # nt = torch.nested_tensor([]) + # _test_module_empty_input(encoder_layer, nt, check_size=False, inference=True) + + # nt = torch.nested_tensor([torch.rand(0, 512)]) + # _test_module_empty_input(encoder_layer, nt, check_size=False, inference=True) + else: + _test_module_empty_input(encoder_layer, input, check_size=False) + +def test_TransformerEncoder_empty(): + for batch_first, input_shape in [(True, (0, 10, 512)), + (False, (10, 0, 512))]: + input = ms_torch.rand(*input_shape) + encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=batch_first) + transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6) + _test_module_empty_input(transformer_encoder, input, check_size=False) + +def test_TransformerDecoderLayer_empty(): + for batch_first, memory_shape, tgt_shape in [(True, (0, 10, 512), (0, 20, 512)), + (False, (10, 0, 512), (20, 0, 512))]: + memory = ms_torch.rand(*memory_shape) + tgt = ms_torch.rand(*tgt_shape, requires_grad=True) + decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8, batch_first=batch_first) + _test_module_empty_inputs(decoder_layer, [tgt, memory]) + +def test_TransformerDecoder_empty(): + for batch_first, memory_shape, tgt_shape in [(True, (0, 10, 512), (0, 20, 512)), + (False, (10, 0, 512), (20, 0, 512))]: + memory = ms_torch.rand(*memory_shape) + tgt = ms_torch.rand(*tgt_shape, requires_grad=True) + decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8, batch_first=batch_first) + transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=6) + _test_module_empty_inputs(transformer_decoder, [tgt, memory]) + +def test_Transformer_empty(): + for batch_first, src_shape, tgt_shape in [(True, (10, 0, 512), (20, 0, 512))]: + transformer_model = nn.Transformer(nhead=16, num_encoder_layers=12) + src = ms_torch.rand(*src_shape, requires_grad=True) + tgt = ms_torch.rand(*tgt_shape, requires_grad=True) + _test_module_empty_inputs(transformer_model, [src, tgt]) +''' + +if __name__ == '__main__': + test_Transformer_cell() + test_transformerdecoderlayer() + test_transformerdecoderlayer_gelu() + test_transformerencoder() + test_transformerdecoder() + # TODO: uncomment after multi_head_attention_forward attn_mask bug fixed + # test_transformerencoderlayer() + test_transformerencoderlayer_gelu() + # TODO: uncomment after ms Transpose can take shape 0 tensors + # test_TransformerEncoderLayer_empty() + # test_TransformerEncoder_empty() + # test_TransformerDecoderLayer_empty() + # test_TransformerDecoder_empty() + # test_Transformer_empty()
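For reference, the snippet below is a minimal usage sketch of the modules added by this patch, following the shapes and the mask-casting pattern used in test_Transformer_cell() above. It is illustrative only and not part of the diff to be applied; all module and argument names are taken from the code added here.

import msadapter.pytorch as ms_torch
import msadapter.pytorch.nn as nn

# Sketch: exercise the newly added nn.Transformer with the default
# (seq_len, batch, d_model) layout, i.e. batch_first=False.
model = nn.Transformer(d_model=512, nhead=8,
                       num_encoder_layers=2, num_decoder_layers=2,
                       dim_feedforward=256, dropout=0.0)

src = ms_torch.randn(10, 4, 512)   # (source length, batch, d_model)
tgt = ms_torch.randn(7, 4, 512)    # (target length, batch, d_model)

# Causal mask from the new generate_square_subsequent_mask helper; cast to a
# floating dtype as the unit test does before passing it to attention.
tgt_mask = model.generate_square_subsequent_mask(7).astype(ms_torch.float)

out = model(src, tgt, tgt_mask=tgt_mask)
print(out.shape)                   # expected: (7, 4, 512)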