--- a/.gitignore
+++ b/.gitignore
@@ -33,4 +33,5 @@ sdist/
 var/
 wheels/
 #datasets/
 #mnist/
 #mnist/
 rank_*/
--- a/mindtorch/init.py
+++ b/mindtorch/init.py
@@ -1,6 +1,10 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-

 from mindspore._c_expression import jit_mode_pi_enable, update_pijit_default_config
 # update_pijit_default_config(auto_grad=True)
 jit_mode_pi_enable()

 from mindtorch import torch
 from mindtorch.utils import unsupported_attr, pynative_mode_condition
 from mindtorch.package_info import __version__, VERSION, version
--- a/mindtorch/torch/autograd/init.py
+++ b/mindtorch/torch/autograd/init.py
@@ -4,9 +4,10 @@
 from .variable import Variable
 from .function import Function
 from .grad_mode import *
 from .functional import *
 from . import functional

 # MindSpore's autodiff mechanism is different from PyTorch' autograd， so it cannot be fully benchmarked.
 # Users can directly use the autograd API of MindSpore.

 __all__ = ["Variable", "Function", 'grad_mode']
 __all__ = ["Variable", "Function", 'grad_mode', 'grad', 'value_and_grad']
--- a/mindtorch/torch/autograd/functional.py
+++ b/mindtorch/torch/autograd/functional.py
@@ -1,8 +1,10 @@
 import mindspore as ms
 from mindspore import grad as ms_grad, value_and_grad as ms_value_and_grad
 from mindtorch.utils import unsupported_attr
 from mindtorch.torch.tensor import cast_to_adapter_tensor, cast_to_ms_tensor
 from mindtorch.torch.tensor import cast_to_adapter_tensor, cast_to_ms_tensor, Tensor
 from mindtorch.torch.nn import Module

 __all__ = ['vjp', 'jvp', 'jacobian']
 __all__ = ['vjp', 'jvp', 'jacobian', 'grad', 'value_and_grad']

 def vjp(func, inputs, v=None, create_graph=False, strict=False):
    if strict is True or create_graph is True:
@@ -72,3 +74,33 @@ def jacobian(func, inputs, create_graph=False, strict=False, vectorize=False, st
        output = _op(inputs)

    return cast_to_adapter_tensor(output)

 def grad(fn, grad_position=0, weights=None, has_aux=False, return_ids=False):
    new_weights = []
    if weights:
        for param in weights:
            if isinstance(param, Tensor):
                new_weights.append(param.tensor)
            else:
                new_weights.append(param)
    if isinstance(fn, Module):
        def new_fn(*args, **kwargs):
            return fn(*args, **kwargs)
    else:
        new_fn = fn
    return ms_grad(new_fn, grad_position, new_weights, has_aux, return_ids)

 def value_and_grad(fn, grad_position=0, weights=None, has_aux=False, return_ids=False):
    new_weights = []
    if weights:
        for param in weights:
            if isinstance(param, Tensor):
                new_weights.append(param.tensor)
            else:
                new_weights.append(param)
    if isinstance(fn, Module):
        def new_fn(*args, **kwargs):
            return fn(*args, **kwargs)
    else:
        new_fn = fn
    return ms_value_and_grad(new_fn, grad_position, new_weights, has_aux, return_ids)
--- a/mindtorch/torch/common/_inner.py
+++ b/mindtorch/torch/common/_inner.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 from mindspore.common._stub_tensor import StubTensor
 from mindspore.ops.primitive import _primexpr
 from mindtorch.torch.tensor import cast_to_adapter_tensor, Tensor
 from mindtorch.torch.logging import info
@@ -124,6 +125,10 @@ def _inplace_limit_pynative(inplace, op_name):

 def _inplace_assign(input, inplace, output):
    if inplace is True:
        input.assign_value(output)
        if not isinstance(output, StubTensor):
            input.tensor = output
        else:
            input.tensor = output.tensor
            input.stub = output.stub
        return input
    return cast_to_adapter_tensor(output)
--- a/mindtorch/torch/nn/functional.py
+++ b/mindtorch/torch/nn/functional.py
@@ -5,6 +5,7 @@ from typing import Iterable
 # from functools import lru_cache
 import numpy as np
 import mindspore as ms
 from mindspore import ops
 from mindspore.ops.primitive import _primexpr
 from mindspore.ops._primitive_cache import _get_cache_prim
 from mindspore.ops.function.math_func import _expand, _check_same_type
--- a/mindtorch/torch/nn/modules/activation.py
+++ b/mindtorch/torch/nn/modules/activation.py
@@ -4,7 +4,7 @@ import numpy as np
 from mindspore.ops import operations as P
 from mindspore.common import dtype as mstype
 import mindspore as ms
 from mindspore import nn
 from mindspore import ops
 import mindspore._checkparam as validator

 from mindtorch.torch.functional import empty
@@ -88,12 +88,11 @@ class Hardtanh(Module):
        if self.max_val <= self.min_val:
            raise ValueError('`max_val` must be larger than `min_val` in `{}`, but get `max_val`:{} and '
                             '`min_val`:{}'.format(self.__class__.__name__, self.max_val, self.min_val))
        self.hardtanh = nn.Hardtanh(min_val, max_val)


    def forward(self, input):
        input_ms = cast_to_ms_tensor(input)
        output = self.hardtanh(input_ms)
        output = ops.hardtanh(input_ms, self.min_val, self.max_val)
        return _inplace_assign(input, self.inplace, output)

    def extra_repr(self):
@@ -482,9 +481,9 @@ class MultiheadAttention(Module):
        return super().__call__(*args, **kwargs)

    def __setstate__(self, state):
        # Support loading old MultiheadAttention checkpoints generated by v1.1.0
        if '_qkv_same_embed_dim' not in state[1]:
            state[1]['_qkv_same_embed_dim'] = True
        # # Support loading old MultiheadAttention checkpoints generated by v1.1.0
        # if '_qkv_same_embed_dim' not in state[1]:
        #     state[1]['_qkv_same_embed_dim'] = True

        super(MultiheadAttention, self).__setstate__(state)

@@ -543,7 +542,6 @@ class PReLU(Module):
    def __init__(self, num_parameters=1, init=0.25, device=None, dtype=None):
        super(PReLU, self).__init__()
        unsupported_attr(device)
        validator.check_positive_int(num_parameters, 'num_parameters', self.cls_name)
        dtype = _dtype_or_default(dtype)
        w = init
        if isinstance(w, (float, np.float32)):
--- a/mindtorch/torch/nn/modules/batchnorm.py
+++ b/mindtorch/torch/nn/modules/batchnorm.py
@@ -46,10 +46,8 @@ class _NormBase(Module):
        self.bias = Parameter(empty(num_features), requires_grad=affine)
        # 'running_mean' and 'running_var' have to be Parameter
        # because mindspore.ops.BatchNorm require them to be Parameter when 'is_training' is True
        self.running_mean = Parameter(empty(num_features), requires_grad=False)
        self.running_var = Parameter(empty(num_features), requires_grad=False)
        self.register_buffer('running_mean', self.running_mean)
        self.register_buffer('running_var', self.running_var)
        self.register_buffer('running_mean', Parameter(empty(num_features), requires_grad=False))
        self.register_buffer('running_var', Parameter(empty(num_features), requires_grad=False))
        self.reset_parameters()
        if not self.track_running_stats:
            self.momentum = 0.0
--- a/mindtorch/torch/nn/modules/container.py
+++ b/mindtorch/torch/nn/modules/container.py
@@ -1,168 +1,160 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 from abc import abstractmethod
 import operator
 from itertools import chain
 from typing import Dict
 import warnings
 from collections import OrderedDict, abc as container_abcs
 from mindspore.nn.layer.container import _get_prefix_and_index, _valid_index, _valid_cell
 from itertools import chain, islice
 import operator

 from mindtorch.torch.tensor import Tensor, cast_to_adapter_tensor
 from mindtorch.torch.nn.parameter import Parameter
 from mindtorch.torch._ref import typename
 from mindtorch import torch
 from .module import Module
 from ..parameter import Parameter

 from typing import Any, Dict, Iterable, Iterator, Mapping, Optional, overload, Tuple, TypeVar, Union
 from typing_extensions import Self

 __all__ = ['Container', 'Sequential', 'ModuleList', 'ModuleDict', 'ParameterList', 'ParameterDict']

 T = TypeVar('T', bound=Module)


 # Copied from torch.nn.modules.module, required for a custom __repr__ for ModuleList
 def _addindent(s_, numSpaces):
    s = s_.split('\n')
    # don't do anything for single-line stuff
    if len(s) == 1:
        return s_
    first = s.pop(0)
    s = [(numSpaces * ' ') + line for line in s]
    s = '\n'.join(s)
    s = first + '\n' + s
    return s


 class Container(Module):

    def __init__(self, **kwargs: Any) -> None:
        super().__init__()
        # DeprecationWarning is ignored by default <sigh>
        warnings.warn("nn.Container is deprecated. All of it's functionality "
                      "is now implemented in nn.Module. Subclass that instead.")
        for key, value in kwargs.items():
            self.add_module(key, value)


 class Sequential(Module):
    r"""A sequential container.

    Modules will be added to it in the order they are passed in the
    constructor. Alternatively, an ``OrderedDict`` of modules can be
    passed in. The ``forward()`` method of ``Sequential`` accepts any
    input and forwards it to the first module it contains. It then
    "chains" outputs to inputs sequentially for each subsequent module,
    finally returning the output of the last module.

    The value a ``Sequential`` provides over manually calling a sequence
    of modules is that it allows treating the whole container as a
    single module, such that performing a transformation on the
    ``Sequential`` applies to each of the modules it stores (which are
    each a registered submodule of the ``Sequential``).

    What's the difference between a ``Sequential`` and a
    :class:`torch.nn.ModuleList`? A ``ModuleList`` is exactly what it
    sounds like--a list for storing ``Module`` s! On the other hand,
    the layers in a ``Sequential`` are connected in a cascading way.

    Example::

        # Using Sequential to create a small model. When `model` is run,
        # input will first be passed to `Conv2d(1,20,5)`. The output of
        # `Conv2d(1,20,5)` will be used as the input to the first
        # `ReLU`; the output of the first `ReLU` will become the input
        # for `Conv2d(20,64,5)`. Finally, the output of
        # `Conv2d(20,64,5)` will be used as input to the second `ReLU`
        model = nn.Sequential(
                  nn.Conv2d(1,20,5),
                  nn.ReLU(),
                  nn.Conv2d(20,64,5),
                  nn.ReLU()
                )

        # Using Sequential with OrderedDict. This is functionally the
        # same as the above code
        model = nn.Sequential(OrderedDict([
                  ('conv1', nn.Conv2d(1,20,5)),
                  ('relu1', nn.ReLU()),
                  ('conv2', nn.Conv2d(20,64,5)),
                  ('relu2', nn.ReLU())
                ]))
    """
    Sequential Module container. For more details about Module, please refer to

    A list of Cells will be added to it in the order they are passed in the constructor.
    Alternatively, an ordered dict of cells can also be passed in.
    _modules: Dict[str, Module]  # type: ignore[assignment]

    Note:
        Sequential and nn.ModuleList are different, ModuleList is a list for storing modules. However,
        the layers in a Sequential are connected in a cascading way.
    @overload
    def __init__(self, *args: Module) -> None:
        ...

    @overload
    def __init__(self, arg: 'OrderedDict[str, Module]') -> None:
        ...

    Args:
        args (list, OrderedDict): List or OrderedDict of subclass of Module.

    Inputs:
        - **x** (Tensor) - Tensor with shape according to the first Module in the sequence.

    Outputs:
        Tensor, the output Tensor with shape depending on the input `x` and defined sequence of Cells.

    Raises:
        TypeError: If the type of the `args` is not list or OrderedDict.

    Supported Platforms:
        ``Ascend`` ``GPU`` ``CPU``

    Examples:
        >>> conv = nn.Conv2d(3, 2, 3, pad_mode='valid', weight_init="ones")
        >>> relu = nn.ReLU()
        >>> seq = nn.Sequential([conv, relu])
        >>> x = Tensor(np.ones([1, 3, 4, 4]), dtype=mindspore.float32)
        >>> output = seq(x)
        >>> print(output)
        [[[[27. 27.]
           [27. 27.]]
          [[27. 27.]
           [27. 27.]]]]
        >>> from collections import OrderedDict
        >>> d = OrderedDict()
        >>> d["conv"] = conv
        >>> d["relu"] = relu
        >>> seq = nn.Sequential(d)
        >>> x = Tensor(np.ones([1, 3, 4, 4]), dtype=mindspore.float32)
        >>> output = seq(x)
        >>> print(output)
        [[[[27. 27.]
           [27. 27.]]
          [[27. 27.]
           [27. 27.]]]]
    """
    def __init__(self, *args):
        """Initialize Sequential."""
        super(Sequential, self).__init__()
        self._is_dynamic_name = []
        if len(args) == 1:
            cells = args[0]
            if isinstance(cells, list):
                for index, cell in enumerate(cells):
                    self.insert_child_to_cell(str(index), cell)
                    cell.update_parameters_name(str(index) + ".")
                    self._is_dynamic_name.append(True)
            elif isinstance(cells, OrderedDict):
                for name, cell in cells.items():
                    self.insert_child_to_cell(name, cell)
                    cell.update_parameters_name(name + ".")
                    self._is_dynamic_name.append(False)
            elif isinstance(cells, Module):
                for index, cell in enumerate(args):
                    self.insert_child_to_cell(str(index), cell)
                    cell.update_parameters_name(str(index) + ".")
                    self._is_dynamic_name.append(True)
            else:
                raise TypeError(f"For '{self.__class__.__name__}', the 'args[0]' must be list or orderedDict, "
                                f"but got {type(cells).__name__}")
        super().__init__()
        if len(args) == 1 and isinstance(args[0], OrderedDict):
            for key, module in args[0].items():
                self.add_module(key, module)
        else:
            for index, cell in enumerate(args):
                self.insert_child_to_cell(str(index), cell)
                cell.update_parameters_name(str(index) + ".")
                self._is_dynamic_name.append(True)
        self.cell_list = list(self._cells.values())

    def __getitem__(self, index):
        if isinstance(index, slice):
            return self.__class__(
                OrderedDict(list(self._cells.items())[index]))
        if isinstance(index, Tensor):
            index = int(index)
        index = _valid_index(len(self), index, self.__class__.__name__)
        return list(self._cells.values())[index]

    def __setitem__(self, index, module):
        if isinstance(index, Tensor):
            index = int(index)
        cls_name = self.__class__.__name__
        if _valid_cell(module, cls_name):
            prefix, _ = _get_prefix_and_index(self._cells)
            index = _valid_index(len(self), index, cls_name)
            key = list(self._cells.keys())[index]
            self._cells[key] = module
            module.update_parameters_name(prefix + key + ".")
            self.cell_list = list(self._cells.values())

    def __delitem__(self, index):
        cls_name = self.__class__.__name__
        if isinstance(index, int):
            index = _valid_index(len(self), index, cls_name)
            key = list(self._cells.keys())[index]
            del self._cells[key]
            del self._is_dynamic_name[index]
        elif isinstance(index, slice):
            keys = list(self._cells.keys())[index]
            for key in keys:
                del self._cells[key]
            del self._is_dynamic_name[index]
            for idx, module in enumerate(args):
                self.add_module(str(idx), module)

    def _get_item_by_idx(self, iterator, idx) -> T:  # type: ignore[misc, type-var]
        """Get the idx-th item of the iterator."""
        size = len(self)
        idx = operator.index(idx)
        if not -size <= idx < size:
            raise IndexError(f'index {idx} is out of range')
        idx %= size
        return next(islice(iterator, idx, None))

    def __getitem__(self, idx: Union[slice, int]) -> Union['Sequential', T]:
        if isinstance(idx, slice):
            return self.__class__(OrderedDict(list(self._modules.items())[idx]))
        else:
            raise TypeError(f"For '{cls_name}', the type of index must be int type or slice type, "
                            f"but got {type(index).__name__}")
        prefix, key_index = _get_prefix_and_index(self._cells)
        temp_dict = OrderedDict()
        for idx, key in enumerate(self._cells.keys()):
            cell = self._cells[key]
            if self._is_dynamic_name[idx]:
                for _, param in cell.parameters_and_names():
                    param.name = prefix + str(idx) + "." + ".".join(param.name.split(".")[key_index+1:])
                temp_dict[str(idx)] = cell
            else:
                temp_dict[key] = cell
        self._cells = temp_dict
        self.cell_list = list(self._cells.values())
            return self._get_item_by_idx(self._modules.values(), idx)

    def __setitem__(self, idx: int, module: Module) -> None:
        key: str = self._get_item_by_idx(self._modules.keys(), idx)
        return setattr(self, key, module)

    def __len__(self):
        return len(self._cells)
    def __delitem__(self, idx: Union[slice, int]) -> None:
        if isinstance(idx, slice):
            for key in list(self._modules.keys())[idx]:
                delattr(self, key)
        else:
            key = self._get_item_by_idx(self._modules.keys(), idx)
            delattr(self, key)
        # To preserve numbering
        str_indices = [str(i) for i in range(len(self._modules))]
        self._modules = OrderedDict(list(zip(str_indices, self._modules.values())))

    def __bool__(self):
        return len(self._cells) != 0
    def __len__(self) -> int:
        return len(self._modules)

    def __add__(self, other):
    def __add__(self, other) -> 'Sequential':
        if isinstance(other, Sequential):
            ret = Sequential()
            for layer in self:
                self.append(ret, layer)
                ret.append(layer)
            for layer in other:
                self.append(ret, layer)
                ret.append(layer)
            return ret
        else:
            raise ValueError('add operator supports only objects '
                             'of Sequential class, but {} is given.'.format(
                             str(type(other))))
                             f'of Sequential class, but {str(type(other))} is given.')

    def pop(self, key: Union[int, slice]) -> Module:
        v = self[key]
        del self[key]
        return v

    def __iadd__(self, other):
    def __iadd__(self, other) -> Self:
        if isinstance(other, Sequential):
            offset = len(self)
            for i, module in enumerate(other):
@@ -170,13 +162,12 @@ class Sequential(Module):
            return self
        else:
            raise ValueError('add operator supports only objects '
                             'of Sequential class, but {} is given.'.format(
                             str(type(other))))
                             f'of Sequential class, but {str(type(other))} is given.')

    def __mul__(self, other):
    def __mul__(self, other: int) -> 'Sequential':
        if not isinstance(other, int):
            raise TypeError(f"unsupported operand type(s) for *: {type(self)} and {type(other)}")
        elif other <= 0:
        elif (other <= 0):
            raise ValueError(f"Non-positive multiplication factor {other} for {type(self)}")
        else:
            combined = Sequential()
@@ -187,164 +178,85 @@ class Sequential(Module):
                    offset += 1
            return combined

    def __rmul__(self, other):
    def __rmul__(self, other: int) -> 'Sequential':
        return self.__mul__(other)

    def __imul__(self, other):
    def __imul__(self, other: int) -> Self:
        if not isinstance(other, int):
            raise TypeError(f"unsupported operand type(s) for *: {type(self)} and {type(other)}")
        elif other <= 0:
        elif (other <= 0):
            raise ValueError(f"Non-positive multiplication factor {other} for {type(self)}")
        else:
            len_original = len(self)
            offset = len(self)
            for _ in range(other - 1):
                for i in range(len_original):
                    self.add_module(str(i + offset), self._cells[str(i)])
                    self.add_module(str(i + offset), self._modules[str(i)])
                offset += len_original
            return self

    def __dir__(self):
        keys = Module.__dir__(self)
        keys = super().__dir__()
        keys = [key for key in keys if not key.isdigit()]
        return keys

    def __iter__(self):
        return iter(self._cells.values())

    @property
    def _modules(self):
        return self._cells
    def __iter__(self) -> Iterator[Module]:
        return iter(self._modules.values())

    def set_grad(self, flag=True):
        self.requires_grad = flag
        for cell in self._cells.values():
            cell.set_grad(flag)
    # NB: We can't really type check this function as the type of input
    # may change dynamically (as is tested in
    # TestScript.test_sequential_intermediary_types).  Cannot annotate
    # with Any as TorchScript expects a more precise type
    def forward(self, input):
        for module in self:
            input = module(input)
        return input

    def append(self, module):
        """
        Appends a given Module to the end of the list.
    def append(self, module: Module) -> 'Sequential':
        r"""Append a given module to the end.

        Args:
            module(Module): The Module to be appended.

        Examples:
            >>> conv = nn.Conv2d(3, 2, 3, pad_mode='valid', weight_init="ones")
            >>> bn = nn.BatchNorm2d(2)
            >>> relu = nn.ReLU()
            >>> seq = nn.Sequential([conv, bn])
            >>> seq.append(relu)
            >>> x = Tensor(np.ones([1, 3, 4, 4]), dtype=mindspore.float32)
            >>> output = seq(x)
            >>> print(output)
            [[[[26.999863 26.999863]
               [26.999863 26.999863]]
              [[26.999863 26.999863]
               [26.999863 26.999863]]]]
            module (nn.Module): module to append
        """
        if _valid_cell(module, self.__class__.__name__):
            prefix, _ = _get_prefix_and_index(self._cells)
            module.update_parameters_name(prefix + str(len(self)) + ".")
            self._is_dynamic_name.append(True)
            self._cells[str(len(self))] = module
        self.cell_list = list(self._cells.values())
        self.add_module(str(len(self)), module)
        return self

    def add_module(self, name, module):
        if not isinstance(module, Module) and module is not None:
            raise TypeError("{} is not a Module subclass".format(
                module.__name__))
        elif hasattr(self, name) and name not in self._cells:
            raise KeyError("attribute '{}' already exists".format(name))
        elif '.' in name:
            raise KeyError("module name can't contain \".\", got: {}".format(name))
        elif name == '':
            raise KeyError("module name can't be empty string \"\"")

        if _valid_cell(module, self.__class__.__name__):
            module.update_parameters_name(name + ".")
            self._is_dynamic_name.append(False)

        self._cells[name] = module
        self.cell_list = list(self._cells.values())

    def forward(self, input):
        for cell in self.cell_list:
            input = cell(input)
        return cast_to_adapter_tensor(input)

    def pop(self, key):
        v = self[key]
        del self[key]
        return v
    def insert(self, index: int, module: Module) -> 'Sequential':
        if not isinstance(module, Module):
            raise AssertionError(
                f'module should be of type: {Module}')
        n = len(self._modules)
        if not (-n <= index <= n):
            raise IndexError(
                f'Index out of range: {index}')
        if index < 0:
            index += n
        for i in range(n, index, -1):
            self._modules[str(i)] = self._modules[str(i - 1)]
        self._modules[str(index)] = module
        return self

    def extend(self, sequential):
    def extend(self, sequential) -> 'Sequential':
        for layer in sequential:
            self.append(layer)
        return self

    def insert(self, index, module):
        """
        Inserts a given Cell before a given index in the list.

        Args:
            index(int): The Insert index in the CellList.
            cell(Cell): The Cell to be inserted.
        """
        cls_name = self.__class__.__name__
        idx = _valid_index(len(self), index, cls_name)
        _valid_cell(module, cls_name)
        length = len(self)
        prefix, key_index = _get_prefix_and_index(self._cells)
        while length > idx:
            if self._auto_prefix:
                tmp_cell = self._cells[str(length-1)]
                for _, param in tmp_cell.parameters_and_names():
                    param.name = f'{prefix}{str(length)}{"."}{".".join(param.name.split(".")[key_index+1:])}'
            self._cells[str(length)] = self._cells[str(length - 1)]
            length -= 1
        self._cells[str(idx)] = module
        if self._auto_prefix:
            module.update_parameters_name(prefix + str(idx) + ".")
        self.cell_list = list(self._cells.values())
        self._is_dynamic_name.insert(index, True)

 #_ModuleListBase is similar to ms.nn._CellListBase
 class _ModuleListBase:
    """
    An interface for base the Module as list.

    The sequential Module may be iterated using the construct method using for-in statement.
    But there are some scenarios that the construct method built-in does not fit.
    For convenience, we provide an interface that indicates the sequential
    Module may be interpreted as list of Cells, so it can be accessed using
    iterator or subscript when a sequential Module instantiate is accessed
    by iterator or subscript, it will be interpreted as a list of Cells.
    """
    def __init__(self):
        """Initialize _ModuleListBase."""
        self.__cell_as_list__ = True  #for ms jit parse

    @abstractmethod
    def __len__(self):
        pass

    @abstractmethod
    def __getitem__(self, index):
        pass
 class ModuleList(Module):
    r"""Holds submodules in a list.

 class ModuleList(_ModuleListBase, Module):
    """
    Holds Cells in a list.
    ModuleList can be used like a regular Python list, the Cells it contains have been initialized.
    :class:`~torch.nn.ModuleList` can be indexed like a regular Python list, but
    modules it contains are properly registered, and will be visible by all
    :class:`~torch.nn.Module` methods.

    Args:
    modules (iterable, optional): an iterable of modules to add
        modules (iterable, optional): an iterable of modules to add

    Example::

    Examples:
        class MyModule(nn.Module):
            def __init__(self):
                super(MyModule, self).__init__()
                super().__init__()
                self.linears = nn.ModuleList([nn.Linear(10, 10) for i in range(10)])

            def forward(self, x):
@@ -353,172 +265,154 @@ class ModuleList(_ModuleListBase, Module):
                    x = self.linears[i // 2](x) + l(x)
                return x
    """
    def __init__(self, modules=None):
        """Initialize ModuleList."""
        _ModuleListBase.__init__(self)
        Module.__init__(self)

    _modules: Dict[str, Module]  # type: ignore[assignment]

    def __init__(self, modules: Optional[Iterable[Module]] = None) -> None:
        super().__init__()
        if modules is not None:
            self.extend(modules)
            self += modules

    def __getitem__(self, idx):
        if isinstance(idx, Tensor):
            idx = int(idx)
        cls_name = self.__class__.__name__
    def _get_abs_string_index(self, idx):
        """Get the absolute index for the list of modules."""
        idx = operator.index(idx)
        if not (-len(self) <= idx < len(self)):
            raise IndexError(f'index {idx} is out of range')
        if idx < 0:
            idx += len(self)
        return str(idx)

    def __getitem__(self, idx: Union[int, slice]) -> Union[Module, 'ModuleList']:
        if isinstance(idx, slice):
            return self.__class__(list(self._modules.values())[idx])
        else:
            return self._modules[self._get_abs_string_index(idx)]

    def __setitem__(self, idx: int, module: Module) -> None:
        idx = self._get_abs_string_index(idx)
        return setattr(self, str(idx), module)

    def __delitem__(self, idx: Union[int, slice]) -> None:
        if isinstance(idx, slice):
            return self.__class__(list(self._cells.values())[idx])
        if isinstance(idx, int):
            idx = _valid_index(len(self), idx, cls_name)
            return self._cells[str(idx)]
        raise TypeError(f"For '{cls_name}', the type of 'idx' must be int or slice, "
                        f"but got {type(idx).__name__}.")

    def __setitem__(self, idx, module):
        if isinstance(idx, Tensor):
            idx = int(idx)
        cls_name = self.__class__.__name__
        if not isinstance(idx, int) and _valid_cell(module, cls_name):
            raise TypeError(f"For '{cls_name}', the type of 'idx' must be int, "
                            f"but got {type(idx).__name__}.")
        idx = _valid_index(len(self), idx, cls_name)
        if self._auto_prefix:
            prefix, _ = _get_prefix_and_index(self._cells)
            module.update_parameters_name(prefix + str(idx) + ".")
        self._cells[str(idx)] = module

    def __delitem__(self, idx):
        if isinstance(idx, Tensor):
            idx = int(idx)
        cls_name = self.__class__.__name__
        if isinstance(idx, int):
            idx = _valid_index(len(self), idx, cls_name)
            del self._cells[str(idx)]
        elif isinstance(idx, slice):
            keys = list(self._cells.keys())[idx]
            for key in keys:
                del self._cells[key]
            for k in range(len(self._modules))[idx]:
                delattr(self, str(k))
        else:
            raise TypeError(f"For '{cls_name}', the type of 'index' must be int or slice, "
                            f"but got {type(idx).__name__}.")
        # adjust orderedDict
        prefix, key_index = _get_prefix_and_index(self._cells)
        temp_dict = OrderedDict()
        for id, cell in enumerate(self._cells.values()):
            if self._auto_prefix:
                for _, param in cell.parameters_and_names():
                    param.name = prefix + str(id) + "." + ".".join(param.name.split(".")[key_index+1:])
            temp_dict[str(id)] = cell
        self._cells = temp_dict

    def __len__(self):
        return len(self._cells)

    def __iter__(self):
        return iter(self._cells.values())

    def __iadd__(self, modules):
            delattr(self, self._get_abs_string_index(idx))
        # To preserve numbering, self._modules is being reconstructed with modules after deletion
        str_indices = [str(i) for i in range(len(self._modules))]
        self._modules = OrderedDict(list(zip(str_indices, self._modules.values())))

    def __len__(self) -> int:
        return len(self._modules)

    def __iter__(self) -> Iterator[Module]:
        return iter(self._modules.values())

    def __iadd__(self, modules: Iterable[Module]) -> Self:
        return self.extend(modules)

    def __add__(self, other):
    def __add__(self, other: Iterable[Module]) -> 'ModuleList':
        combined = ModuleList()
        for _, module in enumerate(chain(self, other)):
            combined.append(module)
        for i, module in enumerate(chain(self, other)):
            combined.add_module(str(i), module)
        return combined

    def __repr__(self):
        """Return a custom repr for ModuleList that compresses repeated module representations."""
        list_of_reprs = [repr(item) for item in self]
        if len(list_of_reprs) == 0:
            return self._get_name() + '()'

        start_end_indices = [[0, 0]]
        repeated_blocks = [list_of_reprs[0]]
        for i, r in enumerate(list_of_reprs[1:], 1):
            if r == repeated_blocks[-1]:
                start_end_indices[-1][1] += 1
                continue

            start_end_indices.append([i, i])
            repeated_blocks.append(r)

        lines = []
        main_str = self._get_name() + '('
        for (start_id, end_id), b in zip(start_end_indices, repeated_blocks):
            local_repr = f"({start_id}): {b}"  # default repr

            if start_id != end_id:
                n = end_id - start_id + 1
                local_repr = f"({start_id}-{end_id}): {n} x {b}"

            local_repr = _addindent(local_repr, 2)
            lines.append(local_repr)

        main_str += '\n  ' + '\n  '.join(lines) + '\n'
        main_str += ')'
        return main_str

    def __dir__(self):
        keys = super(ModuleList, self).__dir__()
        keys = super().__dir__()
        keys = [key for key in keys if not key.isdigit()]
        return keys

    def pop(self, key):
        v = self[key]
        del self[key]
        return v
    def insert(self, index: int, module: Module) -> None:
        r"""Insert a given module before a given index in the list.

    def insert(self, index, module):
        Args:
            index (int): index to insert.
            module (nn.Module): module to insert
        """
        Inserts a given Module before a given index in the list.
        for i in range(len(self._modules), index, -1):
            self._modules[str(i)] = self._modules[str(i - 1)]
        self._modules[str(index)] = module

    def append(self, module: Module) -> 'ModuleList':
        r"""Append a given module to the end of the list.

        Args:
            index(int): The Insert index in the ModuleList.
            module(Module): The Module to be inserted.
            module (nn.Module): module to append
        """
        cls_name = self.__class__.__name__
        #TODO: after _valid_index fixed, below code can be remove
        if len(self) == 0 and index == 0:
            idx = index
        else:
            idx = _valid_index(len(self), index, cls_name)
        _valid_cell(module, cls_name)
        length = len(self)
        prefix, key_index = _get_prefix_and_index(self._cells)
        while length > idx:
            if self._auto_prefix:
                tmp_cell = self._cells[str(length-1)]
                for _, param in tmp_cell.parameters_and_names():
                    param.name = prefix + str(length) + "." + ".".join(param.name.split(".")[key_index+1:])
            self._cells[str(length)] = self._cells[str(length - 1)]
            length -= 1
        self._cells[str(idx)] = module
        if self._auto_prefix:
            module.update_parameters_name(prefix + str(idx) + ".")

    def extend(self, modules):
        """
        Appends Cells from a Python iterable to the end of the list.
        self.add_module(str(len(self)), module)
        return self

        Args:
            cells(list): The Cells to be extended.
    def pop(self, key: Union[int, slice]) -> Module:
        v = self[key]
        del self[key]
        return v

    def extend(self, modules: Iterable[Module]) -> Self:
        r"""Append modules from a Python iterable to the end of the list.

        Raises:
            TypeError: If the argument cells are not a list of Cells.
        Args:
            modules (iterable): iterable of modules to append
        """
        cls_name = self.__class__.__name__
        if not isinstance(modules, container_abcs.Iterable):
            raise TypeError("ModuleList.extend should be called with an "
                            "iterable, but got " + type(modules).__name__)
        prefix, _ = _get_prefix_and_index(self._cells)
        for module in modules:
            if _valid_cell(module, cls_name):
                if self._auto_prefix:
                    module.update_parameters_name(prefix + str(len(self)) + ".")
                self._cells[str(len(self))] = module
        offset = len(self)
        for i, module in enumerate(modules):
            self.add_module(str(offset + i), module)
        return self

    def append(self, module):
        """
        Appends a given Module to the end of the list.

        Args:
            module(Module): The subcell to be appended.
        """
        if _valid_cell(module, self.__class__.__name__):
            if self._auto_prefix:
                prefix, _ = _get_prefix_and_index(self._cells)
                module.update_parameters_name(prefix + str(len(self)) + ".")
            self._cells[str(len(self))] = module
    # remove forward alltogether to fallback on Module's _forward_unimplemented

    def set_grad(self, flag=True):
        self.requires_grad = flag
        for cell in self._cells.values():
            cell.set_grad(flag)

 class ModuleDict(Module):
    r"""Holds submodules in a dictionary.

    :class:`nn.ModuleDict` can be indexed like a regular Python dictionary,
    :class:`~torch.nn.ModuleDict` can be indexed like a regular Python dictionary,
    but modules it contains are properly registered, and will be visible by all
    :class:`nn.Module` methods.
    :class:`~torch.nn.Module` methods.

    :class:`nn.ModuleDict` is an **ordered** dictionary that respects
    :class:`~torch.nn.ModuleDict` is an **ordered** dictionary that respects

    * the order of insertion, and

    * in :meth:`nn.ModuleDict.update`, the order of the merged
    * in :meth:`~torch.nn.ModuleDict.update`, the order of the merged
      ``OrderedDict``, ``dict`` (started from Python 3.6) or another
      :class:`nn.ModuleDict` (the argument to
      :meth:`nn.ModuleDict.update`).
      :class:`~torch.nn.ModuleDict` (the argument to
      :meth:`~torch.nn.ModuleDict.update`).

    Note that :meth:`nn.ModuleDict.update` with other unordered mapping
    Note that :meth:`~torch.nn.ModuleDict.update` with other unordered mapping
    types (e.g., Python's plain ``dict`` before Python version 3.6) does not
    preserve the order of the merged mapping.

@@ -530,7 +424,7 @@ class ModuleDict(Module):

        class MyModule(nn.Module):
            def __init__(self):
                super(MyModule, self).__init__()
                super().__init__()
                self.choices = nn.ModuleDict({
                        'conv': nn.Conv2d(10, 10, 3),
                        'pool': nn.MaxPool2d(3)
@@ -546,42 +440,36 @@ class ModuleDict(Module):
                return x
    """

    def __init__(self, modules=None):
        super(ModuleDict, self).__init__()
        self.__cell_as_dict__ = True
    _modules: Dict[str, Module]  # type: ignore[assignment]

    def __init__(self, modules: Optional[Mapping[str, Module]] = None) -> None:
        super().__init__()
        if modules is not None:
            self.update(modules)

    def __getitem__(self, key):
        return self._cells[key]
    def __getitem__(self, key: str) -> Module:
        return self._modules[key]

    def __setitem__(self, key, module):
        self._update_cell_para_name(key, module)
    def __setitem__(self, key: str, module: Module) -> None:
        self.add_module(key, module)

    def __delitem__(self, key):
        del self._cells[key]
    def __delitem__(self, key: str) -> None:
        del self._modules[key]

    def __len__(self):
        return len(self._cells)
    def __len__(self) -> int:
        return len(self._modules)

    def __iter__(self):
        return iter(self._cells)
    def __iter__(self) -> Iterator[str]:
        return iter(self._modules)

    def __contains__(self, key):
        return key in self._cells
    def __contains__(self, key: str) -> bool:
        return key in self._modules

    def _update_cell_para_name(self, key, cell):
        if self._auto_prefix:
            prefix, _ = _get_prefix_and_index(self._cells)
            cell.update_parameters_name(prefix + key + ".")
    def clear(self) -> None:
        """Remove all items from the ModuleDict."""
        self._modules.clear()

    def clear(self):
        """Remove all items from the ModuleDict.
        """
        self._cells.clear()

    def pop(self, key):
    def pop(self, key: str) -> Module:
        r"""Remove key from the ModuleDict and return its module.

        Args:
@@ -591,32 +479,28 @@ class ModuleDict(Module):
        del self[key]
        return v

    def keys(self):
        r"""Return an iterable of the ModuleDict keys.
        """
        return self._cells.keys()
    def keys(self) -> Iterable[str]:
        r"""Return an iterable of the ModuleDict keys."""
        return self._modules.keys()

    def items(self):
        r"""Return an iterable of the ModuleDict key/value pairs.
        """
        return self._cells.items()
    def items(self) -> Iterable[Tuple[str, Module]]:
        r"""Return an iterable of the ModuleDict key/value pairs."""
        return self._modules.items()

    def values(self):
        r"""Return an iterable of the ModuleDict values.
        """
        return self._cells.values()
    def values(self) -> Iterable[Module]:
        r"""Return an iterable of the ModuleDict values."""
        return self._modules.values()

    def update(self, modules):
        r"""Update the :class:`nn.ModuleDict` with the key-value pairs from a
        mapping or an iterable, overwriting existing keys.
    def update(self, modules: Mapping[str, Module]) -> None:
        r"""Update the :class:`~torch.nn.ModuleDict` with key-value pairs from a mapping, overwriting existing keys.

        .. note::
            If :attr:`modules` is an ``OrderedDict``, a :class:`nn.ModuleDict`, or
            If :attr:`modules` is an ``OrderedDict``, a :class:`~torch.nn.ModuleDict`, or
            an iterable of key-value pairs, the order of new elements in it is preserved.

        Args:
            modules (iterable): a mapping (dictionary) from string to :class:`nn.Module`,
                or an iterable of key-value pairs of type (string, :class:`nn.Module`)
            modules (iterable): a mapping (dictionary) from string to :class:`~torch.nn.Module`,
                or an iterable of key-value pairs of type (string, :class:`~torch.nn.Module`)
        """
        if not isinstance(modules, container_abcs.Iterable):
            raise TypeError("ModuleDict.update should be called with an "
@@ -645,15 +529,15 @@ class ModuleDict(Module):


 class ParameterList(Module):
    """Holds parameters in a list.
    r"""Holds parameters in a list.

    :class:`nn.ParameterList` can be used like a regular Python
    list, but Tensors that are :class:`nn.Parameter` are properly registered,
    and will be visible by all :class:`nn.Module` methods.
    :class:`~torch.nn.ParameterList` can be used like a regular Python
    list, but Tensors that are :class:`~torch.nn.Parameter` are properly registered,
    and will be visible by all :class:`~torch.nn.Module` methods.

    Note that the constructor, assigning an element of the list, the
    :meth:`nn.ParameterDict.append` method and the :meth:`nn.ParameterDict.extend`
    method will convert any :class:`Tensor` into :class:`nn.Parameter`.
    :meth:`~torch.nn.ParameterDict.append` method and the :meth:`~torch.nn.ParameterDict.extend`
    method will convert any :class:`~torch.Tensor` into :class:`~torch.nn.Parameter`.

    Args:
        parameters (iterable, optional): an iterable of elements to add to the list.
@@ -662,8 +546,8 @@ class ParameterList(Module):

        class MyModule(nn.Module):
            def __init__(self):
                super(MyModule, self).__init__()
                self.params = nn.ParameterList([nn.Parameter(ms_torch.randn(10, 10)) for i in range(10)])
                super().__init__()
                self.params = nn.ParameterList([nn.Parameter(torch.randn(10, 10)) for i in range(10)])

            def forward(self, x):
                # ParameterList can act as an iterable, or be indexed using ints
@@ -672,21 +556,29 @@ class ParameterList(Module):
                return x
    """

    def __init__(self, values=None):
        super(ParameterList, self).__init__()
    def __init__(self, values: Optional[Iterable[Any]] = None) -> None:
        super().__init__()
        self._size = 0
        if values is not None:
            self += values

    def _get_abs_string_index(self, idx):
        """Get the absolute index for the list of modules"""
        """Get the absolute index for the list of modules."""
        idx = operator.index(idx)
        if not -len(self) <= idx < len(self):
            raise IndexError('index {} is out of range'.format(idx))
        if not (-len(self) <= idx < len(self)):
            raise IndexError(f'index {idx} is out of range')
        if idx < 0:
            idx += len(self)
        return str(idx)

    @overload
    def __getitem__(self, idx: int) -> Any:
        ...

    @overload
    def __getitem__(self: T, idx: slice) -> T:
        ...

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            start, stop, step = idx.indices(len(self))
@@ -698,33 +590,33 @@ class ParameterList(Module):
            idx = self._get_abs_string_index(idx)
            return getattr(self, str(idx))

    def __setitem__(self, idx, param):
    def __setitem__(self, idx: int, param: Any) -> None:
        # Note that all other function that add an entry to the list part of
        # the ParameterList end up here. So this is the only place where we need
        # to wrap things into Parameter if needed.
        # Objects added via setattr() are not in the list part and thus won't
        # call into this function.
        idx = self._get_abs_string_index(idx)
        if isinstance(param, Tensor) and not isinstance(param, Parameter):
        if isinstance(param, torch.Tensor) and not isinstance(param, Parameter):
            param = Parameter(param)
        return setattr(self, str(idx), param)

    def __len__(self):
    def __len__(self) -> int:
        return self._size

    def __iter__(self):
    def __iter__(self) -> Iterator[Any]:
        return iter(self[i] for i in range(len(self)))

    def __iadd__(self, parameters):
    def __iadd__(self, parameters: Iterable[Any]) -> Self:
        return self.extend(parameters)

    def __dir__(self):
        keys = super(ParameterList, self).__dir__()
        keys = super().__dir__()
        keys = [key for key in keys if not key.isdigit()]
        return keys

    def append(self, value):
        """Appends a given value at the end of the list.
    def append(self, value: Any) -> 'ParameterList':
        """Append a given value at the end of the list.

        Args:
            value (Any): value to append
@@ -734,26 +626,26 @@ class ParameterList(Module):
        self[new_idx] = value
        return self

    def extend(self, values):
        """Appends values from a Python iterable to the end of the list.
    def extend(self, values: Iterable[Any]) -> Self:
        """Append values from a Python iterable to the end of the list.

        Args:
            values (iterable): iterable of values to append
        """
        # Tensor is an iterable but we never want to unpack it here
        if not isinstance(values, container_abcs.Iterable) or isinstance(values, Tensor):
        if not isinstance(values, container_abcs.Iterable) or isinstance(values, torch.Tensor):
            raise TypeError("ParameterList.extend should be called with an "
                            "iterable, but got " + type(values).__name__)
        for value in values:
            self.append(value)
        return self

    def extra_repr(self):
    def extra_repr(self) -> str:
        child_lines = []
        for k, p in enumerate(self):
            if isinstance(p, Tensor):
            if isinstance(p, torch.Tensor):
                size_str = 'x'.join(str(size) for size in p.size())
                device_str = '' if not p.is_cuda else ' (GPU {})'.format(p.get_device())
                device_str = ''
                parastr = '{} containing: [{} of size {}{}]'.format(
                    "Parameter" if isinstance(p, Parameter) else "Tensor",
                    p.dtype, size_str, device_str)
@@ -767,31 +659,23 @@ class ParameterList(Module):
    def __call__(self, *args, **kwargs):
        raise RuntimeError('ParameterList should not be called.')

    # adpater api, to convert ParameterList to list[Parameter]
    def to_list(self):
        list_params = []
        for i, p in enumerate(self):
            p.name = str(i) + "." + p.name
            list_params.append(p)
        return list_params


 class ParameterDict(Module):
    """Holds parameters in a dictionary.
    r"""Holds parameters in a dictionary.

    ParameterDict can be indexed like a regular Python dictionary, but Parameters it
    contains are properly registered, and will be visible by all Module methods.
    Other objects are treated as would be done by a regular Python dictionary

    :class:`nn.ParameterDict` is an **ordered** dictionary.
    :meth:`nn.ParameterDict.update` with other unordered mapping
    :class:`~torch.nn.ParameterDict` is an **ordered** dictionary.
    :meth:`~torch.nn.ParameterDict.update` with other unordered mapping
    types (e.g., Python's plain ``dict``) does not preserve the order of the
    merged mapping. On the other hand, ``OrderedDict`` or another :class:`nn.ParameterDict`
    merged mapping. On the other hand, ``OrderedDict`` or another :class:`~torch.nn.ParameterDict`
    will preserve their ordering.

    Note that the constructor, assigning an element of the dictionary and the
    :meth:`nn.ParameterDict.update` method will convert any :class:`Tensor` into
    :class:`nn.Parameter`.
    :meth:`~torch.nn.ParameterDict.update` method will convert any :class:`~torch.Tensor` into
    :class:`~torch.nn.Parameter`.

    Args:
        values (iterable, optional): a mapping (dictionary) of
@@ -802,10 +686,10 @@ class ParameterDict(Module):

        class MyModule(nn.Module):
            def __init__(self):
                super(MyModule, self).__init__()
                super().__init__()
                self.params = nn.ParameterDict({
                        'left': nn.Parameter(ms_torch.randn(5, 10)),
                        'right': nn.Parameter(ms_torch.randn(5, 10))
                        'left': nn.Parameter(torch.randn(5, 10)),
                        'right': nn.Parameter(torch.randn(5, 10))
                })

            def forward(self, x, choice):
@@ -813,13 +697,13 @@ class ParameterDict(Module):
                return x
    """

    def __init__(self, parameters = None):
        super(ParameterDict, self).__init__()
    def __init__(self, parameters: Any = None) -> None:
        super().__init__()
        self._keys: Dict[str, None] = {}
        if parameters is not None:
            self.update(parameters)

    def _key_to_attr(self, key):
    def _key_to_attr(self, key: str) -> str:
        if not isinstance(key, str):
            raise TypeError("Index given to ParameterDict cannot be used as a key as it is "
                            f"not a string (type is '{type(key).__name__}'). Open an issue on "
@@ -828,11 +712,11 @@ class ParameterDict(Module):
            # Use the key as-is so that `.named_parameters()` returns the right thing
            return key

    def __getitem__(self, key):
    def __getitem__(self, key: str) -> Any:
        attr = self._key_to_attr(key)
        return getattr(self, attr)

    def __setitem__(self, key, value):
    def __setitem__(self, key: str, value: Any) -> None:
        # Note that all other function that add an entry to the dictionary part of
        # the ParameterDict end up here. So this is the only place where we need
        # to wrap things into Parameter if needed.
@@ -840,36 +724,37 @@ class ParameterDict(Module):
        # call into this function.
        self._keys[key] = None
        attr = self._key_to_attr(key)
        if isinstance(value, Tensor) and not isinstance(value, Parameter):
        if isinstance(value, torch.Tensor) and not isinstance(value, Parameter):
            value = Parameter(value)
        setattr(self, attr, value)

    def __delitem__(self, key):
    def __delitem__(self, key: str) -> None:
        del self._keys[key]
        attr = self._key_to_attr(key)
        delattr(self, attr)

    def __len__(self):
    def __len__(self) -> int:
        return len(self._keys)

    def __iter__(self):
    def __iter__(self) -> Iterator[str]:
        return iter(self._keys)

    def __reversed__(self):
    def __reversed__(self) -> Iterator[str]:
        return reversed(list(self._keys))

    def copy(self):
        """Returns a copy of this :class:`nn.ParameterDict` instance.
        """
    def copy(self) -> 'ParameterDict':
        """Return a copy of this :class:`~torch.nn.ParameterDict` instance."""
        # We have to use an OrderedDict because the ParameterDict constructor
        # behaves differently on plain dict vs OrderedDict
        return ParameterDict(OrderedDict((k, self[k]) for k in self._keys))

    def __contains__(self, key):
    def __contains__(self, key: str) -> bool:
        return key in self._keys

    def setdefault(self, key, default = None):
        """If key is in the ParameterDict, return its value.
    def setdefault(self, key: str, default: Optional[Any] = None) -> Any:
        """Set the default for a key in the Parameterdict.

        If key is in the ParameterDict, return its value.
        If not, insert `key` with a parameter `default` and return `default`.
        `default` defaults to `None`.

@@ -877,18 +762,16 @@ class ParameterDict(Module):
            key (str): key to set default for
            default (Any): the parameter set to the key
        """

        if key not in self:
            self[key] = default
        return self[key]

    def clear(self):
        """Remove all items from the ParameterDict.
        """
    def clear(self) -> None:
        """Remove all items from the ParameterDict."""
        for k in self._keys.copy():
            del self[k]

    def pop(self, key):
    def pop(self, key: str) -> Any:
        r"""Remove key from the ParameterDict and return its parameter.

        Args:
@@ -898,10 +781,8 @@ class ParameterDict(Module):
        del self[key]
        return v

    def popitem(self):
        """Remove and return the last inserted `(key, parameter)` pair
        from the ParameterDict
        """
    def popitem(self) -> Tuple[str, Any]:
        """Remove and return the last inserted `(key, parameter)` pair from the ParameterDict."""
        k, _ = self._keys.popitem()
        # We need the key in the _keys to be able to access/del
        self._keys[k] = None
@@ -909,9 +790,8 @@ class ParameterDict(Module):
        del self[k]
        return k, val

    def get(self, key, default = None):
        r"""Return the parameter associated with key if present.
        Otherwise return default if provided, None if not.
    def get(self, key: str, default: Optional[Any] = None) -> Any:
        r"""Return the parameter associated with key if present. Otherwise return default if provided, None if not.

        Args:
            key (str): key to get from the ParameterDict
@@ -919,42 +799,38 @@ class ParameterDict(Module):
        """
        return self[key] if key in self else default

    def fromkeys(self, keys, default = None):
        r"""Return a new ParameterDict with the keys provided
    def fromkeys(self, keys: Iterable[str], default: Optional[Any] = None) -> 'ParameterDict':
        r"""Return a new ParameterDict with the keys provided.

        Args:
            keys (iterable, string): keys to make the new ParameterDict from
            default (Parameter, optional): value to set for all keys
        """
        return ParameterDict(((k, default) for k in keys))
        return ParameterDict((k, default) for k in keys)

    def keys(self):
        r"""Return an iterable of the ParameterDict keys.
        """
    def keys(self) -> Iterable[str]:
        r"""Return an iterable of the ParameterDict keys."""
        return self._keys.keys()

    def items(self):
        r"""Return an iterable of the ParameterDict key/value pairs.
        """
    def items(self) -> Iterable[Tuple[str, Any]]:
        r"""Return an iterable of the ParameterDict key/value pairs."""
        return ((k, self[k]) for k in self._keys)

    def values(self):
        r"""Return an iterable of the ParameterDict values.
        """
    def values(self) -> Iterable[Any]:
        r"""Return an iterable of the ParameterDict values."""
        return (self[k] for k in self._keys)

    def update(self, parameters):
        r"""Update the :class:`~nn.ParameterDict` with the key-value pairs from a
        mapping or an iterable, overwriting existing keys.
    def update(self, parameters: Union[Mapping[str, Any], 'ParameterDict']) -> None:
        r"""Update the :class:`~torch.nn.ParameterDict` with key-value pairs from ``parameters``, overwriting existing keys.

        .. note::
            If :attr:`parameters` is an ``OrderedDict``, a :class:`~nn.ParameterDict`, or
            If :attr:`parameters` is an ``OrderedDict``, a :class:`~torch.nn.ParameterDict`, or
            an iterable of key-value pairs, the order of new elements in it is preserved.

        Args:
            parameters (iterable): a mapping (dictionary) from string to
                :class:`~nn.Parameter`, or an iterable of
                key-value pairs of type (string, :class:`~nn.Parameter`)
                :class:`~torch.nn.Parameter`, or an iterable of
                key-value pairs of type (string, :class:`~torch.nn.Parameter`)
        """
        if not isinstance(parameters, container_abcs.Iterable):
            raise TypeError("ParametersDict.update should be called with an "
@@ -980,15 +856,15 @@ class ParameterDict(Module):
                # parameters as length-2 list too cumbersome to type, see ModuleDict.update comment
                self[p[0]] = p[1]  # type: ignore[assignment]

    def extra_repr(self):
    def extra_repr(self) -> str:
        child_lines = []
        for k, p in self.items():
            if isinstance(p, Tensor):
            if isinstance(p, torch.Tensor):
                size_str = 'x'.join(str(size) for size in p.size())
                device_str = '' if not p.is_cuda else ' (GPU {})'.format(p.get_device())
                device_str = ''
                parastr = '{} containing: [{} of size {}{}]'.format(
                    "Parameter" if isinstance(p, Parameter) else "Tensor",
                    typename(p), size_str, device_str)
                    torch.typename(p), size_str, device_str)
                child_lines.append('  (' + str(k) + '): ' + parastr)
            else:
                child_lines.append('  (' + str(k) + '): Object of type: ' + type(p).__name__)
@@ -998,22 +874,16 @@ class ParameterDict(Module):
    def __call__(self, input):
        raise RuntimeError('ParameterDict should not be called.')

    def __or__(self, other):
    def __or__(self, other: 'ParameterDict') -> 'ParameterDict':
        copy = self.copy()
        copy.update(other)
        return copy

    def __ror__(self, other):
    def __ror__(self, other: 'ParameterDict') -> 'ParameterDict':
        copy = other.copy()
        copy.update(self)
        return copy

    def __ior__(self, other):
    def __ior__(self, other : 'ParameterDict') -> Self:
        self.update(other)
        return self

    def to_dict(self):
        new_dict = {}
        for key in self._keys:
            new_dict[key] = self[key]
        return new_dict
--- a/mindtorch/torch/nn/modules/lazy.py
+++ b/mindtorch/torch/nn/modules/lazy.py
@@ -1,17 +1,22 @@
 import itertools
 from typing_extensions import Protocol
 import warnings
 from typing import Protocol, Optional, Type, Any

 from mindspore import _no_grad as torch_no_grad
 from mindtorch.torch.logging import warning
 from mindtorch.utils import unsupported_attr
 from mindtorch import torch
 from ..parameter import is_lazy

 __all__ = ['LazyModuleMixin']

 class _LazyProtocol(Protocol):
    """This class is used to avoid errors with mypy checks for the attributes in a mixin.

    https://mypy.readthedocs.io/en/latest/more_types.html#mixin-classes
    """

    def _register_load_state_dict_pre_hook(self, hook):
        ...

    def register_forward_pre_hook(self, hook):
    def register_forward_pre_hook(self, hook, *, prepend=False, with_kwargs=False):
        ...

    def _lazy_load_hook(
@@ -47,17 +52,139 @@ class _LazyProtocol(Protocol):


 class LazyModuleMixin:
    r"""A mixin for modules that lazily initialize parameters, also known as "lazy modules".

    .. warning:
        Lazy modules are an experimental new feature under active development,
        and their API is likely to change.

    Modules that lazily initialize parameters, or "lazy modules",
    derive the shapes of their parameters from the first input(s)
    to their forward method. Until that first forward they contain
    :class:`torch.nn.UninitializedParameter` s that should not be accessed
    or used, and afterward they contain regular :class:`torch.nn.Parameter` s.
    Lazy modules are convenient since they don't require computing some
    module arguments, like the :attr:`in_features` argument of a
    typical :class:`torch.nn.Linear`.

    After construction, networks with lazy modules should first
    be converted to the desired dtype and placed on the expected device.
    This is because lazy modules only perform shape inference so the usual dtype
    and device placement behavior applies.
    The lazy modules should then perform "dry runs" to initialize all the components in the module.
    These "dry runs" send inputs of the correct size, dtype, and device through
    the network and to each one of its lazy modules. After this the network can be used as usual.

    >>> # xdoctest: +SKIP
    >>> class LazyMLP(torch.nn.Module):
    ...    def __init__(self):
    ...        super().__init__()
    ...        self.fc1 = torch.nn.LazyLinear(10)
    ...        self.relu1 = torch.nn.ReLU()
    ...        self.fc2 = torch.nn.LazyLinear(1)
    ...        self.relu2 = torch.nn.ReLU()
    ...
    ...    def forward(self, input):
    ...        x = self.relu1(self.fc1(input))
    ...        y = self.relu2(self.fc2(x))
    ...        return y
    >>> # constructs a network with lazy modules
    >>> lazy_mlp = LazyMLP()
    >>> # transforms the network's device and dtype
    >>> # NOTE: these transforms can and should be applied after construction and before any 'dry runs'
    >>> lazy_mlp = lazy_mlp.cuda().double()
    >>> lazy_mlp
    LazyMLP( (fc1): LazyLinear(in_features=0, out_features=10, bias=True)
      (relu1): ReLU()
      (fc2): LazyLinear(in_features=0, out_features=1, bias=True)
      (relu2): ReLU()
    )
    >>> # performs a dry run to initialize the network's lazy modules
    >>> lazy_mlp(torch.ones(10,10).cuda())
    >>> # after initialization, LazyLinear modules become regular Linear modules
    >>> lazy_mlp
    LazyMLP(
      (fc1): Linear(in_features=10, out_features=10, bias=True)
      (relu1): ReLU()
      (fc2): Linear(in_features=10, out_features=1, bias=True)
      (relu2): ReLU()
    )
    >>> # attaches an optimizer, since parameters can now be used as usual
    >>> optim = torch.optim.SGD(mlp.parameters(), lr=0.01)

    A final caveat when using lazy modules is that the order of initialization of a network's
    parameters may change, since the lazy modules are always initialized after other modules.
    For example, if the LazyMLP class defined above had a :class:`torch.nn.LazyLinear` module
    first and then a regular :class:`torch.nn.Linear` second, the second module would be
    initialized on construction and the first module would be initialized during the first dry run.
    This can cause the parameters of a network using lazy modules to be initialized differently
    than the parameters of a network without lazy modules as the order of parameter initializations,
    which often depends on a stateful random number generator, is different.
    Check :doc:`/notes/randomness` for more details.

    Lazy modules can be serialized with a state dict like other modules. For example:

    >>> lazy_mlp = LazyMLP()
    >>> # The state dict shows the uninitialized parameters
    >>> lazy_mlp.state_dict()
    OrderedDict([('fc1.weight', Uninitialized parameter),
                 ('fc1.bias',
                  tensor([-1.8832e+25,  4.5636e-41, -1.8832e+25,  4.5636e-41, -6.1598e-30,
                           4.5637e-41, -1.8788e+22,  4.5636e-41, -2.0042e-31,  4.5637e-41])),
                 ('fc2.weight', Uninitialized parameter),
                 ('fc2.bias', tensor([0.0019]))])

    cls_to_become = None

    Lazy modules can load regular :class:`torch.nn.Parameter` s (i.e. you can serialize/deserialize
    initialized LazyModules and they will remain initialized)


    >>> full_mlp = LazyMLP()
    >>> # Dry run to initialize another module
    >>> full_mlp.forward(torch.ones(10, 1))
    >>> # Load an initialized state into a lazy module
    >>> lazy_mlp.load_state_dict(full_mlp.state_dict())
    >>> # The state dict now holds valid values
    >>> lazy_mlp.state_dict()
    OrderedDict([('fc1.weight',
                  tensor([[-0.3837],
                          [ 0.0907],
                          [ 0.6708],
                          [-0.5223],
                          [-0.9028],
                          [ 0.2851],
                          [-0.4537],
                          [ 0.6813],
                          [ 0.5766],
                          [-0.8678]])),
                 ('fc1.bias',
                  tensor([-1.8832e+25,  4.5636e-41, -1.8832e+25,  4.5636e-41, -6.1598e-30,
                           4.5637e-41, -1.8788e+22,  4.5636e-41, -2.0042e-31,  4.5637e-41])),
                 ('fc2.weight',
                  tensor([[ 0.1320,  0.2938,  0.0679,  0.2793,  0.1088, -0.1795, -0.2301,  0.2807,
                            0.2479,  0.1091]])),
                 ('fc2.bias', tensor([0.0019]))])

    Note, however, that the loaded parameters will not be replaced when doing a "dry run" if they are initialized
    when the state is loaded. This prevents using initialized modules in different contexts.
    """

    # modules inheriting from this will change their __class__ to the specified
    # one after they are fully initialized
    cls_to_become: Optional[Type[Any]] = None

    def __init__(self: _LazyProtocol, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Mypy doesnt like this super call in a mixin
        super().__init__(*args, **kwargs)  # type: ignore[misc]
        self._load_hook = self._register_load_state_dict_pre_hook(self._lazy_load_hook)
        self._initialize_hook = self.register_forward_pre_hook(self._infer_parameters)
        warning('Lazy modules are a new feature under heavy development '
                'so changes to the API or functionality can happen at any moment.')
        self._initialize_hook = self.register_forward_pre_hook(self._infer_parameters, with_kwargs=True)
        warnings.warn('Lazy modules are a new feature under heavy development '
                      'so changes to the API or functionality can happen at any moment.')

    def _save_to_state_dict(self: _LazyProtocol, destination, prefix, keep_vars):
        # This should be ideally implemented as a hook,
        # but we should override `detach` in the UninitializedParameter to return itself
        # which is not clean
        for name, param in self._parameters.items():
            if param is not None:
                if not (is_lazy(param) or keep_vars):
@@ -72,24 +199,38 @@ class LazyModuleMixin:
    def _lazy_load_hook(
            self: _LazyProtocol, state_dict, prefix, local_metadata, strict,
            missing_keys, unexpected_keys, error_msgs):
        unsupported_attr(local_metadata)
        unsupported_attr(strict)
        unsupported_attr(missing_keys)
        unsupported_attr(unexpected_keys)
        unsupported_attr(error_msgs)
        """load_state_dict pre-hook function for lazy buffers and parameters.

        The purpose of this hook is to adjust the current state and/or
        ``state_dict`` being loaded so that a module instance serialized in
        both un/initialized state can be deserialized onto both un/initialized
        module instance.
        See comment in ``torch.nn.Module._register_load_state_dict_pre_hook``
        for the details of the hook specification.
        """
        for name, param in itertools.chain(self._parameters.items(), self._buffers.items()):
            key = prefix + name
            if key in state_dict and param is not None:
                input_param = state_dict[key]
                if is_lazy(param):
                    # The current parameter is not initialized but the one being loaded one is
                    # create a new parameter based on the uninitialized one
                    if not is_lazy(input_param):
                        with torch_no_grad():
                        with torch.no_grad():
                            param.materialize(input_param.shape)

    def initialize_parameters(self: _LazyProtocol, *args, **kwargs):
        raise NotImplementedError('initialize_parameters is not implemented for {}'.format(self.__class__.__name__))
        r"""Initialize parameters according to the input batch properties.

        This adds an interface to isolate parameter initialization from the
        forward pass when doing parameter shape inference.
        """
        raise NotImplementedError(f'initialize_parameters is not implemented for {self.__class__.__name__}')

    def has_uninitialized_params(self: _LazyProtocol):
        r"""Check if a module has parameters that are not initialized."""
        # This is to avoid the JIT to track this parameter and force
        # custom modules __setstate__ to add it
        params = self._parameters.values()
        buffers = self._buffers.values()
        for param in itertools.chain(params, buffers):
@@ -97,10 +238,20 @@ class LazyModuleMixin:
                return True
        return False

    def _infer_parameters(self: _LazyProtocol, module, input):
        module.initialize_parameters(*input)
    def _infer_parameters(self: _LazyProtocol, module, args, kwargs=None):
        r"""Infers the size and initializes the parameters according to the provided input batch.

        Given a module that contains parameters that were declared inferrable
        using :class:`torch.nn.parameter.ParameterMode.Infer`, runs a forward pass
        in the complete module using the provided input to initialize all the parameters
        as needed.
        The module is set into evaluation mode before running the forward pass in order
        to avoid saving statistics or calculating gradients
        """
        kwargs = kwargs if kwargs else {}
        module.initialize_parameters(*args, **kwargs)
        if module.has_uninitialized_params():
            raise RuntimeError('module {} has not been fully initialized'.format(self._get_name()))
            raise RuntimeError(f'module {self._get_name()} has not been fully initialized')
        module._initialize_hook.remove()
        module._load_hook.remove()
        delattr(module, '_initialize_hook')
@@ -111,4 +262,4 @@ class LazyModuleMixin:

    def _replicate_for_data_parallel(self: _LazyProtocol):
        raise RuntimeError('Modules with uninitialized parameters can\'t be used with `DataParallel`. '
                           'Run a dummy forward pass to correctly initialize the modules')
                           'Run a dummy forward pass to correctly initialize the modules')
--- a/mindtorch/torch/nn/modules/module.py
+++ b/mindtorch/torch/nn/modules/module.py
--- a/mindtorch/torch/nn/modules/transformer.py
+++ b/mindtorch/torch/nn/modules/transformer.py
@@ -168,11 +168,6 @@ class TransformerEncoderLayer(Module):
            self.activation_relu_or_gelu = 0
        self.activation = activation

    def __setstate__(self, state):
        if 'activation' not in state[1]:
            state[1]['activation'] = F.relu
        super(TransformerEncoderLayer, self).__setstate__(state)

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        src = cast_to_ms_tensor(src)
        src_mask = cast_to_ms_tensor(src_mask)
@@ -231,10 +226,10 @@ class TransformerDecoderLayer(Module):
        else:
            self.activation = activation

    def __setstate__(self, state):
        if 'activation' not in state[1]:
            state[1]['activation'] = F.relu
        super(TransformerDecoderLayer, self).__setstate__(state)
    # def __setstate__(self, state):
    #     if 'activation' not in state[1]:
    #         state[1]['activation'] = F.relu
    #     super(TransformerDecoderLayer, self).__setstate__(state)

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, tgt_key_padding_mask=None,
                memory_key_padding_mask=None):
--- a/mindtorch/torch/nn/parameter.py
+++ b/mindtorch/torch/nn/parameter.py
@@ -16,6 +16,9 @@ from mindtorch.torch.tensor import Tensor, cast_to_ms_tensor, cast_to_adapter_te
 from mindtorch.torch.common.dtype import _msdtype2typeDict
 from mindtorch.torch.functional import empty as torch_empty
 from mindtorch.utils import unsupported_attr, graph_mode_condition
 from mindtorch.utils import unsupported_attr
 from mindspore import Parameter as msParameter
 from mindtorch import torch

 __all__ = ['Parameter', 'ParameterTuple', 'UninitializedParameter', 'UninitializedBuffer']

@@ -39,144 +42,35 @@ def init_to_value(init):
        return float(init)
    raise ValueError("The argument 'init' should be number or string, but got {}.".format(type(init)))

 class Parameter(ms.Parameter):

 class Parameter(Tensor):
    _base_type = {}
    def __new__(cls, data, *args, **kwargs):
        init_data_flag = bool(isinstance(data, ms.Tensor) and data.has_init)
        rc = sys.getrefcount(data)
        input_class, *class_init_args = Parameter._get_parameter_new_args(data, rc)
        new_type = Parameter._get_base_class(input_class)
        obj = input_class.__new__(new_type)
        input_class.__init__(obj, *class_init_args)
        obj.init_mode = None
        obj.is_default_input_init = init_data_flag
        if obj.has_init:
            obj.init_mode = data
        return obj

    def __reduce_ex__(self, _):
        data = self
        if self.init_mode is not None:
            data = self.init_mode
        else:
            # cast to break deep infinite loop while deepcopy
            data = ms.Tensor(self)
        return (
            Parameter, (data, self.requires_grad, self.name, self.layerwise_parallel))

    def __init__(self, data, requires_grad=True, name=None, layerwise_parallel=False, parallel_optimizer=True):
        self.adapter_flag = True
        super().__init__(default_input=data, name=name, requires_grad=requires_grad,
                         layerwise_parallel=layerwise_parallel, parallel_optimizer=parallel_optimizer)

    def __deepcopy__(self, memodict):
        new_obj = Parameter(self)
        new_obj.name = self.name
        new_obj._inited_param = self._inited_param
        return new_obj

    def __str__(self):
        if self.init_finished:
            Tensor_.data_sync(self.data, True)
        return f'Parameter containing: {Tensor_.__repr__(self.data)}, requires_grad={self.requires_grad})'

    @staticmethod
    def _get_base_class(input_class):
        input_class_name = Parameter.__name__
        if input_class_name in Parameter._base_type:
            new_type = Parameter._base_type.get(input_class_name)
    is_leaf = True
    retains_grad = False

    # def __reduce_ex__(self, _):
    #     data = self
    #     if self.init_mode is not None:
    #         data = self.init_mode
    #     else:
    #         # cast to break deep infinite loop while deepcopy
    #         data = ms.Tensor(self)
    #     return (
    #         Parameter, (data, self.requires_grad, self.name, self.layerwise_parallel))

    def __init__(self, data, requires_grad=True, name=None):
        if isinstance(data, Tensor):
            super().__init__(data, requires_grad=requires_grad, cast_tensor=True)
        else:
            new_type = type(input_class_name, (Parameter, input_class), {})
            Parameter._base_type[input_class_name] = new_type
        return new_type

    @property
    def dtype(self):
        dtype = super(Parameter, self).dtype
        return _msdtype2typeDict.get(str(dtype), dtype)

    @property
    def data(self):
        """Return the parameter object."""
        return self

    @data.setter
    def data(self, data):
        ms_data = cast_to_ms_tensor(data)
        self.set_data(ms_data, True)

    def _update_tensor_data(self, data):
        """Update the parameter by a Tensor."""
        if isinstance(self, ms.Tensor):
            self.init_flag = False
            self.init = None
            return self.assign_value(data)
        new_param = Parameter(data, self.name, self.requires_grad)
        new_param.param_info = self.param_info
        return new_param

    @staticmethod
    def _from_tensor(tensor, *args, **kwargs):
        """Create a `Parameter` that data is shared from a `Tensor`."""
        if not isinstance(tensor, Tensor_):
            raise TypeError(f"The type of input must be Tensor, but got {type(tensor)}.")
        param = Tensor_.__new__(Parameter)
        Tensor_.__init__(param, tensor)
        param.init = None
        param.init_mode = None
        param.is_default_input_init = False
        Parameter.__init__(param, tensor, *args, **kwargs)
        return param

    def requires_grad_(self, requires_grad=True):
        self.requires_grad = requires_grad
        return self
            raise ValueError(f'not support type {type(data)}.')
        self.name = name
        print(self.tensor.has_init)
        self.tensor = ms.Parameter(self.tensor, name, requires_grad)

    def detach(self):
        return cast_to_adapter_tensor(ms.Parameter.value(self))

    def numel(self):
        shape = self.shape
        return reduce((lambda x, y: x * y), shape) if shape else 1

    def nelement(self):
        return self.numel()

    def item(self):
        if self.numel() > 1:
            raise ValueError("only one element tensors can be converted to Python scalars")
        output = self.asnumpy().reshape(-1).tolist()
        return output[0]

    def stride(self, dim=None):
        bytelen = self.itemsize
        output = list(self.strides)
        for i in range(len(output)):
            output[i] = output[i]//bytelen
        output = tuple(output)
        if dim is not None:
            output = output[dim]
        return output

    def is_signed(self):
        return self.dtype in mstype.signed_type

    def is_complex(self):
        return self.dtype in mstype.complex_type

    def is_floating_point(self):
        return self.dtype in [mstype.float32, mstype.float16, mstype.float64]

    @jit_forbidden_register
    def assign_value(self, value):
        if validator.is_stub_tensor(value):
            value = value.stub_sync()
        self.assign_value_cpp(value)
        return self

    @property
    def shape(self):
        return self._shape
    def __repr__(self):
        # if self.init_finished:
        #     Tensor_.data_sync(self.data, True)
        return f'Parameter containing: {self.data}, requires_grad={self.requires_grad})'

    def set_(self, source=None, storage_offset=0, size=None, stride=None):
        if storage_offset or size or stride:
@@ -305,59 +199,56 @@ class UninitializedTensorMixin:
 def is_lazy(param):
    return isinstance(param, UninitializedTensorMixin)


 class UninitializedParameter(UninitializedTensorMixin, Parameter):
    r"""A parameter that is not initialized.

    Uninitialized Parameters are a a special case of :class:`torch.nn.Parameter`
    where the shape of the data is still unknown.

    Unlike a :class:`torch.nn.Parameter`, uninitialized parameters
    hold no data and attempting to access some properties, like their shape,
    will throw a runtime error. The only operations that can be performed on a uninitialized
    parameter are changing its datatype, moving it to a different device and
    converting it to a regular :class:`torch.nn.Parameter`.

    The default device or dtype to use when the parameter is materialized can be set
    during construction using e.g. ``device='cuda'``.
    """

    cls_to_become = Parameter
    _base_type = {}
    def __new__(cls, requires_grad=True, device=None, dtype=None):
        factory_kwargs = {'device': device, 'dtype': dtype}
        data = torch_empty(1, **factory_kwargs)
        init_data_flag = bool(isinstance(data, ms.Tensor) and data.has_init)
        rc = sys.getrefcount(data)
        input_class, *class_init_args = UninitializedParameter._get_parameter_new_args(data, rc)
        new_type = UninitializedParameter._get_base_class(input_class)
        obj = input_class.__new__(new_type)
        input_class.__init__(obj, *class_init_args)
        obj.init_mode = None
        obj.is_default_input_init = init_data_flag
        if obj.has_init:
            obj.init_mode = data
        unsupported_attr(requires_grad)
        return obj

    def __init__(self, requires_grad=True, device=None, dtype=None):

    def __new__(cls, requires_grad=True, device=None, dtype=None) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        data = torch_empty(1, **factory_kwargs)
        Parameter.__init__(self, data, requires_grad=requires_grad)

    @staticmethod
    def _get_base_class(input_class):
        input_class_name = UninitializedParameter.__name__
        if input_class_name in UninitializedParameter._base_type:
            new_type = UninitializedParameter._base_type.get(input_class_name)
        data = torch.empty(0, **factory_kwargs)
        return torch.Tensor._make_subclass(cls, data, requires_grad)

    def __deepcopy__(self, memo):
        if id(self) in memo:
            return memo[id(self)]
        else:
            new_type = \
                type(input_class_name, (UninitializedParameter, UninitializedTensorMixin, Parameter, input_class), {})
            UninitializedParameter._base_type[input_class_name] = new_type
        return new_type
            result = type(self)(self.requires_grad, self.data.device, self.data.dtype)
            memo[id(self)] = result
            return result

    def __str__(self):
        if self.init_finished:
            Tensor_.data_sync(self.data, True)
        return f'UninitializedParameter containing: {Tensor_.__repr__(self.data)}, requires_grad={self.requires_grad})'
 class UninitializedBuffer(UninitializedTensorMixin, Tensor):
    r"""A buffer that is not initialized.

    def __repr__(self):
        return self.__str__()
    Uninitialized Buffer is a a special case of :class:`torch.Tensor`
    where the shape of the data is still unknown.

    Unlike a :class:`torch.Tensor`, uninitialized parameters
    hold no data and attempting to access some properties, like their shape,
    will throw a runtime error. The only operations that can be performed on a uninitialized
    parameter are changing its datatype, moving it to a different device and
    converting it to a regular :class:`torch.Tensor`.

 class UninitializedBuffer(UninitializedTensorMixin, Tensor):
    The default device or dtype to use when the buffer is materialized can be set
    during construction using e.g. ``device='cuda'``.
    """

    cls_to_become = Tensor

    def __new__(cls, requires_grad=False, device=None, dtype=None):
    def __new__(cls, requires_grad=False, device=None, dtype=None) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        data = torch_empty(1, **factory_kwargs)
        obj = Tensor.__new__(cls)
        Tensor.__init__(obj, data)
        unsupported_attr(requires_grad)
        return obj
        data = torch.empty(0, **factory_kwargs)
        return Tensor(data, dtype=dtype, requires_grad=requires_grad)
--- a/mindtorch/torch/optim/optimizer.py
+++ b/mindtorch/torch/optim/optimizer.py
@@ -261,11 +261,14 @@ class _Optimizer:
        return ret

    def zero_grad(self):
        raise NotImplementedError("'zero_grad' not support yet because of different autograd mechanism "
                                  "between MindSpore and PyTorch. Actually we usually don't need to "
                                  "call 'zero_grad' in MindTorch, because 'mindspore.grad' or 'value_and_grad' always "
                                  "return the new grad without accumulation, so there is no need to clear "
                                  "the grad.")
        if not hasattr(self, 'origin_params'):
            raise NotImplementedError("'zero_grad' not support yet because of different autograd mechanism "
                                    "between MindSpore and PyTorch. Actually we usually don't need to "
                                    "call 'zero_grad' in MindTorch, because 'mindspore.grad' or 'value_and_grad' always "
                                    "return the new grad without accumulation, so there is no need to clear "
                                    "the grad.")
        for param in self.origin_params:
            param.grad = None

 class _OptimizerMeta(abc.ABCMeta, type(Optimizer_MS)):
    """
--- a/mindtorch/torch/optim/sgd.py
+++ b/mindtorch/torch/optim/sgd.py
@@ -1,6 +1,7 @@
 from mindspore.experimental.optim import SGD as SGD_MS
 from mindtorch.torch.optim.optimizer import _Optimizer, _warn_differentiable
 from mindtorch.utils import unsupported_attr
 from mindtorch.torch import Tensor

 _default_lr = 0.01
 class SGD(_Optimizer, SGD_MS):
@@ -16,6 +17,10 @@ class SGD(_Optimizer, SGD_MS):
            # Fake lr. The above code guarantees that every param_group has its own 'lr' setting.
            # So the following _default_lr won't take effect, just for the input args of mindspore SGD.
            lr = _default_lr
        params = list(params)
        self.origin_params = params
        if isinstance(params[0], Tensor):
            params = [param.tensor for param in params]
        SGD_MS.__init__(self, params, lr, momentum, dampening, weight_decay, nesterov, maximize=maximize)
        _Optimizer.__init__(self)
        self._state_map = {'accum': 'momentum_buffer'}
--- a/mindtorch/torch/tensor.py
+++ b/mindtorch/torch/tensor.py
@@ -247,13 +247,9 @@ def _gather_get_padding_pattern(input_shape, index_shape, dim):
            padding_pattern = (0, input_shape[i] - index_shape[i]) + padding_pattern
    return padding_pattern

 class _TensorMeta(type(ms_Tensor), abc.ABCMeta):
    """
    Meta class for Tensor. Used internally.
    """

 class Tensor(StubTensor, metaclass=_TensorMeta):
    def __init__(self, *data, dtype=None, inner=False, cast_tensor=False):
 class Tensor(StubTensor):
    def __init__(self, *data, dtype=None, requires_grad=False, inner=False, cast_tensor=False):
        if cast_tensor:
            if len(data) != 1:
                raise RuntimeError("Tensor init data lenght is not 1 when cast_tensor=True")
@@ -261,9 +257,17 @@ class Tensor(StubTensor, metaclass=_TensorMeta):
            if isinstance(input_data, StubTensor):
                self.stub = input_data.stub
                self.tensor = input_data.tensor
                self.requires_grad_ = input_data.requires_grad_ or requires_grad
                self.grad_fn_ = input_data.grad_fn_
                self.grad_ = input_data.grad_
                self.retain_grad_ = input_data.retain_grad_
            elif isinstance(input_data, Tensor_):
                self.stub = None
                self.tensor = input_data
                self.requires_grad = requires_grad
                self.grad_fn_ = None
                self.grad_ = None
                self.retain_grad_ = False
            else:
                raise ValueError(f"Tensor init data type is invaild: {type(input_data)}")
            self.adapter_flag = True
@@ -290,6 +294,7 @@ class Tensor(StubTensor, metaclass=_TensorMeta):
                init_tensor = ms_Tensor(input_data=_input_data, dtype=dtype)
        super(Tensor, self).__init__(tensor=init_tensor)
        self.adapter_flag = True
        self.requires_grad = requires_grad


    def _process_data(self, data):
@@ -1719,32 +1724,32 @@ class Tensor(StubTensor, metaclass=_TensorMeta):
    def is_quantized(self, flag):
        raise AttributeError("attribute 'is_quantized' of 'torch.Tensor' objects is not writable.")

    @property
    def requires_grad(self):
        warning("tensor.requires_grad only suppport set to True now. So It is always True.")
        return True

    @requires_grad.setter
    def requires_grad(self, flag):
        if not isinstance(flag, bool):
            raise RuntimeError("requires_grad must be a bool")
        if flag is False:
            raise NotImplementedError("tensor.requires_grad can not set to False yet. "
                                      "If tensor is not leaf Tensor, can try tensor.detach() instead. "
                                      "If tensor is leaf Tensor, can replaces tensor with Parameter, because "
                                      "Parameter.requires_grad work with mindspore autograd mechanism, "
                                      "when it set to False, the gradient return by ms.grad"
                                      "(https://www.mindspore.cn/docs/zh-CN/r2.0/"
                                      "api_python/mindspore/mindspore.grad.html) "
                                      "or ms.value_and_grad"
                                      "(https://www.mindspore.cn/docs/zh-CN/r2.0/"
                                      "api_python/mindspore/mindspore.value_and_grad.html)"
                                      " is zero. ")

    def requires_grad_(self, requires_grad=True):
        if requires_grad is False:
            warning("requires_grad is always True in Tensor.")
        return self
    # @property
    # def requires_grad(self):
    #     warning("tensor.requires_grad only suppport set to True now. So It is always True.")
    #     return True

    # @requires_grad.setter
    # def requires_grad(self, flag):
    #     if not isinstance(flag, bool):
    #         raise RuntimeError("requires_grad must be a bool")
    #     if flag is False:
    #         raise NotImplementedError("tensor.requires_grad can not set to False yet. "
    #                                   "If tensor is not leaf Tensor, can try tensor.detach() instead. "
    #                                   "If tensor is leaf Tensor, can replaces tensor with Parameter, because "
    #                                   "Parameter.requires_grad work with mindspore autograd mechanism, "
    #                                   "when it set to False, the gradient return by ms.grad"
    #                                   "(https://www.mindspore.cn/docs/zh-CN/r2.0/"
    #                                   "api_python/mindspore/mindspore.grad.html) "
    #                                   "or ms.value_and_grad"
    #                                   "(https://www.mindspore.cn/docs/zh-CN/r2.0/"
    #                                   "api_python/mindspore/mindspore.value_and_grad.html)"
    #                                   " is zero. ")

    # def requires_grad_(self, requires_grad=True):
    #     if requires_grad is False:
    #         warning("requires_grad is always True in Tensor.")
    #     return self

    def nonzero(self):
        input_ms = cast_to_ms_tensor(self)
@@ -4252,42 +4257,15 @@ class Tensor(StubTensor, metaclass=_TensorMeta):
                return rlt
            return cast_to_adapter_tensor(value), cast_to_adapter_tensor(indices)

    def backward(self, gradient=None, retain_graph=None, create_graph=False, inputs=None):
        unsupported_attr(gradient)
        unsupported_attr(retain_graph)
        unsupported_attr(create_graph)
        unsupported_attr(inputs)
        raise NotImplementedError(
            "tensor.backward() not support yet. please use "
            "mindspore.value_and_grad"
            "(https://www.mindspore.cn/docs/zh-CN/r2.0/api_python/mindspore/mindspore.value_and_grad.html) "
            "or mindspore.grad"
            "(https://www.mindspore.cn/docs/zh-CN/r2.0/api_python/mindspore/mindspore.grad.html) "
            "to compute gradient and send the gradient to the optimizer. "
            "please refer to mobilenet_v2 example: "
            "https://openi.pcl.ac.cn/OpenI/MindTorchModelZoo/src/branch/master/official/cv/"
            "mobilenet_v2/mobilenet_v2_adapter.py")
    def backward(self, grad=None):
        r"""
        calculate the gradient.
        """
        # assert self.shape == ()
        if grad is None:
            grad = self.new_ones(self.shape)

    @property
    def grad(self):
        if hasattr(self, "_grad"):
            return self._grad
        raise NotImplementedError(
            "tensor.grad not support yet. pleause use "
            "mindspore.value_and_grad"
            "(https://www.mindspore.cn/docs/zh-CN/r2.0/api_python/mindspore/mindspore.value_and_grad.html) "
            "or mindspore.grad"
            "(https://www.mindspore.cn/docs/zh-CN/r2.0/api_python/mindspore/mindspore.grad.html) "
            "to get the gradient. And take out the corresponding element as grad."
        )

    @grad.setter
    def grad(self, new_grad):
        self._grad = new_grad

    @grad.deleter
    def grad(self):
        del self._grad
        super().backward(grad)

    def frexp(self):
        # TODO: to use ms.ops.frexp
@@ -4429,19 +4407,15 @@ def _get_default_dtype_by_data(data):
        return default_dtype
    return None

 def tensor(data, dtype=None, device=None, requires_grad=True):
 def tensor(data, dtype=None, device=None, requires_grad=False):
    unsupported_attr(device)
    if requires_grad is False:
        msg = ("In MindTorch, Tensor's `requires_grad` is always 'True', can not be set to 'False'. ")
        warning(msg)

    if dtype is None and _not_default_fp32_dtype():
        dtype = _get_default_dtype_by_data(data)

    if isinstance(data, (tuple, list)) and not data:
        return Tensor(*data, dtype=dtype, inner=False)
        return Tensor(*data, dtype=dtype, requires_grad=requires_grad, inner=False)

    return Tensor(data, dtype=dtype, inner=True)
    return Tensor(data, dtype=dtype, requires_grad=requires_grad, inner=True)

 def cast_to_ms_tensor(inputs):
    """
--- a/mindtorch/torch/utils/hooks.py
+++ b/mindtorch/torch/utils/hooks.py
@@ -1,29 +1,55 @@
 # from mindtorch.torch import Tensor
 # from mindtorch.torch.autograd import is_grad_enabled
 import torch
 from collections import OrderedDict
 import weakref
 import warnings
 # import functools
 from typing import Any
 from typing import Any, Tuple

 class RemovableHandle():
    """A handle which provides the capability to remove a hook."""
 __all__ = ["RemovableHandle", "unserializable_hook", "warn_if_has_hooks", "BackwardHook"]

 class RemovableHandle:
    r"""
    A handle which provides the capability to remove a hook.

    Args:
        hooks_dict (dict): A dictionary of hooks, indexed by hook ``id``.
        extra_dict (Union[dict, List[dict]]): An additional dictionary or list of
            dictionaries whose keys will be deleted when the same keys are
            removed from ``hooks_dict``.
    """

    id: int
    next_id: int = 0
    op = None

    def __init__(self, hooks_dict: Any) -> None:
    def __init__(self, hooks_dict: Any, *, extra_dict: Any = None) -> None:
        self.hooks_dict_ref = weakref.ref(hooks_dict)
        self.id = RemovableHandle.next_id
        RemovableHandle.next_id += 1

        self.extra_dict_ref: Tuple = ()
        if isinstance(extra_dict, dict):
            self.extra_dict_ref = (weakref.ref(extra_dict),)
        elif isinstance(extra_dict, list):
            self.extra_dict_ref = tuple(weakref.ref(d) for d in extra_dict)

    def remove(self) -> None:
        hooks_dict = self.hooks_dict_ref()
        if hooks_dict is not None and self.id in hooks_dict:
            del hooks_dict[self.id]
            if self.op is not None:
                self.op.remove_backward_hook(self.id)


        for ref in self.extra_dict_ref:
            extra_dict = ref()
            if extra_dict is not None and self.id in extra_dict:
                del extra_dict[self.id]

    def __getstate__(self):
        return (self.hooks_dict_ref(), self.id)
        if self.extra_dict_ref is None:
            return (self.hooks_dict_ref(), self.id)
        else:
            return (self.hooks_dict_ref(), self.id, tuple(ref() for ref in self.extra_dict_ref))

    def __setstate__(self, state) -> None:
        if state[0] is None:
@@ -34,7 +60,12 @@ class RemovableHandle():
        self.id = state[1]
        RemovableHandle.next_id = max(RemovableHandle.next_id, self.id + 1)

    def __enter__(self) -> 'RemovableHandle':
        if len(state) < 3 or state[2] is None:
            self.extra_dict_ref = ()
        else:
            self.extra_dict_ref = tuple(weakref.ref(d) for d in state[2])

    def __enter__(self) -> "RemovableHandle":
        return self

    def __exit__(self, type: Any, value: Any, tb: Any) -> None:
@@ -43,7 +74,8 @@ class RemovableHandle():

 def unserializable_hook(f):
    """
    Decorator which marks a function as an unserializable hook.
    Mark a function as an unserializable hook with this decorator.

    This suppresses warnings that would otherwise arise if you attempt
    to serialize a tensor that has a hook.
    """
@@ -56,131 +88,169 @@ def warn_if_has_hooks(tensor):
        for k in tensor._backward_hooks:
            hook = tensor._backward_hooks[k]
            if not hasattr(k, "__torch_unserializable__"):
                warnings.warn("backward hook {} on tensor will not be "
                warnings.warn(f"backward hook {repr(hook)} on tensor will not be "
                              "serialized.  If this is expected, you can "
                              "decorate the function with @torch.utils.hooks.unserializable_hook "
                              "to suppress this warning".format(repr(hook)))

 # TODO: Adapt after the new differential scheme is launched.
 # class BackwardHook(object):
 #     def __init__(self, module, user_hooks):
 #         self.user_hooks = user_hooks
 #         self.module = module
 #
 #         self.grad_outputs = None
 #         self.n_outputs = -1
 #         self.output_tensors_index = None
 #         self.n_inputs = -1
 #         self.input_tensors_index = None
 #
 #     def _pack_with_none(self, indices, values, size):
 #         res = [None] * size
 #         for idx, val in zip(indices, values):
 #             res[idx] = val
 #
 #         return tuple(res)
 #
 #     def _unpack_none(self, indices, values):
 #         res = []
 #         for idx in indices:
 #             res.append(values[idx])
 #
 #         return tuple(res)
 #
 #     def _set_user_hook(self, grad_fn, user_hook):
 #         @functools.wraps(user_hook)
 #         def hook(grad_input, _):
 #             if self.grad_outputs is None:
 #                 raise RuntimeError("Module backward hook for grad_input is called before "
 #                                    "the grad_output one. This happens because the gradient "
 #                                    "in your nn.Module flows to the Module's input without "
 #                                    "passing through the Module's output. Make sure that the "
 #                                    "output depends on the input and that the loss is computed "
 #                                    "based on the output.")
 #
 #             grad_input = self._pack_with_none(self.input_tensors_index, grad_input, self.n_inputs)
 #             res = user_hook(self.module, grad_input, self.grad_outputs)
 #             if res is None:
 #                 return res
 #
 #             if len(res) != len(grad_input):
 #                 raise RuntimeError("Backward hook returned an invalid number of grad_input, "
 #                                    "got {}, but expected {}".format(len(res), len(grad_input)))
 #             return self._unpack_none(self.input_tensors_index, res)
 #         grad_fn.register_hook(hook)
 #
 #     def _apply_on_tensors(self, fn, args):
 #         # Can be used to apply the given function to the tensors contained in the
 #         # args. Will return updated args and the tensors indices
 #         tensors_idx = []
 #         tensors = []
 #
 #         requires_grad = False
 #         for i, arg in enumerate(args):
 #             if isinstance(arg, Tensor):
 #                 tensors_idx.append(i)
 #                 tensors.append(arg)
 #                 requires_grad |= arg.requires_grad
 #
 #         if not (requires_grad and is_grad_enabled()):
 #             return args, None
 #
 #         new_tensors = torch.nn.modules._functions.BackwardHookFunction.apply(*tensors)
 #         if len(new_tensors) == 0:
 #             raise RuntimeError("Cannot set Module backward hook for a Module with no input Tensors.")
 #
 #         grad_fns = [t.grad_fn for t in new_tensors if t.grad_fn is not None and
 #                     t.grad_fn.name() == "BackwardHookFunctionBackward"]
 #         if len(grad_fns) == 0:
 #             raise RuntimeError("Error while setting up backward hooks. Please open "
 #                                "an issue with a code sample to reproduce this.")
 #
 #         fn(grad_fns[0])
 #
 #         arg_list = list(args)
 #         for idx, val in zip(tensors_idx, new_tensors):
 #             arg_list[idx] = val
 #
 #         return tuple(arg_list), tensors_idx
 #
 #     def setup_input_hook(self, args):
 #         def fn(grad_fn):
 #             for hook in self.user_hooks:
 #                 self._set_user_hook(grad_fn, hook)
 #
 #         res, input_idx = self._apply_on_tensors(fn, args)
 #         self.n_inputs = len(args)
 #         self.input_tensors_index = input_idx
 #         return res
 #
 #     def setup_output_hook(self, args):
 #         def fn(grad_fn):
 #             def hook(_, grad_output):
 #                 self.grad_outputs = self._pack_with_none(self.output_tensors_index,
 #                                                          grad_output,
 #                                                          self.n_outputs)
 #
 #                 # Special case if no input required gradients, this hook should call the user
 #                 # hook directly
 #                 if self.input_tensors_index is None:
 #                     grad_inputs = self._pack_with_none([], [], self.n_inputs)
 #                     for user_hook in self.user_hooks:
 #                         res = user_hook(self.module, grad_inputs, self.grad_outputs)
 #                         if res is not None and not (isinstance(res, tuple) and all(el is None for el in res)):
 #                             raise RuntimeError("Backward hook for Modules where no input requires "
 #                                                "gradient should always return None or None for all gradients.")
 #
 #             grad_fn.register_hook(hook)
 #
 #         is_tuple = True
 #         if not isinstance(args, tuple):
 #             args = (args,)
 #             is_tuple = False
 #
 #         res, output_idx = self._apply_on_tensors(fn, args)
 #         self.n_outputs = len(args)
 #         self.output_tensors_index = output_idx
 #
 #         if not is_tuple:
 #             res = res[0]
 #         return res
                              "to suppress this warning")

 class BackwardHook:
    """
    A wrapper class to implement nn.Module backward hooks.

    It handles:
      - Ignoring non-Tensor inputs and replacing them by None before calling the user hook
      - Generating the proper Node to capture a set of Tensor's gradients
      - Linking the gradients captures for the outputs with the gradients captured for the input
      - Calling the user hook once both output and input gradients are available
    """

    def __init__(self, module, user_hooks, user_pre_hooks):
        self.user_hooks = user_hooks
        self.user_pre_hooks = user_pre_hooks
        self.module = module

        self.grad_outputs = None
        self.n_outputs = -1
        self.output_tensors_index = None
        self.n_inputs = -1
        self.input_tensors_index = None

    def _pack_with_none(self, indices, values, size):
        res = [None] * size
        for idx, val in zip(indices, values):
            res[idx] = val

        return tuple(res)

    def _unpack_none(self, indices, values):
        res = []
        for idx in indices:
            res.append(values[idx])

        return tuple(res)

    def _set_user_hook(self, grad_fn):
        def hook(grad_input, _):
            if self.grad_outputs is None:
                # This happens because the gradient in your nn.Module flows to
                # the Module's input without " passing through the Module's
                # output, e.g. when you're doing double backward.
                return
            res = self._pack_with_none(self.input_tensors_index, grad_input, self.n_inputs)

            for hook in self.user_hooks:
                out = hook(self.module, res, self.grad_outputs)

                if out is None:
                    continue

                if len(out) != len(res):
                    raise RuntimeError("Backward hook returned an invalid number of grad_input, "
                                       f"got {len(out)}, but expected {len(res)}")

                res = out

            self.grad_outputs = None

            return self._unpack_none(self.input_tensors_index, res)

        grad_fn.register_hook(hook)

    def _apply_on_tensors(self, fn, args):
        # Can be used to apply the given function to the tensors contained in the
        # args. Will return updated args and the tensors indices
        tensors_idx = []
        tensors = []

        requires_grad = False
        for i, arg in enumerate(args):
            if isinstance(arg, torch.Tensor):
                tensors_idx.append(i)
                tensors.append(arg)
                requires_grad |= arg.requires_grad

        if not (requires_grad and torch.is_grad_enabled()):
            return args, None

        new_tensors = torch.nn.modules._functions.BackwardHookFunction.apply(*tensors)
        if len(new_tensors) == 0:
            raise RuntimeError("Cannot set Module backward hook for a Module with no input Tensors.")

        grad_fns = [t.grad_fn for t in new_tensors if t.grad_fn is not None and t.grad_fn.name() == "BackwardHookFunctionBackward"]
        if len(grad_fns) == 0:
            raise RuntimeError("Error while setting up backward hooks. Please open "
                               "an issue with a code sample to reproduce this.")

        fn(grad_fns[0])

        arg_list = list(args)
        for idx, val in zip(tensors_idx, new_tensors):
            arg_list[idx] = val

        if type(args) is tuple:
            out = tuple(arg_list)
        else:
            out = type(args)(*arg_list)
        return out, tensors_idx

    def setup_input_hook(self, args):
        def fn(grad_fn):
            self._set_user_hook(grad_fn)

        res, input_idx = self._apply_on_tensors(fn, args)
        self.n_inputs = len(args)
        self.input_tensors_index = input_idx
        return res

    def setup_output_hook(self, args):
        def fn(grad_fn):
            def hook(_, grad_output):
                self.grad_outputs = self._pack_with_none(self.output_tensors_index,
                                                         grad_output,
                                                         self.n_outputs)

                if self.user_pre_hooks:
                    expected_len = len(self.grad_outputs)
                    for user_pre_hook in self.user_pre_hooks:
                        hook_grad_outputs = user_pre_hook(self.module, self.grad_outputs)
                        if hook_grad_outputs is None:
                            continue

                        actual_len = len(hook_grad_outputs)
                        if actual_len != expected_len:
                            raise RuntimeError("Backward pre hook returned an invalid number of grad_output, "
                                               f"got {actual_len}, but expected {expected_len}")
                        self.grad_outputs = hook_grad_outputs

                # We need to be able to clear self.grad_outputs but also return it
                local_grad_outputs = self.grad_outputs

                # Special case if no input required gradients, this hook should call the user
                # hook directly
                if self.input_tensors_index is None:
                    grad_inputs = self._pack_with_none([], [], self.n_inputs)
                    for user_hook in self.user_hooks:
                        res = user_hook(self.module, grad_inputs, self.grad_outputs)
                        if res is not None and not (isinstance(res, tuple) and all(el is None for el in res)):
                            raise RuntimeError("Backward hook for Modules where no input requires "
                                               "gradient should always return None or None for all gradients.")
                    self.grad_outputs = None

                if local_grad_outputs is not None:
                    assert self.output_tensors_index is not None  # mypy
                    return tuple(local_grad_outputs[i] for i in self.output_tensors_index)

            grad_fn.register_hook(hook)

        is_tuple = True
        if not isinstance(args, tuple):
            args = (args,)
            is_tuple = False

        res, output_idx = self._apply_on_tensors(fn, args)
        self.n_outputs = len(args)
        self.output_tensors_index = output_idx

        if not is_tuple:
            res = res[0]
        return res
--- a/testing/ut/pytorch/amp/test_grad_scaler.py
+++ b/testing/ut/pytorch/amp/test_grad_scaler.py
@@ -93,7 +93,7 @@ def test_grad_scalar():
            out = scaler.scale(loss)
            return out

        grad_fn = ms.ops.grad(func, None, net.trainable_params())
        grad_fn = ms.ops.grad(func, None, net.paramters())
        grads = grad_fn(inputs, target)

        scaler.unscale_(optimizer, grads)
--- a/testing/ut/pytorch/autograd/test_autograd.py
+++ b/testing/ut/pytorch/autograd/test_autograd.py
@@ -0,0 +1,209 @@
 import copy
 from mindtorch import torch
 from mindtorch.torch import nn

 import mindspore
 # mindspore.set_context(pynative_synchronize=True)
 # mindspore.set_context(device_target="CPU")

 class Function(nn.Module):
    def __init__(self):
        super(Function, self).__init__()
        self.Linear = nn.Linear(1,1)

    def forward(self, input):
        output = self.Linear(input)
        return output

 def test_normal_train():
    x = torch.tensor([2.0])
    y = torch.tensor([4.0])
    func = Function()
    loss_fn = nn.MSELoss()
    optim = torch.optim.SGD(func.parameters(), lr=0.01)

    w_grad_list = []
    for _ in range(3):
        optim.zero_grad()
        y_hat = func(x)
        loss = loss_fn(y_hat, y)
        loss.backward()
        # optim.step() each step different if update parameter
        w_grad_list.append(copy.deepcopy(func.Linear.weight.grad))

    assert w_grad_list[1].numpy() == w_grad_list[0].numpy()
    assert w_grad_list[2].numpy() == w_grad_list[0].numpy()

 def test_grad_accumulate():
    x = torch.tensor([2.0])
    y = torch.tensor([4.0])
    func = Function()
    loss_fn = torch.nn.MSELoss()
    optim = torch.optim.SGD(func.parameters(), lr=0.01)

    w_grad_list = []
    optim.zero_grad()
    for _ in range(3):
        y_hat = func(copy.deepcopy(x))
        loss = loss_fn(y_hat, y)
        loss.backward()
        w_grad_list.append(copy.deepcopy(func.Linear.weight.grad))

    optim.step()

    assert w_grad_list[1].numpy() == (2 * w_grad_list[0].numpy())
    assert w_grad_list[2].numpy() == (3 * w_grad_list[0].numpy())


 def test_intermediate_values():
    func = Function()
    x = torch.tensor([1.0])
    y = func(x)
    y_hat =  y ** 2

    y_hat.backward()
    assert y.grad is None
    assert y_hat.grad is None    

 # def test_retain_graph():
 #     func = Function()

 #     x = torch.tensor([1.0])
 #     x.requires_grad=True
 #     y = func(x) ** 2
 #     print(y.shape)

 #     y.backward(retain_graph=True)
 #     w_grad_0 = copy.deepcopy(func.Linear.weight.grad)
 #     y.backward()
 #     w_grad_1 = func.Linear.weight.grad

 #     # print(func.Linear.weight.grad)
 #     print(w_grad_0, w_grad_1)
 #     assert w_grad_1.numpy() == (2 * w_grad_0).numpy()

 def test_create_grad():
    # for high order
    pass

 def test_multi_loss():
    x = torch.tensor([2.0])
    y0 = torch.tensor([4.0])
    y1 = torch.tensor([4.0])
    func = Function()
    loss_fn = torch.nn.MSELoss()
    
    w_grad_list = []
    y_hat = func(copy.deepcopy(x))
    loss0 = loss_fn(y_hat, y0)
    # loss0.backward(retain_graph=True)
    loss0.backward()
    w_grad_list.append(copy.deepcopy(func.Linear.weight.grad))

    loss1 = loss_fn(y_hat, y1)
    loss1.backward()
    w_grad_list.append(copy.deepcopy(func.Linear.weight.grad))

    assert w_grad_list[1].numpy() == (2 * w_grad_list[0].numpy())


 def test_joint_loss():
    x = torch.tensor([2.0])
    y0 = torch.tensor([4.0])
    y1 = torch.tensor([4.0])
    func = Function()
    loss_fn = torch.nn.MSELoss()
    
    y_hat = func(copy.deepcopy(x))
    assert func.Linear.weight.grad is None
    loss0 = loss_fn(y_hat, y0)    
    loss1 = loss_fn(y_hat, y1)
    (loss1 + loss0).backward()

    assert func.Linear.weight.grad is not None


 # def test_two_net_connect_with_detach():
 #     x = torch.tensor([1.0])
 #     y = torch.tensor([2.0])

 #     func_0 = Function()
 #     func_1 = Function()
 #     loss_fn = torch.nn.MSELoss()

 #     y_0 = func_0(x)
 #     y_0 = y_0.detach()
 #     y_1 = func_1(y_0)
 #     loss = loss_fn(y_1, y)
 #     loss.backward()
    
 #     assert func_0.Linear.weight.grad is None
 #     assert func_0.Linear.bias.grad is None

 #     assert func_1.Linear.weight.grad is not None
 #     assert func_1.Linear.bias.grad is not None

 def test_two_net_connect_without_detach():
    x = torch.tensor([1.0])
    y = torch.tensor([2.0])

    func_0 = Function()
    func_1 = Function()
    loss_fn = torch.nn.MSELoss()

    y_0 = func_0(x)
    y_1 = func_1(y_0)
    loss = loss_fn(y_1, y)
    loss.backward()

    assert func_0.Linear.weight.grad is not None
    assert func_0.Linear.bias.grad is not None

    assert func_1.Linear.weight.grad is not None
    assert func_1.Linear.bias.grad is not None

 # def test_share_weight():
 #     x = torch.tensor([1.0])
 #     y = torch.tensor([2.0])

 #     func_0 = Function()
 #     func_1 = Function()
 #     loss_fn = torch.nn.MSELoss()
 #     # not share weight
 #     y_0 = func_0(x)
 #     y_1 = func_1(y_0)
 #     loss = loss_fn(y_1, y)
 #     loss.backward()
    
 #     print(func_0.Linear.weight.grad)
 #     print(func_1.Linear.weight.grad)

 #     assert func_0.Linear.weight.grad != func_1.Linear.weight.grad
    
 #     func_0_weight_not_shared = copy.deepcopy(func_0.Linear.weight.grad)
 #     func_1_weight_not_shared = copy.deepcopy(func_1.Linear.weight.grad)
 #     print(func_0_weight_not_shared, func_1_weight_not_shared)
 #     # zero_grad
 #     func_0.zero_grad()
 #     func_1.zero_grad()
 #     # share weight
 #     func_1.Linear.weight = func_0.Linear.weight
 #     y_0 = func_0(x)
 #     y_1 = func_1(y_0)
 #     loss = loss_fn(y_1, y)
 #     loss.backward()

 #     print(func_0.Linear.weight.grad, func_1.Linear.weight.grad)
 #     assert func_0.Linear.weight == func_1.Linear.weight
 #     assert func_0.Linear.weight.grad == func_1.Linear.weight.grad
 #     assert func_0.Linear.weight.grad != func_0_weight_not_shared
 #     assert func_0.Linear.weight.grad != func_1_weight_not_shared

 def test_vanilla_backward():
    x = torch.tensor([1.0], requires_grad=True)
    y = x * 2
    z = y + x
    z.backward()
    
    assert x.grad is not None
    assert x.grad.numpy() == [3]
--- a/testing/ut/pytorch/autograd/test_autograd_function.py
+++ b/testing/ut/pytorch/autograd/test_autograd_function.py
@@ -30,7 +30,7 @@ def adapter_autograd_function():
    y = ms_torch.tensor([[0.01], [0.2], [3.3]], dtype=ms_torch.float32)
    net = Net()
    out = net(x, y)
    grad_out = ms.grad(net, grad_position=(0, 1))(x, y)
    grad_out = ag.grad(net, grad_position=(0, 1))(x, y)
    return out, grad_out


--- a/testing/ut/pytorch/autograd/test_backward.py
+++ b/testing/ut/pytorch/autograd/test_backward.py
@@ -0,0 +1,56 @@
 import numpy as np
 import torch
 from typing import Optional, Union
 from mindtorch.torch import Tensor as mtTensor
 from mindtorch.torch.nn import Parameter
 from torch import Tensor as ptTensor

 def run_backward(tensor: Union[mtTensor, ptTensor], grad_input=None):
    if grad_input is None:
        assert tensor.shape == ()
        tensor.backward()
    
    else:
        assert tensor.shape == grad_input.shape
        tensor.backward(grad_input)

 def run_simple_op(a: Union[mtTensor, ptTensor], b: Union[mtTensor, ptTensor], op: str):
    if op == '+':
        return a + b
    if op == '-':
        return a + b
    if op == '*':
        return a + b
    if op == '/':
        return a + b
    if op == '@':
        return a @ b
    raise ValueError(f'not support {op} yet')

 def test_simple_op_backward_test():
    a = np.random.randn(3, 3).astype(np.float32)
    b = np.random.randn(3, 3).astype(np.float32)

    pt_a, pt_b = torch.tensor(a, requires_grad=True), torch.tensor(b, requires_grad=True)
    mt_a, mt_b = mtTensor(a, requires_grad=True), mtTensor(b, requires_grad=True)

    print(mt_a.requires_grad)

    op_list = ['+', '-', '*', '/', '@']

    for op in op_list:
        pt_out = run_simple_op(pt_a, pt_b, op)
        mt_out = run_simple_op(mt_a, mt_b, op)
        print(mt_out.requires_grad)
        print('pt_out.requires_grad', pt_out.requires_grad)

        assert np.allclose(pt_out.detach().numpy(), mt_out.numpy(), 1e-4, 1e-4)

        run_backward(pt_out, torch.tensor(np.ones((3, 3), np.float32)))
        run_backward(mt_out, mtTensor(np.ones((3, 3), np.float32)))

        # assert has grad
        assert mt_a.grad is not None and mt_b.grad is not None
        # allclose
        assert np.allclose(pt_a.grad.detach().numpy(), mt_a.grad.numpy(), 1e-4, 1e-4)
        assert np.allclose(pt_b.grad.detach().numpy(), mt_b.grad.numpy(), 1e-4, 1e-4)
--- a/testing/ut/pytorch/autograd/test_grad_mode.py
+++ b/testing/ut/pytorch/autograd/test_grad_mode.py
@@ -45,7 +45,7 @@ def adapter_no_grad():
    z = ms_torch.tensor([[0.01]], dtype=ms_torch.float32, requires_grad=True)
    net = Net()
    out = net(x, y, z)
    grad_out = ms.grad(net, grad_position=(0, 1, 2))(x, y, z)
    grad_out = ag.grad(net, grad_position=(0, 1, 2))(x, y, z)
    return out, grad_out


--- a/testing/ut/pytorch/functional/test_function.py
+++ b/testing/ut/pytorch/functional/test_function.py
@@ -2682,8 +2682,8 @@ def test_clone():
    ms_out1 = ms_fun(ms_a)

    assert np.allclose(torch_out1.detach().numpy(), ms_out1.numpy())
    assert np.allclose(torch_a.grad.detach().numpy(), ms.grad(ms_fun)(ms_a).numpy())
    assert torch_a.grad.detach().numpy().dtype == ms.grad(ms_fun)(ms_a).numpy().dtype
    assert np.allclose(torch_a.grad.detach().numpy(), ag.grad(ms_fun)(ms_a).numpy())
    assert torch_a.grad.detach().numpy().dtype == ag.grad(ms_fun)(ms_a).numpy().dtype

 def test_slice_scatter():
    a = torch.zeros(8, 8)
@@ -3034,7 +3034,7 @@ def test_bernoulli_grad():

    input = ms_torch.empty(3, 3).uniform_(0, 1).requires_grad_(True)
    net = ms_Net()
    ms_gradient = ms.grad(net)(input)
    ms_gradient = ag.grad(net)(input)

    class torch_Net(torch.nn.Module):
        def forward(self, input):
--- a/testing/ut/pytorch/nn/test_activation.py
+++ b/testing/ut/pytorch/nn/test_activation.py
@@ -8,6 +8,9 @@ from mindspore import context
 import mindspore as ms
 import torch
 import pytest
 from mindspore._c_expression import jit_mode_pi_disable

 from mindtorch.torch import autograd as ag

 from ...utils import set_mode_by_env_config, SKIP_ENV_GRAPH_MODE, SKIP_ENV_ASCEND, param_compare, type_shape_compare, \
    SKIP_ENV_CPU
@@ -39,6 +42,7 @@ def test_relu2():
    ms_input = ms_torch.tensor(data.astype(np.float32))
    ms_output = ms_net(ms_input)


    assert np.allclose(ms_input.asnumpy(), torch_input.numpy())
    assert np.allclose(ms_output.asnumpy(), torch_output.numpy())

@@ -66,6 +70,7 @@ def test_hardtanh2():
    torch_output = torch_net(torch_input)

    ms_input = ms_torch.tensor(data)
    print(type(ms_input))
    ms_output = ms_net(ms_input)

    assert np.allclose(ms_input.asnumpy(), torch_input.numpy())
@@ -721,6 +726,7 @@ def test_prelu():
    torch_out = torch.nn.PReLU(num_parameters=1, init=weight_init)(torch_input)
    ms_torch_input = ms_torch.tensor(input)
    ms_torch_out = ms_torch.nn.PReLU(num_parameters=1, init=weight_init)(ms_torch_input)
    print(type(torch_out))
    assert np.allclose(torch_out.detach().numpy(), ms_torch_out.detach().numpy())

    input1 = np.array([0.1, 0.6, 0.9]).astype(np.float32)
@@ -757,7 +763,9 @@ def test_prelu():
 def test_prelu_grad():
    net = ms_torch.nn.PReLU()
    x = ms_torch.Tensor([1, 2, -3])
    grad_fn = ms.grad(net, grad_position=None, weights=net.trainable_params())
    def forward(x):
        return net(x)
    grad_fn = ag.grad(forward, grad_position=None, weights=net.parameters())
    grad = grad_fn(x)[0]
    assert np.count_nonzero(grad.asnumpy()) != 0

--- a/testing/ut/pytorch/nn/test_container.py
+++ b/testing/ut/pytorch/nn/test_container.py
@@ -7,6 +7,7 @@ import torch
 import mindspore as ms
 import mindtorch.torch as ms_torch
 import mindtorch.torch.nn as nn
 from mindtorch.torch import autograd as ag

 from ...utils import set_mode_by_env_config, SKIP_ENV_GRAPH_MODE
 set_mode_by_env_config()
@@ -202,7 +203,7 @@ def test_module_dict_grad():
    net = MyModule()
    input_np = np.arange(4).reshape(2, 2).astype(np.float32)
    input = ms_torch.tensor(input_np)
    grad = ms.grad(net, grad_position=None, weights=net.trainable_params())(input)
    grad = ag.grad(net, grad_position=None, weights=net.parameters())(input)
    assert len(grad) == 4


@@ -336,7 +337,7 @@ def test_module_list_grad():
    net = MyModule()
    input_np = np.arange(4).reshape(2, 2).astype(np.float32)
    input = ms_torch.tensor(input_np)
    grad = ms.grad(net, grad_position=None, weights=net.trainable_params())(input)
    grad = ag.grad(net, grad_position=None, weights=net.parameters())(input)
    assert len(grad) == 4

 def test_module_list_insert_zero():
@@ -460,7 +461,7 @@ def test_parameter_list():
    torch_out.backward()
    torch_grad = torch_net.params[0].grad

    ms_grad = ms.grad(ms_torch_net, grad_position=None, weights=ms_torch_net.trainable_params(), has_aux=False)(ms_torch.tensor(x))
    ms_grad = ag.grad(ms_torch_net, grad_position=None, weights=ms_torch_net.parameters(), has_aux=False)(ms_torch.tensor(x))
    assert torch_grad.size() ==  ms_grad[0].shape
    assert np.allclose(torch_grad.numpy(), ms_grad[0].numpy())

@@ -484,10 +485,8 @@ def test_parameter_list_to_list():
    ms_torch_net.params.append(ms_torch.nn.Parameter(ms_torch.tensor(init_data)))
    ms_torch_net.params.extend([ms_torch.nn.Parameter(ms_torch.tensor(init_data))])

    ms_torch_net.params = ms_torch_net.params.to_list()  #to avoid graph mode error

    ms_torch_out = ms_torch_net(ms_torch.tensor(x))
    ms_grad = ms.grad(ms_torch_net, grad_position=None, weights=ms_torch_net.trainable_params(), has_aux=False)(ms_torch.tensor(x))
    ms_grad = ag.grad(ms_torch_net, grad_position=None, weights=ms_torch_net.parameters(), has_aux=False)(ms_torch.tensor(x))


@SKIP_ENV_GRAPH_MODE(reason="Graph mode unsupport custom list/tuple.")
@@ -531,7 +530,7 @@ def test_parameter_dict_grad():
    torch_out.backward()
    torch_grad = torch_net.params['right'].grad

    ms_grad = ms.grad(ms_torch_net, grad_position=None, weights=ms_torch_net.trainable_params(), has_aux=False)(ms_torch.tensor(x))
    ms_grad = ag.grad(ms_torch_net, grad_position=None, weights=ms_torch_net.parameters(), has_aux=False)(ms_torch.tensor(x))
    assert torch_grad.size() ==  ms_grad[1].shape
    assert np.allclose(torch_grad.numpy(), ms_grad[1].numpy())

@@ -547,16 +546,15 @@ def test_parameter_dict_to_dict():
                    'right': ms_torch.nn.Parameter(ms_torch.tensor(init_data2))
            })
            self.params.update({'left': ms_torch.nn.Parameter(ms_torch.tensor(init_data2))})
            self.new_params = self.params.to_dict()  #to avoid graph mode error

        def forward(self, x):
            x = self.new_params['right'].mm(x)
            x = self.params['right'].mm(x)
            return x

    x = np.random.randn(10, 1).astype(np.float32)
    ms_torch_net = MyMsModule()
    ms_torch_out = ms_torch_net(ms_torch.tensor(x))
    ms_grad = ms.grad(ms_torch_net, grad_position=None, weights=ms_torch_net.trainable_params(), has_aux=False)(ms_torch.tensor(x))
    ms_grad = ag.grad(ms_torch_net, grad_position=None, weights=ms_torch_net.parameters(), has_aux=False)(ms_torch.tensor(x))

 def test_sequential_grad1():
    input_np = np.arange(80).reshape(10, 8).astype(np.float32)
@@ -576,7 +574,7 @@ def test_sequential_grad1():
    net = Net(8, 5, 2, 1)
    input = ms_torch.tensor(input_np)

    grad_func = ms.value_and_grad(net, grad_position=None, weights=net.trainable_params())
    grad_func = ag.value_and_grad(net, grad_position=None, weights=net.parameters())
    _, weight_grad = grad_func(input)
    assert np.count_nonzero(weight_grad[-1].asnumpy()) != 10

@@ -585,7 +583,7 @@ def test_sequential_grad2():
    net = ms_torch.nn.Sequential(nn.Linear(2, 2), nn.ReLU())

    x = ms_torch.tensor(input_np, requires_grad=True)
    grad = ms.grad(net, grad_position=None, weights=net.trainable_params())(x)
    grad = ag.grad(net, grad_position=None, weights=net.parameters())(x)
    assert len(grad) == 2


--- a/testing/ut/pytorch/nn/test_conv.py
+++ b/testing/ut/pytorch/nn/test_conv.py
@@ -13,6 +13,7 @@ from mindtorch.torch.nn import Module, Parameter
 from mindtorch.torch.nn import Conv1d, Conv2d, Conv3d
 from mindtorch.torch.nn import ConvTranspose1d, ConvTranspose2d, ConvTranspose3d
 from mindtorch.torch import tensor
 from mindtorch.torch import autograd as ag

 from ...utils import SKIP_ENV_ASCEND, SKIP_ENV_GRAPH_MODE, SKIP_ENV_PYNATIVE_MODE, set_mode_by_env_config,\
                    param_compare, is_test_under_ascend_context
@@ -282,7 +283,7 @@ def test_torch_ms_conv2d_grad():
    data = np.random.randn(1, 2, 5, 5).astype(np.float32)
    net = ms_pytorch.nn.Conv2d(2, 3, 3)
    input = ms_pytorch.tensor(data)
    grad_func = ms.grad(net, grad_position=None, weights=net.trainable_params())
    grad_func = ag.grad(net, grad_position=None, weights=net.parameters())
    weight_grad, bias_grad = grad_func(input)
    assert np.count_nonzero(weight_grad.asnumpy()) != 0
    assert np.count_nonzero(bias_grad.asnumpy()) != 0
@@ -483,13 +484,13 @@ def test_torch_ms_conv_transposed3d_grad():
    data = np.random.randn(batch_size, in_channal, 10, 12, 15).astype(np.float32)
    net = ms_pytorch.nn.ConvTranspose3d(in_channal, out_channal, kernel_size, stride=2)
    input = ms_pytorch.tensor(data)
    grad_func = ms.grad(net, grad_position=None, weights=net.trainable_params())
    grad_func = ag.grad(net, grad_position=None, weights=net.parameters())
    weight_grad, bias_grad = grad_func(input)
    assert np.count_nonzero(weight_grad.asnumpy()) != 0
    assert np.count_nonzero(bias_grad.asnumpy()) != 0

    input = ms_pytorch.tensor(data)
    grad_func = ms.grad(net, grad_position=None, weights=net.trainable_params())
    grad_func = ag.grad(net, grad_position=None, weights=net.parameters())
    weight_grad, bias_grad = grad_func(input, (21, 25, 31))
    assert np.count_nonzero(weight_grad.asnumpy()) != 0
    assert np.count_nonzero(bias_grad.asnumpy()) != 0
@@ -749,7 +750,7 @@ def test_torch_ms_conv1d_grad():
    data = np.random.randn(1, 2, 5).astype(np.float32)
    net = ms_pytorch.nn.Conv1d(2, 3, 3)
    input = ms_pytorch.tensor(data)
    grad_func = ms.grad(net, grad_position=None, weights=net.trainable_params())
    grad_func = ag.grad(net, grad_position=None, weights=net.parameters())
    weight_grad, bias_grad = grad_func(input)
    assert np.count_nonzero(weight_grad.asnumpy()) != 0
    assert np.count_nonzero(bias_grad.asnumpy()) != 0
--- a/testing/ut/pytorch/nn/test_hooks.py
+++ b/testing/ut/pytorch/nn/test_hooks.py
@@ -7,12 +7,12 @@ from mindtorch.torch import nn
 from mindtorch.torch.tensor import Tensor as adapter_tenosr
 from ...utils import set_mode_by_env_config, SKIP_ENV_GRAPH_MODE, param_compare
 set_mode_by_env_config()
 from mindtorch.torch import autograd as ag

@SKIP_ENV_GRAPH_MODE(reason="register hooks not supported in GRAPH_MODE")
 def test_hooks():
    module = nn.Sigmoid()
    input = ms_torch.ones(5, 5)
    module.set_grad()

    counter = {
        'forwards': 0,
@@ -21,16 +21,18 @@ def test_hooks():

    def fw_hook(inc, h_module, input, output):
        assert isinstance(input, tuple)
        print(type(output))
        assert isinstance(output, adapter_tenosr)
        assert h_module is module
        np.allclose(input[0].numpy(), ms_torch.ones(5, 5).numpy())
        np.allclose(output.numpy(), ms_torch.full((5, 5), 1 / (1 + 1 / math.e)).numpy())
        counter['forwards'] += inc

    def bw_hook(inc, h_module, grad_input, grad_output):
    def bw_hook(inc, h_module, grad_output, grad_input):
        assert isinstance(grad_input, tuple)
        # TODO: grad_output is tuple
        assert isinstance(grad_output[0], adapter_tenosr)
        print(type(grad_output[0]))
        # assert isinstance(grad_output[0], adapter_tenosr)
        # TODO:
        # assert h_module is module
        np.allclose(grad_output[0].numpy(), (ms_torch.ones(5, 5) * 2).numpy())
@@ -50,10 +52,12 @@ def test_hooks():
    assert counter['backwards'] == 0

    grad_all = ms.ops.GradOperation(get_all=True, sens_param=True)
    grad_fn = grad_all(module)
    def forward(x):
        return module(x)
    grad_fn = grad_all(forward)

    _ = grad_fn(input, ms_torch.ones(5, 5) * 2)
    assert counter['forwards'] == 3
    assert counter['forwards'] == 4
    assert counter['backwards'] == 1

    # TODO: ms bwd hook has bug when finding higher-order derivative
@@ -92,7 +96,9 @@ def test_hook_forward_preforward_writable():
    assert np.allclose(ms_output.numpy(), torch_output.detach().numpy())

    grad_all = ms.ops.GradOperation(get_all=True, sens_param=True)
    grad_fn = grad_all(ms_module)
    def forward(x):
        return ms_module(x)
    grad_fn = grad_all(forward)
    gradient = grad_fn(ms_input, ms.ops.ones((5, 5)) * 2)
    torch_output.backward(torch.ones(5, 5) * 2, retain_graph=True)
    assert np.allclose(gradient[0].numpy(), torch_input.grad.numpy())
@@ -175,11 +181,11 @@ def test_module_forward_hook_removable():
 def test_hook_backward_writeable():
    input = np.random.randn(5, 5).astype(np.float32)

    def ms_bw_hook(module, grad_input, grad_output):
        for grad in grad_input:
            assert isinstance(grad, adapter_tenosr)
        for grad in grad_output:
            assert isinstance(grad, adapter_tenosr)
    def ms_bw_hook(module, grad_output, grad_input):
        # for grad in grad_input:
        #     assert isinstance(grad, adapter_tenosr)
        # for grad in grad_output:
        #     assert isinstance(grad, adapter_tenosr)
        return tuple(gi * 2 for gi in grad_input)

    def torch_bw_hook(module, grad_input, grad_output):
@@ -193,14 +199,16 @@ def test_hook_backward_writeable():
    ms_input = ms_torch.tensor(input)
    module.register_backward_hook(ms_bw_hook)

    grad_func = ms.ops.grad(module)
    def forward(x):
        return module(x)
    grad_func = ag.grad(forward, has_aux=False)
    gradient = grad_func(ms_input)

    torch_module = torch.nn.Sigmoid()
    torch_input = torch.tensor(input, requires_grad=True)
    torch_module.register_backward_hook(torch_bw_hook)
    torch_module(torch_input).backward(torch.ones(5, 5))
    param_compare(gradient, torch_input.grad)
    param_compare(gradient[0], torch_input.grad)


@SKIP_ENV_GRAPH_MODE(reason="register hooks not supported in GRAPH_MODE")
@@ -213,11 +221,11 @@ def test_register_module_hooks():
    def forward_hook(m, input, output):
        return -output

    def ms_bw_hook(module, grad_input, grad_output):
        for grad in grad_input:
            assert isinstance(grad, adapter_tenosr)
        for grad in grad_output:
            assert isinstance(grad, adapter_tenosr)
    def ms_bw_hook(module, grad_output, grad_input):
        # for grad in grad_input:
        #     assert isinstance(grad, adapter_tenosr)
        # for grad in grad_output:
        #     assert isinstance(grad, adapter_tenosr)
        return tuple(gi * 2 for gi in grad_input)

    def torch_bw_hook(module, grad_input, grad_output):
@@ -232,8 +240,10 @@ def test_register_module_hooks():
    ms_forward_pre_hook_handle = ms_torch.nn.modules.module.register_module_forward_pre_hook(forward_pre_hook)
    ms_forward_hook_handle = ms_torch.nn.modules.module.register_module_forward_hook(forward_hook)
    ms_bw_hook_handle = ms_torch.nn.modules.module.register_module_full_backward_hook(ms_bw_hook)
    print(ms_torch.nn.modules.module._global_backward_hooks)

    ms_out, gradient = ms.ops.value_and_grad(module, grad_position=0)(ms_input)
    ms_out, gradient = ag.value_and_grad(module, grad_position=0)(ms_input)
    print(ms_torch.nn.modules.module._global_backward_hooks)

    torch_module = torch.nn.Sigmoid()
    torch_input = torch.tensor(input, requires_grad=True)
@@ -241,11 +251,14 @@ def test_register_module_hooks():
    torch_forward_hook_handle = torch.nn.modules.module.register_module_forward_hook(forward_hook)
    torch_bw_hook_handle = torch.nn.modules.module.register_module_full_backward_hook(torch_bw_hook)

    print(torch.nn.modules.module._global_backward_hooks)
    torch_out = torch_module(torch_input)
    torch_out.backward(torch.ones(5, 5))
    print(torch.nn.modules.module._global_backward_hooks)

    param_compare(ms_out, torch_out.detach())
    param_compare(gradient, torch_input.grad)
    print(gradient[0], torch_input.grad)
    param_compare(gradient[0], torch_input.grad)

    ms_forward_pre_hook_handle.remove()
    ms_forward_hook_handle.remove()
--- a/testing/ut/pytorch/nn/test_loss.py
+++ b/testing/ut/pytorch/nn/test_loss.py
@@ -788,7 +788,7 @@ def test_ctc_loss_float32():
    pt_ctc_loss = torch.nn.CTCLoss()
    pt_loss = pt_ctc_loss(pt_input, pt_target, pt_input_lengths, pt_target_lengths)

    ms_input = ms_torch.tensor(np_data).log_softmax(2).detach().requires_grad_()
    ms_input = ms_torch.tensor(np_data).log_softmax(2).detach()#.requires_grad()
    ms_target = ms_torch.tensor(np_target)
    ms_input_lengths = ms_torch.full(size=(N,), fill_value=T, dtype=ms_torch.long)
    ms_target_lengths = ms_torch.tensor(np_target_lengths)
--- a/testing/ut/pytorch/nn/test_parameter.py
+++ b/testing/ut/pytorch/nn/test_parameter.py
@@ -97,7 +97,7 @@ def test_requires_grad_():

    torch_out.backward()
    grad_out = torch_net.conv.weight.grad
    ms_grad = ms.grad(ms_net, grad_position=None, weights=ms_net.trainable_params())(ms_input)
    ms_grad = ag.grad(ms_net, grad_position=None, weights=ms_net.paramters())(ms_input)
    assert len(ms_grad) == 1
    ms_grad = ms.ops.squeeze(ms_grad[0])
    if ms.get_context('device_target') == 'Ascend':
--- a/testing/ut/pytorch/nn/test_sequential.py
+++ b/testing/ut/pytorch/nn/test_sequential.py
@@ -33,10 +33,10 @@ print(model2)


 model3 = nn.Sequential(
          [nn.Conv2d(1,20,5),
          nn.Conv2d(1,20,5),
          nn.ReLU(),
          nn.Conv2d(20,64,5),
          nn.ReLU()]
          nn.ReLU()
        )
 print(model3)

--- a/testing/ut/pytorch/nn/test_sparse.py
+++ b/testing/ut/pytorch/nn/test_sparse.py
@@ -42,7 +42,7 @@ def test_embedding():
    result_ms = net(ms_index)
    train_net = TrainNet(net)
    train_net.set_grad()
    grad_fn = ms.value_and_grad(train_net, grad_position=None, weights=train_net.trainable_params())
    grad_fn = ms.value_and_grad(train_net, grad_position=None, weights=train_net.paramters())
    _, grads = grad_fn(ms_index)

    assert not np.allclose(grads[0][1].asnumpy(), ms.ops.ZerosLike()(grads[0][1]).asnumpy())
@@ -62,7 +62,7 @@ def test_embedding_with_weight():
    result_ms = net(ms_index)
    train_net = TrainNet(net)
    train_net.set_grad()
    grad_fn = ms.value_and_grad(train_net, grad_position=None, weights=train_net.trainable_params())
    grad_fn = ms.value_and_grad(train_net, grad_position=None, weights=train_net.paramters())
    _, grads = grad_fn(ms_index)

    assert not np.allclose(grads[0][1].asnumpy(), ms.ops.ZerosLike()(grads[0][1]).asnumpy())
@@ -85,7 +85,7 @@ def test_embedding_from_pretrained():
    result_ms = net(ms_index)
    train_net = TrainNet(net)
    train_net.set_grad()
    grad_fn = ms.value_and_grad(train_net, grad_position=None, weights=train_net.trainable_params())
    grad_fn = ms.value_and_grad(train_net, grad_position=None, weights=train_net.paramters())
    _, grads = grad_fn(ms_index)
    assert not grads

@@ -107,7 +107,7 @@ def test_embedding_weight_grad_with_padding_idx():
    net = ms_torch.nn.Embedding(4, 2, _weight=ms_weight, padding_idx=_padding_idx)
    train_net = TrainNet(net)
    train_net.set_grad()
    grad_fn = ms.value_and_grad(train_net, grad_position=None, weights=train_net.trainable_params())
    grad_fn = ms.value_and_grad(train_net, grad_position=None, weights=train_net.paramters())
    _, grads = grad_fn(ms_index)

    torch_index = torch.tensor(index_np)
@@ -130,7 +130,7 @@ def test_embedding_weight_grad_with_padding_idx_fp64():
    net = ms_torch.nn.Embedding(4, 2, _weight=ms_weight, padding_idx=_padding_idx)
    train_net = TrainNet(net)
    train_net.set_grad()
    grad_fn = ms.value_and_grad(train_net, grad_position=None, weights=train_net.trainable_params())
    grad_fn = ms.value_and_grad(train_net, grad_position=None, weights=train_net.paramters())
    _, grads = grad_fn(ms_index)

    torch_index = torch.tensor(index_np)
--- a/testing/ut/pytorch/tensor/test_tensor.py
+++ b/testing/ut/pytorch/tensor/test_tensor.py
@@ -1278,7 +1278,7 @@ def test_clone():
    assert np.allclose(ms_out.asnumpy(), torch_out.detach().numpy())
    torch_out.backward()
    torch_grad = torch_x.grad
    ms_grad = ms.grad(fun)(ms_x)
    ms_grad = ag.grad(fun)(ms_x)
    assert np.allclose(torch_grad.detach().numpy(), ms_grad.asnumpy())

 def test_detach():
@@ -1295,7 +1295,7 @@ def test_detach():

    torch_out.backward()
    torch_grad = torch_x.grad
    ms_grad = ms.grad(fun)(ms_x)
    ms_grad = ag.grad(fun)(ms_x)
    assert np.allclose(torch_grad.detach().numpy(), ms_grad.asnumpy())

 def test_new_zeros():
--- a/testing/ut/pytorch/tensor/test_tensor2.py
+++ b/testing/ut/pytorch/tensor/test_tensor2.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-

 import pytest
 import random
 import numpy as np
 import torch
@@ -605,8 +606,10 @@ def test_device_equal():
    b = ms_torch.tensor(2)
    assert a.device == b.device

@pytest.mark.skip('dynamic shape error')
 def test_view_dynamic():
    @ms.jit(input_signature=ms_torch.cast_to_adapter_tensor(ms.tensor(shape=[None, 2], dtype=ms.float32)))
    # @ms.jit()
    def view_func(x):
        return x.view(-1, 2)