From ba8e5657db13c05da35bacefcbcc197b17f4b5e6 Mon Sep 17 00:00:00 2001
From: lvyufeng
Date: Mon, 29 Sep 2025 15:28:38 +0800
Subject: [PATCH 1/2] fix mindtorch.nn import error

---
 mindtorch/nn/__init__.py              |   68 +-
 mindtorch/nn/modules/__init__.py      |  355 ++++-
 mindtorch/nn/modules/activation.py    | 1805 +++++++++++++++++++------
 mindtorch/nn/modules/adaptive.py      |  154 ++-
 mindtorch/nn/modules/batchnorm.py     |  342 ++++-
 mindtorch/nn/modules/container.py     |  561 +++++---
 mindtorch/nn/modules/conv.py          | 1139 +++++++++++-----
 mindtorch/nn/modules/distance.py      |   30 +-
 mindtorch/nn/modules/dropout.py       |   52 +-
 mindtorch/nn/modules/flatten.py       |   76 +-
 mindtorch/nn/modules/fold.py          |   78 +-
 mindtorch/nn/modules/instancenorm.py  |  140 +-
 mindtorch/nn/modules/lazy.py          |    4 +-
 mindtorch/nn/modules/linear.py        |  317 ++++-
 mindtorch/nn/modules/loss.py          |  646 ++++++---
 mindtorch/nn/modules/normalization.py |  314 ++++-
 mindtorch/nn/modules/padding.py       |  469 +++++--
 mindtorch/nn/modules/pixelshuffle.py  |   28 +-
 mindtorch/nn/modules/pooling.py       | 1149 ++++++++++++++--
 mindtorch/nn/modules/rnn.py           |   10 +-
 mindtorch/nn/modules/sparse.py        |  487 ++++++-
 mindtorch/nn/modules/upsampling.py    |   82 +-
 mindtorch/nn/modules/utils.py         |   70 +-
 setup.py                              |    8 +-
 24 files changed, 6634 insertions(+), 1750 deletions(-)

diff --git a/mindtorch/nn/__init__.py b/mindtorch/nn/__init__.py
index 5c0fbb269..8d7e9abda 100644
--- a/mindtorch/nn/__init__.py
+++ b/mindtorch/nn/__init__.py
@@ -12,8 +12,66 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
-"""mindtorch nn"""
-from . import utils, functional, init
-from .modules import *
-from .parameter import Parameter, Buffer
-from .parallel import DataParallel as DataParallel
\ No newline at end of file
+"""mindtorch nn"""
+# mypy: allow-untyped-defs
+from mindtorch.nn.parameter import (  # usort: skip
+    Buffer as Buffer,
+    Parameter as Parameter,
+    UninitializedBuffer as UninitializedBuffer,
+    UninitializedParameter as UninitializedParameter,
+)
+from mindtorch.nn.modules import *  # usort: skip # noqa: F403
+from mindtorch.nn import (
+    attention as attention,
+    functional as functional,
+    init as init,
+    modules as modules,
+    parallel as parallel,
+    parameter as parameter,
+    utils as utils,
+)
+from mindtorch.nn.parallel import DataParallel as DataParallel
+
+
+def factory_kwargs(kwargs):
+    r"""Return a canonicalized dict of factory kwargs.
+
+    Given kwargs, returns a canonicalized dict of factory kwargs that can be directly passed
+    to factory functions like mindtorch.empty, or errors if unrecognized kwargs are present.
+
+    This function makes it simple to write code like this::
+
+        class MyModule(nn.Module):
+            def __init__(self, **kwargs):
+                factory_kwargs = mindtorch.nn.factory_kwargs(kwargs)
+                self.weight = Parameter(mindtorch.empty(10, **factory_kwargs))
+
+    Why should you use this function instead of just passing `kwargs` along directly?
+
+    1. This function does error validation, so if there are unexpected kwargs we will
+       immediately report an error, instead of deferring it to the factory call
+    2. This function supports a special `factory_kwargs` argument, which can be used to
+       explicitly specify a kwarg to be used for factory functions, in the event one of the
+       factory kwargs conflicts with an already existing argument in the signature (e.g.
+ in the signature ``def f(dtype, **kwargs)``, you can specify ``dtype`` for factory + functions, as distinct from the dtype argument, by saying + ``f(dtype1, factory_kwargs={"dtype": dtype2})``) + """ + if kwargs is None: + return {} + simple_keys = {"device", "dtype", "memory_format"} + expected_keys = simple_keys | {"factory_kwargs"} + if not kwargs.keys() <= expected_keys: + raise TypeError(f"unexpected kwargs {kwargs.keys() - expected_keys}") + + # guarantee no input kwargs is untouched + r = dict(kwargs.get("factory_kwargs", {})) + for k in simple_keys: + if k in kwargs: + if k in r: + raise TypeError( + f"{k} specified twice, in **kwargs and in factory_kwargs" + ) + r[k] = kwargs[k] + + return r diff --git a/mindtorch/nn/modules/__init__.py b/mindtorch/nn/modules/__init__.py index 16bca9bd7..b6ea4124c 100644 --- a/mindtorch/nn/modules/__init__.py +++ b/mindtorch/nn/modules/__init__.py @@ -1,22 +1,335 @@ -"""new nn modules""" -from .module import Module -from .container import ModuleList, ParameterList, Sequential, ParameterDict, ModuleDict -from .linear import Linear, Identity -from .sparse import Embedding -from .normalization import LayerNorm, GroupNorm, RMSNorm -from .dropout import Dropout, Dropout2d -from .activation import * -from .conv import Conv3d, Conv2d, Conv1d, ConvTranspose2d, ConvTranspose1d, ConvTranspose3d -from .padding import * -from .batchnorm import BatchNorm2d, BatchNorm1d, SyncBatchNorm -from .pooling import AdaptiveAvgPool2d, AvgPool1d, MaxPool2d, MaxPool1d, AdaptiveAvgPool1d, AvgPool2d -from .flatten import Unflatten, Flatten -from .rnn_cell import RNNCell, GRUCell, LSTMCell -from .rnn import RNN, LSTM, GRU -from .fold import Unfold, Fold -from .pixelshuffle import PixelUnshuffle, PixelShuffle -from .upsampling import Upsample, UpsamplingBilinear2d, UpsamplingNearest2d -from .loss import * -from .distance import * +from .module import Module # usort: skip +from .linear import Bilinear, Identity, LazyLinear, Linear # usort: skip +from .activation import ( + CELU, + ELU, + GELU, + GLU, + Hardshrink, + Hardsigmoid, + Hardswish, + Hardtanh, + LeakyReLU, + LogSigmoid, + LogSoftmax, + Mish, + MultiheadAttention, + PReLU, + ReLU, + ReLU6, + RReLU, + SELU, + Sigmoid, + SiLU, + Softmax, + Softmax2d, + Softmin, + Softplus, + Softshrink, + Softsign, + Tanh, + Tanhshrink, + Threshold, +) from .adaptive import AdaptiveLogSoftmaxWithLoss -from .batchnorm import BatchNorm1d, BatchNorm2d, BatchNorm3d +from .batchnorm import ( + BatchNorm1d, + BatchNorm2d, + BatchNorm3d, + LazyBatchNorm1d, + LazyBatchNorm2d, + LazyBatchNorm3d, + SyncBatchNorm, +) +# from .channelshuffle import ChannelShuffle +from .container import ( + Container, + ModuleDict, + ModuleList, + ParameterDict, + ParameterList, + Sequential, +) +from .conv import ( + Conv1d, + Conv2d, + Conv3d, + ConvTranspose1d, + ConvTranspose2d, + ConvTranspose3d, + LazyConv1d, + LazyConv2d, + LazyConv3d, + LazyConvTranspose1d, + LazyConvTranspose2d, + LazyConvTranspose3d, +) +from .distance import CosineSimilarity, PairwiseDistance +from .dropout import ( + AlphaDropout, + Dropout, + Dropout1d, + Dropout2d, + Dropout3d, + FeatureAlphaDropout, +) +from .flatten import Flatten, Unflatten +from .fold import Fold, Unfold +from .instancenorm import ( + InstanceNorm1d, + InstanceNorm2d, + InstanceNorm3d, + LazyInstanceNorm1d, + LazyInstanceNorm2d, + LazyInstanceNorm3d, +) +from .loss import ( + BCELoss, + BCEWithLogitsLoss, + CosineEmbeddingLoss, + CrossEntropyLoss, + CTCLoss, + GaussianNLLLoss, + HingeEmbeddingLoss, + HuberLoss, + 
KLDivLoss, + L1Loss, + MarginRankingLoss, + MSELoss, + MultiLabelMarginLoss, + MultiLabelSoftMarginLoss, + MultiMarginLoss, + NLLLoss, + NLLLoss2d, + PoissonNLLLoss, + SmoothL1Loss, + SoftMarginLoss, + TripletMarginLoss, + TripletMarginWithDistanceLoss, +) +from .normalization import ( + CrossMapLRN2d, + GroupNorm, + LayerNorm, + LocalResponseNorm, + RMSNorm, +) +from .padding import ( + CircularPad1d, + CircularPad2d, + CircularPad3d, + ConstantPad1d, + ConstantPad2d, + ConstantPad3d, + ReflectionPad1d, + ReflectionPad2d, + ReflectionPad3d, + ReplicationPad1d, + ReplicationPad2d, + ReplicationPad3d, + ZeroPad1d, + ZeroPad2d, + ZeroPad3d, +) +from .pixelshuffle import PixelShuffle, PixelUnshuffle +from .pooling import ( + AdaptiveAvgPool1d, + AdaptiveAvgPool2d, + AdaptiveAvgPool3d, + AdaptiveMaxPool1d, + AdaptiveMaxPool2d, + AdaptiveMaxPool3d, + AvgPool1d, + AvgPool2d, + AvgPool3d, + FractionalMaxPool2d, + FractionalMaxPool3d, + LPPool1d, + LPPool2d, + LPPool3d, + MaxPool1d, + MaxPool2d, + MaxPool3d, + MaxUnpool1d, + MaxUnpool2d, + MaxUnpool3d, +) +from .rnn import GRU, LSTM, RNN, RNNBase +from .rnn_cell import GRUCell, LSTMCell, RNNCell, RNNCellBase +from .sparse import Embedding, EmbeddingBag +# from .transformer import ( +# Transformer, +# TransformerDecoder, +# TransformerDecoderLayer, +# TransformerEncoder, +# TransformerEncoderLayer, +# ) +from .upsampling import Upsample, UpsamplingBilinear2d, UpsamplingNearest2d + + +__all__ = [ + "AdaptiveAvgPool1d", + "AdaptiveAvgPool2d", + "AdaptiveAvgPool3d", + "AdaptiveLogSoftmaxWithLoss", + "AdaptiveMaxPool1d", + "AdaptiveMaxPool2d", + "AdaptiveMaxPool3d", + "AlphaDropout", + "AvgPool1d", + "AvgPool2d", + "AvgPool3d", + "BCELoss", + "BCEWithLogitsLoss", + "BatchNorm1d", + "BatchNorm2d", + "BatchNorm3d", + "Bilinear", + "CELU", + "CTCLoss", + # "ChannelShuffle", + "CircularPad1d", + "CircularPad2d", + "CircularPad3d", + "ConstantPad1d", + "ConstantPad2d", + "ConstantPad3d", + "Container", + "Conv1d", + "Conv2d", + "Conv3d", + "ConvTranspose1d", + "ConvTranspose2d", + "ConvTranspose3d", + "CosineEmbeddingLoss", + "CosineSimilarity", + "CrossEntropyLoss", + "CrossMapLRN2d", + "Dropout", + "Dropout1d", + "Dropout2d", + "Dropout3d", + "ELU", + "Embedding", + "EmbeddingBag", + "FeatureAlphaDropout", + "Flatten", + "Fold", + "FractionalMaxPool2d", + "FractionalMaxPool3d", + "GELU", + "GLU", + "GRU", + "GRUCell", + "GaussianNLLLoss", + "GroupNorm", + "Hardshrink", + "Hardsigmoid", + "Hardswish", + "Hardtanh", + "HingeEmbeddingLoss", + "HuberLoss", + "Identity", + "InstanceNorm1d", + "InstanceNorm2d", + "InstanceNorm3d", + "KLDivLoss", + "L1Loss", + "LPPool1d", + "LPPool2d", + "LPPool3d", + "LSTM", + "LSTMCell", + "LayerNorm", + "LazyBatchNorm1d", + "LazyBatchNorm2d", + "LazyBatchNorm3d", + "LazyConv1d", + "LazyConv2d", + "LazyConv3d", + "LazyConvTranspose1d", + "LazyConvTranspose2d", + "LazyConvTranspose3d", + "LazyInstanceNorm1d", + "LazyInstanceNorm2d", + "LazyInstanceNorm3d", + "LazyLinear", + "LeakyReLU", + "Linear", + "LocalResponseNorm", + "LogSigmoid", + "LogSoftmax", + "MSELoss", + "MarginRankingLoss", + "MaxPool1d", + "MaxPool2d", + "MaxPool3d", + "MaxUnpool1d", + "MaxUnpool2d", + "MaxUnpool3d", + "Mish", + "Module", + "ModuleDict", + "ModuleList", + "MultiLabelMarginLoss", + "MultiLabelSoftMarginLoss", + "MultiMarginLoss", + "MultiheadAttention", + "NLLLoss", + "NLLLoss2d", + "PReLU", + "PairwiseDistance", + "ParameterDict", + "ParameterList", + "PixelShuffle", + "PixelUnshuffle", + "PoissonNLLLoss", + "RMSNorm", + "RNN", + "RNNBase", + 
"RNNCell", + "RNNCellBase", + "RReLU", + "ReLU", + "ReLU6", + "ReflectionPad1d", + "ReflectionPad2d", + "ReflectionPad3d", + "ReplicationPad1d", + "ReplicationPad2d", + "ReplicationPad3d", + "SELU", + "Sequential", + "SiLU", + "Sigmoid", + "SmoothL1Loss", + "SoftMarginLoss", + "Softmax", + "Softmax2d", + "Softmin", + "Softplus", + "Softshrink", + "Softsign", + "SyncBatchNorm", + "Tanh", + "Tanhshrink", + "Threshold", + # "Transformer", + # "TransformerDecoder", + # "TransformerDecoderLayer", + # "TransformerEncoder", + # "TransformerEncoderLayer", + "TripletMarginLoss", + "TripletMarginWithDistanceLoss", + "Unflatten", + "Unfold", + "Upsample", + "UpsamplingBilinear2d", + "UpsamplingNearest2d", + "ZeroPad1d", + "ZeroPad2d", + "ZeroPad3d", +] + +# Please keep this list sorted +assert __all__ == sorted(__all__) \ No newline at end of file diff --git a/mindtorch/nn/modules/activation.py b/mindtorch/nn/modules/activation.py index 5468d4155..2767cc087 100644 --- a/mindtorch/nn/modules/activation.py +++ b/mindtorch/nn/modules/activation.py @@ -1,57 +1,109 @@ -"""activation""" -from typing import Optional, Tuple +# mypy: allow-untyped-defs +import warnings +from typing import Optional + import mindtorch +import mindtorch.nn.functional as F from mindtorch import Tensor -from ..parameter import Parameter +from mindtorch.nn.init import constant_, xavier_normal_, xavier_uniform_ +from mindtorch.nn.parameter import Parameter +from .linear import NonDynamicallyQuantizableLinear from .module import Module -from .linear import Linear -from .. import functional as F -from .. import init -from ... import ops - -class GELU(Module): - r"""Applies the Gaussian Error Linear Units function: - - .. math:: \text{GELU}(x) = x * \Phi(x) - where :math:`\Phi(x)` is the Cumulative Distribution Function for Gaussian Distribution. - When the approximate argument is 'tanh', Gelu is estimated with: +__all__ = [ + "Threshold", + "ReLU", + "RReLU", + "Hardtanh", + "ReLU6", + "Sigmoid", + "Hardsigmoid", + "Tanh", + "SiLU", + "Mish", + "Hardswish", + "ELU", + "CELU", + "SELU", + "GLU", + "GELU", + "Hardshrink", + "LeakyReLU", + "LogSigmoid", + "Softplus", + "Softshrink", + "MultiheadAttention", + "PReLU", + "Softsign", + "Tanhshrink", + "Softmin", + "Softmax", + "Softmax2d", + "LogSoftmax", +] + + +class Threshold(Module): + r"""Thresholds each element of the input Tensor. + + Threshold is defined as: - .. math:: \text{GELU}(x) = 0.5 * x * (1 + \text{Tanh}(\sqrt{2 / \pi} * (x + 0.044715 * x^3))) + .. math:: + y = + \begin{cases} + x, &\text{ if } x > \text{threshold} \\ + \text{value}, &\text{ otherwise } + \end{cases} Args: - approximate (str, optional): the gelu approximation algorithm to use: - ``'none'`` | ``'tanh'``. Default: ``'none'`` + threshold: The value to threshold at + value: The value to replace with + inplace: can optionally do the operation in-place. Default: ``False`` Shape: - Input: :math:`(*)`, where :math:`*` means any number of dimensions. - Output: :math:`(*)`, same shape as the input. - .. image:: ../scripts/activation_images/GELU.png + .. 
image:: ../scripts/activation_images/Threshold.png Examples:: - >>> m = nn.GELU() - >>> input = mindtorch.randn(2) + >>> m = nn.Threshold(0, 0.5) + >>> input = mindtorch.arange(-3, 3) >>> output = m(input) """ - __constants__ = ['approximate'] - approximate: str - def __init__(self, approximate: str = 'none') -> None: + __constants__ = ["threshold", "value", "inplace"] + + threshold: float + value: float + inplace: bool + + def __init__(self, threshold: float, value: float, inplace: bool = False) -> None: super().__init__() - self.approximate = approximate + self.threshold = threshold + self.value = value + self.inplace = inplace + # TODO: check in THNN (if inplace == True, then assert value <= threshold) def forward(self, input: Tensor) -> Tensor: - return F.gelu(input, approximate=self.approximate) + """ + Runs the forward pass. + """ + return F.threshold(input, self.threshold, self.value, self.inplace) def extra_repr(self) -> str: - return f'approximate={repr(self.approximate)}' + """ + Return the extra representation of the module. + """ + inplace_str = ", inplace=True" if self.inplace else "" + return f"threshold={self.threshold}, value={self.value}{inplace_str}" + class ReLU(Module): - r"""Applies the rectified linear unit function element-wise: + r"""Applies the rectified linear unit function element-wise. :math:`\text{ReLU}(x) = (x)^+ = \max(0, x)` @@ -77,181 +129,208 @@ class ReLU(Module): >>> input = mindtorch.randn(2).unsqueeze(0) >>> output = mindtorch.cat((m(input), m(-input))) """ + __constants__ = ["inplace"] inplace: bool - def __init__(self, inplace: bool = False): + def __init__(self, inplace: bool = False) -> None: super().__init__() self.inplace = inplace def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ return F.relu(input, inplace=self.inplace) def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ inplace_str = "inplace=True" if self.inplace else "" return inplace_str -class LeakyReLU(Module): - r"""Applies the LeakyReLU function element-wise. - - .. math:: - \text{LeakyReLU}(x) = \max(0, x) + \text{negative\_slope} * \min(0, x) +class RReLU(Module): + r"""Applies the randomized leaky rectified linear unit function, element-wise. + Method described in the paper: + `Empirical Evaluation of Rectified Activations in Convolutional Network `_. - or + The function is defined as: .. math:: - \text{LeakyReLU}(x) = + \text{RReLU}(x) = \begin{cases} - x, & \text{ if } x \geq 0 \\ - \text{negative\_slope} \times x, & \text{ otherwise } + x & \text{if } x \geq 0 \\ + ax & \text{ otherwise } \end{cases} + where :math:`a` is randomly sampled from uniform distribution + :math:`\mathcal{U}(\text{lower}, \text{upper})` during training while during + evaluation :math:`a` is fixed with :math:`a = \frac{\text{lower} + \text{upper}}{2}`. + Args: - negative_slope: Controls the angle of the negative slope (which is used for - negative input values). Default: 1e-2 + lower: lower bound of the uniform distribution. Default: :math:`\frac{1}{8}` + upper: upper bound of the uniform distribution. Default: :math:`\frac{1}{3}` inplace: can optionally do the operation in-place. Default: ``False`` Shape: - - Input: :math:`(*)` where `*` means, any number of additional - dimensions - - Output: :math:`(*)`, same shape as the input + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. - .. image:: ../scripts/activation_images/LeakyReLU.png + .. 
image:: ../scripts/activation_images/RReLU.png Examples:: - >>> m = nn.LeakyReLU(0.1) + >>> m = nn.RReLU(0.1, 0.3) >>> input = mindtorch.randn(2) >>> output = m(input) + """ - __constants__ = ['inplace', 'negative_slope'] + __constants__ = ["lower", "upper", "inplace"] + + lower: float + upper: float inplace: bool - negative_slope: float - def __init__(self, negative_slope: float = 1e-2, inplace: bool = False) -> None: + def __init__( + self, lower: float = 1.0 / 8, upper: float = 1.0 / 3, inplace: bool = False + ) -> None: super().__init__() - self.negative_slope = negative_slope + self.lower = lower + self.upper = upper self.inplace = inplace def forward(self, input: Tensor) -> Tensor: - return F.leaky_relu(input, self.negative_slope) + """ + Runs the forward pass. + """ + return F.rrelu(input, self.lower, self.upper, self.training, self.inplace) def extra_repr(self) -> str: - inplace_str = ', inplace=True' if self.inplace else '' - return f'negative_slope={self.negative_slope}{inplace_str}' - - - -class Tanh(Module): - def forward(self, input: Tensor) -> Tensor: - return F.tanh(input) + """ + Return the extra representation of the module. + """ + inplace_str = ", inplace=True" if self.inplace else "" + return f"lower={self.lower}, upper={self.upper}{inplace_str}" -class Softmax(Module): - r"""Applies the Softmax function to an n-dimensional input Tensor. - Rescales them so that the elements of the n-dimensional output Tensor - lie in the range [0,1] and sum to 1. +class Hardtanh(Module): + r"""Applies the HardTanh function element-wise. - Softmax is defined as: + HardTanh is defined as: .. math:: - \text{Softmax}(x_{i}) = \frac{\exp(x_i)}{\sum_j \exp(x_j)} - - When the input Tensor is a sparse tensor then the unspecified - values are treated as ``-inf``. + \text{HardTanh}(x) = \begin{cases} + \text{max\_val} & \text{ if } x > \text{ max\_val } \\ + \text{min\_val} & \text{ if } x < \text{ min\_val } \\ + x & \text{ otherwise } \\ + \end{cases} - Shape: - - Input: :math:`(*)` where `*` means, any number of additional - dimensions - - Output: :math:`(*)`, same shape as the input + Args: + min_val: minimum value of the linear region range. Default: -1 + max_val: maximum value of the linear region range. Default: 1 + inplace: can optionally do the operation in-place. Default: ``False`` - Returns: - a Tensor of the same dimension and shape as the input with - values in the range [0, 1] + Keyword arguments :attr:`min_value` and :attr:`max_value` + have been deprecated in favor of :attr:`min_val` and :attr:`max_val`. - Args: - dim (int): A dimension along which Softmax will be computed (so every slice - along dim will sum to 1). + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. - .. note:: - This module doesn't work directly with NLLLoss, - which expects the Log to be computed between the Softmax and itself. - Use `LogSoftmax` instead (it's faster and has better numerical properties). + .. 
image:: ../scripts/activation_images/Hardtanh.png Examples:: - >>> m = nn.Softmax(dim=1) - >>> input = mindtorch.randn(2, 3) + >>> m = nn.Hardtanh(-2, 2) + >>> input = mindtorch.randn(2) >>> output = m(input) - """ - __constants__ = ['dim'] - dim: Optional[int] + __constants__ = ["min_val", "max_val", "inplace"] - def __init__(self, dim: Optional[int] = None) -> None: - super().__init__() - self.dim = dim + min_val: float + max_val: float + inplace: bool - def __setstate__(self, state): - super().__setstate__(state) - if not hasattr(self, 'dim'): - self.dim = None + def __init__( + self, + min_val: float = -1.0, + max_val: float = 1.0, + inplace: bool = False, + min_value: Optional[float] = None, + max_value: Optional[float] = None, + ) -> None: + super().__init__() + if min_value is not None: + warnings.warn( + "keyword argument `min_value` is deprecated and rename to `min_val`", + FutureWarning, + stacklevel=2, + ) + min_val = min_value + if max_value is not None: + warnings.warn( + "keyword argument `max_value` is deprecated and rename to `max_val`", + FutureWarning, + stacklevel=2, + ) + max_val = max_value + + self.min_val = min_val + self.max_val = max_val + self.inplace = inplace + assert self.max_val > self.min_val def forward(self, input: Tensor) -> Tensor: - return F.softmax(input, self.dim) + """ + Runs the forward pass. + """ + return F.hardtanh(input, self.min_val, self.max_val, self.inplace) def extra_repr(self) -> str: - return f'dim={self.dim}' - + """ + Return the extra representation of the module. + """ + inplace_str = ", inplace=True" if self.inplace else "" + return f"min_val={self.min_val}, max_val={self.max_val}{inplace_str}" -class LogSoftmax(Module): - r"""Applies the :math:`\log(\text{Softmax}(x))` function to an n-dimensional input Tensor. - The LogSoftmax formulation can be simplified as: +class ReLU6(Hardtanh): + r"""Applies the ReLU6 function element-wise. .. math:: - \text{LogSoftmax}(x_{i}) = \log\left(\frac{\exp(x_i) }{ \sum_j \exp(x_j)} \right) - - Shape: - - Input: :math:`(*)` where `*` means, any number of additional - dimensions - - Output: :math:`(*)`, same shape as the input + \text{ReLU6}(x) = \min(\max(0,x), 6) Args: - dim (int): A dimension along which LogSoftmax will be computed. + inplace: can optionally do the operation in-place. Default: ``False`` - Returns: - a Tensor of the same dimension and shape as the input with - values in the range [-inf, 0) + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/ReLU6.png Examples:: - >>> m = nn.LogSoftmax(dim=1) - >>> input = mindtorch.randn(2, 3) + >>> m = nn.ReLU6() + >>> input = mindtorch.randn(2) >>> output = m(input) """ - __constants__ = ['dim'] - dim: Optional[int] - - def __init__(self, dim: Optional[int] = None) -> None: - super().__init__() - self.dim = dim - - def __setstate__(self, state): - super().__setstate__(state) - if not hasattr(self, 'dim'): - self.dim = None + def __init__(self, inplace: bool = False) -> None: + super().__init__(0.0, 6.0, inplace) - def forward(self, input: Tensor) -> Tensor: - return F.log_softmax(input, self.dim) + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + inplace_str = "inplace=True" if self.inplace else "" + return inplace_str - def extra_repr(self): - return f'dim={self.dim}' class Sigmoid(Module): r"""Applies the Sigmoid function element-wise. 
@@ -274,119 +353,755 @@ class Sigmoid(Module): """ def forward(self, input: Tensor) -> Tensor: - return F.sigmoid(input) - - -class SiLU(Module): - def forward(self, input): - return F.silu(input) - - -class Mish(Module): - def forward(self, input): - return F.mish(input) - + """ + Runs the forward pass. + """ + return mindtorch.sigmoid(input) -class ReLU6(Module): - def forward(self, input): - return F.relu6(input) -class ELU(Module): - def forward(self, input): - return F.elu(input) +class Hardsigmoid(Module): + r"""Applies the Hardsigmoid function element-wise. -class GLU(Module): - r"""Applies the gated linear unit function. + Hardsigmoid is defined as: - :math:`{GLU}(a, b)= a \otimes \sigma(b)` where :math:`a` is the first half - of the input matrices and :math:`b` is the second half. + .. math:: + \text{Hardsigmoid}(x) = \begin{cases} + 0 & \text{if~} x \le -3, \\ + 1 & \text{if~} x \ge +3, \\ + x / 6 + 1 / 2 & \text{otherwise} + \end{cases} Args: - dim (int): the dimension on which to split the input. Default: -1 + inplace: can optionally do the operation in-place. Default: ``False`` Shape: - - Input: :math:`(\ast_1, N, \ast_2)` where `*` means, any number of additional - dimensions - - Output: :math:`(\ast_1, M, \ast_2)` where :math:`M=N/2` + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/Hardsigmoid.png Examples:: - >>> m = nn.GLU() - >>> input = mindtorch.randn(4, 2) + >>> m = nn.Hardsigmoid() + >>> input = mindtorch.randn(2) >>> output = m(input) """ - __constants__ = ['dim'] - dim: int + __constants__ = ["inplace"] - def __init__(self, dim: int = -1) -> None: + inplace: bool + + def __init__(self, inplace: bool = False) -> None: super().__init__() - self.dim = dim + self.inplace = inplace def forward(self, input: Tensor) -> Tensor: - return F.glu(input, self.dim) + """ + Runs the forward pass. + """ + return F.hardsigmoid(input, self.inplace) - def extra_repr(self) -> str: - return f'dim={self.dim}' +class Tanh(Module): + r"""Applies the Hyperbolic Tangent (Tanh) function element-wise. -class Softplus(Module): - r"""Applies the Softplus function element-wise. + Tanh is defined as: .. math:: - \text{Softplus}(x) = \frac{1}{\beta} * \log(1 + \exp(\beta * x)) + \text{Tanh}(x) = \tanh(x) = \frac{\exp(x) - \exp(-x)} {\exp(x) + \exp(-x)} - SoftPlus is a smooth approximation to the ReLU function and can be used - to constrain the output of a machine to always be positive. + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. - For numerical stability the implementation reverts to the linear function - when :math:`input \times \beta > threshold`. + .. image:: ../scripts/activation_images/Tanh.png - Args: - beta: the :math:`\beta` value for the Softplus formulation. Default: 1 - threshold: values above this revert to a linear function. Default: 20 + Examples:: + + >>> m = nn.Tanh() + >>> input = mindtorch.randn(2) + >>> output = m(input) + """ + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return mindtorch.tanh(input) + + +class SiLU(Module): + r"""Applies the Sigmoid Linear Unit (SiLU) function, element-wise. + + The SiLU function is also known as the swish function. + + .. math:: + \text{silu}(x) = x * \sigma(x), \text{where } \sigma(x) \text{ is the logistic sigmoid.} + + .. 
note:: + See `Gaussian Error Linear Units (GELUs) `_ + where the SiLU (Sigmoid Linear Unit) was originally coined, and see + `Sigmoid-Weighted Linear Units for Neural Network Function Approximation + in Reinforcement Learning `_ and `Swish: + a Self-Gated Activation Function `_ + where the SiLU was experimented with later. Shape: - Input: :math:`(*)`, where :math:`*` means any number of dimensions. - Output: :math:`(*)`, same shape as the input. - .. image:: ../scripts/activation_images/Softplus.png + .. image:: ../scripts/activation_images/SiLU.png Examples:: - >>> m = nn.Softplus() + >>> m = nn.SiLU() >>> input = mindtorch.randn(2) >>> output = m(input) """ - __constants__ = ['beta', 'threshold'] - beta: float - threshold: float + __constants__ = ["inplace"] + inplace: bool - def __init__(self, beta: float = 1.0, threshold: float = 20.0) -> None: + def __init__(self, inplace: bool = False) -> None: super().__init__() - self.beta = beta - self.threshold = threshold + self.inplace = inplace + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.silu(input, inplace=self.inplace) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + inplace_str = "inplace=True" if self.inplace else "" + return inplace_str + + +class Mish(Module): + r"""Applies the Mish function, element-wise. + + Mish: A Self Regularized Non-Monotonic Neural Activation Function. + + .. math:: + \text{Mish}(x) = x * \text{Tanh}(\text{Softplus}(x)) + + .. note:: + See `Mish: A Self Regularized Non-Monotonic Neural Activation Function `_ + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/Mish.png + + Examples:: + + >>> m = nn.Mish() + >>> input = mindtorch.randn(2) + >>> output = m(input) + """ + + __constants__ = ["inplace"] + inplace: bool + + def __init__(self, inplace: bool = False) -> None: + super().__init__() + self.inplace = inplace + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.mish(input, inplace=self.inplace) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + inplace_str = "inplace=True" if self.inplace else "" + return inplace_str + + +class Hardswish(Module): + r"""Applies the Hardswish function, element-wise. + + Method described in the paper: `Searching for MobileNetV3 `_. + + Hardswish is defined as: + + .. math:: + \text{Hardswish}(x) = \begin{cases} + 0 & \text{if~} x \le -3, \\ + x & \text{if~} x \ge +3, \\ + x \cdot (x + 3) /6 & \text{otherwise} + \end{cases} + + Args: + inplace: can optionally do the operation in-place. Default: ``False`` + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/Hardswish.png + + Examples:: + + >>> m = nn.Hardswish() + >>> input = mindtorch.randn(2) + >>> output = m(input) + """ + + __constants__ = ["inplace"] + + inplace: bool + + def __init__(self, inplace: bool = False) -> None: + super().__init__() + self.inplace = inplace + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.hardswish(input, self.inplace) + + +class ELU(Module): + r"""Applies the Exponential Linear Unit (ELU) function, element-wise. 
+ + Method described in the paper: `Fast and Accurate Deep Network Learning by Exponential Linear + Units (ELUs) `__. + + ELU is defined as: + + .. math:: + \text{ELU}(x) = \begin{cases} + x, & \text{ if } x > 0\\ + \alpha * (\exp(x) - 1), & \text{ if } x \leq 0 + \end{cases} + + Args: + alpha: the :math:`\alpha` value for the ELU formulation. Default: 1.0 + inplace: can optionally do the operation in-place. Default: ``False`` + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/ELU.png + + Examples:: + + >>> m = nn.ELU() + >>> input = mindtorch.randn(2) + >>> output = m(input) + """ + + __constants__ = ["alpha", "inplace"] + alpha: float + inplace: bool + + def __init__(self, alpha: float = 1.0, inplace: bool = False) -> None: + super().__init__() + self.alpha = alpha + self.inplace = inplace + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.elu(input, self.alpha, self.inplace) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + inplace_str = ", inplace=True" if self.inplace else "" + return f"alpha={self.alpha}{inplace_str}" + + +class CELU(Module): + r"""Applies the CELU function element-wise. + + .. math:: + \text{CELU}(x) = \max(0,x) + \min(0, \alpha * (\exp(x/\alpha) - 1)) + + More details can be found in the paper `Continuously Differentiable Exponential Linear Units`_ . + + Args: + alpha: the :math:`\alpha` value for the CELU formulation. Default: 1.0 + inplace: can optionally do the operation in-place. Default: ``False`` + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/CELU.png + + Examples:: + + >>> m = nn.CELU() + >>> input = mindtorch.randn(2) + >>> output = m(input) + + .. _`Continuously Differentiable Exponential Linear Units`: + https://arxiv.org/abs/1704.07483 + """ + + __constants__ = ["alpha", "inplace"] + alpha: float + inplace: bool + + def __init__(self, alpha: float = 1.0, inplace: bool = False) -> None: + super().__init__() + self.alpha = alpha + self.inplace = inplace + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.celu(input, self.alpha, self.inplace) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + inplace_str = ", inplace=True" if self.inplace else "" + return f"alpha={self.alpha}{inplace_str}" + + +class SELU(Module): + r"""Applies the SELU function element-wise. + + .. math:: + \text{SELU}(x) = \text{scale} * (\max(0,x) + \min(0, \alpha * (\exp(x) - 1))) + + with :math:`\alpha = 1.6732632423543772848170429916717` and + :math:`\text{scale} = 1.0507009873554804934193349852946`. + + .. warning:: + When using ``kaiming_normal`` or ``kaiming_normal_`` for initialisation, + ``nonlinearity='linear'`` should be used instead of ``nonlinearity='selu'`` + in order to get `Self-Normalizing Neural Networks`_. + See :func:`mindtorch.nn.init.calculate_gain` for more information. + + More details can be found in the paper `Self-Normalizing Neural Networks`_ . + + Args: + inplace (bool, optional): can optionally do the operation in-place. Default: ``False`` + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. 
image:: ../scripts/activation_images/SELU.png + + Examples:: + + >>> m = nn.SELU() + >>> input = mindtorch.randn(2) + >>> output = m(input) + + .. _Self-Normalizing Neural Networks: https://arxiv.org/abs/1706.02515 + """ + + __constants__ = ["inplace"] + inplace: bool + + def __init__(self, inplace: bool = False) -> None: + super().__init__() + self.inplace = inplace + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.selu(input, self.inplace) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + inplace_str = "inplace=True" if self.inplace else "" + return inplace_str + + +class GLU(Module): + r"""Applies the gated linear unit function. + + :math:`{GLU}(a, b)= a \otimes \sigma(b)` where :math:`a` is the first half + of the input matrices and :math:`b` is the second half. + + Args: + dim (int): the dimension on which to split the input. Default: -1 + + Shape: + - Input: :math:`(\ast_1, N, \ast_2)` where `*` means, any number of additional + dimensions + - Output: :math:`(\ast_1, M, \ast_2)` where :math:`M=N/2` + + .. image:: ../scripts/activation_images/GLU.png + + Examples:: + + >>> m = nn.GLU() + >>> input = mindtorch.randn(4, 2) + >>> output = m(input) + """ + + __constants__ = ["dim"] + dim: int + + def __init__(self, dim: int = -1) -> None: + super().__init__() + self.dim = dim + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.glu(input, self.dim) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + return f"dim={self.dim}" + + +class GELU(Module): + r"""Applies the Gaussian Error Linear Units function. + + .. math:: \text{GELU}(x) = x * \Phi(x) + + where :math:`\Phi(x)` is the Cumulative Distribution Function for Gaussian Distribution. + + When the approximate argument is 'tanh', Gelu is estimated with: + + .. math:: \text{GELU}(x) = 0.5 * x * (1 + \text{Tanh}(\sqrt{2 / \pi} * (x + 0.044715 * x^3))) + + Args: + approximate (str, optional): the gelu approximation algorithm to use: + ``'none'`` | ``'tanh'``. Default: ``'none'`` + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/GELU.png + + Examples:: + + >>> m = nn.GELU() + >>> input = mindtorch.randn(2) + >>> output = m(input) + """ + + __constants__ = ["approximate"] + approximate: str + + def __init__(self, approximate: str = "none") -> None: + super().__init__() + self.approximate = approximate + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.gelu(input, approximate=self.approximate) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + return f"approximate={repr(self.approximate)}" + + +class Hardshrink(Module): + r"""Applies the Hard Shrinkage (Hardshrink) function element-wise. + + Hardshrink is defined as: + + .. math:: + \text{HardShrink}(x) = + \begin{cases} + x, & \text{ if } x > \lambda \\ + x, & \text{ if } x < -\lambda \\ + 0, & \text{ otherwise } + \end{cases} + + Args: + lambd: the :math:`\lambda` value for the Hardshrink formulation. Default: 0.5 + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. 
image:: ../scripts/activation_images/Hardshrink.png + + Examples:: + + >>> m = nn.Hardshrink() + >>> input = mindtorch.randn(2) + >>> output = m(input) + """ + + __constants__ = ["lambd"] + lambd: float + + def __init__(self, lambd: float = 0.5) -> None: + super().__init__() + self.lambd = lambd + + def forward(self, input: Tensor) -> Tensor: + """ + Run forward pass. + """ + return F.hardshrink(input, self.lambd) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + return f"{self.lambd}" + + +class LeakyReLU(Module): + r"""Applies the LeakyReLU function element-wise. + + .. math:: + \text{LeakyReLU}(x) = \max(0, x) + \text{negative\_slope} * \min(0, x) + + + or + + .. math:: + \text{LeakyReLU}(x) = + \begin{cases} + x, & \text{ if } x \geq 0 \\ + \text{negative\_slope} \times x, & \text{ otherwise } + \end{cases} + + Args: + negative_slope: Controls the angle of the negative slope (which is used for + negative input values). Default: 1e-2 + inplace: can optionally do the operation in-place. Default: ``False`` + + Shape: + - Input: :math:`(*)` where `*` means, any number of additional + dimensions + - Output: :math:`(*)`, same shape as the input + + .. image:: ../scripts/activation_images/LeakyReLU.png + + Examples:: + + >>> m = nn.LeakyReLU(0.1) + >>> input = mindtorch.randn(2) + >>> output = m(input) + """ + + __constants__ = ["inplace", "negative_slope"] + inplace: bool + negative_slope: float + + def __init__(self, negative_slope: float = 1e-2, inplace: bool = False) -> None: + super().__init__() + self.negative_slope = negative_slope + self.inplace = inplace + + def forward(self, input: Tensor) -> Tensor: + """ + Run forward pass. + """ + return F.leaky_relu(input, self.negative_slope, self.inplace) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + inplace_str = ", inplace=True" if self.inplace else "" + return f"negative_slope={self.negative_slope}{inplace_str}" + + +class LogSigmoid(Module): + r"""Applies the Logsigmoid function element-wise. + + .. math:: + \text{LogSigmoid}(x) = \log\left(\frac{ 1 }{ 1 + \exp(-x)}\right) + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/LogSigmoid.png + + Examples:: + + >>> m = nn.LogSigmoid() + >>> input = mindtorch.randn(2) + >>> output = m(input) + """ + + def forward(self, input: Tensor) -> Tensor: + """ + Run forward pass. + """ + return F.logsigmoid(input) + + +class Softplus(Module): + r"""Applies the Softplus function element-wise. + + .. math:: + \text{Softplus}(x) = \frac{1}{\beta} * \log(1 + \exp(\beta * x)) + + SoftPlus is a smooth approximation to the ReLU function and can be used + to constrain the output of a machine to always be positive. + + For numerical stability the implementation reverts to the linear function + when :math:`input \times \beta > threshold`. + + Args: + beta: the :math:`\beta` value for the Softplus formulation. Default: 1 + threshold: values above this revert to a linear function. Default: 20 + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. 
image:: ../scripts/activation_images/Softplus.png + + Examples:: + + >>> m = nn.Softplus() + >>> input = mindtorch.randn(2) + >>> output = m(input) + """ + + __constants__ = ["beta", "threshold"] + beta: float + threshold: float + + def __init__(self, beta: float = 1.0, threshold: float = 20.0) -> None: + super().__init__() + self.beta = beta + self.threshold = threshold + + def forward(self, input: Tensor) -> Tensor: + """ + Run forward pass. + """ + return F.softplus(input, self.beta, self.threshold) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + return f"beta={self.beta}, threshold={self.threshold}" + + +class Softshrink(Module): + r"""Applies the soft shrinkage function element-wise. + + .. math:: + \text{SoftShrinkage}(x) = + \begin{cases} + x - \lambda, & \text{ if } x > \lambda \\ + x + \lambda, & \text{ if } x < -\lambda \\ + 0, & \text{ otherwise } + \end{cases} + + Args: + lambd: the :math:`\lambda` (must be no less than zero) value for the Softshrink formulation. Default: 0.5 + + Shape: + - Input: :math:`(*)`, where :math:`*` means any number of dimensions. + - Output: :math:`(*)`, same shape as the input. + + .. image:: ../scripts/activation_images/Softshrink.png + + Examples:: + + >>> m = nn.Softshrink() + >>> input = mindtorch.randn(2) + >>> output = m(input) + """ + + __constants__ = ["lambd"] + lambd: float + + def __init__(self, lambd: float = 0.5) -> None: + super().__init__() + self.lambd = lambd + + def forward(self, input: Tensor) -> Tensor: + """ + Run forward pass. + """ + return F.softshrink(input, self.lambd) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + return str(self.lambd) + + +def _check_arg_device(x: Optional[mindtorch.Tensor]) -> bool: + if x is not None: + return x.device.type in [ + "cpu", + "cuda", + mindtorch.utils.backend_registration._privateuse1_backend_name, + ] + return True - def forward(self, input: Tensor) -> Tensor: - return F.softplus(input, self.beta, self.threshold) - def extra_repr(self) -> str: - return f'beta={self.beta}, threshold={self.threshold}' +def _arg_requires_grad(x: Optional[mindtorch.Tensor]) -> bool: + if x is not None: + return x.requires_grad + return False + + +def _is_make_fx_tracing(): + if not mindtorch.jit.is_scripting(): + mindtorch_dispatch_mode_stack = ( + mindtorch.utils._python_dispatch._get_current_dispatch_mode_stack() + ) + return any( + type(x) == mindtorch.fx.experimental.proxy_tensor.ProxyTorchDispatchMode + for x in mindtorch_dispatch_mode_stack + ) + else: + return False + class MultiheadAttention(Module): - r"""Allows the model to jointly attend to information - from different representation subspaces as described in the paper: - `Attention Is All You Need `_. + r"""Allows the model to jointly attend to information from different representation subspaces. + + This MultiheadAttention layer implements the original architecture described + in the `Attention Is All You Need `_ paper. The + intent of this layer is as a reference implementation for foundational understanding + and thus it contains only limited features relative to newer architectures. + Given the fast pace of innovation in transformer-like architectures, we recommend + exploring this `tutorial `_ + to build efficient layers from building blocks in core or using higher + level libraries from the `PyTorch Ecosystem `_. Multi-Head Attention is defined as: .. 
math:: - \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O + \text{MultiHead}(Q, K, V) = \text{Concat}(\text{head}_1,\dots,\text{head}_h)W^O - where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`. + where :math:`\text{head}_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`. - ``nn.MultiHeadAttention`` will use the optimized implementations of + ``nn.MultiheadAttention`` will use the optimized implementations of ``scaled_dot_product_attention()`` when possible. In addition to support for the new ``scaled_dot_product_attention()`` @@ -399,7 +1114,6 @@ class MultiheadAttention(Module): - training is disabled (using ``.eval()``) - ``add_bias_kv`` is ``False`` - ``add_zero_attn`` is ``False`` - - ``batch_first`` is ``True`` and the input is batched - ``kdim`` and ``vdim`` are equal to ``embed_dim`` - if a `NestedTensor `_ is passed, neither ``key_padding_mask`` nor ``attn_mask`` is passed @@ -437,13 +1151,30 @@ class MultiheadAttention(Module): """ - __constants__ = ['batch_first'] + __constants__ = ["batch_first"] bias_k: Optional[mindtorch.Tensor] bias_v: Optional[mindtorch.Tensor] - def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False, - kdim=None, vdim=None, batch_first=False, dtype=None) -> None: - factory_kwargs = {'dtype': dtype} + def __init__( + self, + embed_dim, + num_heads, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + kdim=None, + vdim=None, + batch_first=False, + device=None, + dtype=None, + ) -> None: + if embed_dim <= 0 or num_heads <= 0: + raise ValueError( + f"embed_dim and num_heads must be greater than 0," + f" got embed_dim={embed_dim} and num_heads={num_heads} instead" + ) + factory_kwargs = {"device": device, "dtype": dtype} super().__init__() self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim @@ -454,29 +1185,40 @@ def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=Fals self.dropout = dropout self.batch_first = batch_first self.head_dim = embed_dim // num_heads - assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" - + assert self.head_dim * num_heads == self.embed_dim, ( + "embed_dim must be divisible by num_heads" + ) if not self._qkv_same_embed_dim: - self.q_proj_weight = Parameter(ops.empty((embed_dim, embed_dim), **factory_kwargs)) - self.k_proj_weight = Parameter(ops.empty((embed_dim, self.kdim), **factory_kwargs)) - self.v_proj_weight = Parameter(ops.empty((embed_dim, self.vdim), **factory_kwargs)) - self.register_parameter('in_proj_weight', None) + self.q_proj_weight = Parameter( + mindtorch.empty((embed_dim, embed_dim), **factory_kwargs) + ) + self.k_proj_weight = Parameter( + mindtorch.empty((embed_dim, self.kdim), **factory_kwargs) + ) + self.v_proj_weight = Parameter( + mindtorch.empty((embed_dim, self.vdim), **factory_kwargs) + ) + self.register_parameter("in_proj_weight", None) else: - self.in_proj_weight = Parameter(ops.empty((3 * embed_dim, embed_dim), **factory_kwargs)) - self.register_parameter('q_proj_weight', None) - self.register_parameter('k_proj_weight', None) - self.register_parameter('v_proj_weight', None) + self.in_proj_weight = Parameter( + mindtorch.empty((3 * embed_dim, embed_dim), **factory_kwargs) + ) + self.register_parameter("q_proj_weight", None) + self.register_parameter("k_proj_weight", None) + self.register_parameter("v_proj_weight", None) if bias: - self.in_proj_bias = Parameter(ops.empty(3 * embed_dim, **factory_kwargs)) + self.in_proj_bias 
= Parameter(mindtorch.empty(3 * embed_dim, **factory_kwargs)) else: - self.register_parameter('in_proj_bias', None) - self.out_proj = Linear(embed_dim, embed_dim, bias=bias) + self.register_parameter("in_proj_bias", None) + self.out_proj = NonDynamicallyQuantizableLinear( + embed_dim, embed_dim, bias=bias, **factory_kwargs + ) if add_bias_kv: - self.bias_k = Parameter(ops.empty((1, 1, embed_dim), **factory_kwargs)) - self.bias_v = Parameter(ops.empty((1, 1, embed_dim), **factory_kwargs)) + self.bias_k = Parameter(mindtorch.empty((1, 1, embed_dim), **factory_kwargs)) + self.bias_v = Parameter(mindtorch.empty((1, 1, embed_dim), **factory_kwargs)) else: self.bias_k = self.bias_v = None @@ -484,105 +1226,116 @@ def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=Fals self._reset_parameters() - def _reset_parameters(self): + def _reset_parameters(self) -> None: if self._qkv_same_embed_dim: - init.xavier_uniform_(self.in_proj_weight) + xavier_uniform_(self.in_proj_weight) else: - init.xavier_uniform_(self.q_proj_weight) - init.xavier_uniform_(self.k_proj_weight) - init.xavier_uniform_(self.v_proj_weight) + xavier_uniform_(self.q_proj_weight) + xavier_uniform_(self.k_proj_weight) + xavier_uniform_(self.v_proj_weight) if self.in_proj_bias is not None: - init.constant_(self.in_proj_bias, 0.) - init.constant_(self.out_proj.bias, 0.) + constant_(self.in_proj_bias, 0.0) + constant_(self.out_proj.bias, 0.0) if self.bias_k is not None: - init.xavier_normal_(self.bias_k) + xavier_normal_(self.bias_k) if self.bias_v is not None: - init.xavier_normal_(self.bias_v) + xavier_normal_(self.bias_v) def __setstate__(self, state): # Support loading old MultiheadAttention checkpoints generated by v1.1.0 - if '_qkv_same_embed_dim' not in state: - state['_qkv_same_embed_dim'] = True + if "_qkv_same_embed_dim" not in state: + state["_qkv_same_embed_dim"] = True super().__setstate__(state) def forward( - self, - query: Tensor, - key: Tensor, - value: Tensor, - key_padding_mask: Optional[Tensor] = None, - need_weights: bool = True, - attn_mask: Optional[Tensor] = None, - average_attn_weights: bool = True, - is_causal : bool = False) -> Tuple[Tensor, Optional[Tensor]]: - r""" - Args: - query: Query embeddings of shape :math:`(L, E_q)` for unbatched input, :math:`(L, N, E_q)` when ``batch_first=False`` - or :math:`(N, L, E_q)` when ``batch_first=True``, where :math:`L` is the target sequence length, - :math:`N` is the batch size, and :math:`E_q` is the query embedding dimension ``embed_dim``. - Queries are compared against key-value pairs to produce the output. - See "Attention Is All You Need" for more details. - key: Key embeddings of shape :math:`(S, E_k)` for unbatched input, :math:`(S, N, E_k)` when ``batch_first=False`` - or :math:`(N, S, E_k)` when ``batch_first=True``, where :math:`S` is the source sequence length, - :math:`N` is the batch size, and :math:`E_k` is the key embedding dimension ``kdim``. - See "Attention Is All You Need" for more details. - value: Value embeddings of shape :math:`(S, E_v)` for unbatched input, :math:`(S, N, E_v)` when - ``batch_first=False`` or :math:`(N, S, E_v)` when ``batch_first=True``, where :math:`S` is the source - sequence length, :math:`N` is the batch size, and :math:`E_v` is the value embedding dimension ``vdim``. - See "Attention Is All You Need" for more details. - key_padding_mask: If specified, a mask of shape :math:`(N, S)` indicating which elements within ``key`` - to ignore for the purpose of attention (i.e. treat as "padding"). 
For unbatched `query`, shape should be :math:`(S)`. - Binary and float masks are supported. - For a binary mask, a ``True`` value indicates that the corresponding ``key`` value will be ignored for - the purpose of attention. For a float mask, it will be directly added to the corresponding ``key`` value. - need_weights: If specified, returns ``attn_output_weights`` in addition to ``attn_outputs``. - Set ``need_weights=False`` to use the optimized ``scaled_dot_product_attention`` - and achieve the best performance for MHA. - Default: ``True``. - attn_mask: If specified, a 2D or 3D mask preventing attention to certain positions. Must be of shape - :math:`(L, S)` or :math:`(N\cdot\text{num\_heads}, L, S)`, where :math:`N` is the batch size, - :math:`L` is the target sequence length, and :math:`S` is the source sequence length. A 2D mask will be - broadcasted across the batch while a 3D mask allows for a different mask for each entry in the batch. - Binary and float masks are supported. For a binary mask, a ``True`` value indicates that the - corresponding position is not allowed to attend. For a float mask, the mask values will be added to - the attention weight. - If both attn_mask and key_padding_mask are supplied, their types should match. - average_attn_weights: If true, indicates that the returned ``attn_weights`` should be averaged across - heads. Otherwise, ``attn_weights`` are provided separately per head. Note that this flag only has an - effect when ``need_weights=True``. Default: ``True`` (i.e. average weights across heads) - is_causal: If specified, applies a causal mask as attention mask. - Default: ``False``. - Warning: - ``is_causal`` provides a hint that ``attn_mask`` is the - causal mask. Providing incorrect hints can result in - incorrect execution, including forward and backward - compatibility. + self, + query: Tensor, + key: Tensor, + value: Tensor, + key_padding_mask: Optional[Tensor] = None, + need_weights: bool = True, + attn_mask: Optional[Tensor] = None, + average_attn_weights: bool = True, + is_causal: bool = False, + ) -> tuple[Tensor, Optional[Tensor]]: + r"""Compute attention outputs using query, key, and value embeddings. + + Supports optional parameters for padding, masks and attention weights. - Outputs: - - **attn_output** - Attention outputs of shape :math:`(L, E)` when input is unbatched, - :math:`(L, N, E)` when ``batch_first=False`` or :math:`(N, L, E)` when ``batch_first=True``, - where :math:`L` is the target sequence length, :math:`N` is the batch size, and :math:`E` is the - embedding dimension ``embed_dim``. - - **attn_output_weights** - Only returned when ``need_weights=True``. If ``average_attn_weights=True``, - returns attention weights averaged across heads of shape :math:`(L, S)` when input is unbatched or - :math:`(N, L, S)`, where :math:`N` is the batch size, :math:`L` is the target sequence length, and - :math:`S` is the source sequence length. If ``average_attn_weights=False``, returns attention weights per - head of shape :math:`(\text{num\_heads}, L, S)` when input is unbatched or :math:`(N, \text{num\_heads}, L, S)`. - - .. note:: - `batch_first` argument is ignored for unbatched inputs. 
- """ - - is_batched = query.ndim == 3 + Args: + query: Query embeddings of shape :math:`(L, E_q)` for unbatched input, :math:`(L, N, E_q)` when ``batch_first=False`` + or :math:`(N, L, E_q)` when ``batch_first=True``, where :math:`L` is the target sequence length, + :math:`N` is the batch size, and :math:`E_q` is the query embedding dimension ``embed_dim``. + Queries are compared against key-value pairs to produce the output. + See "Attention Is All You Need" for more details. + key: Key embeddings of shape :math:`(S, E_k)` for unbatched input, :math:`(S, N, E_k)` when ``batch_first=False`` + or :math:`(N, S, E_k)` when ``batch_first=True``, where :math:`S` is the source sequence length, + :math:`N` is the batch size, and :math:`E_k` is the key embedding dimension ``kdim``. + See "Attention Is All You Need" for more details. + value: Value embeddings of shape :math:`(S, E_v)` for unbatched input, :math:`(S, N, E_v)` when + ``batch_first=False`` or :math:`(N, S, E_v)` when ``batch_first=True``, where :math:`S` is the source + sequence length, :math:`N` is the batch size, and :math:`E_v` is the value embedding dimension ``vdim``. + See "Attention Is All You Need" for more details. + key_padding_mask: If specified, a mask of shape :math:`(N, S)` indicating which elements within ``key`` + to ignore for the purpose of attention (i.e. treat as "padding"). For unbatched `query`, shape should be :math:`(S)`. + Binary and float masks are supported. + For a binary mask, a ``True`` value indicates that the corresponding ``key`` value will be ignored for + the purpose of attention. For a float mask, it will be directly added to the corresponding ``key`` value. + need_weights: If specified, returns ``attn_output_weights`` in addition to ``attn_outputs``. + Set ``need_weights=False`` to use the optimized ``scaled_dot_product_attention`` + and achieve the best performance for MHA. + Default: ``True``. + attn_mask: If specified, a 2D or 3D mask preventing attention to certain positions. Must be of shape + :math:`(L, S)` or :math:`(N\cdot\text{num\_heads}, L, S)`, where :math:`N` is the batch size, + :math:`L` is the target sequence length, and :math:`S` is the source sequence length. A 2D mask will be + broadcasted across the batch while a 3D mask allows for a different mask for each entry in the batch. + Binary and float masks are supported. For a binary mask, a ``True`` value indicates that the + corresponding position is not allowed to attend. For a float mask, the mask values will be added to + the attention weight. + If both attn_mask and key_padding_mask are supplied, their types should match. + average_attn_weights: If true, indicates that the returned ``attn_weights`` should be averaged across + heads. Otherwise, ``attn_weights`` are provided separately per head. Note that this flag only has an + effect when ``need_weights=True``. Default: ``True`` (i.e. average weights across heads) + is_causal: If specified, applies a causal mask as attention mask. + Default: ``False``. + Warning: + ``is_causal`` provides a hint that ``attn_mask`` is the + causal mask. Providing incorrect hints can result in + incorrect execution, including forward and backward + compatibility. + + Outputs: + - **attn_output** - Attention outputs of shape :math:`(L, E)` when input is unbatched, + :math:`(L, N, E)` when ``batch_first=False`` or :math:`(N, L, E)` when ``batch_first=True``, + where :math:`L` is the target sequence length, :math:`N` is the batch size, and :math:`E` is the + embedding dimension ``embed_dim``. 
+ - **attn_output_weights** - Only returned when ``need_weights=True``. If ``average_attn_weights=True``, + returns attention weights averaged across heads of shape :math:`(L, S)` when input is unbatched or + :math:`(N, L, S)`, where :math:`N` is the batch size, :math:`L` is the target sequence length, and + :math:`S` is the source sequence length. If ``average_attn_weights=False``, returns attention weights per + head of shape :math:`(\text{num\_heads}, L, S)` when input is unbatched or :math:`(N, \text{num\_heads}, L, S)`. + + .. note:: + `batch_first` argument is ignored for unbatched inputs. + """ # noqa: B950 + why_not_fast_path = "" + if ( + (attn_mask is not None and mindtorch.is_floating_point(attn_mask)) + or (key_padding_mask is not None) + and mindtorch.is_floating_point(key_padding_mask) + ): + why_not_fast_path = "floating-point masks are not supported for fast path." + + is_batched = query.dim() == 3 key_padding_mask = F._canonical_mask( mask=key_padding_mask, mask_name="key_padding_mask", other_type=F._none_or_dtype(attn_mask), other_name="attn_mask", - target_type=query.dtype + target_type=query.dtype, ) attn_mask = F._canonical_mask( @@ -594,52 +1347,179 @@ def forward( check_other=False, ) + is_fastpath_enabled = mindtorch.backends.mha.get_fastpath_enabled() + + if not is_fastpath_enabled: + why_not_fast_path = "mindtorch.backends.mha.get_fastpath_enabled() was not True" + elif not is_batched: + why_not_fast_path = ( + f"input not batched; expected query.dim() of 3 but got {query.dim()}" + ) + elif query is not key or key is not value: + # When lifting this restriction, don't forget to either + # enforce that the dtypes all match or test cases where + # they don't! + why_not_fast_path = "non-self attention was used (query, key, and value are not the same Tensor)" + elif self.in_proj_bias is not None and query.dtype != self.in_proj_bias.dtype: + why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_bias ({self.in_proj_bias.dtype}) don't match" + elif self.in_proj_weight is None: + why_not_fast_path = "in_proj_weight was None" + elif query.dtype != self.in_proj_weight.dtype: + # this case will fail anyway, but at least they'll get a useful error message. + why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_weight ({self.in_proj_weight.dtype}) don't match" + elif self.training: + why_not_fast_path = "training is enabled" + elif (self.num_heads % 2) != 0: + why_not_fast_path = "self.num_heads is not even" + elif not self.batch_first: + why_not_fast_path = "batch_first was not True" + elif self.bias_k is not None: + why_not_fast_path = "self.bias_k was not None" + elif self.bias_v is not None: + why_not_fast_path = "self.bias_v was not None" + elif self.add_zero_attn: + why_not_fast_path = "add_zero_attn was enabled" + elif not self._qkv_same_embed_dim: + why_not_fast_path = "_qkv_same_embed_dim was not True" + elif query.is_nested and ( + key_padding_mask is not None or attn_mask is not None + ): + why_not_fast_path = ( + "supplying both src_key_padding_mask and src_mask at the same time \ + is not supported with NestedTensor input" + ) + elif mindtorch.is_autocast_enabled(): + why_not_fast_path = "autocast is enabled" + + if not why_not_fast_path: + tensor_args = ( + query, + key, + value, + self.in_proj_weight, + self.in_proj_bias, + self.out_proj.weight, + self.out_proj.bias, + ) + # We have to use list comprehensions below because TorchScript does not support + # generator expressions. 
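+            # A descriptive note on the guards below (paraphrasing the checks
+            # that follow): the fast path is disabled when any tensor argument
+            # carries a __torch_function__-style override, when make_fx tracing
+            # is active, when an argument lives on an unsupported device, or
+            # when autograd would need gradients for the projection weights.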
+ if mindtorch.overrides.has_mindtorch_function(tensor_args): + why_not_fast_path = "some Tensor argument has_mindtorch_function" + elif _is_make_fx_tracing(): + why_not_fast_path = "we are running make_fx tracing" + elif not all(_check_arg_device(x) for x in tensor_args): + why_not_fast_path = ( + "some Tensor argument's device is neither one of " + f"cpu, cuda or {mindtorch.utils.backend_registration._privateuse1_backend_name}" + ) + elif mindtorch.is_grad_enabled() and any( + _arg_requires_grad(x) for x in tensor_args + ): + why_not_fast_path = ( + "grad is enabled and at least one of query or the " + "input/output projection weights or biases requires_grad" + ) + if not why_not_fast_path: + merged_mask, mask_type = self.merge_masks( + attn_mask, key_padding_mask, query + ) + + if self.in_proj_bias is not None and self.in_proj_weight is not None: + return mindtorch._native_multi_head_attention( + query, + key, + value, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.out_proj.weight, + self.out_proj.bias, + merged_mask, + need_weights, + average_attn_weights, + mask_type, + ) + + any_nested = query.is_nested or key.is_nested or value.is_nested + assert not any_nested, ( + "MultiheadAttention does not support NestedTensor outside of its fast path. " + + f"The fast path was not hit because {why_not_fast_path}" + ) + if self.batch_first and is_batched: # make sure that the transpose op does not affect the "is" property if key is value: if query is key: - query = key = value = ops.transpose(query, 1, 0) + query = key = value = query.transpose(1, 0) else: - query, key = (ops.transpose(x, 1, 0) for x in (query, key)) + query, key = (x.transpose(1, 0) for x in (query, key)) value = key else: - query, key, value = (ops.transpose(x, 1, 0) for x in (query, key, value)) + query, key, value = (x.transpose(1, 0) for x in (query, key, value)) if not self._qkv_same_embed_dim: attn_output, attn_output_weights = F.multi_head_attention_forward( - query, key, value, self.embed_dim, self.num_heads, - self.in_proj_weight, self.in_proj_bias, - self.bias_k, self.bias_v, self.add_zero_attn, - self.dropout, self.out_proj.weight, self.out_proj.bias, + query, + key, + value, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.bias_k, + self.bias_v, + self.add_zero_attn, + self.dropout, + self.out_proj.weight, + self.out_proj.bias, training=self.training, - key_padding_mask=key_padding_mask, need_weights=need_weights, + key_padding_mask=key_padding_mask, + need_weights=need_weights, attn_mask=attn_mask, use_separate_proj_weight=True, - q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight, + q_proj_weight=self.q_proj_weight, + k_proj_weight=self.k_proj_weight, v_proj_weight=self.v_proj_weight, average_attn_weights=average_attn_weights, - is_causal=is_causal) + is_causal=is_causal, + ) else: attn_output, attn_output_weights = F.multi_head_attention_forward( - query, key, value, self.embed_dim, self.num_heads, - self.in_proj_weight, self.in_proj_bias, - self.bias_k, self.bias_v, self.add_zero_attn, - self.dropout, self.out_proj.weight, self.out_proj.bias, + query, + key, + value, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.bias_k, + self.bias_v, + self.add_zero_attn, + self.dropout, + self.out_proj.weight, + self.out_proj.bias, training=self.training, key_padding_mask=key_padding_mask, need_weights=need_weights, attn_mask=attn_mask, average_attn_weights=average_attn_weights, - 
is_causal=is_causal) + is_causal=is_causal, + ) if self.batch_first and is_batched: - return ops.transpose(attn_output, 1, 0), attn_output_weights + return attn_output.transpose(1, 0), attn_output_weights else: return attn_output, attn_output_weights - def merge_masks(self, attn_mask: Optional[Tensor], key_padding_mask: Optional[Tensor], - query: Tensor) -> Tuple[Optional[Tensor], Optional[int]]: - r""" - Determine mask type and combine masks if necessary. If only one mask is provided, that mask + def merge_masks( + self, + attn_mask: Optional[Tensor], + key_padding_mask: Optional[Tensor], + query: Tensor, + ) -> tuple[Optional[Tensor], Optional[int]]: + r"""Determine mask type and combine masks if necessary. + + If only one mask is provided, that mask and the corresponding mask type will be returned. If both masks are provided, they will be both expanded to shape ``(batch_size, num_heads, seq_len, seq_len)``, combined with logical ``or`` and mask type 2 will be returned @@ -664,19 +1544,24 @@ def merge_masks(self, attn_mask: Optional[Tensor], key_padding_mask: Optional[Te mask_type = 2 # Always expands attn_mask to 4D - if attn_mask.ndim == 3: + if attn_mask.dim() == 3: attn_mask_expanded = attn_mask.view(batch_size, -1, seq_len, seq_len) - else: # attn_mask.ndim == 2: - attn_mask_expanded = attn_mask.view(1, 1, seq_len, seq_len).expand(batch_size, self.num_heads, -1, -1) + else: # attn_mask.dim() == 2: + attn_mask_expanded = attn_mask.view(1, 1, seq_len, seq_len).expand( + batch_size, self.num_heads, -1, -1 + ) merged_mask = attn_mask_expanded if key_padding_mask is not None: - key_padding_mask_expanded = key_padding_mask.view(batch_size, 1, 1, seq_len).expand(-1, self.num_heads, -1, -1) + key_padding_mask_expanded = key_padding_mask.view( + batch_size, 1, 1, seq_len + ).expand(-1, self.num_heads, -1, -1) merged_mask = attn_mask_expanded + key_padding_mask_expanded # no attn_mask and no key_padding_mask, returns None, None return merged_mask, mask_type + class PReLU(Module): r"""Applies the element-wise PReLU function. @@ -731,195 +1616,287 @@ class PReLU(Module): num_parameters: int def __init__( - self, num_parameters: int = 1, init: float = 0.25, dtype=None + self, num_parameters: int = 1, init: float = 0.25, device=None, dtype=None ) -> None: - factory_kwargs = {"dtype": dtype} + factory_kwargs = {"device": device, "dtype": dtype} self.num_parameters = num_parameters super().__init__() self.init = init - self.weight = Parameter(ops.empty(num_parameters, **factory_kwargs)) + self.weight = Parameter(mindtorch.empty(num_parameters, **factory_kwargs)) self.reset_parameters() - def reset_parameters(self): - init.constant_(self.weight, self.init) + def reset_parameters(self) -> None: + """ + Resets parameters based on their initialization used in ``__init__``. + """ + mindtorch.nn.init.constant_(self.weight, self.init) def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ return F.prelu(input, self.weight) def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ return f"num_parameters={self.num_parameters}" -class CELU(Module): - r"""Applies the CELU function element-wise. - - .. math:: - \text{CELU}(x) = \max(0,x) + \min(0, \alpha * (\exp(x/\alpha) - 1)) - More details can be found in the paper `Continuously Differentiable Exponential Linear Units`_ . +class Softsign(Module): + r"""Applies the element-wise Softsign function. - Args: - alpha: the :math:`\alpha` value for the CELU formulation. 
Default: 1.0 - inplace: can optionally do the operation in-place. Default: ``False`` + .. math:: + \text{SoftSign}(x) = \frac{x}{ 1 + |x|} Shape: - Input: :math:`(*)`, where :math:`*` means any number of dimensions. - Output: :math:`(*)`, same shape as the input. - .. image:: ../scripts/activation_images/CELU.png + .. image:: ../scripts/activation_images/Softsign.png Examples:: - >>> m = nn.CELU() + >>> m = nn.Softsign() >>> input = mindtorch.randn(2) >>> output = m(input) - - .. _`Continuously Differentiable Exponential Linear Units`: - https://arxiv.org/abs/1704.07483 """ - __constants__ = ["alpha", "inplace"] - alpha: float - inplace: bool - - def __init__(self, alpha: float = 1.0, inplace: bool = False) -> None: - super().__init__() - self.alpha = alpha - self.inplace = inplace - def forward(self, input: Tensor) -> Tensor: - return F.celu(input, self.alpha, self.inplace) + """ + Runs the forward pass. + """ + return F.softsign(input) - def extra_repr(self) -> str: - inplace_str = ", inplace=True" if self.inplace else "" - return f"alpha={self.alpha}{inplace_str}" -class SELU(Module): - r"""Applies the SELU function element-wise. +class Tanhshrink(Module): + r"""Applies the element-wise Tanhshrink function. .. math:: - \text{SELU}(x) = \text{scale} * (\max(0,x) + \min(0, \alpha * (\exp(x) - 1))) - - with :math:`\alpha = 1.6732632423543772848170429916717` and - :math:`\text{scale} = 1.0507009873554804934193349852946`. - - .. warning:: - When using ``kaiming_normal`` or ``kaiming_normal_`` for initialisation, - ``nonlinearity='linear'`` should be used instead of ``nonlinearity='selu'`` - in order to get `Self-Normalizing Neural Networks`_. - See :func:`mindtorch.nn.init.calculate_gain` for more information. - - More details can be found in the paper `Self-Normalizing Neural Networks`_ . - - Args: - inplace (bool, optional): can optionally do the operation in-place. Default: ``False`` + \text{Tanhshrink}(x) = x - \tanh(x) Shape: - Input: :math:`(*)`, where :math:`*` means any number of dimensions. - Output: :math:`(*)`, same shape as the input. - .. image:: ../scripts/activation_images/SELU.png + .. image:: ../scripts/activation_images/Tanhshrink.png Examples:: - >>> m = nn.SELU() + >>> m = nn.Tanhshrink() >>> input = mindtorch.randn(2) >>> output = m(input) + """ - .. _Self-Normalizing Neural Networks: https://arxiv.org/abs/1706.02515 + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.tanhshrink(input) + + +class Softmin(Module): + r"""Applies the Softmin function to an n-dimensional input Tensor. + + Rescales them so that the elements of the n-dimensional output Tensor + lie in the range `[0, 1]` and sum to 1. + + Softmin is defined as: + + .. math:: + \text{Softmin}(x_{i}) = \frac{\exp(-x_i)}{\sum_j \exp(-x_j)} + + Shape: + - Input: :math:`(*)` where `*` means, any number of additional + dimensions + - Output: :math:`(*)`, same shape as the input + + Args: + dim (int): A dimension along which Softmin will be computed (so every slice + along dim will sum to 1). 
+ + Returns: + a Tensor of the same dimension and shape as the input, with + values in the range [0, 1] + + Examples:: + + >>> m = nn.Softmin(dim=1) + >>> input = mindtorch.randn(2, 3) + >>> output = m(input) """ - __constants__ = ["inplace"] - inplace: bool + __constants__ = ["dim"] + dim: Optional[int] - def __init__(self, inplace: bool = False) -> None: + def __init__(self, dim: Optional[int] = None) -> None: super().__init__() - self.inplace = inplace + self.dim = dim + + def __setstate__(self, state): + super().__setstate__(state) + if not hasattr(self, "dim"): + self.dim = None def forward(self, input: Tensor) -> Tensor: - return F.selu(input) + """ + Runs the forward pass. + """ + return F.softmin(input, self.dim, _stacklevel=5) def extra_repr(self) -> str: - inplace_str = "inplace=True" if self.inplace else "" - return inplace_str + """ + Return the extra representation of the module. + """ + return f"dim={self.dim}" -class Hardsigmoid(Module): - r"""Applies the Hardsigmoid function element-wise. +class Softmax(Module): + r"""Applies the Softmax function to an n-dimensional input Tensor. - Hardsigmoid is defined as: + Rescales them so that the elements of the n-dimensional output Tensor + lie in the range [0,1] and sum to 1. + + Softmax is defined as: .. math:: - \text{Hardsigmoid}(x) = \begin{cases} - 0 & \text{if~} x \le -3, \\ - 1 & \text{if~} x \ge +3, \\ - x / 6 + 1 / 2 & \text{otherwise} - \end{cases} + \text{Softmax}(x_{i}) = \frac{\exp(x_i)}{\sum_j \exp(x_j)} - Args: - inplace: can optionally do the operation in-place. Default: ``False`` + When the input Tensor is a sparse tensor then the unspecified + values are treated as ``-inf``. Shape: - - Input: :math:`(*)`, where :math:`*` means any number of dimensions. - - Output: :math:`(*)`, same shape as the input. + - Input: :math:`(*)` where `*` means, any number of additional + dimensions + - Output: :math:`(*)`, same shape as the input - .. image:: ../scripts/activation_images/Hardsigmoid.png + Returns: + a Tensor of the same dimension and shape as the input with + values in the range [0, 1] + + Args: + dim (int): A dimension along which Softmax will be computed (so every slice + along dim will sum to 1). + + .. note:: + This module doesn't work directly with NLLLoss, + which expects the Log to be computed between the Softmax and itself. + Use `LogSoftmax` instead (it's faster and has better numerical properties). Examples:: - >>> m = nn.Hardsigmoid() - >>> input = mindtorch.randn(2) + >>> m = nn.Softmax(dim=1) + >>> input = mindtorch.randn(2, 3) >>> output = m(input) - """ - __constants__ = ["inplace"] + """ - inplace: bool + __constants__ = ["dim"] + dim: Optional[int] - def __init__(self, inplace: bool = False) -> None: + def __init__(self, dim: Optional[int] = None) -> None: super().__init__() - self.inplace = inplace + self.dim = dim + + def __setstate__(self, state): + super().__setstate__(state) + if not hasattr(self, "dim"): + self.dim = None def forward(self, input: Tensor) -> Tensor: - return F.hardsigmoid(input, self.inplace) + """ + Runs the forward pass. + """ + return F.softmax(input, self.dim, _stacklevel=5) + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + return f"dim={self.dim}" -class Hardswish(Module): - r"""Applies the Hardswish function, element-wise. - Method described in the paper: `Searching for MobileNetV3 `_. +class Softmax2d(Module): + r"""Applies SoftMax over features to each spatial location. 
- Hardswish is defined as: + When given an image of ``Channels x Height x Width``, it will + apply `Softmax` to each location :math:`(Channels, h_i, w_j)` - .. math:: - \text{Hardswish}(x) = \begin{cases} - 0 & \text{if~} x \le -3, \\ - x & \text{if~} x \ge +3, \\ - x \cdot (x + 3) /6 & \text{otherwise} - \end{cases} + Shape: + - Input: :math:`(N, C, H, W)` or :math:`(C, H, W)`. + - Output: :math:`(N, C, H, W)` or :math:`(C, H, W)` (same shape as input) - Args: - inplace: can optionally do the operation in-place. Default: ``False`` + Returns: + a Tensor of the same dimension and shape as the input with + values in the range [0, 1] + + Examples:: + + >>> m = nn.Softmax2d() + >>> # you softmax over the 2nd dimension + >>> input = mindtorch.randn(2, 3, 12, 13) + >>> output = m(input) + """ + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + if input.dim() not in (3, 4): + raise ValueError( + f"Softmax2d: expected input to be 3D or 4D, got {input.dim()}D instead" + ) + return F.softmax(input, -3, _stacklevel=5) + + +class LogSoftmax(Module): + r"""Applies the :math:`\log(\text{Softmax}(x))` function to an n-dimensional input Tensor. + + The LogSoftmax formulation can be simplified as: + + .. math:: + \text{LogSoftmax}(x_{i}) = \log\left(\frac{\exp(x_i) }{ \sum_j \exp(x_j)} \right) Shape: - - Input: :math:`(*)`, where :math:`*` means any number of dimensions. - - Output: :math:`(*)`, same shape as the input. + - Input: :math:`(*)` where `*` means, any number of additional + dimensions + - Output: :math:`(*)`, same shape as the input - .. image:: ../scripts/activation_images/Hardswish.png + Args: + dim (int): A dimension along which LogSoftmax will be computed. + + Returns: + a Tensor of the same dimension and shape as the input with + values in the range [-inf, 0) Examples:: - >>> m = nn.Hardswish() - >>> input = mindtorch.randn(2) + >>> m = nn.LogSoftmax(dim=1) + >>> input = mindtorch.randn(2, 3) >>> output = m(input) """ - __constants__ = ["inplace"] - - inplace: bool + __constants__ = ["dim"] + dim: Optional[int] - def __init__(self, inplace: bool = False) -> None: + def __init__(self, dim: Optional[int] = None) -> None: super().__init__() - self.inplace = inplace + self.dim = dim + + def __setstate__(self, state): + super().__setstate__(state) + if not hasattr(self, "dim"): + self.dim = None def forward(self, input: Tensor) -> Tensor: - return F.hardswish(input, self.inplace) + """ + Runs the forward pass. + """ + return F.log_softmax(input, self.dim, _stacklevel=5) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + return f"dim={self.dim}" \ No newline at end of file diff --git a/mindtorch/nn/modules/adaptive.py b/mindtorch/nn/modules/adaptive.py index 1b0f112d1..a24920428 100644 --- a/mindtorch/nn/modules/adaptive.py +++ b/mindtorch/nn/modules/adaptive.py @@ -1,30 +1,32 @@ -"""adaptive""" # mypy: allow-untyped-defs from collections import namedtuple -from typing import List, Sequence +from collections.abc import Sequence -from mindtorch import Tensor +import mindtorch import mindtorch.nn.functional as F +from mindtorch import Tensor -from . import Sequential, ModuleList, Linear +from .container import ModuleList, Sequential +from .linear import Linear from .module import Module -from ... 
import ops -__all__ = ['AdaptiveLogSoftmaxWithLoss'] -_ASMoutput = namedtuple('_ASMoutput', ['output', 'loss']) +__all__ = ["AdaptiveLogSoftmaxWithLoss"] +_ASMoutput = namedtuple("_ASMoutput", ["output", "loss"]) class AdaptiveLogSoftmaxWithLoss(Module): - r"""Efficient softmax approximation. + ( + """Efficient softmax approximation. As described in `Efficient softmax approximation for GPUs by Edouard Grave, Armand Joulin, Moustapha Ciss\u00e9, David Grangier, and Herv\u00e9 J\u00e9gou `__. - +""" + r""" Adaptive softmax is an approximate strategy for training models with large output spaces. It is most effective when the label distribution is highly imbalanced, for example in natural language modelling, where the word @@ -104,10 +106,11 @@ class AdaptiveLogSoftmaxWithLoss(Module): .. _Zipf's law: https://en.wikipedia.org/wiki/Zipf%27s_law """ + ) in_features: int n_classes: int - cutoffs: List[int] + cutoffs: list[int] div_value: float head_bias: bool head: Linear @@ -118,27 +121,31 @@ def __init__( in_features: int, n_classes: int, cutoffs: Sequence[int], - div_value: float = 4., + div_value: float = 4.0, head_bias: bool = False, - dtype=None + device=None, + dtype=None, ) -> None: - factory_kwargs = {'dtype': dtype} + factory_kwargs = {"device": device, "dtype": dtype} super().__init__() cutoffs = list(cutoffs) - if (len(cutoffs) == 0): + if len(cutoffs) == 0: raise ValueError("cutoffs should be a sequence of length larger than 0") - if (cutoffs != sorted(cutoffs)) \ - or (min(cutoffs) <= 0) \ - or (max(cutoffs) > (n_classes - 1)) \ - or (len(set(cutoffs)) != len(cutoffs)) \ - or any(int(c) != c for c in cutoffs): - - raise ValueError("cutoffs should be a sequence of unique, positive " - "integers sorted in an increasing order, where " - "each value is between 1 and n_classes-1") + if ( + (cutoffs != sorted(cutoffs)) + or (min(cutoffs) <= 0) + or (max(cutoffs) > (n_classes - 1)) + or (len(set(cutoffs)) != len(cutoffs)) + or any(int(c) != c for c in cutoffs) + ): + raise ValueError( + "cutoffs should be a sequence of unique, positive " + "integers sorted in an increasing order, where " + "each value is between 1 and n_classes-1" + ) self.in_features = in_features self.n_classes = n_classes @@ -150,12 +157,12 @@ def __init__( self.n_clusters = len(self.cutoffs) - 1 self.head_size = self.shortlist_size + self.n_clusters - self.head = Linear(self.in_features, self.head_size, bias=self.head_bias, - **factory_kwargs) + self.head = Linear( + self.in_features, self.head_size, bias=self.head_bias, **factory_kwargs + ) self.tail = ModuleList() for i in range(self.n_clusters): - hsz = int(self.in_features // (self.div_value ** (i + 1))) osz = self.cutoffs[i + 1] - self.cutoffs[i] @@ -167,53 +174,66 @@ def __init__( self.tail.append(projection) def reset_parameters(self) -> None: + """ + Resets parameters based on their initialization used in ``__init__``. + """ self.head.reset_parameters() - for i2h, h2o in self.tail: - i2h.reset_parameters() - h2o.reset_parameters() + for i2h, h2o in self.tail: # type: ignore[misc] + i2h.reset_parameters() # type: ignore[has-type] + h2o.reset_parameters() # type: ignore[has-type] def forward(self, input_: Tensor, target_: Tensor) -> _ASMoutput: - targ_dim = target_.ndim + """ + Runs the forward pass. 
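+
+        Returns an ``_ASMoutput`` named tuple ``(output, loss)``: ``output`` holds
+        the log-probability assigned to each example's target class and ``loss``
+        is their negative mean.
+
+        A minimal usage sketch (the sizes are illustrative only, and
+        ``mindtorch.randint`` is assumed to mirror ``torch.randint``)::
+
+            >>> asm = AdaptiveLogSoftmaxWithLoss(64, 1000, cutoffs=[10, 100])
+            >>> input = mindtorch.randn(32, 64)
+            >>> target = mindtorch.randint(1000, (32,))
+            >>> out, loss = asm(input, target)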
+ """ + targ_dim = target_.dim() if targ_dim == 1: - if input_.shape[0] != target_.shape[0]: - raise RuntimeError('Input and target should have the same size ' - 'in the batch dimension.') - if input_.ndim != 2: - raise RuntimeError('1D target tensor expects 2D input tensors, ' - 'but found inputs with size', input_.shape) + if input_.size(0) != target_.size(0): + raise RuntimeError( + "Input and target should have the same size in the batch dimension." + ) + if input_.dim() != 2: + raise RuntimeError( + "1D target tensor expects 2D input tensors, " + "but found inputs with size", + input_.size(), + ) elif targ_dim == 0: - if input_.ndim != 1: - raise RuntimeError('0D target tensor expects 1D input tensors, ' - 'but found inputs with size', input_.shape) + if input_.dim() != 1: + raise RuntimeError( + "0D target tensor expects 1D input tensors, " + "but found inputs with size", + input_.size(), + ) else: - raise RuntimeError('0D or 1D target tensor expected, ' - 'multi-target not supported') + raise RuntimeError( + "0D or 1D target tensor expected, multi-target not supported" + ) is_batched = targ_dim > 0 input = input_ if is_batched else input_.unsqueeze(0) target = target_ if is_batched else target_.unsqueeze(0) used_rows = 0 - batch_size = target.shape[0] + batch_size = target.size(0) - output = ops.zeros(batch_size, dtype=input.dtype) - gather_inds = ops.zeros(batch_size, dtype=target.dtype) + output = input.new_zeros(batch_size) + gather_inds = target.new_empty(batch_size) cutoff_values = [0] + self.cutoffs for i in range(len(cutoff_values) - 1): - low_idx = cutoff_values[i] high_idx = cutoff_values[i + 1] target_mask = (target >= low_idx) & (target < high_idx) - row_indices = ops.nonzero(target_mask).squeeze() + row_indices = target_mask.nonzero().squeeze() if row_indices.numel() == 0: continue if i == 0: - gather_inds = ops.index_add(gather_inds, 0, row_indices, target[target_mask]) + gather_inds.index_copy_(0, row_indices, target[target_mask]) else: relative_target = target[target_mask] - low_idx @@ -222,21 +242,23 @@ def forward(self, input_: Tensor, target_: Tensor) -> _ASMoutput: cluster_output = self.tail[i - 1](input_subset) cluster_index = self.shortlist_size + i - 1 - gather_inds = ops.index_fill(gather_inds, 0, row_indices, cluster_index) + gather_inds.index_fill_(0, row_indices, cluster_index) cluster_logprob = F.log_softmax(cluster_output, dim=1) local_logprob = cluster_logprob.gather(1, relative_target.unsqueeze(1)) - output = ops.index_add(output, 0, row_indices, local_logprob.squeeze(1)) + output.index_copy_(0, row_indices, local_logprob.squeeze(1)) used_rows += row_indices.numel() if used_rows != batch_size: - raise RuntimeError(f"Target values should be in [0, {self.n_classes - 1}], " - f"but values in range [{target.min().item()}, {target.max().item()}] " - "were found. ") + raise RuntimeError( + f"Target values should be in [0, {self.n_classes - 1}], " + f"but values in range [{target.min().item()}, {target.max().item()}] " + "were found. 
" + ) head_output = self.head(input) head_logprob = F.log_softmax(head_output, dim=1) - output += ops.gather(head_logprob, 1, gather_inds.unsqueeze(1)).squeeze() + output += head_logprob.gather(1, gather_inds.unsqueeze(1)).squeeze() loss = (-output).mean() if not is_batched: @@ -246,21 +268,22 @@ def forward(self, input_: Tensor, target_: Tensor) -> _ASMoutput: def _get_full_log_prob(self, input, head_output): """Given input tensor, and output of ``self.head``, compute the log of the full distribution.""" - out = ops.zeros((head_output.shape[0], self.n_classes), dtype=input.dtype) + out = input.new_empty((head_output.size(0), self.n_classes)) head_logprob = F.log_softmax(head_output, dim=1) - out[:, :self.shortlist_size] = head_logprob[:, :self.shortlist_size] + out[:, : self.shortlist_size] = head_logprob[:, : self.shortlist_size] for i, (start_idx, stop_idx) in enumerate(zip(self.cutoffs, self.cutoffs[1:])): cluster_output = self.tail[i](input) cluster_logprob = F.log_softmax(cluster_output, dim=1) - output_logprob = cluster_logprob + head_logprob[:, self.shortlist_size + i].unsqueeze(1) + output_logprob = cluster_logprob + head_logprob[ + :, self.shortlist_size + i + ].unsqueeze(1) out[:, start_idx:stop_idx] = output_logprob return out - def log_prob(self, input: Tensor) -> Tensor: r"""Compute log probabilities for all :math:`\texttt{n\_classes}`. @@ -280,8 +303,6 @@ def log_prob(self, input: Tensor) -> Tensor: head_output = self.head(input) return self._get_full_log_prob(input, head_output) - - def predict(self, input: Tensor) -> Tensor: r"""Return the class with the highest probability for each example in the input minibatch. @@ -298,8 +319,8 @@ def predict(self, input: Tensor) -> Tensor: - Output: :math:`(N)` """ head_output = self.head(input) - output = ops.argmax(head_output, dim=1) - not_in_shortlist = (output >= self.shortlist_size) + output = mindtorch.argmax(head_output, dim=1) + not_in_shortlist = output >= self.shortlist_size all_in_shortlist = not (not_in_shortlist.any()) if all_in_shortlist: @@ -307,10 +328,11 @@ def predict(self, input: Tensor) -> Tensor: elif not_in_shortlist.all(): log_prob = self._get_full_log_prob(input, head_output) - return ops.argmax(log_prob, dim=1) + return mindtorch.argmax(log_prob, dim=1) else: - log_prob = self._get_full_log_prob(input[not_in_shortlist], - head_output[not_in_shortlist]) - output[not_in_shortlist] = ops.argmax(log_prob, dim=1) - return output + log_prob = self._get_full_log_prob( + input[not_in_shortlist], head_output[not_in_shortlist] + ) + output[not_in_shortlist] = mindtorch.argmax(log_prob, dim=1) + return output \ No newline at end of file diff --git a/mindtorch/nn/modules/batchnorm.py b/mindtorch/nn/modules/batchnorm.py index d4b6b2e5a..122add395 100644 --- a/mindtorch/nn/modules/batchnorm.py +++ b/mindtorch/nn/modules/batchnorm.py @@ -1,13 +1,26 @@ -"""batch norm""" -from typing import Optional, Any -from mindtorch import Tensor +# mypy: allow-untyped-defs +from typing import Any, Optional + import mindtorch -from ..parameter import Parameter +from mindtorch import Tensor +from mindtorch.nn import functional as F, init +from mindtorch.nn.parameter import Parameter, UninitializedBuffer, UninitializedParameter +# from ._functions import SyncBatchNorm as sync_batch_norm +from .lazy import LazyModuleMixin from .module import Module -from .. import init -from ... import ops -from .. 
import functional as F
+
+
+__all__ = [
+    "BatchNorm1d",
+    "LazyBatchNorm1d",
+    "BatchNorm2d",
+    "LazyBatchNorm2d",
+    "BatchNorm3d",
+    "LazyBatchNorm3d",
+    "SyncBatchNorm",
+]
+
 
 class _NormBase(Module):
     """Common base of _InstanceNorm and _BatchNorm."""
 
@@ -16,33 +29,42 @@ class _NormBase(Module):
     __constants__ = ["track_running_stats", "momentum", "eps", "num_features", "affine"]
     num_features: int
     eps: float
-    momentum: float
+    momentum: Optional[float]
     affine: bool
     track_running_stats: bool
     # WARNING: weight and bias purposely not defined here.
     # See https://github.com/pytorch/pytorch/issues/39670
 
     def __init__(
         self,
         num_features: int,
         eps: float = 1e-5,
-        momentum: float = 0.1,
+        momentum: Optional[float] = 0.1,
         affine: bool = True,
         track_running_stats: bool = True,
-        dtype=None
+        device=None,
+        dtype=None,
     ) -> None:
-        factory_kwargs = {'dtype': dtype}
+        factory_kwargs = {"device": device, "dtype": dtype}
         super().__init__()
         self.num_features = num_features
         self.eps = eps
         self.momentum = momentum
         self.affine = affine
         self.track_running_stats = track_running_stats
-        self.weight = Parameter(ops.empty(num_features, **factory_kwargs), affine)
-        self.bias = Parameter(ops.empty(num_features, **factory_kwargs), affine)
+        if self.affine:
+            self.weight = Parameter(mindtorch.empty(num_features, **factory_kwargs))
+            self.bias = Parameter(mindtorch.empty(num_features, **factory_kwargs))
+        else:
+            self.register_parameter("weight", None)
+            self.register_parameter("bias", None)
         if self.track_running_stats:
-            self.register_buffer('running_mean', ops.zeros(num_features,))
-            self.register_buffer('running_var', ops.ones(num_features,))
+            self.register_buffer(
+                "running_mean", mindtorch.zeros(num_features, **factory_kwargs)
+            )
+            self.register_buffer(
+                "running_var", mindtorch.ones(num_features, **factory_kwargs)
+            )
             self.running_mean: Optional[Tensor]
             self.running_var: Optional[Tensor]
             self.register_buffer(
@@ -64,9 +86,9 @@ def reset_running_stats(self) -> None:
         if self.track_running_stats:
            # running_mean/running_var/num_batches... 
are registered at runtime depending # if self.track_running_stats is on - init.zeros_(self.running_mean) # type: ignore[union-attr] - init.ones_(self.running_var) # type: ignore[union-attr] - init.zeros_(self.num_batches_tracked) # type: ignore[union-attr,operator] + self.running_mean.zero_() # type: ignore[union-attr] + self.running_var.fill_(1) # type: ignore[union-attr] + self.num_batches_tracked.zero_() # type: ignore[union-attr,operator] def reset_parameters(self) -> None: self.reset_running_stats() @@ -83,18 +105,53 @@ def extra_repr(self): "track_running_stats={track_running_stats}".format(**self.__dict__) ) + def _load_from_state_dict( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ) -> None: + version = local_metadata.get("version", None) + + if (version is None or version < 2) and self.track_running_stats: + # at version 2: added num_batches_tracked buffer + # this should have a default value of 0 + num_batches_tracked_key = prefix + "num_batches_tracked" + if num_batches_tracked_key not in state_dict: + state_dict[num_batches_tracked_key] = ( + self.num_batches_tracked + if self.num_batches_tracked is not None + and self.num_batches_tracked.device != mindtorch.device("meta") + else mindtorch.tensor(0, dtype=mindtorch.long) + ) + + super()._load_from_state_dict( + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ) + class _BatchNorm(_NormBase): def __init__( self, num_features: int, eps: float = 1e-5, - momentum: float = 0.1, + momentum: Optional[float] = 0.1, affine: bool = True, track_running_stats: bool = True, - dtype=None + device=None, + dtype=None, ) -> None: - factory_kwargs = {'dtype': dtype} + factory_kwargs = {"device": device, "dtype": dtype} super().__init__( num_features, eps, momentum, affine, track_running_stats, **factory_kwargs ) @@ -113,7 +170,7 @@ def forward(self, input: Tensor) -> Tensor: if self.training and self.track_running_stats: # TODO: if statement only here to tell the jit to skip emitting this when it is None if self.num_batches_tracked is not None: # type: ignore[has-type] - self.num_batches_tracked = self.num_batches_tracked + 1 # type: ignore[has-type] + self.num_batches_tracked.add_(1) # type: ignore[has-type] if self.momentum is None: # use cumulative moving average exponential_average_factor = 1.0 / float(self.num_batches_tracked) else: # use exponential moving average @@ -136,9 +193,11 @@ def forward(self, input: Tensor) -> Tensor: return F.batch_norm( input, # If buffers are not to be tracked, ensure that they won't be updated - self.running_mean - if not self.training or self.track_running_stats - else None, + ( + self.running_mean + if not self.training or self.track_running_stats + else None + ), self.running_var if not self.training or self.track_running_stats else None, self.weight, self.bias, @@ -148,6 +207,66 @@ def forward(self, input: Tensor) -> Tensor: ) +class _LazyNormBase(LazyModuleMixin, _NormBase): + weight: UninitializedParameter # type: ignore[assignment] + bias: UninitializedParameter # type: ignore[assignment] + + def __init__( + self, + eps=1e-5, + momentum=0.1, + affine=True, + track_running_stats=True, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__( + # affine and track_running_stats are hardcoded to False to + # avoid creating tensors that will soon be overwritten. 
+ 0, + eps, + momentum, + False, + False, + **factory_kwargs, + ) + self.affine = affine + self.track_running_stats = track_running_stats + if self.affine: + self.weight = UninitializedParameter(**factory_kwargs) + self.bias = UninitializedParameter(**factory_kwargs) + if self.track_running_stats: + self.running_mean = UninitializedBuffer(**factory_kwargs) + self.running_var = UninitializedBuffer(**factory_kwargs) + self.num_batches_tracked = mindtorch.tensor( + 0, + dtype=mindtorch.long, + **{k: v for k, v in factory_kwargs.items() if k != "dtype"}, + ) + + def reset_parameters(self) -> None: + if not self.has_uninitialized_params() and self.num_features != 0: + super().reset_parameters() + + def initialize_parameters(self, input) -> None: # type: ignore[override] + if self.has_uninitialized_params(): + self.num_features = input.shape[1] + if self.affine: + assert isinstance(self.weight, UninitializedParameter) + assert isinstance(self.bias, UninitializedParameter) + self.weight.materialize((self.num_features,)) + self.bias.materialize((self.num_features,)) + if self.track_running_stats: + self.running_mean.materialize( # type:ignore[union-attr] + (self.num_features,) + ) + self.running_var.materialize( # type:ignore[union-attr] + (self.num_features,) + ) + self.reset_parameters() + + class BatchNorm1d(_BatchNorm): r"""Applies Batch Normalization over a 2D or 3D input. @@ -163,9 +282,9 @@ class BatchNorm1d(_BatchNorm): the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors of size `C` (where `C` is the number of features or channels of the input). By default, the elements of :math:`\gamma` are set to 1 and the elements of :math:`\beta` are set to 0. - At train time in the forward pass, the standard-deviation is calculated via the biased estimator, + At train time in the forward pass, the variance is calculated via the biased estimator, equivalent to ``mindtorch.var(input, unbiased=False)``. However, the value stored in the - moving average of the standard-deviation is calculated via the unbiased estimator, equivalent to + moving average of the variance is calculated via the unbiased estimator, equivalent to ``mindtorch.var(input, unbiased=True)``. Also by default, during training this layer keeps running estimates of its @@ -219,11 +338,43 @@ class BatchNorm1d(_BatchNorm): >>> output = m(input) """ - def _check_input_dim(self, input): + def _check_input_dim(self, input) -> None: if input.dim() != 2 and input.dim() != 3: - raise ValueError( - f"expected 2D or 3D input (got {input.dim()}D input)" - ) + raise ValueError(f"expected 2D or 3D input (got {input.dim()}D input)") + + +class LazyBatchNorm1d(_LazyNormBase, _BatchNorm): + r"""A :class:`mindtorch.nn.BatchNorm1d` module with lazy initialization. + + Lazy initialization based on the ``num_features`` argument of the :class:`BatchNorm1d` that is inferred + from the ``input.size(1)``. + The attributes that will be lazily initialized are `weight`, `bias`, + `running_mean` and `running_var`. + + Check the :class:`mindtorch.nn.modules.lazy.LazyModuleMixin` for further documentation + on lazy modules and their limitations. + + Args: + eps: a value added to the denominator for numerical stability. + Default: 1e-5 + momentum: the value used for the running_mean and running_var + computation. Can be set to ``None`` for cumulative moving average + (i.e. simple average). Default: 0.1 + affine: a boolean value that when set to ``True``, this module has + learnable affine parameters. 
Default: ``True``
+        track_running_stats: a boolean value that when set to ``True``, this
+            module tracks the running mean and variance, and when set to ``False``,
+            this module does not track such statistics, and initializes statistics
+            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
+            When these buffers are ``None``, this module always uses batch statistics
+            in both training and eval modes. Default: ``True``
+    """
+
+    cls_to_become = BatchNorm1d  # type: ignore[assignment]
+
+    def _check_input_dim(self, input) -> None:
+        if input.dim() != 2 and input.dim() != 3:
+            raise ValueError(f"expected 2D or 3D input (got {input.dim()}D input)")
 
 
 class BatchNorm2d(_BatchNorm):
@@ -298,7 +449,41 @@ class BatchNorm2d(_BatchNorm):
         >>> output = m(input)
     """
 
-    def _check_input_dim(self, input):
+    def _check_input_dim(self, input) -> None:
+        if input.dim() != 4:
+            raise ValueError(f"expected 4D input (got {input.dim()}D input)")
+
+
+class LazyBatchNorm2d(_LazyNormBase, _BatchNorm):
+    r"""A :class:`mindtorch.nn.BatchNorm2d` module with lazy initialization.
+
+    Lazy initialization is done for the ``num_features`` argument of the :class:`BatchNorm2d` that is inferred
+    from the ``input.size(1)``.
+    The attributes that will be lazily initialized are `weight`, `bias`,
+    `running_mean` and `running_var`.
+
+    Check the :class:`mindtorch.nn.modules.lazy.LazyModuleMixin` for further documentation
+    on lazy modules and their limitations.
+
+    Args:
+        eps: a value added to the denominator for numerical stability.
+            Default: 1e-5
+        momentum: the value used for the running_mean and running_var
+            computation. Can be set to ``None`` for cumulative moving average
+            (i.e. simple average). Default: 0.1
+        affine: a boolean value that when set to ``True``, this module has
+            learnable affine parameters. Default: ``True``
+        track_running_stats: a boolean value that when set to ``True``, this
+            module tracks the running mean and variance, and when set to ``False``,
+            this module does not track such statistics, and initializes statistics
+            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
+            When these buffers are ``None``, this module always uses batch statistics
+            in both training and eval modes. Default: ``True``
+    """
+
+    cls_to_become = BatchNorm2d  # type: ignore[assignment]
+
+    def _check_input_dim(self, input) -> None:
+        if input.dim() != 4:
+            raise ValueError(f"expected 4D input (got {input.dim()}D input)")
 
 
@@ -375,10 +560,45 @@ class BatchNorm3d(_BatchNorm):
         >>> output = m(input)
     """
 
-    def _check_input_dim(self, input):
+    def _check_input_dim(self, input) -> None:
         if input.dim() != 5:
             raise ValueError(f"expected 5D input (got {input.dim()}D input)")
 
+
+class LazyBatchNorm3d(_LazyNormBase, _BatchNorm):
+    r"""A :class:`mindtorch.nn.BatchNorm3d` module with lazy initialization.
+
+    Lazy initialization is done for the ``num_features`` argument of the :class:`BatchNorm3d` that is inferred
+    from the ``input.size(1)``.
+    The attributes that will be lazily initialized are `weight`, `bias`,
+    `running_mean` and `running_var`.
+
+    Check the :class:`mindtorch.nn.modules.lazy.LazyModuleMixin` for further documentation
+    on lazy modules and their limitations.
+
+    Args:
+        eps: a value added to the denominator for numerical stability.
+            Default: 1e-5
+        momentum: the value used for the running_mean and running_var
+            computation. Can be set to ``None`` for cumulative moving average
+            (i.e. simple average). 
Default: 0.1
+        affine: a boolean value that when set to ``True``, this module has
+            learnable affine parameters. Default: ``True``
+        track_running_stats: a boolean value that when set to ``True``, this
+            module tracks the running mean and variance, and when set to ``False``,
+            this module does not track such statistics, and initializes statistics
+            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
+            When these buffers are ``None``, this module always uses batch statistics
+            in both training and eval modes. Default: ``True``
+    """
+
+    cls_to_become = BatchNorm3d  # type: ignore[assignment]
+
+    def _check_input_dim(self, input) -> None:
+        if input.dim() != 5:
+            raise ValueError(f"expected 5D input (got {input.dim()}D input)")
+
+
 class SyncBatchNorm(_BatchNorm):
     r"""Applies Batch Normalization over a N-Dimensional input.
@@ -396,7 +616,7 @@ class SyncBatchNorm(_BatchNorm):
     By default, the elements of :math:`\gamma` are sampled from
     :math:`\mathcal{U}(0, 1)` and the elements of :math:`\beta` are set to 0. The
     standard-deviation is calculated via the biased estimator, equivalent to
-    `torch.var(input, unbiased=False)`.
+    `mindtorch.var(input, unbiased=False)`.
 
     Also by default, during training this layer keeps running estimates of its
     computed mean and variance, which are then used for normalization during
@@ -420,8 +640,8 @@ class SyncBatchNorm(_BatchNorm):
     Normalization or Spatio-temporal Batch Normalization.
 
     Currently :class:`SyncBatchNorm` only supports
-    :class:`~torch.nn.DistributedDataParallel` (DDP) with single GPU per process. Use
-    :meth:`torch.nn.SyncBatchNorm.convert_sync_batchnorm()` to convert
+    :class:`~mindtorch.nn.DistributedDataParallel` (DDP) with single GPU per process. Use
+    :meth:`mindtorch.nn.SyncBatchNorm.convert_sync_batchnorm()` to convert
     :attr:`BatchNorm*D` layer to :class:`SyncBatchNorm` before wrapping
     Network with DDP.
@@ -466,17 +686,17 @@ class SyncBatchNorm(_BatchNorm):
         >>> # Note: every rank calls into new_group for every
         >>> # process group created, even if that rank is not
         >>> # part of the group.
-        >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]]
+        >>> process_groups = [mindtorch.distributed.new_group(pids) for pids in [r1, r2]]
         >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1]
         >>> # Without Learnable Parameters
         >>> m = nn.BatchNorm3d(100, affine=False, process_group=process_group)
-        >>> input = torch.randn(20, 100, 35, 45, 10)
+        >>> input = mindtorch.randn(20, 100, 35, 45, 10)
         >>> output = m(input)
 
         >>> # network is nn.BatchNorm layer
         >>> sync_bn_network = nn.SyncBatchNorm.convert_sync_batchnorm(network, process_group)
         >>> # only single gpu per process is currently supported
-        >>> ddp_sync_bn_network = torch.nn.parallel.DistributedDataParallel(
+        >>> ddp_sync_bn_network = mindtorch.nn.parallel.DistributedDataParallel(
         >>>                         sync_bn_network,
         >>>                         device_ids=[args.local_rank],
         >>>                         output_device=args.local_rank)
@@ -499,17 +719,20 @@ def __init__(
         )
         self.process_group = process_group
 
-    def _check_input_dim(self, input):
+    def _check_input_dim(self, input) -> None:
         if input.dim() < 2:
             raise ValueError(f"expected at least 2D input (got {input.dim()}D input)")
 
-    def _check_non_zero_input_channels(self, input):
+    def _check_non_zero_input_channels(self, input) -> None:
         if input.size(1) == 0:
             raise ValueError(
                 "SyncBatchNorm number of input channels should be non-zero"
             )
 
     def forward(self, input: Tensor) -> Tensor:
+        """
+        Runs the forward pass. 
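+
+        Cross-process synchronization is only attempted when the layer is
+        computing batch statistics in training mode and ``mindtorch.distributed``
+        is available and initialized with a world size greater than 1; otherwise
+        this falls back to the regular (non-synchronized) batch normalization path.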
+ """ self._check_input_dim(input) self._check_non_zero_input_channels(input) @@ -555,25 +778,26 @@ def forward(self, input: Tensor) -> Tensor: need_sync = ( bn_training and self.training - and torch.distributed.is_available() - and torch.distributed.is_initialized() + and mindtorch.distributed.is_available() + and mindtorch.distributed.is_initialized() ) if need_sync: # currently only GPU/PrivateUse1 input is supported if input.device.type not in [ "cuda", + "hpu", "xpu", - torch._C._get_privateuse1_backend_name(), + mindtorch._C._get_privateuse1_backend_name(), ]: raise ValueError( "SyncBatchNorm expected input tensor to be on GPU or XPU or " - f"{torch._C._get_privateuse1_backend_name()}" + f"{mindtorch._C._get_privateuse1_backend_name()}" ) - process_group = torch.distributed.group.WORLD + process_group = mindtorch.distributed.group.WORLD if self.process_group: process_group = self.process_group - world_size = torch.distributed.get_world_size(process_group) + world_size = mindtorch.distributed.get_world_size(process_group) need_sync = world_size > 1 # fallback to framework BN when synchronization is not necessary @@ -604,7 +828,7 @@ def forward(self, input: Tensor) -> Tensor: @classmethod def convert_sync_batchnorm(cls, module, process_group=None): - r"""Converts all :attr:`BatchNorm*D` layers in the model to :class:`torch.nn.SyncBatchNorm` layers. + r"""Converts all :attr:`BatchNorm*D` layers in the model to :class:`mindtorch.nn.SyncBatchNorm` layers. Args: module (nn.Module): module containing one or more :attr:`BatchNorm*D` layers @@ -612,18 +836,18 @@ def convert_sync_batchnorm(cls, module, process_group=None): default is the whole world Returns: - The original :attr:`module` with the converted :class:`torch.nn.SyncBatchNorm` + The original :attr:`module` with the converted :class:`mindtorch.nn.SyncBatchNorm` layers. If the original :attr:`module` is a :attr:`BatchNorm*D` layer, - a new :class:`torch.nn.SyncBatchNorm` layer object will be returned + a new :class:`mindtorch.nn.SyncBatchNorm` layer object will be returned instead. Example:: >>> # Network with nn.BatchNorm layer >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA) - >>> module = torch.nn.Sequential( - >>> torch.nn.Linear(20, 100), - >>> torch.nn.BatchNorm1d(100), + >>> module = mindtorch.nn.Sequential( + >>> mindtorch.nn.Linear(20, 100), + >>> mindtorch.nn.BatchNorm1d(100), >>> ).cuda() >>> # creating process group (optional) >>> # ranks is a list of int identifying rank ids. @@ -633,14 +857,14 @@ def convert_sync_batchnorm(cls, module, process_group=None): >>> # process group created, even if that rank is not >>> # part of the group. 
>>> # xdoctest: +SKIP("distributed")
-        >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]]
+        >>> process_groups = [mindtorch.distributed.new_group(pids) for pids in [r1, r2]]
         >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1]
-        >>> sync_bn_module = torch.nn.SyncBatchNorm.convert_sync_batchnorm(module, process_group)
+        >>> sync_bn_module = mindtorch.nn.SyncBatchNorm.convert_sync_batchnorm(module, process_group)
 
         """
         module_output = module
-        if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
-            module_output = torch.nn.SyncBatchNorm(
+        if isinstance(module, mindtorch.nn.modules.batchnorm._BatchNorm):
+            module_output = mindtorch.nn.SyncBatchNorm(
                 module.num_features,
                 module.eps,
                 module.momentum,
@@ -649,7 +873,7 @@ def convert_sync_batchnorm(cls, module, process_group=None):
                 process_group,
             )
             if module.affine:
-                with torch.no_grad():
+                with mindtorch.no_grad():
                     module_output.weight = module.weight
                     module_output.bias = module.bias
                 module_output.running_mean = module.running_mean
diff --git a/mindtorch/nn/modules/container.py b/mindtorch/nn/modules/container.py
index 61f1d233e..fca20650c 100644
--- a/mindtorch/nn/modules/container.py
+++ b/mindtorch/nn/modules/container.py
@@ -1,28 +1,60 @@
-"""Container"""
+# mypy: allow-untyped-defs
+from __future__ import annotations
+
 import operator
+from collections import abc as container_abcs, OrderedDict
 from itertools import chain, islice
-from typing import Any, Dict, Iterable, Iterator, Mapping, Optional, Tuple, Union, TypeVar
-from collections import OrderedDict, abc as container_abcs
-from typing_extensions import Self
+from typing import Any, Optional, overload, TYPE_CHECKING, TypeVar, Union
+from typing_extensions import deprecated, Self
 
 import mindtorch
-from ..parameter import Parameter
+from mindtorch.nn.parameter import Parameter
 from .module import Module
 
-T = TypeVar('T', bound=Module)
+if TYPE_CHECKING:
+    from collections.abc import Iterable, Iterator, Mapping
+
+
+__all__ = [
+    "Container",
+    "Sequential",
+    "ModuleList",
+    "ModuleDict",
+    "ParameterList",
+    "ParameterDict",
+]
+
+T = TypeVar("T", bound=Module)
+_V = TypeVar("_V")
+
+
+# Copied from mindtorch.nn.modules.module, required for a custom __repr__ for ModuleList
 def _addindent(s_, numSpaces):
-    s = s_.split('\n')
+    s = s_.split("\n")
     # don't do anything for single-line stuff
     if len(s) == 1:
         return s_
     first = s.pop(0)
-    s = [(numSpaces * ' ') + line for line in s]
-    s = '\n'.join(s)
-    s = first + '\n' + s
+    s = [(numSpaces * " ") + line for line in s]
+    s = "\n".join(s)
+    s = first + "\n" + s
     return s
 
+
+@deprecated(
+    "`nn.Container` is deprecated. "
+    "All of its functionality is now implemented in `nn.Module`. Subclass that instead.",
+    category=FutureWarning,
+)
+class Container(Module):
+    def __init__(self, **kwargs: Any) -> None:
+        super().__init__()
+        for key, value in kwargs.items():
+            self.add_module(key, value)
+
+
 class Sequential(Module):
     r"""A sequential container.
 
@@ -53,23 +85,30 @@ class Sequential(Module):
         # for `Conv2d(20,64,5)`. Finally, the output of
         # `Conv2d(20,64,5)` will be used as input to the second `ReLU`
         model = nn.Sequential(
-            nn.Conv2d(1,20,5),
-            nn.ReLU(),
-            nn.Conv2d(20,64,5),
-            nn.ReLU()
-        )
+            nn.Conv2d(1, 20, 5), nn.ReLU(), nn.Conv2d(20, 64, 5), nn.ReLU()
+        )
 
         # Using Sequential with OrderedDict. 
This is functionally the # same as the above code - model = nn.Sequential(OrderedDict([ - ('conv1', nn.Conv2d(1,20,5)), - ('relu1', nn.ReLU()), - ('conv2', nn.Conv2d(20,64,5)), - ('relu2', nn.ReLU()) - ])) + model = nn.Sequential( + OrderedDict( + [ + ("conv1", nn.Conv2d(1, 20, 5)), + ("relu1", nn.ReLU()), + ("conv2", nn.Conv2d(20, 64, 5)), + ("relu2", nn.ReLU()), + ] + ) + ) """ - _modules: Dict[str, Module] # type: ignore[assignment] + _modules: dict[str, Module] # type: ignore[assignment] + + @overload + def __init__(self, *args: Module) -> None: ... + + @overload + def __init__(self, arg: OrderedDict[str, Module]) -> None: ... def __init__(self, *args): super().__init__() @@ -80,16 +119,17 @@ def __init__(self, *args): for idx, module in enumerate(args): self.add_module(str(idx), module) - def _get_item_by_idx(self, iterator, idx) -> T: # type: ignore[misc, type-var] + def _get_item_by_idx(self, iterator: Iterable[_V], idx: int) -> _V: """Get the idx-th item of the iterator.""" size = len(self) idx = operator.index(idx) if not -size <= idx < size: - raise IndexError(f'index {idx} is out of range') + raise IndexError(f"index {idx} is out of range") idx %= size return next(islice(iterator, idx, None)) - def __getitem__(self, idx: Union[slice, int]) -> Union['Sequential', T]: + + def __getitem__(self, idx: Union[slice, int]) -> Union[Sequential, Module]: if isinstance(idx, slice): return self.__class__(OrderedDict(list(self._modules.items())[idx])) else: @@ -110,10 +150,11 @@ def __delitem__(self, idx: Union[slice, int]) -> None: str_indices = [str(i) for i in range(len(self._modules))] self._modules = OrderedDict(list(zip(str_indices, self._modules.values()))) + def __len__(self) -> int: return len(self._modules) - def __add__(self, other) -> 'Sequential': + def __add__(self, other) -> Sequential: if isinstance(other, Sequential): ret = Sequential() for layer in self: @@ -122,10 +163,15 @@ def __add__(self, other) -> 'Sequential': ret.append(layer) return ret else: - raise ValueError('add operator supports only objects ' - f'of Sequential class, but {str(type(other))} is given.') + raise ValueError( + "add operator supports only objects " + f"of Sequential class, but {str(type(other))} is given." + ) def pop(self, key: Union[int, slice]) -> Module: + """ + Pop ``key`` from self. + """ v = self[key] del self[key] return v @@ -137,14 +183,20 @@ def __iadd__(self, other) -> Self: self.add_module(str(i + offset), module) return self else: - raise ValueError('add operator supports only objects ' - f'of Sequential class, but {str(type(other))} is given.') + raise ValueError( + "add operator supports only objects " + f"of Sequential class, but {str(type(other))} is given." 
+ ) - def __mul__(self, other: int) -> 'Sequential': + def __mul__(self, other: int) -> Sequential: if not isinstance(other, int): - raise TypeError(f"unsupported operand type(s) for *: {type(self)} and {type(other)}") - elif (other <= 0): - raise ValueError(f"Non-positive multiplication factor {other} for {type(self)}") + raise TypeError( + f"unsupported operand type(s) for *: {type(self)} and {type(other)}" + ) + elif other <= 0: + raise ValueError( + f"Non-positive multiplication factor {other} for {type(self)}" + ) else: combined = Sequential() offset = 0 @@ -154,14 +206,18 @@ def __mul__(self, other: int) -> 'Sequential': offset += 1 return combined - def __rmul__(self, other: int) -> 'Sequential': + def __rmul__(self, other: int) -> Sequential: return self.__mul__(other) def __imul__(self, other: int) -> Self: if not isinstance(other, int): - raise TypeError(f"unsupported operand type(s) for *: {type(self)} and {type(other)}") - elif (other <= 0): - raise ValueError(f"Non-positive multiplication factor {other} for {type(self)}") + raise TypeError( + f"unsupported operand type(s) for *: {type(self)} and {type(other)}" + ) + elif other <= 0: + raise ValueError( + f"Non-positive multiplication factor {other} for {type(self)}" + ) else: len_original = len(self) offset = len(self) @@ -171,11 +227,13 @@ def __imul__(self, other: int) -> Self: offset += len_original return self - def __dir__(self): + + def __dir__(self) -> list[str]: keys = super().__dir__() keys = [key for key in keys if not key.isdigit()] return keys + def __iter__(self) -> Iterator[Module]: return iter(self._modules.values()) @@ -184,38 +242,59 @@ def __iter__(self) -> Iterator[Module]: # TestScript.test_sequential_intermediary_types). Cannot annotate # with Any as TorchScript expects a more precise type def forward(self, input): - if self.__ms_class__: - return self.jit_forward(input) - return self.slow_forward(input) - - def slow_forward(self, input): + """ + Runs the forward pass. + """ for module in self: input = module(input) return input - def jit_forward(self, input): - for module in self._modules.values(): - input = module(input) - return input - - def append(self, module: Module) -> 'Sequential': + def append(self, module: Module) -> Self: r"""Append a given module to the end. Args: module (nn.Module): module to append + + Example:: + + >>> import mindtorch.nn as nn + >>> n = nn.Sequential(nn.Linear(1, 2), nn.Linear(2, 3)) + >>> n.append(nn.Linear(3, 4)) + Sequential( + (0): Linear(in_features=1, out_features=2, bias=True) + (1): Linear(in_features=2, out_features=3, bias=True) + (2): Linear(in_features=3, out_features=4, bias=True) + ) + """ self.add_module(str(len(self)), module) return self + def insert(self, index: int, module: Module) -> Self: + """ + Inserts a module into the Sequential container at the specified index. + + Args: + index (int): The index to insert the module. + module (Module): The module to be inserted. 
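+
+        Note:
+            A negative ``index`` counts from the end of the container,
+            mirroring ``list.insert``.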
+ + Example:: + + >>> import mindtorch.nn as nn + >>> n = nn.Sequential(nn.Linear(1, 2), nn.Linear(2, 3)) + >>> n.insert(0, nn.Linear(3, 4)) + Sequential( + (0): Linear(in_features=3, out_features=4, bias=True) + (1): Linear(in_features=1, out_features=2, bias=True) + (2): Linear(in_features=2, out_features=3, bias=True) + ) - def insert(self, index: int, module: Module) -> 'Sequential': + """ if not isinstance(module, Module): - raise AssertionError( - f'module should be of type: {Module}') + raise AssertionError(f"module should be of type: {Module}") n = len(self._modules) if not (-n <= index <= n): - raise IndexError( - f'Index out of range: {index}') + raise IndexError(f"Index out of range: {index}") if index < 0: index += n for i in range(n, index, -1): @@ -223,7 +302,27 @@ def insert(self, index: int, module: Module) -> 'Sequential': self._modules[str(index)] = module return self - def extend(self, sequential) -> 'Sequential': + def extend(self, sequential: Iterable[Module]) -> Self: + """ + Extends the current Sequential container with layers from another Sequential container. + + Args: + sequential (Sequential): A Sequential container whose layers will be added to the current container. + + Example:: + + >>> import mindtorch.nn as nn + >>> n = nn.Sequential(nn.Linear(1, 2), nn.Linear(2, 3)) + >>> other = nn.Sequential(nn.Linear(3, 4), nn.Linear(4, 5)) + >>> n.extend(other) # or `n + other` + Sequential( + (0): Linear(in_features=1, out_features=2, bias=True) + (1): Linear(in_features=2, out_features=3, bias=True) + (2): Linear(in_features=3, out_features=4, bias=True) + (3): Linear(in_features=4, out_features=5, bias=True) + ) + + """ for layer in sequential: self.append(layer) return self @@ -242,7 +341,7 @@ class ModuleList(Module): Example:: class MyModule(nn.Module): - def __init__(self): + def __init__(self) -> None: super().__init__() self.linears = nn.ModuleList([nn.Linear(10, 10) for i in range(10)]) @@ -253,7 +352,7 @@ def forward(self, x): return x """ - _modules: Dict[str, Module] # type: ignore[assignment] + _modules: dict[str, Module] # type: ignore[assignment] def __init__(self, modules: Optional[Iterable[Module]] = None) -> None: super().__init__() @@ -264,12 +363,19 @@ def _get_abs_string_index(self, idx): """Get the absolute index for the list of modules.""" idx = operator.index(idx) if not (-len(self) <= idx < len(self)): - raise IndexError(f'index {idx} is out of range') + raise IndexError(f"index {idx} is out of range") if idx < 0: idx += len(self) return str(idx) - def __getitem__(self, idx: Union[int, slice]) -> Union[Module, 'ModuleList']: + @overload + def __getitem__(self, idx: slice) -> ModuleList: ... + + @overload + def __getitem__(self, idx: int) -> Module: ... 
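+    # The overloads above mirror built-in list indexing: an int index returns
+    # the single Module at that position, while a slice returns a new
+    # ModuleList holding the selected entries (see the implementation below).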
+ + + def __getitem__(self, idx: Union[int, slice]) -> Union[Module, ModuleList]: if isinstance(idx, slice): return self.__class__(list(self._modules.values())[idx]) else: @@ -289,26 +395,28 @@ def __delitem__(self, idx: Union[int, slice]) -> None: str_indices = [str(i) for i in range(len(self._modules))] self._modules = OrderedDict(list(zip(str_indices, self._modules.values()))) + def __len__(self) -> int: return len(self._modules) + def __iter__(self) -> Iterator[Module]: return iter(self._modules.values()) def __iadd__(self, modules: Iterable[Module]) -> Self: return self.extend(modules) - def __add__(self, other: Iterable[Module]) -> 'ModuleList': + def __add__(self, other: Iterable[Module]) -> ModuleList: combined = ModuleList() for i, module in enumerate(chain(self, other)): combined.add_module(str(i), module) return combined - def __repr__(self): + def __repr__(self) -> str: """Return a custom repr for ModuleList that compresses repeated module representations.""" list_of_reprs = [repr(item) for item in self] if len(list_of_reprs) == 0: - return self._get_name() + '()' + return self._get_name() + "()" start_end_indices = [[0, 0]] repeated_blocks = [list_of_reprs[0]] @@ -321,7 +429,7 @@ def __repr__(self): repeated_blocks.append(r) lines = [] - main_str = self._get_name() + '(' + main_str = self._get_name() + "(" for (start_id, end_id), b in zip(start_end_indices, repeated_blocks): local_repr = f"({start_id}): {b}" # default repr @@ -332,11 +440,12 @@ def __repr__(self): local_repr = _addindent(local_repr, 2) lines.append(local_repr) - main_str += '\n ' + '\n '.join(lines) + '\n' - main_str += ')' + main_str += "\n " + "\n ".join(lines) + "\n" + main_str += ")" return main_str - def __dir__(self): + + def __dir__(self) -> list[str]: keys = super().__dir__() keys = [key for key in keys if not key.isdigit()] return keys @@ -352,8 +461,7 @@ def insert(self, index: int, module: Module) -> None: self._modules[str(i)] = self._modules[str(i - 1)] self._modules[str(index)] = module - - def append(self, module: Module) -> 'ModuleList': + def append(self, module: Module) -> Self: r"""Append a given module to the end of the list. Args: @@ -362,7 +470,6 @@ def append(self, module: Module) -> 'ModuleList': self.add_module(str(len(self)), module) return self - def pop(self, key: Union[int, slice]) -> Module: v = self[key] del self[key] @@ -375,13 +482,17 @@ def extend(self, modules: Iterable[Module]) -> Self: modules (iterable): iterable of modules to append """ if not isinstance(modules, container_abcs.Iterable): - raise TypeError("ModuleList.extend should be called with an " - "iterable, but got " + type(modules).__name__) + raise TypeError( + "ModuleList.extend should be called with an " + "iterable, but got " + type(modules).__name__ + ) offset = len(self) for i, module in enumerate(modules): self.add_module(str(offset + i), module) return self + # remove forward altogether to fallback on Module's _forward_unimplemented + class ModuleDict(Module): r"""Holds submodules in a dictionary. 
@@ -410,16 +521,14 @@ class MyModule(nn.Module):
-            def __init__(self):
+            def __init__(self) -> None:
                 super().__init__()
-                self.choices = nn.ModuleDict({
-                    'conv': nn.Conv2d(10, 10, 3),
-                    'pool': nn.MaxPool2d(3)
-                })
-                self.activations = nn.ModuleDict([
-                    ['lrelu', nn.LeakyReLU()],
-                    ['prelu', nn.PReLU()]
-                ])
+                self.choices = nn.ModuleDict(
+                    {"conv": nn.Conv2d(10, 10, 3), "pool": nn.MaxPool2d(3)}
+                )
+                self.activations = nn.ModuleDict(
+                    [["lrelu", nn.LeakyReLU()], ["prelu", nn.PReLU()]]
+                )
 
         def forward(self, x, choice, act):
             x = self.choices[choice](x)
@@ -427,13 +536,14 @@ def forward(self, x, choice, act):
             return x
     """
 
-    _modules: Dict[str, Module]  # type: ignore[assignment]
+    _modules: dict[str, Module]  # type: ignore[assignment]
 
     def __init__(self, modules: Optional[Mapping[str, Module]] = None) -> None:
        super().__init__()
        if modules is not None:
            self.update(modules)
 
+
     def __getitem__(self, key: str) -> Module:
         return self._modules[key]
 
@@ -443,12 +553,15 @@ def __setitem__(self, key: str, module: Module) -> None:
     def __delitem__(self, key: str) -> None:
         del self._modules[key]
 
+
     def __len__(self) -> int:
         return len(self._modules)
 
+
     def __iter__(self) -> Iterator[str]:
         return iter(self._modules)
 
+
     def __contains__(self, key: str) -> bool:
         return key in self._modules
 
@@ -456,7 +569,6 @@ def clear(self) -> None:
         """Remove all items from the ModuleDict."""
         self._modules.clear()
 
-
     def pop(self, key: str) -> Module:
         r"""Remove key from the ModuleDict and return its module.
 
@@ -468,15 +580,17 @@ def pop(self, key: str) -> Module:
         return v
 
-    def keys(self) -> Iterable[str]:
+    def keys(self) -> container_abcs.KeysView[str]:
         r"""Return an iterable of the ModuleDict keys."""
         return self._modules.keys()
 
-    def items(self) -> Iterable[Tuple[str, Module]]:
+
+    def items(self) -> container_abcs.ItemsView[str, Module]:
         r"""Return an iterable of the ModuleDict key/value pairs."""
         return self._modules.items()
 
-    def values(self) -> Iterable[Module]:
+
+    def values(self) -> container_abcs.ValuesView[Module]:
         r"""Return an iterable of the ModuleDict values."""
         return self._modules.values()
 
@@ -492,9 +606,10 @@ def update(self, modules: Mapping[str, Module]) -> None:
             or an iterable of key-value pairs of type (string, :class:`~mindtorch.nn.Module`)
         """
         if not isinstance(modules, container_abcs.Iterable):
-            raise TypeError("ModuleDict.update should be called with an "
-                            "iterable of key/value pairs, but got " +
-                            type(modules).__name__)
+            raise TypeError(
+                "ModuleDict.update should be called with an "
+                "iterable of key/value pairs, but got " + type(modules).__name__
+            )
 
         if isinstance(modules, (OrderedDict, ModuleDict, container_abcs.Mapping)):
             for key, module in modules.items():
@@ -503,90 +618,169 @@ def update(self, modules: Mapping[str, Module]) -> None:
             # modules here can be a list with two items
             for j, m in enumerate(modules):
                 if not isinstance(m, container_abcs.Iterable):
-                    raise TypeError("ModuleDict update sequence element "
-                                    "#" + str(j) + " should be Iterable; is" +
-                                    type(m).__name__)
+                    raise TypeError(
+                        "ModuleDict update sequence element "
+                        "#" + str(j) + " should be Iterable; is " + type(m).__name__
+                    )
                 if not len(m) == 2:
-                    raise ValueError("ModuleDict update sequence element "
-                                     "#" + str(j) + " has length " + str(len(m)) +
-                                     "; 2 is required")
+                    raise ValueError(
+                        "ModuleDict update sequence element "
+                        "#" + str(j) + " has length " + str(len(m)) + "; 2 is required"
+                    )
                 # modules can be Mapping (what it's typed at), or a list: [(name1, module1),
(name2, module2)] # that's too cumbersome to type correctly with overloads, so we add an ignore here self[m[0]] = m[1] # type: ignore[assignment] + # remove forward altogether to fallback on Module's _forward_unimplemented - # remove forward alltogether to fallback on Module's _forward_unimplemented class ParameterList(Module): r"""Holds parameters in a list. - ParameterList can be indexed like a regular Python list, but parameters it - contains are properly registered, and will be visible by all Module methods. + :class:`~mindtorch.nn.ParameterList` can be used like a regular Python + list, but Tensors that are :class:`~mindtorch.nn.Parameter` are properly registered, + and will be visible by all :class:`~mindtorch.nn.Module` methods. + + Note that the constructor, assigning an element of the list, the + :meth:`~mindtorch.nn.ParameterList.append` method and the :meth:`~mindtorch.nn.ParameterList.extend` + method will convert any :class:`~mindtorch.Tensor` into :class:`~mindtorch.nn.Parameter`. - Arguments: - modules (list, optional): a list of :class:`~mindtorch.nn.Parameter`` to add + Args: + parameters (iterable, optional): an iterable of elements to add to the list. Example:: class MyModule(nn.Module): - def __init__(self): - super(MyModule, self).__init__() - self.params = nn.ParameterList([nn.Parameter(mindtorch.randn(10, 10)) for i in range(10)]) + def __init__(self) -> None: + super().__init__() + self.params = nn.ParameterList( + [nn.Parameter(mindtorch.randn(10, 10)) for i in range(10)] + ) def forward(self, x): - # ModuleList can act as an iterable, or be indexed using ints + # ParameterList can act as an iterable, or be indexed using ints for i, p in enumerate(self.params): x = self.params[i // 2].mm(x) + p.mm(x) return x """ - def __init__(self, parameters=None): - super(ParameterList, self).__init__() - if parameters is not None: - self += parameters + def __init__(self, values: Optional[Iterable[Any]] = None) -> None: + super().__init__() + self._size = 0 + if values is not None: + self += values - def __getitem__(self, idx): + def _get_abs_string_index(self, idx): + """Get the absolute index for the list of modules.""" + idx = operator.index(idx) if not (-len(self) <= idx < len(self)): - raise IndexError('index {} is out of range'.format(idx)) + raise IndexError(f"index {idx} is out of range") if idx < 0: idx += len(self) - return self._parameters[str(idx)] + return str(idx) + + @overload + def __getitem__(self, idx: int) -> Any: ... + + @overload + def __getitem__(self: T, idx: slice) -> T: ... - def __setitem__(self, idx, param): - return self.register_parameter(str(idx), param) + def __getitem__(self, idx): + if isinstance(idx, slice): + start, stop, step = idx.indices(len(self)) + out = self.__class__() + for i in range(start, stop, step): + out.append(self[i]) + return out + else: + idx = self._get_abs_string_index(idx) + return getattr(self, str(idx)) + + def __setitem__(self, idx: int, param: Any) -> None: + # Note that all other function that add an entry to the list part of + # the ParameterList end up here. So this is the only place where we need + # to wrap things into Parameter if needed. + # Objects added via setattr() are not in the list part and thus won't + # call into this function. 
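+        # For example: assigning mindtorch.randn(2, 2) through this method
+        # stores a Parameter, while assigning a plain object (e.g. a dict)
+        # stores it unchanged.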
+ idx = self._get_abs_string_index(idx) + if isinstance(param, mindtorch.Tensor) and not isinstance(param, Parameter): + param = Parameter(param) + return setattr(self, str(idx), param) - def __len__(self): - return len(self._parameters) + def __len__(self) -> int: + return self._size - def __iter__(self): - return iter(self._parameters.values()) + def __iter__(self) -> Iterator[Any]: + return iter(self[i] for i in range(len(self))) - def __iadd__(self, parameters): + def __iadd__(self, parameters: Iterable[Any]) -> Self: return self.extend(parameters) - def append(self, parameter): - """Appends a given parameter at the end of the list. + def __dir__(self) -> list[str]: + keys = super().__dir__() + keys = [key for key in keys if not key.isdigit()] + return keys + + def append(self, value: Any) -> Self: + """Append a given value at the end of the list. - Arguments: - parameter (nn.Parameter): parameter to append + Args: + value (Any): value to append """ - self.register_parameter(str(len(self)), parameter) + new_idx = len(self) + self._size += 1 + self[new_idx] = value return self - def extend(self, parameters): - """Appends parameters from a Python list at the end. + def extend(self, values: Iterable[Any]) -> Self: + """Append values from a Python iterable to the end of the list. - Arguments: - parameters (list): list of parameters to append + Args: + values (iterable): iterable of values to append """ - if not isinstance(parameters, list): - raise TypeError("ParameterList.extend should be called with a " - "list, but got " + type(parameters).__name__) - offset = len(self) - for i, param in enumerate(parameters): - self.register_parameter(str(offset + i), param) + # Tensor is an iterable but we never want to unpack it here + if not isinstance(values, container_abcs.Iterable) or isinstance( + values, mindtorch.Tensor + ): + raise TypeError( + "ParameterList.extend should be called with an " + "iterable, but got " + type(values).__name__ + ) + for value in values: + self.append(value) return self + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + child_lines = [] + for k, p in enumerate(self): + if isinstance(p, mindtorch.Tensor): + size_str = "x".join(str(size) for size in p.size()) + if p.device.type in ["cuda", mindtorch._C._get_privateuse1_backend_name()]: + device_str = f" ({p.device})" + else: + device_str = "" + parastr = "{} containing: [{} of size {}{}]".format( + "Parameter" if isinstance(p, Parameter) else "Tensor", + p.dtype, + size_str, + device_str, + ) + child_lines.append(" (" + str(k) + "): " + parastr) + else: + child_lines.append( + " (" + str(k) + "): Object of type: " + type(p).__name__ + ) + + tmpstr = "\n".join(child_lines) + return tmpstr + + def __call__(self, *args, **kwargs): + raise RuntimeError("ParameterList should not be called.") + + class ParameterDict(Module): r"""Holds parameters in a dictionary. 
@@ -612,12 +806,14 @@ class ParameterDict(Module): Example:: class MyModule(nn.Module): - def __init__(self): + def __init__(self) -> None: super().__init__() - self.params = nn.ParameterDict({ - 'left': nn.Parameter(mindtorch.randn(5, 10)), - 'right': nn.Parameter(mindtorch.randn(5, 10)) - }) + self.params = nn.ParameterDict( + { + "left": nn.Parameter(mindtorch.randn(5, 10)), + "right": nn.Parameter(mindtorch.randn(5, 10)), + } + ) def forward(self, x, choice): x = self.params[choice].mm(x) @@ -626,15 +822,17 @@ def forward(self, x, choice): def __init__(self, parameters: Any = None) -> None: super().__init__() - self._keys: Dict[str, None] = {} + self._keys: dict[str, None] = {} if parameters is not None: self.update(parameters) def _key_to_attr(self, key: str) -> str: if not isinstance(key, str): - raise TypeError("Index given to ParameterDict cannot be used as a key as it is " - f"not a string (type is '{type(key).__name__}'). Open an issue on " - "github if you need non-string keys.") + raise TypeError( + "Index given to ParameterDict cannot be used as a key as it is " + f"not a string (type is '{type(key).__name__}'). Open an issue on " + "github if you need non-string keys." + ) else: # Use the key as-is so that `.named_parameters()` returns the right thing return key @@ -667,15 +865,14 @@ def __iter__(self) -> Iterator[str]: return iter(self._keys) def __reversed__(self) -> Iterator[str]: - return reversed(list(self._keys)) + return reversed(self._keys) - def copy(self) -> 'ParameterDict': + def copy(self) -> ParameterDict: """Return a copy of this :class:`~mindtorch.nn.ParameterDict` instance.""" # We have to use an OrderedDict because the ParameterDict constructor # behaves differently on plain dict vs OrderedDict return ParameterDict(OrderedDict((k, self[k]) for k in self._keys)) - def __contains__(self, key: str) -> bool: return key in self._keys @@ -694,13 +891,11 @@ def setdefault(self, key: str, default: Optional[Any] = None) -> Any: self[key] = default return self[key] - def clear(self) -> None: """Remove all items from the ParameterDict.""" for k in self._keys.copy(): del self[k] - def pop(self, key: str) -> Any: r"""Remove key from the ParameterDict and return its parameter. @@ -711,8 +906,7 @@ def pop(self, key: str) -> Any: del self[key] return v - - def popitem(self) -> Tuple[str, Any]: + def popitem(self) -> tuple[str, Any]: """Remove and return the last inserted `(key, parameter)` pair from the ParameterDict.""" k, _ = self._keys.popitem() # We need the key in the _keys to be able to access/del @@ -721,7 +915,6 @@ def popitem(self) -> Tuple[str, Any]: del self[k] return k, val - def get(self, key: str, default: Optional[Any] = None) -> Any: r"""Return the parameter associated with key if present. Otherwise return default if provided, None if not. @@ -731,8 +924,9 @@ def get(self, key: str, default: Optional[Any] = None) -> Any: """ return self[key] if key in self else default - - def fromkeys(self, keys: Iterable[str], default: Optional[Any] = None) -> 'ParameterDict': + def fromkeys( + self, keys: Iterable[str], default: Optional[Any] = None + ) -> ParameterDict: r"""Return a new ParameterDict with the keys provided. 
        Args:
            keys (iterable, string): keys to make the new ParameterDict with
            default (Parameter, optional): value to set for all keys
        """
         return ParameterDict((k, default) for k in keys)
 
-
-    def keys(self) -> Iterable[str]:
+    def keys(self) -> container_abcs.KeysView[str]:
         r"""Return an iterable of the ParameterDict keys."""
         return self._keys.keys()
 
-
-    def items(self) -> Iterable[Tuple[str, Any]]:
+    def items(self) -> Iterable[tuple[str, Any]]:
         r"""Return an iterable of the ParameterDict key/value pairs."""
         return ((k, self[k]) for k in self._keys)
 
-
     def values(self) -> Iterable[Any]:
         r"""Return an iterable of the ParameterDict values."""
         return (self[k] for k in self._keys)
 
-
-    def update(self, parameters: Union[Mapping[str, Any], 'ParameterDict']) -> None:
+    def update(self, parameters: Union[Mapping[str, Any], ParameterDict]) -> None:
         r"""Update the :class:`~mindtorch.nn.ParameterDict` with key-value pairs from ``parameters``, overwriting existing keys.
 
         .. note::
@@ -770,9 +960,10 @@ def update(self, parameters: Union[Mapping[str, Any], 'ParameterDict']) -> None:
             key-value pairs of type (string, :class:`~mindtorch.nn.Parameter`)
         """
         if not isinstance(parameters, container_abcs.Iterable):
-            raise TypeError("ParametersDict.update should be called with an "
-                            "iterable of key/value pairs, but got " +
-                            type(parameters).__name__)
+            raise TypeError(
+                "ParameterDict.update should be called with an "
+                "iterable of key/value pairs, but got " + type(parameters).__name__
+            )
 
         if isinstance(parameters, (OrderedDict, ParameterDict)):
             for key, parameter in parameters.items():
@@ -783,44 +974,54 @@ def update(self, parameters: Union[Mapping[str, Any], 'ParameterDict']) -> None:
         else:
             for j, p in enumerate(parameters):
                 if not isinstance(p, container_abcs.Iterable):
-                    raise TypeError("ParameterDict update sequence element "
-                                    "#" + str(j) + " should be Iterable; is" +
-                                    type(p).__name__)
+                    raise TypeError(
+                        "ParameterDict update sequence element "
+                        "#" + str(j) + " should be Iterable; is " + type(p).__name__
+                    )
                 if not len(p) == 2:
-                    raise ValueError("ParameterDict update sequence element "
-                                     "#" + str(j) + " has length " + str(len(p)) +
-                                     "; 2 is required")
+                    raise ValueError(
+                        "ParameterDict update sequence element "
+                        "#" + str(j) + " has length " + str(len(p)) + "; 2 is required"
+                    )
                 # parameters as length-2 list too cumbersome to type, see ModuleDict.update comment
                 self[p[0]] = p[1]  # type: ignore[assignment]
 
-
     def extra_repr(self) -> str:
         child_lines = []
         for k, p in self.items():
             if isinstance(p, mindtorch.Tensor):
-                size_str = 'x'.join(str(size) for size in p.size())
-                parastr = '{} containing: [{} of size {}]'.format(
+                size_str = "x".join(str(size) for size in p.size())
+                if p.device.type in ["cuda", mindtorch._C._get_privateuse1_backend_name()]:
+                    device_str = f" ({p.device})"
+                else:
+                    device_str = ""
+                parastr = "{} containing: [{} of size {}{}]".format(
                     "Parameter" if isinstance(p, Parameter) else "Tensor",
-                    type(p), size_str)
-                child_lines.append(' (' + str(k) + '): ' + parastr)
+                    mindtorch.typename(p),
+                    size_str,
+                    device_str,
+                )
+                child_lines.append(" (" + str(k) + "): " + parastr)
             else:
-                child_lines.append(' (' + str(k) + '): Object of type: ' + type(p).__name__)
-        tmpstr = '\n'.join(child_lines)
+                child_lines.append(
+                    " (" + str(k) + "): Object of type: " + type(p).__name__
+                )
+        tmpstr = "\n".join(child_lines)
         return tmpstr
 
     def __call__(self, input):
-        raise RuntimeError('ParameterDict should not be called.')
+        raise RuntimeError("ParameterDict should not be called.")
 
-    def __or__(self, other: 
'ParameterDict') -> 'ParameterDict': + def __or__(self, other: ParameterDict) -> ParameterDict: copy = self.copy() copy.update(other) return copy - def __ror__(self, other: 'ParameterDict') -> 'ParameterDict': + def __ror__(self, other: ParameterDict) -> ParameterDict: copy = other.copy() copy.update(self) return copy - def __ior__(self, other : 'ParameterDict') -> Self: + def __ior__(self, other: ParameterDict) -> Self: self.update(other) - return self + return self \ No newline at end of file diff --git a/mindtorch/nn/modules/conv.py b/mindtorch/nn/modules/conv.py index 42dcddf17..824342d04 100644 --- a/mindtorch/nn/modules/conv.py +++ b/mindtorch/nn/modules/conv.py @@ -1,75 +1,128 @@ -# coding=utf-8 -"""conv""" +# mypy: allow-untyped-defs import math -from typing import Optional, Tuple, Union, List +from typing import Literal, Optional, Union +from typing_extensions import deprecated +import mindtorch from mindtorch import Tensor -from ..parameter import Parameter +from mindtorch.nn import functional as F, init +from mindtorch.nn.common_types import _size_1_t, _size_2_t, _size_3_t +from mindtorch.nn.parameter import Parameter, UninitializedParameter + +from .lazy import LazyModuleMixin from .module import Module -from ..common_types import _size_2_t, _size_1_t -from ._utils import _single, _pair, _reverse_repeat_tuple, _triple -from .. import init -from .. import functional as F -from ... import ops +from .utils import _pair, _reverse_repeat_tuple, _single, _triple + + +__all__ = [ + "Conv1d", + "Conv2d", + "Conv3d", + "ConvTranspose1d", + "ConvTranspose2d", + "ConvTranspose3d", + "LazyConv1d", + "LazyConv2d", + "LazyConv3d", + "LazyConvTranspose1d", + "LazyConvTranspose2d", + "LazyConvTranspose3d", +] + +convolution_notes = { + "groups_note": r"""* :attr:`groups` controls the connections between inputs and outputs. + :attr:`in_channels` and :attr:`out_channels` must both be divisible by + :attr:`groups`. For example, + + * At groups=1, all inputs are convolved to all outputs. + * At groups=2, the operation becomes equivalent to having two conv + layers side by side, each seeing half the input channels + and producing half the output channels, and both subsequently + concatenated. + * At groups= :attr:`in_channels`, each input channel is convolved with + its own set of filters (of size + :math:`\frac{\text{out\_channels}}{\text{in\_channels}}`).""", + "depthwise_separable_note": r"""When `groups == in_channels` and `out_channels == K * in_channels`, + where `K` is a positive integer, this operation is also known as a "depthwise convolution". + + In other words, for an input of size :math:`(N, C_{in}, L_{in})`, + a depthwise convolution with a depthwise multiplier `K` can be performed with the arguments + :math:`(C_\text{in}=C_\text{in}, C_\text{out}=C_\text{in} \times \text{K}, ..., \text{groups}=C_\text{in})`.""", +} # noqa: B950 class _ConvNd(Module): - - __constants__ = ['stride', 'padding', 'dilation', 'groups', - 'padding_mode', 'output_padding', 'in_channels', - 'out_channels', 'kernel_size'] - __annotations__ = {'bias': Optional[Tensor]} - - def _conv_forward(self, input: Tensor, weight: Tensor, bias: Optional[Tensor]) -> Tensor: # type: ignore[empty-body] - ... 
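+    # __constants__ lists attributes treated as constants when the module is
+    # scripted (mirroring the PyTorch convention); bias additionally gets an
+    # explicit Optional[Tensor] annotation below since it may be None.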
+ __constants__ = [ + "stride", + "padding", + "dilation", + "groups", + "padding_mode", + "output_padding", + "in_channels", + "out_channels", + "kernel_size", + ] + __annotations__ = {"bias": Optional[mindtorch.Tensor]} + + def _conv_forward( # type: ignore[empty-body] + self, input: Tensor, weight: Tensor, bias: Optional[Tensor] + ) -> Tensor: ... in_channels: int - _reversed_padding_repeated_twice: List[int] + _reversed_padding_repeated_twice: list[int] out_channels: int - kernel_size: Tuple[int, ...] - stride: Tuple[int, ...] - padding: Union[str, Tuple[int, ...]] - dilation: Tuple[int, ...] + kernel_size: tuple[int, ...] + stride: tuple[int, ...] + padding: Union[str, tuple[int, ...]] + dilation: tuple[int, ...] transposed: bool - output_padding: Tuple[int, ...] + output_padding: tuple[int, ...] groups: int - padding_mode: str + padding_mode: Literal["zeros", "reflect", "replicate", "circular"] weight: Tensor bias: Optional[Tensor] - def __init__(self, - in_channels: int, - out_channels: int, - kernel_size: Tuple[int, ...], - stride: Tuple[int, ...], - padding: Tuple[int, ...], - dilation: Tuple[int, ...], - transposed: bool, - output_padding: Tuple[int, ...], - groups: int, - bias: bool, - padding_mode: str, - dtype=None, - device=None) -> None: - factory_kwargs = {'dtype': dtype, 'device': device} + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: tuple[int, ...], + stride: tuple[int, ...], + padding: Union[str, tuple[int, ...]], + dilation: tuple[int, ...], + transposed: bool, + output_padding: tuple[int, ...], + groups: int, + bias: bool, + padding_mode: Literal["zeros", "reflect", "replicate", "circular"], + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} super().__init__() if groups <= 0: - raise ValueError('groups must be a positive integer') + raise ValueError("groups must be a positive integer") if in_channels % groups != 0: - raise ValueError('in_channels must be divisible by groups') + raise ValueError("in_channels must be divisible by groups") if out_channels % groups != 0: - raise ValueError('out_channels must be divisible by groups') - valid_padding_strings = {'same', 'valid'} + raise ValueError("out_channels must be divisible by groups") + valid_padding_strings = {"same", "valid"} if isinstance(padding, str): if padding not in valid_padding_strings: raise ValueError( - f"Invalid padding string {padding!r}, should be one of {valid_padding_strings}") - if padding == 'same' and any(s != 1 for s in stride): - raise ValueError("padding='same' is not supported for strided convolutions") + f"Invalid padding string {padding!r}, should be one of {valid_padding_strings}" + ) + if padding == "same" and any(s != 1 for s in stride): + raise ValueError( + "padding='same' is not supported for strided convolutions" + ) - valid_padding_modes = {'zeros', 'reflect', 'replicate', 'circular'} + valid_padding_modes = {"zeros", "reflect", "replicate", "circular"} if padding_mode not in valid_padding_modes: - raise ValueError(f"padding_mode must be one of {valid_padding_modes}, but got padding_mode='{padding_mode}'") + raise ValueError( + f"padding_mode must be one of {valid_padding_modes}, but got padding_mode='{padding_mode}'" + ) self.in_channels = in_channels self.out_channels = out_channels self.kernel_size = kernel_size @@ -86,34 +139,46 @@ def __init__(self, # reverse order than the dimension. 
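+        # For example, padding=(1, 2) on a Conv2d becomes [2, 2, 1, 1] here:
+        # repeated once per side and reversed to match F.pad's
+        # last-dimension-first argument order.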
        if isinstance(self.padding, str):
             self._reversed_padding_repeated_twice = [0, 0] * len(kernel_size)
-            if padding == 'same':
-                for d, k, i in zip(dilation, kernel_size,
-                                   range(len(kernel_size) - 1, -1, -1)):
+            if padding == "same":
+                for d, k, i in zip(
+                    dilation, kernel_size, range(len(kernel_size) - 1, -1, -1)
+                ):
                     total_padding = d * (k - 1)
                     left_pad = total_padding // 2
                     self._reversed_padding_repeated_twice[2 * i] = left_pad
                     self._reversed_padding_repeated_twice[2 * i + 1] = (
-                        total_padding - left_pad)
+                        total_padding - left_pad
+                    )
         else:
-            self._reversed_padding_repeated_twice = _reverse_repeat_tuple(self.padding, 2)
+            self._reversed_padding_repeated_twice = _reverse_repeat_tuple(
+                self.padding, 2
+            )
 
         if transposed:
-            self.weight = Parameter(ops.empty(
-                (in_channels, out_channels // groups, *kernel_size), **factory_kwargs))
+            self.weight = Parameter(
+                mindtorch.empty(
+                    (in_channels, out_channels // groups, *kernel_size),
+                    **factory_kwargs,
+                )
+            )
         else:
-            self.weight = Parameter(ops.empty(
-                (out_channels, in_channels // groups, *kernel_size), **factory_kwargs))
+            self.weight = Parameter(
+                mindtorch.empty(
+                    (out_channels, in_channels // groups, *kernel_size),
+                    **factory_kwargs,
+                )
+            )
         if bias:
-            self.bias = Parameter(ops.empty(out_channels, **factory_kwargs))
+            self.bias = Parameter(mindtorch.empty(out_channels, **factory_kwargs))
         else:
-            self.register_parameter('bias', None)
+            self.register_parameter("bias", None)
 
         self.reset_parameters()
 
     def reset_parameters(self) -> None:
         # Setting a=sqrt(5) in kaiming_uniform is the same as initializing with
         # uniform(-1/sqrt(k), 1/sqrt(k)), where k = weight.size(1) * prod(*kernel_size)
         # For more details see: https://github.com/pytorch/pytorch/issues/15314#issuecomment-477448573
         init.kaiming_uniform_(self.weight, a=math.sqrt(5))
         if self.bias is not None:
             fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
@@ -122,45 +187,28 @@ def reset_parameters(self) -> None:
             init.uniform_(self.bias, -bound, bound)
 
     def extra_repr(self):
-        s = ('{in_channels}, {out_channels}, kernel_size={kernel_size}'
-             ', stride={stride}')
+        s = "{in_channels}, {out_channels}, kernel_size={kernel_size}, stride={stride}"
         if self.padding != (0,) * len(self.padding):
-            s += ', padding={padding}'
+            s += ", padding={padding}"
         if self.dilation != (1,) * len(self.dilation):
-            s += ', dilation={dilation}'
+            s += ", dilation={dilation}"
         if self.output_padding != (0,) * len(self.output_padding):
-            s += ', output_padding={output_padding}'
+            s += ", output_padding={output_padding}"
         if self.groups != 1:
-            s += ', groups={groups}'
+            s += ", groups={groups}"
         if self.bias is None:
-            s += ', bias=False'
-        if self.padding_mode != 'zeros':
-            s += ', padding_mode={padding_mode}'
+            s += ", bias=False"
+        if self.padding_mode != "zeros":
+            s += ", padding_mode={padding_mode}"
         return s.format(**self.__dict__)
 
     def __setstate__(self, state):
         super().__setstate__(state)
-        if not hasattr(self, 'padding_mode'):
-            self.padding_mode = 'zeros'
+        if not hasattr(self, "padding_mode"):
+            self.padding_mode = "zeros"
 
 
 class Conv1d(_ConvNd):
-    r"""Applies a 1D convolution over an input signal composed of several input
-    planes.
-
-    In the simplest case, the output value of the layer with input size
-    :math:`(N, C_{\text{in}}, L)` and output :math:`(N, C_{\text{out}}, L_{\text{out}})` can be
-    precisely described as:
-
-    .. 
math:: - \text{out}(N_i, C_{\text{out}_j}) = \text{bias}(C_{\text{out}_j}) + - \sum_{k = 0}^{C_{in} - 1} \text{weight}(C_{\text{out}_j}, k) - \star \text{input}(N_i, k) - - where :math:`\star` is the valid `cross-correlation`_ operator, - :math:`N` is a batch size, :math:`C` denotes a number of channels, - :math:`L` is a length of signal sequence. - """ def __init__( self, in_channels: int, @@ -171,7 +219,7 @@ def __init__( dilation: _size_1_t = 1, groups: int = 1, bias: bool = True, - padding_mode: str = "zeros", # TODO: refine this type + padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros", device=None, dtype=None, ) -> None: @@ -230,7 +278,7 @@ def __init__( dilation: _size_2_t = 1, groups: int = 1, bias: bool = True, - padding_mode: str = "zeros", # TODO: refine this type + padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros", device=None, dtype=None, ) -> None: @@ -276,27 +324,40 @@ def forward(self, input: Tensor) -> Tensor: class Conv3d(_ConvNd): + def __init__( self, in_channels: int, out_channels: int, - kernel_size: _size_2_t, - stride: _size_2_t = 1, - padding: Union[str, _size_2_t] = 0, - dilation: _size_2_t = 1, + kernel_size: _size_3_t, + stride: _size_3_t = 1, + padding: Union[str, _size_3_t] = 0, + dilation: _size_3_t = 1, groups: int = 1, bias: bool = True, - padding_mode: str = 'zeros', - dtype=None + padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros", + device=None, + dtype=None, ) -> None: - factory_kwargs = {'dtype': dtype} + factory_kwargs = {"device": device, "dtype": dtype} kernel_size_ = _triple(kernel_size) stride_ = _triple(stride) padding_ = padding if isinstance(padding, str) else _triple(padding) dilation_ = _triple(dilation) super().__init__( - in_channels, out_channels, kernel_size_, stride_, padding_, dilation_, - False, _triple(0), groups, bias, padding_mode, **factory_kwargs) + in_channels, + out_channels, + kernel_size_, + stride_, + padding_, + dilation_, + False, + _triple(0), + groups, + bias, + padding_mode, + **factory_kwargs, + ) def _conv_forward(self, input: Tensor, weight: Tensor, bias: Optional[Tensor]): if self.padding_mode != "zeros": @@ -320,23 +381,55 @@ def forward(self, input: Tensor) -> Tensor: class _ConvTransposeNd(_ConvNd): - def __init__(self, in_channels, out_channels, kernel_size, stride, - padding, dilation, transposed, output_padding, - groups, bias, padding_mode, dtype=None, device=None) -> None: - if padding_mode != 'zeros': - raise ValueError(f'Only "zeros" padding mode is supported for {self.__class__.__name__}') + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + bias, + padding_mode, + device=None, + dtype=None, + ) -> None: + if padding_mode != "zeros": + raise ValueError( + f'Only "zeros" padding mode is supported for {self.__class__.__name__}' + ) - factory_kwargs = {'dtype': dtype, 'device': device} + factory_kwargs = {"device": device, "dtype": dtype} super().__init__( - in_channels, out_channels, kernel_size, stride, - padding, dilation, transposed, output_padding, - groups, bias, padding_mode, **factory_kwargs) + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + bias, + padding_mode, + **factory_kwargs, + ) # dilation being an optional parameter is for backwards # compatibility - def _output_padding(self, input: Tensor, output_size: Optional[List[int]], - stride: List[int], 
padding: List[int], kernel_size: List[int], - num_spatial_dims: int, dilation: Optional[List[int]] = None) -> List[int]: + def _output_padding( + self, + input: Tensor, + output_size: Optional[list[int]], + stride: list[int], + padding: list[int], + kernel_size: list[int], + num_spatial_dims: int, + dilation: Optional[list[int]] = None, + ) -> list[int]: if output_size is None: ret = _single(self.output_padding) # converting to list if was not already else: @@ -346,14 +439,20 @@ def _output_padding(self, input: Tensor, output_size: Optional[List[int]], output_size = output_size[num_non_spatial_dims:] if len(output_size) != num_spatial_dims: raise ValueError( - f"ConvTranspose{num_spatial_dims}D: for {input.dim()}D input, output_size must have {num_spatial_dims} or {num_non_spatial_dims + num_spatial_dims} elements (got {len(output_size)})") + f"ConvTranspose{num_spatial_dims}D: for {input.dim()}D input, output_size must have {num_spatial_dims} " + f"or {num_non_spatial_dims + num_spatial_dims} elements (got {len(output_size)})" + ) - min_sizes = [] - max_sizes = [] + min_sizes = mindtorch.jit.annotate(list[int], []) + max_sizes = mindtorch.jit.annotate(list[int], []) for d in range(num_spatial_dims): - dim_size = ((input.size(d + num_non_spatial_dims) - 1) * stride[d] - - 2 * padding[d] + - (dilation[d] if dilation is not None else 1) * (kernel_size[d] - 1) + 1) + dim_size = ( + (input.size(d + num_non_spatial_dims) - 1) * stride[d] + - 2 * padding[d] + + (dilation[d] if dilation is not None else 1) + * (kernel_size[d] - 1) + + 1 + ) min_sizes.append(dim_size) max_sizes.append(min_sizes[d] + stride[d] - 1) @@ -364,67 +463,18 @@ def _output_padding(self, input: Tensor, output_size: Optional[List[int]], if size < min_size or size > max_size: raise ValueError( f"requested an output size of {output_size}, but valid sizes range " - f"from {min_sizes} to {max_sizes} (for an input of {input.size()[2:]})") + f"from {min_sizes} to {max_sizes} (for an input of {input.size()[2:]})" + ) - res = [] + res = mindtorch.jit.annotate(list[int], []) for d in range(num_spatial_dims): res.append(output_size[d] - min_sizes[d]) ret = res return ret -class ConvTranspose1d(_ConvTransposeNd): - """Applies a 1D transposed convolution operator over an input image - composed of several input planes. - - This module can be seen as the gradient of Conv1d with respect to its input. - It is also known as a fractionally-strided convolution or - a deconvolution (although it is not an actual deconvolution operation). - - | :attr:`stride` controls the stride for the cross-correlation. - | If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides - for :attr:`padding` number of points. - | If :attr:`output_padding` is non-zero, then the output is implicitly zero-padded on one side - for :attr:`output_padding` number of points. - | :attr:`dilation` controls the spacing between the kernel points; also known as the à trous algorithm. - It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does. - | :attr:`groups` controls the connections between inputs and outputs. `in_channels` and `out_channels` - must both be divisible by `groups`. - | At groups=1, all inputs are convolved to all outputs. - | At groups=2, the operation becomes equivalent to having two conv layers - side by side, each seeing half the input channels, - and producing half the output channels, and both subsequently concatenated. 
- At groups=`in_channels`, each input channel is convolved with its own set of filters - (of size `out_channels // in_channels`). - - .. note:: - - Depending of the size of your kernel, several (of the last) - columns of the input might be lost, because it is a valid `cross-correlation`_, - and not a full `cross-correlation`_. - It is up to the user to add proper padding. - Args: - in_channels (int): Number of channels in the input image - out_channels (int): Number of channels produced by the convolution - kernel_size (int or tuple): Size of the convolving kernel - stride (int or tuple, optional): Stride of the convolution - padding (int or tuple, optional): Zero-padding added to both sides of the input - output_padding (int or tuple, optional): Zero-padding added to one side of the output - groups (int, optional): Number of blocked connections from input channels to output channels - bias (bool, optional): If True, adds a learnable bias to the output - dilation (int or tuple, optional): Spacing between kernel elements - - Shape: - - Input: :math:`(N, C_{in}, L_{in})` - - Output: :math:`(N, C_{out}, L_{out})` where - :math:`L_{out} = (L_{in} - 1) * stride - 2 * padding + kernel\_size + output\_padding` - - Attributes: - weight (Tensor): the learnable weights of the module of shape - (in_channels, out_channels, kernel_size[0], kernel_size[1]) - bias (Tensor): the learnable bias of the module of shape (out_channels) - """ +class ConvTranspose1d(_ConvTransposeNd): def __init__( self, @@ -437,7 +487,7 @@ def __init__( groups: int = 1, bias: bool = True, dilation: _size_1_t = 1, - padding_mode: str = "zeros", + padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros", device=None, dtype=None, ) -> None: @@ -493,103 +543,7 @@ def forward(self, input: Tensor, output_size: Optional[list[int]] = None) -> Ten ) -def _deconv_output_length(pad_mode, filter_size, stride_size, dilation_size, padding): - """Calculate the width and height of output.""" - length = 0 - filter_size = filter_size + (filter_size - 1) * (dilation_size - 1) - if pad_mode == 'valid': - if filter_size - stride_size > 0: - length = filter_size - stride_size - elif pad_mode == 'pad': - length = - padding + filter_size - stride_size - - return length - class ConvTranspose2d(_ConvTransposeNd): - r"""Applies a 2D transposed convolution operator over an input image - composed of several input planes. - - This module can be seen as the gradient of Conv2d with respect to its input. - It is also known as a fractionally-strided convolution or - a deconvolution (although it is not an actual deconvolution operation). - - | :attr:`stride` controls the stride for the cross-correlation. - | If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides - for :attr:`padding` number of points. - | If :attr:`output_padding` is non-zero, then the output is implicitly zero-padded on one side - for :attr:`output_padding` number of points. - | :attr:`dilation` controls the spacing between the kernel points; also known as the à trous algorithm. - It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does. - | :attr:`groups` controls the connections between inputs and outputs. `in_channels` and `out_channels` - must both be divisible by `groups`. - | At groups=1, all inputs are convolved to all outputs. 
- | At groups=2, the operation becomes equivalent to having two conv layers - side by side, each seeing half the input channels, - and producing half the output channels, and both subsequently concatenated. - At groups=`in_channels`, each input channel is convolved with its own set of filters - (of size `out_channels // in_channels`). - - The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`output_padding` - can either be: - - - a single ``int`` -- in which case the same value is used for the height and width dimensions - - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension, - and the second `int` for the width dimension - - .. note:: - - Depending of the size of your kernel, several (of the last) - columns of the input might be lost, because it is a valid `cross-correlation`_, - and not a full `cross-correlation`_. - It is up to the user to add proper padding. - - Args: - in_channels (int): Number of channels in the input image - out_channels (int): Number of channels produced by the convolution - kernel_size (int or tuple): Size of the convolving kernel - stride (int or tuple, optional): Stride of the convolution - padding (int or tuple, optional): Zero-padding added to both sides of the input - output_padding (int or tuple, optional): Zero-padding added to one side of the output - groups (int, optional): Number of blocked connections from input channels to output channels - bias (bool, optional): If True, adds a learnable bias to the output - dilation (int or tuple, optional): Spacing between kernel elements - - Shape: - - Input: :math:`(N, C_{in}, H_{in}, W_{in})` - - Output: :math:`(N, C_{out}, H_{out}, W_{out})` where - :math:`H_{out} = (H_{in} - 1) * stride[0] - 2 * padding[0] + kernel\_size[0] + output\_padding[0]` - :math:`W_{out} = (W_{in} - 1) * stride[1] - 2 * padding[1] + kernel\_size[1] + output\_padding[1]` - - Attributes: - weight (Tensor): the learnable weights of the module of shape - (in_channels, out_channels, kernel_size[0], kernel_size[1]) - bias (Tensor): the learnable bias of the module of shape (out_channels) - - Examples:: - - >>> # With square kernels and equal stride - >>> m = nn.ConvTranspose2d(16, 33, 3, stride=2) - >>> # non-square kernels and unequal stride and with padding - >>> m = nn.ConvTranspose2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2)) - >>> input = autograd.Variable(mindtorch.randn(20, 16, 50, 100)) - >>> output = m(input) - >>> # exact output size can be also specified as an argument - >>> input = autograd.Variable(mindtorch.randn(1, 16, 12, 12)) - >>> downsample = nn.Conv2d(16, 16, 3, stride=2, padding=1) - >>> upsample = nn.ConvTranspose2d(16, 16, 3, stride=2, padding=1) - >>> h = downsample(input) - >>> h.size() - mindtorch.Size([1, 16, 6, 6]) - >>> output = upsample(h, output_size=input.size()) - >>> output.size() - mindtorch.Size([1, 16, 12, 12]) - - .. _cross-correlation: - https://en.wikipedia.org/wiki/Cross-correlation - - .. 
_link: - https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md - """ def __init__( self, @@ -602,7 +556,7 @@ def __init__( groups: int = 1, bias: bool = True, dilation: _size_2_t = 1, - padding_mode: str = "zeros", + padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros", device=None, dtype=None, ) -> None: @@ -666,103 +620,582 @@ def forward(self, input: Tensor, output_size: Optional[list[int]] = None) -> Ten self.dilation, ) -class ConvTranspose3d(_ConvTransposeNd): - r"""Applies a 3D transposed convolution operator over an input image composed of several input - planes. - The transposed convolution operator multiplies each input value element-wise by a learnable kernel, - and sums over the outputs from all input feature planes. - - This module can be seen as the gradient of Conv3d with respect to its input. - It is also known as a fractionally-strided convolution or - a deconvolution (although it is not an actual deconvolution operation). - - | :attr:`stride` controls the stride for the cross-correlation. - | If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides - for :attr:`padding` number of points. - | If :attr:`output_padding` is non-zero, then the output is implicitly zero-padded on one side - for :attr:`output_padding` number of points. - | :attr:`dilation` controls the spacing between the kernel points; also known as the à trous algorithm. - It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does. - | :attr:`groups` controls the connections between inputs and outputs. `in_channels` and `out_channels` - must both be divisible by `groups`. - | At groups=1, all inputs are convolved to all outputs. - | At groups=2, the operation becomes equivalent to having two conv layers - side by side, each seeing half the input channels, - and producing half the output channels, and both subsequently concatenated. - At groups=`in_channels`, each input channel is convolved with its own set of filters - (of size `out_channels // in_channels`). - - The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`, :attr:`output_padding` - can either be: - - - a single ``int`` -- in which case the same value is used for the depth, height and width dimensions - - a ``tuple`` of three ints -- in which case, the first `int` is used for the depth dimension, - the second `int` for the height dimension and the third `int` for the width dimension - - .. note:: - - Depending of the size of your kernel, several (of the last) - columns of the input might be lost, because it is a valid `cross-correlation`_, - and not a full `cross-correlation`_. - It is up to the user to add proper padding. 
- Args: - in_channels (int): Number of channels in the input image - out_channels (int): Number of channels produced by the convolution - kernel_size (int or tuple): Size of the convolving kernel - stride (int or tuple, optional): Stride of the convolution - padding (int or tuple, optional): Zero-padding added to both sides of the input - output_padding (int or tuple, optional): Zero-padding added to one side of the output - groups (int, optional): Number of blocked connections from input channels to output channels - bias (bool, optional): If True, adds a learnable bias to the output - dilation (int or tuple, optional): Spacing between kernel elements - - Shape: - - Input: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` - - Output: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` where - :math:`D_{out} = (D_{in} - 1) * stride[0] - 2 * padding[0] + kernel\_size[0] + output\_padding[0]` - :math:`H_{out} = (H_{in} - 1) * stride[1] - 2 * padding[1] + kernel\_size[1] + output\_padding[1]` - :math:`W_{out} = (W_{in} - 1) * stride[2] - 2 * padding[2] + kernel\_size[2] + output\_padding[2]` - - Attributes: - weight (Tensor): the learnable weights of the module of shape - (in_channels, out_channels, kernel_size[0], kernel_size[1], kernel_size[2]) - bias (Tensor): the learnable bias of the module of shape (out_channels) - - Examples:: - - >>> # With square kernels and equal stride - >>> m = nn.ConvTranspose3d(16, 33, 3, stride=2) - >>> # non-square kernels and unequal stride and with padding - >>> m = nn.Conv3d(16, 33, (3, 5, 2), stride=(2, 1, 1), padding=(0, 4, 2)) - >>> input = autograd.Variable(mindtorch.randn(20, 16, 10, 50, 100)) - >>> output = m(input) - - .. _cross-correlation: - https://en.wikipedia.org/wiki/Cross-correlation - - .. _link: - https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md - """ +class ConvTranspose3d(_ConvTransposeNd): - def __init__(self, in_channels, out_channels, kernel_size, stride=1, - padding=0, output_padding=0, groups=1, bias=True, dilation=1): + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: _size_3_t, + stride: _size_3_t = 1, + padding: _size_3_t = 0, + output_padding: _size_3_t = 0, + groups: int = 1, + bias: bool = True, + dilation: _size_3_t = 1, + padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros", + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} kernel_size = _triple(kernel_size) stride = _triple(stride) padding = _triple(padding) dilation = _triple(dilation) output_padding = _triple(output_padding) - super(ConvTranspose3d, self).__init__( - in_channels, out_channels, kernel_size, stride, padding, dilation, - True, output_padding, groups, bias) + super().__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + True, + output_padding, + groups, + bias, + padding_mode, + **factory_kwargs, + ) + + def forward(self, input: Tensor, output_size: Optional[list[int]] = None) -> Tensor: + if self.padding_mode != "zeros": + raise ValueError( + "Only `zeros` padding mode is supported for ConvTranspose3d" + ) + + assert isinstance(self.padding, tuple) + # One cannot replace List by Tuple or Sequence in "_output_padding" because + # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`. 
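+        # output_size, when given, selects one concrete shape out of the
+        # stride-dependent range of valid transposed-conv output sizes;
+        # _output_padding translates it into a per-dimension output_padding.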
+ num_spatial_dims = 3 + output_padding = self._output_padding( + input, + output_size, + self.stride, # type: ignore[arg-type] + self.padding, # type: ignore[arg-type] + self.kernel_size, # type: ignore[arg-type] + num_spatial_dims, + self.dilation, # type: ignore[arg-type] + ) - def forward(self, input, output_size=None): - output_padding = self._output_padding(input, output_size) return F.conv_transpose3d( - input, self.weight, self.bias, self.stride, self.padding, - output_padding, self.groups, self.dilation) + input, + self.weight, + self.bias, + self.stride, + self.padding, + output_padding, + self.groups, + self.dilation, + ) + + +# TODO: Deprecate and remove the following alias `_ConvTransposeMixin`. +# +# `_ConvTransposeMixin` was a mixin that was removed. It is meant to be used +# with `_ConvNd` to construct actual module classes that implements conv +# transpose ops: +# +# class MyConvTranspose(_ConvNd, _ConvTransposeMixin): +# ... +# +# In PyTorch, it has been replaced by `_ConvTransposeNd`, which is a proper +# subclass of `_ConvNd`. However, some user code in the wild still (incorrectly) +# use the internal class `_ConvTransposeMixin`. Hence, we provide this alias +# for BC, because it is cheap and easy for us to do so, even though that +# `_ConvTransposeNd` is really not a mixin anymore (but multiple inheritance as +# above would still work). +class _ConvTransposeMixin(_ConvTransposeNd): + @deprecated( + "`_ConvTransposeMixin` is a deprecated internal class. " + "Please consider using public APIs.", + category=FutureWarning, + ) + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) # TODO: Conv2dLocal # TODO: Conv2dMap # TODO: ConvTranspose2dMap + + +class _LazyConvXdMixin(LazyModuleMixin): + groups: int + transposed: bool + in_channels: int + out_channels: int + kernel_size: tuple[int, ...] + weight: UninitializedParameter + bias: UninitializedParameter + + def reset_parameters(self) -> None: + # has_uninitialized_params is defined in parent class and it is using a protocol on self + if not self.has_uninitialized_params() and self.in_channels != 0: # type: ignore[misc] + # "type:ignore[..]" is required because mypy thinks that "reset_parameters" is undefined + # in super class. Turns out that it is defined in _ConvND which is inherited by any class + # that also inherits _LazyConvXdMixin + super().reset_parameters() # type: ignore[misc] + + # Signature of "initialize_parameters" is incompatible with the definition in supertype LazyModuleMixin + def initialize_parameters(self, input: Tensor, *args, **kwargs) -> None: # type: ignore[override] + # defined by parent class but using a protocol + if self.has_uninitialized_params(): # type: ignore[misc] + self.in_channels = self._get_in_channels(input) + if self.in_channels % self.groups != 0: + raise ValueError("in_channels must be divisible by groups") + assert isinstance(self.weight, UninitializedParameter) + if self.transposed: + self.weight.materialize( + ( + self.in_channels, + self.out_channels // self.groups, + *self.kernel_size, + ) + ) + else: + self.weight.materialize( + ( + self.out_channels, + self.in_channels // self.groups, + *self.kernel_size, + ) + ) + if self.bias is not None: + assert isinstance(self.bias, UninitializedParameter) + self.bias.materialize((self.out_channels,)) + self.reset_parameters() + + # Function to extract in_channels from first input. 
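+    # For a batched (N, C, *spatial) input this returns C (shape[1]); for an
+    # unbatched (C, *spatial) input it returns shape[0].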
+    def _get_in_channels(self, input: Tensor) -> int:
+        num_spatial_dims = self._get_num_spatial_dims()
+        num_dims_no_batch = num_spatial_dims + 1  # +1 for channels dim
+        num_dims_batch = num_dims_no_batch + 1
+        if input.dim() not in (num_dims_no_batch, num_dims_batch):
+            raise RuntimeError(
+                f"Expected {num_dims_no_batch}D (unbatched) or {num_dims_batch}D (batched) input "
+                f"to {self.__class__.__name__}, but "
+                f"got input of size: {input.shape}"
+            )
+        return input.shape[1] if input.dim() == num_dims_batch else input.shape[0]
+
+    # Function to return the number of spatial dims expected for inputs to the module.
+    # This is expected to be implemented by subclasses.
+    def _get_num_spatial_dims(self) -> int:
+        raise NotImplementedError
+
+
+# LazyConv1d defines weight as a Tensor but derived class defines it as UninitializedParameter
+class LazyConv1d(_LazyConvXdMixin, Conv1d):  # type: ignore[misc]
+    r"""A :class:`mindtorch.nn.Conv1d` module with lazy initialization of the ``in_channels`` argument.
+
+    The ``in_channels`` argument of the :class:`Conv1d` is inferred from the ``input.size(1)``.
+    The attributes that will be lazily initialized are `weight` and `bias`.
+
+    Check the :class:`mindtorch.nn.modules.lazy.LazyModuleMixin` for further documentation
+    on lazy modules and their limitations.
+
+    Args:
+        out_channels (int): Number of channels produced by the convolution
+        kernel_size (int or tuple): Size of the convolving kernel
+        stride (int or tuple, optional): Stride of the convolution. Default: 1
+        padding (int or tuple, optional): Zero-padding added to both sides of
+            the input. Default: 0
+        dilation (int or tuple, optional): Spacing between kernel
+            elements. Default: 1
+        groups (int, optional): Number of blocked connections from input
+            channels to output channels. Default: 1
+        bias (bool, optional): If ``True``, adds a learnable bias to the
+            output. Default: ``True``
+        padding_mode (str, optional): ``'zeros'``, ``'reflect'``,
+            ``'replicate'`` or ``'circular'``. Default: ``'zeros'``
+
+    .. seealso:: :class:`mindtorch.nn.Conv1d` and :class:`mindtorch.nn.modules.lazy.LazyModuleMixin`
+    """
+
+    # The superclass defines this variable as None. "type: ignore[..]" is required
+    # since we are redefining the variable.
+    cls_to_become = Conv1d  # type: ignore[assignment]
+
+    def __init__(
+        self,
+        out_channels: int,
+        kernel_size: _size_1_t,
+        stride: _size_1_t = 1,
+        padding: _size_1_t = 0,
+        dilation: _size_1_t = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros",
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__(
+            0,
+            0,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            # bias is hardcoded to False to avoid creating tensor
+            # that will soon be overwritten.
+            False,
+            padding_mode,
+            **factory_kwargs,
+        )
+        self.weight = UninitializedParameter(**factory_kwargs)
+        self.out_channels = out_channels
+        if bias:
+            self.bias = UninitializedParameter(**factory_kwargs)
+
+    def _get_num_spatial_dims(self) -> int:
+        return 1
+
+
+# LazyConv2d defines weight as a Tensor but derived class defines it as UninitializedParameter
+class LazyConv2d(_LazyConvXdMixin, Conv2d):  # type: ignore[misc]
+    r"""A :class:`mindtorch.nn.Conv2d` module with lazy initialization of the ``in_channels`` argument.
+
+    The ``in_channels`` argument of the :class:`Conv2d` is inferred from the ``input.size(1)``.
+    The attributes that will be lazily initialized are `weight` and `bias`.
+
+    Check the :class:`mindtorch.nn.modules.lazy.LazyModuleMixin` for further documentation
+    on lazy modules and their limitations.
+
+    Args:
+        out_channels (int): Number of channels produced by the convolution
+        kernel_size (int or tuple): Size of the convolving kernel
+        stride (int or tuple, optional): Stride of the convolution. Default: 1
+        padding (int or tuple, optional): Zero-padding added to both sides of
+            the input. Default: 0
+        dilation (int or tuple, optional): Spacing between kernel
+            elements. Default: 1
+        groups (int, optional): Number of blocked connections from input
+            channels to output channels. Default: 1
+        bias (bool, optional): If ``True``, adds a learnable bias to the
+            output. Default: ``True``
+        padding_mode (str, optional): ``'zeros'``, ``'reflect'``,
+            ``'replicate'`` or ``'circular'``. Default: ``'zeros'``
+
+    .. seealso:: :class:`mindtorch.nn.Conv2d` and :class:`mindtorch.nn.modules.lazy.LazyModuleMixin`
+    """
+
+    # The super class defines this variable as None. "type: ignore[..]" is required
+    # since we are redefining the variable.
+    cls_to_become = Conv2d  # type: ignore[assignment]
+
+    def __init__(
+        self,
+        out_channels: int,
+        kernel_size: _size_2_t,
+        stride: _size_2_t = 1,
+        padding: _size_2_t = 0,
+        dilation: _size_2_t = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros",
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__(
+            0,
+            0,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            # bias is hardcoded to False to avoid creating a tensor
+            # that will soon be overwritten.
+            False,
+            padding_mode,
+            **factory_kwargs,
+        )
+        self.weight = UninitializedParameter(**factory_kwargs)
+        self.out_channels = out_channels
+        if bias:
+            self.bias = UninitializedParameter(**factory_kwargs)
+
+    def _get_num_spatial_dims(self) -> int:
+        return 2
+
+
+# LazyConv3d defines weight as a Tensor but the derived class defines it as UninitializedParameter
+class LazyConv3d(_LazyConvXdMixin, Conv3d):  # type: ignore[misc]
+    r"""A :class:`mindtorch.nn.Conv3d` module with lazy initialization of the ``in_channels`` argument.
+
+    The ``in_channels`` argument of the :class:`Conv3d` is inferred from
+    the ``input.size(1)``.
+    The attributes that will be lazily initialized are `weight` and `bias`.
+
+    Check the :class:`mindtorch.nn.modules.lazy.LazyModuleMixin` for further documentation
+    on lazy modules and their limitations.
+
+    Args:
+        out_channels (int): Number of channels produced by the convolution
+        kernel_size (int or tuple): Size of the convolving kernel
+        stride (int or tuple, optional): Stride of the convolution. Default: 1
+        padding (int or tuple, optional): Zero-padding added to both sides of
+            the input. Default: 0
+        dilation (int or tuple, optional): Spacing between kernel
+            elements. Default: 1
+        groups (int, optional): Number of blocked connections from input
+            channels to output channels. Default: 1
+        bias (bool, optional): If ``True``, adds a learnable bias to the
+            output. Default: ``True``
+        padding_mode (str, optional): ``'zeros'``, ``'reflect'``,
+            ``'replicate'`` or ``'circular'``. Default: ``'zeros'``
+
+    .. seealso:: :class:`mindtorch.nn.Conv3d` and :class:`mindtorch.nn.modules.lazy.LazyModuleMixin`
+    """
+
+    # The super class defines this variable as None. "type: ignore[..]" is required
+    # since we are redefining the variable.
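+    # (A note on the mechanics, stated here as an aside: once the first forward
+    # pass has materialized the parameters, the lazy mixin reassigns the
+    # instance's class to `cls_to_become`, so the module then behaves as a
+    # plain Conv3d.)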
+    cls_to_become = Conv3d  # type: ignore[assignment]
+
+    def __init__(
+        self,
+        out_channels: int,
+        kernel_size: _size_3_t,
+        stride: _size_3_t = 1,
+        padding: _size_3_t = 0,
+        dilation: _size_3_t = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros",
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__(
+            0,
+            0,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            # bias is hardcoded to False to avoid creating a tensor
+            # that will soon be overwritten.
+            False,
+            padding_mode,
+            **factory_kwargs,
+        )
+        self.weight = UninitializedParameter(**factory_kwargs)
+        self.out_channels = out_channels
+        if bias:
+            self.bias = UninitializedParameter(**factory_kwargs)
+
+    def _get_num_spatial_dims(self) -> int:
+        return 3
+
+
+# LazyConvTranspose1d defines weight as a Tensor but the derived class defines it as UninitializedParameter
+class LazyConvTranspose1d(_LazyConvXdMixin, ConvTranspose1d):  # type: ignore[misc]
+    r"""A :class:`mindtorch.nn.ConvTranspose1d` module with lazy initialization of the ``in_channels`` argument.
+
+    The ``in_channels`` argument of the :class:`ConvTranspose1d` is inferred from
+    the ``input.size(1)``.
+    The attributes that will be lazily initialized are `weight` and `bias`.
+
+    Check the :class:`mindtorch.nn.modules.lazy.LazyModuleMixin` for further documentation
+    on lazy modules and their limitations.
+
+    Args:
+        out_channels (int): Number of channels produced by the convolution
+        kernel_size (int or tuple): Size of the convolving kernel
+        stride (int or tuple, optional): Stride of the convolution. Default: 1
+        padding (int or tuple, optional): ``dilation * (kernel_size - 1) - padding`` zero-padding
+            will be added to both sides of the input. Default: 0
+        output_padding (int or tuple, optional): Additional size added to one side
+            of the output shape. Default: 0
+        groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
+        bias (bool, optional): If ``True``, adds a learnable bias to the output. Default: ``True``
+        dilation (int or tuple, optional): Spacing between kernel elements. Default: 1
+
+    .. seealso:: :class:`mindtorch.nn.ConvTranspose1d` and :class:`mindtorch.nn.modules.lazy.LazyModuleMixin`
+    """
+
+    # The super class defines this variable as None. "type: ignore[..]" is required
+    # since we are redefining the variable.
+    cls_to_become = ConvTranspose1d  # type: ignore[assignment]
+
+    def __init__(
+        self,
+        out_channels: int,
+        kernel_size: _size_1_t,
+        stride: _size_1_t = 1,
+        padding: _size_1_t = 0,
+        output_padding: _size_1_t = 0,
+        groups: int = 1,
+        bias: bool = True,
+        dilation: _size_1_t = 1,
+        padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros",
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__(
+            0,
+            0,
+            kernel_size,
+            stride,
+            padding,
+            output_padding,
+            groups,
+            # bias is hardcoded to False to avoid creating a tensor
+            # that will soon be overwritten.
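+            # (If bias=True was requested, the real bias is created below as an
+            # UninitializedParameter and materialized on first use.)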
+            False,
+            dilation,
+            padding_mode,
+            **factory_kwargs,
+        )
+        self.weight = UninitializedParameter(**factory_kwargs)
+        self.out_channels = out_channels
+        if bias:
+            self.bias = UninitializedParameter(**factory_kwargs)
+
+    def _get_num_spatial_dims(self) -> int:
+        return 1
+
+
+# LazyConvTranspose2d defines weight as a Tensor but the derived class defines it as UninitializedParameter
+class LazyConvTranspose2d(_LazyConvXdMixin, ConvTranspose2d):  # type: ignore[misc]
+    r"""A :class:`mindtorch.nn.ConvTranspose2d` module with lazy initialization of the ``in_channels`` argument.
+
+    The ``in_channels`` argument of the :class:`ConvTranspose2d` is inferred from
+    the ``input.size(1)``.
+    The attributes that will be lazily initialized are `weight` and `bias`.
+
+    Check the :class:`mindtorch.nn.modules.lazy.LazyModuleMixin` for further documentation
+    on lazy modules and their limitations.
+
+    Args:
+        out_channels (int): Number of channels produced by the convolution
+        kernel_size (int or tuple): Size of the convolving kernel
+        stride (int or tuple, optional): Stride of the convolution. Default: 1
+        padding (int or tuple, optional): ``dilation * (kernel_size - 1) - padding`` zero-padding
+            will be added to both sides of each dimension in the input. Default: 0
+        output_padding (int or tuple, optional): Additional size added to one side
+            of each dimension in the output shape. Default: 0
+        groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
+        bias (bool, optional): If ``True``, adds a learnable bias to the output. Default: ``True``
+        dilation (int or tuple, optional): Spacing between kernel elements. Default: 1
+
+    .. seealso:: :class:`mindtorch.nn.ConvTranspose2d` and :class:`mindtorch.nn.modules.lazy.LazyModuleMixin`
+    """
+
+    # The super class defines this variable as None. "type: ignore[..]" is required
+    # since we are redefining the variable.
+    cls_to_become = ConvTranspose2d  # type: ignore[assignment]
+
+    def __init__(
+        self,
+        out_channels: int,
+        kernel_size: _size_2_t,
+        stride: _size_2_t = 1,
+        padding: _size_2_t = 0,
+        output_padding: _size_2_t = 0,
+        groups: int = 1,
+        bias: bool = True,
+        dilation: int = 1,
+        padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros",
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__(
+            0,
+            0,
+            kernel_size,
+            stride,
+            padding,
+            output_padding,
+            groups,
+            # bias is hardcoded to False to avoid creating a tensor
+            # that will soon be overwritten.
+            False,
+            dilation,
+            padding_mode,
+            **factory_kwargs,
+        )
+        self.weight = UninitializedParameter(**factory_kwargs)
+        self.out_channels = out_channels
+        if bias:
+            self.bias = UninitializedParameter(**factory_kwargs)
+
+    def _get_num_spatial_dims(self) -> int:
+        return 2
+
+
+# LazyConvTranspose3d defines weight as a Tensor but the derived class defines it as UninitializedParameter
+class LazyConvTranspose3d(_LazyConvXdMixin, ConvTranspose3d):  # type: ignore[misc]
+    r"""A :class:`mindtorch.nn.ConvTranspose3d` module with lazy initialization of the ``in_channels`` argument.
+
+    The ``in_channels`` argument of the :class:`ConvTranspose3d` is inferred from
+    the ``input.size(1)``.
+    The attributes that will be lazily initialized are `weight` and `bias`.
+
+    Check the :class:`mindtorch.nn.modules.lazy.LazyModuleMixin` for further documentation
+    on lazy modules and their limitations.
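+
+    A minimal usage sketch (shapes are illustrative)::
+
+        >>> m = nn.LazyConvTranspose3d(out_channels=8, kernel_size=2, stride=2)
+        >>> input = mindtorch.randn(1, 4, 8, 8, 8)
+        >>> output = m(input)  # the first call infers in_channels == 4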
+
+    Args:
+        out_channels (int): Number of channels produced by the convolution
+        kernel_size (int or tuple): Size of the convolving kernel
+        stride (int or tuple, optional): Stride of the convolution. Default: 1
+        padding (int or tuple, optional): ``dilation * (kernel_size - 1) - padding`` zero-padding
+            will be added to both sides of each dimension in the input. Default: 0
+        output_padding (int or tuple, optional): Additional size added to one side
+            of each dimension in the output shape. Default: 0
+        groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
+        bias (bool, optional): If ``True``, adds a learnable bias to the output. Default: ``True``
+        dilation (int or tuple, optional): Spacing between kernel elements. Default: 1
+
+    .. seealso:: :class:`mindtorch.nn.ConvTranspose3d` and :class:`mindtorch.nn.modules.lazy.LazyModuleMixin`
+    """
+
+    # The super class defines this variable as None. "type: ignore[..]" is required
+    # since we are redefining the variable.
+    cls_to_become = ConvTranspose3d  # type: ignore[assignment]
+
+    def __init__(
+        self,
+        out_channels: int,
+        kernel_size: _size_3_t,
+        stride: _size_3_t = 1,
+        padding: _size_3_t = 0,
+        output_padding: _size_3_t = 0,
+        groups: int = 1,
+        bias: bool = True,
+        dilation: _size_3_t = 1,
+        padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros",
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__(
+            0,
+            0,
+            kernel_size,
+            stride,
+            padding,
+            output_padding,
+            groups,
+            # bias is hardcoded to False to avoid creating a tensor
+            # that will soon be overwritten.
+            False,
+            dilation,
+            padding_mode,
+            **factory_kwargs,
+        )
+        self.weight = UninitializedParameter(**factory_kwargs)
+        self.out_channels = out_channels
+        if bias:
+            self.bias = UninitializedParameter(**factory_kwargs)
+
+    def _get_num_spatial_dims(self) -> int:
+        return 3
\ No newline at end of file
diff --git a/mindtorch/nn/modules/distance.py b/mindtorch/nn/modules/distance.py
index 41b36868c..bc7a86f56 100644
--- a/mindtorch/nn/modules/distance.py
+++ b/mindtorch/nn/modules/distance.py
@@ -1,11 +1,10 @@
-"""distance"""
+import mindtorch.nn.functional as F
 from mindtorch import Tensor
 
 from .module import Module
-from .. import functional as F
 
-__all__ = ['CosineSimilarity']
+__all__ = ["PairwiseDistance", "CosineSimilarity"]
 
 
 class PairwiseDistance(Module):
@@ -35,30 +34,33 @@ class PairwiseDistance(Module):
     - Output: :math:`(N)` or :math:`()` based on input dimension.
       If :attr:`keepdim` is ``True``, then :math:`(N, 1)` or :math:`(1)` based on input dimension.
 
-    Examples::
+    Examples:
         >>> pdist = nn.PairwiseDistance(p=2)
         >>> input1 = mindtorch.randn(100, 128)
         >>> input2 = mindtorch.randn(100, 128)
         >>> output = pdist(input1, input2)
     """
 
-    __constants__ = ['norm', 'eps', 'keepdim']
+    __constants__ = ["norm", "eps", "keepdim"]
     norm: float
     eps: float
     keepdim: bool
 
-    def __init__(self, p: float = 2., eps: float = 1e-6, keepdim: bool = False) -> None:
+    def __init__(
+        self, p: float = 2.0, eps: float = 1e-6, keepdim: bool = False
+    ) -> None:
         super().__init__()
         self.norm = p
         self.eps = eps
         self.keepdim = keepdim
 
     def forward(self, x1: Tensor, x2: Tensor) -> Tensor:
+        """
+        Runs the forward pass.
+        """
         return F.pairwise_distance(x1, x2, self.norm, self.eps, self.keepdim)
 
-
-
 class CosineSimilarity(Module):
     r"""Returns cosine similarity between :math:`x_1` and :math:`x_2`, computed along `dim`.
@@ -72,16 +74,17 @@ class CosineSimilarity(Module): Shape: - Input1: :math:`(\ast_1, D, \ast_2)` where D is at position `dim` - Input2: :math:`(\ast_1, D, \ast_2)`, same number of dimensions as x1, matching x1 size at dimension `dim`, - and broadcastable with x1 at other dimensions. + and broadcastable with x1 at other dimensions. - Output: :math:`(\ast_1, \ast_2)` - Examples:: + + Examples: >>> input1 = mindtorch.randn(100, 128) >>> input2 = mindtorch.randn(100, 128) >>> cos = nn.CosineSimilarity(dim=1, eps=1e-6) >>> output = cos(input1, input2) """ - __constants__ = ['dim', 'eps'] + __constants__ = ["dim", "eps"] dim: int eps: float @@ -91,4 +94,7 @@ def __init__(self, dim: int = 1, eps: float = 1e-8) -> None: self.eps = eps def forward(self, x1: Tensor, x2: Tensor) -> Tensor: - return F.cosine_similarity(x1, x2, self.dim, self.eps) + """ + Runs the forward pass. + """ + return F.cosine_similarity(x1, x2, self.dim, self.eps) \ No newline at end of file diff --git a/mindtorch/nn/modules/dropout.py b/mindtorch/nn/modules/dropout.py index 6baa5fed2..8cafc9a1d 100644 --- a/mindtorch/nn/modules/dropout.py +++ b/mindtorch/nn/modules/dropout.py @@ -1,27 +1,35 @@ -"""dropout""" +import mindtorch.nn.functional as F from mindtorch import Tensor from .module import Module -from .. import functional as F -__all__ = ['Dropout', 'Dropout1d', 'Dropout2d', 'Dropout3d', 'AlphaDropout', 'FeatureAlphaDropout'] +__all__ = [ + "Dropout", + "Dropout1d", + "Dropout2d", + "Dropout3d", + "AlphaDropout", + "FeatureAlphaDropout", +] + class _DropoutNd(Module): - __constants__ = ['p', 'inplace'] + __constants__ = ["p", "inplace"] p: float inplace: bool def __init__(self, p: float = 0.5, inplace: bool = False) -> None: super().__init__() if p < 0 or p > 1: - raise ValueError(f"dropout probability has to be between 0 and 1, but got {p}") - self.p = float(p) + raise ValueError( + f"dropout probability has to be between 0 and 1, but got {p}" + ) + self.p = p self.inplace = inplace def extra_repr(self) -> str: - return f'p={self.p}, inplace={self.inplace}' - + return f"p={self.p}, inplace={self.inplace}" class Dropout(_DropoutNd): @@ -59,7 +67,10 @@ class Dropout(_DropoutNd): """ def forward(self, input: Tensor) -> Tensor: - return F.dropout(input, self.p, self.training) + """ + Runs the forward pass. + """ + return F.dropout(input, self.p, self.training, self.inplace) class Dropout1d(_DropoutNd): @@ -104,7 +115,10 @@ class Dropout1d(_DropoutNd): """ def forward(self, input: Tensor) -> Tensor: - return F.dropout1d(input, self.p, self.training) + """ + Runs the forward pass. + """ + return F.dropout1d(input, self.p, self.training, self.inplace) class Dropout2d(_DropoutNd): @@ -156,9 +170,10 @@ class Dropout2d(_DropoutNd): """ def forward(self, input: Tensor) -> Tensor: - return F.dropout2d(input, self.p, self.training) - - + """ + Runs the forward pass. + """ + return F.dropout2d(input, self.p, self.training, self.inplace) class Dropout3d(_DropoutNd): @@ -203,6 +218,9 @@ class Dropout3d(_DropoutNd): """ def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ return F.dropout3d(input, self.p, self.training, self.inplace) @@ -245,6 +263,9 @@ class AlphaDropout(_DropoutNd): """ def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. 
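+
+        (Alpha dropout is designed for SELU activations: unlike standard
+        dropout, it keeps the input's mean and variance approximately
+        unchanged; see the class docstring above.)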
+ """ return F.alpha_dropout(input, self.p, self.training) @@ -296,4 +317,7 @@ class FeatureAlphaDropout(_DropoutNd): """ def forward(self, input: Tensor) -> Tensor: - return F.feature_alpha_dropout(input, self.p, self.training) + """ + Runs the forward pass. + """ + return F.feature_alpha_dropout(input, self.p, self.training) \ No newline at end of file diff --git a/mindtorch/nn/modules/flatten.py b/mindtorch/nn/modules/flatten.py index 408a03f89..cd0c5ccdd 100644 --- a/mindtorch/nn/modules/flatten.py +++ b/mindtorch/nn/modules/flatten.py @@ -1,14 +1,14 @@ -"""flatten""" -from typing import Tuple, Union, List +# mypy: allow-untyped-defs +from typing import Union + from mindtorch import Tensor +from mindtorch.types import _size from .module import Module -from ...ops import flatten, unflatten -__all__ = ['Flatten', 'Unflatten'] +__all__ = ["Flatten", "Unflatten"] -_size = Union[List[int], Tuple[int, ...]] class Flatten(Module): r""" @@ -40,7 +40,7 @@ class Flatten(Module): mindtorch.Size([160, 5]) """ - __constants__ = ['start_dim', 'end_dim'] + __constants__ = ["start_dim", "end_dim"] start_dim: int end_dim: int @@ -50,12 +50,16 @@ def __init__(self, start_dim: int = 1, end_dim: int = -1) -> None: self.end_dim = end_dim def forward(self, input: Tensor) -> Tensor: - return flatten(input, self.start_dim, self.end_dim) + """ + Runs the forward pass. + """ + return input.flatten(self.start_dim, self.end_dim) def extra_repr(self) -> str: - return f'start_dim={self.start_dim}, end_dim={self.end_dim}' - - + """ + Returns the extra representation of the module. + """ + return f"start_dim={self.start_dim}, end_dim={self.end_dim}" class Unflatten(Module): @@ -98,20 +102,22 @@ class Unflatten(Module): >>> output.size() mindtorch.Size([2, 2, 5, 5]) >>> # With namedshape (tuple of tuples) - >>> input = mindtorch.randn(2, 50, names=('N', 'features')) - >>> unflatten = nn.Unflatten('features', (('C', 2), ('H', 5), ('W', 5))) + >>> input = mindtorch.randn(2, 50, names=("N", "features")) + >>> unflatten = nn.Unflatten("features", (("C", 2), ("H", 5), ("W", 5))) >>> output = unflatten(input) >>> output.size() mindtorch.Size([2, 2, 5, 5]) """ - NamedShape = Tuple[Tuple[str, int]] + NamedShape = tuple[tuple[str, int]] - __constants__ = ['dim', 'unflattened_size'] + __constants__ = ["dim", "unflattened_size"] dim: Union[int, str] unflattened_size: Union[_size, NamedShape] - def __init__(self, dim: Union[int, str], unflattened_size: Union[_size, NamedShape]) -> None: + def __init__( + self, dim: Union[int, str], unflattened_size: Union[_size, NamedShape] + ) -> None: super().__init__() if isinstance(dim, int): @@ -124,27 +130,41 @@ def __init__(self, dim: Union[int, str], unflattened_size: Union[_size, NamedSha self.dim = dim self.unflattened_size = unflattened_size - def _require_tuple_tuple(self, input): - if (isinstance(input, tuple)): + def _require_tuple_tuple(self, input) -> None: + if isinstance(input, tuple): for idx, elem in enumerate(input): if not isinstance(elem, tuple): - raise TypeError("unflattened_size must be tuple of tuples, " + - f"but found element of type {type(elem).__name__} at pos {idx}") + raise TypeError( + "unflattened_size must be tuple of tuples, " + + f"but found element of type {type(elem).__name__} at pos {idx}" + ) return - raise TypeError("unflattened_size must be a tuple of tuples, " + - f"but found type {type(input).__name__}") + raise TypeError( + "unflattened_size must be a tuple of tuples, " + + f"but found type {type(input).__name__}" + ) - def _require_tuple_int(self, 
input):
-        if (isinstance(input, (tuple, list))):
+    def _require_tuple_int(self, input) -> None:
+        if isinstance(input, (tuple, list)):
             for idx, elem in enumerate(input):
                 if not isinstance(elem, int):
-                    raise TypeError("unflattened_size must be tuple of ints, " +
-                                    f"but found element of type {type(elem).__name__} at pos {idx}")
+                    raise TypeError(
+                        "unflattened_size must be tuple of ints, "
+                        + f"but found element of type {type(elem).__name__} at pos {idx}"
+                    )
             return
-        raise TypeError(f"unflattened_size must be a tuple of ints, but found type {type(input).__name__}")
+        raise TypeError(
+            f"unflattened_size must be a tuple of ints, but found type {type(input).__name__}"
+        )
 
     def forward(self, input: Tensor) -> Tensor:
-        return unflatten(input, self.dim, self.unflattened_size)
+        """
+        Runs the forward pass.
+        """
+        return input.unflatten(self.dim, self.unflattened_size)
 
     def extra_repr(self) -> str:
-        return f'dim={self.dim}, unflattened_size={self.unflattened_size}'
+        """
+        Returns the extra representation of the module.
+        """
+        return f"dim={self.dim}, unflattened_size={self.unflattened_size}"
\ No newline at end of file
diff --git a/mindtorch/nn/modules/fold.py b/mindtorch/nn/modules/fold.py
index 9a9153b56..56a47c653 100644
--- a/mindtorch/nn/modules/fold.py
+++ b/mindtorch/nn/modules/fold.py
@@ -1,14 +1,16 @@
-"""fold module"""
+import mindtorch.nn.functional as F
 from mindtorch import Tensor
+from mindtorch.nn.common_types import _size_any_t
+
 from .module import Module
-from .. import functional as F
-from ..common_types import _size_any_t
 
-__all__ = ['Fold', 'Unfold']
+__all__ = ["Fold", "Unfold"]
+
 
 class Fold(Module):
-    r"""Combines an array of sliding local blocks into a large containing tensor.
+    (
+        r"""Combines an array of sliding local blocks into a large containing tensor.
 
     Consider a batched :attr:`input` tensor containing sliding local blocks,
     e.g., patches of images, of shape :math:`(N, C \times \prod(\text{kernel\_size}), L)`,
@@ -41,10 +43,12 @@ class Fold(Module):
 
     * :attr:`padding` controls the amount of implicit zero-paddings on both
       sides for :attr:`padding` number of points for each dimension before
      reshaping.
-
+"""
+        """
     * :attr:`dilation` controls the spacing between the kernel points; also known as the à trous algorithm.
      It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does.
-
+"""
+        r"""
     Args:
         output_size (int or tuple): the shape of the spatial dimensions of the
                                     output (i.e., ``output.sizes()[2:]``)
@@ -118,9 +122,9 @@ class Fold(Module):
         https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
 
     """
+    )
 
-    __constants__ = ['output_size', 'kernel_size', 'dilation', 'padding',
-                     'stride']
+    __constants__ = ["output_size", "kernel_size", "dilation", "padding", "stride"]
     output_size: _size_any_t
     kernel_size: _size_any_t
     dilation: _size_any_t
@@ -133,7 +137,7 @@ def __init__(
         kernel_size: _size_any_t,
         dilation: _size_any_t = 1,
         padding: _size_any_t = 0,
-        stride: _size_any_t = 1
+        stride: _size_any_t = 1,
     ) -> None:
         super().__init__()
         self.output_size = output_size
@@ -143,19 +147,33 @@ def __init__(
         self.stride = stride
 
     def forward(self, input: Tensor) -> Tensor:
-        return F.fold(input, self.output_size, self.kernel_size, self.dilation,
-                      self.padding, self.stride)
+        """
+        Runs the forward pass.
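+
+        As a shape sketch: an input of shape
+        ``(N, C * prod(kernel_size), L)`` is combined into an output of shape
+        ``(N, C, *output_size)``.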
+        """
+        return F.fold(
+            input,
+            self.output_size,
+            self.kernel_size,
+            self.dilation,
+            self.padding,
+            self.stride,
+        )
 
     def extra_repr(self) -> str:
-        return 'output_size={output_size}, kernel_size={kernel_size}, ' \
-            'dilation={dilation}, padding={padding}, stride={stride}'.format(
+        """
+        Return the extra representation of the module.
+        """
+        return (
+            "output_size={output_size}, kernel_size={kernel_size}, "
+            "dilation={dilation}, padding={padding}, stride={stride}".format(
                 **self.__dict__
             )
-
+        )
 
 class Unfold(Module):
-    r"""Extracts sliding local blocks from a batched input tensor.
+    (
+        r"""Extracts sliding local blocks from a batched input tensor.
 
     Consider a batched :attr:`input` tensor of shape :math:`(N, C, *)`,
     where :math:`N` is the batch dimension, :math:`C` is the channel dimension,
@@ -187,10 +205,12 @@ class Unfold(Module):
 
     * :attr:`padding` controls the amount of implicit zero-paddings on both
       sides for :attr:`padding` number of points for each dimension before
      reshaping.
-
+"""
+        """
    * :attr:`dilation` controls the spacing between the kernel points; also known as the à trous algorithm.
      It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does.
-
+"""
+        r"""
     Args:
         kernel_size (int or tuple): the size of the sliding blocks
         dilation (int or tuple, optional): a parameter that controls the
@@ -276,8 +296,9 @@ class Unfold(Module):
         https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
 
     """
+    )
 
-    __constants__ = ['kernel_size', 'dilation', 'padding', 'stride']
+    __constants__ = ["kernel_size", "dilation", "padding", "stride"]
     kernel_size: _size_any_t
     dilation: _size_any_t
     padding: _size_any_t
@@ -288,7 +309,7 @@ def __init__(
         kernel_size: _size_any_t,
         dilation: _size_any_t = 1,
         padding: _size_any_t = 0,
-        stride: _size_any_t = 1
+        stride: _size_any_t = 1,
     ) -> None:
         super().__init__()
         self.kernel_size = kernel_size
@@ -297,9 +318,18 @@ def __init__(
         self.stride = stride
 
     def forward(self, input: Tensor) -> Tensor:
-        return F.unfold(input, self.kernel_size, self.dilation,
-                        self.padding, self.stride)
+        """
+        Runs the forward pass.
+        """
+        return F.unfold(
+            input, self.kernel_size, self.dilation, self.padding, self.stride
+        )
 
     def extra_repr(self) -> str:
-        return 'kernel_size={kernel_size}, dilation={dilation}, padding={padding},' \
-            ' stride={stride}'.format(**self.__dict__)
+        """
+        Return the extra representation of the module.
+        """
+        return (
+            "kernel_size={kernel_size}, dilation={dilation}, padding={padding},"
+            " stride={stride}".format(**self.__dict__)
+        )
\ No newline at end of file
diff --git a/mindtorch/nn/modules/instancenorm.py b/mindtorch/nn/modules/instancenorm.py
index bc967f798..f7fe2fbb7 100644
--- a/mindtorch/nn/modules/instancenorm.py
+++ b/mindtorch/nn/modules/instancenorm.py
@@ -2,10 +2,10 @@
 
 import warnings
 
+import mindtorch.nn.functional as F
 from mindtorch import Tensor
 
-from .. import functional as F
-from .batchnorm import _NormBase
+from .batchnorm import _LazyNormBase, _NormBase
 
 
 __all__ = [
@@ -64,7 +64,7 @@ def _load_from_state_dict(
         missing_keys,
         unexpected_keys,
         error_msgs,
-    ):
+    ) -> None:
         version = local_metadata.get("version", None)
         # at version 1: removed running_mean and running_var when
         # track_running_stats=False (default)
@@ -140,7 +140,7 @@ class InstanceNorm1d(_InstanceNorm):
     for each object in a mini-batch. :math:`\gamma` and :math:`\beta` are learnable parameter vectors
     of size `C` (where `C` is the number of features or channels of the input) if :attr:`affine` is ``True``.
The variance is calculated via the biased estimator, equivalent to - `torch.var(input, unbiased=False)`. + `mindtorch.var(input, unbiased=False)`. By default, this layer uses instance statistics computed from input data in both training and evaluation modes. @@ -189,18 +189,54 @@ class InstanceNorm1d(_InstanceNorm): >>> m = nn.InstanceNorm1d(100) >>> # With Learnable Parameters >>> m = nn.InstanceNorm1d(100, affine=True) - >>> input = torch.randn(20, 100, 40) + >>> input = mindtorch.randn(20, 100, 40) >>> output = m(input) """ - def _get_no_batch_dim(self): + def _get_no_batch_dim(self) -> int: return 2 - def _check_input_dim(self, input): + def _check_input_dim(self, input) -> None: if input.dim() not in (2, 3): raise ValueError(f"expected 2D or 3D input (got {input.dim()}D input)") +class LazyInstanceNorm1d(_LazyNormBase, _InstanceNorm): + r"""A :class:`mindtorch.nn.InstanceNorm1d` module with lazy initialization of the ``num_features`` argument. + + The ``num_features`` argument of the :class:`InstanceNorm1d` is inferred from the ``input.size(1)``. + The attributes that will be lazily initialized are `weight`, `bias`, `running_mean` and `running_var`. + + Check the :class:`mindtorch.nn.modules.lazy.LazyModuleMixin` for further documentation + on lazy modules and their limitations. + + Args: + num_features: :math:`C` from an expected input of size + :math:`(N, C, L)` or :math:`(C, L)` + eps: a value added to the denominator for numerical stability. Default: 1e-5 + momentum: the value used for the running_mean and running_var computation. Default: 0.1 + affine: a boolean value that when set to ``True``, this module has + learnable affine parameters, initialized the same way as done for batch normalization. + Default: ``False``. + track_running_stats: a boolean value that when set to ``True``, this + module tracks the running mean and variance, and when set to ``False``, + this module does not track such statistics and always uses batch + statistics in both training and eval modes. Default: ``False`` + + Shape: + - Input: :math:`(N, C, L)` or :math:`(C, L)` + - Output: :math:`(N, C, L)` or :math:`(C, L)` (same shape as input) + """ + + cls_to_become = InstanceNorm1d # type: ignore[assignment] + + def _get_no_batch_dim(self) -> int: + return 2 + + def _check_input_dim(self, input) -> None: + if input.dim() not in (2, 3): + raise ValueError(f"expected 2D or 3D input (got {input.dim()}D input)") + class InstanceNorm2d(_InstanceNorm): r"""Applies Instance Normalization. @@ -219,7 +255,7 @@ class InstanceNorm2d(_InstanceNorm): for each object in a mini-batch. :math:`\gamma` and :math:`\beta` are learnable parameter vectors of size `C` (where `C` is the input size) if :attr:`affine` is ``True``. The standard-deviation is calculated via the biased estimator, equivalent to - `torch.var(input, unbiased=False)`. + `mindtorch.var(input, unbiased=False)`. By default, this layer uses instance statistics computed from input data in both training and evaluation modes. 
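+
+    (Unlike batch norm, instance norm computes its statistics per sample and
+    per channel, which is why instance statistics remain usable at evaluation
+    time.)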
@@ -269,18 +305,55 @@ class InstanceNorm2d(_InstanceNorm): >>> m = nn.InstanceNorm2d(100) >>> # With Learnable Parameters >>> m = nn.InstanceNorm2d(100, affine=True) - >>> input = torch.randn(20, 100, 35, 45) + >>> input = mindtorch.randn(20, 100, 35, 45) >>> output = m(input) """ - def _get_no_batch_dim(self): + def _get_no_batch_dim(self) -> int: return 3 - def _check_input_dim(self, input): + def _check_input_dim(self, input) -> None: if input.dim() not in (3, 4): raise ValueError(f"expected 3D or 4D input (got {input.dim()}D input)") +class LazyInstanceNorm2d(_LazyNormBase, _InstanceNorm): + r"""A :class:`mindtorch.nn.InstanceNorm2d` module with lazy initialization of the ``num_features`` argument. + + The ``num_features`` argument of the :class:`InstanceNorm2d` is inferred from the ``input.size(1)``. + The attributes that will be lazily initialized are `weight`, `bias`, + `running_mean` and `running_var`. + + Check the :class:`mindtorch.nn.modules.lazy.LazyModuleMixin` for further documentation + on lazy modules and their limitations. + + Args: + num_features: :math:`C` from an expected input of size + :math:`(N, C, H, W)` or :math:`(C, H, W)` + eps: a value added to the denominator for numerical stability. Default: 1e-5 + momentum: the value used for the running_mean and running_var computation. Default: 0.1 + affine: a boolean value that when set to ``True``, this module has + learnable affine parameters, initialized the same way as done for batch normalization. + Default: ``False``. + track_running_stats: a boolean value that when set to ``True``, this + module tracks the running mean and variance, and when set to ``False``, + this module does not track such statistics and always uses batch + statistics in both training and eval modes. Default: ``False`` + + Shape: + - Input: :math:`(N, C, H, W)` or :math:`(C, H, W)` + - Output: :math:`(N, C, H, W)` or :math:`(C, H, W)` (same shape as input) + """ + + cls_to_become = InstanceNorm2d # type: ignore[assignment] + + def _get_no_batch_dim(self) -> int: + return 3 + + def _check_input_dim(self, input) -> None: + if input.dim() not in (3, 4): + raise ValueError(f"expected 3D or 4D input (got {input.dim()}D input)") + class InstanceNorm3d(_InstanceNorm): r"""Applies Instance Normalization. @@ -298,7 +371,7 @@ class InstanceNorm3d(_InstanceNorm): for each object in a mini-batch. :math:`\gamma` and :math:`\beta` are learnable parameter vectors of size C (where C is the input size) if :attr:`affine` is ``True``. The standard-deviation is calculated via the biased estimator, equivalent to - `torch.var(input, unbiased=False)`. + `mindtorch.var(input, unbiased=False)`. By default, this layer uses instance statistics computed from input data in both training and evaluation modes. @@ -348,14 +421,51 @@ class InstanceNorm3d(_InstanceNorm): >>> m = nn.InstanceNorm3d(100) >>> # With Learnable Parameters >>> m = nn.InstanceNorm3d(100, affine=True) - >>> input = torch.randn(20, 100, 35, 45, 10) + >>> input = mindtorch.randn(20, 100, 35, 45, 10) >>> output = m(input) """ - def _get_no_batch_dim(self): + def _get_no_batch_dim(self) -> int: return 4 - def _check_input_dim(self, input): + def _check_input_dim(self, input) -> None: if input.dim() not in (4, 5): raise ValueError(f"expected 4D or 5D input (got {input.dim()}D input)") + +class LazyInstanceNorm3d(_LazyNormBase, _InstanceNorm): + r"""A :class:`mindtorch.nn.InstanceNorm3d` module with lazy initialization of the ``num_features`` argument. 
+
+    The ``num_features`` argument of the :class:`InstanceNorm3d` is inferred from the ``input.size(1)``.
+    The attributes that will be lazily initialized are `weight`, `bias`,
+    `running_mean` and `running_var`.
+
+    Check the :class:`mindtorch.nn.modules.lazy.LazyModuleMixin` for further documentation
+    on lazy modules and their limitations.
+
+    Args:
+        num_features: :math:`C` from an expected input of size
+            :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)`
+        eps: a value added to the denominator for numerical stability. Default: 1e-5
+        momentum: the value used for the running_mean and running_var computation. Default: 0.1
+        affine: a boolean value that when set to ``True``, this module has
+            learnable affine parameters, initialized the same way as done for batch normalization.
+            Default: ``False``.
+        track_running_stats: a boolean value that when set to ``True``, this
+            module tracks the running mean and variance, and when set to ``False``,
+            this module does not track such statistics and always uses batch
+            statistics in both training and eval modes. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)`
+        - Output: :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)` (same shape as input)
+    """
+
+    cls_to_become = InstanceNorm3d  # type: ignore[assignment]
+
+    def _get_no_batch_dim(self) -> int:
+        return 4
+
+    def _check_input_dim(self, input) -> None:
+        if input.dim() not in (4, 5):
+            raise ValueError(f"expected 4D or 5D input (got {input.dim()}D input)")
\ No newline at end of file
diff --git a/mindtorch/nn/modules/lazy.py b/mindtorch/nn/modules/lazy.py
index f92670474..d35ea8a8c 100644
--- a/mindtorch/nn/modules/lazy.py
+++ b/mindtorch/nn/modules/lazy.py
@@ -3,7 +3,7 @@
 from typing import Any, Optional, Protocol
 
 import mindtorch
-from ..parameter import is_lazy
+from mindtorch.nn.parameter import is_lazy
 
 
 __all__ = ["LazyModuleMixin"]
@@ -245,7 +245,7 @@ def has_uninitialized_params(self: _LazyProtocol):
                 return True
         return False
 
     # torchrec tests the code consistency with the following code
     # fmt: off
     def _infer_parameters(self: _LazyProtocol, module, args, kwargs=None):
         r"""Infers the size and initializes the parameters according to the provided input batch.
diff --git a/mindtorch/nn/modules/linear.py b/mindtorch/nn/modules/linear.py
index f0f915727..ca27fe18a 100644
--- a/mindtorch/nn/modules/linear.py
+++ b/mindtorch/nn/modules/linear.py
@@ -1,94 +1,331 @@
-"""linear"""
-from typing import Any
+# mypy: allow-untyped-defs
 import math
+from typing import Any
+
+import mindtorch
 from mindtorch import Tensor
-from ..parameter import Parameter
+from mindtorch.nn import functional as F, init
+from mindtorch.nn.parameter import Parameter, UninitializedParameter
+
+from .lazy import LazyModuleMixin
 from .module import Module
-from .. import init
-from .. import functional as F
-from ... import ops
+
+
+__all__ = [
+    "Bilinear",
+    "Identity",
+    "LazyLinear",
+    "Linear",
+]
+
+
+class Identity(Module):
+    r"""A placeholder identity operator that is argument-insensitive.
+
+    Args:
+        args: any argument (unused)
+        kwargs: any keyword argument (unused)
+
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Output: :math:`(*)`, same shape as the input.
+ + Examples:: + + >>> m = nn.Identity(54, unused_argument1=0.1, unused_argument2=False) + >>> input = mindtorch.randn(128, 20) + >>> output = m(input) + >>> print(output.size()) + mindtorch.Size([128, 20]) + + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__() + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return input + class Linear(Module): - r"""Applies a linear transformation to the incoming data: :math:`y = Ax + b` + r"""Applies an affine linear transformation to the incoming data: :math:`y = xA^T + b`. + + This module supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. Args: in_features: size of each input sample out_features: size of each output sample - bias: If set to False, the layer will not learn an additive bias. - Default: True + bias: If set to ``False``, the layer will not learn an additive bias. + Default: ``True`` Shape: - - Input: :math:`(N, in\_features)` - - Output: :math:`(N, out\_features)` + - Input: :math:`(*, H_\text{in})` where :math:`*` means any number of + dimensions including none and :math:`H_\text{in} = \text{in\_features}`. + - Output: :math:`(*, H_\text{out})` where all but the last dimension + are the same shape as the input and :math:`H_\text{out} = \text{out\_features}`. Attributes: weight: the learnable weights of the module of shape - (out_features x in_features) - bias: the learnable bias of the module of shape (out_features) + :math:`(\text{out\_features}, \text{in\_features})`. The values are + initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where + :math:`k = \frac{1}{\text{in\_features}}` + bias: the learnable bias of the module of shape :math:`(\text{out\_features})`. + If :attr:`bias` is ``True``, the values are initialized from + :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where + :math:`k = \frac{1}{\text{in\_features}}` Examples:: >>> m = nn.Linear(20, 30) - >>> input = autograd.Variable(torch.randn(128, 20)) + >>> input = mindtorch.randn(128, 20) >>> output = m(input) >>> print(output.size()) + mindtorch.Size([128, 30]) """ - def __init__(self, in_features, out_features, bias=True, dtype=None, device=None) -> None: - factory_kwargs = {'dtype': dtype, 'device': device} + __constants__ = ["in_features", "out_features"] + in_features: int + out_features: int + weight: Tensor + + def __init__( + self, + in_features: int, + out_features: int, + bias: bool = True, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} super().__init__() self.in_features = in_features self.out_features = out_features - self.weight = Parameter(ops.empty((out_features, in_features), **factory_kwargs)) + self.weight = Parameter( + mindtorch.empty((out_features, in_features), **factory_kwargs) + ) if bias: - self.bias = Parameter(ops.empty(out_features, **factory_kwargs)) + self.bias = Parameter(mindtorch.empty(out_features, **factory_kwargs)) else: - self.register_parameter('bias', None) - + self.register_parameter("bias", None) self.reset_parameters() def reset_parameters(self) -> None: + """ + Resets parameters based on their initialization used in ``__init__``. + """ # Setting a=sqrt(5) in kaiming_uniform is the same as initializing with # uniform(-1/sqrt(in_features), 1/sqrt(in_features)). 
For details, see
         # https://github.com/pytorch/pytorch/issues/57109
         init.kaiming_uniform_(self.weight, a=math.sqrt(5))
         if self.bias is not None:
             fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
             bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
             init.uniform_(self.bias, -bound, bound)
 
-    def forward(self, input):
+    def forward(self, input: Tensor) -> Tensor:
+        """
+        Runs the forward pass.
+        """
         return F.linear(input, self.weight, self.bias)
 
-    def __repr__(self):
-        return self.__class__.__name__ + ' (' \
-            + str(self.in_features) + ' -> ' \
-            + str(self.out_features) + ')'
+    def extra_repr(self) -> str:
+        """
+        Return the extra representation of the module.
+        """
+        return f"in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None}"
 
 
-class Identity(Module):
-    r"""A placeholder identity operator that is argument-insensitive.
+# This class exists solely to avoid triggering an obscure error when scripting
+# an improperly quantized attention layer. See this issue for details:
+# https://github.com/pytorch/pytorch/issues/58969
+# TODO: fail fast on quantization API usage error, then remove this class
+# and replace uses of it with plain Linear
+class NonDynamicallyQuantizableLinear(Linear):
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        bias: bool = True,
+        device=None,
+        dtype=None,
+    ) -> None:
+        super().__init__(
+            in_features, out_features, bias=bias, device=device, dtype=dtype
+        )
+
+
+class Bilinear(Module):
+    r"""Applies a bilinear transformation to the incoming data: :math:`y = x_1^T A x_2 + b`.
 
     Args:
-        args: any argument (unused)
-        kwargs: any keyword argument (unused)
+        in1_features: size of each first input sample, must be > 0
+        in2_features: size of each second input sample, must be > 0
+        out_features: size of each output sample, must be > 0
+        bias: If set to ``False``, the layer will not learn an additive bias.
+              Default: ``True``
 
     Shape:
-        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
-        - Output: :math:`(*)`, same shape as the input.
+        - Input1: :math:`(*, H_\text{in1})` where :math:`H_\text{in1}=\text{in1\_features}` and
+          :math:`*` means any number of additional dimensions including none. All but the last dimension
+          of the inputs should be the same.
+        - Input2: :math:`(*, H_\text{in2})` where :math:`H_\text{in2}=\text{in2\_features}`.
+        - Output: :math:`(*, H_\text{out})` where :math:`H_\text{out}=\text{out\_features}`
+          and all but the last dimension are the same shape as the input.
+
+    Attributes:
+        weight: the learnable weights of the module of shape
+            :math:`(\text{out\_features}, \text{in1\_features}, \text{in2\_features})`.
+            The values are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where
+            :math:`k = \frac{1}{\text{in1\_features}}`
+        bias: the learnable bias of the module of shape :math:`(\text{out\_features})`.
+ If :attr:`bias` is ``True``, the values are initialized from + :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where + :math:`k = \frac{1}{\text{in1\_features}}` Examples:: - >>> m = nn.Identity(54, unused_argument1=0.1, unused_argument2=False) - >>> input = torch.randn(128, 20) - >>> output = m(input) + >>> m = nn.Bilinear(20, 30, 40) + >>> input1 = mindtorch.randn(128, 20) + >>> input2 = mindtorch.randn(128, 30) + >>> output = m(input1, input2) >>> print(output.size()) - torch.Size([128, 20]) - + mindtorch.Size([128, 40]) """ - def __init__(self, *args: Any, **kwargs: Any) -> None: + __constants__ = ["in1_features", "in2_features", "out_features"] + in1_features: int + in2_features: int + out_features: int + weight: Tensor + + def __init__( + self, + in1_features: int, + in2_features: int, + out_features: int, + bias: bool = True, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} super().__init__() + self.in1_features = in1_features + self.in2_features = in2_features + self.out_features = out_features + self.weight = Parameter( + mindtorch.empty((out_features, in1_features, in2_features), **factory_kwargs) + ) - def forward(self, input: Tensor) -> Tensor: - return input + if bias: + self.bias = Parameter(mindtorch.empty(out_features, **factory_kwargs)) + else: + self.register_parameter("bias", None) + self.reset_parameters() + + def reset_parameters(self) -> None: + """ + Resets parameters based on their initialization used in ``__init__``. + """ + if self.in1_features <= 0: + raise ValueError( + f"in1_features must be > 0, but got (in1_features={self.in1_features})" + ) + bound = 1 / math.sqrt(self.weight.size(1)) + init.uniform_(self.weight, -bound, bound) + if self.bias is not None: + init.uniform_(self.bias, -bound, bound) + + def forward(self, input1: Tensor, input2: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.bilinear(input1, input2, self.weight, self.bias) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + return ( + f"in1_features={self.in1_features}, in2_features={self.in2_features}, " + f"out_features={self.out_features}, bias={self.bias is not None}" + ) + + +class LazyLinear(LazyModuleMixin, Linear): + r"""A :class:`mindtorch.nn.Linear` module where `in_features` is inferred. + + In this module, the `weight` and `bias` are of :class:`mindtorch.nn.UninitializedParameter` + class. They will be initialized after the first call to ``forward`` is done and the + module will become a regular :class:`mindtorch.nn.Linear` module. The ``in_features`` argument + of the :class:`Linear` is inferred from the ``input.shape[-1]``. + + Check the :class:`mindtorch.nn.modules.lazy.LazyModuleMixin` for further documentation + on lazy modules and their limitations. + + Args: + out_features: size of each output sample + bias: If set to ``False``, the layer will not learn an additive bias. + Default: ``True`` + + Attributes: + weight: the learnable weights of the module of shape + :math:`(\text{out\_features}, \text{in\_features})`. The values are + initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where + :math:`k = \frac{1}{\text{in\_features}}` + bias: the learnable bias of the module of shape :math:`(\text{out\_features})`. 
+ If :attr:`bias` is ``True``, the values are initialized from + :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where + :math:`k = \frac{1}{\text{in\_features}}` + + + """ + + cls_to_become = Linear # type: ignore[assignment] + weight: UninitializedParameter + bias: UninitializedParameter # type: ignore[assignment] + + def __init__( + self, out_features: int, bias: bool = True, device=None, dtype=None + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + # bias is hardcoded to False to avoid creating tensor + # that will soon be overwritten. + super().__init__(0, 0, False) + self.weight = UninitializedParameter(**factory_kwargs) + self.out_features = out_features + if bias: + self.bias = UninitializedParameter(**factory_kwargs) + + def reset_parameters(self) -> None: + """ + Resets parameters based on their initialization used in ``__init__``. + """ + if not self.has_uninitialized_params() and self.in_features != 0: + super().reset_parameters() + + def initialize_parameters(self, input) -> None: # type: ignore[override] + """ + Infers ``in_features`` based on ``input`` and initializes parameters. + """ + if self.has_uninitialized_params(): + with mindtorch.no_grad(): + self.in_features = input.shape[-1] + self.weight.materialize((self.out_features, self.in_features)) + if self.bias is not None: + self.bias.materialize((self.out_features,)) + self.reset_parameters() + if self.in_features == 0: + assert input.shape[-1] == self.weight.shape[-1], ( + f"The in_features inferred from input: {input.shape[-1]} " + f"is not equal to in_features from self.weight: " + f"{self.weight.shape[-1]}" + ) + self.in_features = input.shape[-1] + + +# TODO: PartialLinear - maybe in sparse? \ No newline at end of file diff --git a/mindtorch/nn/modules/loss.py b/mindtorch/nn/modules/loss.py index 21a5f4f3e..4eab45cd4 100644 --- a/mindtorch/nn/modules/loss.py +++ b/mindtorch/nn/modules/loss.py @@ -1,23 +1,45 @@ -"""loss""" -from typing import Callable, Optional +# mypy: allow-untyped-defs +from collections.abc import Callable +from typing import Optional, Union from typing_extensions import deprecated + from mindtorch import Tensor +from mindtorch.nn import _reduction as _Reduction, functional as F from .distance import PairwiseDistance from .module import Module -from .. import functional as F -from .. 
import _reduction as _Reduction -__all__ = ['L1Loss', 'NLLLoss', 'NLLLoss2d', 'PoissonNLLLoss', 'GaussianNLLLoss', 'KLDivLoss', - 'MSELoss', 'BCELoss', 'BCEWithLogitsLoss', 'HingeEmbeddingLoss', 'MultiLabelMarginLoss', - 'SmoothL1Loss', 'HuberLoss', 'SoftMarginLoss', 'CrossEntropyLoss', 'MultiLabelSoftMarginLoss', - 'CosineEmbeddingLoss', 'MarginRankingLoss', 'MultiMarginLoss', 'TripletMarginLoss', - 'TripletMarginWithDistanceLoss', 'CTCLoss'] + +__all__ = [ + "L1Loss", + "NLLLoss", + "NLLLoss2d", + "PoissonNLLLoss", + "GaussianNLLLoss", + "KLDivLoss", + "MSELoss", + "BCELoss", + "BCEWithLogitsLoss", + "HingeEmbeddingLoss", + "MultiLabelMarginLoss", + "SmoothL1Loss", + "HuberLoss", + "SoftMarginLoss", + "CrossEntropyLoss", + "MultiLabelSoftMarginLoss", + "CosineEmbeddingLoss", + "MarginRankingLoss", + "MultiMarginLoss", + "TripletMarginLoss", + "TripletMarginWithDistanceLoss", + "CTCLoss", +] + class _Loss(Module): reduction: str - def __init__(self, size_average=None, reduce=None, reduction: str = 'mean') -> None: + def __init__(self, size_average=None, reduce=None, reduction: str = "mean") -> None: super().__init__() if size_average is not None or reduce is not None: self.reduction: str = _Reduction.legacy_get_string(size_average, reduce) @@ -26,13 +48,18 @@ def __init__(self, size_average=None, reduce=None, reduction: str = 'mean') -> N class _WeightedLoss(_Loss): - def __init__(self, weight: Optional[Tensor] = None, size_average=None, reduce=None, reduction: str = 'mean') -> None: + def __init__( + self, + weight: Optional[Tensor] = None, + size_average=None, + reduce=None, + reduction: str = "mean", + ) -> None: super().__init__(size_average, reduce, reduction) - self.register_buffer('weight', weight) + self.register_buffer("weight", weight) self.weight: Optional[Tensor] - class L1Loss(_Loss): r"""Creates a criterion that measures the mean absolute error (MAE) between each element in the input :math:`x` and target :math:`y`. @@ -54,11 +81,11 @@ class L1Loss(_Loss): \end{cases} :math:`x` and :math:`y` are tensors of arbitrary shapes with a total - of :math:`n` elements each. + of :math:`N` elements each. - The sum operation still operates over all the elements, and divides by :math:`n`. + The sum operation still operates over all the elements, and divides by :math:`N`. - The division by :math:`n` can be avoided if one sets ``reduction = 'sum'``. + The division by :math:`N` can be avoided if one sets ``reduction = 'sum'``. Supports real-valued and complex-valued inputs. @@ -85,7 +112,7 @@ class L1Loss(_Loss): - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(*)`, same shape as the input. - Examples:: + Examples: >>> loss = nn.L1Loss() >>> input = mindtorch.randn(3, 5, requires_grad=True) @@ -93,12 +120,17 @@ class L1Loss(_Loss): >>> output = loss(input, target) >>> output.backward() """ - __constants__ = ['reduction'] - def forward(self, input: Tensor, target: Tensor) -> Tensor: - return F.l1_loss(input, target, reduction=self.reduction) + __constants__ = ["reduction"] + def __init__(self, size_average=None, reduce=None, reduction: str = "mean") -> None: + super().__init__(size_average, reduce, reduction) + def forward(self, input: Tensor, target: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.l1_loss(input, target, reduction=self.reduction) class NLLLoss(_WeightedLoss): @@ -127,8 +159,8 @@ class NLLLoss(_WeightedLoss): The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as: .. 
math:: - \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad - l_n = - w_{y_n} x_{n,y_n}, \quad + \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \\ + l_n = - w_{y_n} x_{n,y_n}, \\ w_{c} = \text{weight}[c] \cdot \mathbb{1}\{c \not= \text{ignore\_index}\}, where :math:`x` is the input, :math:`y` is the target, :math:`w` is the weight, and @@ -180,7 +212,7 @@ class NLLLoss(_WeightedLoss): :math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1` in the case of K-dimensional loss. Otherwise, scalar. - Examples:: + Examples: >>> log_softmax = nn.LogSoftmax(dim=1) >>> loss_fn = nn.NLLLoss() @@ -206,17 +238,32 @@ class NLLLoss(_WeightedLoss): >>> loss = loss_fn(output, target) >>> loss.backward() """ - __constants__ = ['ignore_index', 'reduction'] + + __constants__ = ["ignore_index", "reduction"] ignore_index: int - def __init__(self, weight: Optional[Tensor] = None, size_average=None, ignore_index: int = -100, - reduce=None, reduction: str = 'mean') -> None: + def __init__( + self, + weight: Optional[Tensor] = None, + size_average=None, + ignore_index: int = -100, + reduce=None, + reduction: str = "mean", + ) -> None: super().__init__(weight, size_average, reduce, reduction) self.ignore_index = ignore_index def forward(self, input: Tensor, target: Tensor) -> Tensor: - return F.nll_loss(input, target, weight=self.weight, ignore_index=self.ignore_index, reduction=self.reduction) - + """ + Runs the forward pass. + """ + return F.nll_loss( + input, + target, + weight=self.weight, + ignore_index=self.ignore_index, + reduction=self.reduction, + ) @deprecated( @@ -226,12 +273,17 @@ def forward(self, input: Tensor, target: Tensor) -> Tensor: category=FutureWarning, ) class NLLLoss2d(NLLLoss): - def __init__(self, weight: Optional[Tensor] = None, size_average=None, ignore_index: int = -100, - reduce=None, reduction: str = 'mean') -> None: + def __init__( + self, + weight: Optional[Tensor] = None, + size_average=None, + ignore_index: int = -100, + reduce=None, + reduction: str = "mean", + ) -> None: super().__init__(weight, size_average, ignore_index, reduce, reduction) - class PoissonNLLLoss(_Loss): r"""Negative log likelihood loss with Poisson distribution of target. @@ -274,7 +326,7 @@ class PoissonNLLLoss(_Loss): and :attr:`reduce` are in the process of being deprecated, and in the meantime, specifying either of those two args will override :attr:`reduction`. Default: ``'mean'`` - Examples:: + Examples: >>> loss = nn.PoissonNLLLoss() >>> log_input = mindtorch.randn(5, 2, requires_grad=True) @@ -288,23 +340,38 @@ class PoissonNLLLoss(_Loss): - Output: scalar by default. If :attr:`reduction` is ``'none'``, then :math:`(*)`, the same shape as the input. """ - __constants__ = ['log_input', 'full', 'eps', 'reduction'] + + __constants__ = ["log_input", "full", "eps", "reduction"] log_input: bool full: bool eps: float - def __init__(self, log_input: bool = True, full: bool = False, size_average=None, - eps: float = 1e-8, reduce=None, reduction: str = 'mean') -> None: + def __init__( + self, + log_input: bool = True, + full: bool = False, + size_average=None, + eps: float = 1e-8, + reduce=None, + reduction: str = "mean", + ) -> None: super().__init__(size_average, reduce, reduction) self.log_input = log_input self.full = full self.eps = eps def forward(self, log_input: Tensor, target: Tensor) -> Tensor: - return F.poisson_nll_loss(log_input, target, log_input=self.log_input, full=self.full, - eps=self.eps, reduction=self.reduction) - - + """ + Runs the forward pass. 
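+
+        (A sketch of the default ``log_input=True`` case: the loss is computed
+        as ``exp(log_input) - target * log_input``, which is the Poisson
+        negative log likelihood up to a constant.)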
+ """ + return F.poisson_nll_loss( + log_input, + target, + log_input=self.log_input, + full=self.full, + eps=self.eps, + reduction=self.reduction, + ) class GaussianNLLLoss(_Loss): @@ -343,12 +410,12 @@ class GaussianNLLLoss(_Loss): but with one dimension equal to 1 (to allow for broadcasting) - Var: :math:`(N, *)` or :math:`(*)`, same shape as the input, or same shape as the input but with one dimension equal to 1, or same shape as the input but with one fewer - dimension (to allow for broadcasting) + dimension (to allow for broadcasting), or a scalar value - Output: scalar if :attr:`reduction` is ``'mean'`` (default) or ``'sum'``. If :attr:`reduction` is ``'none'``, then :math:`(N, *)`, same shape as the input - Examples:: + Examples: >>> loss = nn.GaussianNLLLoss() >>> input = mindtorch.randn(5, 2, requires_grad=True) >>> target = mindtorch.randn(5, 2) @@ -373,19 +440,27 @@ class GaussianNLLLoss(_Loss): Conference on Neural Networks (ICNN'94), Orlando, FL, USA, 1994, pp. 55-60 vol.1, doi: 10.1109/ICNN.1994.374138. """ - __constants__ = ['full', 'eps', 'reduction'] + + __constants__ = ["full", "eps", "reduction"] full: bool eps: float - def __init__(self, *, full: bool = False, eps: float = 1e-6, reduction: str = 'mean') -> None: + def __init__( + self, *, full: bool = False, eps: float = 1e-6, reduction: str = "mean" + ) -> None: super().__init__(None, None, reduction) self.full = full self.eps = eps - def forward(self, input: Tensor, target: Tensor, var: Tensor) -> Tensor: - return F.gaussian_nll_loss(input, target, var, full=self.full, eps=self.eps, reduction=self.reduction) - - + def forward( + self, input: Tensor, target: Tensor, var: Union[Tensor, float] + ) -> Tensor: + """ + Runs the forward pass. + """ + return F.gaussian_nll_loss( + input, target, var, full=self.full, eps=self.eps, reduction=self.reduction + ) class KLDivLoss(_Loss): @@ -409,7 +484,7 @@ class KLDivLoss(_Loss): .. code-block:: python - if not log_target: # default + if not log_target: # default loss_pointwise = target * (target.log() - input) else: loss_pointwise = target.exp() * (target - input) @@ -457,28 +532,38 @@ class KLDivLoss(_Loss): - Output: scalar by default. If :attr:`reduction` is `'none'`, then :math:`(*)`, same shape as the input. - Examples:: + Examples: >>> kl_loss = nn.KLDivLoss(reduction="batchmean") >>> # input should be a distribution in the log space >>> input = F.log_softmax(mindtorch.randn(3, 5, requires_grad=True), dim=1) >>> # Sample a batch of distributions. Usually this would come from the dataset >>> target = F.softmax(mindtorch.rand(3, 5), dim=1) >>> output = kl_loss(input, target) - + >>> >>> kl_loss = nn.KLDivLoss(reduction="batchmean", log_target=True) >>> log_target = F.log_softmax(mindtorch.rand(3, 5), dim=1) >>> output = kl_loss(input, log_target) """ - __constants__ = ['reduction'] - def __init__(self, size_average=None, reduce=None, reduction: str = 'mean', log_target: bool = False) -> None: + __constants__ = ["reduction"] + + def __init__( + self, + size_average=None, + reduce=None, + reduction: str = "mean", + log_target: bool = False, + ) -> None: super().__init__(size_average, reduce, reduction) self.log_target = log_target def forward(self, input: Tensor, target: Tensor) -> Tensor: - return F.kl_div(input, target, reduction=self.reduction, log_target=self.log_target) - - + """ + Runs the forward pass. 
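+
+        (Pointwise, with the default ``log_target=False``, this computes
+        ``target * (target.log() - input)``, where ``input`` is expected to
+        contain log-probabilities; see the class docstring above.)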
+ """ + return F.kl_div( + input, target, reduction=self.reduction, log_target=self.log_target + ) class MSELoss(_Loss): @@ -502,11 +587,11 @@ class MSELoss(_Loss): \end{cases} :math:`x` and :math:`y` are tensors of arbitrary shapes with a total - of :math:`n` elements each. + of :math:`N` elements each. - The mean operation still operates over all the elements, and divides by :math:`n`. + The mean operation still operates over all the elements, and divides by :math:`N`. - The division by :math:`n` can be avoided if one sets ``reduction = 'sum'``. + The division by :math:`N` can be avoided if one sets ``reduction = 'sum'``. Args: size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, @@ -529,7 +614,7 @@ class MSELoss(_Loss): - Input: :math:`(*)`, where :math:`*` means any number of dimensions. - Target: :math:`(*)`, same shape as the input. - Examples:: + Examples: >>> loss = nn.MSELoss() >>> input = mindtorch.randn(3, 5, requires_grad=True) @@ -537,9 +622,16 @@ class MSELoss(_Loss): >>> output = loss(input, target) >>> output.backward() """ - __constants__ = ['reduction'] + + __constants__ = ["reduction"] + + def __init__(self, size_average=None, reduce=None, reduction: str = "mean") -> None: + super().__init__(size_average, reduce, reduction) def forward(self, input: Tensor, target: Tensor) -> Tensor: + """ + Runs the forward pass. + """ return F.mse_loss(input, target, reduction=self.reduction) @@ -608,7 +700,7 @@ class BCELoss(_WeightedLoss): - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(*)`, same shape as input. - Examples:: + Examples: >>> m = nn.Sigmoid() >>> loss = nn.BCELoss() @@ -617,10 +709,25 @@ class BCELoss(_WeightedLoss): >>> output = loss(m(input), target) >>> output.backward() """ - __constants__ = ['reduction'] + + __constants__ = ["reduction"] + + def __init__( + self, + weight: Optional[Tensor] = None, + size_average=None, + reduce=None, + reduction: str = "mean", + ) -> None: + super().__init__(weight, size_average, reduce, reduction) def forward(self, input: Tensor, target: Tensor) -> Tensor: - return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction) + """ + Runs the forward pass. + """ + return F.binary_cross_entropy( + input, target, weight=self.weight, reduction=self.reduction + ) class BCEWithLogitsLoss(_Loss): @@ -668,7 +775,7 @@ class BCEWithLogitsLoss(_Loss): then ``pos_weight`` for the class should be equal to :math:`\frac{300}{100}=3`. The loss would act as if the dataset contains :math:`3\times 100=300` positive examples. - Examples:: + Examples: >>> target = mindtorch.ones([10, 64], dtype=mindtorch.float32) # 64 classes, batch size = 10 >>> output = mindtorch.full([10, 64], 1.5) # A prediction (logit) @@ -707,7 +814,7 @@ class BCEWithLogitsLoss(_Loss): operations. For a target of size [B, C, H, W] (where B is batch size) pos_weight of size [B, C, H, W] will apply different pos_weights to each element of the batch or [C, H, W] the same pos_weights across the batch. To apply the same positive weight - along all spacial dimensions for a 2D multi-class target [C, H, W] use: [C, 1, 1]. + along all spatial dimensions for a 2D multi-class target [C, H, W] use: [C, 1, 1]. Default: ``None`` Shape: @@ -716,7 +823,7 @@ class BCEWithLogitsLoss(_Loss): - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(*)`, same shape as input. 
- Examples:: + Examples: >>> loss = nn.BCEWithLogitsLoss() >>> input = mindtorch.randn(3, requires_grad=True) @@ -724,21 +831,30 @@ class BCEWithLogitsLoss(_Loss): >>> output = loss(input, target) >>> output.backward() """ - def __init__(self, weight: Optional[Tensor] = None, size_average=None, reduce=None, reduction: str = 'mean', - pos_weight: Optional[Tensor] = None) -> None: + + def __init__( + self, + weight: Optional[Tensor] = None, + size_average=None, + reduce=None, + reduction: str = "mean", + pos_weight: Optional[Tensor] = None, + ) -> None: super().__init__(size_average, reduce, reduction) - self.register_buffer('weight', weight) - self.register_buffer('pos_weight', pos_weight) + self.register_buffer("weight", weight) + self.register_buffer("pos_weight", pos_weight) self.weight: Optional[Tensor] self.pos_weight: Optional[Tensor] def forward(self, input: Tensor, target: Tensor) -> Tensor: - return F.binary_cross_entropy_with_logits(input, target, - self.weight, - pos_weight=self.pos_weight, - reduction=self.reduction) - - + """Runs the forward pass.""" + return F.binary_cross_entropy_with_logits( + input, + target, + self.weight, + pos_weight=self.pos_weight, + reduction=self.reduction, + ) class HingeEmbeddingLoss(_Loss): @@ -790,17 +906,25 @@ class HingeEmbeddingLoss(_Loss): - Target: :math:`(*)`, same shape as the input - Output: scalar. If :attr:`reduction` is ``'none'``, then same shape as the input """ - __constants__ = ['margin', 'reduction'] + + __constants__ = ["margin", "reduction"] margin: float - def __init__(self, margin: float = 1.0, size_average=None, reduce=None, reduction: str = 'mean') -> None: + def __init__( + self, + margin: float = 1.0, + size_average=None, + reduce=None, + reduction: str = "mean", + ) -> None: super().__init__(size_average, reduce, reduction) self.margin = margin def forward(self, input: Tensor, target: Tensor) -> Tensor: - return F.hinge_embedding_loss(input, target, margin=self.margin, reduction=self.reduction) - - + """Runs the forward pass.""" + return F.hinge_embedding_loss( + input, target, margin=self.margin, reduction=self.reduction + ) class MultiLabelMarginLoss(_Loss): @@ -847,7 +971,7 @@ class MultiLabelMarginLoss(_Loss): - Target: :math:`(C)` or :math:`(N, C)`, label targets padded by -1 ensuring same shape as the input. - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(N)`. - Examples:: + Examples: >>> loss = nn.MultiLabelMarginLoss() >>> x = mindtorch.FloatTensor([[0.1, 0.2, 0.4, 0.8]]) @@ -858,12 +982,15 @@ class MultiLabelMarginLoss(_Loss): tensor(0.85...) """ - __constants__ = ['reduction'] - def forward(self, input: Tensor, target: Tensor) -> Tensor: - return F.multilabel_margin_loss(input, target, reduction=self.reduction) + __constants__ = ["reduction"] + def __init__(self, size_average=None, reduce=None, reduction: str = "mean") -> None: + super().__init__(size_average, reduce, reduction) + def forward(self, input: Tensor, target: Tensor) -> Tensor: + """Runs the forward pass.""" + return F.multilabel_margin_loss(input, target, reduction=self.reduction) class SmoothL1Loss(_Loss): @@ -937,18 +1064,20 @@ class SmoothL1Loss(_Loss): - Target: :math:`(*)`, same shape as the input. - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(*)`, same shape as the input. 
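SmoothL1Loss switches between a quadratic zone and a linear zone at ``beta``, and the two zones can be seen directly on a small tensor. A minimal sketch, assuming the mindtorch API matches the doctests in this file:

    import mindtorch
    import mindtorch.nn as nn

    x = mindtorch.tensor([0.0, 0.5, 2.0])
    y = mindtorch.zeros(3)
    # With beta=1.0: |0.5| < beta is quadratic (0.5 * 0.5**2 / 1.0 = 0.125),
    # while |2.0| >= beta is linear (2.0 - 0.5 * 1.0 = 1.5).
    print(nn.SmoothL1Loss(reduction="none", beta=1.0)(x, y))
    # expected: tensor([0.0000, 0.1250, 1.5000])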
""" - __constants__ = ['reduction'] - def __init__(self, size_average=None, reduce=None, reduction: str = 'mean', beta: float = 1.0) -> None: + __constants__ = ["reduction"] + + def __init__( + self, size_average=None, reduce=None, reduction: str = "mean", beta: float = 1.0 + ) -> None: super().__init__(size_average, reduce, reduction) self.beta = beta def forward(self, input: Tensor, target: Tensor) -> Tensor: + """Runs the forward pass.""" return F.smooth_l1_loss(input, target, reduction=self.reduction, beta=self.beta) - - class HuberLoss(_Loss): r"""Creates a criterion that uses a squared term if the absolute element-wise error falls below delta and a delta-scaled L1 term otherwise. @@ -999,18 +1128,18 @@ class HuberLoss(_Loss): - Target: :math:`(*)`, same shape as the input. - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(*)`, same shape as the input. """ - __constants__ = ['reduction', 'delta'] - def __init__(self, reduction: str = 'mean', delta: float = 1.0) -> None: + __constants__ = ["reduction", "delta"] + + def __init__(self, reduction: str = "mean", delta: float = 1.0) -> None: super().__init__(reduction=reduction) self.delta = delta def forward(self, input: Tensor, target: Tensor) -> Tensor: + """Runs the forward pass.""" return F.huber_loss(input, target, reduction=self.reduction, delta=self.delta) - - class SoftMarginLoss(_Loss): r"""Creates a criterion that optimizes a two-class classification logistic loss between input tensor :math:`x` and target tensor :math:`y` @@ -1043,12 +1172,15 @@ class SoftMarginLoss(_Loss): shape as input. """ - __constants__ = ['reduction'] - def forward(self, input: Tensor, target: Tensor) -> Tensor: - return F.soft_margin_loss(input, target, reduction=self.reduction) + __constants__ = ["reduction"] + def __init__(self, size_average=None, reduce=None, reduction: str = "mean") -> None: + super().__init__(size_average, reduce, reduction) + def forward(self, input: Tensor, target: Tensor) -> Tensor: + """Runs the forward pass.""" + return F.soft_margin_loss(input, target, reduction=self.reduction) class CrossEntropyLoss(_WeightedLoss): @@ -1123,7 +1255,7 @@ class probabilities only when a single class label per minibatch item is too res Args: weight (Tensor, optional): a manual rescaling weight given to each class. - If given, has to be a Tensor of size `C` and floating point dtype + If given, has to be a Tensor of size `C`. size_average (bool, optional): Deprecated (see :attr:`reduction`). By default, the losses are averaged over each loss element in the batch. Note that for some losses, there are multiple elements per sample. If the field :attr:`size_average` @@ -1153,8 +1285,12 @@ class probabilities only when a single class label per minibatch item is too res - Input: Shape :math:`(C)`, :math:`(N, C)` or :math:`(N, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1` in the case of `K`-dimensional loss. - Target: If containing class indices, shape :math:`()`, :math:`(N)` or :math:`(N, d_1, d_2, ..., d_K)` with - :math:`K \geq 1` in the case of K-dimensional loss where each value should be between :math:`[0, C)`. - If containing class probabilities, same shape as the input and each value should be between :math:`[0, 1]`. + :math:`K \geq 1` in the case of K-dimensional loss where each value should be between :math:`[0, C)`. The + target data type is required to be long when using class indices. If containing class probabilities, the + target must be the same shape input, and each value should be between :math:`[0, 1]`. 
This means the target + data type is required to be float when using class probabilities. Note that PyTorch does not strictly enforce + probability constraints on the class probabilities and that it is the user's responsibility to ensure + ``target`` contains valid probability distributions (see below examples section for more details). - Output: If reduction is 'none', shape :math:`()`, :math:`(N)` or :math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1` in the case of K-dimensional loss, depending on the shape of the input. Otherwise, scalar. @@ -1167,7 +1303,7 @@ class probabilities only when a single class label per minibatch item is too res N ={} & \text{batch size} \\ \end{aligned} - Examples:: + Examples: >>> # Example of target with class indices >>> loss = nn.CrossEntropyLoss() @@ -1181,23 +1317,80 @@ class probabilities only when a single class label per minibatch item is too res >>> target = mindtorch.randn(3, 5).softmax(dim=1) >>> output = loss(input, target) >>> output.backward() + + .. note:: + When ``target`` contains class probabilities, it should consist of soft labels—that is, + each ``target`` entry should represent a probability distribution over the possible classes for a given data sample, + with individual probabilities between ``[0,1]`` and the total distribution summing to 1. + This is why the :func:`softmax()` function is applied to the ``target`` in the class probabilities example above. + + PyTorch does not validate whether the values provided in ``target`` lie in the range ``[0,1]`` + or whether the distribution of each data sample sums to ``1``. + No warning will be raised and it is the user's responsibility + to ensure that ``target`` contains valid probability distributions. + Providing arbitrary values may yield misleading loss values and unstable gradients during training. 
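One safe way to build a valid probability target is to start from class indices, since a one-hot row is the degenerate soft label. A minimal sketch, assuming mindtorch exposes ``one_hot`` in ``mindtorch.nn.functional`` as PyTorch does:

    import mindtorch
    import mindtorch.nn as nn
    import mindtorch.nn.functional as F

    loss = nn.CrossEntropyLoss()
    input = mindtorch.randn(3, 5)
    idx = mindtorch.tensor([1, 0, 4])
    # A one-hot probability target should give the same loss as the
    # corresponding class-index target (no label smoothing, default reduction).
    one_hot = F.one_hot(idx, num_classes=5).float()
    print(float(loss(input, idx)), float(loss(input, one_hot)))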
+ + Examples: + >>> # xdoctest: +SKIP + >>> # Example of target with incorrectly specified class probabilities + >>> loss = nn.CrossEntropyLoss() + >>> mindtorch.manual_seed(283) + >>> input = mindtorch.randn(3, 5, requires_grad=True) + >>> target = mindtorch.randn(3, 5) + >>> # Provided target class probabilities are not in range [0,1] + >>> target + tensor([[ 0.7105, 0.4446, 2.0297, 0.2671, -0.6075], + [-1.0496, -0.2753, -0.3586, 0.9270, 1.0027], + [ 0.7551, 0.1003, 1.3468, -0.3581, -0.9569]]) + >>> # Provided target class probabilities do not sum to 1 + >>> target.sum(axis=1) + tensor([2.8444, 0.2462, 0.8873]) + >>> # No error message and possible misleading loss value + >>> loss(input, target).item() + 4.6379876136779785 + >>> + >>> # Example of target with correctly specified class probabilities + >>> # Use .softmax() to ensure true probability distribution + >>> target_new = target.softmax(dim=1) + >>> # New target class probabilities all in range [0,1] + >>> target_new + tensor([[0.1559, 0.1195, 0.5830, 0.1000, 0.0417], + [0.0496, 0.1075, 0.0990, 0.3579, 0.3860], + [0.2607, 0.1355, 0.4711, 0.0856, 0.0471]]) + >>> # New target class probabilities sum to 1 + >>> target_new.sum(axis=1) + tensor([1.0000, 1.0000, 1.0000]) + >>> loss(input, target_new).item() + 2.55349063873291 """ - __constants__ = ['ignore_index', 'reduction', 'label_smoothing'] + + __constants__ = ["ignore_index", "reduction", "label_smoothing"] ignore_index: int label_smoothing: float - def __init__(self, weight: Optional[Tensor] = None, size_average=None, ignore_index: int = -100, - reduce=None, reduction: str = 'mean', label_smoothing: float = 0.0) -> None: + def __init__( + self, + weight: Optional[Tensor] = None, + size_average=None, + ignore_index: int = -100, + reduce=None, + reduction: str = "mean", + label_smoothing: float = 0.0, + ) -> None: super().__init__(weight, size_average, reduce, reduction) self.ignore_index = ignore_index self.label_smoothing = label_smoothing def forward(self, input: Tensor, target: Tensor) -> Tensor: - return F.cross_entropy(input, target, weight=self.weight, - ignore_index=self.ignore_index, reduction=self.reduction, - label_smoothing=self.label_smoothing) - - + """Runs the forward pass.""" + return F.cross_entropy( + input, + target, + weight=self.weight, + ignore_index=self.ignore_index, + reduction=self.reduction, + label_smoothing=self.label_smoothing, + ) class MultiLabelSoftMarginLoss(_WeightedLoss): @@ -1238,12 +1431,23 @@ class MultiLabelSoftMarginLoss(_WeightedLoss): - Target: :math:`(N, C)`, label targets must have the same shape as the input. - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(N)`. """ - __constants__ = ['reduction'] - def forward(self, input: Tensor, target: Tensor) -> Tensor: - return F.multilabel_soft_margin_loss(input, target, weight=self.weight, reduction=self.reduction) + __constants__ = ["reduction"] + def __init__( + self, + weight: Optional[Tensor] = None, + size_average=None, + reduce=None, + reduction: str = "mean", + ) -> None: + super().__init__(weight, size_average, reduce, reduction) + def forward(self, input: Tensor, target: Tensor) -> Tensor: + """Runs the forward pass.""" + return F.multilabel_soft_margin_loss( + input, target, weight=self.weight, reduction=self.reduction + ) class CosineEmbeddingLoss(_Loss): @@ -1288,7 +1492,7 @@ class CosineEmbeddingLoss(_Loss): - Target: :math:`(N)` or :math:`()`. - Output: If :attr:`reduction` is ``'none'``, then :math:`(N)`, otherwise scalar. 
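The definition can be reproduced element-wise with ``cosine_similarity``; a sketch, assuming the mindtorch functional API mirrors PyTorch here:

    import mindtorch
    import mindtorch.nn as nn
    import mindtorch.nn.functional as F

    a = mindtorch.randn(3, 5)
    b = mindtorch.randn(3, 5)
    y = mindtorch.tensor([1, -1, 1])
    loss = nn.CosineEmbeddingLoss(margin=0.0, reduction="none")(a, b, y)
    # By definition: 1 - cos for y == 1, max(0, cos - margin) for y == -1.
    cos = F.cosine_similarity(a, b)
    manual = mindtorch.where(y == 1, 1 - cos, cos.clamp(min=0.0))
    print(loss)
    print(manual)  # expected to match loss element-wise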
- Examples:: + Examples: >>> loss = nn.CosineEmbeddingLoss() >>> input1 = mindtorch.randn(3, 5, requires_grad=True) @@ -1297,17 +1501,25 @@ class CosineEmbeddingLoss(_Loss): >>> output = loss(input1, input2, target) >>> output.backward() """ - __constants__ = ['margin', 'reduction'] + + __constants__ = ["margin", "reduction"] margin: float - def __init__(self, margin: float = 0., size_average=None, reduce=None, reduction: str = 'mean') -> None: + def __init__( + self, + margin: float = 0.0, + size_average=None, + reduce=None, + reduction: str = "mean", + ) -> None: super().__init__(size_average, reduce, reduction) self.margin = margin def forward(self, input1: Tensor, input2: Tensor, target: Tensor) -> Tensor: - return F.cosine_embedding_loss(input1, input2, target, margin=self.margin, reduction=self.reduction) - - + """Runs the forward pass.""" + return F.cosine_embedding_loss( + input1, input2, target, margin=self.margin, reduction=self.reduction + ) class MarginRankingLoss(_Loss): @@ -1347,7 +1559,7 @@ class MarginRankingLoss(_Loss): - Target: :math:`(N)` or :math:`()`, same shape as the inputs. - Output: scalar. If :attr:`reduction` is ``'none'`` and Input size is not :math:`()`, then :math:`(N)`. - Examples:: + Examples: >>> loss = nn.MarginRankingLoss() >>> input1 = mindtorch.randn(3, requires_grad=True) @@ -1356,17 +1568,25 @@ class MarginRankingLoss(_Loss): >>> output = loss(input1, input2, target) >>> output.backward() """ - __constants__ = ['margin', 'reduction'] + + __constants__ = ["margin", "reduction"] margin: float - def __init__(self, margin: float = 0., size_average=None, reduce=None, reduction: str = 'mean') -> None: + def __init__( + self, + margin: float = 0.0, + size_average=None, + reduce=None, + reduction: str = "mean", + ) -> None: super().__init__(size_average, reduce, reduction) self.margin = margin def forward(self, input1: Tensor, input2: Tensor, target: Tensor) -> Tensor: - return F.margin_ranking_loss(input1, input2, target, margin=self.margin, reduction=self.reduction) - - + """Runs the forward pass.""" + return F.margin_ranking_loss( + input1, input2, target, margin=self.margin, reduction=self.reduction + ) class MultiMarginLoss(_WeightedLoss): @@ -1420,7 +1640,7 @@ class MultiMarginLoss(_WeightedLoss): - Target: :math:`(N)` or :math:`()`, where each value is :math:`0 \leq \text{targets}[i] \leq C-1`. - Output: scalar. If :attr:`reduction` is ``'none'``, then same shape as the target. - Examples:: + Examples: >>> loss = nn.MultiMarginLoss() >>> x = mindtorch.tensor([[0.1, 0.2, 0.4, 0.8]]) @@ -1429,16 +1649,24 @@ class MultiMarginLoss(_WeightedLoss): >>> loss(x, y) tensor(0.32...) 
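The ``tensor(0.32...)`` value in the doctest above follows directly from the formula with ``p=1`` and ``margin=1``; a plain-Python hand check:

    # i ranges over the non-target classes; C = 4 classes in total, y = 3.
    x = [0.1, 0.2, 0.4, 0.8]
    y = 3
    terms = [max(0.0, 1.0 - (x[y] - x[i])) for i in range(len(x)) if i != y]
    print(sum(terms) / len(x))  # (0.3 + 0.4 + 0.6) / 4 = 0.325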
""" - __constants__ = ['p', 'margin', 'reduction'] + + __constants__ = ["p", "margin", "reduction"] margin: float p: int - def __init__(self, p: int = 1, margin: float = 1., weight: Optional[Tensor] = None, size_average=None, - reduce=None, reduction: str = 'mean') -> None: + def __init__( + self, + p: int = 1, + margin: float = 1.0, + weight: Optional[Tensor] = None, + size_average=None, + reduce=None, + reduction: str = "mean", + ) -> None: super().__init__(weight, size_average, reduce, reduction) - if p not in (1, 2): + if p != 1 and p != 2: raise ValueError("only p == 1 and p == 2 supported") - if weight is not None and weight.dim() != 1 : + if weight is not None and weight.dim() != 1: raise ValueError( f"MultiMarginLoss: expected weight to be None or 1D tensor, got {weight.dim()}D instead" ) @@ -1446,10 +1674,15 @@ def __init__(self, p: int = 1, margin: float = 1., weight: Optional[Tensor] = No self.margin = margin def forward(self, input: Tensor, target: Tensor) -> Tensor: - return F.multi_margin_loss(input, target, p=self.p, margin=self.margin, - weight=self.weight, reduction=self.reduction) - - + """Runs the forward pass.""" + return F.multi_margin_loss( + input, + target, + p=self.p, + margin=self.margin, + weight=self.weight, + reduction=self.reduction, + ) class TripletMarginLoss(_Loss): @@ -1509,7 +1742,7 @@ class TripletMarginLoss(_Loss): - Output: A Tensor of shape :math:`(N)` if :attr:`reduction` is ``'none'`` and input shape is :math:`(N, D)`; a scalar otherwise. - Examples:: + Examples: >>> triplet_loss = nn.TripletMarginLoss(margin=1.0, p=2, eps=1e-7) >>> anchor = mindtorch.randn(100, 128, requires_grad=True) @@ -1519,16 +1752,25 @@ class TripletMarginLoss(_Loss): >>> output.backward() .. _Learning shallow convolutional feature descriptors with triplet losses: - http://www.bmva.org/bmvc/2016/papers/paper119/index.html + https://bmva-archive.org.uk/bmvc/2016/papers/paper119/index.html """ - __constants__ = ['margin', 'p', 'eps', 'swap', 'reduction'] + + __constants__ = ["margin", "p", "eps", "swap", "reduction"] margin: float p: float eps: float swap: bool - def __init__(self, margin: float = 1.0, p: float = 2., eps: float = 1e-6, swap: bool = False, size_average=None, - reduce=None, reduction: str = 'mean'): + def __init__( + self, + margin: float = 1.0, + p: float = 2.0, + eps: float = 1e-6, + swap: bool = False, + size_average=None, + reduce=None, + reduction: str = "mean", + ) -> None: super().__init__(size_average, reduce, reduction) if margin <= 0: raise ValueError( @@ -1540,10 +1782,17 @@ def __init__(self, margin: float = 1.0, p: float = 2., eps: float = 1e-6, swap: self.swap = swap def forward(self, anchor: Tensor, positive: Tensor, negative: Tensor) -> Tensor: - return F.triplet_margin_loss(anchor, positive, negative, margin=self.margin, p=self.p, - eps=self.eps, swap=self.swap, reduction=self.reduction) - - + """Runs the forward pass.""" + return F.triplet_margin_loss( + anchor, + positive, + negative, + margin=self.margin, + p=self.p, + eps=self.eps, + swap=self.swap, + reduction=self.reduction, + ) class TripletMarginWithDistanceLoss(_Loss): @@ -1606,7 +1855,7 @@ class TripletMarginWithDistanceLoss(_Loss): - Output: A Tensor of shape :math:`(N)` if :attr:`reduction` is ``'none'``, or a scalar otherwise. - Examples:: + Examples: >>> # Initialize embeddings >>> embedding = nn.Embedding(1000, 128) @@ -1642,30 +1891,43 @@ class TripletMarginWithDistanceLoss(_Loss): Reference: V. 
Balntas, et al.: Learning shallow convolutional feature descriptors with triplet losses: - http://www.bmva.org/bmvc/2016/papers/paper119/index.html + https://bmva-archive.org.uk/bmvc/2016/papers/paper119/index.html """ - __constants__ = ['margin', 'swap', 'reduction'] + + __constants__ = ["margin", "swap", "reduction"] margin: float swap: bool - def __init__(self, *, distance_function: Optional[Callable[[Tensor, Tensor], Tensor]] = None, - margin: float = 1.0, swap: bool = False, reduction: str = 'mean'): + def __init__( + self, + *, + distance_function: Optional[Callable[[Tensor, Tensor], Tensor]] = None, + margin: float = 1.0, + swap: bool = False, + reduction: str = "mean", + ) -> None: super().__init__(size_average=None, reduce=None, reduction=reduction) if margin <= 0: raise ValueError( f"TripletMarginWithDistanceLoss: expected margin to be greater than 0, got {margin} instead" ) - self.distance_function: Optional[Callable[[Tensor, Tensor], Tensor]] = \ + self.distance_function: Optional[Callable[[Tensor, Tensor], Tensor]] = ( distance_function if distance_function is not None else PairwiseDistance() + ) self.margin = margin self.swap = swap def forward(self, anchor: Tensor, positive: Tensor, negative: Tensor) -> Tensor: - return F.triplet_margin_with_distance_loss(anchor, positive, negative, - distance_function=self.distance_function, - margin=self.margin, swap=self.swap, reduction=self.reduction) - - + """Runs the forward pass.""" + return F.triplet_margin_with_distance_loss( + anchor, + positive, + negative, + distance_function=self.distance_function, + margin=self.margin, + swap=self.swap, + reduction=self.reduction, + ) class CTCLoss(_Loss): @@ -1725,13 +1987,13 @@ class CTCLoss(_Loss): ``'sum'``. If :attr:`reduction` is ``'none'``, then :math:`(N)` if input is batched or :math:`()` if input is unbatched, where :math:`N = \text{batch size}`. - Examples:: + Examples: >>> # Target are to be padded - >>> T = 50 # Input sequence length - >>> C = 20 # Number of classes (including blank) - >>> N = 16 # Batch size - >>> S = 30 # Target sequence length of longest target in batch (padding length) + >>> T = 50 # Input sequence length + >>> C = 20 # Number of classes (including blank) + >>> N = 16 # Batch size + >>> S = 30 # Target sequence length of longest target in batch (padding length) >>> S_min = 10 # Minimum target length, for demonstration purposes >>> >>> # Initialize random batch of input vectors, for *size = (T,N,C) @@ -1741,16 +2003,21 @@ class CTCLoss(_Loss): >>> target = mindtorch.randint(low=1, high=C, size=(N, S), dtype=mindtorch.long) >>> >>> input_lengths = mindtorch.full(size=(N,), fill_value=T, dtype=mindtorch.long) - >>> target_lengths = mindtorch.randint(low=S_min, high=S, size=(N,), dtype=mindtorch.long) + >>> target_lengths = mindtorch.randint( + ... low=S_min, + ... high=S, + ... size=(N,), + ... dtype=mindtorch.long, + ... 
) >>> ctc_loss = nn.CTCLoss() >>> loss = ctc_loss(input, target, input_lengths, target_lengths) >>> loss.backward() >>> >>> >>> # Target are to be un-padded - >>> T = 50 # Input sequence length - >>> C = 20 # Number of classes (including blank) - >>> N = 16 # Batch size + >>> T = 50 # Input sequence length + >>> C = 20 # Number of classes (including blank) + >>> N = 16 # Batch size >>> >>> # Initialize random batch of input vectors, for *size = (T,N,C) >>> input = mindtorch.randn(T, N, C).log_softmax(2).detach().requires_grad_() @@ -1758,15 +2025,20 @@ class CTCLoss(_Loss): >>> >>> # Initialize random batch of targets (0 = blank, 1:C = classes) >>> target_lengths = mindtorch.randint(low=1, high=T, size=(N,), dtype=mindtorch.long) - >>> target = mindtorch.randint(low=1, high=C, size=(sum(target_lengths),), dtype=mindtorch.long) + >>> target = mindtorch.randint( + ... low=1, + ... high=C, + ... size=(sum(target_lengths),), + ... dtype=mindtorch.long, + ... ) >>> ctc_loss = nn.CTCLoss() >>> loss = ctc_loss(input, target, input_lengths, target_lengths) >>> loss.backward() >>> >>> >>> # Target are to be un-padded and unbatched (effectively N=1) - >>> T = 50 # Input sequence length - >>> C = 20 # Number of classes (including blank) + >>> T = 50 # Input sequence length + >>> C = 20 # Number of classes (including blank) >>> >>> # Initialize random batch of input vectors, for *size = (T,C) >>> # xdoctest: +SKIP("FIXME: error in doctest") @@ -1775,7 +2047,12 @@ class CTCLoss(_Loss): >>> >>> # Initialize random batch of targets (0 = blank, 1:C = classes) >>> target_lengths = mindtorch.randint(low=1, high=T, size=(), dtype=mindtorch.long) - >>> target = mindtorch.randint(low=1, high=C, size=(target_lengths,), dtype=mindtorch.long) + >>> target = mindtorch.randint( + ... low=1, + ... high=C, + ... size=(target_lengths,), + ... dtype=mindtorch.long, + ... ) >>> ctc_loss = nn.CTCLoss() >>> loss = ctc_loss(input, target, input_lengths, target_lengths) >>> loss.backward() @@ -1786,10 +2063,11 @@ class CTCLoss(_Loss): https://www.cs.toronto.edu/~graves/icml_2006.pdf Note: - In order to use CuDNN, the following must be satisfied: :attr:`targets` must be + In order to use CuDNN, the following must be satisfied: the :attr:`targets` must be in concatenated format, all :attr:`input_lengths` must be `T`. :math:`blank=0`, :attr:`target_lengths` :math:`\leq 256`, the integer arguments must be of - dtype :attr:`mindtorch.int32`. + dtype :attr:`mindtorch.int32`, and the :attr:`log_probs` itself must be of + dtype :attr:`mindtorch.float32`. The regular implementation uses the (more common in PyTorch) `mindtorch.long` dtype. @@ -1802,15 +2080,37 @@ class CTCLoss(_Loss): True``. Please see the notes on :doc:`/notes/randomness` for background. 
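The CuDNN conditions in the note above translate into a setup roughly like the following sketch (the dtype choices are the assumption being illustrated; the regular implementation accepts ``mindtorch.long`` instead):

    import mindtorch
    import mindtorch.nn as nn

    T, C, N = 50, 20, 16
    # log_probs must be float32 for the CuDNN kernel; that is the default here.
    log_probs = mindtorch.randn(T, N, C).log_softmax(2)
    # Concatenated (un-padded) targets with int32 lengths, blank index 0,
    # and every input length equal to T.
    target_lengths = mindtorch.randint(low=1, high=T, size=(N,), dtype=mindtorch.int32)
    targets = mindtorch.randint(low=1, high=C, size=(int(target_lengths.sum()),), dtype=mindtorch.int32)
    input_lengths = mindtorch.full(size=(N,), fill_value=T, dtype=mindtorch.int32)
    loss = nn.CTCLoss(blank=0)(log_probs, targets, input_lengths, target_lengths)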
""" - __constants__ = ['blank', 'reduction'] + + __constants__ = ["blank", "reduction"] blank: int zero_infinity: bool - def __init__(self, blank: int = 0, reduction: str = 'mean', zero_infinity: bool = False): + def __init__( + self, blank: int = 0, reduction: str = "mean", zero_infinity: bool = False + ) -> None: super().__init__(reduction=reduction) self.blank = blank self.zero_infinity = zero_infinity - def forward(self, log_probs: Tensor, targets: Tensor, input_lengths: Tensor, target_lengths: Tensor) -> Tensor: - return F.ctc_loss(log_probs, targets, input_lengths, target_lengths, self.blank, self.reduction, - self.zero_infinity) + def forward( + self, + log_probs: Tensor, + targets: Tensor, + input_lengths: Tensor, + target_lengths: Tensor, + ) -> Tensor: + """Runs the forward pass.""" + return F.ctc_loss( + log_probs, + targets, + input_lengths, + target_lengths, + self.blank, + self.reduction, + self.zero_infinity, + ) + + +# TODO: L1HingeEmbeddingCriterion +# TODO: MSECriterion weight +# TODO: ClassSimplexCriterion \ No newline at end of file diff --git a/mindtorch/nn/modules/normalization.py b/mindtorch/nn/modules/normalization.py index b747d8c42..95b441a9d 100644 --- a/mindtorch/nn/modules/normalization.py +++ b/mindtorch/nn/modules/normalization.py @@ -1,25 +1,125 @@ -"""normalization""" -from typing import Optional +# mypy: allow-untyped-defs import numbers -from ..parameter import Parameter +from typing import Optional, Union + +import mindtorch +from mindtorch import Size, Tensor +from mindtorch.nn import functional as F, init +from mindtorch.nn.parameter import Parameter + +# from ._functions import CrossMapLRN2d as _cross_map_lrn2d from .module import Module -from .. import functional as F -from .. import init -from ... import ops + + +__all__ = ["LocalResponseNorm", "CrossMapLRN2d", "LayerNorm", "GroupNorm", "RMSNorm"] + + +class LocalResponseNorm(Module): + r"""Applies local response normalization over an input signal. + + The input signal is composed of several input planes, where channels occupy the second dimension. + Applies normalization across channels. + + .. math:: + b_{c} = a_{c}\left(k + \frac{\alpha}{n} + \sum_{c'=\max(0, c-n/2)}^{\min(N-1,c+n/2)}a_{c'}^2\right)^{-\beta} + + Args: + size: amount of neighbouring channels used for normalization + alpha: multiplicative factor. Default: 0.0001 + beta: exponent. Default: 0.75 + k: additive factor. Default: 1 + + Shape: + - Input: :math:`(N, C, *)` + - Output: :math:`(N, C, *)` (same shape as input) + + Examples:: + + >>> lrn = nn.LocalResponseNorm(2) + >>> signal_2d = mindtorch.randn(32, 5, 24, 24) + >>> signal_4d = mindtorch.randn(16, 5, 7, 7, 7, 7) + >>> output_2d = lrn(signal_2d) + >>> output_4d = lrn(signal_4d) + + """ + + __constants__ = ["size", "alpha", "beta", "k"] + size: int + alpha: float + beta: float + k: float + + def __init__( + self, size: int, alpha: float = 1e-4, beta: float = 0.75, k: float = 1.0 + ) -> None: + super().__init__() + self.size = size + self.alpha = alpha + self.beta = beta + self.k = k + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.local_response_norm(input, self.size, self.alpha, self.beta, self.k) + + def extra_repr(self): + """ + Return the extra representation of the module. 
+ """ + return "{size}, alpha={alpha}, beta={beta}, k={k}".format(**self.__dict__) + + +class CrossMapLRN2d(Module): + size: int + alpha: float + beta: float + k: float + + def __init__( + self, size: int, alpha: float = 1e-4, beta: float = 0.75, k: float = 1 + ) -> None: + super().__init__() + self.size = size + self.alpha = alpha + self.beta = beta + self.k = k + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return _cross_map_lrn2d.apply(input, self.size, self.alpha, self.beta, self.k) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + return "{size}, alpha={alpha}, beta={beta}, k={k}".format(**self.__dict__) + + +_shape_t = Union[int, list[int], Size] class LayerNorm(Module): - r"""Applies Layer Normalization over a mini-batch of inputs as described in - the paper `Layer Normalization`_ . + r"""Applies Layer Normalization over a mini-batch of inputs. + + This layer implements the operation as described in + the paper `Layer Normalization `__ .. math:: y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta - The mean and standard-deviation are calculated separately over the last - certain number dimensions which have to be of the shape specified by - :attr:`normalized_shape`. + The mean and standard-deviation are calculated over the last `D` dimensions, where `D` + is the dimension of :attr:`normalized_shape`. For example, if :attr:`normalized_shape` + is ``(3, 5)`` (a 2-dimensional shape), the mean and standard-deviation are computed over + the last 2 dimensions of the input (i.e. ``input.mean((-2, -1))``). :math:`\gamma` and :math:`\beta` are learnable affine transform parameters of :attr:`normalized_shape` if :attr:`elementwise_affine` is ``True``. + The variance is calculated via the biased estimator, equivalent to + `mindtorch.var(input, unbiased=False)`. .. note:: Unlike Batch Normalization and Instance Normalization, which applies @@ -35,14 +135,25 @@ class LayerNorm(Module): of size .. math:: - [* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] - \times \ldots \times \text{normalized_shape}[-1]] + [* \times \text{normalized\_shape}[0] \times \text{normalized\_shape}[1] + \times \ldots \times \text{normalized\_shape}[-1]] If a single integer is used, it is treated as a singleton list, and this module will normalize over the last dimension which is expected to be of that specific size. eps: a value added to the denominator for numerical stability. Default: 1e-5 elementwise_affine: a boolean value that when set to ``True``, this module - has learnable per-element affine parameters. Default: ``True`` + has learnable per-element affine parameters initialized to ones (for weights) + and zeros (for biases). Default: ``True``. + bias: If set to ``False``, the layer will not learn an additive bias (only relevant if + :attr:`elementwise_affine` is ``True``). Default: ``True``. + + Attributes: + weight: the learnable weights of the module of shape + :math:`\text{normalized\_shape}` when :attr:`elementwise_affine` is set to ``True``. + The values are initialized to 1. + bias: the learnable bias of the module of shape + :math:`\text{normalized\_shape}` when :attr:`elementwise_affine` is set to ``True``. + The values are initialized to 0. 
Shape: - Input: :math:`(N, *)` @@ -50,37 +161,61 @@ class LayerNorm(Module): Examples:: - >>> input = mindtorch.randn(20, 5, 10, 10) - >>> # With Learnable Parameters - >>> m = nn.LayerNorm(input.size()[1:]) - >>> # Without Learnable Parameters - >>> m = nn.LayerNorm(input.size()[1:], elementwise_affine=False) - >>> # Normalize over last two dimensions - >>> m = nn.LayerNorm([10, 10]) - >>> # Normalize over last dimension of size 10 - >>> m = nn.LayerNorm(10) - >>> # Activating the module - >>> output = m(input) + >>> # NLP Example + >>> batch, sentence_length, embedding_dim = 20, 5, 10 + >>> embedding = mindtorch.randn(batch, sentence_length, embedding_dim) + >>> layer_norm = nn.LayerNorm(embedding_dim) + >>> # Activate module + >>> layer_norm(embedding) + >>> + >>> # Image Example + >>> N, C, H, W = 20, 5, 10, 10 + >>> input = mindtorch.randn(N, C, H, W) + >>> # Normalize over the last three dimensions (i.e. the channel and spatial dimensions) + >>> # as shown in the image below + >>> layer_norm = nn.LayerNorm([C, H, W]) + >>> output = layer_norm(input) + + .. image:: ../_static/img/nn/layer_norm.jpg + :scale: 50 % - .. _`Layer Normalization`: https://arxiv.org/abs/1607.06450 """ - def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True, bias: bool = True, dtype=None, device=None): - factory_kwargs = {'dtype': dtype, 'device': device} - super(LayerNorm, self).__init__() + + __constants__ = ["normalized_shape", "eps", "elementwise_affine"] + normalized_shape: tuple[int, ...] + eps: float + elementwise_affine: bool + + def __init__( + self, + normalized_shape: _shape_t, + eps: float = 1e-5, + elementwise_affine: bool = True, + bias: bool = True, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() if isinstance(normalized_shape, numbers.Integral): - normalized_shape = (normalized_shape,) - self.normalized_shape = tuple(normalized_shape) + # mypy error: incompatible types in assignment + normalized_shape = (normalized_shape,) # type: ignore[assignment] + self.normalized_shape = tuple(normalized_shape) # type: ignore[arg-type] self.eps = eps self.elementwise_affine = elementwise_affine if self.elementwise_affine: - self.weight = Parameter(ops.empty(self.normalized_shape, **factory_kwargs)) + self.weight = Parameter( + mindtorch.empty(self.normalized_shape, **factory_kwargs) + ) if bias: - self.bias = Parameter(ops.empty(self.normalized_shape, **factory_kwargs)) + self.bias = Parameter( + mindtorch.empty(self.normalized_shape, **factory_kwargs) + ) else: - self.register_parameter('bias', None) + self.register_parameter("bias", None) else: - self.register_parameter('weight', None) - self.register_parameter('bias', None) + self.register_parameter("weight", None) + self.register_parameter("bias", None) self.reset_parameters() @@ -90,26 +225,35 @@ def reset_parameters(self) -> None: if self.bias is not None: init.zeros_(self.bias) - def forward(self, input): - return F.layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps) + def forward(self, input: Tensor) -> Tensor: + return F.layer_norm( + input, self.normalized_shape, self.weight, self.bias, self.eps + ) - def extra_repr(self): - return '{normalized_shape}, eps={eps}, ' \ - 'elementwise_affine={elementwise_affine}'.format(**self.__dict__) + def extra_repr(self) -> str: + return ( + "{normalized_shape}, eps={eps}, " + "elementwise_affine={elementwise_affine}".format(**self.__dict__) + ) class GroupNorm(Module): - r"""Applies Group Normalization 
over a mini-batch of inputs as described in - the paper `Group Normalization`_ . + r"""Applies Group Normalization over a mini-batch of inputs. + + This layer implements the operation as described in + the paper `Group Normalization `__ .. math:: y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta The input channels are separated into :attr:`num_groups` groups, each containing - ``num_channels / num_groups`` channels. The mean and standard-deviation are calculated - separately over the each group. :math:`\gamma` and :math:`\beta` are learnable - per-channel affine transform parameter vectorss of size :attr:`num_channels` if + ``num_channels / num_groups`` channels. :attr:`num_channels` must be divisible by + :attr:`num_groups`. The mean and standard-deviation are calculated + separately over each group. :math:`\gamma` and :math:`\beta` are learnable + per-channel affine transform parameter vectors of size :attr:`num_channels` if :attr:`affine` is ``True``. + The variance is calculated via the biased estimator, equivalent to + `mindtorch.var(input, unbiased=False)`. This layer uses statistics computed from input data in both training and evaluation modes. @@ -119,11 +263,12 @@ class GroupNorm(Module): num_channels (int): number of channels expected in input eps: a value added to the denominator for numerical stability. Default: 1e-5 affine: a boolean value that when set to ``True``, this module - has learnable per-channel affine parameters. Default: ``True`` + has learnable per-channel affine parameters initialized to ones (for weights) + and zeros (for biases). Default: ``True``. Shape: - - Input: :math:`(N, num\_channels, *)` - - Output: :math:`(N, num\_channels, *)` (same shape as input) + - Input: :math:`(N, C, *)` where :math:`C=\text{num\_channels}` + - Output: :math:`(N, C, *)` (same shape as input) Examples:: @@ -136,37 +281,54 @@ class GroupNorm(Module): >>> m = nn.GroupNorm(1, 6) >>> # Activating the module >>> output = m(input) - - .. 
_`Group Normalization`: https://arxiv.org/abs/1803.08494 """ - def __init__(self, num_groups, num_channels, eps=1e-5, affine=True, dtype=None): - factory_kwargs = {'dtype': dtype} - super(GroupNorm, self).__init__() + + __constants__ = ["num_groups", "num_channels", "eps", "affine"] + num_groups: int + num_channels: int + eps: float + affine: bool + + def __init__( + self, + num_groups: int, + num_channels: int, + eps: float = 1e-5, + affine: bool = True, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + if num_channels % num_groups != 0: + raise ValueError("num_channels must be divisible by num_groups") + self.num_groups = num_groups self.num_channels = num_channels self.eps = eps self.affine = affine if self.affine: - self.weight = Parameter(ops.empty(num_channels, **factory_kwargs)) - self.bias = Parameter(ops.empty(num_channels, **factory_kwargs)) + self.weight = Parameter(mindtorch.empty(num_channels, **factory_kwargs)) + self.bias = Parameter(mindtorch.empty(num_channels, **factory_kwargs)) else: - self.register_parameter('weight', None) - self.register_parameter('bias', None) + self.register_parameter("weight", None) + self.register_parameter("bias", None) self.reset_parameters() - def forward(self, input): - return F.group_norm(input, self.num_groups, self.weight, self.bias, self.eps) - - def reset_parameters(self) -> None: if self.affine: init.ones_(self.weight) init.zeros_(self.bias) - def extra_repr(self): - return '{num_groups}, {num_channels}, eps={eps}, ' \ - 'affine={affine}'.format(**self.__dict__) + def forward(self, input: Tensor) -> Tensor: + return F.group_norm(input, self.num_groups, self.weight, self.bias, self.eps) + + def extra_repr(self) -> str: + return "{num_groups}, {num_channels}, eps={eps}, affine={affine}".format( + **self.__dict__ + ) + class RMSNorm(Module): r"""Applies Root Mean Square Layer Normalization over a mini-batch of inputs. @@ -184,7 +346,7 @@ class RMSNorm(Module): the last 2 dimensions of the input. Args: - normalized_shape (int or list or torch.Size): input shape from an expected input + normalized_shape (int or list or mindtorch.Size): input shape from an expected input of size .. math:: @@ -193,7 +355,7 @@ class RMSNorm(Module): If a single integer is used, it is treated as a singleton list, and this module will normalize over the last dimension which is expected to be of that specific size. - eps: a value added to the denominator for numerical stability. Default: :func:`torch.finfo(x.dtype).eps` + eps: a value added to the denominator for numerical stability. Default: ``mindtorch.finfo(x.dtype).eps`` elementwise_affine: a boolean value that when set to ``True``, this module has learnable per-element affine parameters initialized to ones (for weights). Default: ``True``. @@ -204,10 +366,11 @@ class RMSNorm(Module): Examples:: >>> rms_norm = nn.RMSNorm([2, 3]) - >>> input = torch.randn(2, 2, 3) + >>> input = mindtorch.randn(2, 2, 3) >>> rms_norm(input) """ + __constants__ = ["normalized_shape", "eps", "elementwise_affine"] normalized_shape: tuple[int, ...] 
eps: Optional[float] @@ -215,7 +378,7 @@ class RMSNorm(Module): def __init__( self, - normalized_shape, + normalized_shape: _shape_t, eps: Optional[float] = None, elementwise_affine: bool = True, device=None, @@ -231,7 +394,7 @@ def __init__( self.elementwise_affine = elementwise_affine if self.elementwise_affine: self.weight = Parameter( - ops.empty(self.normalized_shape, **factory_kwargs) + mindtorch.empty(self.normalized_shape, **factory_kwargs) ) else: self.register_parameter("weight", None) @@ -244,17 +407,22 @@ def reset_parameters(self) -> None: if self.elementwise_affine: init.ones_(self.weight) - def forward(self, x): + def forward(self, x: mindtorch.Tensor) -> mindtorch.Tensor: """ - Runs forward pass. + Runs the forward pass. """ return F.rms_norm(x, self.normalized_shape, self.weight, self.eps) def extra_repr(self) -> str: """ - Extra information about the module. + Return the extra representation of the module. """ return ( "{normalized_shape}, eps={eps}, " "elementwise_affine={elementwise_affine}".format(**self.__dict__) ) + + +# TODO: ContrastiveNorm2d +# TODO: DivisiveNorm2d +# TODO: SubtractiveNorm2d \ No newline at end of file diff --git a/mindtorch/nn/modules/padding.py b/mindtorch/nn/modules/padding.py index 4cd65c445..be2dce2ee 100644 --- a/mindtorch/nn/modules/padding.py +++ b/mindtorch/nn/modules/padding.py @@ -1,52 +1,106 @@ -"""padding""" -from typing import Sequence, Tuple +# mypy: allow-untyped-defs +from collections.abc import Sequence + +import mindtorch.nn.functional as F from mindtorch import Tensor +from mindtorch.nn.common_types import _size_2_t, _size_4_t, _size_6_t from .module import Module -from ._utils import _pair, _quadruple, _ntuple -from ..common_types import _size_2_t, _size_4_t, _size_6_t -from .. import functional as F - -class _ConstantPadNd(Module): - __constants__ = ['padding', 'value'] - value: float +from .utils import _ntuple, _pair, _quadruple + + +# TODO: grad_output size asserts in THNN + +__all__ = [ + "CircularPad1d", + "CircularPad2d", + "CircularPad3d", + "ConstantPad1d", + "ConstantPad2d", + "ConstantPad3d", + "ReflectionPad1d", + "ReflectionPad2d", + "ReflectionPad3d", + "ReplicationPad1d", + "ReplicationPad2d", + "ReplicationPad3d", + "ZeroPad1d", + "ZeroPad2d", + "ZeroPad3d", +] + + +class _CircularPadNd(Module): + __constants__ = ["padding"] padding: Sequence[int] - def __init__(self, value: float) -> None: - super().__init__() - self.value = value + def _check_input_dim(self, input): + raise NotImplementedError def forward(self, input: Tensor) -> Tensor: - return F.pad(input, self.padding, 'constant', self.value) + self._check_input_dim(input) + return F.pad(input, self.padding, "circular") def extra_repr(self) -> str: - return f'padding={self.padding}, value={self.value}' + return f"{self.padding}" -class ConstantPad1d(_ConstantPadNd): - r"""Pads the input tensor boundaries with a constant value. + +class CircularPad1d(_CircularPadNd): + r"""Pads the input tensor using circular padding of the input boundary. + + Tensor values at the beginning of the dimension are used to pad the end, + and values at the end are used to pad the beginning. If negative padding is + applied then the ends of the tensor get removed. For `N`-dimensional padding, use :func:`mindtorch.nn.functional.pad()`. Args: padding (int, tuple): the size of the padding. If is `int`, uses the same - padding in both boundaries. If a 2-`tuple`, uses + padding in all boundaries. 
If a 2-`tuple`, uses (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`) + Note that padding size should be less than or equal to the corresponding input dimension. Shape: - Input: :math:`(C, W_{in})` or :math:`(N, C, W_{in})`. - Output: :math:`(C, W_{out})` or :math:`(N, C, W_{out})`, where :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> # xdoctest: +IGNORE_WANT("not sure why xdoctest is choking on this") + >>> m = nn.CircularPad1d(2) + >>> input = mindtorch.arange(8, dtype=mindtorch.float).reshape(1, 2, 4) + >>> input + tensor([[[0., 1., 2., 3.], + [4., 5., 6., 7.]]]) + >>> m(input) + tensor([[[2., 3., 0., 1., 2., 3., 0., 1.], + [6., 7., 4., 5., 6., 7., 4., 5.]]]) + >>> # using different paddings for different sides + >>> m = nn.CircularPad1d((3, 1)) + >>> m(input) + tensor([[[1., 2., 3., 0., 1., 2., 3., 0.], + [5., 6., 7., 4., 5., 6., 7., 4.]]]) """ - padding: Tuple[int, int] + padding: tuple[int, int] - def __init__(self, padding: _size_2_t, value: float): - super().__init__(value) + def __init__(self, padding: _size_2_t) -> None: + super().__init__() self.padding = _pair(padding) -class ConstantPad2d(_ConstantPadNd): - r"""Pads the input tensor boundaries with a constant value. + def _check_input_dim(self, input) -> None: + if input.dim() != 2 and input.dim() != 3: + raise ValueError(f"expected 2D or 3D input (got {input.dim()}D input)") + + +class CircularPad2d(_CircularPadNd): + r"""Pads the input tensor using circular padding of the input boundary. + + Tensor values at the beginning of the dimension are used to pad the end, + and values at the end are used to pad the beginning. If negative padding is + applied then the ends of the tensor get removed. For `N`-dimensional padding, use :func:`mindtorch.nn.functional.pad()`. @@ -54,6 +108,7 @@ class ConstantPad2d(_ConstantPadNd): padding (int, tuple): the size of the padding. If is `int`, uses the same padding in all boundaries. If a 4-`tuple`, uses (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`, :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`) + Note that padding size should be less than or equal to the corresponding input dimension. Shape: - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`. @@ -63,17 +118,49 @@ class ConstantPad2d(_ConstantPadNd): :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + Examples:: + + >>> m = nn.CircularPad2d(2) + >>> input = mindtorch.arange(9, dtype=mindtorch.float).reshape(1, 1, 3, 3) + >>> input + tensor([[[[0., 1., 2.], + [3., 4., 5.], + [6., 7., 8.]]]]) + >>> m(input) + tensor([[[[4., 5., 3., 4., 5., 3., 4.], + [7., 8., 6., 7., 8., 6., 7.], + [1., 2., 0., 1., 2., 0., 1.], + [4., 5., 3., 4., 5., 3., 4.], + [7., 8., 6., 7., 8., 6., 7.], + [1., 2., 0., 1., 2., 0., 1.], + [4., 5., 3., 4., 5., 3., 4.]]]]) + >>> # using different paddings for different sides + >>> m = nn.CircularPad2d((1, 1, 2, 0)) + >>> m(input) + tensor([[[[5., 3., 4., 5., 3.], + [8., 6., 7., 8., 6.], + [2., 0., 1., 2., 0.], + [5., 3., 4., 5., 3.], + [8., 6., 7., 8., 6.]]]]) """ - __constants__ = ['padding', 'value'] - padding: Tuple[int, int, int, int] + padding: tuple[int, int, int, int] - def __init__(self, padding: _size_4_t, value: float) -> None: - super().__init__(value) + def __init__(self, padding: _size_4_t) -> None: + super().__init__() self.padding = _quadruple(padding) -class ConstantPad3d(_ConstantPadNd): - r"""Pads the input tensor boundaries with a constant value. 
+ def _check_input_dim(self, input) -> None: + if input.dim() != 3 and input.dim() != 4: + raise ValueError(f"expected 3D or 4D input (got {input.dim()}D input)") + + +class CircularPad3d(_CircularPadNd): + r"""Pads the input tensor using circular padding of the input boundary. + + Tensor values at the beginning of the dimension are used to pad the end, + and values at the end are used to pad the beginning. If negative padding is + applied then the ends of the tensor get removed. For `N`-dimensional padding, use :func:`mindtorch.nn.functional.pad()`. @@ -83,11 +170,12 @@ class ConstantPad3d(_ConstantPadNd): (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`, :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`, :math:`\text{padding\_front}`, :math:`\text{padding\_back}`) + Note that padding size should be less than or equal to the corresponding input dimension. Shape: - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`. - - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or - :math:`(C, D_{out}, H_{out}, W_{out})`, where + - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or :math:`(C, D_{out}, H_{out}, W_{out})`, + where :math:`D_{out} = D_{in} + \text{padding\_front} + \text{padding\_back}` @@ -95,17 +183,46 @@ class ConstantPad3d(_ConstantPadNd): :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + Examples:: + + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> m = nn.CircularPad3d(3) + >>> input = mindtorch.randn(16, 3, 8, 320, 480) + >>> output = m(input) + >>> # using different paddings for different sides + >>> m = nn.CircularPad3d((3, 3, 6, 6, 1, 1)) + >>> output = m(input) """ - padding: Tuple[int, int, int, int, int, int] + padding: tuple[int, int, int, int, int, int] - def __init__(self, padding: _size_6_t, value: float) -> None: - super().__init__(value) + def __init__(self, padding: _size_6_t) -> None: + super().__init__() self.padding = _ntuple(6)(padding) + def _check_input_dim(self, input) -> None: + if input.dim() != 4 and input.dim() != 5: + raise ValueError(f"expected 4D or 5D input (got {input.dim()}D input)") -class ZeroPad1d(ConstantPad1d): - r"""Pads the input tensor boundaries with zero. + +class _ConstantPadNd(Module): + __constants__ = ["padding", "value"] + value: float + padding: Sequence[int] + + def __init__(self, value: float) -> None: + super().__init__() + self.value = value + + def forward(self, input: Tensor) -> Tensor: + return F.pad(input, self.padding, "constant", self.value) + + def extra_repr(self) -> str: + return f"padding={self.padding}, value={self.value}" + + +class ConstantPad1d(_ConstantPadNd): + r"""Pads the input tensor boundaries with a constant value. For `N`-dimensional padding, use :func:`mindtorch.nn.functional.pad()`. 
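Circular padding wraps values around the dimension, and (as the docstrings above note) negative padding trims the ends instead of padding them. A short sketch using the functional form, assuming the mindtorch `F.pad` signature matches PyTorch:

    import mindtorch
    import mindtorch.nn.functional as F

    x = mindtorch.arange(4, dtype=mindtorch.float).reshape(1, 1, 4)
    print(F.pad(x, (2, 2), mode="circular"))
    # expected: tensor([[[2., 3., 0., 1., 2., 3., 0., 1.]]])
    print(F.pad(x, (-1, -1), mode="circular"))
    # expected: tensor([[[1., 2.]]]) -- one element removed from each end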
@@ -123,42 +240,40 @@ class ZeroPad1d(ConstantPad1d): Examples:: >>> # xdoctest: +IGNORE_WANT("non-deterministic") - >>> m = nn.ZeroPad1d(2) + >>> m = nn.ConstantPad1d(2, 3.5) >>> input = mindtorch.randn(1, 2, 4) >>> input tensor([[[-1.0491, -0.7152, -0.0749, 0.8530], [-1.3287, 1.8966, 0.1466, -0.2771]]]) >>> m(input) - tensor([[[ 0.0000, 0.0000, -1.0491, -0.7152, -0.0749, 0.8530, 0.0000, - 0.0000], - [ 0.0000, 0.0000, -1.3287, 1.8966, 0.1466, -0.2771, 0.0000, - 0.0000]]]) - >>> m = nn.ZeroPad1d(2) + tensor([[[ 3.5000, 3.5000, -1.0491, -0.7152, -0.0749, 0.8530, 3.5000, + 3.5000], + [ 3.5000, 3.5000, -1.3287, 1.8966, 0.1466, -0.2771, 3.5000, + 3.5000]]]) + >>> m = nn.ConstantPad1d(2, 3.5) >>> input = mindtorch.randn(1, 2, 3) >>> input tensor([[[ 1.6616, 1.4523, -1.1255], [-3.6372, 0.1182, -1.8652]]]) >>> m(input) - tensor([[[ 0.0000, 0.0000, 1.6616, 1.4523, -1.1255, 0.0000, 0.0000], - [ 0.0000, 0.0000, -3.6372, 0.1182, -1.8652, 0.0000, 0.0000]]]) + tensor([[[ 3.5000, 3.5000, 1.6616, 1.4523, -1.1255, 3.5000, 3.5000], + [ 3.5000, 3.5000, -3.6372, 0.1182, -1.8652, 3.5000, 3.5000]]]) >>> # using different paddings for different sides - >>> m = nn.ZeroPad1d((3, 1)) + >>> m = nn.ConstantPad1d((3, 1), 3.5) >>> m(input) - tensor([[[ 0.0000, 0.0000, 0.0000, 1.6616, 1.4523, -1.1255, 0.0000], - [ 0.0000, 0.0000, 0.0000, -3.6372, 0.1182, -1.8652, 0.0000]]]) + tensor([[[ 3.5000, 3.5000, 3.5000, 1.6616, 1.4523, -1.1255, 3.5000], + [ 3.5000, 3.5000, 3.5000, -3.6372, 0.1182, -1.8652, 3.5000]]]) """ - padding: Tuple[int, int] - - def __init__(self, padding: _size_2_t) -> None: - super().__init__(padding, 0.) + padding: tuple[int, int] - def extra_repr(self) -> str: - return f'{self.padding}' + def __init__(self, padding: _size_2_t, value: float) -> None: + super().__init__(value) + self.padding = _pair(padding) -class ZeroPad2d(ConstantPad2d): - r"""Pads the input tensor boundaries with zero. +class ConstantPad2d(_ConstantPadNd): + r"""Pads the input tensor boundaries with a constant value. For `N`-dimensional padding, use :func:`mindtorch.nn.functional.pad()`. 
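For the 2D constant and zero padding modules, the 4-tuple is ordered (left, right, top, bottom); a quick shape check illustrating the ordering:

    import mindtorch
    import mindtorch.nn as nn

    x = mindtorch.ones(1, 1, 2, 2)
    m = nn.ConstantPad2d((1, 0, 0, 2), 9.0)  # 1 left, 0 right, 0 top, 2 bottom
    print(m(x).shape)  # expected: (1, 1, 4, 3), i.e. H = 2+0+2, W = 2+1+0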
@@ -178,41 +293,38 @@ class ZeroPad2d(ConstantPad2d): Examples:: >>> # xdoctest: +IGNORE_WANT("non-deterministic") - >>> m = nn.ZeroPad2d(2) - >>> input = mindtorch.randn(1, 1, 3, 3) + >>> m = nn.ConstantPad2d(2, 3.5) + >>> input = mindtorch.randn(1, 2, 2) >>> input - tensor([[[[-0.1678, -0.4418, 1.9466], - [ 0.9604, -0.4219, -0.5241], - [-0.9162, -0.5436, -0.6446]]]]) + tensor([[[ 1.6585, 0.4320], + [-0.8701, -0.4649]]]) >>> m(input) - tensor([[[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [ 0.0000, 0.0000, -0.1678, -0.4418, 1.9466, 0.0000, 0.0000], - [ 0.0000, 0.0000, 0.9604, -0.4219, -0.5241, 0.0000, 0.0000], - [ 0.0000, 0.0000, -0.9162, -0.5436, -0.6446, 0.0000, 0.0000], - [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]]]) + tensor([[[ 3.5000, 3.5000, 3.5000, 3.5000, 3.5000, 3.5000], + [ 3.5000, 3.5000, 3.5000, 3.5000, 3.5000, 3.5000], + [ 3.5000, 3.5000, 1.6585, 0.4320, 3.5000, 3.5000], + [ 3.5000, 3.5000, -0.8701, -0.4649, 3.5000, 3.5000], + [ 3.5000, 3.5000, 3.5000, 3.5000, 3.5000, 3.5000], + [ 3.5000, 3.5000, 3.5000, 3.5000, 3.5000, 3.5000]]]) >>> # using different paddings for different sides - >>> m = nn.ZeroPad2d((1, 1, 2, 0)) + >>> m = nn.ConstantPad2d((3, 0, 2, 1), 3.5) >>> m(input) - tensor([[[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [ 0.0000, -0.1678, -0.4418, 1.9466, 0.0000], - [ 0.0000, 0.9604, -0.4219, -0.5241, 0.0000], - [ 0.0000, -0.9162, -0.5436, -0.6446, 0.0000]]]]) + tensor([[[ 3.5000, 3.5000, 3.5000, 3.5000, 3.5000], + [ 3.5000, 3.5000, 3.5000, 3.5000, 3.5000], + [ 3.5000, 3.5000, 3.5000, 1.6585, 0.4320], + [ 3.5000, 3.5000, 3.5000, -0.8701, -0.4649], + [ 3.5000, 3.5000, 3.5000, 3.5000, 3.5000]]]) """ - padding: Tuple[int, int, int, int] - - def __init__(self, padding: _size_4_t) -> None: - super().__init__(padding, 0.) + __constants__ = ["padding", "value"] + padding: tuple[int, int, int, int] - def extra_repr(self) -> str: - return f'{self.padding}' + def __init__(self, padding: _size_4_t, value: float) -> None: + super().__init__(value) + self.padding = _quadruple(padding) -class ZeroPad3d(ConstantPad3d): - r"""Pads the input tensor boundaries with zero. +class ConstantPad3d(_ConstantPadNd): + r"""Pads the input tensor boundaries with a constant value. For `N`-dimensional padding, use :func:`mindtorch.nn.functional.pad()`. @@ -236,21 +348,20 @@ class ZeroPad3d(ConstantPad3d): Examples:: - >>> m = nn.ZeroPad3d(3) + >>> m = nn.ConstantPad3d(3, 3.5) >>> input = mindtorch.randn(16, 3, 10, 20, 30) >>> output = m(input) >>> # using different paddings for different sides - >>> m = nn.ZeroPad3d((3, 3, 6, 6, 0, 1)) + >>> m = nn.ConstantPad3d((3, 3, 6, 6, 0, 1), 3.5) >>> output = m(input) """ - padding: Tuple[int, int, int, int, int, int] + padding: tuple[int, int, int, int, int, int] - def __init__(self, padding: _size_6_t) -> None: - super().__init__(padding, 0.) + def __init__(self, padding: _size_6_t, value: float) -> None: + super().__init__(value) + self.padding = _ntuple(6)(padding) - def extra_repr(self) -> str: - return f'{self.padding}' class _ReflectionPadNd(Module): __constants__ = ["padding"] @@ -266,7 +377,7 @@ def extra_repr(self) -> str: class ReflectionPad1d(_ReflectionPadNd): r"""Pads the input tensor using the reflection of the input boundary. - For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. 
+ For `N`-dimensional padding, use :func:`mindtorch.nn.functional.pad()`. Args: padding (int, tuple): the size of the padding. If is `int`, uses the same @@ -284,7 +395,7 @@ class ReflectionPad1d(_ReflectionPadNd): >>> m = nn.ReflectionPad1d(2) >>> # xdoctest: +IGNORE_WANT("other tests seem to modify printing styles") - >>> input = torch.arange(8, dtype=torch.float).reshape(1, 2, 4) + >>> input = mindtorch.arange(8, dtype=mindtorch.float).reshape(1, 2, 4) >>> input tensor([[[0., 1., 2., 3.], [4., 5., 6., 7.]]]) @@ -308,7 +419,7 @@ def __init__(self, padding: _size_2_t) -> None: class ReflectionPad2d(_ReflectionPadNd): r"""Pads the input tensor using the reflection of the input boundary. - For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. + For `N`-dimensional padding, use :func:`mindtorch.nn.functional.pad()`. Args: padding (int, tuple): the size of the padding. If is `int`, uses the same @@ -328,7 +439,7 @@ class ReflectionPad2d(_ReflectionPadNd): >>> # xdoctest: +IGNORE_WANT("not sure why xdoctest is choking on this") >>> m = nn.ReflectionPad2d(2) - >>> input = torch.arange(9, dtype=torch.float).reshape(1, 1, 3, 3) + >>> input = mindtorch.arange(9, dtype=mindtorch.float).reshape(1, 1, 3, 3) >>> input tensor([[[[0., 1., 2.], [3., 4., 5.], @@ -361,7 +472,7 @@ def __init__(self, padding: _size_4_t) -> None: class ReflectionPad3d(_ReflectionPadNd): r"""Pads the input tensor using the reflection of the input boundary. - For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. + For `N`-dimensional padding, use :func:`mindtorch.nn.functional.pad()`. Args: padding (int, tuple): the size of the padding. If is `int`, uses the same @@ -386,7 +497,7 @@ class ReflectionPad3d(_ReflectionPadNd): >>> # xdoctest: +IGNORE_WANT("not sure why xdoctest is choking on this") >>> m = nn.ReflectionPad3d(1) - >>> input = torch.arange(8, dtype=torch.float).reshape(1, 1, 2, 2, 2) + >>> input = mindtorch.arange(8, dtype=mindtorch.float).reshape(1, 1, 2, 2, 2) >>> m(input) tensor([[[[[7., 6., 7., 6.], [5., 4., 5., 4.], @@ -427,7 +538,7 @@ def extra_repr(self) -> str: class ReplicationPad1d(_ReplicationPadNd): r"""Pads the input tensor using replication of the input boundary. - For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. + For `N`-dimensional padding, use :func:`mindtorch.nn.functional.pad()`. Args: padding (int, tuple): the size of the padding. If is `int`, uses the same @@ -445,7 +556,7 @@ class ReplicationPad1d(_ReplicationPadNd): >>> # xdoctest: +IGNORE_WANT("not sure why xdoctest is choking on this") >>> m = nn.ReplicationPad1d(2) - >>> input = torch.arange(8, dtype=torch.float).reshape(1, 2, 4) + >>> input = mindtorch.arange(8, dtype=mindtorch.float).reshape(1, 2, 4) >>> input tensor([[[0., 1., 2., 3.], [4., 5., 6., 7.]]]) @@ -469,7 +580,7 @@ def __init__(self, padding: _size_2_t) -> None: class ReplicationPad2d(_ReplicationPadNd): r"""Pads the input tensor using replication of the input boundary. - For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. + For `N`-dimensional padding, use :func:`mindtorch.nn.functional.pad()`. Args: padding (int, tuple): the size of the padding. 
If is `int`, uses the same @@ -489,7 +600,7 @@ class ReplicationPad2d(_ReplicationPadNd): >>> m = nn.ReplicationPad2d(2) >>> # xdoctest: +IGNORE_WANT("non-deterministic") - >>> input = torch.arange(9, dtype=torch.float).reshape(1, 1, 3, 3) + >>> input = mindtorch.arange(9, dtype=mindtorch.float).reshape(1, 1, 3, 3) >>> input tensor([[[[0., 1., 2.], [3., 4., 5.], @@ -522,7 +633,7 @@ def __init__(self, padding: _size_4_t) -> None: class ReplicationPad3d(_ReplicationPadNd): r"""Pads the input tensor using replication of the input boundary. - For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`. + For `N`-dimensional padding, use :func:`mindtorch.nn.functional.pad()`. Args: padding (int, tuple): the size of the padding. If is `int`, uses the same @@ -547,7 +658,7 @@ class ReplicationPad3d(_ReplicationPadNd): >>> # xdoctest: +IGNORE_WANT("non-deterministic") >>> m = nn.ReplicationPad3d(3) - >>> input = torch.randn(16, 3, 8, 320, 480) + >>> input = mindtorch.randn(16, 3, 8, 320, 480) >>> output = m(input) >>> # using different paddings for different sides >>> m = nn.ReplicationPad3d((3, 3, 6, 6, 1, 1)) @@ -558,4 +669,162 @@ class ReplicationPad3d(_ReplicationPadNd): def __init__(self, padding: _size_6_t) -> None: super().__init__() - self.padding = _ntuple(6)(padding) \ No newline at end of file + self.padding = _ntuple(6)(padding) + + +class ZeroPad1d(ConstantPad1d): + r"""Pads the input tensor boundaries with zero. + + For `N`-dimensional padding, use :func:`mindtorch.nn.functional.pad()`. + + Args: + padding (int, tuple): the size of the padding. If is `int`, uses the same + padding in both boundaries. If a 2-`tuple`, uses + (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`) + + Shape: + - Input: :math:`(C, W_{in})` or :math:`(N, C, W_{in})`. + - Output: :math:`(C, W_{out})` or :math:`(N, C, W_{out})`, where + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> m = nn.ZeroPad1d(2) + >>> input = mindtorch.randn(1, 2, 4) + >>> input + tensor([[[-1.0491, -0.7152, -0.0749, 0.8530], + [-1.3287, 1.8966, 0.1466, -0.2771]]]) + >>> m(input) + tensor([[[ 0.0000, 0.0000, -1.0491, -0.7152, -0.0749, 0.8530, 0.0000, + 0.0000], + [ 0.0000, 0.0000, -1.3287, 1.8966, 0.1466, -0.2771, 0.0000, + 0.0000]]]) + >>> m = nn.ZeroPad1d(2) + >>> input = mindtorch.randn(1, 2, 3) + >>> input + tensor([[[ 1.6616, 1.4523, -1.1255], + [-3.6372, 0.1182, -1.8652]]]) + >>> m(input) + tensor([[[ 0.0000, 0.0000, 1.6616, 1.4523, -1.1255, 0.0000, 0.0000], + [ 0.0000, 0.0000, -3.6372, 0.1182, -1.8652, 0.0000, 0.0000]]]) + >>> # using different paddings for different sides + >>> m = nn.ZeroPad1d((3, 1)) + >>> m(input) + tensor([[[ 0.0000, 0.0000, 0.0000, 1.6616, 1.4523, -1.1255, 0.0000], + [ 0.0000, 0.0000, 0.0000, -3.6372, 0.1182, -1.8652, 0.0000]]]) + """ + + padding: tuple[int, int] + + def __init__(self, padding: _size_2_t) -> None: + super().__init__(padding, 0.0) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + return f"{self.padding}" + + +class ZeroPad2d(ConstantPad2d): + r"""Pads the input tensor boundaries with zero. + + For `N`-dimensional padding, use :func:`mindtorch.nn.functional.pad()`. + + Args: + padding (int, tuple): the size of the padding. If is `int`, uses the same + padding in all boundaries. 
If a 4-`tuple`, uses (:math:`\text{padding\_left}`, + :math:`\text{padding\_right}`, :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`) + + Shape: + - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`. + - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where + + :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}` + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> m = nn.ZeroPad2d(2) + >>> input = mindtorch.randn(1, 1, 3, 3) + >>> input + tensor([[[[-0.1678, -0.4418, 1.9466], + [ 0.9604, -0.4219, -0.5241], + [-0.9162, -0.5436, -0.6446]]]]) + >>> m(input) + tensor([[[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [ 0.0000, 0.0000, -0.1678, -0.4418, 1.9466, 0.0000, 0.0000], + [ 0.0000, 0.0000, 0.9604, -0.4219, -0.5241, 0.0000, 0.0000], + [ 0.0000, 0.0000, -0.9162, -0.5436, -0.6446, 0.0000, 0.0000], + [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]]]) + >>> # using different paddings for different sides + >>> m = nn.ZeroPad2d((1, 1, 2, 0)) + >>> m(input) + tensor([[[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [ 0.0000, -0.1678, -0.4418, 1.9466, 0.0000], + [ 0.0000, 0.9604, -0.4219, -0.5241, 0.0000], + [ 0.0000, -0.9162, -0.5436, -0.6446, 0.0000]]]]) + """ + + padding: tuple[int, int, int, int] + + def __init__(self, padding: _size_4_t) -> None: + super().__init__(padding, 0.0) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ + return f"{self.padding}" + + +class ZeroPad3d(ConstantPad3d): + r"""Pads the input tensor boundaries with zero. + + For `N`-dimensional padding, use :func:`mindtorch.nn.functional.pad()`. + + Args: + padding (int, tuple): the size of the padding. If is `int`, uses the same + padding in all boundaries. If a 6-`tuple`, uses + (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`, + :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`, + :math:`\text{padding\_front}`, :math:`\text{padding\_back}`) + + Shape: + - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`. + - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or + :math:`(C, D_{out}, H_{out}, W_{out})`, where + + :math:`D_{out} = D_{in} + \text{padding\_front} + \text{padding\_back}` + + :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}` + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Examples:: + + >>> m = nn.ZeroPad3d(3) + >>> input = mindtorch.randn(16, 3, 10, 20, 30) + >>> output = m(input) + >>> # using different paddings for different sides + >>> m = nn.ZeroPad3d((3, 3, 6, 6, 0, 1)) + >>> output = m(input) + """ + + padding: tuple[int, int, int, int, int, int] + + def __init__(self, padding: _size_6_t) -> None: + super().__init__(padding, 0.0) + + def extra_repr(self) -> str: + """ + Return the extra representation of the module. 
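+        For example, ``nn.ZeroPad3d((3, 3, 6, 6, 0, 1))`` reports ``(3, 3, 6, 6, 0, 1)``.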
+ """ + return f"{self.padding}" \ No newline at end of file diff --git a/mindtorch/nn/modules/pixelshuffle.py b/mindtorch/nn/modules/pixelshuffle.py index 175799f87..b533a9cab 100644 --- a/mindtorch/nn/modules/pixelshuffle.py +++ b/mindtorch/nn/modules/pixelshuffle.py @@ -1,10 +1,11 @@ -"""pixel shuffle""" +import mindtorch.nn.functional as F from mindtorch import Tensor + from .module import Module -from .. import functional as F -__all__ = ['PixelShuffle', 'PixelUnshuffle'] +__all__ = ["PixelShuffle", "PixelUnshuffle"] + class PixelShuffle(Module): r"""Rearrange elements in a tensor according to an upscaling factor. @@ -47,7 +48,7 @@ class PixelShuffle(Module): https://arxiv.org/abs/1609.05158 """ - __constants__ = ['upscale_factor'] + __constants__ = ["upscale_factor"] upscale_factor: int def __init__(self, upscale_factor: int) -> None: @@ -55,11 +56,16 @@ def __init__(self, upscale_factor: int) -> None: self.upscale_factor = upscale_factor def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ return F.pixel_shuffle(input, self.upscale_factor) def extra_repr(self) -> str: - return f'upscale_factor={self.upscale_factor}' - + """ + Return the extra representation of the module. + """ + return f"upscale_factor={self.upscale_factor}" class PixelUnshuffle(Module): @@ -101,7 +107,7 @@ class PixelUnshuffle(Module): https://arxiv.org/abs/1609.05158 """ - __constants__ = ['downscale_factor'] + __constants__ = ["downscale_factor"] downscale_factor: int def __init__(self, downscale_factor: int) -> None: @@ -109,7 +115,13 @@ def __init__(self, downscale_factor: int) -> None: self.downscale_factor = downscale_factor def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ return F.pixel_unshuffle(input, self.downscale_factor) def extra_repr(self) -> str: - return f'downscale_factor={self.downscale_factor}' + """ + Return the extra representation of the module. + """ + return f"downscale_factor={self.downscale_factor}" \ No newline at end of file diff --git a/mindtorch/nn/modules/pooling.py b/mindtorch/nn/modules/pooling.py index 7069ee047..bc731181b 100644 --- a/mindtorch/nn/modules/pooling.py +++ b/mindtorch/nn/modules/pooling.py @@ -1,23 +1,68 @@ -"""pooling""" -# pylint: disable=unused-import from typing import Optional + +import mindtorch.nn.functional as F from mindtorch import Tensor +from mindtorch.nn.common_types import ( + _ratio_2_t, + _ratio_3_t, + _size_1_t, + _size_2_opt_t, + _size_2_t, + _size_3_opt_t, + _size_3_t, + _size_any_opt_t, + _size_any_t, +) from .module import Module -from ._utils import _single -from ..common_types import (_size_any_t, _size_1_t, _size_2_t, _size_3_t, - _ratio_3_t, _ratio_2_t, _size_any_opt_t, _size_2_opt_t, _size_3_opt_t) -from .. 
import functional as F +from .utils import _pair, _single, _triple + + +__all__ = [ + "MaxPool1d", + "MaxPool2d", + "MaxPool3d", + "MaxUnpool1d", + "MaxUnpool2d", + "MaxUnpool3d", + "AvgPool1d", + "AvgPool2d", + "AvgPool3d", + "FractionalMaxPool2d", + "FractionalMaxPool3d", + "LPPool1d", + "LPPool2d", + "LPPool3d", + "AdaptiveMaxPool1d", + "AdaptiveMaxPool2d", + "AdaptiveMaxPool3d", + "AdaptiveAvgPool1d", + "AdaptiveAvgPool2d", + "AdaptiveAvgPool3d", +] + class _MaxPoolNd(Module): - __constants__ = ['kernel_size', 'stride', 'padding', 'dilation', - 'return_indices', 'ceil_mode'] + __constants__ = [ + "kernel_size", + "stride", + "padding", + "dilation", + "return_indices", + "ceil_mode", + ] return_indices: bool ceil_mode: bool - def __init__(self, kernel_size: _size_any_t, stride: Optional[_size_any_t] = None, - padding: _size_any_t = 0, dilation: _size_any_t = 1, - return_indices: bool = False, ceil_mode: bool = False) -> None: + def __init__( + self, + kernel_size: _size_any_t, + stride: Optional[_size_any_t] = None, + padding: _size_any_t = 0, + dilation: _size_any_t = 1, + return_indices: bool = False, + ceil_mode: bool = False, + ) -> None: super().__init__() self.kernel_size = kernel_size self.stride = stride if (stride is not None) else kernel_size @@ -27,8 +72,10 @@ def __init__(self, kernel_size: _size_any_t, stride: Optional[_size_any_t] = Non self.ceil_mode = ceil_mode def extra_repr(self) -> str: - return 'kernel_size={kernel_size}, stride={stride}, padding={padding}' \ - ', dilation={dilation}, ceil_mode={ceil_mode}'.format(**self.__dict__) + return ( + "kernel_size={kernel_size}, stride={stride}, padding={padding}" + ", dilation={dilation}, ceil_mode={ceil_mode}".format(**self.__dict__) + ) class MaxPool1d(_MaxPoolNd): @@ -61,11 +108,22 @@ class MaxPool1d(_MaxPoolNd): Shape: - Input: :math:`(N, C, L_{in})` or :math:`(C, L_{in})`. - - Output: :math:`(N, C, L_{out})` or :math:`(C, L_{out})`, where + - Output: :math:`(N, C, L_{out})` or :math:`(C, L_{out})`, + + where ``ceil_mode = False`` .. math:: L_{out} = \left\lfloor \frac{L_{in} + 2 \times \text{padding} - \text{dilation} - \times (\text{kernel\_size} - 1) - 1}{\text{stride}} + 1\right\rfloor + \times (\text{kernel\_size} - 1) - 1}{\text{stride}}\right\rfloor + 1 + + where ``ceil_mode = True`` + + .. math:: + L_{out} = \left\lceil \frac{L_{in} + 2 \times \text{padding} - \text{dilation} + \times (\text{kernel\_size} - 1) - 1 + (stride - 1)}{\text{stride}}\right\rceil + 1 + + - Ensure that the last pooling starts inside the image, make :math:`L_{out} = L_{out} - 1` + when :math:`(L_{out} - 1) * \text{stride} >= L_{in} + \text{padding}`. 
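+          - With ``ceil_mode = False``, for example, ``kernel_size=3``, ``stride=2``,
+            ``padding=0`` and ``dilation=1`` on :math:`L_{in} = 50` give
+            :math:`L_{out} = \lfloor (50 - 2 - 1) / 2 \rfloor + 1 = 24`.
+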
 Examples::
@@ -84,9 +142,16 @@ class MaxPool1d(_MaxPoolNd):
     dilation: _size_1_t
 
     def forward(self, input: Tensor):
-        return F.max_pool1d(input, self.kernel_size, self.stride,
-                            self.padding, self.dilation, ceil_mode=self.ceil_mode,
-                            return_indices=self.return_indices)
+        """Runs the forward pass."""
+        return F.max_pool1d(
+            input,
+            self.kernel_size,
+            self.stride,
+            self.padding,
+            self.dilation,
+            ceil_mode=self.ceil_mode,
+            return_indices=self.return_indices,
+        )
 
 
 class MaxPool2d(_MaxPoolNd):
@@ -157,9 +222,16 @@ class MaxPool2d(_MaxPoolNd):
     dilation: _size_2_t
 
     def forward(self, input: Tensor):
-        return F.max_pool2d(input, self.kernel_size, self.stride,
-                            self.padding, self.dilation, ceil_mode=self.ceil_mode,
-                            return_indices=self.return_indices)
+        """Runs the forward pass."""
+        return F.max_pool2d(
+            input,
+            self.kernel_size,
+            self.stride,
+            self.padding,
+            self.dilation,
+            ceil_mode=self.ceil_mode,
+            return_indices=self.return_indices,
+        )
 
 
 class MaxPool3d(_MaxPoolNd):
@@ -234,93 +306,292 @@ class MaxPool3d(_MaxPoolNd):
     dilation: _size_3_t
 
     def forward(self, input: Tensor):
-        return F.max_pool3d(input, self.kernel_size, self.stride,
-                            self.padding, self.dilation, ceil_mode=self.ceil_mode,
-                            return_indices=self.return_indices)
+        """Runs the forward pass."""
+        return F.max_pool3d(
+            input,
+            self.kernel_size,
+            self.stride,
+            self.padding,
+            self.dilation,
+            ceil_mode=self.ceil_mode,
+            return_indices=self.return_indices,
+        )
+
+
+class _MaxUnpoolNd(Module):
+    def extra_repr(self) -> str:
+        return f"kernel_size={self.kernel_size}, stride={self.stride}, padding={self.padding}"
 
 
-class _AdaptiveAvgPoolNd(Module):
-    __constants__ = ['output_size']
+class MaxUnpool1d(_MaxUnpoolNd):
+    r"""Computes a partial inverse of :class:`MaxPool1d`.
 
-    def __init__(self, output_size: _size_any_opt_t) -> None:
+    :class:`MaxPool1d` is not fully invertible, since the non-maximal values are lost.
+
+    :class:`MaxUnpool1d` takes in as input the output of :class:`MaxPool1d`
+    including the indices of the maximal values and computes a partial inverse
+    in which all non-maximal values are set to zero.
+
+    Note:
+        This operation may behave nondeterministically when the input indices have repeat values.
+        See https://github.com/pytorch/pytorch/issues/80827 and :doc:`/notes/randomness` for more information.
+
+    .. note:: :class:`MaxPool1d` can map several input sizes to the same output
+        sizes. Hence, the inversion process can get ambiguous.
+        To accommodate this, you can provide the needed output size
+        as an additional argument :attr:`output_size` in the forward call.
+        See the Inputs and Example below.
+
+    Args:
+        kernel_size (int or tuple): Size of the max pooling window.
+        stride (int or tuple): Stride of the max pooling window.
+            It is set to :attr:`kernel_size` by default.
+        padding (int or tuple): Padding that was added to the input
+
+    Inputs:
+        - `input`: the input Tensor to invert
+        - `indices`: the indices given out by :class:`~mindtorch.nn.MaxPool1d`
+        - `output_size` (optional): the targeted output size
+
+    Shape:
+        - Input: :math:`(N, C, H_{in})` or :math:`(C, H_{in})`.
+        - Output: :math:`(N, C, H_{out})` or :math:`(C, H_{out})`, where
+
+          .. 
math::
+              H_{out} = (H_{in} - 1) \times \text{stride}[0] - 2 \times \text{padding}[0] + \text{kernel\_size}[0]
+
+          or as given by :attr:`output_size` in the call operator
+
+    Example::
+
+        >>> # xdoctest: +IGNORE_WANT("do other tests modify the global state?")
+        >>> pool = nn.MaxPool1d(2, stride=2, return_indices=True)
+        >>> unpool = nn.MaxUnpool1d(2, stride=2)
+        >>> input = mindtorch.tensor([[[1., 2, 3, 4, 5, 6, 7, 8]]])
+        >>> output, indices = pool(input)
+        >>> unpool(output, indices)
+        tensor([[[ 0.,  2.,  0.,  4.,  0.,  6.,  0.,  8.]]])
+
+        >>> # Example showcasing the use of output_size
+        >>> input = mindtorch.tensor([[[1., 2, 3, 4, 5, 6, 7, 8, 9]]])
+        >>> output, indices = pool(input)
+        >>> unpool(output, indices, output_size=input.size())
+        tensor([[[ 0.,  2.,  0.,  4.,  0.,  6.,  0.,  8.,  0.]]])
+
+        >>> unpool(output, indices)
+        tensor([[[ 0.,  2.,  0.,  4.,  0.,  6.,  0.,  8.]]])
+    """
+
+    kernel_size: _size_1_t
+    stride: _size_1_t
+    padding: _size_1_t
+
+    def __init__(
+        self,
+        kernel_size: _size_1_t,
+        stride: Optional[_size_1_t] = None,
+        padding: _size_1_t = 0,
+    ) -> None:
         super().__init__()
-        self.output_size = output_size
+        self.kernel_size = _single(kernel_size)
+        self.stride = _single(stride if (stride is not None) else kernel_size)
+        self.padding = _single(padding)
 
-    def extra_repr(self) -> str:
-        return f'output_size={self.output_size}'
+    def forward(
+        self, input: Tensor, indices: Tensor, output_size: Optional[list[int]] = None
+    ) -> Tensor:
+        """Runs the forward pass."""
+        return F.max_unpool1d(
+            input, indices, self.kernel_size, self.stride, self.padding, output_size
+        )
 
 
-class AdaptiveAvgPool2d(_AdaptiveAvgPoolNd):
-    r"""Applies a 2D adaptive average pooling over an input signal composed of several input planes.
+class MaxUnpool2d(_MaxUnpoolNd):
+    r"""Computes a partial inverse of :class:`MaxPool2d`.
 
-    The output is of size H x W, for any input size.
-    The number of output features is equal to the number of input planes.
+    :class:`MaxPool2d` is not fully invertible, since the non-maximal values are lost.
+
+    :class:`MaxUnpool2d` takes in as input the output of :class:`MaxPool2d`
+    including the indices of the maximal values and computes a partial inverse
+    in which all non-maximal values are set to zero.
+
+    Note:
+        This operation may behave nondeterministically when the input indices have repeat values.
+        See https://github.com/pytorch/pytorch/issues/80827 and :doc:`/notes/randomness` for more information.
+
+    .. note:: :class:`MaxPool2d` can map several input sizes to the same output
+        sizes. Hence, the inversion process can get ambiguous.
+        To accommodate this, you can provide the needed output size
+        as an additional argument :attr:`output_size` in the forward call.
+        See the Inputs and Example below.
 
     Args:
-        output_size: the target output size of the image of the form H x W.
-            Can be a tuple (H, W) or a single H for a square image H x H.
-            H and W can be either a ``int``, or ``None`` which means the size will
-            be the same as that of the input.
+        kernel_size (int or tuple): Size of the max pooling window.
+        stride (int or tuple): Stride of the max pooling window.
+            It is set to :attr:`kernel_size` by default.
+        padding (int or tuple): Padding that was added to the input
+
+    Inputs:
+        - `input`: the input Tensor to invert
+        - `indices`: the indices given out by :class:`~mindtorch.nn.MaxPool2d`
+        - `output_size` (optional): the targeted output size
 
     Shape:
         - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`.
-        - Output: :math:`(N, C, S_{0}, S_{1})` or :math:`(C, S_{0}, S_{1})`, where
-          :math:`S=\text{output\_size}`.
+        - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where
+
+          .. math::
+              H_{out} = (H_{in} - 1) \times \text{stride[0]} - 2 \times \text{padding[0]} + \text{kernel\_size[0]}
+
+          .. math::
+              W_{out} = (W_{in} - 1) \times \text{stride[1]} - 2 \times \text{padding[1]} + \text{kernel\_size[1]}
+
+          or as given by :attr:`output_size` in the call operator
+
+    Example::
+
+        >>> pool = nn.MaxPool2d(2, stride=2, return_indices=True)
+        >>> unpool = nn.MaxUnpool2d(2, stride=2)
+        >>> input = mindtorch.tensor([[[[ 1.,  2.,  3.,  4.],
+                                        [ 5.,  6.,  7.,  8.],
+                                        [ 9., 10., 11., 12.],
+                                        [13., 14., 15., 16.]]]])
+        >>> output, indices = pool(input)
+        >>> unpool(output, indices)
+        tensor([[[[ 0.,  0.,  0.,  0.],
+                  [ 0.,  6.,  0.,  8.],
+                  [ 0.,  0.,  0.,  0.],
+                  [ 0., 14.,  0., 16.]]]])
+        >>> # Now using output_size to resolve an ambiguous size for the inverse
+        >>> input = mindtorch.tensor([[[[ 1.,  2.,  3.,  4.,  5.],
+                                        [ 6.,  7.,  8.,  9., 10.],
+                                        [11., 12., 13., 14., 15.],
+                                        [16., 17., 18., 19., 20.]]]])
+        >>> output, indices = pool(input)
+        >>> # This call will not work without specifying output_size
+        >>> unpool(output, indices, output_size=input.size())
+        tensor([[[[ 0.,  0.,  0.,  0.,  0.],
+                  [ 0.,  7.,  0.,  9.,  0.],
+                  [ 0.,  0.,  0.,  0.,  0.],
+                  [ 0., 17.,  0., 19.,  0.]]]])
 
-    Examples:
-        >>> # target output size of 5x7
-        >>> m = nn.AdaptiveAvgPool2d((5, 7))
-        >>> input = mindtorch.randn(1, 64, 8, 9)
-        >>> output = m(input)
-        >>> # target output size of 7x7 (square)
-        >>> m = nn.AdaptiveAvgPool2d(7)
-        >>> input = mindtorch.randn(1, 64, 10, 9)
-        >>> output = m(input)
-        >>> # target output size of 10x7
-        >>> m = nn.AdaptiveAvgPool2d((None, 7))
-        >>> input = mindtorch.randn(1, 64, 10, 9)
-        >>> output = m(input)
     """
 
-    output_size: _size_2_opt_t
+    kernel_size: _size_2_t
+    stride: _size_2_t
+    padding: _size_2_t
 
-    def forward(self, input: Tensor) -> Tensor:
-        return F.adaptive_avg_pool2d(input, self.output_size)
+    def __init__(
+        self,
+        kernel_size: _size_2_t,
+        stride: Optional[_size_2_t] = None,
+        padding: _size_2_t = 0,
+    ) -> None:
+        super().__init__()
+        self.kernel_size = _pair(kernel_size)
+        self.stride = _pair(stride if (stride is not None) else kernel_size)
+        self.padding = _pair(padding)
 
-class AdaptiveAvgPool1d(_AdaptiveAvgPoolNd):
-    r"""Applies a 1D adaptive average pooling over an input signal composed of several input planes.
+    def forward(
+        self, input: Tensor, indices: Tensor, output_size: Optional[list[int]] = None
+    ) -> Tensor:
+        """Runs the forward pass."""
+        return F.max_unpool2d(
+            input, indices, self.kernel_size, self.stride, self.padding, output_size
+        )
 
-    The output size is :math:`L_{out}`, for any input size.
-    The number of output features is equal to the number of input planes.
+
+class MaxUnpool3d(_MaxUnpoolNd):
+    r"""Computes a partial inverse of :class:`MaxPool3d`.
+
+    :class:`MaxPool3d` is not fully invertible, since the non-maximal values are lost.
+    :class:`MaxUnpool3d` takes in as input the output of :class:`MaxPool3d`
+    including the indices of the maximal values and computes a partial inverse
+    in which all non-maximal values are set to zero.
+
+    Note:
+        This operation may behave nondeterministically when the input indices have repeat values.
+        See https://github.com/pytorch/pytorch/issues/80827 and :doc:`/notes/randomness` for more information.
+
+    .. note:: :class:`MaxPool3d` can map several input sizes to the same output
+        sizes. 
Hence, the inversion process can get ambiguous. + To accommodate this, you can provide the needed output size + as an additional argument :attr:`output_size` in the forward call. + See the Inputs section below. Args: - output_size: the target output size :math:`L_{out}`. + kernel_size (int or tuple): Size of the max pooling window. + stride (int or tuple): Stride of the max pooling window. + It is set to :attr:`kernel_size` by default. + padding (int or tuple): Padding that was added to the input + + Inputs: + - `input`: the input Tensor to invert + - `indices`: the indices given out by :class:`~mindtorch.nn.MaxPool3d` + - `output_size` (optional): the targeted output size Shape: - - Input: :math:`(N, C, L_{in})` or :math:`(C, L_{in})`. - - Output: :math:`(N, C, L_{out})` or :math:`(C, L_{out})`, where - :math:`L_{out}=\text{output\_size}`. + - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`. + - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or :math:`(C, D_{out}, H_{out}, W_{out})`, where - Examples: - >>> # target output size of 5 - >>> m = nn.AdaptiveAvgPool1d(5) - >>> input = mindtorch.randn(1, 64, 8) - >>> output = m(input) + .. math:: + D_{out} = (D_{in} - 1) \times \text{stride[0]} - 2 \times \text{padding[0]} + \text{kernel\_size[0]} + + .. math:: + H_{out} = (H_{in} - 1) \times \text{stride[1]} - 2 \times \text{padding[1]} + \text{kernel\_size[1]} + .. math:: + W_{out} = (W_{in} - 1) \times \text{stride[2]} - 2 \times \text{padding[2]} + \text{kernel\_size[2]} + + or as given by :attr:`output_size` in the call operator + + Example:: + + >>> # pool of square window of size=3, stride=2 + >>> pool = nn.MaxPool3d(3, stride=2, return_indices=True) + >>> unpool = nn.MaxUnpool3d(3, stride=2) + >>> output, indices = pool(mindtorch.randn(20, 16, 51, 33, 15)) + >>> unpooled_output = unpool(output, indices) + >>> unpooled_output.size() + mindtorch.Size([20, 16, 51, 33, 15]) """ - output_size: _size_1_t + kernel_size: _size_3_t + stride: _size_3_t + padding: _size_3_t - def forward(self, input: Tensor) -> Tensor: - return F.adaptive_avg_pool1d(input, self.output_size) + def __init__( + self, + kernel_size: _size_3_t, + stride: Optional[_size_3_t] = None, + padding: _size_3_t = 0, + ) -> None: + super().__init__() + self.kernel_size = _triple(kernel_size) + self.stride = _triple(stride if (stride is not None) else kernel_size) + self.padding = _triple(padding) + + def forward( + self, input: Tensor, indices: Tensor, output_size: Optional[list[int]] = None + ) -> Tensor: + """Runs the forward pass.""" + return F.max_unpool3d( + input, indices, self.kernel_size, self.stride, self.padding, output_size + ) class _AvgPoolNd(Module): - __constants__ = ['kernel_size', 'stride', 'padding', 'ceil_mode', 'count_include_pad'] + __constants__ = [ + "kernel_size", + "stride", + "padding", + "ceil_mode", + "count_include_pad", + ] def extra_repr(self) -> str: - return f'kernel_size={self.kernel_size}, stride={self.stride}, padding={self.padding}' + return f"kernel_size={self.kernel_size}, stride={self.stride}, padding={self.padding}" class AvgPool1d(_AvgPoolNd): @@ -342,6 +613,9 @@ class AvgPool1d(_AvgPoolNd): When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding or the input. Sliding windows that would start in the right padded region are ignored. + .. note:: + pad should be at most half of effective kernel size. 
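+        For example, ``kernel_size=3`` admits a ``padding`` of at most 1.
+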
+ The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding` can each be an ``int`` or a one-element tuple. @@ -378,8 +652,14 @@ class AvgPool1d(_AvgPoolNd): ceil_mode: bool count_include_pad: bool - def __init__(self, kernel_size: _size_1_t, stride: _size_1_t = None, padding: _size_1_t = 0, ceil_mode: bool = False, - count_include_pad: bool = True) -> None: + def __init__( + self, + kernel_size: _size_1_t, + stride: _size_1_t = None, + padding: _size_1_t = 0, + ceil_mode: bool = False, + count_include_pad: bool = True, + ) -> None: super().__init__() self.kernel_size = _single(kernel_size) self.stride = _single(stride if stride is not None else kernel_size) @@ -388,9 +668,15 @@ def __init__(self, kernel_size: _size_1_t, stride: _size_1_t = None, padding: _s self.count_include_pad = count_include_pad def forward(self, input: Tensor) -> Tensor: + """Runs the forward pass.""" return F.avg_pool1d( - input, self.kernel_size[0], self.stride[0], self.padding[0], self.ceil_mode, - self.count_include_pad) + input, + self.kernel_size, + self.stride, + self.padding, + self.ceil_mode, + self.count_include_pad, + ) class AvgPool2d(_AvgPoolNd): @@ -412,9 +698,12 @@ class AvgPool2d(_AvgPoolNd): When ceil_mode=True, sliding windows are allowed to go off-bounds if they start within the left padding or the input. Sliding windows that would start in the right padded region are ignored. + .. note:: + pad should be at most half of effective kernel size. + The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding` can either be: - - a single ``int`` -- in which case the same value is used for the height and width dimension + - a single ``int`` or a single-element tuple -- in which case the same value is used for the height and width dimension - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension, and the second `int` for the width dimension @@ -455,7 +744,14 @@ class AvgPool2d(_AvgPoolNd): >>> output = m(input) """ - __constants__ = ['kernel_size', 'stride', 'padding', 'ceil_mode', 'count_include_pad', 'divisor_override'] + __constants__ = [ + "kernel_size", + "stride", + "padding", + "ceil_mode", + "count_include_pad", + "divisor_override", + ] kernel_size: _size_2_t stride: _size_2_t @@ -463,8 +759,15 @@ class AvgPool2d(_AvgPoolNd): ceil_mode: bool count_include_pad: bool - def __init__(self, kernel_size: _size_2_t, stride: Optional[_size_2_t] = None, padding: _size_2_t = 0, - ceil_mode: bool = False, count_include_pad: bool = True, divisor_override: Optional[int] = None) -> None: + def __init__( + self, + kernel_size: _size_2_t, + stride: Optional[_size_2_t] = None, + padding: _size_2_t = 0, + ceil_mode: bool = False, + count_include_pad: bool = True, + divisor_override: Optional[int] = None, + ) -> None: super().__init__() self.kernel_size = kernel_size self.stride = stride if (stride is not None) else kernel_size @@ -474,9 +777,16 @@ def __init__(self, kernel_size: _size_2_t, stride: Optional[_size_2_t] = None, p self.divisor_override = divisor_override def forward(self, input: Tensor) -> Tensor: - return F.avg_pool2d(input, self.kernel_size, self.stride, - self.padding, self.ceil_mode, self.count_include_pad, self.divisor_override) - + """Runs the forward pass.""" + return F.avg_pool2d( + input, + self.kernel_size, + self.stride, + self.padding, + self.ceil_mode, + self.count_include_pad, + self.divisor_override, + ) class AvgPool3d(_AvgPoolNd): @@ -501,6 +811,9 @@ class AvgPool3d(_AvgPoolNd): When ceil_mode=True, sliding windows are allowed to go 
off-bounds if they start within the left padding or the input. Sliding windows that would start in the right padded region are ignored. + .. note:: + pad should be at most half of effective kernel size. + The parameters :attr:`kernel_size`, :attr:`stride` can either be: - a single ``int`` -- in which case the same value is used for the depth, height and width dimension @@ -548,7 +861,14 @@ class AvgPool3d(_AvgPoolNd): >>> output = m(input) """ - __constants__ = ['kernel_size', 'stride', 'padding', 'ceil_mode', 'count_include_pad', 'divisor_override'] + __constants__ = [ + "kernel_size", + "stride", + "padding", + "ceil_mode", + "count_include_pad", + "divisor_override", + ] kernel_size: _size_3_t stride: _size_3_t @@ -556,8 +876,15 @@ class AvgPool3d(_AvgPoolNd): ceil_mode: bool count_include_pad: bool - def __init__(self, kernel_size: _size_3_t, stride: Optional[_size_3_t] = None, padding: _size_3_t = 0, - ceil_mode: bool = False, count_include_pad: bool = True, divisor_override: Optional[int] = None) -> None: + def __init__( + self, + kernel_size: _size_3_t, + stride: Optional[_size_3_t] = None, + padding: _size_3_t = 0, + ceil_mode: bool = False, + count_include_pad: bool = True, + divisor_override: Optional[int] = None, + ) -> None: super().__init__() self.kernel_size = kernel_size self.stride = stride if (stride is not None) else kernel_size @@ -567,11 +894,647 @@ def __init__(self, kernel_size: _size_3_t, stride: Optional[_size_3_t] = None, p self.divisor_override = divisor_override def forward(self, input: Tensor) -> Tensor: - return F.avg_pool3d(input, self.kernel_size, self.stride, - self.padding, self.ceil_mode, self.count_include_pad, self.divisor_override) + """Runs the forward pass.""" + return F.avg_pool3d( + input, + self.kernel_size, + self.stride, + self.padding, + self.ceil_mode, + self.count_include_pad, + self.divisor_override, + ) def __setstate__(self, d): super().__setstate__(d) - self.__dict__.setdefault('padding', 0) - self.__dict__.setdefault('ceil_mode', False) - self.__dict__.setdefault('count_include_pad', True) + self.__dict__.setdefault("padding", 0) + self.__dict__.setdefault("ceil_mode", False) + self.__dict__.setdefault("count_include_pad", True) + + +class FractionalMaxPool2d(Module): + r"""Applies a 2D fractional max pooling over an input signal composed of several input planes. + + Fractional MaxPooling is described in detail in the paper `Fractional MaxPooling`_ by Ben Graham + + The max-pooling operation is applied in :math:`kH \times kW` regions by a stochastic + step size determined by the target output size. + The number of output features is equal to the number of input planes. + + .. note:: Exactly one of ``output_size`` or ``output_ratio`` must be defined. + + Args: + kernel_size: the size of the window to take a max over. + Can be a single number k (for a square kernel of k x k) or a tuple `(kh, kw)` + output_size: the target output size of the image of the form `oH x oW`. + Can be a tuple `(oH, oW)` or a single number oH for a square image `oH x oH`. + Note that we must have :math:`kH + oH - 1 <= H_{in}` and :math:`kW + oW - 1 <= W_{in}` + output_ratio: If one wants to have an output size as a ratio of the input size, this option can be given. + This has to be a number or tuple in the range (0, 1). + Note that we must have :math:`kH + (output\_ratio\_H * H_{in}) - 1 <= H_{in}` + and :math:`kW + (output\_ratio\_W * W_{in}) - 1 <= W_{in}` + return_indices: if ``True``, will return the indices along with the outputs. 
+ Useful to pass to :meth:`nn.MaxUnpool2d`. Default: ``False`` + + Shape: + - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`. + - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where + :math:`(H_{out}, W_{out})=\text{output\_size}` or + :math:`(H_{out}, W_{out})=\text{output\_ratio} \times (H_{in}, W_{in})`. + + Examples: + >>> # pool of square window of size=3, and target output size 13x12 + >>> m = nn.FractionalMaxPool2d(3, output_size=(13, 12)) + >>> # pool of square window and target output size being half of input image size + >>> m = nn.FractionalMaxPool2d(3, output_ratio=(0.5, 0.5)) + >>> input = mindtorch.randn(20, 16, 50, 32) + >>> output = m(input) + + .. _Fractional MaxPooling: + https://arxiv.org/abs/1412.6071 + """ + + __constants__ = ["kernel_size", "return_indices", "output_size", "output_ratio"] + + kernel_size: _size_2_t + return_indices: bool + output_size: _size_2_t + output_ratio: _ratio_2_t + + def __init__( + self, + kernel_size: _size_2_t, + output_size: Optional[_size_2_t] = None, + output_ratio: Optional[_ratio_2_t] = None, + return_indices: bool = False, + _random_samples=None, + ) -> None: + super().__init__() + self.kernel_size = _pair(kernel_size) + self.return_indices = return_indices + self.register_buffer("_random_samples", _random_samples) + self.output_size = _pair(output_size) if output_size is not None else None + self.output_ratio = _pair(output_ratio) if output_ratio is not None else None + if output_size is None and output_ratio is None: + raise ValueError( + "FractionalMaxPool2d requires specifying either " + "an output size, or a pooling ratio" + ) + if output_size is not None and output_ratio is not None: + raise ValueError( + "only one of output_size and output_ratio may be specified" + ) + if self.output_ratio is not None: + if not (0 < self.output_ratio[0] < 1 and 0 < self.output_ratio[1] < 1): + raise ValueError( + f"output_ratio must be between 0 and 1 (got {output_ratio})" + ) + + def forward(self, input: Tensor): + return F.fractional_max_pool2d( + input, + self.kernel_size, + self.output_size, + self.output_ratio, + self.return_indices, + _random_samples=self._random_samples, + ) + + +class FractionalMaxPool3d(Module): + r"""Applies a 3D fractional max pooling over an input signal composed of several input planes. + + Fractional MaxPooling is described in detail in the paper `Fractional MaxPooling`_ by Ben Graham + + The max-pooling operation is applied in :math:`kT \times kH \times kW` regions by a stochastic + step size determined by the target output size. + The number of output features is equal to the number of input planes. + + .. note:: Exactly one of ``output_size`` or ``output_ratio`` must be defined. + + Args: + kernel_size: the size of the window to take a max over. + Can be a single number `k` (for a square kernel of `k x k x k`) or a tuple `(kt x kh x kw)`, + `k` must greater than 0. + output_size: the target output size of the image of the form `oT x oH x oW`. + Can be a tuple `(oT, oH, oW)` or a single number oH for a square image `oH x oH x oH` + output_ratio: If one wants to have an output size as a ratio of the input size, this option can be given. + This has to be a number or tuple in the range (0, 1) + return_indices: if ``True``, will return the indices along with the outputs. + Useful to pass to :meth:`nn.MaxUnpool3d`. Default: ``False`` + + Shape: + - Input: :math:`(N, C, T_{in}, H_{in}, W_{in})` or :math:`(C, T_{in}, H_{in}, W_{in})`. 
+        - Output: :math:`(N, C, T_{out}, H_{out}, W_{out})` or :math:`(C, T_{out}, H_{out}, W_{out})`, where
+          :math:`(T_{out}, H_{out}, W_{out})=\text{output\_size}` or
+          :math:`(T_{out}, H_{out}, W_{out})=\text{output\_ratio} \times (T_{in}, H_{in}, W_{in})`
+
+    Examples:
+        >>> # pool of cubic window of size=3, and target output size 13x12x11
+        >>> m = nn.FractionalMaxPool3d(3, output_size=(13, 12, 11))
+        >>> # pool of cubic window and target output size being half of input size
+        >>> m = nn.FractionalMaxPool3d(3, output_ratio=(0.5, 0.5, 0.5))
+        >>> input = mindtorch.randn(20, 16, 50, 32, 16)
+        >>> output = m(input)
+
+    .. _Fractional MaxPooling:
+        https://arxiv.org/abs/1412.6071
+    """
+
+    __constants__ = ["kernel_size", "return_indices", "output_size", "output_ratio"]
+    kernel_size: _size_3_t
+    return_indices: bool
+    output_size: _size_3_t
+    output_ratio: _ratio_3_t
+
+    def __init__(
+        self,
+        kernel_size: _size_3_t,
+        output_size: Optional[_size_3_t] = None,
+        output_ratio: Optional[_ratio_3_t] = None,
+        return_indices: bool = False,
+        _random_samples=None,
+    ) -> None:
+        super().__init__()
+        if (isinstance(kernel_size, int) and kernel_size <= 0) or (
+            isinstance(kernel_size, (tuple, list))
+            and not all(k > 0 for k in kernel_size)
+        ):
+            raise ValueError(f"kernel_size must be greater than 0, but got {kernel_size}")
+        self.kernel_size = _triple(kernel_size)
+        self.return_indices = return_indices
+        self.register_buffer("_random_samples", _random_samples)
+        self.output_size = _triple(output_size) if output_size is not None else None
+        self.output_ratio = _triple(output_ratio) if output_ratio is not None else None
+        if output_size is None and output_ratio is None:
+            raise ValueError(
+                "FractionalMaxPool3d requires specifying either "
+                "an output size, or a pooling ratio"
+            )
+        if output_size is not None and output_ratio is not None:
+            raise ValueError(
+                "only one of output_size and output_ratio may be specified"
+            )
+        if self.output_ratio is not None:
+            if not (
+                0 < self.output_ratio[0] < 1
+                and 0 < self.output_ratio[1] < 1
+                and 0 < self.output_ratio[2] < 1
+            ):
+                raise ValueError(
+                    f"output_ratio must be between 0 and 1 (got {output_ratio})"
+                )
+
+    def forward(self, input: Tensor):
+        return F.fractional_max_pool3d(
+            input,
+            self.kernel_size,
+            self.output_size,
+            self.output_ratio,
+            self.return_indices,
+            _random_samples=self._random_samples,
+        )
+
+
+class _LPPoolNd(Module):
+    __constants__ = ["norm_type", "kernel_size", "stride", "ceil_mode"]
+
+    norm_type: float
+    ceil_mode: bool
+
+    def __init__(
+        self,
+        norm_type: float,
+        kernel_size: _size_any_t,
+        stride: Optional[_size_any_t] = None,
+        ceil_mode: bool = False,
+    ) -> None:
+        super().__init__()
+        self.norm_type = norm_type
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.ceil_mode = ceil_mode
+
+    def extra_repr(self) -> str:
+        return (
+            "norm_type={norm_type}, kernel_size={kernel_size}, stride={stride}, "
+            "ceil_mode={ceil_mode}".format(**self.__dict__)
+        )
+
+
+class LPPool1d(_LPPoolNd):
+    r"""Applies a 1D power-average pooling over an input signal composed of several input planes.
+
+    On each window, the function computed is:
+
+    .. math::
+        f(X) = \sqrt[p]{\sum_{x \in X} x^{p}}
+
+    - At p = :math:`\infty`, one gets Max Pooling
+    - At p = 1, one gets Sum Pooling (which is proportional to Average Pooling)
+
+    .. note:: If the sum to the power of `p` is zero, the gradient of this function is
+        not defined. This implementation will set the gradient to zero in this case.
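+
+    For example, with :math:`p = 2` a window holding the values :math:`3` and
+    :math:`4` produces :math:`\sqrt{3^{2} + 4^{2}} = 5`.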
+ + Args: + kernel_size: a single int, the size of the window + stride: a single int, the stride of the window. Default value is :attr:`kernel_size` + ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape + + Shape: + - Input: :math:`(N, C, L_{in})` or :math:`(C, L_{in})`. + - Output: :math:`(N, C, L_{out})` or :math:`(C, L_{out})`, where + + .. math:: + L_{out} = \left\lfloor\frac{L_{in} - \text{kernel\_size}}{\text{stride}} + 1\right\rfloor + + Examples:: + >>> # power-2 pool of window of length 3, with stride 2. + >>> m = nn.LPPool1d(2, 3, stride=2) + >>> input = mindtorch.randn(20, 16, 50) + >>> output = m(input) + """ + + kernel_size: _size_1_t + stride: _size_1_t + + def forward(self, input: Tensor) -> Tensor: + """Runs the forward pass.""" + return F.lp_pool1d( + input, float(self.norm_type), self.kernel_size, self.stride, self.ceil_mode + ) + + +class LPPool2d(_LPPoolNd): + r"""Applies a 2D power-average pooling over an input signal composed of several input planes. + + On each window, the function computed is: + + .. math:: + f(X) = \sqrt[p]{\sum_{x \in X} x^{p}} + + - At p = :math:`\infty`, one gets Max Pooling + - At p = 1, one gets Sum Pooling (which is proportional to average pooling) + + The parameters :attr:`kernel_size`, :attr:`stride` can either be: + + - a single ``int`` -- in which case the same value is used for the height and width dimension + - a ``tuple`` of two ints -- in which case, the first `int` is used for the height dimension, + and the second `int` for the width dimension + + .. note:: If the sum to the power of `p` is zero, the gradient of this function is + not defined. This implementation will set the gradient to zero in this case. + + Args: + kernel_size: the size of the window + stride: the stride of the window. Default value is :attr:`kernel_size` + ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape + + Shape: + - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`. + - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where + + .. math:: + H_{out} = \left\lfloor\frac{H_{in} - \text{kernel\_size}[0]}{\text{stride}[0]} + 1\right\rfloor + + .. math:: + W_{out} = \left\lfloor\frac{W_{in} - \text{kernel\_size}[1]}{\text{stride}[1]} + 1\right\rfloor + + Examples:: + + >>> # power-2 pool of square window of size=3, stride=2 + >>> m = nn.LPPool2d(2, 3, stride=2) + >>> # pool of non-square window of power 1.2 + >>> m = nn.LPPool2d(1.2, (3, 2), stride=(2, 1)) + >>> input = mindtorch.randn(20, 16, 50, 32) + >>> output = m(input) + + """ + + kernel_size: _size_2_t + stride: _size_2_t + + def forward(self, input: Tensor) -> Tensor: + """Runs the forward pass.""" + return F.lp_pool2d( + input, float(self.norm_type), self.kernel_size, self.stride, self.ceil_mode + ) + + +class LPPool3d(_LPPoolNd): + r"""Applies a 3D power-average pooling over an input signal composed of several input planes. + + On each window, the function computed is: + + .. 
math:: + f(X) = \sqrt[p]{\sum_{x \in X} x^{p}} + + - At p = :math:`\infty`, one gets Max Pooling + - At p = 1, one gets Sum Pooling (which is proportional to average pooling) + + The parameters :attr:`kernel_size`, :attr:`stride` can either be: + + - a single ``int`` -- in which case the same value is used for the height, width and depth dimension + - a ``tuple`` of three ints -- in which case, the first `int` is used for the depth dimension, + the second `int` for the height dimension and the third `int` for the width dimension + + .. note:: If the sum to the power of `p` is zero, the gradient of this function is + not defined. This implementation will set the gradient to zero in this case. + + Args: + kernel_size: the size of the window + stride: the stride of the window. Default value is :attr:`kernel_size` + ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape + + Shape: + - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`. + - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or + :math:`(C, D_{out}, H_{out}, W_{out})`, where + + .. math:: + D_{out} = \left\lfloor\frac{D_{in} - \text{kernel\_size}[0]}{\text{stride}[0]} + 1\right\rfloor + + .. math:: + H_{out} = \left\lfloor\frac{H_{in} - \text{kernel\_size}[1]}{\text{stride}[1]} + 1\right\rfloor + + .. math:: + W_{out} = \left\lfloor\frac{W_{in} - \text{kernel\_size}[2]}{\text{stride}[2]} + 1\right\rfloor + + Examples:: + + >>> # power-2 pool of square window of size=3, stride=2 + >>> m = nn.LPPool3d(2, 3, stride=2) + >>> # pool of non-square window of power 1.2 + >>> m = nn.LPPool3d(1.2, (3, 2, 2), stride=(2, 1, 2)) + >>> input = mindtorch.randn(20, 16, 50, 44, 31) + >>> output = m(input) + + """ + + kernel_size: _size_3_t + stride: _size_3_t + + def forward(self, input: Tensor) -> Tensor: + """Runs the forward pass.""" + return F.lp_pool3d( + input, float(self.norm_type), self.kernel_size, self.stride, self.ceil_mode + ) + + +class _AdaptiveMaxPoolNd(Module): + __constants__ = ["output_size", "return_indices"] + return_indices: bool + + def __init__( + self, output_size: _size_any_opt_t, return_indices: bool = False + ) -> None: + super().__init__() + self.output_size = output_size + self.return_indices = return_indices + + def extra_repr(self) -> str: + return f"output_size={self.output_size}" + + +# FIXME (by @ssnl): Improve adaptive pooling docs: specify what the input and +# output shapes are, and how the operation computes output. + + +class AdaptiveMaxPool1d(_AdaptiveMaxPoolNd): + r"""Applies a 1D adaptive max pooling over an input signal composed of several input planes. + + The output size is :math:`L_{out}`, for any input size. + The number of output features is equal to the number of input planes. + + Args: + output_size: the target output size :math:`L_{out}`. + return_indices: if ``True``, will return the indices along with the outputs. + Useful to pass to nn.MaxUnpool1d. Default: ``False`` + + Shape: + - Input: :math:`(N, C, L_{in})` or :math:`(C, L_{in})`. + - Output: :math:`(N, C, L_{out})` or :math:`(C, L_{out})`, where + :math:`L_{out}=\text{output\_size}`. 
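+
+    .. note:: A common implementation choice (assumed here, following the reference
+        semantics) pools output element :math:`i` over the input positions
+        :math:`[\lfloor i \cdot L_{in} / L_{out} \rfloor, \lceil (i + 1) \cdot L_{in} / L_{out} \rceil)`,
+        so the window sizes adapt to the input length.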
+ + Examples: + >>> # target output size of 5 + >>> m = nn.AdaptiveMaxPool1d(5) + >>> input = mindtorch.randn(1, 64, 8) + >>> output = m(input) + + """ + + output_size: _size_1_t + + def forward(self, input: Tensor): + """Runs the forward pass.""" + return F.adaptive_max_pool1d(input, self.output_size, self.return_indices) + + +class AdaptiveMaxPool2d(_AdaptiveMaxPoolNd): + r"""Applies a 2D adaptive max pooling over an input signal composed of several input planes. + + The output is of size :math:`H_{out} \times W_{out}`, for any input size. + The number of output features is equal to the number of input planes. + + Args: + output_size: the target output size of the image of the form :math:`H_{out} \times W_{out}`. + Can be a tuple :math:`(H_{out}, W_{out})` or a single :math:`H_{out}` for a + square image :math:`H_{out} \times H_{out}`. :math:`H_{out}` and :math:`W_{out}` + can be either a ``int``, or ``None`` which means the size will be the same as that + of the input. + return_indices: if ``True``, will return the indices along with the outputs. + Useful to pass to nn.MaxUnpool2d. Default: ``False`` + + Shape: + - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`. + - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where + :math:`(H_{out}, W_{out})=\text{output\_size}`. + + Examples: + >>> # target output size of 5x7 + >>> m = nn.AdaptiveMaxPool2d((5, 7)) + >>> input = mindtorch.randn(1, 64, 8, 9) + >>> output = m(input) + >>> # target output size of 7x7 (square) + >>> m = nn.AdaptiveMaxPool2d(7) + >>> input = mindtorch.randn(1, 64, 10, 9) + >>> output = m(input) + >>> # target output size of 10x7 + >>> m = nn.AdaptiveMaxPool2d((None, 7)) + >>> input = mindtorch.randn(1, 64, 10, 9) + >>> output = m(input) + + """ + + output_size: _size_2_opt_t + + def forward(self, input: Tensor): + """Runs the forward pass.""" + return F.adaptive_max_pool2d(input, self.output_size, self.return_indices) + + +class AdaptiveMaxPool3d(_AdaptiveMaxPoolNd): + r"""Applies a 3D adaptive max pooling over an input signal composed of several input planes. + + The output is of size :math:`D_{out} \times H_{out} \times W_{out}`, for any input size. + The number of output features is equal to the number of input planes. + + Args: + output_size: the target output size of the image of the form :math:`D_{out} \times H_{out} \times W_{out}`. + Can be a tuple :math:`(D_{out}, H_{out}, W_{out})` or a single + :math:`D_{out}` for a cube :math:`D_{out} \times D_{out} \times D_{out}`. + :math:`D_{out}`, :math:`H_{out}` and :math:`W_{out}` can be either a + ``int``, or ``None`` which means the size will be the same as that of the input. + + return_indices: if ``True``, will return the indices along with the outputs. + Useful to pass to nn.MaxUnpool3d. Default: ``False`` + + Shape: + - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`. + - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or :math:`(C, D_{out}, H_{out}, W_{out})`, + where :math:`(D_{out}, H_{out}, W_{out})=\text{output\_size}`. 
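+
+    .. note:: ``None`` entries in ``output_size`` keep the matching input dimension,
+        e.g. ``(7, None, None)`` on a :math:`10 \times 9 \times 8` input yields a
+        :math:`7 \times 9 \times 8` output, as in the last example below.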
+ + Examples: + >>> # target output size of 5x7x9 + >>> m = nn.AdaptiveMaxPool3d((5, 7, 9)) + >>> input = mindtorch.randn(1, 64, 8, 9, 10) + >>> output = m(input) + >>> # target output size of 7x7x7 (cube) + >>> m = nn.AdaptiveMaxPool3d(7) + >>> input = mindtorch.randn(1, 64, 10, 9, 8) + >>> output = m(input) + >>> # target output size of 7x9x8 + >>> m = nn.AdaptiveMaxPool3d((7, None, None)) + >>> input = mindtorch.randn(1, 64, 10, 9, 8) + >>> output = m(input) + + """ + + output_size: _size_3_opt_t + + def forward(self, input: Tensor): + """Runs the forward pass.""" + return F.adaptive_max_pool3d(input, self.output_size, self.return_indices) + + +class _AdaptiveAvgPoolNd(Module): + __constants__ = ["output_size"] + + def __init__(self, output_size: _size_any_opt_t) -> None: + super().__init__() + self.output_size = output_size + + def extra_repr(self) -> str: + return f"output_size={self.output_size}" + + +class AdaptiveAvgPool1d(_AdaptiveAvgPoolNd): + r"""Applies a 1D adaptive average pooling over an input signal composed of several input planes. + + The output size is :math:`L_{out}`, for any input size. + The number of output features is equal to the number of input planes. + + Args: + output_size: the target output size :math:`L_{out}`. + + Shape: + - Input: :math:`(N, C, L_{in})` or :math:`(C, L_{in})`. + - Output: :math:`(N, C, L_{out})` or :math:`(C, L_{out})`, where + :math:`L_{out}=\text{output\_size}`. + + Examples: + >>> # target output size of 5 + >>> m = nn.AdaptiveAvgPool1d(5) + >>> input = mindtorch.randn(1, 64, 8) + >>> output = m(input) + + """ + + output_size: _size_1_t + + def forward(self, input: Tensor) -> Tensor: + """ + Runs the forward pass. + """ + return F.adaptive_avg_pool1d(input, self.output_size) + + +class AdaptiveAvgPool2d(_AdaptiveAvgPoolNd): + r"""Applies a 2D adaptive average pooling over an input signal composed of several input planes. + + The output is of size H x W, for any input size. + The number of output features is equal to the number of input planes. + + Args: + output_size: the target output size of the image of the form H x W. + Can be a tuple (H, W) or a single H for a square image H x H. + H and W can be either a ``int``, or ``None`` which means the size will + be the same as that of the input. + + Shape: + - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`. + - Output: :math:`(N, C, S_{0}, S_{1})` or :math:`(C, S_{0}, S_{1})`, where + :math:`S=\text{output\_size}`. + + Examples: + >>> # target output size of 5x7 + >>> m = nn.AdaptiveAvgPool2d((5, 7)) + >>> input = mindtorch.randn(1, 64, 8, 9) + >>> output = m(input) + >>> # target output size of 7x7 (square) + >>> m = nn.AdaptiveAvgPool2d(7) + >>> input = mindtorch.randn(1, 64, 10, 9) + >>> output = m(input) + >>> # target output size of 10x7 + >>> m = nn.AdaptiveAvgPool2d((None, 7)) + >>> input = mindtorch.randn(1, 64, 10, 9) + >>> output = m(input) + + """ + + output_size: _size_2_opt_t + + def forward(self, input: Tensor) -> Tensor: + """Runs the forward pass.""" + return F.adaptive_avg_pool2d(input, self.output_size) + + +class AdaptiveAvgPool3d(_AdaptiveAvgPoolNd): + r"""Applies a 3D adaptive average pooling over an input signal composed of several input planes. + + The output is of size D x H x W, for any input size. + The number of output features is equal to the number of input planes. + + Args: + output_size: the target output size of the form D x H x W. + Can be a tuple (D, H, W) or a single number D for a cube D x D x D. 
+ D, H and W can be either a ``int``, or ``None`` which means the size will + be the same as that of the input. + + Shape: + - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`. + - Output: :math:`(N, C, S_{0}, S_{1}, S_{2})` or :math:`(C, S_{0}, S_{1}, S_{2})`, + where :math:`S=\text{output\_size}`. + + Examples: + >>> # target output size of 5x7x9 + >>> m = nn.AdaptiveAvgPool3d((5, 7, 9)) + >>> input = mindtorch.randn(1, 64, 8, 9, 10) + >>> output = m(input) + >>> # target output size of 7x7x7 (cube) + >>> m = nn.AdaptiveAvgPool3d(7) + >>> input = mindtorch.randn(1, 64, 10, 9, 8) + >>> output = m(input) + >>> # target output size of 7x9x8 + >>> m = nn.AdaptiveAvgPool3d((7, None, None)) + >>> input = mindtorch.randn(1, 64, 10, 9, 8) + >>> output = m(input) + + """ + + output_size: _size_3_opt_t + + def forward(self, input: Tensor) -> Tensor: + """Runs the forward pass.""" + return F.adaptive_avg_pool3d(input, self.output_size) \ No newline at end of file diff --git a/mindtorch/nn/modules/rnn.py b/mindtorch/nn/modules/rnn.py index bfcb140ff..d5a2c2e36 100644 --- a/mindtorch/nn/modules/rnn.py +++ b/mindtorch/nn/modules/rnn.py @@ -27,7 +27,7 @@ from ... import ops -__all__ = ['LSTM', 'GRU', 'RNN'] +__all__ = ['LSTM', 'GRU', 'RNN', 'RNNBase'] def _init_state(shape, dtype, device, is_lstm): @@ -313,7 +313,7 @@ def forward(self, x, h_0, seq_length, w_ih, w_hh, b_ih, b_hh): return outputs, (h, c) -class _RNNBase(Module): +class RNNBase(Module): '''Basic class for RNN operators''' def __init__(self, mode, input_size, hidden_size, num_layers=1, bias=True, @@ -512,7 +512,7 @@ def forward(self, x, hx=None, seq_length=None): return x_n.astype(x_dtype), (hx_n[0].astype(x_dtype), hx_n[1].astype(x_dtype)) -class RNN(_RNNBase): +class RNN(RNNBase): r""" Stacked Elman RNN layers, applying RNN layer with :math:`\tanh` or :math:`\text{ReLU}` non-linearity to the input. @@ -595,7 +595,7 @@ def __init__(self, *args, **kwargs): super(RNN, self).__init__(mode, *args, **kwargs) -class GRU(_RNNBase): +class GRU(RNNBase): r""" Stacked GRU (Gated Recurrent Unit) layers. @@ -684,7 +684,7 @@ def __init__(self, *args, **kwargs): super(GRU, self).__init__(mode, *args, **kwargs) -class LSTM(_RNNBase): +class LSTM(RNNBase): r""" Stacked LSTM (Long Short-Term Memory) layers. diff --git a/mindtorch/nn/modules/sparse.py b/mindtorch/nn/modules/sparse.py index 22cca6a56..f1fd431f8 100644 --- a/mindtorch/nn/modules/sparse.py +++ b/mindtorch/nn/modules/sparse.py @@ -1,12 +1,15 @@ -"""sparse""" +# mypy: allow-untyped-defs from typing import Optional + import mindtorch from mindtorch import Tensor -from ..parameter import Parameter +from mindtorch.nn import functional as F, init +from mindtorch.nn.parameter import Parameter + from .module import Module -from .. import functional as F -from .. import init -from ... import ops + + +__all__ = ["Embedding", "EmbeddingBag"] class Embedding(Module): @@ -15,10 +18,109 @@ class Embedding(Module): This module is often used to store word embeddings and retrieve them using indices. The input to the module is a list of indices, and the output is the corresponding word embeddings. + + Args: + num_embeddings (int): size of the dictionary of embeddings + embedding_dim (int): the size of each embedding vector + padding_idx (int, optional): If specified, the entries at :attr:`padding_idx` do not contribute to the gradient; + therefore, the embedding vector at :attr:`padding_idx` is not updated during training, + i.e. it remains as a fixed "pad". 
For a newly constructed Embedding, + the embedding vector at :attr:`padding_idx` will default to all zeros, + but can be updated to another value to be used as the padding vector. + max_norm (float, optional): If given, each embedding vector with norm larger than :attr:`max_norm` + is renormalized to have norm :attr:`max_norm`. + norm_type (float, optional): The p of the p-norm to compute for the :attr:`max_norm` option. Default ``2``. + scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse of frequency of + the words in the mini-batch. Default ``False``. + sparse (bool, optional): If ``True``, gradient w.r.t. :attr:`weight` matrix will be a sparse tensor. + See Notes for more details regarding sparse gradients. + + Attributes: + weight (Tensor): the learnable weights of the module of shape (num_embeddings, embedding_dim) + initialized from :math:`\mathcal{N}(0, 1)` + + Shape: + - Input: :math:`(*)`, IntTensor or LongTensor of arbitrary shape containing the indices to extract + - Output: :math:`(*, H)`, where `*` is the input shape and :math:`H=\text{embedding\_dim}` + + .. note:: + Keep in mind that only a limited number of optimizers support + sparse gradients: currently it's :class:`optim.SGD` (`CUDA` and `CPU`), + :class:`optim.SparseAdam` (`CUDA` and `CPU`) and :class:`optim.Adagrad` (`CPU`) + + .. note:: + When :attr:`max_norm` is not ``None``, :class:`Embedding`'s forward method will modify the + :attr:`weight` tensor in-place. Since tensors needed for gradient computations cannot be + modified in-place, performing a differentiable operation on ``Embedding.weight`` before + calling :class:`Embedding`'s forward method requires cloning ``Embedding.weight`` when + :attr:`max_norm` is not ``None``. For example:: + + n, d, m = 3, 5, 7 + embedding = nn.Embedding(n, d, max_norm=1.0) + W = mindtorch.randn((m, d), requires_grad=True) + idx = mindtorch.tensor([1, 2]) + a = ( + embedding.weight.clone() @ W.t() + ) # weight must be cloned for this to be differentiable + b = embedding(idx) @ W.t() # modifies weight in-place + out = a.unsqueeze(0) + b.unsqueeze(1) + loss = out.sigmoid().prod() + loss.backward() + + Examples:: + + >>> # an Embedding module containing 10 tensors of size 3 + >>> embedding = nn.Embedding(10, 3) + >>> # a batch of 2 samples of 4 indices each + >>> input = mindtorch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]]) + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> embedding(input) + tensor([[[-0.0251, -1.6902, 0.7172], + [-0.6431, 0.0748, 0.6969], + [ 1.4970, 1.3448, -0.9685], + [-0.3677, -2.7265, -0.1685]], + + [[ 1.4970, 1.3448, -0.9685], + [ 0.4362, -0.4004, 0.9400], + [-0.6431, 0.0748, 0.6969], + [ 0.9124, -2.3616, 1.1151]]]) + + + >>> # example with padding_idx + >>> embedding = nn.Embedding(10, 3, padding_idx=0) + >>> input = mindtorch.LongTensor([[0, 2, 0, 5]]) + >>> embedding(input) + tensor([[[ 0.0000, 0.0000, 0.0000], + [ 0.1535, -2.0309, 0.9315], + [ 0.0000, 0.0000, 0.0000], + [-0.1655, 0.9897, 0.0635]]]) + + >>> # example of changing `pad` vector + >>> padding_idx = 0 + >>> embedding = nn.Embedding(3, 3, padding_idx=padding_idx) + >>> embedding.weight + Parameter containing: + tensor([[ 0.0000, 0.0000, 0.0000], + [-0.7895, -0.7089, -0.0364], + [ 0.6778, 0.5803, 0.2678]], requires_grad=True) + >>> with mindtorch.no_grad(): + ... 
embedding.weight[padding_idx] = mindtorch.ones(3) + >>> embedding.weight + Parameter containing: + tensor([[ 1.0000, 1.0000, 1.0000], + [-0.7895, -0.7089, -0.0364], + [ 0.6778, 0.5803, 0.2678]], requires_grad=True) """ - __constants__ = ['num_embeddings', 'embedding_dim', 'padding_idx', 'max_norm', - 'norm_type', 'scale_grad_by_freq', 'sparse'] + __constants__ = [ + "num_embeddings", + "embedding_dim", + "padding_idx", + "max_norm", + "norm_type", + "scale_grad_by_freq", + "sparse", + ] num_embeddings: int embedding_dim: int @@ -30,31 +132,49 @@ class Embedding(Module): freeze: bool sparse: bool - def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None, - max_norm: Optional[float] = None, norm_type: float = 2., scale_grad_by_freq: bool = False, - sparse: bool = False, _weight: Optional[Tensor] = None, _freeze: bool = False, - dtype=None, device=None) -> None: - factory_kwargs = {'dtype': dtype, 'device': device} + def __init__( + self, + num_embeddings: int, + embedding_dim: int, + padding_idx: Optional[int] = None, + max_norm: Optional[float] = None, + norm_type: float = 2.0, + scale_grad_by_freq: bool = False, + sparse: bool = False, + _weight: Optional[Tensor] = None, + _freeze: bool = False, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} super().__init__() self.num_embeddings = num_embeddings self.embedding_dim = embedding_dim if padding_idx is not None: if padding_idx > 0: - assert padding_idx < self.num_embeddings, 'Padding_idx must be within num_embeddings' + assert padding_idx < self.num_embeddings, ( + "Padding_idx must be within num_embeddings" + ) elif padding_idx < 0: - assert padding_idx >= -self.num_embeddings, 'Padding_idx must be within num_embeddings' + assert padding_idx >= -self.num_embeddings, ( + "Padding_idx must be within num_embeddings" + ) padding_idx = self.num_embeddings + padding_idx self.padding_idx = padding_idx self.max_norm = max_norm self.norm_type = norm_type self.scale_grad_by_freq = scale_grad_by_freq if _weight is None: - self.weight = Parameter(ops.empty((num_embeddings, embedding_dim), **factory_kwargs), - requires_grad=not _freeze) + self.weight = Parameter( + mindtorch.empty((num_embeddings, embedding_dim), **factory_kwargs), + requires_grad=not _freeze, + ) self.reset_parameters() else: - assert list(_weight.shape) == [num_embeddings, embedding_dim], \ - 'Shape of weight does not match num_embeddings and embedding_dim' + assert list(_weight.shape) == [ + num_embeddings, + embedding_dim, + ], "Shape of weight does not match num_embeddings and embedding_dim" self.weight = Parameter(_weight, requires_grad=not _freeze) self.sparse = sparse @@ -68,33 +188,42 @@ def _fill_padding_idx_with_zero(self) -> None: with mindtorch.no_grad(): self.weight[self.padding_idx].fill_(0) - def _fill_padding_idx_with_zero(self) -> None: - if self.padding_idx is not None: - self.weight[self.padding_idx] = 0 - def forward(self, input: Tensor) -> Tensor: return F.embedding( - input, self.weight, self.padding_idx, self.max_norm, - self.norm_type, self.scale_grad_by_freq) + input, + self.weight, + self.padding_idx, + self.max_norm, + self.norm_type, + self.scale_grad_by_freq, + self.sparse, + ) def extra_repr(self) -> str: - s = '{num_embeddings}, {embedding_dim}' + s = "{num_embeddings}, {embedding_dim}" if self.padding_idx is not None: - s += ', padding_idx={padding_idx}' + s += ", padding_idx={padding_idx}" if self.max_norm is not None: - s += ', max_norm={max_norm}' + s += ", 
max_norm={max_norm}" if self.norm_type != 2: - s += ', norm_type={norm_type}' + s += ", norm_type={norm_type}" if self.scale_grad_by_freq is not False: - s += ', scale_grad_by_freq={scale_grad_by_freq}' + s += ", scale_grad_by_freq={scale_grad_by_freq}" if self.sparse is not False: - s += ', sparse=True' + s += ", sparse=True" return s.format(**self.__dict__) @classmethod - def from_pretrained(cls, embeddings, freeze=True, padding_idx=None, - max_norm=None, norm_type=2., scale_grad_by_freq=False, - sparse=False): + def from_pretrained( + cls, + embeddings, + freeze=True, + padding_idx=None, + max_norm=None, + norm_type=2.0, + scale_grad_by_freq=False, + sparse=False, + ): r"""Create Embedding instance from given 2-dimensional FloatTensor. Args: @@ -113,16 +242,17 @@ def from_pretrained(cls, embeddings, freeze=True, padding_idx=None, Examples:: >>> # FloatTensor containing pretrained weights - >>> weight = torch.FloatTensor([[1, 2.3, 3], [4, 5.1, 6.3]]) + >>> weight = mindtorch.FloatTensor([[1, 2.3, 3], [4, 5.1, 6.3]]) >>> embedding = nn.Embedding.from_pretrained(weight) >>> # Get embeddings for index 1 - >>> input = torch.LongTensor([1]) + >>> input = mindtorch.LongTensor([1]) >>> # xdoctest: +IGNORE_WANT("non-deterministic") >>> embedding(input) tensor([[ 4.0000, 5.1000, 6.3000]]) """ - assert embeddings.dim() == 2, \ - 'Embeddings parameter is expected to be 2-dimensional' + assert embeddings.dim() == 2, ( + "Embeddings parameter is expected to be 2-dimensional" + ) rows, cols = embeddings.shape embedding = cls( num_embeddings=rows, @@ -133,5 +263,286 @@ def from_pretrained(cls, embeddings, freeze=True, padding_idx=None, max_norm=max_norm, norm_type=norm_type, scale_grad_by_freq=scale_grad_by_freq, - sparse=sparse) + sparse=sparse, + ) return embedding + + +class EmbeddingBag(Module): + r"""Compute sums or means of 'bags' of embeddings, without instantiating the intermediate embeddings. + + For bags of constant length, no :attr:`per_sample_weights`, no indices equal to :attr:`padding_idx`, + and with 2D inputs, this class + + * with ``mode="sum"`` is equivalent to :class:`~mindtorch.nn.Embedding` followed by ``mindtorch.sum(dim=1)``, + * with ``mode="mean"`` is equivalent to :class:`~mindtorch.nn.Embedding` followed by ``mindtorch.mean(dim=1)``, + * with ``mode="max"`` is equivalent to :class:`~mindtorch.nn.Embedding` followed by ``mindtorch.max(dim=1)``. + + However, :class:`~mindtorch.nn.EmbeddingBag` is much more time and memory efficient than using a chain of these + operations. + + EmbeddingBag also supports per-sample weights as an argument to the forward + pass. This scales the output of the Embedding before performing a weighted + reduction as specified by ``mode``. If :attr:`per_sample_weights` is passed, the + only supported ``mode`` is ``"sum"``, which computes a weighted sum according to + :attr:`per_sample_weights`. + + Args: + num_embeddings (int): size of the dictionary of embeddings + embedding_dim (int): the size of each embedding vector + max_norm (float, optional): If given, each embedding vector with norm larger than :attr:`max_norm` + is renormalized to have norm :attr:`max_norm`. + norm_type (float, optional): The p of the p-norm to compute for the :attr:`max_norm` option. Default ``2``. + scale_grad_by_freq (bool, optional): if given, this will scale gradients by the inverse of frequency of + the words in the mini-batch. Default ``False``. + Note: this option is not supported when ``mode="max"``. + mode (str, optional): ``"sum"``, ``"mean"`` or ``"max"``. 
Specifies the way to reduce the bag.
+            ``"sum"`` computes the weighted sum, taking :attr:`per_sample_weights`
+            into consideration. ``"mean"`` computes the average of the values
+            in the bag, ``"max"`` computes the max value over each bag.
+            Default: ``"mean"``
+        sparse (bool, optional): if ``True``, gradient w.r.t. :attr:`weight` matrix will be a sparse tensor. See
+            Notes for more details regarding sparse gradients. Note: this option is not
+            supported when ``mode="max"``.
+        include_last_offset (bool, optional): if ``True``, :attr:`offsets` has one additional element, where the last element
+            is equivalent to the size of `indices`. This matches the CSR format.
+        padding_idx (int, optional): If specified, the entries at :attr:`padding_idx` do not contribute to the
+            gradient; therefore, the embedding vector at :attr:`padding_idx` is not updated
+            during training, i.e. it remains as a fixed "pad". For a newly constructed
+            EmbeddingBag, the embedding vector at :attr:`padding_idx` will default to all
+            zeros, but can be updated to another value to be used as the padding vector.
+            Note that the embedding vector at :attr:`padding_idx` is excluded from the
+            reduction.
+
+    Attributes:
+        weight (Tensor): the learnable weights of the module of shape `(num_embeddings, embedding_dim)`
+            initialized from :math:`\mathcal{N}(0, 1)`.
+
+    Examples::
+
+        >>> # an EmbeddingBag module containing 10 tensors of size 3
+        >>> embedding_sum = nn.EmbeddingBag(10, 3, mode='sum')
+        >>> # a batch of 2 samples of 4 indices each
+        >>> input = mindtorch.tensor([1, 2, 4, 5, 4, 3, 2, 9], dtype=mindtorch.long)
+        >>> offsets = mindtorch.tensor([0, 4], dtype=mindtorch.long)
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> embedding_sum(input, offsets)
+        tensor([[-0.8861, -5.4350, -0.0523],
+                [ 1.1306, -2.5798, -1.0044]])
+
+        >>> # Example with padding_idx
+        >>> embedding_sum = nn.EmbeddingBag(10, 3, mode='sum', padding_idx=2)
+        >>> input = mindtorch.tensor([2, 2, 2, 2, 4, 3, 2, 9], dtype=mindtorch.long)
+        >>> offsets = mindtorch.tensor([0, 4], dtype=mindtorch.long)
+        >>> embedding_sum(input, offsets)
+        tensor([[ 0.0000,  0.0000,  0.0000],
+                [-0.7082,  3.2145, -2.6251]])
+
+        >>> # An EmbeddingBag can be loaded from an Embedding like so
+        >>> embedding = nn.Embedding(10, 3, padding_idx=2)
+        >>> embedding_sum = nn.EmbeddingBag.from_pretrained(
+        ...     embedding.weight,
+        ...     padding_idx=embedding.padding_idx,
+        ...     mode='sum')
+    """
+
+    __constants__ = [
+        "num_embeddings",
+        "embedding_dim",
+        "max_norm",
+        "norm_type",
+        "scale_grad_by_freq",
+        "mode",
+        "sparse",
+        "include_last_offset",
+        "padding_idx",
+    ]
+
+    num_embeddings: int
+    embedding_dim: int
+    max_norm: Optional[float]
+    norm_type: float
+    scale_grad_by_freq: bool
+    weight: Tensor
+    mode: str
+    sparse: bool
+    include_last_offset: bool
+    padding_idx: Optional[int]
+
+    def __init__(
+        self,
+        num_embeddings: int,
+        embedding_dim: int,
+        max_norm: Optional[float] = None,
+        norm_type: float = 2.0,
+        scale_grad_by_freq: bool = False,
+        mode: str = "mean",
+        sparse: bool = False,
+        _weight: Optional[Tensor] = None,
+        include_last_offset: bool = False,
+        padding_idx: Optional[int] = None,
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.num_embeddings = num_embeddings
+        self.embedding_dim = embedding_dim
+        self.max_norm = max_norm
+        self.norm_type = norm_type
+        self.scale_grad_by_freq = scale_grad_by_freq
+        if padding_idx is not None:
+            if padding_idx > 0:
+                assert padding_idx < self.num_embeddings, (
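+                    # a positive padding_idx must index into the embedding
+                    # table; a negative one is normalized in the elif below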
+ "padding_idx must be within num_embeddings" + ) + elif padding_idx < 0: + assert padding_idx >= -self.num_embeddings, ( + "padding_idx must be within num_embeddings" + ) + padding_idx = self.num_embeddings + padding_idx + self.padding_idx = padding_idx + if _weight is None: + self.weight = Parameter( + mindtorch.empty((num_embeddings, embedding_dim), **factory_kwargs) + ) + self.reset_parameters() + else: + assert list(_weight.shape) == [ + num_embeddings, + embedding_dim, + ], "Shape of weight does not match num_embeddings and embedding_dim" + self.weight = Parameter(_weight) + self.mode = mode + self.sparse = sparse + self.include_last_offset = include_last_offset + + def reset_parameters(self) -> None: + init.normal_(self.weight) + self._fill_padding_idx_with_zero() + + def _fill_padding_idx_with_zero(self) -> None: + if self.padding_idx is not None: + with mindtorch.no_grad(): + self.weight[self.padding_idx].fill_(0) + + def forward( + self, + input: Tensor, + offsets: Optional[Tensor] = None, + per_sample_weights: Optional[Tensor] = None, + ) -> Tensor: + """Forward pass of EmbeddingBag. + + Args: + input (Tensor): Tensor containing bags of indices into the embedding matrix. + offsets (Tensor, optional): Only used when :attr:`input` is 1D. :attr:`offsets` determines + the starting index position of each bag (sequence) in :attr:`input`. + per_sample_weights (Tensor, optional): a tensor of float / double weights, or None + to indicate all weights should be taken to be ``1``. If specified, :attr:`per_sample_weights` + must have exactly the same shape as input and is treated as having the same + :attr:`offsets`, if those are not ``None``. Only supported for ``mode='sum'``. + + Returns: + Tensor output shape of `(B, embedding_dim)`. + + .. note:: + + A few notes about ``input`` and ``offsets``: + + - :attr:`input` and :attr:`offsets` have to be of the same type, either int or long + + - If :attr:`input` is 2D of shape `(B, N)`, it will be treated as ``B`` bags (sequences) + each of fixed length ``N``, and this will return ``B`` values aggregated in a way + depending on the :attr:`mode`. :attr:`offsets` is ignored and required to be ``None`` in this case. + + - If :attr:`input` is 1D of shape `(N)`, it will be treated as a concatenation of + multiple bags (sequences). :attr:`offsets` is required to be a 1D tensor containing the + starting index positions of each bag in :attr:`input`. Therefore, for :attr:`offsets` of shape `(B)`, + :attr:`input` will be viewed as having ``B`` bags. Empty bags (i.e., having 0-length) will have + returned vectors filled by zeros. 
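+
+            A small sketch of the two input layouts (illustrative names and
+            values; the outputs depend on the module's randomly initialized
+            weights)::
+
+                >>> bag = nn.EmbeddingBag(10, 3, mode='sum')
+                >>> flat = mindtorch.tensor([1, 2, 4, 5, 4, 3, 2, 9], dtype=mindtorch.long)
+                >>> offsets = mindtorch.tensor([0, 4], dtype=mindtorch.long)
+                >>> # the 2D form below describes the same two bags of length 4
+                >>> bool((bag(flat, offsets) == bag(flat.reshape(2, 4))).all())
+                True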
+ """ + return F.embedding_bag( + input, + self.weight, + offsets, + self.max_norm, + self.norm_type, + self.scale_grad_by_freq, + self.mode, + self.sparse, + per_sample_weights, + self.include_last_offset, + self.padding_idx, + ) + + def extra_repr(self) -> str: + s = "{num_embeddings}, {embedding_dim}" + if self.max_norm is not None: + s += ", max_norm={max_norm}" + if self.norm_type != 2: + s += ", norm_type={norm_type}" + if self.scale_grad_by_freq is not False: + s += ", scale_grad_by_freq={scale_grad_by_freq}" + s += ", mode={mode}" + if self.padding_idx is not None: + s += ", padding_idx={padding_idx}" + return s.format(**{k: repr(v) for k, v in self.__dict__.items()}) + + @classmethod + def from_pretrained( + cls, + embeddings: Tensor, + freeze: bool = True, + max_norm: Optional[float] = None, + norm_type: float = 2.0, + scale_grad_by_freq: bool = False, + mode: str = "mean", + sparse: bool = False, + include_last_offset: bool = False, + padding_idx: Optional[int] = None, + ) -> "EmbeddingBag": + r"""Create EmbeddingBag instance from given 2-dimensional FloatTensor. + + Args: + embeddings (Tensor): FloatTensor containing weights for the EmbeddingBag. + First dimension is being passed to EmbeddingBag as 'num_embeddings', second as 'embedding_dim'. + freeze (bool, optional): If ``True``, the tensor does not get updated in the learning process. + Equivalent to ``embeddingbag.weight.requires_grad = False``. Default: ``True`` + max_norm (float, optional): See module initialization documentation. Default: ``None`` + norm_type (float, optional): See module initialization documentation. Default ``2``. + scale_grad_by_freq (bool, optional): See module initialization documentation. Default ``False``. + mode (str, optional): See module initialization documentation. Default: ``"mean"`` + sparse (bool, optional): See module initialization documentation. Default: ``False``. + include_last_offset (bool, optional): See module initialization documentation. Default: ``False``. + padding_idx (int, optional): See module initialization documentation. Default: ``None``. + + Examples:: + + >>> # FloatTensor containing pretrained weights + >>> weight = mindtorch.FloatTensor([[1, 2.3, 3], [4, 5.1, 6.3]]) + >>> embeddingbag = nn.EmbeddingBag.from_pretrained(weight) + >>> # Get embeddings for index 1 + >>> input = mindtorch.LongTensor([[1, 0]]) + >>> # xdoctest: +IGNORE_WANT("non-deterministic") + >>> embeddingbag(input) + tensor([[ 2.5000, 3.7000, 4.6500]]) + """ + assert embeddings.dim() == 2, ( + "Embeddings parameter is expected to be 2-dimensional" + ) + rows, cols = embeddings.shape + embeddingbag = cls( + num_embeddings=rows, + embedding_dim=cols, + _weight=embeddings, + max_norm=max_norm, + norm_type=norm_type, + scale_grad_by_freq=scale_grad_by_freq, + mode=mode, + sparse=sparse, + include_last_offset=include_last_offset, + padding_idx=padding_idx, + ) + embeddingbag.weight.requires_grad = not freeze + return embeddingbag \ No newline at end of file diff --git a/mindtorch/nn/modules/upsampling.py b/mindtorch/nn/modules/upsampling.py index ce20b85af..2c47202ce 100644 --- a/mindtorch/nn/modules/upsampling.py +++ b/mindtorch/nn/modules/upsampling.py @@ -1,13 +1,14 @@ -"""upsample""" +# mypy: allow-untyped-defs from typing import Optional + +import mindtorch.nn.functional as F from mindtorch import Tensor +from mindtorch.nn.common_types import _ratio_2_t, _ratio_any_t, _size_2_t, _size_any_t from .module import Module -from .. 
import functional as F -from ..common_types import _size_2_t, _ratio_2_t, _size_any_t, _ratio_any_t -__all__ = ['Upsample', 'UpsamplingNearest2d', 'UpsamplingBilinear2d'] +__all__ = ["Upsample", "UpsamplingNearest2d", "UpsamplingBilinear2d"] class Upsample(Module): @@ -132,7 +133,15 @@ class Upsample(Module): [1.2000, 1.3600, 1.5200, 1.2800, 0.6400, 0.0000], [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]]]) """ - __constants__ = ['size', 'scale_factor', 'mode', 'align_corners', 'name', 'recompute_scale_factor'] + + __constants__ = [ + "size", + "scale_factor", + "mode", + "align_corners", + "name", + "recompute_scale_factor", + ] name: str size: Optional[_size_any_t] scale_factor: Optional[_ratio_any_t] @@ -140,9 +149,14 @@ class Upsample(Module): align_corners: Optional[bool] recompute_scale_factor: Optional[bool] - def __init__(self, size: Optional[_size_any_t] = None, scale_factor: Optional[_ratio_any_t] = None, - mode: str = 'nearest', align_corners: Optional[bool] = None, - recompute_scale_factor: Optional[bool] = None) -> None: + def __init__( + self, + size: Optional[_size_any_t] = None, + scale_factor: Optional[_ratio_any_t] = None, + mode: str = "nearest", + align_corners: Optional[bool] = None, + recompute_scale_factor: Optional[bool] = None, + ) -> None: super().__init__() self.name = type(self).__name__ self.size = size @@ -155,21 +169,38 @@ def __init__(self, size: Optional[_size_any_t] = None, scale_factor: Optional[_r self.recompute_scale_factor = recompute_scale_factor def forward(self, input: Tensor) -> Tensor: - return F.interpolate(input, self.size, self.scale_factor, self.mode, self.align_corners, - recompute_scale_factor=self.recompute_scale_factor) + """ + Runs the forward pass. + """ + return F.interpolate( + input, + self.size, + self.scale_factor, + self.mode, + self.align_corners, + recompute_scale_factor=self.recompute_scale_factor, + ) + + def __setstate__(self, state): + if "recompute_scale_factor" not in state: + state["recompute_scale_factor"] = True + + super().__setstate__(state) def extra_repr(self) -> str: + """ + Return the extra representation of the module. + """ if self.scale_factor is not None: - info = 'scale_factor=' + repr(self.scale_factor) + info = "scale_factor=" + repr(self.scale_factor) else: - info = 'size=' + repr(self.size) - info += ', mode=' + repr(self.mode) + info = "size=" + repr(self.size) + info += ", mode=" + repr(self.mode) return info class UpsamplingNearest2d(Upsample): - r"""Applies a 2D nearest neighbor upsampling to an input signal composed of several input - channels. + r"""Applies a 2D nearest neighbor upsampling to an input signal composed of several input channels. To specify the scale, it takes either the :attr:`size` or the :attr:`scale_factor` as it's constructor argument. @@ -208,13 +239,17 @@ class UpsamplingNearest2d(Upsample): [3., 3., 4., 4.], [3., 3., 4., 4.]]]]) """ - def __init__(self, size: Optional[_size_2_t] = None, scale_factor: Optional[_ratio_2_t] = None) -> None: - super().__init__(size, scale_factor, mode='nearest') + + def __init__( + self, + size: Optional[_size_2_t] = None, + scale_factor: Optional[_ratio_2_t] = None, + ) -> None: + super().__init__(size, scale_factor, mode="nearest") class UpsamplingBilinear2d(Upsample): - r"""Applies a 2D bilinear upsampling to an input signal composed of several input - channels. + r"""Applies a 2D bilinear upsampling to an input signal composed of several input channels. 
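+
+    A minimal equivalence sketch (hypothetical input; per the constructor
+    below, this class is :class:`Upsample` with ``mode="bilinear"`` and
+    ``align_corners=True``)::
+
+        >>> input = mindtorch.arange(1, 5, dtype=mindtorch.float32).view(1, 1, 2, 2)
+        >>> up = nn.UpsamplingBilinear2d(scale_factor=2)
+        >>> ref = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True)
+        >>> bool((up(input) == ref(input)).all())
+        True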

    To specify the scale, it takes either the :attr:`size` or the :attr:`scale_factor` as its constructor argument.

@@ -255,5 +290,10 @@ class UpsamplingBilinear2d(Upsample):
         [2.3333, 2.6667, 3.0000, 3.3333],
         [3.0000, 3.3333, 3.6667, 4.0000]]]])
     """
-    def __init__(self, size: Optional[_size_2_t] = None, scale_factor: Optional[_ratio_2_t] = None) -> None:
-        super().__init__(size, scale_factor, mode='bilinear', align_corners=True)
+
+    def __init__(
+        self,
+        size: Optional[_size_2_t] = None,
+        scale_factor: Optional[_ratio_2_t] = None,
+    ) -> None:
+        super().__init__(size, scale_factor, mode="bilinear", align_corners=True)
\ No newline at end of file
diff --git a/mindtorch/nn/modules/utils.py b/mindtorch/nn/modules/utils.py
index c3f14433d..62d0fa25a 100644
--- a/mindtorch/nn/modules/utils.py
+++ b/mindtorch/nn/modules/utils.py
@@ -1,10 +1,14 @@
+# mypy: allow-untyped-defs
 import collections
 from itertools import repeat
+from typing import Any
+
+
+__all__ = ["consume_prefix_in_state_dict_if_present"]
+
 
 def _ntuple(n, name="parse"):
     def parse(x):
-        if isinstance(x, (list, tuple)) and len(x) == 1:
-            x = x[0]
         if isinstance(x, collections.abc.Iterable):
             return tuple(x)
         return tuple(repeat(x, n))
@@ -12,4 +16,66 @@ def parse(x):
     parse.__name__ = name
     return parse
 
+
+_single = _ntuple(1, "_single")
 _pair = _ntuple(2, "_pair")
+_triple = _ntuple(3, "_triple")
+_quadruple = _ntuple(4, "_quadruple")
+
+
+def _reverse_repeat_tuple(t, n):
+    r"""Reverse the order of `t` and repeat each element `n` times.
+
+    This can be used to translate the padding arg used by Conv and Pooling
+    modules into the form expected by `F.pad`.
+    """
+    return tuple(x for x in reversed(t) for _ in range(n))
+
+
+def _list_with_default(out_size: list[int], defaults: list[int]) -> list[int]:
+    import mindtorch
+
+    # use mindtorch here instead of torch so this module does not require
+    # torch to be installed; fall back to int if SymInt is not exposed
+    if isinstance(out_size, (int, getattr(mindtorch, "SymInt", int))):
+        return out_size
+    if len(defaults) <= len(out_size):
+        raise ValueError(f"Input dimension should be at least {len(out_size) + 1}")
+    return [
+        v if v is not None else d for v, d in zip(out_size, defaults[-len(out_size) :])
+    ]
+
+
+def consume_prefix_in_state_dict_if_present(
+    state_dict: dict[str, Any],
+    prefix: str,
+) -> None:
+    r"""Strip the prefix in state_dict in place, if any.
+
+    .. note::
+        Given a `state_dict` from a DP/DDP model, a local model can load it by applying
+        `consume_prefix_in_state_dict_if_present(state_dict, "module.")` before calling
+        :meth:`mindtorch.nn.Module.load_state_dict`.
+
+    Args:
+        state_dict (dict): a state-dict to be loaded to the model.
+        prefix (str): the prefix to strip from matching keys.
+    """
+    keys = list(state_dict.keys())
+    for key in keys:
+        if key.startswith(prefix):
+            newkey = key[len(prefix) :]
+            state_dict[newkey] = state_dict.pop(key)
+
+    # also strip the prefix in metadata if any.
+    if hasattr(state_dict, "_metadata"):
+        keys = list(state_dict._metadata.keys())
+        for key in keys:
+            # for the metadata dict, the key can be:
+            # '': for the DDP module, which we want to remove.
+            # 'module': for the actual model.
+            # 'module.xx.xx': for the rest.
+            if len(key) == 0:
+                continue
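+            # e.g. with prefix "module.", the bare metadata key "module" must
+            # also be remapped; the prefix.replace(".", "") comparison covers
+            # it, handling both the 'module' and 'module.' 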
cases + if key == prefix.replace(".", "") or key.startswith(prefix): + newkey = key[len(prefix) :] + state_dict._metadata[newkey] = state_dict._metadata.pop(key) \ No newline at end of file diff --git a/setup.py b/setup.py index 04359f1a9..b15afeb21 100644 --- a/setup.py +++ b/setup.py @@ -153,10 +153,10 @@ def run(self): "mindnlp": "mindnlp", "mindtorch": "mindtorch", }, - package_data={ - 'mindnlp': ['*', '*/*', '*/*/*', '*/*/*/*', '*/*/*/*/*', '*/*/*/*/*/*'], - 'mindtorch': ['*', '*/*', '*/*/*', '*/*/*/*', '*/*/*/*/*', '*/*/*/*/*/*'] - }, + # package_data={ + # 'mindnlp': ['*', '*/*', '*/*/*', '*/*/*/*', '*/*/*/*/*', '*/*/*/*/*/*'], + # 'mindtorch': ['*', '*/*', '*/*/*', '*/*/*/*', '*/*/*/*/*', '*/*/*/*/*/*'] + # }, cmdclass={ 'egg_info': EggInfo, 'build_py': BuildPy, From 1cdce47c30e036eb310250986fbeae1a95c84efe Mon Sep 17 00:00:00 2001 From: lvyufeng Date: Tue, 30 Sep 2025 11:34:05 +0800 Subject: [PATCH 2/2] remove legacy code and fix pylint --- .github/pylint.conf | 5 +- .../inference/gpt-oss/app_multiprocess.py | 4 +- .../inference/gpt-oss/gpt_oss_multiprocess.py | 2 +- mindnlp/data/__init__.py | 5 - mindnlp/data/processors/__init__.py | 16 - mindnlp/data/processors/squad.py | 597 --------------- mindnlp/dataset/transforms/__init__.py | 28 - mindnlp/dataset/transforms/basic_tokenizer.py | 298 -------- mindnlp/dataset/transforms/jieba_tokenizer.py | 85 --- mindnlp/dataset/transforms/lookup.py | 111 --- mindnlp/dataset/transforms/pad_transform.py | 91 --- mindnlp/evaluate.py | 2 +- mindnlp/integrations/__init__.py | 3 - mindnlp/integrations/evaluate.py | 20 - mindnlp/integrations/safetensors.py | 252 ------ mindnlp/integrations/transformers.py | 8 - mindnlp/peft.py | 2 +- mindnlp/quant/smooth_quant/smooth.py | 13 +- mindnlp/transformers/__init__.py | 12 +- mindnlp/transformers/generation/__init__.py | 1 - .../transformers/generation/logits_process.py | 11 - mindnlp/transformers/masking_utils.py | 63 +- mindnlp/transformers/modeling_utils.py | 150 ++-- mindnlp/transformers/ms_utils.py | 258 ------- mindnlp/transformers/trainer.py | 5 +- mindnlp/utils/__init__.py | 4 +- mindnlp/utils/decorators.py | 15 +- mindnlp/utils/generic.py | 264 ++++--- mindnlp/utils/import_utils.py | 641 ++++++---------- mindnlp/utils/logging.py | 107 +-- mindnlp/utils/safetensors_patch.py | 24 +- mindnlp/utils/testing_utils.py | 717 ++++++++++-------- mindtorch/_apis/npu.py | 2 + setup.py | 9 +- 34 files changed, 976 insertions(+), 2849 deletions(-) delete mode 100644 mindnlp/data/processors/__init__.py delete mode 100644 mindnlp/data/processors/squad.py delete mode 100644 mindnlp/dataset/transforms/__init__.py delete mode 100644 mindnlp/dataset/transforms/basic_tokenizer.py delete mode 100644 mindnlp/dataset/transforms/jieba_tokenizer.py delete mode 100644 mindnlp/dataset/transforms/lookup.py delete mode 100644 mindnlp/dataset/transforms/pad_transform.py delete mode 100644 mindnlp/integrations/__init__.py delete mode 100644 mindnlp/integrations/evaluate.py delete mode 100644 mindnlp/integrations/safetensors.py delete mode 100644 mindnlp/integrations/transformers.py delete mode 100644 mindnlp/transformers/generation/__init__.py delete mode 100644 mindnlp/transformers/generation/logits_process.py delete mode 100644 mindnlp/transformers/ms_utils.py diff --git a/.github/pylint.conf b/.github/pylint.conf index 49ca2a7c8..32719c6b8 100644 --- a/.github/pylint.conf +++ b/.github/pylint.conf @@ -218,7 +218,10 @@ disable=raw-checker-failed, use-a-generator, nested-min-max, method-hidden, - unsubscriptable-object + 
unsubscriptable-object, + wildcard-import, + unused-wildcard-import, + missing-module-docstring # Enable the message, report, category or checker with the given id(s). You can # either give multiple identifier separated by comma (,) or put this option diff --git a/examples/transformers/inference/gpt-oss/app_multiprocess.py b/examples/transformers/inference/gpt-oss/app_multiprocess.py index 239197505..3028f7ffe 100644 --- a/examples/transformers/inference/gpt-oss/app_multiprocess.py +++ b/examples/transformers/inference/gpt-oss/app_multiprocess.py @@ -1,7 +1,5 @@ -import mindnlp -import mindspore +from mindtorch import distributed as dist from mindnlp import core -from mindnlp.core import distributed as dist from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer import gradio as gr diff --git a/examples/transformers/inference/gpt-oss/gpt_oss_multiprocess.py b/examples/transformers/inference/gpt-oss/gpt_oss_multiprocess.py index 0a076fd33..35a44d66e 100644 --- a/examples/transformers/inference/gpt-oss/gpt_oss_multiprocess.py +++ b/examples/transformers/inference/gpt-oss/gpt_oss_multiprocess.py @@ -1,5 +1,5 @@ +from mindtorch import distributed as dist import mindnlp -from mindnlp.core import distributed as dist from transformers import AutoModelForCausalLM, AutoTokenizer dist.init_process_group('hccl') diff --git a/mindnlp/data/__init__.py b/mindnlp/data/__init__.py index 90c9e7217..57f7cff77 100644 --- a/mindnlp/data/__init__.py +++ b/mindnlp/data/__init__.py @@ -13,8 +13,3 @@ # limitations under the License. # ============================================================================ """data module""" -from .processors import ( - SquadExample, - SquadFeatures, - squad_convert_examples_to_features, -) diff --git a/mindnlp/data/processors/__init__.py b/mindnlp/data/processors/__init__.py deleted file mode 100644 index 249eea565..000000000 --- a/mindnlp/data/processors/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""processors.""" -from .squad import SquadExample, SquadFeatures, squad_convert_examples_to_features diff --git a/mindnlp/data/processors/squad.py b/mindnlp/data/processors/squad.py deleted file mode 100644 index e8834f26c..000000000 --- a/mindnlp/data/processors/squad.py +++ /dev/null @@ -1,597 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -# pylint: disable=redefined-outer-name -"""squad.""" -from functools import partial -from multiprocessing import Pool, cpu_count - -import numpy as np -from tqdm import tqdm - -from mindnlp.transformers.models.bert.tokenization_bert import whitespace_tokenize -from ...transformers.tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase, TruncationStrategy -from ...utils import logging - - -# Store the tokenizers which insert 2 separators tokens -MULTI_SEP_TOKENS_TOKENIZERS_SET = {"roberta", "camembert", "bart", "mpnet"} - -logger = logging.get_logger(__name__) - -def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text): - """Returns tokenized answer spans that better match the annotated answer.""" - tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) - - for new_start in range(input_start, input_end + 1): - for new_end in range(input_end, new_start - 1, -1): - text_span = " ".join(doc_tokens[new_start : (new_end + 1)]) - if text_span == tok_answer_text: - return (new_start, new_end) - - return (input_start, input_end) - - -def _check_is_max_context(doc_spans, cur_span_index, position): - """Check if this is the 'max context' doc span for the token.""" - best_score = None - best_span_index = None - for span_index, doc_span in enumerate(doc_spans): - end = doc_span.start + doc_span.length - 1 - if position < doc_span.start: - continue - if position > end: - continue - num_left_context = position - doc_span.start - num_right_context = end - position - score = min(num_left_context, num_right_context) + 0.01 * doc_span.length - if best_score is None or score > best_score: - best_score = score - best_span_index = span_index - - return cur_span_index == best_span_index - - -def _new_check_is_max_context(doc_spans, cur_span_index, position): - """Check if this is the 'max context' doc span for the token.""" - # if len(doc_spans) == 1: - # return True - best_score = None - best_span_index = None - for span_index, doc_span in enumerate(doc_spans): - end = doc_span["start"] + doc_span["length"] - 1 - if position < doc_span["start"]: - continue - if position > end: - continue - num_left_context = position - doc_span["start"] - num_right_context = end - position - score = min(num_left_context, num_right_context) + 0.01 * doc_span["length"] - if best_score is None or score > best_score: - best_score = score - best_span_index = span_index - - return cur_span_index == best_span_index - - -def _is_whitespace(c): - r""" - Args: - c (str): The character to be checked for whitespace. - It can be a space (' '), tab ('\t'), carriage return ('\r'), newline ('\n'), or a specific Unicode character (8239). - Returns: - None: Indicates whether the input character is a whitespace character or not. - Raises: - None. - """ - if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: - return True - return False - - -def squad_convert_example_to_features( - example, max_seq_length, doc_stride, max_query_length, padding_strategy, is_training -): - r""" - Args: - example (Example): An object representing a single training example containing the necessary information for feature conversion. - max_seq_length (int): The maximum total input sequence length after tokenization. - doc_stride (int): The stride to use when splitting the document into multiple features. 
- max_query_length (int): The maximum length of the question tokens after tokenization. - padding_strategy (str): The strategy for padding the input sequences. Can be 'longest', 'do_not_pad', 'max_length', 'pad_to_max_length', 'prefix', or 'x'. - is_training (bool): Indicates whether the function is being used during training. - - Returns: - None. The function processes the input example and parameters to generate features for the SQuAD dataset. - - Raises: - None. - """ - features = [] - if is_training and not example.is_impossible: - # Get start and end position - start_position = example.start_position - end_position = example.end_position - - # If the answer cannot be found in the text, then skip this example. - actual_text = " ".join(example.doc_tokens[start_position : (end_position + 1)]) - cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text)) - if actual_text.find(cleaned_answer_text) == -1: - logger.warning(f"Could not find answer: '{actual_text}' vs. '{cleaned_answer_text}'") - return [] - - tok_to_orig_index = [] - orig_to_tok_index = [] - all_doc_tokens = [] - for i, token in enumerate(example.doc_tokens): - orig_to_tok_index.append(len(all_doc_tokens)) - if tokenizer.__class__.__name__ in [ - "RobertaTokenizer", - "LongformerTokenizer", - "BartTokenizer", - "RobertaTokenizerFast", - "LongformerTokenizerFast", - "BartTokenizerFast", - ]: - sub_tokens = tokenizer.tokenize(token, add_prefix_space=True) - else: - sub_tokens = tokenizer.tokenize(token) - for sub_token in sub_tokens: - tok_to_orig_index.append(i) - all_doc_tokens.append(sub_token) - - if is_training and not example.is_impossible: - tok_start_position = orig_to_tok_index[example.start_position] - if example.end_position < len(example.doc_tokens) - 1: - tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 - else: - tok_end_position = len(all_doc_tokens) - 1 - - (tok_start_position, tok_end_position) = _improve_answer_span( - all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text - ) - - spans = [] - - truncated_query = tokenizer.encode( - example.question_text, add_special_tokens=False, truncation=True, max_length=max_query_length - ) - - # Tokenizers who insert 2 SEP tokens in-between & need to have special handling - # in the way they compute mask of added tokens. 
- tokenizer_type = type(tokenizer).__name__.replace("Tokenizer", "").lower() - sequence_added_tokens = ( - tokenizer.model_max_length - tokenizer.max_len_single_sentence + 1 - if tokenizer_type in MULTI_SEP_TOKENS_TOKENIZERS_SET - else tokenizer.model_max_length - tokenizer.max_len_single_sentence - ) - sequence_pair_added_tokens = tokenizer.model_max_length - tokenizer.max_len_sentences_pair - - span_doc_tokens = all_doc_tokens - while len(spans) * doc_stride < len(all_doc_tokens): - # Define the side we want to truncate / pad and the text/pair sorting - if tokenizer.padding_side == "right": - texts = truncated_query - pairs = span_doc_tokens - truncation = TruncationStrategy.ONLY_SECOND.value - else: - texts = span_doc_tokens - pairs = truncated_query - truncation = TruncationStrategy.ONLY_FIRST.value - - encoded_dict = tokenizer.encode_plus( # TODO(thom) update this logic - texts, - pairs, - truncation=truncation, - padding=padding_strategy, - max_length=max_seq_length, - return_overflowing_tokens=True, - stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, - return_token_type_ids=True, - ) - - paragraph_len = min( - len(all_doc_tokens) - len(spans) * doc_stride, - max_seq_length - len(truncated_query) - sequence_pair_added_tokens, - ) - - if tokenizer.pad_token_id in encoded_dict["input_ids"]: - if tokenizer.padding_side == "right": - non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)] - else: - last_padding_id_position = ( - len(encoded_dict["input_ids"]) - 1 - encoded_dict["input_ids"][::-1].index(tokenizer.pad_token_id) - ) - non_padded_ids = encoded_dict["input_ids"][last_padding_id_position + 1 :] - - else: - non_padded_ids = encoded_dict["input_ids"] - - tokens = tokenizer.convert_ids_to_tokens(non_padded_ids) - - token_to_orig_map = {} - for i in range(paragraph_len): - index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i - token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i] - - encoded_dict["paragraph_len"] = paragraph_len - encoded_dict["tokens"] = tokens - encoded_dict["token_to_orig_map"] = token_to_orig_map - encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens - encoded_dict["token_is_max_context"] = {} - encoded_dict["start"] = len(spans) * doc_stride - encoded_dict["length"] = paragraph_len - - spans.append(encoded_dict) - - if "overflowing_tokens" not in encoded_dict or ( - "overflowing_tokens" in encoded_dict and len(encoded_dict["overflowing_tokens"]) == 0 - ): - break - span_doc_tokens = encoded_dict["overflowing_tokens"] - - for doc_span_index in range(len(spans)): - for j in range(spans[doc_span_index]["paragraph_len"]): - is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j) - index = ( - j - if tokenizer.padding_side == "left" - else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j - ) - spans[doc_span_index]["token_is_max_context"][index] = is_max_context - - for span in spans: - # Identify the position of the CLS token - cls_index = span["input_ids"].index(tokenizer.cls_token_id) - - # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) - # Original TF implementation also keep the classification token (set to 0) - p_mask = np.ones_like(span["token_type_ids"]) - if tokenizer.padding_side == "right": - p_mask[len(truncated_query) + 
sequence_added_tokens :] = 0 - else: - p_mask[-len(span["tokens"]) : -(len(truncated_query) + sequence_added_tokens)] = 0 - - pad_token_indices = np.where(span["input_ids"] == tokenizer.pad_token_id) - special_token_indices = np.asarray( - tokenizer.get_special_tokens_mask(span["input_ids"], already_has_special_tokens=True) - ).nonzero() - - p_mask[pad_token_indices] = 1 - p_mask[special_token_indices] = 1 - - # Set the cls index to 0: the CLS index can be used for impossible answers - p_mask[cls_index] = 0 - - span_is_impossible = example.is_impossible - start_position = 0 - end_position = 0 - if is_training and not span_is_impossible: - # For training, if our document chunk does not contain an annotation - # we throw it out, since there is nothing to predict. - doc_start = span["start"] - doc_end = span["start"] + span["length"] - 1 - out_of_span = False - - if not (tok_start_position >= doc_start and tok_end_position <= doc_end): - out_of_span = True - - if out_of_span: - start_position = cls_index - end_position = cls_index - span_is_impossible = True - else: - if tokenizer.padding_side == "left": - doc_offset = 0 - else: - doc_offset = len(truncated_query) + sequence_added_tokens - - start_position = tok_start_position - doc_start + doc_offset - end_position = tok_end_position - doc_start + doc_offset - - features.append( - SquadFeatures( - span["input_ids"], - span["attention_mask"], - span["token_type_ids"], - cls_index, - p_mask.tolist(), - example_index=0, # Can not set unique_id and example_index here. They will be set after multiple processing. - unique_id=0, - paragraph_len=span["paragraph_len"], - token_is_max_context=span["token_is_max_context"], - tokens=span["tokens"], - token_to_orig_map=span["token_to_orig_map"], - start_position=start_position, - end_position=end_position, - is_impossible=span_is_impossible, - qas_id=example.qas_id, - ) - ) - return features - - -def squad_convert_example_to_features_init(tokenizer_for_convert: PreTrainedTokenizerBase): - r""" - Converts an example into a feature for the Squad dataset. - - Args: - tokenizer_for_convert (PreTrainedTokenizerBase): The tokenizer used for converting the example into features. - - Returns: - None - - Raises: - None - """ - global tokenizer # pylint: disable=global-variable-undefined - tokenizer = tokenizer_for_convert - - -def squad_convert_examples_to_features( - examples, - tokenizer, - max_seq_length, - doc_stride, - max_query_length, - is_training, - padding_strategy="max_length", - threads=1, - tqdm_enabled=True, -): - """ - Converts a list of examples into a list of features that can be directly given as input to a model. It is - model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs. - - Args: - examples: list of [`~data.processors.squad.SquadExample`] - tokenizer: an instance of a child of [`PreTrainedTokenizer`] - max_seq_length: The maximum sequence length of the inputs. - doc_stride: The stride used when the context is too large and is split across several features. - max_query_length: The maximum length of the query. - is_training: whether to create features for model evaluation or model training. - padding_strategy: Default to "max_length". Which padding strategy to use - threads: multiple processing threads. 
- - - Returns: - list of [`~data.processors.squad.SquadFeatures`] - - Example: - - ```python - processor = SquadV2Processor() - examples = processor.get_dev_examples(data_dir) - - features = squad_convert_examples_to_features( - examples=examples, - tokenizer=tokenizer, - max_seq_length=args.max_seq_length, - doc_stride=args.doc_stride, - max_query_length=args.max_query_length, - is_training=not evaluate, - ) - ```""" - # Defining helper methods - features = [] - - threads = min(threads, cpu_count()) - with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p: - annotate_ = partial( - squad_convert_example_to_features, - max_seq_length=max_seq_length, - doc_stride=doc_stride, - max_query_length=max_query_length, - padding_strategy=padding_strategy, - is_training=is_training, - ) - features = list( - tqdm( - p.imap(annotate_, examples, chunksize=32), - total=len(examples), - desc="convert squad examples to features", - disable=not tqdm_enabled, - ) - ) - - new_features = [] - unique_id = 1000000000 - example_index = 0 - for example_features in tqdm( - features, total=len(features), desc="add example index and unique id", disable=not tqdm_enabled - ): - if not example_features: - continue - for example_feature in example_features: - example_feature.example_index = example_index - example_feature.unique_id = unique_id - new_features.append(example_feature) - unique_id += 1 - example_index += 1 - features = new_features - del new_features - return features - -class SquadExample: - """ - A single training/test example for the Squad dataset, as loaded from disk. - - Args: - qas_id: The example's unique identifier - question_text: The question string - context_text: The context string - answer_text: The answer string - start_position_character: The character position of the start of the answer - title: The title of the example - answers: None by default, this is used during evaluation. Holds answers as well as their start positions. - is_impossible: False by default, set to True if the example has no possible answer. - """ - def __init__( - self, - qas_id, - question_text, - context_text, - answer_text, - start_position_character, - title, - answers=[], - is_impossible=False, - ): - r""" - Initialize a SquadExample object with the provided parameters. - - Args: - self (SquadExample): The instance of the SquadExample class. - qas_id (str): The unique identifier for the question-answer pair. - question_text (str): The text of the question. - context_text (str): The text of the context in which the question is asked. - answer_text (str): The text of the answer to the question. - start_position_character (int): The starting character position of the answer in the context. - title (str): The title of the context. - answers (list, optional): A list of additional answers related to the question. Default is an empty list. - is_impossible (bool): Indicates if the question is impossible to answer. Default is False. - - Returns: - None. This method initializes the SquadExample object with the provided parameters. - - Raises: - None. - """ - self.qas_id = qas_id - self.question_text = question_text - self.context_text = context_text - self.answer_text = answer_text - self.title = title - self.is_impossible = is_impossible - self.answers = answers - - self.start_position, self.end_position = 0, 0 - - doc_tokens = [] - char_to_word_offset = [] - prev_is_whitespace = True - - # Split on whitespace so that different tokens may be attributed to their original position. 
- for c in self.context_text: - if _is_whitespace(c): - prev_is_whitespace = True - else: - if prev_is_whitespace: - doc_tokens.append(c) - else: - doc_tokens[-1] += c - prev_is_whitespace = False - char_to_word_offset.append(len(doc_tokens) - 1) - - self.doc_tokens = doc_tokens - self.char_to_word_offset = char_to_word_offset - - # Start and end positions only has a value during evaluation. - if start_position_character is not None and not is_impossible: - self.start_position = char_to_word_offset[start_position_character] - self.end_position = char_to_word_offset[ - min(start_position_character + len(answer_text) - 1, len(char_to_word_offset) - 1) - ] - - -class SquadFeatures: - """ - Single squad example features to be fed to a model. Those features are model-specific and can be crafted from - [`~data.processors.squad.SquadExample`] using the - :method:*~transformers.data.processors.squad.squad_convert_examples_to_features* method. - - Args: - input_ids: Indices of input sequence tokens in the vocabulary. - attention_mask: Mask to avoid performing attention on padding token indices. - token_type_ids: Segment token indices to indicate first and second portions of the inputs. - cls_index: the index of the CLS token. - p_mask: Mask identifying tokens that can be answers vs. tokens that cannot. - Mask with 1 for tokens than cannot be in the answer and 0 for token that can be in an answer - example_index: the index of the example - unique_id: The unique Feature identifier - paragraph_len: The length of the context - token_is_max_context: - List of booleans identifying which tokens have their maximum context in this feature object. If a token - does not have their maximum context in this feature object, it means that another feature object has more - information related to that token and should be prioritized over this feature for that token. - tokens: list of tokens corresponding to the input ids - token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer. - start_position: start of the answer token index - end_position: end of the answer token index - encoding: optionally store the BatchEncoding with the fast-tokenizer alignment methods. - """ - def __init__( - self, - input_ids, - attention_mask, - token_type_ids, - cls_index, - p_mask, - example_index, - unique_id, - paragraph_len, - token_is_max_context, - tokens, - token_to_orig_map, - start_position, - end_position, - is_impossible, - qas_id: str = None, - encoding: BatchEncoding = None, - ): - r""" - This method initializes an instance of the SquadFeatures class. - - Args: - input_ids (list): List of input IDs representing the tokenized input sequence. - attention_mask (list): List of attention mask values to indicate which tokens should be attended to. - token_type_ids (list): List of token type IDs to differentiate between question and context tokens. - cls_index (int): Index of the [CLS] token in the input sequence. - p_mask (list): List of mask values for the paragraph tokens. - example_index (int): Index of the example in the dataset. - unique_id (int): Unique identifier for the squad feature. - paragraph_len (int): Length of the paragraph. - token_is_max_context (dict): Dictionary mapping token indices to their maximum context indices. - tokens (list): List of tokens in the input sequence. - token_to_orig_map (list): List of token-to-original mapping. - start_position (int): Start position of the answer span. - end_position (int): End position of the answer span. 
- is_impossible (bool): Indicates if the question is unanswerable. - qas_id (str, optional): ID of the question-answer pair. Defaults to None. - encoding (BatchEncoding, optional): Batch encoding for the input sequence. Defaults to None. - - Returns: - None: This method does not return any value. - - Raises: - N/A - """ - self.input_ids = input_ids - self.attention_mask = attention_mask - self.token_type_ids = token_type_ids - self.cls_index = cls_index - self.p_mask = p_mask - - self.example_index = example_index - self.unique_id = unique_id - self.paragraph_len = paragraph_len - self.token_is_max_context = token_is_max_context - self.tokens = tokens - self.token_to_orig_map = token_to_orig_map - - self.start_position = start_position - self.end_position = end_position - self.is_impossible = is_impossible - self.qas_id = qas_id - self.encoding = encoding diff --git a/mindnlp/dataset/transforms/__init__.py b/mindnlp/dataset/transforms/__init__.py deleted file mode 100644 index f4e4cea50..000000000 --- a/mindnlp/dataset/transforms/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright 2022 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" -dataset processing transforms -""" - -from mindspore.dataset.text import Truncate, AddToken - -from .lookup import Lookup -from .basic_tokenizer import BasicTokenizer -from .pad_transform import PadTransform -from .jieba_tokenizer import JiebaTokenizer - -__all__ = [ - 'Truncate', 'AddToken', 'Lookup', 'PadTransform', 'BasicTokenizer', 'JiebaTokenizer' -] diff --git a/mindnlp/dataset/transforms/basic_tokenizer.py b/mindnlp/dataset/transforms/basic_tokenizer.py deleted file mode 100644 index 3d140619d..000000000 --- a/mindnlp/dataset/transforms/basic_tokenizer.py +++ /dev/null @@ -1,298 +0,0 @@ -# Copyright 2022 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================ -""" -BasicTokenizer -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import unicodedata -import platform -import numpy as np -import mindspore._c_dataengine as cde # pylint: disable=no-name-in-module, import-error -from mindspore.dataset.transforms.transforms import PyTensorOperation -from mindspore.dataset.text.transforms import TextTensorOperation, Implementation - -class BasicTokenizer(TextTensorOperation, PyTensorOperation): - """ - Tokenize the input UTF-8 encoded string by specific rules. - - Args: - lower_case (bool, optional): Whether to perform lowercase processing on the text. If True, will fold the - text to lower case and strip accented characters. If False, will only perform normalization on the - text, with mode specified by `normalization_form`. Default: False. - py_transform (bool, optional): Whether use python implementation. Default: False. - - Raises: - TypeError: If `lower_case` is not of type bool. - TypeError: If `py_transform` is not of type bool. - RuntimeError: If dtype of input Tensor is not str. - - Supported Platforms: - ``CPU`` - - Examples: - >>> from mindnlp.dataset.transforms import BasicTokenizer - >>> tokenizer_op = BasicTokenizer() - >>> text = "Welcom to China!" - >>> tokenized_text = tokenizer_op(text) - - """ - # @check_decode - def __init__(self, lower_case=False, py_transform=False): - r""" - Initializes an instance of the BasicTokenizer class. - - Args: - self: The instance of the class. - lower_case (bool): Specifies whether the tokens should be converted to lowercase. Defaults to False. - py_transform (bool): Specifies whether the Python implementation should be used or not. Defaults to False. - If py_transform is set to True or the current operating system is Windows, the Python implementation is used. - Otherwise, the C implementation is used. - - Returns: - None. This method does not return any value. - - Raises: - None. - """ - super().__init__() - if py_transform or platform.system().lower() == 'windows': - self.tokenizer = _BasicTokenizer(lower_case) - self.implementation = Implementation.PY - else: - self.tokenizer = None - self.implementation = Implementation.C - self.lower_case = lower_case - - def __call__(self, text_input): - """ - Call method for input conversion for eager mode with C++ implementation. - """ - if isinstance(text_input, str): - text_input = np.array(text_input) - elif not isinstance(text_input, np.ndarray): - raise TypeError( - f"Input should be a text line in 1-D NumPy format, got {type(text_input)}.") - return super().__call__(text_input) - - def execute_py(self, text_input): - """ - Execute method. - """ - return self._execute_py(text_input) - - def _execute_py(self, text_input): - """ - Execute method. - """ - tokens = self.tokenizer.tokenize(text_input) - return np.array(tokens) - - def parse(self): - r"""Parse the input using the BasicTokenizerOperation. - - This method applies the BasicTokenizerOperation to the input data, which tokenizes the input text into a list of tokens. - - Args: - self: An instance of the BasicTokenizer class. - - Returns: - None: This method does not return any value. - - Raises: - None: This method does not raise any exceptions. 
- """ - from mindspore.dataset.text.transforms import DE_C_INTER_NORMALIZE_FORM, NormalizeForm - normalization_form = DE_C_INTER_NORMALIZE_FORM.get(NormalizeForm.NFD) - return cde.BasicTokenizerOperation(self.lower_case, False, normalization_form, - False, False) - -def _convert_to_unicode(text): - """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" - if isinstance(text, str): - return text - if isinstance(text, bytes): - return text.decode("utf-8", "ignore") - if isinstance(text, np.ndarray): - if text.dtype.type is np.bytes_: - text = np.char.decode(text, "utf-8") - return str(text) - raise ValueError(f"Unsupported string type: {type(text)}, {text.dtype}") - -def _whitespace_tokenize(text): - """Runs basic whitespace cleaning and splitting on a piece of text.""" - text = text.strip() - if not text: - return [] - tokens = text.split() - return tokens - - -class _BasicTokenizer(): - """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" - def __init__(self, do_lower_case=True): - """Constructs a BasicTokenizer. - Args: - do_lower_case: Whether to lower case the input. - """ - self.do_lower_case = do_lower_case - - def tokenize(self, text): - """Tokenizes a piece of text.""" - text = _convert_to_unicode(text) - text = self._clean_text(text) - - # This was added on November 1st, 2018 for the multilingual and Chinese - # models. This is also applied to the English models now, but it doesn't - # matter since the English models were not trained on any Chinese data - # and generally don't have any Chinese data in them (there are Chinese - # characters in the vocabulary because Wikipedia does have some Chinese - # words in the English Wikipedia.). - text = self._tokenize_chinese_chars(text) - - orig_tokens = _whitespace_tokenize(text) - split_tokens = [] - for token in orig_tokens: - if self.do_lower_case: - token = token.lower() - token = self._run_strip_accents(token) - split_tokens.extend(self._run_split_on_punc(token)) - - output_tokens = _whitespace_tokenize(" ".join(split_tokens)) - return output_tokens - - def _run_strip_accents(self, text): - """Strips accents from a piece of text.""" - text = unicodedata.normalize("NFD", text) - output = [] - for char in text: - cat = unicodedata.category(char) - if cat == "Mn": - continue - output.append(char) - return "".join(output) - - def _run_split_on_punc(self, text): - """Splits punctuation on a piece of text.""" - chars = list(text) - i = 0 - start_new_word = True - output = [] - while i < len(chars): - char = chars[i] - if _is_punctuation(char): - output.append([char]) - start_new_word = True - else: - if start_new_word: - output.append([]) - start_new_word = False - output[-1].append(char) - i += 1 - - return ["".join(x) for x in output] - - def _tokenize_chinese_chars(self, text): - """Adds whitespace around any CJK character.""" - output = [] - for char in text: - cp = ord(char) - if self._is_chinese_char(cp): - output.append(" ") - output.append(char) - output.append(" ") - else: - output.append(char) - return "".join(output) - - def _is_chinese_char(self, cp): - """Checks whether CP is the codepoint of a CJK character.""" - # This defines a "chinese character" as anything in the CJK Unicode block: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - # - # Note that the CJK Unicode block is NOT all Japanese and Korean characters, - # despite its name. The modern Korean Hangul alphabet is a different block, - # as is Japanese Hiragana and Katakana. 
Those alphabets are used to write - # space-separated words, so they are not treated specially and handled - # like all of the other languages. - if ((0x4E00 <= cp <= 0x9FFF) or - (0x3400 <= cp <= 0x4DBF) or - (0x20000 <= cp <= 0x2A6DF) or - (0x2A700 <= cp <= 0x2B73F) or - (0x2B740 <= cp <= 0x2B81F) or - (0x2B820 <= cp <= 0x2CEAF) or - (0xF900 <= cp <= 0xFAFF) or - (0x2F800 <= cp <= 0x2FA1F)): - return True - - return False - - def _clean_text(self, text): - """Performs invalid character removal and whitespace cleanup on text.""" - output = [] - for char in text: - cp = ord(char) - if cp == 0 or cp == 0xfffd or _is_control(char): - continue - if _is_whitespace(char): - output.append(" ") - else: - output.append(char) - return "".join(output) - - -def _is_whitespace(char): - """Checks whether `chars` is a whitespace character.""" - # \t, \n, and \r are technically control characters but we treat them - # as whitespace since they are generally considered as such. - if char in (" ", "\t", "\n", "\r"): - return True - cat = unicodedata.category(char) - if cat == "Zs": - return True - return False - - -def _is_control(char): - """Checks whether `chars` is a control character.""" - # These are technically control characters but we count them as whitespace - # characters. - if char in ("\t", "\n", "\r"): - return False - cat = unicodedata.category(char) - if cat in ("Cc", "Cf"): - return True - return False - - -def _is_punctuation(char): - """Checks whether `chars` is a punctuation character.""" - cp = ord(char) - # We treat all non-letter/number ASCII as punctuation. - # Characters such as "^", "$", and "`" are not in the Unicode - # Punctuation class but we treat them as punctuation anyways, for - # consistency. - if ((33 <= cp <= 47) or - (58 <= cp <= 64) or - (91 <= cp <= 96) or - (123 <= cp <= 126)): - return True - cat = unicodedata.category(char) - if cat.startswith("P"): - return True - return False diff --git a/mindnlp/dataset/transforms/jieba_tokenizer.py b/mindnlp/dataset/transforms/jieba_tokenizer.py deleted file mode 100644 index e3bf6d8e1..000000000 --- a/mindnlp/dataset/transforms/jieba_tokenizer.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" -JiebaTokenizer Python version -""" -import logging -import os -from mindnlp.utils import is_jieba_available - -class JiebaTokenizer: - - r""" - A class for tokenizing Chinese text using the Jieba library. - - This class provides methods for initializing the tokenizer with custom word dictionaries and settings, as well as for tokenizing Chinese text with options for different cutting modes and Named Entity -Recognition (NER). - - Args: - dict_path (str): Path to a custom dictionary file. Default is an empty string. - custom_word_freq_dict (dict): A dictionary mapping custom words to their frequencies.
- - Attributes: - model (object): The Jieba model for tokenization. - - Methods: - __init__(dict_path='', custom_word_freq_dict=None): - Initializes the JiebaTokenizer instance with a custom dictionary and word frequencies. - - tokenize(sentence, cut_all=False, HMM=True): - Tokenizes the input sentence and returns a list of strings representing the segmented words. - - Example Usage: - tokenizer = JiebaTokenizer() - tokens = tokenizer.tokenize('我爱自然语言处理', cut_all=True) - print(tokens) # Output: ['我', '爱', '自然', '自然语言', '言', '处理'] - """ - def __init__(self, dict_path='', custom_word_freq_dict=None): - r""" - Initializes a new instance of the JiebaTokenizer class. - - Args: - self: The current instance of the JiebaTokenizer class. - dict_path (str): The path to the custom dictionary file. Defaults to an empty string. - custom_word_freq_dict (dict): A dictionary containing custom word frequencies. Defaults to None. - - Returns: - None. This method does not return any value. - - Raises: - None. This method does not raise any exceptions. - """ - if is_jieba_available(): - import jieba - - self.model = jieba - self.model.default_logger.setLevel(logging.ERROR) - # initialize the main dictionary - if os.path.exists(dict_path): - self.model.set_dictionary(dict_path) - # load the user-defined dictionary - if custom_word_freq_dict: - for w, f in custom_word_freq_dict.items(): - self.model.add_word(w, freq=f) - - def tokenize(self, sentence, cut_all=False, HMM=True): - """ - Tokenize the sentence and return the list of tokens. - :param sentence: the sentence to tokenize - :param cut_all: whether to use full mode; disabled by default - :param HMM: whether to enable NER recognition; enabled by default - :return: A list of strings. - """ - return self.model.lcut(sentence, cut_all=cut_all, HMM=HMM) diff --git a/mindnlp/dataset/transforms/lookup.py b/mindnlp/dataset/transforms/lookup.py deleted file mode 100644 index 56848659f..000000000 --- a/mindnlp/dataset/transforms/lookup.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -# pylint:disable=I1101 -# pylint:disable=W0212 - -""" -lookup transforms -""" -import mindspore._c_dataengine as cde # pylint: disable=no-name-in-module, import-error -from mindspore.dataset.text.transforms import TextTensorOperation -from mindspore.dataset.core.datatypes import mstype_to_detype -from mindspore.common import dtype as mstype -from mindspore.dataset.text import Vocab as msVocab -from mindnlp.vocab import Vocab as nlpVocab - - -class Lookup(TextTensorOperation): - """ - Look up a word into an id according to the input vocabulary table. - - Args: - vocab (Vocab): A vocabulary object. - unk_token (str): unknown token for OOV. - return_dtype (mindspore.dtype, optional): The data type that lookup operation maps - string to. Default: mindspore.int32. - - Raises: - TypeError: If `vocab` is not of type text.Vocab. - TypeError: If `return_dtype` is not of type mindspore.dtype.
- - Examples: - >>> from mindnlp import Vocab - >>> from mindnlp.transforms import Lookup - >>> # Load vocabulary from list - >>> vocab = Vocab(['深', '圳', '欢', '迎', '您']) - >>> # Use Lookup operation to map tokens to ids - >>> lookup = Lookup(vocab) - >>> text_file_dataset = text_file_dataset.map(operations=[lookup]) - """ - def __init__(self, vocab, unk_token, return_dtype=mstype.int32): - r""" - Initializes a Lookup object. - - Args: - self (object): The instance of the Lookup class. - vocab (object): An object representing the vocabulary. It can be an instance of nlpVocab or msVocab. - For nlpVocab, the vocabulary is created from the token dictionary of the object. - For msVocab, the vocabulary is obtained from the 'c_vocab' attribute of the object. - Raises a ValueError if the vocab object is not of type nlpVocab or msVocab. - unk_token (str): The unknown token used for out-of-vocabulary words. - return_dtype (type, optional): The return data type for the lookup values. Defaults to mstype.int32. - - Returns: - None. This method initializes the Lookup object with the provided parameters. - - Raises: - ValueError: If the 'vocab' parameter is not an instance of nlpVocab or msVocab. - """ - super().__init__() - if isinstance(vocab, nlpVocab): - self._vocab = cde.Vocab.from_dict(vocab._token_dict) - elif isinstance(vocab, msVocab): - self._vocab = vocab.c_vocab - else: - raise ValueError(f'do not support vocab type {type(vocab)}.') - - self._unk_token = unk_token - self._return_dtype = return_dtype - - def parse(self): - r""" - Parses the lookup operation based on the specified vocabulary. - - Args: - self: An instance of the Lookup class. - - Returns: - cde.LookupOperation: The underlying C++ lookup operation configured with the vocabulary, unknown token and return dtype. - - Raises: - None. - - Description: - This method builds the lookup operation by using the specified vocabulary. It takes into account the following parameters: - - - `self`: An instance of the Lookup class. This parameter is required to access the instance variables and methods of the class. - - The lookup operation is performed using the `cde.LookupOperation` function. The parameters used for the lookup operation are as follows: - - - `self._vocab`: The vocabulary used for the lookup operation. - - `self._unk_token`: The token to be used for unknown words in the lookup operation. - - `str(mstype_to_detype(self._return_dtype))`: The return data type of the lookup operation, converted to a string. - - The method returns the constructed operation; it does not modify the internal state of the Lookup instance. - - Note: - - This method assumes that the `cde.LookupOperation` function is available and properly implemented. - """ - return cde.LookupOperation(self._vocab, self._unk_token, str(mstype_to_detype(self._return_dtype))) diff --git a/mindnlp/dataset/transforms/pad_transform.py b/mindnlp/dataset/transforms/pad_transform.py deleted file mode 100644 index bbb5e5f0b..000000000 --- a/mindnlp/dataset/transforms/pad_transform.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""PadTransform""" -import numpy as np -from mindspore.dataset.transforms.transforms import PyTensorOperation -from mindspore.dataset.text.transforms import Implementation - - -class PadTransform(PyTensorOperation): - """ - Pad tensor to a fixed length with given padding value. - - Args: - max_length (int): Maximum length to pad to. - pad_value (int): Value to pad the tensor with. - return_length (bool): Whether to return the auxiliary sequence length. - - Raises: - TypeError: If `max_length` or `pad_value` is not of type int. - - Supported Platforms: - ``CPU`` - - Examples: - >>> import numpy as np - >>> pad_op = PadTransform(max_length=4, pad_value=0) - >>> padded = pad_op(np.array([1, 2], dtype=np.int32)) # pads to [1, 2, 0, 0] - - """ - # @check_decode - def __init__(self, max_length: int, pad_value: int, return_length: bool = False): - r""" - Initializes an instance of the PadTransform class. - - Args: - self: The instance of the class. - max_length (int): The maximum length of the sequence to be padded. - pad_value (int): The value used for padding the sequence. - return_length (bool, optional): Flag indicating whether to also return the original sequence length before padding. Defaults to False. - - Returns: - None: This method does not return any value. - - Raises: - None: This method does not raise any exceptions. - """ - super().__init__() - self.max_length = max_length - self.pad_value = pad_value - self.return_length = return_length - self.implementation = Implementation.PY - - def __call__(self, text_input): - """ - Call method for input conversion for eager mode with C++ implementation. - """ - if not isinstance(text_input, np.ndarray): - raise TypeError( - f"Input should be a text line in 1-D ndarray containing strings, got {type(text_input)}.") - return super().__call__(text_input) - - def execute_py(self, text_input): - """ - Execute method. - """ - return self._execute_py(text_input) - - def _execute_py(self, text_input): - """ - Execute method. - """ - text_input = text_input[:self.max_length] - text_length = len(text_input) - - pad_value = np.array([self.pad_value] * (self.max_length - text_length), text_input.dtype) - text_output = np.concatenate([text_input, pad_value], 0) - - if self.return_length: - length = np.array(text_length) - return text_output, length - - return text_output diff --git a/mindnlp/evaluate.py b/mindnlp/evaluate.py index 02c955cf8..faeeb4aeb 100644 --- a/mindnlp/evaluate.py +++ b/mindnlp/evaluate.py @@ -1 +1 @@ -from evaluate import * \ No newline at end of file +from evaluate import * diff --git a/mindnlp/integrations/__init__.py b/mindnlp/integrations/__init__.py deleted file mode 100644 index 20b4fab81..000000000 --- a/mindnlp/integrations/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from . import safetensors -from . import transformers -from . import evaluate diff --git a/mindnlp/integrations/evaluate.py b/mindnlp/integrations/evaluate.py deleted file mode 100644 index a5c632d38..000000000 --- a/mindnlp/integrations/evaluate.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" -evaluate module. -""" -from evaluate import config - -config.HUB_EVALUATE_URL = "https://openi.pcl.ac.cn/{path}/raw/branch/{revision}/{name}" diff --git a/mindnlp/integrations/safetensors.py b/mindnlp/integrations/safetensors.py deleted file mode 100644 index 86f2f0794..000000000 --- a/mindnlp/integrations/safetensors.py +++ /dev/null @@ -1,252 +0,0 @@ -from typing import OrderedDict -import mmap -import json -import numpy as np - -import safetensors - -import mindtorch - -from mindtorch.configs import SUPPORT_BF16 - -if SUPPORT_BF16: - from mindspore.common.np_dtype import bfloat16 # pylint: disable=import-error -else: - from ml_dtypes import bfloat16 - -MAGIC_NUMBER = 0x1950A86A20F9469CFC6C -PROTOCOL_VERSION = 1001 -MAX_HEADER_SIZE = 100 * 1000 * 1000 - - -_MS_TYPES = { - "F64": mindtorch.float64, - "F32": mindtorch.float32, - "F16": mindtorch.float16, - "BF16": mindtorch.bfloat16, - "I64": mindtorch.int64, - "U64": mindtorch.uint64, - "I32": mindtorch.int32, - "U32": mindtorch.uint32, - "I16": mindtorch.int16, - "U16": mindtorch.uint16, - "I8": mindtorch.int8, - "U8": mindtorch.uint8, - "BOOL": mindtorch.bool, -} - -_NP_TYPES = { - "F64": np.float64, - "F32": np.float32, - "F16": np.float16, - "BF16": bfloat16, - "I64": np.int64, - "U64": np.uint64, - "I32": np.int32, - "U32": np.uint32, - "I16": np.int16, - "U16": np.uint16, - "I8": np.int8, - "U8": np.uint8, - "BOOL": bool, -} - - -_DTYPE_SIZE = { - "BOOL": 1, - "U8": 1, - "I8": 1, - "F8_E5M2": 1, - "F8_E4M3": 1, - "I16": 2, - "U16": 2, - "I32": 4, - "U32": 4, - "I64": 8, - "U64": 8, - "F16": 2, - "BF16": 2, - "F32": 4, - "F64": 8, -} - -class PySafeSlice: - def __init__(self, info, bufferfile, base_ptr, buffermmap): - self.info = info - self.bufferfile = bufferfile - self.buffermmap = buffermmap - self.base_ptr = base_ptr - - self.start = [0 for _ in self.shape] - self.stop = list(self.shape) - self.step = [1 for _ in self.shape] - - @property - def ndim(self): - return len(self.shape) - - def get(self, *args, **kwargs): - nbytes = int(np.prod(self.shape)) * np.dtype(self.dtype).itemsize - offset = self.start_offset - tensor = np.frombuffer(self.buffermmap, dtype=self.dtype, offset=offset, - count=nbytes // np.dtype(self.dtype).itemsize) - tensor = tensor.reshape(self.shape) - if not SUPPORT_BF16 and self.info["dtype"] == 'BF16': - tensor = tensor.astype(np.float16) - tensor = mindtorch.from_numpy(tensor) - return tensor - - @property - def start_offset(self): - return self.base_ptr + self.info["data_offsets"][0] - - def get_shape(self): - return self.shape - - def get_dtype(self): - return self.info["dtype"] - - @property - def shape(self): - return self.info["shape"] - - @property - def dtype(self): - return _NP_TYPES[self.info["dtype"]] - - @property - def nelements(self): - return np.prod(self.info["shape"]) - - @property - def bits(self): - return _DTYPE_SIZE[self.info["dtype"]] - - @property - def nbytes(self): - return self.nelements * self.bits - - def __getitem__(self, slice): - if slice is Ellipsis: - return self.get() - return self.get()[slice] - -def getSize(fileobject): - fileobject.seek(0, 2) # move the cursor to the end of the file - size = fileobject.tell() - fileobject.seek(0) # move the cursor to the start of the file - return size - - -def metadata_validate(metadata): - start = 0 - for key, info in metadata.items(): - s, e = 
info["data_offsets"] - if s != start or e < s: - raise ValueError(f"SafeTensorError::InvalidOffset({key})") - start = e - nelements = np.prod(info["shape"]) - nbytes = nelements * _DTYPE_SIZE[info["dtype"]] - if (e - s) != nbytes: - raise ValueError("SafeTensorError::TensorInvalidInfo") - return start - -def read_metadata(buffer): - buffer_len = getSize(buffer) - if buffer_len < 8: - raise ValueError("SafeTensorError::HeaderTooSmall") - - n = np.frombuffer(buffer.read(8), dtype=np.uint64).item() - - if n > MAX_HEADER_SIZE: - raise ValueError("SafeTensorError::HeaderTooLarge") - - stop = n + 8 - if stop > buffer_len: - raise ValueError("SafeTensorError::InvalidHeaderLength") - - tensors = json.loads(buffer.read(n), object_pairs_hook=OrderedDict) - - metadata = tensors.pop("__metadata__", None) - buffer_end = metadata_validate(tensors) - - if buffer_end + 8 + n != buffer_len: - raise ValueError("SafeTensorError::MetadataIncompleteBuffer") - - return stop, tensors, metadata - - -class fast_safe_open: - def __init__(self, filename, framework=None, device="cpu"): - self.filename = filename - self.framework = framework - self.file = open(self.filename, "rb") - self.file_mmap = mmap.mmap(self.file.fileno(), 0, access=mmap.ACCESS_COPY) - self.base, self.tensors_decs, self.__metadata__ = read_metadata(self.file) - self.tensors = OrderedDict() - for key, info in self.tensors_decs.items(): - self.tensors[key] = PySafeSlice(info, self.file, self.base, self.file_mmap) - self.tensors[key].key = key - - def __enter__(self): - return self - - def __exit__(self, *args): - self.file.close() - - def metadata(self): - return self.__metadata__ - - def keys(self): - return list(self.tensors.keys()) - - def get_tensor(self, name): - return self.tensors[name].get() - - def get_slice(self, name): - return self.tensors[name] - -def safe_load_file(filename): - """ - This function safely loads a file containing state dictionary data and converts it into a dictionary of MindSpore Parameters. - - Args: - filename (str): The path to the file containing the state dictionary data to be loaded. - - Returns: - dict: A dictionary where keys are parameter names and values are MindSpore Parameters. - - Raises: - FileNotFoundError: If the specified file 'filename' does not exist. - ValueError: If the data in the file is not in the correct format to create MindSpore Parameters. - """ - result = {} - with fast_safe_open(filename, framework="np") as f: - for k in f.keys(): - result[k] = f.get_tensor(k) - return result - - -def safe_save_file(tensor_dict, filename, metadata=None): - """ - Function to safely save a dictionary of tensors to a file. - - Args: - tensor_dict (dict): A dictionary where keys are strings and values are numpy arrays representing tensors. - filename (str): The name of the file where the tensor data will be saved. - metadata (optional): Additional metadata to be saved along with the tensor data. Default is None. - - Returns: - None. The function does not return any value explicitly. - - Raises: - ValueError: If the input tensor_dict is not in the expected format. - IOError: If there are issues with writing the data to the specified file. - Exception: Any other unexpected error that may occur during the process. 
- """ - tensor_dict = {k: v.asnumpy() for k, v in tensor_dict.items()} - return safetensors.numpy.save_file(tensor_dict, filename, metadata) - -safetensors.safe_open = fast_safe_open -from safetensors import torch -torch.load_file = safe_load_file diff --git a/mindnlp/integrations/transformers.py b/mindnlp/integrations/transformers.py deleted file mode 100644 index d09b85aeb..000000000 --- a/mindnlp/integrations/transformers.py +++ /dev/null @@ -1,8 +0,0 @@ -import transformers - -def mock_is_not_available(): - return False - -transformers.utils.import_utils.is_torchvision_v2_available.__code__ = mock_is_not_available.__code__ -transformers.utils.import_utils.is_torch_flex_attn_available.__code__ = mock_is_not_available.__code__ - diff --git a/mindnlp/peft.py b/mindnlp/peft.py index 06c839a9c..705c3aa4a 100644 --- a/mindnlp/peft.py +++ b/mindnlp/peft.py @@ -1 +1 @@ -from peft import * \ No newline at end of file +from peft import * diff --git a/mindnlp/quant/smooth_quant/smooth.py b/mindnlp/quant/smooth_quant/smooth.py index 0d0066e29..73e3bec74 100644 --- a/mindnlp/quant/smooth_quant/smooth.py +++ b/mindnlp/quant/smooth_quant/smooth.py @@ -1,10 +1,13 @@ ''' code from https://github.com/mit-han-lab/smoothquant/ ''' -from mindtorch import ops, nn, no_grad - from transformers.models.llama.modeling_llama import LlamaDecoderLayer, LlamaRMSNorm +import mindtorch +from mindtorch import nn +from mindtorch.autograd import no_grad + + @no_grad() def smooth_ln_fcs_llama_like(ln, fcs, act_scales, alpha=0.5): if not isinstance(fcs, list): @@ -15,10 +18,10 @@ def smooth_ln_fcs_llama_like(ln, fcs, act_scales, alpha=0.5): assert ln.weight.shape[0] == fc.in_features == act_scales.numel() dtype = fcs[0].weight.dtype act_scales = act_scales.to(dtype=dtype) - weight_scales = ops.cat( - [ops.max(fc.weight.abs(), dim=0, keepdim=True)[0] for fc in fcs], dim=0 + weight_scales = mindtorch.cat( + [mindtorch.max(fc.weight.abs(), dim=0, keepdim=True)[0] for fc in fcs], dim=0 ) - weight_scales = ops.max(weight_scales, dim=0)[0].clamp(min=1e-5) + weight_scales = mindtorch.max(weight_scales, dim=0)[0].clamp(min=1e-5) scales = ( (act_scales.pow(alpha) / weight_scales.pow(1 - alpha)) .clamp(min=1e-5) diff --git a/mindnlp/transformers/__init__.py b/mindnlp/transformers/__init__.py index b2fafb024..a356c2254 100644 --- a/mindnlp/transformers/__init__.py +++ b/mindnlp/transformers/__init__.py @@ -1,20 +1,20 @@ import sys from packaging import version -from mindtorch.configs import ON_ORANGE_PI + +import transformers + from mindnlp.utils.import_utils import * from mindnlp.utils.import_utils import _LazyModule -from . 
import ms_utils from .masking_utils import create_causal_mask, create_sliding_window_causal_mask, create_masks_for_generate from .modeling_utils import construct_pipeline_parallel_model, _load_pretrained_model_wrapper, \ _get_resolved_checkpoint_files_wrapper from .tokenization_utils import apply_chat_template_wrapper from .trainer import training_step -from .generation import * +from ..utils.decorators import dtype_wrapper, patch_dtype_wrapper, patch_wrappers # redirect mindnlp.transformers to transformers -import transformers sys.modules[__name__] = _LazyModule( 'transformers', transformers.__file__, @@ -35,7 +35,6 @@ def empty_fn(*args, **kwargs): transformers.utils.import_utils.is_torch_sdpa_available = not_supported -from ..utils.decorators import dtype_wrapper, patch_dtype_wrapper, patch_wrappers patch_dtype_wrapper(transformers.AutoModel, 'from_pretrained') if version.parse(transformers.__version__) >= version.parse('4.56.0'): @@ -68,9 +67,6 @@ def empty_fn(*args, **kwargs): transformers.generation.utils.create_masks_for_generate = create_masks_for_generate transformers.trainer.Trainer.training_step = training_step -# for ORANGE_PI -if ON_ORANGE_PI: - transformers.generation.logits_process.InfNanRemoveLogitsProcessor.__call__ = InfNanRemoveLogitsProcessor_call # add mindnlp.transformers modules/attrs to lazymodule # setattr(sys.modules[__name__], 'test_ms_model', test_ms_model) diff --git a/mindnlp/transformers/generation/__init__.py b/mindnlp/transformers/generation/__init__.py deleted file mode 100644 index e32cadf18..000000000 --- a/mindnlp/transformers/generation/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from . logits_process import * \ No newline at end of file diff --git a/mindnlp/transformers/generation/logits_process.py b/mindnlp/transformers/generation/logits_process.py deleted file mode 100644 index 19826b287..000000000 --- a/mindnlp/transformers/generation/logits_process.py +++ /dev/null @@ -1,11 +0,0 @@ -import mindtorch - -def InfNanRemoveLogitsProcessor_call(self, input_ids, scores): - # set all +/-inf values to max/min possible value - scores_processed = scores - scores_processed = mindtorch.where(scores == float("inf"), mindtorch.finfo(scores.dtype).max, scores_processed) - scores_processed = mindtorch.where(scores == -float("inf"), mindtorch.finfo(scores.dtype).min, scores_processed) - # set all nan values to 0.0 - scores_processed = mindtorch.where(scores != scores, 0.0, scores_processed) - - return scores_processed diff --git a/mindnlp/transformers/masking_utils.py b/mindnlp/transformers/masking_utils.py index a284dc9c3..85d7bd249 100644 --- a/mindnlp/transformers/masking_utils.py +++ b/mindnlp/transformers/masking_utils.py @@ -207,7 +207,7 @@ def _ignore_causal_mask_sdpa( if ( not is_tracing # only cases when lower and upper diags are the same, see https://github.com/pytorch/pytorch/issues/108108 - and (query_length == 1 or kv_length == query_length) + and query_length in (1, kv_length) # in this case we need to add special patterns to the mask so cannot be skipped otherwise and (local_attention_size is None or kv_length < local_attention_size) # In this case, we need to add padding to the mask, so cannot be skipped otherwise @@ -519,70 +519,11 @@ def flash_attention_mask( return attention_mask -def flex_attention_mask( - batch_size: int, - cache_position: mindtorch.Tensor, - kv_length: int, - kv_offset: int = 0, - mask_function: Callable = causal_mask_function, - attention_mask: Optional[mindtorch.Tensor] = None, - **kwargs, -) -> BlockMask: - """ - Create a 4D block mask
which is a compressed representation of the full 4D block causal mask. BlockMask is essential - for performant computation of flex attention. See: https://pytorch.org/blog/flexattention/ - - Args: - batch_size (`int`): - The batch size of the input sequence. - cache_position (`mindtorch.Tensor`): - A tensor of shape (query_length,) indicating the current indices of the input sequence elements. - kv_length (`int`): - The size that the key and value states will have during the attention computation. - kv_offset (`int`, optional): - An optional offset to indicate at which first position the key and values states will refer to. - mask_function (`Callable`): - The mask factory function describing the mask pattern. - attention_mask (`mindtorch.Tensor`, optional): - The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length) - """ - q_length, q_offset = cache_position.shape[0], cache_position[0] - - # Potentially add the padding 2D mask - if attention_mask is not None: - # Older torch (2.5.x) cannot handle sequences not in multiples of 128 (default block size) - # Hence we pad to multiples of this as a minimum to ensure this - pad_len = ((attention_mask.shape[1] // flex_default_block_size) + 1) * flex_default_block_size - pad_len = pad_len - attention_mask.shape[1] - if not _is_torch_greater_or_equal_than_2_6 and pad_len > 0: - attention_mask = mindtorch.nn.functional.pad(attention_mask, value=0, pad=(0, pad_len)) - - padding_mask = prepare_padding_mask(attention_mask, kv_length, kv_offset, _slice=False) - mask_function = and_masks(mask_function, padding_mask_function(padding_mask)) - - # Add the offsets on top (because flex interface only allows length, not start and end indices) - mask_function = add_offsets_to_mask_function(mask_function, q_offset, kv_offset) - - # Finally create the block mask - block_mask = create_block_mask( - mask_mod=mask_function, - B=batch_size, - H=None, - Q_LEN=q_length, - KV_LEN=kv_length, - device=cache_position.device, - _compile=_is_torch_greater_or_equal_than_2_6, - ) - return block_mask - - - # Global AttentionMaskInterface shared by all models which do not need to overwrite any of the existing ones ALL_MASK_ATTENTION_FUNCTIONS = { "sdpa": sdpa_mask, "eager": eager_mask, "flash_attention_2": flash_attention_mask, - "flex_attention": flex_attention_mask, } def find_packed_sequence_indices(position_ids: mindtorch.Tensor) -> mindtorch.Tensor: @@ -1145,7 +1086,7 @@ def __new__(cls, data, style=None): cls.style = style return mindtorch.Tensor._make_subclass(cls, data, require_grad=False) - def __init__(self, data): + def __init__(self, data): # pylint: disable=super-init-not-called # You can initialize any additional metadata here if needed pass diff --git a/mindnlp/transformers/modeling_utils.py b/mindnlp/transformers/modeling_utils.py index fa10dedc7..9fb31dc4d 100644 --- a/mindnlp/transformers/modeling_utils.py +++ b/mindnlp/transformers/modeling_utils.py @@ -1,37 +1,42 @@ - import types from mindspore.communication import GlobalComm -from mindtorch import nn, ops, distributed as dist + +import mindtorch +from mindtorch import nn, distributed as dist from ..utils import logging logger = logging.get_logger(__name__) def replace_submodule(model, submodule_path, new_module): - parent_path, _, child_name = submodule_path.rpartition('.') + parent_path, _, child_name = submodule_path.rpartition(".") parent_module = model.get_submodule(parent_path) if parent_path else model setattr(parent_module, child_name, new_module)
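+
+# Illustrative sketch (an assumption, not part of this module): how
+# `replace_submodule` is meant to be used. The toy model and dotted path
+# below are hypothetical.
+#
+#     from mindtorch import nn
+#
+#     class Toy(nn.Module):
+#         def __init__(self):
+#             super().__init__()
+#             self.encoder = nn.Sequential(nn.Linear(4, 4), nn.ReLU())
+#
+#     model = Toy()
+#     # swap the first encoder layer for a no-op by its dotted path
+#     replace_submodule(model, "encoder.0", nn.Identity())
+#     assert isinstance(model.encoder[0], nn.Identity)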
+ def send_forward(self, *args, **kwargs): output = self._forward(*args, **kwargs) dist.isend(output[0], self.dist) return output + def receive_forward(self, *args, **kwargs): hidden_states = args[0] dist.recv(hidden_states, src=self.src) output = self._forward(*((hidden_states,) + args[1:]), **kwargs) return output + def broadcast_forward(self, *args, **kwargs): output = self._forward(*args, **kwargs) dist.broadcast(output, src=self.src) dist.barrier() return output + class DecoderLayerIdentity(nn.Module): def __init__(self, layer_idx, config): super().__init__() @@ -40,17 +45,34 @@ def __init__(self, layer_idx, config): self.attention_type = config.layer_types[layer_idx] def forward(self, *args, **kwargs): - past_key_value = kwargs.get('past_key_value', None) + past_key_value = kwargs.get("past_key_value", None) hidden_states = args[0] bs, seq_len, _ = hidden_states.shape if past_key_value is not None: past_key_value.update( - ops.empty(bs, self.num_key_value_heads, seq_len, 0, dtype=hidden_states.dtype, device='meta'), - ops.empty(bs, self.num_key_value_heads, seq_len, 0, dtype=hidden_states.dtype, device='meta'), - self.layer_idx) + mindtorch.empty( + bs, + self.num_key_value_heads, + seq_len, + 0, + dtype=hidden_states.dtype, + device="meta", + ), + mindtorch.empty( + bs, + self.num_key_value_heads, + seq_len, + 0, + dtype=hidden_states.dtype, + device="meta", + ), + self.layer_idx, + ) - return ops.empty(*hidden_states.shape, dtype=hidden_states.dtype, device='meta') + return mindtorch.empty( + *hidden_states.shape, dtype=hidden_states.dtype, device="meta" + ) class EmbeddingIndentity(nn.Module): @@ -61,7 +83,10 @@ def __init__(self, num_embeddings: int, embedding_dim: int, dtype=None): self.dtype = dtype def forward(self, input): - return ops.empty(input.shape + (self.embedding_dim,), dtype=self.dtype, device='meta') + return mindtorch.empty( + input.shape + (self.embedding_dim,), dtype=self.dtype, device="meta" + ) + class LinearIndetity(nn.Module): def __init__(self, in_features, out_features, dtype=None): @@ -71,7 +96,10 @@ def __init__(self, in_features, out_features, dtype=None): self.dtype = dtype def forward(self, input): - return ops.empty(input.shape[:-1] + (self.out_features,), dtype=self.dtype, device='meta') + return mindtorch.empty( + input.shape[:-1] + (self.out_features,), dtype=self.dtype, device="meta" + ) + def construct_pipeline_parallel_model(model, device_map): current_device = dist.get_rank() @@ -87,13 +115,19 @@ def construct_pipeline_parallel_model(model, device_map): if device != current_device: submodule = model.get_submodule(scope_name) if isinstance(submodule, nn.Embedding): - new_embedding = EmbeddingIndentity(submodule.num_embeddings, submodule.embedding_dim, model.dtype) + new_embedding = EmbeddingIndentity( + submodule.num_embeddings, submodule.embedding_dim, model.dtype + ) replace_submodule(model, scope_name, new_embedding) elif isinstance(submodule, nn.Linear): - new_linear = LinearIndetity(submodule.in_features, submodule.out_features, model.dtype) + new_linear = LinearIndetity( + submodule.in_features, submodule.out_features, model.dtype + ) replace_submodule(model, scope_name, new_linear) elif submodule.__class__.__name__ in no_split_modules: - new_layer = DecoderLayerIdentity(submodule.self_attn.layer_idx, submodule.self_attn.config) + new_layer = DecoderLayerIdentity( + submodule.self_attn.layer_idx, submodule.self_attn.config + ) replace_submodule(model, scope_name, new_layer) else: # new_layer = nn.Identity() @@ -101,15 +135,21 @@ def 
construct_pipeline_parallel_model(model, device_map): pass if current_device < last_device: - current_last_layer = model.get_submodule(reversed_device_map[current_device][-1]) + current_last_layer = model.get_submodule( + reversed_device_map[current_device][-1] + ) current_last_layer._forward = current_last_layer.forward current_last_layer.forward = types.MethodType(send_forward, current_last_layer) current_last_layer.dist = current_device + 1 if current_device > 0: - current_first_layer = model.get_submodule(reversed_device_map[current_device][0]) + current_first_layer = model.get_submodule( + reversed_device_map[current_device][0] + ) current_first_layer._forward = current_first_layer.forward - current_first_layer.forward = types.MethodType(receive_forward, current_first_layer) + current_first_layer.forward = types.MethodType( + receive_forward, current_first_layer + ) current_first_layer.src = current_device - 1 model_last_layer = model.get_submodule(next(reversed(device_map))) @@ -119,17 +159,18 @@ def construct_pipeline_parallel_model(model, device_map): return model + def find_usefull_files(shared_files, shared_meta, model_params): - files_path = '/'.join(shared_files[0].split('/')[:-1]) + files_path = "/".join(shared_files[0].split("/")[:-1]) usefull_files = set() - for param_name, file_name in shared_meta['weight_map'].items(): + for param_name, file_name in shared_meta["weight_map"].items(): if param_name in model_params: usefull_files.add(file_name) # else: # shared_meta['all_checkpoint_keys'].remove(param_name) - usefull_files = [files_path + '/' + file for file in usefull_files] + usefull_files = [files_path + "/" + file for file in usefull_files] return usefull_files, shared_meta @@ -141,31 +182,31 @@ def wrapper( state_dict, checkpoint_files, pretrained_model_name_or_path, - ignore_mismatched_sizes, - sharded_metadata, - device_map, - disk_offload_folder, - offload_state_dict = None, - dtype = None, - hf_quantizer = None, - keep_in_fp32_regex = None, - device_mesh = None, - key_mapping = None, - weights_only = True, + **kwargs, ): + device_map = kwargs.pop("device_map", None) + sharded_metadata = kwargs.pop("sharded_metadata", None) + # if device_map is not None and not initialize distribute module, raise Error. if device_map is not None: - if all([isinstance(d, int) for d in device_map.values()]) and len(set(device_map.values())) > 1: + if ( + all([isinstance(d, int) for d in device_map.values()]) + and len(set(device_map.values())) > 1 + ): if not GlobalComm.INITED: - raise RuntimeError(f'to use transformers with multi-gpu/npu, please use `msrun/mpirun` ' \ - f'with {len(set(device_map.values()))} devices to launch multiprocess.') + raise RuntimeError( + f"to use transformers with multi-gpu/npu, please use `msrun/mpirun` " + f"with {len(set(device_map.values()))} devices to launch multiprocess." 
+ ) model = construct_pipeline_parallel_model(model, device_map) - checkpoint_files, sharded_metadata = find_usefull_files(checkpoint_files, sharded_metadata, model.state_dict().keys()) + checkpoint_files, sharded_metadata = find_usefull_files( + checkpoint_files, sharded_metadata, model.state_dict().keys() + ) rank = dist.get_rank() world_size = dist.get_world_size() - + dist.barrier() for target_rank in range(world_size): @@ -177,43 +218,24 @@ def wrapper( state_dict, checkpoint_files, pretrained_model_name_or_path, - ignore_mismatched_sizes, - sharded_metadata, - device_map, - disk_offload_folder, - offload_state_dict, - dtype, - hf_quantizer, - keep_in_fp32_regex, - device_mesh, - key_mapping, - weights_only, + **kwargs, ) dist.barrier() return model return fn( - cls, - model, - state_dict, - checkpoint_files, - pretrained_model_name_or_path, - ignore_mismatched_sizes, - sharded_metadata, - device_map, - disk_offload_folder, - offload_state_dict, - dtype, - hf_quantizer, - keep_in_fp32_regex, - device_mesh, - key_mapping, - weights_only, - ) + cls, + model, + state_dict, + checkpoint_files, + pretrained_model_name_or_path, + **kwargs, + ) return wrapper + def _get_resolved_checkpoint_files_wrapper(fn): def wrapper(*args, **kwargs): if GlobalComm.INITED and dist.get_world_size() > 1: @@ -237,4 +259,4 @@ def wrapper(*args, **kwargs): else: return fn(*args, **kwargs) - return wrapper \ No newline at end of file + return wrapper diff --git a/mindnlp/transformers/ms_utils.py b/mindnlp/transformers/ms_utils.py deleted file mode 100644 index 7c03a1213..000000000 --- a/mindnlp/transformers/ms_utils.py +++ /dev/null @@ -1,258 +0,0 @@ -# Copyright 2022 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""MindNLP MindSpore Utils""" - -import inspect -from typing import Union, Optional, List, Tuple - -import mindspore -from mindspore.common.initializer import initializer, Normal - -from mindtorch import nn, ops -from mindtorch.nn import Parameter - -ALL_LAYERNORM_LAYERS = [nn.LayerNorm] - -class Conv1D(nn.Module): - """ - 1D-convolutional layer. Basically works like a linear layer but the weights are transposed. - - Args: - n_out (`int`): The number of output features. - n_in (`int`): The number of input features. - """ - def __init__(self, n_out, n_in): - """ - Initialize the Conv1D class with the specified number of output channels and input channels. - - Args: - self (object): The instance of the Conv1D class. - n_out (int): The number of output channels for the convolution operation. - n_in (int): The number of input channels for the convolution operation. - - Returns: - None. - - Raises: - None. - """ - super().__init__() - self.n_out = n_out - self.weight = nn.Parameter(ops.empty(n_in, n_out)) - self.bias = nn.Parameter(ops.zeros(n_out)) - - def forward(self, x): - """ - Constructs the 1D convolutional operation on the input tensor x. - - Args: - self (Conv1D): An instance of the Conv1D class.
- x (mindspore.Tensor): The input tensor on which the convolution operation is applied. - Should have a shape of (batch_size, sequence_length, input_channels). - - Returns: - mindspore.Tensor: The output tensor of shape (batch_size, sequence_length, n_out). - - Raises: - ValueError: If the shape of the input tensor x is not as expected for a 1D convolution operation. - RuntimeError: If there are any runtime issues during the convolution operation. - """ - size_out = x.shape[:-1] + (self.n_out,) - x = ops.matmul(x.view(-1, x.shape[-1]), self.weight) + self.bias - x = x.view(size_out) - return x - - -def prune_conv1d_layer(layer, index, dim=1): - """ - Prune a Conv1D layer to keep only entries in index. A Conv1D works as a Linear layer (see e.g. BERT) but the weights - are transposed. - - Used to remove heads. - - Args: - layer ([`~mindspore_utils.Conv1D`]): The layer to prune. - index (`mindspore.Tensor[int64]`): The indices to keep in the layer. - dim (`int`, *optional*, defaults to 1): The dimension on which to keep the indices. - - Returns: - [`~mindspore_utils.Conv1D`]: The pruned layer as a new layer with `requires_grad=True`. - """ - gama_l = layer.weight.index_select(dim, index) - if dim == 0: - beta_l = layer.bias - else: - beta_l = layer.bias[index] - new_size = list(layer.weight.shape) - new_size[dim] = len(index) - new_layer = Conv1D(new_size[1], new_size[0]) - new_layer.weight.requires_grad = False - new_layer.weight.assign_value(gama_l) - new_layer.weight.requires_grad = True - new_layer.bias.requires_grad = False - new_layer.bias.assign_value(beta_l) - new_layer.bias.requires_grad = True - return new_layer - - -def find_pruneable_heads_and_indices(heads, n_heads, head_size, already_pruned_heads): - """ - Finds the heads and their indices taking `already_pruned_heads` into account. - - Args: - heads (`List[int]`): List of the indices of heads to prune. - n_heads (`int`): The number of heads in the model. - head_size (`int`): The size of each head. - already_pruned_heads (`Set[int]`): A set of already pruned heads. - - Returns: - `Tuple[Set[int], mindspore.Tensor[int64]]`: A tuple with the remaining heads and their corresponding indices. - """ - mask = ops.ones((n_heads, head_size)) - heads = set(heads) - already_pruned_heads # Convert to set and remove already pruned heads - for head in heads: - # Compute how many pruned heads are before the head and move the index accordingly - head = head - sum(1 if h < head else 0 for h in already_pruned_heads) - mask[head] = 0 - mask = mask.view(-1).eq(1) - index = ops.arange(len(mask), dtype=mindspore.int64)[mask] - return heads, index - -def prune_linear_layer(layer, index, dim=0): - """ - Prune a linear layer to keep only entries in index. - Used to remove heads. - - Args: - layer (`mindspore.nn.Linear`): The layer to prune. - index (`mindspore.Tensor[int64]`): The indices to keep in the layer. - dim (`int`, *optional*, defaults to 0): The dimension on which to keep the indices. - - Returns: - `mindspore.nn.Linear`: The pruned layer as a new layer with `requires_grad=True`.
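-
-    Example:
-        A hedged sketch (shapes illustrative): keep the first two output units
-        of a 4 -> 3 linear layer.
-
-        >>> import mindspore
-        >>> from mindtorch import nn, ops
-        >>> layer = nn.Linear(4, 3)
-        >>> index = ops.arange(2, dtype=mindspore.int64)
-        >>> pruned = prune_linear_layer(layer, index, dim=0)
-        >>> tuple(pruned.weight.shape)
-        (2, 4)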
- """ - W = layer.weight.index_select(dim, index).copy() - if layer.bias is not None: - if dim == 1: - b = layer.bias.copy() - else: - b = layer.bias[index].copy() - new_size = list(layer.weight.shape) - new_size[dim] = len(index) - new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None) - new_layer.weight.requires_grad = False - new_layer.weight.assign_value(W) - new_layer.weight.requires_grad = True - if layer.bias is not None: - new_layer.bias.requires_grad = False - new_layer.bias.assign_value(b) - new_layer.bias.requires_grad = True - return new_layer - - -def apply_chunking_to_forward(forward_fn, chunk_size, chunk_axis, *input_tensors): - """ - This function chunks the `input_tensors` into smaller input tensor parts of size `chunk_size` over the dimension - `chunk_axis`. It then applies a layer `forward_fn` to each chunk independently to save memory. - If the `forward_fn` is independent across the `chunk_dim` this function will yield the same result as directly - applying `forward_fn` to `input_tensors`. - - Args: - forward_fn (`Callable[..., mindspore.Tensor]`): - The forward function of the model. - chunk_size (`int`): - The chunk size of a chunked tensor: `num_chunks = len(input_tensors[0]) / chunk_size`. - chunk_axis (`int`): - The dimension over which the `input_tensors` should be chunked. - input_tensors (`Tuple[mindspore.Tensor]`): - The input tensors of `forward_fn` which will be chunked - - Returns: - `mindspore.Tensor`: A tensor with the same shape as the `forward_fn` would have given if applied`. - """ - assert len(input_tensors) > 0, f"{input_tensors} has to be a tuple/list of tensors" - - # inspect.signature exist since python 3.5 and is a python method -> no problem with backward compatibility - num_args_in_forward_chunk_fn = len(inspect.signature(forward_fn).parameters) - if num_args_in_forward_chunk_fn != len(input_tensors): - raise ValueError( - f"forward_chunk_fn expects {num_args_in_forward_chunk_fn} arguments, but only {len(input_tensors)} input " - "tensors are given" - ) - - if chunk_size > 0: - tensor_shape = input_tensors[0].shape[chunk_axis] - for input_tensor in input_tensors: - if input_tensor.shape[chunk_axis] != tensor_shape: - raise ValueError( - f"All input tenors have to be of the same shape: {tensor_shape}, " - f"found shape {input_tensor.shape[chunk_axis]}" - ) - - if input_tensors[0].shape[chunk_axis] % chunk_size != 0: - raise ValueError( - f"The dimension to be chunked {input_tensors[0].shape[chunk_axis]} has to be a multiple of the chunk " - f"size {chunk_size}" - ) - - num_chunks = input_tensors[0].shape[chunk_axis] // chunk_size - - # chunk input tensor into tuples - input_tensors_chunks = tuple(ops.chunk(input_tensor, num_chunks, dim=chunk_axis) for input_tensor in input_tensors) - # apply forward fn to every tuple - output_chunks = tuple(forward_fn(*input_tensors_chunk) for input_tensors_chunk in zip(*input_tensors_chunks)) - # concatenate output at same dimension - return ops.cat(output_chunks, dim=chunk_axis) - - return forward_fn(*input_tensors) - -def zero_init(cls, *args, **kwargs): - """init zeros to speed up initialize stage.""" - for k in kwargs.keys(): - if 'init' in k: - kwargs.pop(k) - init_signature = inspect.signature(cls.__init__) - init_params = init_signature.parameters - for param_name in init_params.keys(): - if 'init' in param_name: - kwargs[param_name] = 'zeros' - def _reset_parameters(self): - pass - cls.reset_parameters = _reset_parameters - return cls(*args, **kwargs) - -def meshgrid( - *tensors: 
Union[mindspore.Tensor, List[mindspore.Tensor]], indexing: Optional[str] = None -) -> Tuple[mindspore.Tensor, ...]: - """ - Wrapper around torch.meshgrid to avoid warning messages about the introduced `indexing` argument. - - Reference: https://pytorch.org/docs/1.13/generated/torch.meshgrid.html - """ - return ops.meshgrid(*tensors, indexing=indexing) - -def isin_friendly(elements: mindspore.Tensor, test_elements: mindspore.Tensor) -> mindspore.Tensor: - """ - Same as `ops.isin` without flags, but MPS-friendly. - - Args: - elements (`mindspore.Tensor`): Input elements - test_elements (`mindspore.Tensor`): The elements to check against. - - Returns: - `mindspore.Tensor`: A boolean tensor of the same shape as `elements` that is True for `elements` in `test_elements` - and False otherwise - """ - return elements.tile((test_elements.shape[0], 1)).eq(test_elements.unsqueeze(1)).sum(0).bool().squeeze() \ No newline at end of file diff --git a/mindnlp/transformers/trainer.py b/mindnlp/transformers/trainer.py index 50a4c5b34..5757838cb 100644 --- a/mindnlp/transformers/trainer.py +++ b/mindnlp/transformers/trainer.py @@ -1,10 +1,11 @@ from typing import Union, Any, Optional -import mindtorch -from mindtorch import nn, autograd from transformers.training_args import OptimizerNames from accelerate.utils import DistributedType +import mindtorch +from mindtorch import nn, autograd + def training_step( self, model: nn.Module, diff --git a/mindnlp/utils/__init__.py b/mindnlp/utils/__init__.py index fac1eefd4..4d81afe36 100644 --- a/mindnlp/utils/__init__.py +++ b/mindnlp/utils/__init__.py @@ -17,15 +17,17 @@ Common utils """ from .generic import * + # from .decompress import unzip, untar, ungz # from .download import * # from .compatibility import * # from .chat_template_utils import * from .import_utils import * + # from .testing_utils import require_mindspore # from .save import convert_file_size_to_int # from .peft_utils import find_adapter_config_file DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]] -SENTENCEPIECE_UNDERLINE = "▁" \ No newline at end of file +SENTENCEPIECE_UNDERLINE = "▁" diff --git a/mindnlp/utils/decorators.py b/mindnlp/utils/decorators.py index e5df75a97..56d1ca207 100644 --- a/mindnlp/utils/decorators.py +++ b/mindnlp/utils/decorators.py @@ -1,27 +1,30 @@ import warnings -import functools import mindspore from mindtorch.configs import ON_A1 + def dtype_wrapper(fn): def wrapper(*args, **kwargs): - ms_dtype = kwargs.pop('ms_dtype', None) - ms_dtype = kwargs.pop('mindspore_dtype', ms_dtype) + ms_dtype = kwargs.pop("ms_dtype", None) + ms_dtype = kwargs.pop("mindspore_dtype", ms_dtype) if ON_A1 and ms_dtype == mindspore.bfloat16: - warnings.warn('910A do not support bfloat16, use float16 for `ms_dtype`.') + warnings.warn("910A does not support bfloat16, using float16 for `ms_dtype`.") ms_dtype = mindspore.float16 if ms_dtype is not None: - kwargs['torch_dtype'] = ms_dtype + kwargs["torch_dtype"] = ms_dtype return fn(*args, **kwargs) + return wrapper + def patch_dtype_wrapper(cls, method_name, other_decorators=None): patch_wrappers(cls, method_name, [dtype_wrapper]) + def patch_wrappers(cls, method_name, other_decorators=None): original_method = getattr(cls, method_name) wrapped_func = original_method.__func__ - + if other_decorators is not None: for dec in other_decorators: wrapped_func = dec(wrapped_func) diff --git a/mindnlp/utils/generic.py b/mindnlp/utils/generic.py index 13f8bb953..5edfbf252 100644 --- 
a/mindnlp/utils/generic.py +++ b/mindnlp/utils/generic.py @@ -36,16 +36,17 @@ def is_tensor(x): return isinstance(x, np.ndarray) + def _is_mindspore(x): """ Checks if the input x is a MindSpore tensor. - + Args: x (object): The input object to be checked. - + Returns: bool: True if `x` is a MindSpore tensor, False otherwise. - + Raises: None: This function does not raise any exceptions. """ @@ -56,7 +57,8 @@ def is_mindspore_tensor(x): """ Tests if `x` is a MindSpore tensor or not. """ - return False if not is_mindspore_available() else _is_mindspore(x) + return _is_mindspore(x) + def set_attribute_for_modules(module, key: str, value: Any): """ @@ -65,7 +67,8 @@ def set_attribute_for_modules(module, key: str, value: Any): setattr(module, key, value) for submodule in module.children(): set_attribute_for_modules(submodule, key, value) - + + def can_return_tuple(func): """ Decorator to wrap model method, to call output.to_tuple() if return_dict=False passed as a kwarg or @@ -78,7 +81,9 @@ def can_return_tuple(func): @wraps(func) def wrapper(self, *args, **kwargs): is_requested_to_return_tuple = kwargs.pop("return_dict", True) is False - is_configured_to_return_tuple = self.config.use_return_dict is False if hasattr(self, "config") else False + is_configured_to_return_tuple = ( + self.config.use_return_dict is False if hasattr(self, "config") else False + ) # The following allows to convert output to tuple ONLY on top level forward call, # while internal modules of the model will return Output objects @@ -92,32 +97,38 @@ def wrapper(self, *args, **kwargs): try: output = func(self, *args, **kwargs) - if is_requested_to_return_tuple or (is_configured_to_return_tuple and is_top_level_module): + if is_requested_to_return_tuple or ( + is_configured_to_return_tuple and is_top_level_module + ): output = output.to_tuple() finally: # Remove the flag after the model forward call is finished. - if is_configured_to_return_tuple and is_top_level_module: - del_attribute_from_modules(self, "_is_top_level_module") + # if is_configured_to_return_tuple and is_top_level_module: + # del_attribute_from_modules(self, "_is_top_level_module") + pass return output return wrapper + + class ExplicitEnum(str, Enum): """ Enum with more explicit error message for missing values. """ + @classmethod def _missing_(cls, value): """ This method `_missing_` in the class `ExplicitEnum` is a class method used to handle missing values in the ExplicitEnum class. - + Args: cls (class): The class itself, used for referring to the class instance inside the method. value (any): The value that was not found in the ExplicitEnum class. - + Returns: None: This method does not return any value as it raises an exception when called. - + Raises: ValueError: If the value provided is not a valid member of the Enum class, a ValueError is raised with a message listing the valid options to choose from. """ @@ -125,23 +136,28 @@ def _missing_(cls, value): f"{value} is not a valid {cls.__name__}, please select one of {list(cls._value2member_map_.keys())}" ) + class TensorType(ExplicitEnum): """ Possible values for the `return_tensors` argument in [`PreTrainedTokenizerBase.__call__`]. Useful for tab-completion in an IDE. """ + MINDSPORE = "ms" NUMPY = "np" + class PaddingStrategy(ExplicitEnum): """ Possible values for the `padding` argument in [`PreTrainedTokenizerBase.__call__`]. Useful for tab-completion in an IDE.
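
    Example (illustrative of the `ExplicitEnum` base class): members can be
    looked up by value, and an invalid value raises a ValueError listing the
    valid options.

    >>> PaddingStrategy("max_length") is PaddingStrategy.MAX_LENGTH
    True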
""" + LONGEST = "longest" MAX_LENGTH = "max_length" DO_NOT_PAD = "do_not_pad" + class LossKwargs(TypedDict, total=False): """ Keyword arguments to be passed to the loss function @@ -154,6 +170,7 @@ class LossKwargs(TypedDict, total=False): num_items_in_batch: Optional[int] + class ModelOutput(OrderedDict): """ Base class for all model outputs as dataclass. Has a `__getitem__` that allows indexing by integer or slice (like a @@ -170,15 +187,15 @@ class ModelOutput(OrderedDict): def __post_init__(self): """Perform post-initialization actions for the ModelOutput class. - + This method is automatically called after the initialization of a ModelOutput object. - + Args: self: An instance of the ModelOutput class. - + Returns: None - + Raises: ValueError: If the ModelOutput object has no fields or more than one required field. ValueError: If a key/value pair in the first field is not a tuple or if it does not follow the format (key, value). @@ -190,10 +207,14 @@ def __post_init__(self): if len(class_fields) == 0: raise ValueError(f"{self.__class__.__name__} has no fields.") if not all(field.default is None for field in class_fields[1:]): - raise ValueError(f"{self.__class__.__name__} should not have more than one required field.") + raise ValueError( + f"{self.__class__.__name__} should not have more than one required field." + ) first_field = getattr(self, class_fields[0].name) - other_fields_are_none = all(getattr(self, field.name) is None for field in class_fields[1:]) + other_fields_are_none = all( + getattr(self, field.name) is None for field in class_fields[1:] + ) if other_fields_are_none and not is_tensor(first_field): if isinstance(first_field, dict): @@ -211,9 +232,9 @@ def __post_init__(self): if first_field_iterator: for idx, element in enumerate(iterator): if ( - not isinstance(element, (list, tuple)) - or not len(element) == 2 - or not isinstance(element[0], str) + not isinstance(element, (list, tuple)) + or not len(element) == 2 + or not isinstance(element[0], str) ): if idx == 0: # If we do not have an iterator of key/values, set it as attribute @@ -238,90 +259,98 @@ def __post_init__(self): def __delitem__(self, *args, **kwargs): """ __delitem__ - + Deletes an item from the ModelOutput instance. - + Args: self (ModelOutput): The ModelOutput instance from which the item will be deleted. - + Returns: None. This method does not return a value. - + Raises: RuntimeError: If the '__delitem__' method is attempted to be used on a ModelOutput instance, a RuntimeError is raised with a message indicating that this method cannot be used on the instance. """ - raise RuntimeError(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.") + raise RuntimeError( + f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance." + ) def setdefault(self, *args, **kwargs): """ - Sets a default value in the ModelOutput instance. - - Args: - self: The ModelOutput instance itself. - - Returns: - None. This method does not return any value. - - Raises: - RuntimeError: This exception is raised if the method 'setdefault' is called on a ModelOutput instance. The message in the exception states that the 'setdefault' method cannot be used on a -ModelOutput instance. - - Note: - The 'setdefault' method is not supported for ModelOutput instances as it can only be used on dictionary objects. + Sets a default value in the ModelOutput instance. + + Args: + self: The ModelOutput instance itself. + + Returns: + None. This method does not return any value. 
+ + Raises: + RuntimeError: This exception is raised if the method 'setdefault' is called on a ModelOutput instance. The message in the exception states that the 'setdefault' method cannot be used on a + ModelOutput instance. + + Note: + The 'setdefault' method is not supported for ModelOutput instances as it can only be used on dictionary objects. """ - raise RuntimeError(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.") + raise RuntimeError( + f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance." + ) def pop(self, *args, **kwargs): """ Method that raises a RuntimeError to prevent the use of 'pop' on a ModelOutput instance. - + Args: - self (object): The ModelOutput instance on which 'pop' is being called. + self (object): The ModelOutput instance on which 'pop' is being called. This parameter is required and represents the current instance of the class. - + Returns: None. This method does not return any value. - + Raises: RuntimeError: Raised when attempting to use 'pop' method on a ModelOutput instance. The exception message specifies that 'pop' cannot be used on a ModelOutput instance to prevent unintended behavior. """ - raise RuntimeError(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.") + raise RuntimeError( + f"You cannot use ``pop`` on a {self.__class__.__name__} instance." + ) def update(self, *args, **kwargs): """ Updates the current instance of the ModelOutput class. - + Args: self (ModelOutput): The instance of the ModelOutput class. - + Returns: None: This method does not return any value. - + Raises: RuntimeError: If the method is called on an instance of the ModelOutput class. This is to prevent using the 'update' method on a ModelOutput instance, as it is not allowed. - + Note: The 'update' method is not allowed to be used on a ModelOutput instance. If called, it will raise a RuntimeError. """ - raise RuntimeError(f"You cannot use ``update`` on a {self.__class__.__name__} instance.") + raise RuntimeError( + f"You cannot use ``update`` on a {self.__class__.__name__} instance." + ) def __getitem__(self, k): """ - This method allows accessing the elements of the ModelOutput object using the square bracket notation. - - Args: - self (ModelOutput): The instance of the ModelOutput class. - k (str or int): The key or index for accessing the element. If k is a string, it is used as a key to retrieve the corresponding value. If k is an integer, it is used as an index to retrieve the -element. - - Returns: - None: This method does not return any value directly. The retrieved value is returned based on the input key or index. - - Raises: - TypeError: If the input parameter k is not a string or an integer. - KeyError: If the input key k is not found in the internal dictionary when k is a string. - IndexError: If the input index k is out of range when k is an integer. + This method allows accessing the elements of the ModelOutput object using the square bracket notation. + + Args: + self (ModelOutput): The instance of the ModelOutput class. + k (str or int): The key or index for accessing the element. If k is a string, it is used as a key to retrieve the corresponding value. If k is an integer, it is used as an index to retrieve the + element. + + Returns: + None: This method does not return any value directly. The retrieved value is returned based on the input key or index. + + Raises: + TypeError: If the input parameter k is not a string or an integer. 
+ KeyError: If the input key k is not found in the internal dictionary when k is a string. + IndexError: If the input index k is out of range when k is an integer. """ if isinstance(k, str): inner_dict = dict(self.items()) @@ -330,19 +359,19 @@ def __getitem__(self, k): def __setattr__(self, name, value): """ - Method __setattr__ in the class ModelOutput sets the value for the specified attribute name. - - Args: - self (object): The instance of the ModelOutput class. - name (str): The name of the attribute to be set. - value (any): The value to be assigned to the attribute. It can be of any type. - - Returns: - None. This method does not return any value. - - Raises: - No specific exceptions are raised by this method. However, if the attribute name is not in the keys of the object, it will be added as a new attribute. If the value is None, the attribute will be -set to None. + Method __setattr__ in the class ModelOutput sets the value for the specified attribute name. + + Args: + self (object): The instance of the ModelOutput class. + name (str): The name of the attribute to be set. + value (any): The value to be assigned to the attribute. It can be of any type. + + Returns: + None. This method does not return any value. + + Raises: + No specific exceptions are raised by this method. However, if the attribute name is not in the keys of the object, it will be added as a new attribute. If the value is None, the attribute will be + set to None. """ if name in self.keys() and value is not None: # Don't call self.__setitem__ to avoid recursion errors @@ -352,15 +381,15 @@ def __setattr__(self, name, value): def __setitem__(self, key, value): """ This method '__setitem__' in the class 'ModelOutput' allows setting key-value pairs in the model output object. - + Args: self (ModelOutput): The instance of the ModelOutput class. key (Any): The key to be set in the model output object. value (Any): The value corresponding to the key to be set in the model output object. - + Returns: None. This method does not return any value explicitly. - + Raises: This method may raise the following exceptions: - TypeError: If the key is not of a valid type. @@ -378,6 +407,7 @@ def to_tuple(self) -> Tuple[Any]: """ return tuple(v for _, v in self.items()) + # vendored from distutils.util def strtobool(val): """Convert a string representation of truth to true (1) or false (0). @@ -392,6 +422,7 @@ def strtobool(val): return 0 raise ValueError(f"invalid truth value {val!r}") + class cached_property(property): """ Descriptor that mimics @property but caches output in member variable. @@ -400,18 +431,19 @@ class cached_property(property): Built-in in functools from Python 3.8. """ + def __get__(self, obj, objtype=None): - """ + """ Method '__get__' in the class 'cached_property'. - + Args: self (object): The current instance of the class. obj (object): The object on which the method is being called. objtype (object): The type of the object, if available. Defaults to None. - + Returns: None: The method returns a value of type None. - + Raises: AttributeError: If the attribute is unreadable, this exception is raised. """ @@ -427,16 +459,17 @@ def __get__(self, obj, objtype=None): setattr(obj, attr, cached) return cached + def _is_numpy(x): """ This function checks if the input is a NumPy array. - + Args: x (any): The input to be checked for being a NumPy array. - + Returns: None: This function does not return a value. 
- + Raises: None """ @@ -449,6 +482,7 @@ def is_numpy_array(x): """ return _is_numpy(x) + def infer_framework_from_repr(x): """ Tries to guess the framework of an object `x` from its repr (brittle but will help in `is_tensor` to try the @@ -460,6 +494,7 @@ def infer_framework_from_repr(x): if representation.startswith("<class ' [...] diff --git a/mindnlp/utils/import_utils.py b/mindnlp/utils/import_utils.py [...] -def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[tuple[bool, str], bool]: +def _is_package_available( + pkg_name: str, return_version: bool = False +) -> Union[tuple[bool, str], bool]: # Check if the package spec exists and grab its version to avoid importing a local directory package_exists = importlib.util.find_spec(pkg_name) is not None package_version = "N/A" @@ -118,7 +120,9 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ AUTOROUND_MIN_VERSION = "0.5.0" TRITON_MIN_VERSION = "1.0.0" -_accelerate_available, _accelerate_version = _is_package_available("accelerate", return_version=True) +_accelerate_available, _accelerate_version = _is_package_available( + "accelerate", return_version=True +) _apex_available = _is_package_available("apex") _apollo_torch_available = _is_package_available("apollo_torch") _aqlm_available = _is_package_available("aqlm") @@ -133,7 +137,9 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ _galore_torch_available = _is_package_available("galore_torch") _lomo_available = _is_package_available("lomo_optim") _grokadamw_available = _is_package_available("grokadamw") -_schedulefree_available, _schedulefree_version = _is_package_available("schedulefree", return_version=True) +_schedulefree_available, _schedulefree_version = _is_package_available( + "schedulefree", return_version=True +) _torch_optimi_available = importlib.util.find_spec("optimi") is not None # `importlib.metadata.version` doesn't work with `bs4` but `beautifulsoup4`. For `importlib.util.find_spec`, reversed.
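All of the feature flags in this file are built on the same probe: `importlib.util.find_spec` confirms a module of that name is importable (without actually importing it), and `importlib.metadata.version` then distinguishes a real installed distribution from a same-named local directory while fetching its version. A minimal standalone sketch of that pattern, using only the standard library (`package_available` is a hypothetical name, not mindnlp's exact helper):

import importlib.metadata
import importlib.util


def package_available(pkg_name: str, return_version: bool = False):
    # find_spec avoids importing the package; the metadata lookup then
    # rejects stray local directories and yields the installed version.
    exists = importlib.util.find_spec(pkg_name) is not None
    pkg_version = "N/A"
    if exists:
        try:
            pkg_version = importlib.metadata.version(pkg_name)
        except importlib.metadata.PackageNotFoundError:
            exists = False
    return (exists, pkg_version) if return_version else exists


# Usage mirroring the module-level flags:
_pandas_available, _pandas_version = package_available("pandas", return_version=True)

As the comment above notes, the two lookups do not always agree on the name (`bs4` vs `beautifulsoup4`), which is why a few flags below fall back to a bare `find_spec` check.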
_bs4_available = importlib.util.find_spec("bs4") is not None @@ -161,7 +167,9 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ _ftfy_available = _is_package_available("ftfy") _g2p_en_available = _is_package_available("g2p_en") _hadamard_available = _is_package_available("fast_hadamard_transform") -_ipex_available, _ipex_version = _is_package_available("intel_extension_for_pytorch", return_version=True) +_ipex_available, _ipex_version = _is_package_available( + "intel_extension_for_pytorch", return_version=True +) _jieba_available = _is_package_available("jieba") _jinja_available = _is_package_available("jinja2") _kenlm_available = _is_package_available("kenlm") @@ -175,11 +183,15 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ _optimum_available = _is_package_available("optimum") _auto_gptq_available = _is_package_available("auto_gptq") _gptqmodel_available = _is_package_available("gptqmodel") -_auto_round_available, _auto_round_version = _is_package_available("auto_round", return_version=True) +_auto_round_available, _auto_round_version = _is_package_available( + "auto_round", return_version=True +) # `importlib.metadata.version` doesn't work with `awq` _auto_awq_available = importlib.util.find_spec("awq") is not None _quark_available = _is_package_available("quark") -_fp_quant_available, _fp_quant_version = _is_package_available("fp_quant", return_version=True) +_fp_quant_available, _fp_quant_version = _is_package_available( + "fp_quant", return_version=True +) _qutlass_available = _is_package_available("qutlass") _is_optimum_quanto_available = False try: @@ -188,7 +200,9 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ except importlib.metadata.PackageNotFoundError: _is_optimum_quanto_available = False # For compressed_tensors, only check spec to allow compressed_tensors-nightly package -_compressed_tensors_available = importlib.util.find_spec("compressed_tensors") is not None +_compressed_tensors_available = ( + importlib.util.find_spec("compressed_tensors") is not None +) _pandas_available = _is_package_available("pandas") _peft_available = _is_package_available("peft") _phonemizer_available = _is_package_available("phonemizer") @@ -216,16 +230,22 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ _smdistributed_available = importlib.util.find_spec("smdistributed") is not None _soundfile_available = _is_package_available("soundfile") _spacy_available = _is_package_available("spacy") -_sudachipy_available, _sudachipy_version = _is_package_available("sudachipy", return_version=True) +_sudachipy_available, _sudachipy_version = _is_package_available( + "sudachipy", return_version=True +) _tensorflow_probability_available = _is_package_available("tensorflow_probability") _tensorflow_text_available = _is_package_available("tensorflow_text") _tf2onnx_available = _is_package_available("tf2onnx") _timm_available = _is_package_available("timm") _tokenizers_available = _is_package_available("tokenizers") _torchaudio_available = _is_package_available("torchaudio") -_torchao_available, _torchao_version = _is_package_available("torchao", return_version=True) +_torchao_available, _torchao_version = _is_package_available( + "torchao", return_version=True +) _torchdistx_available = _is_package_available("torchdistx") -_torchvision_available, _torchvision_version = _is_package_available("torchvision", return_version=True) +_torchvision_available, 
_torchvision_version = _is_package_available( + "torchvision", return_version=True +) _mlx_available = _is_package_available("mlx") _num2words_available = _is_package_available("num2words") _hqq_available, _hqq_version = _is_package_available("hqq", return_version=True) @@ -237,17 +257,23 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ _kernels_available = _is_package_available("kernels") _matplotlib_available = _is_package_available("matplotlib") _mistral_common_available = _is_package_available("mistral_common") -_triton_available, _triton_version = _is_package_available("triton", return_version=True) +_triton_available, _triton_version = _is_package_available( + "triton", return_version=True +) _triton_kernels_available = _is_package_available("triton_kernels") _torch_version = "N/A" _torch_available = False if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES: - _torch_available, _torch_version = _is_package_available("torch", return_version=True) + _torch_available, _torch_version = _is_package_available( + "torch", return_version=True + ) if _torch_available: _torch_available = version.parse(_torch_version) >= version.parse("2.1.0") if not _torch_available: - logger.warning(f"Disabling PyTorch because PyTorch >= 2.1 is required but found {_torch_version}") + logger.warning( + f"Disabling PyTorch because PyTorch >= 2.1 is required but found {_torch_version}" + ) else: logger.info("Disabling PyTorch because USE_TF is set") _torch_available = False @@ -258,7 +284,10 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ if FORCE_TF_AVAILABLE in ENV_VARS_TRUE_VALUES: _tf_available = True else: - if USE_TF in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TORCH not in ENV_VARS_TRUE_VALUES: + if ( + USE_TF in ENV_VARS_TRUE_AND_AUTO_VALUES + and USE_TORCH not in ENV_VARS_TRUE_VALUES + ): # Note: _is_package_available("tensorflow") fails for tensorflow-cpu. Please test any changes to the line below # with tensorflow-cpu to make sure it still works! _tf_available = importlib.util.find_spec("tensorflow") is not None @@ -354,7 +383,9 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ if _flax_available: _jax_available, _jax_version = _is_package_available("jax", return_version=True) if _jax_available: - logger.info(f"JAX version {_jax_version}, Flax version {_flax_version} available.") + logger.info( + f"JAX version {_jax_version}, Flax version {_flax_version} available." 
+ ) else: _flax_available = _jax_available = False _jax_version = _flax_version = "N/A" @@ -362,7 +393,9 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ _torch_xla_available = False if USE_TORCH_XLA in ENV_VARS_TRUE_VALUES: - _torch_xla_available, _torch_xla_version = _is_package_available("torch_xla", return_version=True) + _torch_xla_available, _torch_xla_version = _is_package_available( + "torch_xla", return_version=True + ) if _torch_xla_available: logger.info(f"Torch XLA version {_torch_xla_version} available.") @@ -392,7 +425,9 @@ def is_libcst_available(): def is_accelerate_available(min_version: str = ACCELERATE_MIN_VERSION): - return _accelerate_available and version.parse(_accelerate_version) >= version.parse(min_version) + return _accelerate_available and version.parse( + _accelerate_version + ) >= version.parse(min_version) def is_torch_accelerator_available(): @@ -420,7 +455,9 @@ def is_torch_deterministic(): def is_triton_available(min_version: str = TRITON_MIN_VERSION): - return _triton_available and version.parse(_triton_version) >= version.parse(min_version) + return _triton_available and version.parse(_triton_version) >= version.parse( + min_version + ) def is_triton_kernels_availalble(): @@ -456,12 +493,6 @@ def is_torch_sdpa_available(): elif _torch_version == "N/A": return False - # NOTE: MLU is OK with non-contiguous inputs. - if is_torch_mlu_available(): - return True - # NOTE: NPU can use SDPA in Transformers with torch>=2.1.0. - if is_torch_npu_available(): - return True # NOTE: We require torch>=2.1.1 to avoid a numerical issue in SDPA with non-contiguous inputs: https://github.com/pytorch/pytorch/issues/112577 return version.parse(_torch_version) >= version.parse("2.1.1") @@ -510,7 +541,9 @@ def is_grokadamw_available(): def is_schedulefree_available(min_version: str = SCHEDULEFREE_MIN_VERSION): - return _schedulefree_available and version.parse(_schedulefree_version) >= version.parse(min_version) + return _schedulefree_available and version.parse( + _schedulefree_version + ) >= version.parse(min_version) def is_pyctcdecode_available(): @@ -583,21 +616,6 @@ def is_mamba_ssm_available(): return False -def is_mamba_2_ssm_available(): - if is_torch_available(): - import torch - - if not torch.cuda.is_available(): - return False - else: - if _is_package_available("mamba_ssm"): - import mamba_ssm - - if version.parse(mamba_ssm.__version__) >= version.parse("2.0.4"): - return True - return False - - def is_causal_conv1d_available(): if is_torch_available(): import torch @@ -625,7 +643,9 @@ def is_torch_mps_available(min_version: Optional[str] = None): import torch if hasattr(torch.backends, "mps"): - backend_available = torch.backends.mps.is_available() and torch.backends.mps.is_built() + backend_available = ( + torch.backends.mps.is_available() and torch.backends.mps.is_built() + ) if min_version is not None: flag = version.parse(_torch_version) >= version.parse(min_version) backend_available = backend_available and flag @@ -641,12 +661,6 @@ def is_torch_bf16_gpu_available() -> bool: if torch.cuda.is_available(): return torch.cuda.is_bf16_supported() - if is_torch_xpu_available(): - return torch.xpu.is_bf16_supported() - if is_torch_hpu_available(): - return True - if is_torch_npu_available(): - return torch.npu.is_bf16_supported() return False @@ -670,12 +684,6 @@ def is_torch_fp16_available_on_device(device): if not is_torch_available(): return False - if is_torch_hpu_available(): - if is_habana_gaudi1(): - return False - else: - 
return True - import torch try: @@ -685,8 +693,12 @@ def is_torch_fp16_available_on_device(device): # At this moment, let's be strict of the check: check if `LayerNorm` is also supported on device, because many # models use this layer. batch, sentence_length, embedding_dim = 3, 4, 5 - embedding = torch.randn(batch, sentence_length, embedding_dim, dtype=torch.float16, device=device) - layer_norm = torch.nn.LayerNorm(embedding_dim, dtype=torch.float16, device=device) + embedding = torch.randn( + batch, sentence_length, embedding_dim, dtype=torch.float16, device=device + ) + layer_norm = torch.nn.LayerNorm( + embedding_dim, dtype=torch.float16, device=device + ) _ = layer_norm(embedding) except: # noqa: E722 @@ -768,7 +780,10 @@ def is_flax_available(): def is_flute_available(): try: - return importlib.util.find_spec("flute") is not None and importlib.metadata.version("flute-kernel") >= "0.4.1" + return ( + importlib.util.find_spec("flute") is not None + and importlib.metadata.version("flute-kernel") >= "0.4.1" + ) except importlib.metadata.PackageNotFoundError: return False @@ -781,209 +796,6 @@ def is_g2p_en_available(): return _g2p_en_available -@lru_cache -def is_torch_xla_available(check_is_tpu=False, check_is_gpu=False): - """ - Check if `torch_xla` is available. To train a native pytorch job in an environment with torch xla installed, set - the USE_TORCH_XLA to false. - """ - assert not (check_is_tpu and check_is_gpu), "The check_is_tpu and check_is_gpu cannot both be true." - - if not _torch_xla_available: - return False - - import torch_xla - - if check_is_gpu: - return torch_xla.runtime.device_type() in ["GPU", "CUDA"] - elif check_is_tpu: - return torch_xla.runtime.device_type() == "TPU" - - return True - - -@lru_cache -def is_torch_neuroncore_available(check_device=True): - if importlib.util.find_spec("torch_neuronx") is not None: - return is_torch_xla_available() - return False - - -@lru_cache -def is_torch_npu_available(check_device=False): - "Checks if `torch_npu` is installed and potentially if a NPU is in the environment" - if not _torch_available or importlib.util.find_spec("torch_npu") is None: - return False - - import torch - import torch_npu # noqa: F401 - - if check_device: - try: - # Will raise a RuntimeError if no NPU is found - _ = torch.npu.device_count() - return torch.npu.is_available() - except RuntimeError: - return False - return hasattr(torch, "npu") and torch.npu.is_available() - - -@lru_cache -def is_torch_mlu_available(check_device=False): - """ - Checks if `mlu` is available via an `cndev-based` check which won't trigger the drivers and leave mlu - uninitialized. 
- """ - if not _torch_available or importlib.util.find_spec("torch_mlu") is None: - return False - - import torch - import torch_mlu # noqa: F401 - - pytorch_cndev_based_mlu_check_previous_value = os.environ.get("PYTORCH_CNDEV_BASED_MLU_CHECK") - try: - os.environ["PYTORCH_CNDEV_BASED_MLU_CHECK"] = str(1) - available = torch.mlu.is_available() - finally: - if pytorch_cndev_based_mlu_check_previous_value: - os.environ["PYTORCH_CNDEV_BASED_MLU_CHECK"] = pytorch_cndev_based_mlu_check_previous_value - else: - os.environ.pop("PYTORCH_CNDEV_BASED_MLU_CHECK", None) - - return available - - -@lru_cache -def is_torch_musa_available(check_device=False): - "Checks if `torch_musa` is installed and potentially if a MUSA is in the environment" - if not _torch_available or importlib.util.find_spec("torch_musa") is None: - return False - - import torch - import torch_musa # noqa: F401 - - torch_musa_min_version = "0.33.0" - if _accelerate_available and version.parse(_accelerate_version) < version.parse(torch_musa_min_version): - return False - - if check_device: - try: - # Will raise a RuntimeError if no MUSA is found - _ = torch.musa.device_count() - return torch.musa.is_available() - except RuntimeError: - return False - return hasattr(torch, "musa") and torch.musa.is_available() - - -@lru_cache -def is_torch_hpu_available(): - "Checks if `torch.hpu` is available and potentially if a HPU is in the environment" - if ( - not _torch_available - or importlib.util.find_spec("habana_frameworks") is None - or importlib.util.find_spec("habana_frameworks.torch") is None - ): - return False - - torch_hpu_min_accelerate_version = "1.5.0" - if _accelerate_available and version.parse(_accelerate_version) < version.parse(torch_hpu_min_accelerate_version): - return False - - import torch - - if os.environ.get("PT_HPU_LAZY_MODE", "1") == "1": - # import habana_frameworks.torch in case of lazy mode to patch torch with torch.hpu - import habana_frameworks.torch # noqa: F401 - - if not hasattr(torch, "hpu") or not torch.hpu.is_available(): - return False - - # We patch torch.gather for int64 tensors to avoid a bug on Gaudi - # Graph compile failed with synStatus 26 [Generic failure] - # This can be removed once bug is fixed but for now we need it. 
- original_gather = torch.gather - - def patched_gather(input: torch.Tensor, dim: int, index: torch.LongTensor) -> torch.Tensor: - if input.dtype == torch.int64 and input.device.type == "hpu": - return original_gather(input.to(torch.int32), dim, index).to(torch.int64) - else: - return original_gather(input, dim, index) - - torch.gather = patched_gather - torch.Tensor.gather = patched_gather - - original_take_along_dim = torch.take_along_dim - - def patched_take_along_dim( - input: torch.Tensor, indices: torch.LongTensor, dim: Optional[int] = None - ) -> torch.Tensor: - if input.dtype == torch.int64 and input.device.type == "hpu": - return original_take_along_dim(input.to(torch.int32), indices, dim).to(torch.int64) - else: - return original_take_along_dim(input, indices, dim) - - torch.take_along_dim = patched_take_along_dim - - original_cholesky = torch.linalg.cholesky - - def safe_cholesky(A, *args, **kwargs): - output = original_cholesky(A, *args, **kwargs) - - if torch.isnan(output).any(): - jitter_value = 1e-9 - diag_jitter = torch.eye(A.size(-1), dtype=A.dtype, device=A.device) * jitter_value - output = original_cholesky(A + diag_jitter, *args, **kwargs) - - return output - - torch.linalg.cholesky = safe_cholesky - - original_scatter = torch.scatter - - def patched_scatter( - input: torch.Tensor, dim: int, index: torch.Tensor, src: torch.Tensor, *args, **kwargs - ) -> torch.Tensor: - if input.device.type == "hpu" and input is src: - return original_scatter(input, dim, index, src.clone(), *args, **kwargs) - else: - return original_scatter(input, dim, index, src, *args, **kwargs) - - torch.scatter = patched_scatter - torch.Tensor.scatter = patched_scatter - - # IlyasMoutawwakil: we patch torch.compile to use the HPU backend by default - # https://github.com/huggingface/transformers/pull/38790#discussion_r2157043944 - # This is necessary for cases where torch.compile is used as a decorator (defaulting to inductor) - # https://github.com/huggingface/transformers/blob/af6120b3eb2470b994c21421bb6eaa76576128b0/src/transformers/models/modernbert/modeling_modernbert.py#L204 - original_compile = torch.compile - - def hpu_backend_compile(*args, **kwargs): - if kwargs.get("backend") not in ["hpu_backend", "eager"]: - logger.warning( - f"Calling torch.compile with backend={kwargs.get('backend')} on a Gaudi device is not supported. " - "We will override the backend with 'hpu_backend' to avoid errors." - ) - kwargs["backend"] = "hpu_backend" - - return original_compile(*args, **kwargs) - - torch.compile = hpu_backend_compile - - return True - - -@lru_cache -def is_habana_gaudi1(): - if not is_torch_hpu_available(): - return False - - import habana_frameworks.torch.utils.experimental as htexp # noqa: F401 - - # Check if the device is Gaudi1 (vs Gaudi2, Gaudi3) - return htexp._get_device_type() == htexp.synDeviceType.synDeviceGaudi - - def is_torchdynamo_available(): return is_torch_available() @@ -1067,7 +879,9 @@ def is_aqlm_available(): def is_vptq_available(min_version: str = VPTQ_MIN_VERSION): - return _vptq_available and version.parse(_vptq_version) >= version.parse(min_version) + return _vptq_available and version.parse(_vptq_version) >= version.parse( + min_version + ) def is_av_available(): @@ -1095,55 +909,6 @@ def is_ninja_available(): return True -def is_ipex_available(min_version: str = ""): - def get_major_and_minor_from_version(full_version): - return str(version.parse(full_version).major) + "." 
+ str(version.parse(full_version).minor) - - if not is_torch_available() or not _ipex_available: - return False - - torch_major_and_minor = get_major_and_minor_from_version(_torch_version) - ipex_major_and_minor = get_major_and_minor_from_version(_ipex_version) - if torch_major_and_minor != ipex_major_and_minor: - logger.warning( - f"Intel Extension for PyTorch {ipex_major_and_minor} needs to work with PyTorch {ipex_major_and_minor}.*," - f" but PyTorch {_torch_version} is found. Please switch to the matching version and run again." - ) - return False - if min_version: - return version.parse(_ipex_version) >= version.parse(min_version) - return True - - -@lru_cache -def is_torch_xpu_available(check_device=False): - """ - Checks if XPU acceleration is available either via native PyTorch (>=2.6), - `intel_extension_for_pytorch` or via stock PyTorch (>=2.4) and potentially - if a XPU is in the environment. - """ - if not is_torch_available(): - return False - - torch_version = version.parse(_torch_version) - if torch_version.major == 2 and torch_version.minor < 6: - if is_ipex_available(): - import intel_extension_for_pytorch # noqa: F401 - elif torch_version.major == 2 and torch_version.minor < 4: - return False - - import torch - - if check_device: - try: - # Will raise a RuntimeError if no XPU is found - _ = torch.xpu.device_count() - return torch.xpu.is_available() - except RuntimeError: - return False - return hasattr(torch, "xpu") and torch.xpu.is_available() - - @lru_cache def is_bitsandbytes_available(check_library_only=False) -> bool: if not _bitsandbytes_available: @@ -1159,81 +924,15 @@ def is_bitsandbytes_available(check_library_only=False) -> bool: # `bitsandbytes` versions older than 0.43.1 eagerly require CUDA at import time, # so those versions of the library are practically only available when CUDA is too. - if version.parse(importlib.metadata.version("bitsandbytes")) < version.parse("0.43.1"): + if version.parse(importlib.metadata.version("bitsandbytes")) < version.parse( + "0.43.1" + ): return torch.cuda.is_available() # Newer versions of `bitsandbytes` can be imported on systems without CUDA. 
return True -def is_bitsandbytes_multi_backend_available() -> bool: - if not is_bitsandbytes_available(): - return False - - import bitsandbytes as bnb - - return "multi_backend" in getattr(bnb, "features", set()) - - -def is_flash_attn_2_available(): - if not is_torch_available(): - return False - - if not _is_package_available("flash_attn"): - return False - - # Let's add an extra check to see if cuda is available - import torch - - if not (torch.cuda.is_available() or is_torch_mlu_available()): - return False - - if torch.version.cuda: - return version.parse(importlib.metadata.version("flash_attn")) >= version.parse("2.1.0") - elif torch.version.hip: - # TODO: Bump the requirement to 2.1.0 once released in https://github.com/ROCmSoftwarePlatform/flash-attention - return version.parse(importlib.metadata.version("flash_attn")) >= version.parse("2.0.4") - elif is_torch_mlu_available(): - return version.parse(importlib.metadata.version("flash_attn")) >= version.parse("2.3.3") - else: - return False - - -@lru_cache -def is_flash_attn_3_available(): - if not is_torch_available(): - return False - - if not _is_package_available("flash_attn_3"): - return False - - import torch - - if not torch.cuda.is_available(): - return False - - # TODO: Check for a minimum version when FA3 is stable - # return version.parse(importlib.metadata.version("flash_attn_3")) >= version.parse("3.0.0") - - return True - - -@lru_cache -def is_flash_attn_greater_or_equal_2_10(): - if not _is_package_available("flash_attn"): - return False - - return version.parse(importlib.metadata.version("flash_attn")) >= version.parse("2.1.0") - - -@lru_cache -def is_flash_attn_greater_or_equal(library_version: str): - if not _is_package_available("flash_attn"): - return False - - return version.parse(importlib.metadata.version("flash_attn")) >= version.parse(library_version) - - @lru_cache def is_torch_greater_or_equal(library_version: str, accept_dev: bool = False): """ @@ -1245,11 +944,13 @@ def is_torch_greater_or_equal(library_version: str, accept_dev: bool = False): return False if accept_dev: - return version.parse(version.parse(importlib.metadata.version("torch")).base_version) >= version.parse( + return version.parse( + version.parse(importlib.metadata.version("torch")).base_version + ) >= version.parse(library_version) + else: + return version.parse(importlib.metadata.version("torch")) >= version.parse( library_version ) - else: - return version.parse(importlib.metadata.version("torch")) >= version.parse(library_version) @lru_cache @@ -1263,11 +964,13 @@ def is_torch_less_or_equal(library_version: str, accept_dev: bool = False): return False if accept_dev: - return version.parse(version.parse(importlib.metadata.version("torch")).base_version) <= version.parse( + return version.parse( + version.parse(importlib.metadata.version("torch")).base_version + ) <= version.parse(library_version) + else: + return version.parse(importlib.metadata.version("torch")) <= version.parse( library_version ) - else: - return version.parse(importlib.metadata.version("torch")) <= version.parse(library_version) @lru_cache @@ -1280,7 +983,9 @@ def is_huggingface_hub_greater_or_equal(library_version: str, accept_dev: bool = version.parse(importlib.metadata.version("huggingface_hub")).base_version ) >= version.parse(library_version) else: - return version.parse(importlib.metadata.version("huggingface_hub")) >= version.parse(library_version) + return version.parse( + importlib.metadata.version("huggingface_hub") + ) >= version.parse(library_version) 
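The `is_*_greater_or_equal` / `is_*_less_or_equal` gates above share one subtlety: with `accept_dev=True` they compare against `packaging`'s `base_version`, which strips `.devN`/`rcN`/post suffixes, so a nightly such as `2.6.0.dev20250101` still passes a `>= 2.6.0` check that a plain parse would fail (PEP 440 orders dev releases before the release itself). A minimal sketch of that gate (hypothetical helper name; it assumes torch is installed, whereas the real helpers consult their availability flag first):

from importlib import metadata

from packaging import version


def torch_at_least(minimum: str, accept_dev: bool = False) -> bool:
    v = version.parse(metadata.version("torch"))
    if accept_dev:
        # base_version drops dev/rc/post suffixes: "2.6.0.dev20250101" -> "2.6.0"
        v = version.parse(v.base_version)
    return v >= version.parse(minimum)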
def is_torchdistx_available(): @@ -1308,7 +1013,9 @@ def is_seqio_available(): def is_gguf_available(min_version: str = GGUF_MIN_VERSION): - return _is_gguf_available and version.parse(_gguf_version) >= version.parse(min_version) + return _is_gguf_available and version.parse(_gguf_version) >= version.parse( + min_version + ) def is_protobuf_available(): @@ -1318,7 +1025,9 @@ def is_protobuf_available(): def is_fsdp_available(min_version: str = FSDP_MIN_VERSION): - return is_torch_available() and version.parse(_torch_version) >= version.parse(min_version) + return is_torch_available() and version.parse(_torch_version) >= version.parse( + min_version + ) def is_optimum_available(): @@ -1330,7 +1039,9 @@ def is_auto_awq_available(): def is_auto_round_available(min_version: str = AUTOROUND_MIN_VERSION): - return _auto_round_available and version.parse(_auto_round_version) >= version.parse(min_version) + return _auto_round_available and version.parse( + _auto_round_version + ) >= version.parse(min_version) def is_optimum_quanto_available(): @@ -1343,7 +1054,9 @@ def is_quark_available(): def is_fp_quant_available(): - return _fp_quant_available and version.parse(_fp_quant_version) >= version.parse("0.1.6") + return _fp_quant_available and version.parse(_fp_quant_version) >= version.parse( + "0.1.6" + ) def is_qutlass_available(): @@ -1431,7 +1144,10 @@ def is_in_notebook(): if "IPKernelApp" not in get_ipython().config: raise ImportError("console") # Removed the lines to include VSCode - if "DATABRICKS_RUNTIME_VERSION" in os.environ and os.environ["DATABRICKS_RUNTIME_VERSION"] < "11.0": + if ( + "DATABRICKS_RUNTIME_VERSION" in os.environ + and os.environ["DATABRICKS_RUNTIME_VERSION"] < "11.0" + ): # Databricks Runtime 11.0 and above uses IPython kernel by default so it should be compatible with Jupyter notebook # https://docs.microsoft.com/en-us/azure/databricks/notebooks/ipython-kernel raise ImportError("databricks") @@ -1459,7 +1175,9 @@ def is_sagemaker_dp_enabled(): try: # Parse it and check the field "sagemaker_distributed_dataparallel_enabled". 
sagemaker_params = json.loads(sagemaker_params) - if not sagemaker_params.get("sagemaker_distributed_dataparallel_enabled", False): + if not sagemaker_params.get( + "sagemaker_distributed_dataparallel_enabled", False + ): return False except json.JSONDecodeError: return False @@ -1516,7 +1234,9 @@ def is_torchaudio_available(): def is_torchao_available(min_version: str = TORCHAO_MIN_VERSION): - return _torchao_available and version.parse(_torchao_version) >= version.parse(min_version) + return _torchao_available and version.parse(_torchao_version) >= version.parse( + min_version + ) def is_speech_available(): @@ -1571,7 +1291,9 @@ def is_sudachi_projection_available(): def is_jumanpp_available(): - return (importlib.util.find_spec("rhoknp") is not None) and (shutil.which("jumanpp") is not None) + return (importlib.util.find_spec("rhoknp") is not None) and ( + shutil.which("jumanpp") is not None + ) def is_cython_available(): @@ -1602,7 +1324,9 @@ def is_liger_kernel_available(): if not _liger_kernel_available: return False - return version.parse(importlib.metadata.version("liger_kernel")) >= version.parse("0.3.0") + return version.parse(importlib.metadata.version("liger_kernel")) >= version.parse( + "0.3.0" + ) def is_rich_available(): @@ -2055,13 +1779,22 @@ def check_torch_load_is_safe(): ("pyctcdecode", (is_pyctcdecode_available, PYCTCDECODE_IMPORT_ERROR)), ("pytesseract", (is_pytesseract_available, PYTESSERACT_IMPORT_ERROR)), ("sacremoses", (is_sacremoses_available, SACREMOSES_IMPORT_ERROR)), - ("pytorch_quantization", (is_pytorch_quantization_available, PYTORCH_QUANTIZATION_IMPORT_ERROR)), + ( + "pytorch_quantization", + (is_pytorch_quantization_available, PYTORCH_QUANTIZATION_IMPORT_ERROR), + ), ("sentencepiece", (is_sentencepiece_available, SENTENCEPIECE_IMPORT_ERROR)), ("sklearn", (is_sklearn_available, SKLEARN_IMPORT_ERROR)), ("speech", (is_speech_available, SPEECH_IMPORT_ERROR)), - ("tensorflow_probability", (is_tensorflow_probability_available, TENSORFLOW_PROBABILITY_IMPORT_ERROR)), + ( + "tensorflow_probability", + (is_tensorflow_probability_available, TENSORFLOW_PROBABILITY_IMPORT_ERROR), + ), ("tf", (is_tf_available, TENSORFLOW_IMPORT_ERROR)), - ("tensorflow_text", (is_tensorflow_text_available, TENSORFLOW_TEXT_IMPORT_ERROR)), + ( + "tensorflow_text", + (is_tensorflow_text_available, TENSORFLOW_TEXT_IMPORT_ERROR), + ), ("timm", (is_timm_available, TIMM_IMPORT_ERROR)), ("torchaudio", (is_torchaudio_available, TORCHAUDIO_IMPORT_ERROR)), ("natten", (is_natten_available, NATTEN_IMPORT_ERROR)), @@ -2097,11 +1830,21 @@ def requires_backends(obj, backends): name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__ # Raise an error for users who might not realize that classes without "TF" are torch-only - if "torch" in backends and "tf" not in backends and not is_torch_available() and is_tf_available(): + if ( + "torch" in backends + and "tf" not in backends + and not is_torch_available() + and is_tf_available() + ): raise ImportError(PYTORCH_IMPORT_ERROR_WITH_TF.format(name)) # Raise the inverse error for PyTorch users trying to load TF classes - if "tf" in backends and "torch" not in backends and is_torch_available() and not is_tf_available(): + if ( + "tf" in backends + and "torch" not in backends + and is_torch_available() + and not is_tf_available() + ): raise ImportError(TF_IMPORT_ERROR_WITH_PYTORCH.format(name)) failed = [] @@ -2127,7 +1870,12 @@ class DummyObject(type): is_dummy = True def __getattribute__(cls, key): - if (key.startswith("_") and key != 
"_from_config") or key == "is_dummy" or key == "mro" or key == "call": + if ( + (key.startswith("_") and key != "_from_config") + or key == "is_dummy" + or key == "mro" + or key == "call" + ): return super().__getattribute__(key) requires_backends(cls, cls._backends) @@ -2163,7 +1911,9 @@ def __init__( super().__init__(name) self._object_missing_backend = {} - self._explicit_import_shortcut = explicit_import_shortcut if explicit_import_shortcut else {} + self._explicit_import_shortcut = ( + explicit_import_shortcut if explicit_import_shortcut else {} + ) if any(isinstance(key, frozenset) for key in import_structure): self._modules = set() @@ -2191,7 +1941,12 @@ def __init__( # } module_keys = set( - chain(*[[k.rsplit(".", i)[0] for i in range(k.count(".") + 1)] for k in list(module.keys())]) + chain( + *[ + [k.rsplit(".", i)[0] for i in range(k.count(".") + 1)] + for k in list(module.keys()) + ] + ) ) for backend in backends: @@ -2209,7 +1964,11 @@ def __init__( try: if not callable(): missing_backends.append(backend) - except (importlib.metadata.PackageNotFoundError, ModuleNotFoundError, RuntimeError): + except ( + importlib.metadata.PackageNotFoundError, + ModuleNotFoundError, + RuntimeError, + ): missing_backends.append(backend) self._modules = self._modules.union(module_keys) @@ -2242,7 +2001,9 @@ def __init__( for value in values: self._class_to_module[value] = key # Needed for autocompletion in an IDE - self.__all__ = list(import_structure.keys()) + list(chain(*import_structure.values())) + self.__all__ = list(import_structure.keys()) + list( + chain(*import_structure.values()) + ) self.__file__ = module_file self.__spec__ = module_spec self.__path__ = [os.path.dirname(module_file)] @@ -2341,7 +2102,9 @@ def direct_transformers_import(path: str, file="__init__.py") -> ModuleType: """ name = "transformers" location = os.path.join(path, file) - spec = importlib.util.spec_from_file_location(name, location, submodule_search_locations=[path]) + spec = importlib.util.spec_from_file_location( + name, location, submodule_search_locations=[path] + ) module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) module = sys.modules[name] @@ -2383,7 +2146,9 @@ def split_package_version(package_version_str) -> tuple[str, str, str]: class Backend: def __init__(self, backend_requirement: str): - self.package_name, self.version_comparison, self.version = split_package_version(backend_requirement) + self.package_name, self.version_comparison, self.version = ( + split_package_version(backend_requirement) + ) if self.package_name not in BACKENDS_MAPPING: raise ValueError( @@ -2392,7 +2157,8 @@ def __init__(self, backend_requirement: str): def is_satisfied(self) -> bool: return VersionComparison.from_string(self.version_comparison)( - version.parse(importlib.metadata.version(self.package_name)), version.parse(self.version) + version.parse(importlib.metadata.version(self.package_name)), + version.parse(self.version), ) def __repr__(self) -> str: @@ -2425,7 +2191,9 @@ def requires(*, backends=()): if any(key in backend for key in ["=", "<", ">"]): applied_backends.append(Backend(backend)) else: - raise ValueError(f"Backend should be defined in the BACKENDS_MAPPING. Offending backend: {backend}") + raise ValueError( + f"Backend should be defined in the BACKENDS_MAPPING. 
Offending backend: {backend}" + ) def inner_fn(fun): fun.__backends = applied_backends @@ -2439,7 +2207,8 @@ def inner_fn(fun): lambda e: "modeling_flax_" in e: ("flax",), lambda e: "modeling_" in e: ("torch",), lambda e: e.startswith("tokenization_") and e.endswith("_fast"): ("tokenizers",), - lambda e: e.startswith("image_processing_") and e.endswith("_fast"): ("vision", "torch", "torchvision"), + lambda e: e.startswith("image_processing_") + and e.endswith("_fast"): ("vision", "torch", "torchvision"), lambda e: e.startswith("image_processing_"): ("vision",), } @@ -2472,7 +2241,9 @@ def fetch__all__(file_content): # __all__ is defined on a single line if lines[0].endswith("]"): - return [obj.strip("\"' ") for obj in lines[0].split("=")[1].strip(" []").split(",")] + return [ + obj.strip("\"' ") for obj in lines[0].split("=")[1].strip(" []").split(",") + ] # __all__ is defined on multiple lines else: @@ -2551,7 +2322,9 @@ def create_import_structure_from_path(module_path): for f in os.listdir(module_path): if f != "__pycache__" and os.path.isdir(os.path.join(module_path, f)): - import_structure[f] = create_import_structure_from_path(os.path.join(module_path, f)) + import_structure[f] = create_import_structure_from_path( + os.path.join(module_path, f) + ) elif not os.path.isdir(os.path.join(directory, f)): adjacent_modules.append(f) @@ -2566,8 +2339,12 @@ def create_import_structure_from_path(module_path): def find_substring(substring, list_): return any(substring in x for x in list_) - if find_substring("modular_", adjacent_modules) and find_substring("modeling_", adjacent_modules): - adjacent_modules = [module for module in adjacent_modules if "modular_" not in module] + if find_substring("modular_", adjacent_modules) and find_substring( + "modeling_", adjacent_modules + ): + adjacent_modules = [ + module for module in adjacent_modules if "modular_" not in module + ] module_requirements = {} for module_name in adjacent_modules: @@ -2601,7 +2378,9 @@ def find_substring(substring, list_): for index, line in enumerate(lines): # This allows exporting items with other decorators. We'll take a look # at the line that follows at the same indentation level. 
- if line.startswith((" ", "\t", "@", ")")) and not line.startswith("@requires"): + if line.startswith((" ", "\t", "@", ")")) and not line.startswith( + "@requires" + ): continue # Skipping line enables putting whatever we want between the @@ -2614,8 +2393,20 @@ def find_substring(substring, list_): # Backends are defined on the same line as export if "backends" in previous_line: - backends_string = previous_line.split("backends=")[1].split("(")[1].split(")")[0] - backends = tuple(sorted([b.strip("'\",") for b in backends_string.split(", ") if b])) + backends_string = ( + previous_line.split("backends=")[1] + .split("(")[1] + .split(")")[0] + ) + backends = tuple( + sorted( + [ + b.strip("'\",") + for b in backends_string.split(", ") + if b + ] + ) + ) # Backends are defined in the lines following export, for example such as: # @export( @@ -2640,7 +2431,10 @@ def find_substring(substring, list_): backend_line = backend_line.split("=")[1] if '"' in backend_line or "'" in backend_line: if ", " in backend_line: - backends.extend(backend.strip("()\"', ") for backend in backend_line.split(", ")) + backends.extend( + backend.strip("()\"', ") + for backend in backend_line.split(", ") + ) else: backends.append(backend_line.strip("()\"', ")) @@ -2815,7 +2609,9 @@ def flatten_dict(_dict, previous_key=None): @lru_cache -def define_import_structure(module_path: str, prefix: Optional[str] = None) -> IMPORT_STRUCTURE_T: +def define_import_structure( + module_path: str, prefix: Optional[str] = None +) -> IMPORT_STRUCTURE_T: """ This method takes a module_path as input and creates an import structure digestible by a _LazyModule. @@ -2844,7 +2640,10 @@ def define_import_structure(module_path: str, prefix: Optional[str] = None) -> I if prefix is None: return spread_dict else: - spread_dict = {k: {f"{prefix}.{kk}": vv for kk, vv in v.items()} for k, v in spread_dict.items()} + spread_dict = { + k: {f"{prefix}.{kk}": vv for kk, vv in v.items()} + for k, v in spread_dict.items() + } return spread_dict @@ -2855,7 +2654,9 @@ def clear_import_cache(): This is useful when actively developing/modifying Transformers code. """ # Get all transformers modules - transformers_modules = [mod_name for mod_name in sys.modules if mod_name.startswith("transformers.")] + transformers_modules = [ + mod_name for mod_name in sys.modules if mod_name.startswith("transformers.") + ] # Remove them from sys.modules for mod_name in transformers_modules: diff --git a/mindnlp/utils/logging.py b/mindnlp/utils/logging.py index 5cd14e8b2..912912acb 100644 --- a/mindnlp/utils/logging.py +++ b/mindnlp/utils/logging.py @@ -15,7 +15,7 @@ # limitations under the License. # ============================================================================ # pylint: disable=unused-import -""" Logging utilities.""" +"""Logging utilities.""" import functools import logging @@ -74,21 +74,21 @@ def _get_default_logging_level(): def _get_library_name() -> str: """ Returns the name of the library based on the module name. - + Returns: str: The name of the library extracted from the module name. - + """ - return __name__.split(".")[0] # pylint: disable=use-maxsplit-arg + return __name__.split(".")[0] # pylint: disable=use-maxsplit-arg def _get_library_root_logger() -> logging.Logger: """ Retrieves the root logger for the library. - + Returns: A logging.Logger object representing the root logger for the library. - + Raises: None. 
""" @@ -98,10 +98,10 @@ def _get_library_root_logger() -> logging.Logger: def _configure_library_root_logger() -> None: """ This function configures the root logger for the library. - + Returns: None: This function does not return any value. - + Raises: None """ @@ -124,7 +124,9 @@ def _configure_library_root_logger() -> None: library_root_logger.setLevel(_get_default_logging_level()) # if logging level is debug, we add pathname and lineno to formatter for easy debugging if os.getenv("TRANSFORMERS_VERBOSITY", None) == "detail": - formatter = logging.Formatter("[%(levelname)s|%(pathname)s:%(lineno)s] %(asctime)s >> %(message)s") + formatter = logging.Formatter( + "[%(levelname)s|%(pathname)s:%(lineno)s] %(asctime)s >> %(message)s" + ) _default_handler.setFormatter(formatter) library_root_logger.propagate = False @@ -133,13 +135,13 @@ def _configure_library_root_logger() -> None: def _reset_library_root_logger() -> None: """ Resets the root logger of the library to its default state. - + Args: None - + Returns: None. The function does not return any value. - + Raises: None """ @@ -158,7 +160,7 @@ def _reset_library_root_logger() -> None: def get_log_levels_dict(): """ Returns a dictionary of log levels. - + Returns: dict: A dictionary containing log levels and their corresponding values. """ @@ -321,7 +323,9 @@ def enable_explicit_format() -> None: handlers = _get_library_root_logger().handlers for handler in handlers: - formatter = logging.Formatter("[%(levelname)s|%(filename)s:%(lineno)s] %(asctime)s >> %(message)s") + formatter = logging.Formatter( + "[%(levelname)s|%(filename)s:%(lineno)s] %(asctime)s >> %(message)s" + ) handler.setFormatter(formatter) @@ -342,7 +346,7 @@ def warning_advice(self, *args, **kwargs): This method is identical to `logger.warning()`, but if env var TRANSFORMERS_NO_ADVISORY_WARNINGS=1 is set, this warning will not be printed """ - no_advisory_warnings = os.getenv("NO_ADVISORY_WARNINGS", False) # pylint: disable=invalid-envvar-default + no_advisory_warnings = os.getenv("NO_ADVISORY_WARNINGS", False) # pylint: disable=invalid-envvar-default if no_advisory_warnings: return self.warning(*args, **kwargs) @@ -368,16 +372,17 @@ def warning_once(self, *args, **kwargs): class EmptyTqdm: """Dummy tqdm which doesn't do anything.""" + def __init__(self, *args, **kwargs): """ Initializes an instance of the EmptyTqdm class. - + Args: self: The instance of the EmptyTqdm class. - + Returns: None. This method does not return any value. - + Raises: None. """ @@ -386,13 +391,13 @@ def __init__(self, *args, **kwargs): def __iter__(self): """ This method implements the iterator protocol for the EmptyTqdm class. - + Args: self: EmptyTqdm object. The instance of the EmptyTqdm class for which the iterator is being created. - + Returns: None. This method returns an iterator object that iterates over the _iterator attribute of the EmptyTqdm instance. - + Raises: No specific exceptions are raised by this method. """ @@ -400,22 +405,24 @@ def __iter__(self): def __getattr__(self, _): """Return empty function.""" + def empty_fn(*args, **kwargs): return + return empty_fn def __enter__(self): """ __enter__ - + Args: self: EmptyTqdm The self parameter refers to the current instance of the EmptyTqdm class. - + Returns: None This method returns None. - + Raises: No exceptions are raised by this method. """ @@ -424,7 +431,7 @@ def __enter__(self): def __exit__(self, type_, value, traceback): """ __exit__ method in the EmptyTqdm class. 
- + Args: self: EmptyTqdm object The instance of the EmptyTqdm class. @@ -434,11 +441,11 @@ def __exit__(self, type_, value, traceback): The exception that was raised. It represents the actual exception object. traceback: traceback The traceback object. It represents the traceback information associated with the exception. - + Returns: None This method does not return any value. - + Raises: This method does not raise any exceptions explicitly. """ @@ -446,24 +453,24 @@ def __exit__(self, type_, value, traceback): class _tqdm_cls: - """_tqdm_cls is a Python class that provides functionality for managing the progress of tasks. It includes methods for calling the class, setting a lock, and getting a lock. This class is designed to work -in conjunction with the tqdm_lib module for displaying progress bars during iterative processes. When _tqdm_active is True, the class uses methods from the tqdm_lib.tqdm module to handle progress tracking. -Otherwise, it falls back to using an EmptyTqdm instance for progress tracking. The set_lock method allows users to specify a lock for thread safety, and the get_lock method retrieves the current lock if one -has been set.""" + in conjunction with the tqdm_lib module for displaying progress bars during iterative processes. When _tqdm_active is True, the class uses methods from the tqdm_lib.tqdm module to handle progress tracking. + Otherwise, it falls back to using an EmptyTqdm instance for progress tracking. The set_lock method allows users to specify a lock for thread safety, and the get_lock method retrieves the current lock if one + has been set.""" + def __call__(self, *args, **kwargs): """ - This method __call__ in the class _tqdm_cls is used to conditionally return either a tqdm object or an EmptyTqdm object based on the _tqdm_active flag. - - Args: - self (object): The instance of the _tqdm_cls class. It is used to access the attributes and methods of the class. - - Returns: - None: This method does not explicitly return any value. It returns either a tqdm object or an EmptyTqdm object based on the _tqdm_active flag. - - Raises: - No specific exceptions are raised by this method under normal circumstances. However, if there are issues related to the instantiation of tqdm objects or EmptyTqdm objects, standard Python -exceptions may be raised. + This method __call__ in the class _tqdm_cls is used to conditionally return either a tqdm object or an EmptyTqdm object based on the _tqdm_active flag. + + Args: + self (object): The instance of the _tqdm_cls class. It is used to access the attributes and methods of the class. + + Returns: + None: This method does not explicitly return any value. It returns either a tqdm object or an EmptyTqdm object based on the _tqdm_active flag. + + Raises: + No specific exceptions are raised by this method under normal circumstances. However, if there are issues related to the instantiation of tqdm objects or EmptyTqdm objects, standard Python + exceptions may be raised. """ if _tqdm_active: return tqdm_lib.tqdm(*args, **kwargs) @@ -472,15 +479,15 @@ def __call__(self, *args, **kwargs): def set_lock(self, *args, **kwargs): """ Method to set the lock for the _tqdm_cls instance. - + Args: self (_tqdm_cls): The instance of the _tqdm_cls class. This parameter is required to access the instance and set the lock. It is of type _tqdm_cls and represents the instance on which the lock is being set. - + Returns: None: This method does not return any value. The lock is set within the instance itself. 
- + Raises: No specific exceptions are raised by this method. However, if _tqdm_active is False, the method will not set the lock and will return without any further action. @@ -492,13 +499,13 @@ def set_lock(self, *args, **kwargs): def get_lock(self): """ This method is used to retrieve the lock used by the _tqdm_cls class. - + Args: self (object): The instance of the _tqdm_cls class. - + Returns: None: This method does not return any value. - + Raises: N/A """ @@ -511,7 +518,7 @@ def get_lock(self): def is_progress_bar_enabled() -> bool: """Return a boolean indicating whether tqdm progress bars are enabled.""" - global _tqdm_active # pylint: disable=global-variable-not-assigned + global _tqdm_active # pylint: disable=global-variable-not-assigned return bool(_tqdm_active) @@ -524,4 +531,4 @@ def enable_progress_bar(): def disable_progress_bar(): """Disable tqdm progress bar.""" global _tqdm_active - _tqdm_active = False \ No newline at end of file + _tqdm_active = False diff --git a/mindnlp/utils/safetensors_patch.py b/mindnlp/utils/safetensors_patch.py index 34ff71026..367b37da3 100644 --- a/mindnlp/utils/safetensors_patch.py +++ b/mindnlp/utils/safetensors_patch.py @@ -2,12 +2,13 @@ import mmap from typing import OrderedDict import numpy as np +import safetensors +from safetensors import SafetensorError + import mindspore import mindtorch from mindtorch.configs import SUPPORT_BF16 -import safetensors -from safetensors import SafetensorError if SUPPORT_BF16: from mindspore.common.np_dtype import bfloat16 # pylint: disable=import-error @@ -69,6 +70,7 @@ "F64": 8, } + class PySafeSlice: def __init__(self, info, bufferfile, base_ptr, buffermmap): self.info = info @@ -93,7 +95,7 @@ def get(self, slice=None): array = array.reshape(self.shape) if slice is not None: array = array[slice] - if not SUPPORT_BF16 and self.info["dtype"] == 'BF16': + if not SUPPORT_BF16 and self.info["dtype"] == "BF16": array = array.astype(np.float16) tensor = mindtorch.from_numpy(array) tensor._ptr = array.ctypes.data @@ -132,6 +134,7 @@ def nbytes(self): def __getitem__(self, slice): return self.get(slice) + def getSize(fileobject): fileobject.seek(0, 2) # move the cursor to the end of the file size = fileobject.tell() @@ -153,6 +156,7 @@ def metadata_validate(metadata): raise ValueError("SafeTensorError::TensorInvalidInfo") return end + def read_metadata(buffer): buffer_len = getSize(buffer) if buffer_len < 8: @@ -197,9 +201,9 @@ def __exit__(self, *args): self.file.close() def metadata(self): - meta = self.__metadata__ + meta = self.__metadata__ if meta is not None: - meta['format'] = 'pt' + meta["format"] = "pt" return meta def keys(self): @@ -211,18 +215,19 @@ def get_tensor(self, name): def get_slice(self, name): return self.tensors[name] + def safe_save_file(tensor_dict, filename, metadata=None): """ Function to safely save a dictionary of tensors to a file. - + Args: tensor_dict (dict): A dictionary where keys are strings and values are numpy arrays representing tensors. filename (str): The name of the file where the tensor data will be saved. metadata (optional): Additional metadata to be saved along with the tensor data. Default is None. - + Returns: None. The function does not return any value explicitly. - + Raises: ValueError: If the input tensor_dict is not in the expected format. IOError: If there are issues with writing the data to the specified file. 
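# For reference, the on-disk layout that read_metadata and PySafeSlice
# rely on is small enough to parse by hand: an 8-byte little-endian
# header length, then a JSON header mapping tensor names to dtype, shape,
# and [begin, end) byte offsets into the data region that the mmap-backed
# slices index into. A sketch of just the header read:
import json
import struct

def read_safetensors_header(path):
    with open(path, "rb") as f:
        (header_len,) = struct.unpack("<Q", f.read(8))
        header = json.loads(f.read(header_len))
    metadata = header.pop("__metadata__", None)  # free-form file metadata
    return header, metadata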
@@ -231,11 +236,14 @@ def safe_save_file(tensor_dict, filename, metadata=None): tensor_dict = {k: v.asnumpy() for k, v in tensor_dict.items()} return safetensors.numpy.save_file(tensor_dict, filename, metadata) + def _tobytes(tensor, name): return tensor.tobytes() + def setup_safetensors_patch(): safetensors.safe_open = fast_safe_open from safetensors import torch + torch.save_file = safe_save_file torch._tobytes = _tobytes diff --git a/mindnlp/utils/testing_utils.py b/mindnlp/utils/testing_utils.py index b94c9f124..ca7760d86 100644 --- a/mindnlp/utils/testing_utils.py +++ b/mindnlp/utils/testing_utils.py @@ -43,7 +43,6 @@ import numpy as np import mindspore -from mindtorch.configs import SUPPORT_BF16 from transformers.utils.import_utils import ( is_pytest_available, @@ -60,13 +59,9 @@ is_g2p_en_available, is_levenshtein_available, is_nltk_available, - is_ftfy_available + is_ftfy_available, ) from transformers.utils.generic import strtobool -from .import_utils import ( - is_mindspore_available, - is_soundfile_availble, -) if is_pytest_available(): from _pytest.doctest import ( @@ -86,13 +81,11 @@ Module = object DoctestItem = object -if is_mindspore_available(): - from mindspore import ops - DUMMY_UNKNOWN_IDENTIFIER = "julien-c/dummy-unknown" SMALL_MODEL_IDENTIFIER = "julien-c/bert-xsmall-dummy" + def is_pipeline_test(test_case): """ Decorator marking a test as a pipeline test. If RUN_PIPELINE_TESTS is set to a falsy value, those tests will be @@ -108,24 +101,25 @@ def is_pipeline_test(test_case): else: return pytest.mark.is_pipeline_test()(test_case) + def parse_flag_from_env(key, default=False): """ - Parses a flag value from the environment variable. - - Args: - key (str): The name of the environment variable to retrieve the flag value from. - default (bool, optional): The default flag value to return if the environment variable is not set. Defaults to False. - - Returns: - bool: The parsed flag value. Returns the default value if the environment variable is not set or if its value cannot be parsed. - - Raises: - ValueError: If the environment variable value is set but cannot be parsed as a boolean ('yes' or 'no'). - - Note: - The flag value is retrieved from the environment variable specified by `key`. If the environment variable is not set, the default value is returned. If the environment variable value is set, it is -parsed as a boolean using the `strtobool` function from the `distutils.util` module. If the parsing fails, a `ValueError` is raised with a descriptive error message indicating that the value must be either -'yes' or 'no'. + Parses a flag value from the environment variable. + + Args: + key (str): The name of the environment variable to retrieve the flag value from. + default (bool, optional): The default flag value to return if the environment variable is not set. Defaults to False. + + Returns: + bool: The parsed flag value. Returns the default value if the environment variable is not set or if its value cannot be parsed. + + Raises: + ValueError: If the environment variable value is set but cannot be parsed as a boolean ('yes' or 'no'). + + Note: + The flag value is retrieved from the environment variable specified by `key`. If the environment variable is not set, the default value is returned. If the environment variable value is set, it is + parsed as a boolean using the `strtobool` function from the `distutils.util` module. If the parsing fails, a `ValueError` is raised with a descriptive error message indicating that the value must be either + 'yes' or 'no'. 
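# parse_flag_from_env, condensed to its core. A local _strtobool is used
# here because distutils is deprecated; the patch itself imports one from
# transformers.utils.generic.
import os

def _strtobool(value):
    value = value.lower()
    if value in ("y", "yes", "t", "true", "on", "1"):
        return True
    if value in ("n", "no", "f", "false", "off", "0"):
        return False
    raise ValueError(f"invalid truth value {value!r}")

def parse_flag(key, default=False):
    try:
        raw = os.environ[key]
    except KeyError:
        return default
    try:
        return _strtobool(raw)
    except ValueError as exc:
        raise ValueError(f"If set, {key} must be yes or no.") from exc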
""" try: value = os.environ[key] @@ -141,10 +135,12 @@ def parse_flag_from_env(key, default=False): raise ValueError(f"If set, {key} must be yes or no.") from exc return _value + _run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False) _run_too_slow_tests = parse_flag_from_env("RUN_TOO_SLOW", default=False) _run_pipeline_tests = parse_flag_from_env("RUN_PIPELINE_TESTS", default=True) + def slow(test_case): """ Decorator marking a test as slow. @@ -154,6 +150,7 @@ def slow(test_case): """ return unittest.skipUnless(_run_slow_tests, "test is slow")(test_case) + def tooslow(test_case): """ Decorator marking a test as too slow. @@ -164,16 +161,17 @@ def tooslow(test_case): """ return unittest.skipUnless(_run_too_slow_tests, "test is too slow")(test_case) + def parse_int_from_env(key, default=None): """Parses an integer value from the specified environment variable. - + Args: key (str): The name of the environment variable to retrieve the integer value from. default (int, optional): The default integer value to return if the environment variable is not set or cannot be converted to an integer. Defaults to None. - + Returns: int or None: The integer value parsed from the environment variable or the default value if provided. Returns None if the environment variable is not set and no default value is specified. - + Raises: ValueError: If the value retrieved from the environment variable cannot be converted to an integer. """ @@ -203,7 +201,9 @@ def require_levenshtein(test_case): These tests are skipped when Levenshtein isn't installed. """ - return unittest.skipUnless(is_levenshtein_available(), "test requires Levenshtein")(test_case) + return unittest.skipUnless(is_levenshtein_available(), "test requires Levenshtein")( + test_case + ) def require_nltk(test_case): @@ -223,57 +223,65 @@ def require_vision(test_case): """ return unittest.skipUnless(is_vision_available(), "test requires vision")(test_case) + def require_tokenizers(test_case): """ Decorator marking a test that requires 🤗 Tokenizers. These tests are skipped when 🤗 Tokenizers isn't installed. """ - return unittest.skipUnless(is_tokenizers_available(), "test requires tokenizers")(test_case) + return unittest.skipUnless(is_tokenizers_available(), "test requires tokenizers")( + test_case + ) + def require_sentencepiece(test_case): """ Decorator marking a test that requires SentencePiece. These tests are skipped when SentencePiece isn't installed. """ - return unittest.skipUnless(is_sentencepiece_available(), "test requires SentencePiece")(test_case) - -def require_mindspore(test_case): - """ - Decorator marking a test that requires MindSpore. - - These tests are skipped when MindSpore isn't installed. 
- - """ - return unittest.skipUnless(is_mindspore_available(), "test requires MindSpore")(test_case) + return unittest.skipUnless( + is_sentencepiece_available(), "test requires SentencePiece" + )(test_case) -def require_bfloat16(test_case): - """require_bfloat16""" - return unittest.skipUnless(SUPPORT_BF16, "test need bfloat16")(test_case) def require_mindspore_gpu(test_case): """Decorator marking a test that requires CUDA and MindSpore.""" - return unittest.skipUnless(mindspore.get_context('device_target') == "GPU", "test requires CUDA")(test_case) + return unittest.skipUnless( + mindspore.get_context("device_target") == "GPU", "test requires CUDA" + )(test_case) + def require_mindspore_npu(test_case): """Decorator marking a test that requires CANN and MindSpore.""" - return unittest.skipUnless(mindspore.get_context('device_target') == "Ascend", "test requires CANN")(test_case) + return unittest.skipUnless( + mindspore.get_context("device_target") == "Ascend", "test requires CANN" + )(test_case) def require_librosa(test_case): """ Decorator marking a test that requires librosa """ - return unittest.skipUnless(is_librosa_available(), "test requires librosa")(test_case) + return unittest.skipUnless(is_librosa_available(), "test requires librosa")( + test_case + ) + def require_essentia(test_case): """ Decorator marking a test that requires essentia """ - return unittest.skipUnless(is_essentia_available(), "test requires essentia")(test_case) + return unittest.skipUnless(is_essentia_available(), "test requires essentia")( + test_case + ) + def require_pretty_midi(test_case): """ Decorator marking a test that requires pretty_midi """ - return unittest.skipUnless(is_pretty_midi_available(), "test requires pretty_midi")(test_case) + return unittest.skipUnless(is_pretty_midi_available(), "test requires pretty_midi")( + test_case + ) + def require_scipy(test_case): """ @@ -281,23 +289,33 @@ def require_scipy(test_case): """ return unittest.skipUnless(is_scipy_available(), "test requires Scipy")(test_case) + def require_pyctcdecode(test_case): """ Decorator marking a test that requires pyctcdecode """ - return unittest.skipUnless(is_pyctcdecode_available(), "test requires pyctcdecode")(test_case) + return unittest.skipUnless(is_pyctcdecode_available(), "test requires pyctcdecode")( + test_case + ) + def require_safetensors(test_case): """ Decorator marking a test that requires safetensors. These tests are skipped when safetensors isn't installed. """ - return unittest.skipUnless(is_safetensors_available(), "test requires safetensors")(test_case) + return unittest.skipUnless(is_safetensors_available(), "test requires safetensors")( + test_case + ) + def require_pytesseract(test_case): """ Decorator marking a test that requires pytesseract """ - return unittest.skipUnless(is_pytesseract_available(), "test requires pytesseract")(test_case) + return unittest.skipUnless(is_pytesseract_available(), "test requires pytesseract")( + test_case + ) + def require_g2p_en(test_case): """ @@ -309,17 +327,19 @@ def require_g2p_en(test_case): def cmd_exists(cmd): """ Check if a command exists in the system PATH. - + Args: cmd (str): The name of the command to check for existence in the system PATH. - + Returns: None: Returns None if the command exists in the system PATH, otherwise returns False. - + Raises: None. 
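# require_mindspore_gpu/npu above gate on mindspore.get_context. The same
# idea works as a plain skipUnless marker; the ImportError fallback here
# is an assumption added so the sketch runs without MindSpore installed.
import unittest

def _device_target():
    try:
        import mindspore
        return mindspore.get_context("device_target")
    except ImportError:
        return "CPU"

require_gpu = unittest.skipUnless(_device_target() == "GPU", "test requires CUDA")

class _KernelTest(unittest.TestCase):
    @require_gpu
    def test_cuda_kernel(self):
        self.assertTrue(True)  # runs only when device_target == "GPU"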
""" return shutil.which(cmd) is not None + + # # Helper functions for dealing with testing text outputs # The original code came from: @@ -338,13 +358,13 @@ def cmd_exists(cmd): def apply_print_resets(buf): """ Apply print resets by removing any characters before the last carriage return in the given buffer. - + Args: buf (str): The input buffer containing text data. - + Returns: None. The function modifies the buffer in place. - + Raises: None. """ @@ -354,14 +374,14 @@ def apply_print_resets(buf): def assert_screenout(out, what): """ This function asserts the presence of a specified string within the provided output. - + Args: out (str): The output string to be checked for the presence of the specified string. what (str): The string to be searched for within the output. - + Returns: None: This function does not return any value. - + Raises: AssertionError: If the specified string 'what' is not found within the output string 'out'. """ @@ -418,29 +438,30 @@ class CaptureStd: print("Secret message") assert "message" in cs.out ```""" + def __init__(self, out=True, err=True, replay=True): """Initialize a CaptureStd object. - - Args: - self (CaptureStd): The instance of the CaptureStd class. - out (bool): Flag indicating whether to capture stdout. Default is True. - err (bool): Flag indicating whether to capture stderr. Default is True. - replay (bool): Flag indicating whether to replay captured output. Default is True. - - Returns: - None - - Raises: - None - - This method initializes a CaptureStd object with the given parameters. The 'out' parameter determines whether to capture stdout, while the 'err' parameter determines whether to capture stderr. By -default, both 'out' and 'err' are set to True. If 'out' is True, a StringIO object is created to capture stdout. If 'out' is False, stdout is not captured and the 'out' attribute is set to 'not capturing -stdout'. The same logic applies to 'err' and stderr. - - The 'replay' parameter determines whether the captured output should be replayed. By default, 'replay' is set to True. - - Note: If 'out' or 'err' is set to True, but the CaptureStd context is not finished yet (i.e., __exit__ is not called), an error message is set to the corresponding attribute indicating that the context -was called too early. + + Args: + self (CaptureStd): The instance of the CaptureStd class. + out (bool): Flag indicating whether to capture stdout. Default is True. + err (bool): Flag indicating whether to capture stderr. Default is True. + replay (bool): Flag indicating whether to replay captured output. Default is True. + + Returns: + None + + Raises: + None + + This method initializes a CaptureStd object with the given parameters. The 'out' parameter determines whether to capture stdout, while the 'err' parameter determines whether to capture stderr. By + default, both 'out' and 'err' are set to True. If 'out' is True, a StringIO object is created to capture stdout. If 'out' is False, stdout is not captured and the 'out' attribute is set to 'not capturing + stdout'. The same logic applies to 'err' and stderr. + + The 'replay' parameter determines whether the captured output should be replayed. By default, 'replay' is set to True. + + Note: If 'out' or 'err' is set to True, but the CaptureStd context is not finished yet (i.e., __exit__ is not called), an error message is set to the corresponding attribute indicating that the context + was called too early. 
""" self.replay = replay @@ -461,13 +482,13 @@ def __init__(self, out=True, err=True, replay=True): def __enter__(self): """ The '__enter__' method is used as a context manager to redirect the standard output and standard error streams to the provided buffers. - + Args: self: An instance of the 'CaptureStd' class. - + Returns: None. This method does not return any value explicitly. - + Raises: None. """ @@ -484,13 +505,13 @@ def __enter__(self): def __exit__(self, *exc): """ This method __exit__ is called automatically when exiting a 'with' block that uses the CaptureStd context manager. - + Args: self: An instance of the CaptureStd class that represents the current context manager. It is used to access the attributes and buffers within the context manager. - + Returns: None. The method does not explicitly return a value. - + Raises: This method does not raise any exceptions explicitly. However, exceptions may be raised if there are errors during the execution of the code within the method. """ @@ -510,33 +531,33 @@ def __exit__(self, *exc): def __repr__(self): """ - Returns a string representation of the CaptureStd object. - - Args: - self: The instance of the CaptureStd class. - - Returns: - None. This method does not return any value. - - Raises: - None. - - Description: - The __repr__ method is called when the repr() function is used on an instance of the CaptureStd class. It generates a string representation of the object, which includes the captured stdout and -stderr outputs, if any. The generated string representation is returned by the method. - - This method checks if the 'out_buf' attribute of the CaptureStd object is not empty. If it is not empty, the captured stdout output is added to the message string. Similarly, if the 'err_buf' -attribute is not empty, the captured stderr output is added to the message string. The final message string is then returned by the method. - - Note that the stdout and stderr outputs are represented as 'stdout: ' and 'stderr: ' respectively in the message string. - - Example Usage: - capture = CaptureStd() - capture.capture_stdout('Hello, world!') - capture.capture_stderr('Oops, an error occurred.') - repr_str = repr(capture) - print(repr_str) - # Output: "stdout: Hello, world!\nstderr: Oops, an error occurred.\n" + Returns a string representation of the CaptureStd object. + + Args: + self: The instance of the CaptureStd class. + + Returns: + None. This method does not return any value. + + Raises: + None. + + Description: + The __repr__ method is called when the repr() function is used on an instance of the CaptureStd class. It generates a string representation of the object, which includes the captured stdout and + stderr outputs, if any. The generated string representation is returned by the method. + + This method checks if the 'out_buf' attribute of the CaptureStd object is not empty. If it is not empty, the captured stdout output is added to the message string. Similarly, if the 'err_buf' + attribute is not empty, the captured stderr output is added to the message string. The final message string is then returned by the method. + + Note that the stdout and stderr outputs are represented as 'stdout: ' and 'stderr: ' respectively in the message string. 
+ + Example Usage: + capture = CaptureStd() + capture.capture_stdout('Hello, world!') + capture.capture_stderr('Oops, an error occurred.') + repr_str = repr(capture) + print(repr_str) + # Output: "stdout: Hello, world!\nstderr: Oops, an error occurred.\n" """ msg = "" if self.out_buf: @@ -554,19 +575,20 @@ def __repr__(self): class CaptureStdout(CaptureStd): """Same as CaptureStd but captures only stdout""" + def __init__(self, replay=True): """ Initializes an instance of the CaptureStdout class. - + Args: self: The instance of the class. - replay (bool): A boolean flag indicating whether the captured output should be replayed. + replay (bool): A boolean flag indicating whether the captured output should be replayed. Defaults to True. If set to True, the captured output will be replayed. If set to False, the captured output will not be replayed. - + Returns: None. This method does not return any value. - + Raises: No specific exceptions are raised by this method. """ @@ -575,17 +597,18 @@ def __init__(self, replay=True): class CaptureStderr(CaptureStd): """Same as CaptureStd but captures only stderr""" + def __init__(self, replay=True): """ Initializes an instance of the CaptureStderr class. - + Args: self (CaptureStderr): The current object. replay (bool): Indicates whether to replay the captured stderr output. Default is True. - + Returns: None. This method does not return any value. - + Raises: None. This method does not raise any exceptions. """ @@ -594,39 +617,40 @@ def __init__(self, replay=True): class CaptureLogger: """ - Context manager to capture `logging` streams + Context manager to capture `logging` streams - Args: - logger: 'logging` logger object + Args: + logger: 'logging` logger object - Returns: - The captured output is available via `self.out` + Returns: + The captured output is available via `self.out` - Example: + Example: - ```python - >>> from transformers import logging - >>> from transformers.testing_utils import CaptureLogger + ```python + >>> from transformers import logging + >>> from transformers.testing_utils import CaptureLogger + + >>> msg = "Testing 1, 2, 3" + >>> logging.set_verbosity_info() + >>> logger = logging.get_logger("transformers.models.bart.tokenization_bart") + >>> with CaptureLogger(logger) as cl: + ... logger.info(msg) + >>> assert cl.out, msg + "\n" + ``` + """ - >>> msg = "Testing 1, 2, 3" - >>> logging.set_verbosity_info() - >>> logger = logging.get_logger("transformers.models.bart.tokenization_bart") - >>> with CaptureLogger(logger) as cl: - ... logger.info(msg) - >>> assert cl.out, msg + "\n" - ``` - """ def __init__(self, logger): """ Initializes a new instance of the CaptureLogger class. - + Args: self: The instance of the class. logger: An object representing the logger to be used for capturing logs. It should be an instance of a logger class. - + Returns: None. This method does not return any value. - + Raises: None. This method does not raise any exceptions. """ @@ -637,17 +661,17 @@ def __init__(self, logger): def __enter__(self): """ - This method is an implementation of the context manager protocol for the CaptureLogger class. - - Args: - self: An instance of the CaptureLogger class. It represents the current object that the method is being called upon. - - Returns: - None. The method does not explicitly return any value, but it adds a handler to the logger associated with the CaptureLogger instance. - - Raises: - This method does not raise any exceptions under normal circumstances. 
However, potential exceptions could be raised if there are issues with adding the handler to the logger, such as improper -configuration of the logging system. + This method is an implementation of the context manager protocol for the CaptureLogger class. + + Args: + self: An instance of the CaptureLogger class. It represents the current object that the method is being called upon. + + Returns: + None. The method does not explicitly return any value, but it adds a handler to the logger associated with the CaptureLogger instance. + + Raises: + This method does not raise any exceptions under normal circumstances. However, potential exceptions could be raised if there are issues with adding the handler to the logger, such as improper + configuration of the logging system. """ self.logger.addHandler(self.sh) return self @@ -655,13 +679,13 @@ def __enter__(self): def __exit__(self, *exc): """ This method __exit__ is called automatically when exiting a 'with' block in the CaptureLogger class. - + Args: self (CaptureLogger): An instance of the CaptureLogger class. It is used to access the logger and the captured output. - + Returns: None. This method does not return any value. - + Raises: This method does not raise any exceptions explicitly. However, exceptions may be raised internally if there are issues with removing the handler or getting the captured output. """ @@ -671,13 +695,13 @@ def __exit__(self, *exc): def __repr__(self): """ Return a string representation of the CaptureLogger object. - + Args: self (CaptureLogger): The instance of the CaptureLogger class. - + Returns: None: This method does not explicitly return any value, as it returns None. - + Raises: None: This method does not raise any exceptions. """ @@ -804,38 +828,39 @@ def test_whatever(self): def test_whatever(self): env = self.get_env() ```""" + def setUp(self): """ - Set up the necessary environment for the TestCasePlus class. - - Args: - self: The instance of the TestCasePlus class. - - Returns: - None. This method does not return any value. - - Raises: - ValueError: If the root directory of the repository cannot be determined from the test file path. - - Description: - This method is called before each test case to set up the required environment for the TestCasePlus class. It initializes various directories and paths based on the current test file's location. The -method performs the following steps: - - 1. Sets up a list to keep track of temporary directories that need to be cleaned up later. - 2. Retrieves the path of the test file using the inspect module. - 3. Resolves the absolute path of the test file. - 4. Determines the parent directory of the test file. - 5. Checks if there are 'src' and 'tests' directories in any of the parent directories up to three levels above the test file. - 6. If such directories are found, the loop breaks and the repository root directory is set as the temporary directory. - 7. If no valid temporary directory is found, a ValueError is raised indicating that the root directory of the repository could not be determined. - 8. Sets the paths for the 'tests', 'examples', and 'src' directories within the repository root directory. - - Note: - This method assumes a specific directory structure for the repository, where 'src' and 'tests' directories exist at an appropriate level above the test file. - - Example usage: - test_case = TestCasePlus() - test_case.setUp() + Set up the necessary environment for the TestCasePlus class. + + Args: + self: The instance of the TestCasePlus class. 
+ + Returns: + None. This method does not return any value. + + Raises: + ValueError: If the root directory of the repository cannot be determined from the test file path. + + Description: + This method is called before each test case to set up the required environment for the TestCasePlus class. It initializes various directories and paths based on the current test file's location. The + method performs the following steps: + + 1. Sets up a list to keep track of temporary directories that need to be cleaned up later. + 2. Retrieves the path of the test file using the inspect module. + 3. Resolves the absolute path of the test file. + 4. Determines the parent directory of the test file. + 5. Checks if there are 'src' and 'tests' directories in any of the parent directories up to three levels above the test file. + 6. If such directories are found, the loop breaks and the repository root directory is set as the temporary directory. + 7. If no valid temporary directory is found, a ValueError is raised indicating that the root directory of the repository could not be determined. + 8. Sets the paths for the 'tests', 'examples', and 'src' directories within the repository root directory. + + Note: + This method assumes a specific directory structure for the repository, where 'src' and 'tests' directories exist at an appropriate level above the test file. + + Example usage: + test_case = TestCasePlus() + test_case.setUp() """ # get_auto_remove_tmp_dir feature: self.teardown_tmp_dirs = [] @@ -851,7 +876,9 @@ def setUp(self): if tmp_dir: self._repo_root_dir = tmp_dir else: - raise ValueError(f"can't figure out the root of the repo from {self._test_file_path}") + raise ValueError( + f"can't figure out the root of the repo from {self._test_file_path}" + ) self._tests_dir = self._repo_root_dir / "tests" self._examples_dir = self._repo_root_dir / "examples" self._src_dir = self._repo_root_dir / "src" @@ -860,13 +887,13 @@ def setUp(self): def test_file_path(self): """ Returns the test file path. - + Args: self: An instance of the TestCasePlus class. - + Returns: None. The method does not return any value. - + Raises: This method does not raise any exceptions. """ @@ -876,18 +903,18 @@ def test_file_path(self): def test_file_path_str(self): """ Method to retrieve the string representation of the test file path. - + Args: self: Instance of the TestCasePlus class. - Type: object - Purpose: Represents the current instance of the class. - Restrictions: None - + Returns: The method returns a string representing the test file path. - Type: str - Purpose: Provides the string representation of the test file path. - + Raises: No exceptions are raised by this method. """ @@ -897,14 +924,14 @@ def test_file_path_str(self): def test_file_dir(self): """ This method retrieves the directory path where test files are located. - + Args: self: An instance of the TestCasePlus class. This parameter refers to the current instance of the TestCasePlus class. - + Returns: None. The method does not return any value explicitly but retrieves and returns the test file directory path. - + Raises: This method does not raise any exceptions. """ @@ -914,13 +941,13 @@ def test_file_dir(self): def test_file_dir_str(self): """ Method test_file_dir_str in the class TestCasePlus. - + Args: self: Represents the instance of the class. No additional parameters are required. - + Returns: str: A string representation of the _test_file_dir attribute of the instance. - + Raises: None. 
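# The directory discovery in setUp, as a standalone sketch: climb from
# the test file until a directory holding both "src" and "tests" appears,
# mirroring the up-to-three-levels walk and the ValueError raised above
# when no candidate is found.
from pathlib import Path

def find_repo_root(test_file, max_up=3):
    path = Path(test_file).resolve().parent
    for _ in range(max_up):
        if (path / "src").is_dir() and (path / "tests").is_dir():
            return path
        path = path.parent
    raise ValueError(f"can't figure out the root of the repo from {test_file}")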
""" @@ -930,16 +957,16 @@ def test_file_dir_str(self): def tests_dir(self): """ Method: tests_dir - + Description: Returns the tests directory path used by the TestCasePlus class. - + Args: - self (object): The instance of the TestCasePlus class. - + Returns: - None: This method does not return any value explicitly. - + Raises: - None """ @@ -948,24 +975,24 @@ def tests_dir(self): @property def tests_dir_str(self): """ - Returns the tests directory as a string. - - Args: - self: An instance of the TestCasePlus class. - - Returns: - str: The tests directory path converted to a string. - - Raises: - None. - - This method returns the tests directory path as a string. The tests directory is obtained from the '_tests_dir' attribute of the TestCasePlus class. The returned string represents the absolute path of -the tests directory. - - Example usage: - >>> test_case = TestCasePlus() - >>> test_case.tests_dir_str() - '/path/to/tests/directory' + Returns the tests directory as a string. + + Args: + self: An instance of the TestCasePlus class. + + Returns: + str: The tests directory path converted to a string. + + Raises: + None. + + This method returns the tests directory path as a string. The tests directory is obtained from the '_tests_dir' attribute of the TestCasePlus class. The returned string represents the absolute path of + the tests directory. + + Example usage: + >>> test_case = TestCasePlus() + >>> test_case.tests_dir_str() + '/path/to/tests/directory' """ return str(self._tests_dir) @@ -973,13 +1000,13 @@ def tests_dir_str(self): def examples_dir(self): """ Method to get the examples directory path. - + Args: self: The instance of the class. - + Returns: None. The method returns the examples directory path. - + Raises: This method does not raise any exceptions. """ @@ -989,16 +1016,16 @@ def examples_dir(self): def examples_dir_str(self): """ Method examples_dir_str in the class TestCasePlus returns the string representation of the _examples_dir attribute. - + Args: self: An instance of the TestCasePlus class. Purpose: Represents the current instance of the class. Restrictions: None. - + Returns: str: A string representation of the _examples_dir attribute. Purpose: Provides a human-readable string representation of the _examples_dir attribute. - + Raises: None. """ @@ -1008,14 +1035,14 @@ def examples_dir_str(self): def repo_root_dir(self): """ Method to retrieve the root directory of the repository. - + Args: self (TestCasePlus): The instance of the TestCasePlus class. This parameter is required to access the instance attributes and methods. - + Returns: None. The method returns the value of the '_repo_root_dir' attribute of the instance. - + Raises: This method does not raise any exceptions. """ @@ -1025,15 +1052,15 @@ def repo_root_dir(self): def repo_root_dir_str(self): """ Method to retrieve the repository root directory as a string. - + Args: self: The instance of the class TestCasePlus. This parameter is automatically passed and refers to the instance itself. - + Returns: str: A string representing the repository root directory. This method returns the repository root directory as a string. - + Raises: None. """ @@ -1043,13 +1070,13 @@ def repo_root_dir_str(self): def src_dir(self): """ Returns the source directory path for the TestCasePlus class. - + Args: self (TestCasePlus): An instance of the TestCasePlus class. - + Returns: None: The method does not return any value. - + Raises: None: This method does not raise any exceptions. 
""" @@ -1059,16 +1086,16 @@ def src_dir(self): def src_dir_str(self): """ Method to retrieve the source directory path as a string representation. - + Args: self: An instance of the TestCasePlus class. This parameter refers to the current object instance. It is used to access the source directory path stored in the _src_dir attribute. - + Returns: None This method returns the source directory path as a string. If the source directory path does not exist or is empty, None is returned. - + Raises: None This method does not raise any exceptions. @@ -1186,7 +1213,9 @@ def python_one_liner_max_rss(self, one_liner_str): ``` """ if not cmd_exists("/usr/bin/time"): - raise ValueError("/usr/bin/time is required, install with `apt install time`") + raise ValueError( + "/usr/bin/time is required, install with `apt install time`" + ) cmd = shlex.split(f"/usr/bin/time -f %M python -c '{one_liner_str}'") with CaptureStd() as cs: @@ -1198,13 +1227,13 @@ def python_one_liner_max_rss(self, one_liner_str): def tearDown(self): """ Tears down the test case by cleaning up temporary directories. - + Args: self (TestCasePlus): The instance of the TestCasePlus class. - + Returns: None: This method does not return any value. - + Raises: None: This method does not raise any exceptions. """ @@ -1344,7 +1373,9 @@ def pytest_terminal_summary_main(tr, ids): f.write("slowest durations\n") for i, rep in enumerate(dlist): if rep.duration < durations_min: - f.write(f"{len(dlist)-i} durations < {durations_min} secs were omitted") + f.write( + f"{len(dlist)-i} durations < {durations_min} secs were omitted" + ) break f.write(f"{rep.duration:02.2f}s {rep.when:<8} {rep.nodeid}\n") @@ -1358,7 +1389,9 @@ def summary_failures_short(tr): msg = tr._getfailureheadline(rep) tr.write_sep("_", msg, red=True, bold=True) # chop off the optional leading extra frames, leaving only the last one - longrepr = re.sub(r".*_ _ _ (_ ){10,}_ _ ", "", rep.longreprtext, 0, re.M | re.S) + longrepr = re.sub( + r".*_ _ _ (_ ){10,}_ _ ", "", rep.longreprtext, 0, re.M | re.S + ) tr._tw.line(longrepr) # note: not printing out any rep.sections to keep the report short @@ -1392,7 +1425,9 @@ def summary_failures_short(tr): tr.summary_warnings() # normal warnings tr.summary_warnings() # final warnings - tr.reportchars = "wPpsxXEf" # emulate -rA (used in summary_passes() and short_test_summary()) + tr.reportchars = ( + "wPpsxXEf" # emulate -rA (used in summary_passes() and short_test_summary()) + ) # Skip the `passes` report, as it starts to take more than 5 minutes, and sometimes it timeouts on CircleCI if it # takes > 10 minutes (as this part doesn't generate any output on the terminal). @@ -1417,32 +1452,33 @@ def summary_failures_short(tr): # --- distributed testing functions --- # + # adapted from https://stackoverflow.com/a/59041913/9201239 class _RunOutput: - """ Represents the output of a command execution, including the return code, standard output, and standard error. - + Attributes: returncode (int): The return code of the executed command. stdout (str): The standard output captured from the command execution. stderr (str): The standard error captured from the command execution. """ + def __init__(self, returncode, stdout, stderr): """ __init__(self, returncode, stdout, stderr) - + Initializes the _RunOutput class instance with the provided return code, standard output, and standard error. - + Args: self (_RunOutput): The instance of the _RunOutput class. returncode (int): The return code from the executed command. 
stdout (str): The standard output generated by the executed command. stderr (str): The standard error generated by the executed command. - + Returns: None: This method does not return any value. - + Raises: No specific exceptions are raised by this method. """ @@ -1454,14 +1490,14 @@ def __init__(self, returncode, stdout, stderr): async def _read_stream(stream, callback): """ Docstring for _read_stream function: - + Args: stream (stream): The input stream from which the function reads data. callback (function): The callback function to be executed for each line read from the stream. - + Returns: None. The function does not return any value. - + Raises: No specific exceptions are raised by this function. """ @@ -1473,10 +1509,12 @@ async def _read_stream(stream, callback): break -async def _stream_subprocess(cmd, env=None, stdin=None, timeout=None, quiet=False, echo=False) -> _RunOutput: +async def _stream_subprocess( + cmd, env=None, stdin=None, timeout=None, quiet=False, echo=False +) -> _RunOutput: """ This function runs a subprocess and captures its standard output and error streams. - + Args: - cmd (List[str]): A list of command and arguments to be executed. - env (Optional[Dict[str, str]]): A dictionary of environment variables to be used for the subprocess. @@ -1484,10 +1522,10 @@ async def _stream_subprocess(cmd, env=None, stdin=None, timeout=None, quiet=Fals - timeout (Optional[float]): The maximum time in seconds to wait for the subprocess to complete. - quiet (bool): If True, suppresses the output of the subprocess. - echo (bool): If True, prints the command being executed. - + Returns: _RunOutput: An object containing the return code of the subprocess, its standard output, and standard error. - + Raises: - asyncio.TimeoutError: If the subprocess execution exceeds the specified timeout. - OSError: If an OS-related error occurs during the subprocess execution. @@ -1532,7 +1570,9 @@ def tee(line, sink, pipe, label=""): return _RunOutput(await p.wait(), out, err) -def execute_subprocess_async(cmd, env=None, stdin=None, timeout=180, quiet=False, echo=True) -> _RunOutput: +def execute_subprocess_async( + cmd, env=None, stdin=None, timeout=180, quiet=False, echo=True +) -> _RunOutput: """ Args: cmd (List[str]): A list of strings representing the command and its arguments to be executed. @@ -1541,16 +1581,18 @@ def execute_subprocess_async(cmd, env=None, stdin=None, timeout=180, quiet=False timeout (int): The maximum time in seconds to wait for the subprocess to complete. quiet (bool): If True, suppresses output from the subprocess. echo (bool): If True, prints the subprocess output to the console. - + Returns: _RunOutput: An object containing the output of the executed subprocess, including stdout, stderr, and returncode. - + Raises: RuntimeError: If the subprocess fails with a non-zero return code or produces no output. 
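# _stream_subprocess reduced to its essentials: spawn with pipes and
# drain stdout/stderr concurrently, so neither pipe can fill up and
# deadlock the child while the parent waits on the other stream.
import asyncio

async def run_and_capture(*cmd):
    proc = await asyncio.create_subprocess_exec(
        *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
    )

    async def drain(stream, sink):
        while True:
            line = await stream.readline()
            if not line:
                break
            sink.append(line.decode(errors="replace"))

    out, err = [], []
    await asyncio.gather(drain(proc.stdout, out), drain(proc.stderr, err))
    return await proc.wait(), out, err

# returncode, out, err = asyncio.run(run_and_capture("python", "--version"))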
""" loop = asyncio.get_event_loop() result = loop.run_until_complete( - _stream_subprocess(cmd, env=env, stdin=stdin, timeout=timeout, quiet=quiet, echo=echo) + _stream_subprocess( + cmd, env=env, stdin=stdin, timeout=timeout, quiet=quiet, echo=echo + ) ) cmd_str = " ".join(cmd) @@ -1603,13 +1645,14 @@ def nested_simplify(obj, decimals=3): if isinstance(obj, np.ndarray): return nested_simplify(obj.tolist()) if isinstance(obj, Mapping): - return {nested_simplify(k, decimals): nested_simplify(v, decimals) for k, v in obj.items()} + return { + nested_simplify(k, decimals): nested_simplify(v, decimals) + for k, v in obj.items() + } if isinstance(obj, (str, int, np.int64)): return obj if obj is None: return obj - if is_mindspore_available() and ops.is_tensor(obj): - return nested_simplify(obj.numpy().tolist()) if isinstance(obj, float): return round(obj, decimals) if isinstance(obj, (np.int32, np.float32)): @@ -1620,17 +1663,17 @@ def nested_simplify(obj, decimals=3): def to_2tuple(x): """ Converts the input value to a 2-tuple. - + Args: x: The value to be converted. It can be of any type. - + Returns: A 2-tuple with the input value. If the input value is already an iterable, it is returned as is. Otherwise, a 2-tuple is created with the input value repeated twice. - + Raises: None. - + """ if isinstance(x, collections.abc.Iterable): return x @@ -1640,6 +1683,8 @@ def to_2tuple(x): # These utils relate to ensuring the right error message is received when running scripts class SubprocessCallException(Exception): """SubprocessCallException""" + + def run_command(command: List[str], return_stdout=False): """ Runs `command` with `subprocess.check_output` and will potentially return the `stdout`. Will also properly capture @@ -1657,6 +1702,7 @@ def run_command(command: List[str], return_stdout=False): ) from e return None + class RequestCounter: """ Helper class that will count all requests made online. @@ -1672,40 +1718,53 @@ class RequestCounter: assert counter.total_calls == 1 ``` """ + def __enter__(self): """ __enter__ - + Args: self: The instance of the RequestCounter class. - + Returns: None. This method does not explicitly return a value. - + Raises: No specific exceptions are raised within this method. """ self._counter = defaultdict(int) - self.patcher = patch.object(urllib3.connectionpool.log, "debug", wraps=urllib3.connectionpool.log.debug) + self.patcher = patch.object( + urllib3.connectionpool.log, "debug", wraps=urllib3.connectionpool.log.debug + ) self.mock = self.patcher.start() return self def __exit__(self, *args, **kwargs) -> None: """ This method '__exit__' in the class 'RequestCounter' is called upon exiting a context manager. It updates the request counters based on the logged HTTP methods. - + Args: - self: An instance of the 'RequestCounter' class. It represents the current instance of the class. - + Returns: - None: This method does not return any value. - + Raises: This method does not explicitly raise any exceptions. """ for call in self.mock.call_args_list: log = call.args[0] % call.args[1:] - for method in ("HEAD", "GET", "POST", "PUT", "DELETE", "CONNECT", "OPTIONS", "TRACE", "PATCH"): + for method in ( + "HEAD", + "GET", + "POST", + "PUT", + "DELETE", + "CONNECT", + "OPTIONS", + "TRACE", + "PATCH", + ): if method in log: self._counter[method] += 1 break @@ -1714,14 +1773,14 @@ def __exit__(self, *args, **kwargs) -> None: def __getitem__(self, key: str) -> int: """ Retrieve the count associated with the specified key from the RequestCounter. 
- + Args: self (RequestCounter): An instance of the RequestCounter class. key (str): The key for which the count needs to be retrieved. It should be a string representing the identifier of the request. - + Returns: int: The count associated with the specified key. This count indicates the number of times the request identified by the key has been made. - + Raises: KeyError: If the specified key does not exist in the RequestCounter, a KeyError is raised indicating that the count for the key cannot be retrieved. """ @@ -1729,23 +1788,28 @@ def __getitem__(self, key: str) -> int: @property def total_calls(self) -> int: - """ + """ Method to calculate the total number of calls made to the RequestCounter instance. - + Args: self (RequestCounter): The instance of the RequestCounter class. This parameter is automatically passed when calling the method. - + Returns: int: The total number of calls made to the RequestCounter instance. It is the sum of all the values stored in the internal counter. - + Raises: No specific exceptions are raised by this method. """ return sum(self._counter.values()) -def is_flaky(max_attempts: int = 5, wait_before_retry: Optional[float] = None, description: Optional[str] = None): + +def is_flaky( + max_attempts: int = 5, + wait_before_retry: Optional[float] = None, + description: Optional[str] = None, +): """ To decorate flaky tests. They will be retried on failures. @@ -1758,6 +1822,7 @@ def is_flaky(max_attempts: int = 5, wait_before_retry: Optional[float] = None, d A string to describe the situation (what / where / why is flaky, link to GH issue/PR comments, errors, etc.) """ + def decorator(test_func_ref): @functools.wraps(test_func_ref) def wrapper(*args, **kwargs): @@ -1768,7 +1833,10 @@ def wrapper(*args, **kwargs): return test_func_ref(*args, **kwargs) except Exception as err: - print(f"Test failed with {err} at try {retry_count}/{max_attempts}.", file=sys.stderr) + print( + f"Test failed with {err} at try {retry_count}/{max_attempts}.", + file=sys.stderr, + ) if wait_before_retry is not None: time.sleep(wait_before_retry) retry_count += 1 @@ -1840,11 +1908,18 @@ def preprocess_string(string, skip_cuda_tests): `string`. """ codeblock_pattern = r"(```(?:python|py)\s*\n\s*>>> )((?:.*?\n)*?.*?```)" - codeblocks = re.split(re.compile(codeblock_pattern, flags=re.MULTILINE | re.DOTALL), string) + codeblocks = re.split( + re.compile(codeblock_pattern, flags=re.MULTILINE | re.DOTALL), string + ) is_cuda_found = False for i, codeblock in enumerate(codeblocks): - if "load_dataset(" in codeblock and "# doctest: +IGNORE_RESULT" not in codeblock: - codeblocks[i] = re.sub(r"(>>> .*load_dataset\(.*)", r"\1 # doctest: +IGNORE_RESULT", codeblock) + if ( + "load_dataset(" in codeblock + and "# doctest: +IGNORE_RESULT" not in codeblock + ): + codeblocks[i] = re.sub( + r"(>>> .*load_dataset\(.*)", r"\1 # doctest: +IGNORE_RESULT", codeblock + ) if ( (">>>" in codeblock or "..." in codeblock) and re.search(r"cuda|to\(0\)|device=0", codeblock) @@ -1868,6 +1943,7 @@ class HfDocTestParser(doctest.DocTestParser): Tests involving cuda are skipped base on a naive pattern that should be updated if it is not enough. """ + # This regular expression is used to find doctest examples in a # string. It defines three groups: `source` is the source code # (including leading indentation and prompts); `indent` is the @@ -1910,20 +1986,22 @@ class HfDoctestModule(Module): Overwrites the `DoctestModule` of the pytest package to make sure the HFDocTestParser is used when discovering tests. 
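# is_flaky's retry loop, stripped of the decorator plumbing: rerun the
# wrapped test on any exception up to max_attempts, letting the final
# attempt's failure propagate to the test runner.
import functools
import sys
import time

def flaky(max_attempts=5, wait_before_retry=None):
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            for attempt in range(1, max_attempts):
                try:
                    return fn(*args, **kwargs)
                except Exception as err:
                    print(
                        f"Test failed with {err} at try {attempt}/{max_attempts}.",
                        file=sys.stderr,
                    )
                    if wait_before_retry is not None:
                        time.sleep(wait_before_retry)
            return fn(*args, **kwargs)  # final attempt: let the error raise
        return wrapper
    return decorator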
""" + def collect(self) -> Iterable[DoctestItem]: """ Collects doctests from the specified module. - + Args: self (HfDoctestModule): The instance of the HfDoctestModule class. - + Returns: Iterable[DoctestItem]: A collection of doctests represented as DoctestItem objects. - + Raises: ImportError: If the module cannot be imported and the 'doctest_ignore_import_errors' configuration option is not set. Skip: If the 'doctest_ignore_import_errors' configuration option is set and the module cannot be imported. """ + class MockAwareDocTestFinder(doctest.DocTestFinder): """A hackish doctest finder that overrides stdlib internals to fix a stdlib bug. @@ -1950,7 +2028,9 @@ def _find_lineno(self, obj, source_lines): source_lines, ) - def _find(self, tests, obj, name, module, source_lines, globs, seen) -> None: + def _find( + self, tests, obj, name, module, source_lines, globs, seen + ) -> None: if _is_mocked(obj): return with _patch_unwrap_mock_aware(): @@ -1990,20 +2070,24 @@ def _find(self, tests, obj, name, module, source_lines, globs, seen) -> None: ) for test in finder.find(module, module.__name__): if test.examples: # skip empty doctests and cuda - yield DoctestItem.from_parent(self, name=test.name, runner=runner, dtest=test) + yield DoctestItem.from_parent( + self, name=test.name, runner=runner, dtest=test + ) -def _device_agnostic_dispatch(device: str, dispatch_table: Dict[str, Callable], *args, **kwargs): +def _device_agnostic_dispatch( + device: str, dispatch_table: Dict[str, Callable], *args, **kwargs +): """ Executes a device-agnostic dispatch based on the given device and dispatch table. - + Args: device (str): The device for which the dispatch is performed. dispatch_table (Dict[str, Callable]): A dictionary containing the dispatch functions for different devices. - + Returns: None: Returns None if the dispatch function for the given device is None. - + Raises: None: This function does not raise any exceptions. """ @@ -2018,6 +2102,7 @@ def _device_agnostic_dispatch(device: str, dispatch_table: Dict[str, Callable], return None return fn(*args, **kwargs) + def get_tests_dir(append_path=None): """ Args: @@ -2039,22 +2124,23 @@ def get_tests_dir(append_path=None): return os.path.join(tests_dir, append_path) return tests_dir + def check_json_file_has_correct_format(file_path): - ''' + """ Check if the provided JSON file has the correct format. - + Args: file_path (str): The path to the JSON file to be checked. - + Returns: None: This function does not return any value. - + Raises: AssertionError: If the JSON file does not have the correct format as per the specified conditions. FileNotFoundError: If the specified file_path does not exist. UnicodeDecodeError: If the file cannot be decoded using the specified encoding. - ''' - with open(file_path, "r", encoding='utf-8') as f: + """ + with open(file_path, "r", encoding="utf-8") as f: lines = f.readlines() if len(lines) == 1: # length can only be 1 if dict is empty @@ -2069,8 +2155,10 @@ def check_json_file_has_correct_format(file_path): assert left_indent == 2 assert lines[-1].strip() == "}" + _run_staging = parse_flag_from_env("MINDNLP_CO_STAGING", default=False) + def is_staging_test(test_case): """ Decorator marking a test as a staging test. @@ -2088,17 +2176,6 @@ def is_staging_test(test_case): return pytest.mark.is_staging_test()(test_case) -def require_soundfile(test_case): - """ - Decorator marking a test that requires soundfile - - These tests are skipped when soundfile isn't installed. 
- - """ - return unittest.skipUnless(is_soundfile_availble(), "test requires soundfile")( - test_case - ) - def backend_empty_cache(): - if hasattr(mindspore, 'hal'): - mindspore.hal.empty_cache() \ No newline at end of file + if hasattr(mindspore, "hal"): + mindspore.hal.empty_cache() diff --git a/mindtorch/_apis/npu.py b/mindtorch/_apis/npu.py index f9d754198..540462fca 100644 --- a/mindtorch/_apis/npu.py +++ b/mindtorch/_apis/npu.py @@ -1603,6 +1603,8 @@ def multinomial(input, num_samples, replacement, generator): return pyboost.multinomial_ext_op(input, num_samples, replacement, seed, offset) def right_shift(input, other): + if isinstance(other, int): + other = mindspore.Tensor(other, dtype=input.dtype) if use_pyboost(): return pyboost.right_shift_op(input, other) return legacy.right_shift(input, other) diff --git a/setup.py b/setup.py index b15afeb21..adf9a7a76 100644 --- a/setup.py +++ b/setup.py @@ -153,10 +153,11 @@ def run(self): "mindnlp": "mindnlp", "mindtorch": "mindtorch", }, - # package_data={ - # 'mindnlp': ['*', '*/*', '*/*/*', '*/*/*/*', '*/*/*/*/*', '*/*/*/*/*/*'], - # 'mindtorch': ['*', '*/*', '*/*/*', '*/*/*/*', '*/*/*/*/*', '*/*/*/*/*/*'] - # }, + package_data={ + 'mindnlp': ['*.py', '*/*.py', '*/*/*.py', '*/*/*/*.py', '*/*/*/*/*.py', '*/*/*/*/*/*.py', + '*.cu', '*/*.cu', '*/*/*.cu', '*/*/*/*.cu', '*/*/*/*/*.cu'], + 'mindtorch': ['*.py', '*/*.py', '*/*/*.py', '*/*/*/*.py', '*/*/*/*/*.py', '*/*/*/*/*/*.py'] + }, cmdclass={ 'egg_info': EggInfo, 'build_py': BuildPy,
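# The right_shift hunk above promotes a bare Python int to a tensor
# carrying the input's dtype before the kernel call, presumably because
# the underlying NPU op rejects scalar operands. The same pattern with
# numpy as a stand-in, so the sketch runs anywhere:
import numpy as np

def right_shift(x, other):
    if isinstance(other, int):
        other = np.asarray(other, dtype=x.dtype)  # match input dtype, as in the patch
    return np.right_shift(x, other)

assert list(right_shift(np.array([8, 16], dtype=np.int32), 2)) == [2, 4]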