diff --git a/mindnlp/core/nn/functional.py b/mindnlp/core/nn/functional.py index 47d060454..133d68be4 100644 --- a/mindnlp/core/nn/functional.py +++ b/mindnlp/core/nn/functional.py @@ -177,6 +177,25 @@ def binary_cross_entropy_with_logits(input, target, weight=None, reduction='mean return mindspore.mint.nn.functional.binary_cross_entropy_with_logits(input, target, weight, reduction, pos_weight) return ops.binary_cross_entropy_with_logits(input, target, weight, pos_weight, reduction) +def gumbel_softmax(logits: Tensor, tau: float = 1, hard: bool = False, eps: float = 1e-10, dim: int = -1) -> Tensor: + if eps != 1e-10: + warnings.warn("`eps` parameter is deprecated and has no effect.") + + uniform_samples = _get_cache_prim(ops.UniformReal)()(logits.shape) + gumbels = -ops.log(-ops.log(uniform_samples + eps) + eps) # ~Gumbel(0, 1) + gumbels = (logits + gumbels) / tau # ~Gumbel(logits,tau) + y_soft = softmax(gumbels, dim) + + if hard: + # Straight through. + index = y_soft.argmax(dim) + y_hard = one_hot(index, logits.shape[dim]) + ret = ops.stop_gradient(y_hard - y_soft) + y_soft + else: + # Reparametrization trick. + ret = y_soft + return ret + def log_softmax(input, dim=-1, dtype=None): out = ops.log_softmax(input, dim) if dtype is not None: @@ -791,7 +810,7 @@ def multi_head_attention_forward( assert key_padding_mask.shape == (bsz, src_len), \ f"expecting key_padding_mask shape of {(bsz, src_len)}, but got {key_padding_mask.shape}" key_padding_mask = key_padding_mask.view(bsz, 1, 1, src_len). \ - expand(-1, num_heads, -1, -1).reshape(bsz * num_heads, 1, src_len) + broadcast_to((-1, num_heads, -1, -1)).reshape(bsz * num_heads, 1, src_len) if attn_mask is None: attn_mask = key_padding_mask else: diff --git a/mindnlp/core/serialization.py b/mindnlp/core/serialization.py index b7164cda7..eec39ac9b 100644 --- a/mindnlp/core/serialization.py +++ b/mindnlp/core/serialization.py @@ -35,7 +35,7 @@ import numpy as np import mindspore -from mindspore import Tensor +from mindspore import Tensor, Parameter from mindspore.train.serialization import _exec_save, _parse_ckpt_proto, tensor_to_np_type, tensor_to_ms_type import safetensors @@ -756,6 +756,13 @@ def _open_zipfile_writer(name_or_buffer): container = _open_zipfile_writer_buffer return container(name_or_buffer) +def _rebuild_parameter(data, requires_grad, backward_hooks): + param = Parameter(data, requires_grad=requires_grad) + # NB: This line exists only for backwards compatibility; the + # general expectation is that backward_hooks is an empty + # OrderedDict. See Note [Don't serialize hooks] + return param + def _rebuild_tensor_v2(storage, storage_offset, size, stride, requires_grad, backward_hooks, metadata=None): '''Rebuilds a tensor based on the provided parameters. diff --git a/mindnlp/peft/peft_model.py b/mindnlp/peft/peft_model.py index 922f34c57..32e51bbcd 100644 --- a/mindnlp/peft/peft_model.py +++ b/mindnlp/peft/peft_model.py @@ -253,7 +253,7 @@ def load_adapter(self, model_id: str, adapter_name: str, is_trainable: bool = Fa return load_result - def get_nb_trainable_parameters(self) -> tuple[int, int]: + def get_nb_trainable_parameters(self): r""" Returns the number of trainable parameters and the number of all parameters in the model. 
""" diff --git a/mindnlp/transformers/cache_utils.py b/mindnlp/transformers/cache_utils.py index 98206eb11..71e154499 100644 --- a/mindnlp/transformers/cache_utils.py +++ b/mindnlp/transformers/cache_utils.py @@ -990,7 +990,7 @@ def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: """Returns the sequence length of the cached states. A layer index can be optionally passed.""" if len(self.self_attention_cache.key_cache) <= layer_idx: return 0 - return (ops.any(self.self_attention_cache.key_cache[layer_idx][0, 0], dim=-1)).sum().item() + return (ops.any(self.self_attention_cache.key_cache[layer_idx][0, 0].bool(), dim=-1)).sum().item() def reset(self): if hasattr(self.self_attention_cache, "reset"): diff --git a/mindnlp/transformers/models/data2vec/modeling_data2vec_audio.py b/mindnlp/transformers/models/data2vec/modeling_data2vec_audio.py index 4d477afcb..0f511cc1a 100644 --- a/mindnlp/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/mindnlp/transformers/models/data2vec/modeling_data2vec_audio.py @@ -20,7 +20,7 @@ import mindspore from mindspore.common.initializer import Uniform, HeNormal, initializer,Normal -from mindnlp.core import nn, ops +from mindnlp.core import nn, ops, no_grad from mindnlp.core.nn import functional as F from mindnlp.utils import logging from ...activations import ACT2FN @@ -969,8 +969,9 @@ def forward( if labels is not None: # retrieve loss input_lengths from attention_mask labels = labels.astype(mindspore.int32) - # if labels.max() >= self.config.vocab_size: - # raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") + with no_grad(): + if ops.max(labels) >= self.config.vocab_size: + raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") attention_mask = ( attention_mask if attention_mask is not None else ops.ones_like(input_values, dtype=mindspore.int64) ) diff --git a/mindnlp/transformers/models/deformable_detr/modeling_deformable_detr.py b/mindnlp/transformers/models/deformable_detr/modeling_deformable_detr.py index e4e129845..2620b6384 100644 --- a/mindnlp/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/mindnlp/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -1687,9 +1687,9 @@ def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes) proposals.append(proposal) _cur += height * width output_proposals = ops.cat(proposals, 1) - output_proposals_valid = ( - (output_proposals > 0.01).int() & (output_proposals < 0.99).int() - ).all(-1, keep_dims=True) + output_proposals_valid = ops.all( + ((output_proposals > 0.01).int() & (output_proposals < 0.99).int()).bool(), -1, keepdim=True + ) output_proposals = ops.log( output_proposals / (1 - output_proposals) ) # inverse sigmoid @@ -2291,8 +2291,9 @@ def loss_labels(self, outputs, targets, indices, num_boxes): source_logits.shape[2] + 1, dtype=source_logits.dtype, ) + target_classes = target_classes.unsqueeze(-1) target_classes_onehot = ops.scatter( - target_classes_onehot, 2, target_classes.unsqueeze(-1), ops.ones_like(target_classes_onehot) + target_classes_onehot, 2, target_classes, ops.ones_like(target_classes, dtype=target_classes_onehot.dtype) ) target_classes_onehot = target_classes_onehot[:, :, :-1] loss_ce = ( diff --git a/mindnlp/transformers/models/wav2vec2/__init__.py b/mindnlp/transformers/models/wav2vec2/__init__.py index e1790d678..c380c1179 100644 --- a/mindnlp/transformers/models/wav2vec2/__init__.py +++ b/mindnlp/transformers/models/wav2vec2/__init__.py @@ -15,21 +15,11 
@@ ''' Wav2Vec2 Model ''' from . import configuration_wav2vec2, feature_extraction_wav2vec2, processing_wav2vec2, tokenization_wav2vec2, modeling_wav2vec2 -from .configuration_wav2vec2 import WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP, Wav2Vec2Config +from .configuration_wav2vec2 import * from .feature_extraction_wav2vec2 import Wav2Vec2FeatureExtractor from .processing_wav2vec2 import Wav2Vec2Processor from .tokenization_wav2vec2 import Wav2Vec2CTCTokenizer, Wav2Vec2Tokenizer -from .modeling_wav2vec2 import ( - WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST, - Wav2Vec2ForAudioFrameClassification, - Wav2Vec2ForCTC, - Wav2Vec2ForMaskedLM, - Wav2Vec2ForPreTraining, - Wav2Vec2ForSequenceClassification, - Wav2Vec2ForXVector, - Wav2Vec2Model, - Wav2Vec2PreTrainedModel, -) +from .modeling_wav2vec2 import * __all__ = [] __all__.extend(configuration_wav2vec2.__all__) diff --git a/mindnlp/transformers/models/wav2vec2/modeling_wav2vec2.py b/mindnlp/transformers/models/wav2vec2/modeling_wav2vec2.py index 10d0f3ad4..73b795581 100644 --- a/mindnlp/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/mindnlp/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Mindspore Wav2Vec2 model. """ +"""MindSpore Wav2Vec2 model.""" import math import warnings @@ -21,14 +21,12 @@ import numpy as np import mindspore -from mindspore import Tensor, Parameter -from mindspore.common.initializer import initializer, Normal, Uniform - from mindnlp.core import nn, ops -from mindnlp.core.nn import functional as F +from mindnlp.core.nn import CrossEntropyLoss +from mindnlp.core.serialization import load, safe_load_file + from ...activations import ACT2FN from ...modeling_outputs import ( - ModelOutput, BaseModelOutput, CausalLMOutput, MaskedLMOutput, @@ -39,37 +37,46 @@ ) from ...modeling_utils import PreTrainedModel from ....utils import ( + ModelOutput, cached_file, + is_safetensors_available, logging, ) - from .configuration_wav2vec2 import Wav2Vec2Config -__all__ = [ - 'WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST', - 'Wav2Vec2PreTrainedModel', - 'Wav2Vec2Model', - 'Wav2Vec2ForPreTraining', - 'Wav2Vec2ForMaskedLM', - 'Wav2Vec2ForCTC', - 'Wav2Vec2ForSequenceClassification', - 'Wav2Vec2ForAudioFrameClassification', - 'Wav2Vec2ForXVector', -] + +WAV2VEC2_ADAPTER_PT_FILE = "adapter.{}.bin" +WAV2VEC2_ADAPTER_SAFE_FILE = "adapter.{}.safetensors" + logger = logging.get_logger(__name__) + _HIDDEN_STATES_START_POSITION = 2 -WAV2VEC2_ADAPTER_PT_FILE = "adapter.{}.bin" -WAV2VEC2_ADAPTER_SAFE_FILE = "adapter.{}.safetensors" -WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "facebook/wav2vec2-base-960h", - "facebook/wav2vec2-large-960h", - "facebook/wav2vec2-large-960h-lv60", - "facebook/wav2vec2-large-960h-lv60-self", - # See all Wav2Vec2 models at https://hf-mirror.com/models?filter=wav2vec2 -] +# General docstring +_CONFIG_FOR_DOC = "Wav2Vec2Config" + +# Base docstring +_CHECKPOINT_FOR_DOC = "facebook/wav2vec2-base-960h" +_EXPECTED_OUTPUT_SHAPE = [1, 292, 768] + +# CTC docstring +_CTC_EXPECTED_OUTPUT = "'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'" +_CTC_EXPECTED_LOSS = 53.48 + +# Audio class docstring +_SEQ_CLASS_CHECKPOINT = "superb/wav2vec2-base-superb-ks" +_SEQ_CLASS_EXPECTED_OUTPUT = "'_unknown_'" +_SEQ_CLASS_EXPECTED_LOSS = 6.54 + +# Frame class docstring +_FRAME_CLASS_CHECKPOINT = 
"anton-l/wav2vec2-base-superb-sd" +_FRAME_EXPECTED_OUTPUT = [0, 0] + +# Speaker Verification docstring +_XVECTOR_CHECKPOINT = "anton-l/wav2vec2-base-superb-sv" +_XVECTOR_EXPECTED_OUTPUT = 0.98 @dataclass @@ -78,48 +85,47 @@ class Wav2Vec2ForPreTrainingOutput(ModelOutput): Output type of [`Wav2Vec2ForPreTraining`], with potential hidden states and attentions. Args: - loss (*optional*, returned when `sample_negative_indices` are passed, `Tensor` of shape `(1,)`): + loss (*optional*, returned when `sample_negative_indices` are passed, `mindspore.Tensor` of shape `(1,)`): Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) . (classification) loss. - projected_states (`Tensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): + projected_states (`mindspore.Tensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked projected quantized states. - projected_quantized_states (`Tensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): + projected_quantized_states (`mindspore.Tensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive target vectors for contrastive loss. - hidden_states (`tuple(Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when - `config.output_hidden_states=True`): - Tuple of `Tensor` (one for the output of the embeddings + one for the output of each layer) of + hidden_states (`tuple(mindspore.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `mindspore.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(Tensor)`, *optional*, returned when `output_attentions=True` is passed or when - `config.output_attentions=True`): - Tuple of `Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + attentions (`tuple(mindspore.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `mindspore.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - contrastive_loss (*optional*, returned when `sample_negative_indices` are passed, `Tensor` of shape `(1,)`): + contrastive_loss (*optional*, returned when `sample_negative_indices` are passed, `mindspore.Tensor` of shape `(1,)`): The contrastive loss (L_m) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) . - diversity_loss (*optional*, returned when `sample_negative_indices` are passed, `Tensor` of shape `(1,)`): + diversity_loss (*optional*, returned when `sample_negative_indices` are passed, `mindspore.Tensor` of shape `(1,)`): The diversity loss (L_d) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) . 
""" - loss: Optional[Tensor] = None - projected_states: Tensor = None - projected_quantized_states: Tensor = None - codevector_perplexity: Tensor = None - hidden_states: Optional[Tuple[Tensor]] = None - attentions: Optional[Tuple[Tensor]] = None - contrastive_loss: Optional[Tensor] = None - diversity_loss: Optional[Tensor] = None + + loss: Optional[mindspore.Tensor] = None + projected_states: mindspore.Tensor = None + projected_quantized_states: mindspore.Tensor = None + codevector_perplexity: mindspore.Tensor = None + hidden_states: Optional[Tuple[mindspore.Tensor]] = None + attentions: Optional[Tuple[mindspore.Tensor]] = None + contrastive_loss: Optional[mindspore.Tensor] = None + diversity_loss: Optional[mindspore.Tensor] = None def _compute_mask_indices( shape: Tuple[int, int], mask_prob: float, mask_length: int, - attention_mask: Optional[Tensor] = None, + attention_mask: Optional[mindspore.Tensor] = None, min_masks: int = 0, ) -> np.ndarray: """ @@ -129,15 +135,15 @@ def _compute_mask_indices( Args: shape: The shape for which to compute masks. This should be of a tuple of size 2 where - the first element is the batch size and the second element is the length of the axis to span. + the first element is the batch size and the second element is the length of the axis to span. mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of - independently generated mask spans of length `mask_length` is computed by - `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the - actual percentage will be smaller. + independently generated mask spans of length `mask_length` is computed by + `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the + actual percentage will be smaller. mask_length: size of the mask min_masks: minimum number of masked spans attention_mask: A (right-padded) attention mask which independently shortens the feature axis of - each batch dimension. + each batch dimension. """ batch_size, sequence_length = shape @@ -248,8 +254,6 @@ def _sample_negative_indices( # get `num_negatives` random vector indices from the same utterance sampled_negative_indices = np.zeros(shape=(batch_size, sequence_length, num_negatives), dtype=np.int32) - if isinstance(mask_time_indices, Tensor): - mask_time_indices = mask_time_indices.asnumpy() mask_time_indices = ( mask_time_indices.astype(bool) if mask_time_indices is not None else np.ones(features_shape, dtype=bool) ) @@ -273,48 +277,7 @@ def _sample_negative_indices( class Wav2Vec2NoLayerNormConvLayer(nn.Module): - - """ - Wav2Vec2NoLayerNormConvLayer is a Python class representing a convolutional layer without layer normalization for - the Wav2Vec2 model. This class inherits from nn.Module and is used for processing audio features. - - Attributes: - config (Wav2Vec2Config): The configuration object for the Wav2Vec2 model. - layer_id (int): The index of the convolutional layer. - in_conv_dim (int): The input dimension of the convolutional layer. - out_conv_dim (int): The output dimension of the convolutional layer. - conv (nn.Conv1d): The 1D convolutional operation applied to the input. - activation (function): The activation function used to process the convolutional output. - - Methods: - __init__: Initializes the Wav2Vec2NoLayerNormConvLayer with the provided configuration and layer index. - forward: Applies the convolutional and activation operations to the input hidden_states. 
- - Note: - This class is part of the Wav2Vec2 model and is specifically designed for processing audio features without - layer normalization. - """ - def __init__(self, config: Wav2Vec2Config, layer_id=0): - """ - __init__(self, config: Wav2Vec2Config, layer_id=0) - - Initializes a new instance of the Wav2Vec2NoLayerNormConvLayer class. - - Args: - self: The instance of the class. - config (Wav2Vec2Config): An instance of the Wav2Vec2Config class containing the configuration parameters - for the Wav2Vec2 model. - layer_id (int, optional): The index of the layer. Defaults to 0. Specifies the layer for which the - convolutional layer is initialized. - - Returns: - None. - - Raises: - ValueError: If the layer_id is less than 0. - AttributeError: If the layer_id exceeds the maximum index available in the configuration parameters. - TypeError: If the provided config parameter is not an instance of the Wav2Vec2Config class. - """ + def __init__(self, config, layer_id=0): super().__init__() self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1 self.out_conv_dim = config.conv_dim[layer_id] @@ -329,56 +292,13 @@ def __init__(self, config: Wav2Vec2Config, layer_id=0): self.activation = ACT2FN[config.feat_extract_activation] def forward(self, hidden_states): - """ - Constructs the hidden states using convolutional layer and activation function. - - Args: - self (Wav2Vec2NoLayerNormConvLayer): The instance of the Wav2Vec2NoLayerNormConvLayer class. - hidden_states (torch.Tensor): The input hidden states tensor. - - Returns: - torch.Tensor: The forwarded hidden states after applying convolution and activation. - - Raises: - TypeError: If the input hidden_states is not a torch.Tensor. - """ hidden_states = self.conv(hidden_states) hidden_states = self.activation(hidden_states) return hidden_states class Wav2Vec2LayerNormConvLayer(nn.Module): - - """ - This class represents a convolutional layer with layer normalization in the Wav2Vec2 model. - It inherits from the nn.Module class. - - Attributes: - config (Wav2Vec2Config): The configuration object for the Wav2Vec2 model. - layer_id (int): The ID of the current layer. - - Methods: - __init__: - Initializes the Wav2Vec2LayerNormConvLayer with the given configuration and layer ID. - - forward: - Applies the convolutional layer with layer normalization to the input hidden states. - - """ - def __init__(self, config: Wav2Vec2Config, layer_id=0): - """ - Initialize the Wav2Vec2LayerNormConvLayer. - - Args: - config (Wav2Vec2Config): The configuration object containing the parameters for the layer. - layer_id (int, optional): The ID of the layer. Defaults to 0. - - Returns: - None - - Raises: - None - """ + def __init__(self, config, layer_id=0): super().__init__() self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1 self.out_conv_dim = config.conv_dim[layer_id] @@ -390,71 +310,22 @@ def __init__(self, config: Wav2Vec2Config, layer_id=0): stride=config.conv_stride[layer_id], bias=config.conv_bias, ) - self.layer_norm = nn.LayerNorm(self.out_conv_dim) + self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True) self.activation = ACT2FN[config.feat_extract_activation] def forward(self, hidden_states): - """ - Construct the hidden states using the Wav2Vec2LayerNormConvLayer method. - - Args: - self (Wav2Vec2LayerNormConvLayer): An instance of the Wav2Vec2LayerNormConvLayer class. - hidden_states (Tensor): The input hidden states to be processed. 
- It should have the shape (batch_size, sequence_length, feature_dim). - - Returns: - None. - - Raises: - None. - """ hidden_states = self.conv(hidden_states) - hidden_states = hidden_states.swapaxes(-2, -1) + + hidden_states = ops.transpose(hidden_states, -2, -1) hidden_states = self.layer_norm(hidden_states) - hidden_states = hidden_states.swapaxes(-2, -1) + hidden_states = ops.transpose(hidden_states, -2, -1) + hidden_states = self.activation(hidden_states) return hidden_states class Wav2Vec2GroupNormConvLayer(nn.Module): - - """ - This class represents a group normalization convolutional layer used in the Wav2Vec2 model. - It applies a 1D convolution operation followed by group normalization, activation, and layer normalization to the - input hidden states. - - Args: - config (Wav2Vec2Config): The configuration object containing the settings for the Wav2Vec2 model. - layer_id (int, optional): The index of the convolutional layer in the model. Defaults to 0. - - Attributes: - in_conv_dim (int): The input dimension of the convolutional layer. - out_conv_dim (int): The output dimension of the convolutional layer. - conv (nn.Conv1d): The 1D convolutional layer used to process the hidden states. - activation (function): The activation function applied to the processed hidden states. - layer_norm (nn.GroupNorm): The group normalization layer applied to the hidden states. - - Methods: - forward: Applies the convolutional layer, normalization, activation, and returns the processed hidden states. - - """ - def __init__(self, config: Wav2Vec2Config, layer_id=0): - """ - Initializes an instance of the Wav2Vec2GroupNormConvLayer class. - - Args: - self: The current instance of the class. - config (Wav2Vec2Config): An instance of the Wav2Vec2Config class containing configuration settings. - layer_id (int): The index of the convolutional layer within the configuration. Defaults to 0. - - Returns: - None. - - Raises: - ValueError: If the layer_id is less than 0. - KeyError: If the specified activation function in config is not found in the ACT2FN dictionary. - ValueError: If the specified pad_mode in the nn.Conv1d function is not 'valid'. - """ + def __init__(self, config, layer_id=0): super().__init__() self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1 self.out_conv_dim = config.conv_dim[layer_id] @@ -467,68 +338,18 @@ def __init__(self, config: Wav2Vec2Config, layer_id=0): bias=config.conv_bias, ) self.activation = ACT2FN[config.feat_extract_activation] + self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True) def forward(self, hidden_states): - """ - This method forwards a group normalization convolutional layer for the Wav2Vec2 model. - - Args: - self (Wav2Vec2GroupNormConvLayer): The instance of the Wav2Vec2GroupNormConvLayer class. - hidden_states (torch.Tensor): The input tensor representing the hidden states to be processed by the group normalization convolutional layer. - - Returns: - torch.Tensor: The processed tensor representing the hidden states after passing through the group normalization convolutional layer. - - Raises: - None. 
- """ hidden_states = self.conv(hidden_states) - hidden_states = self.layer_norm(hidden_states.unsqueeze(-1)).squeeze(-1) # tmfix: GroupNorm only support 4D + hidden_states = self.layer_norm(hidden_states) hidden_states = self.activation(hidden_states) return hidden_states class Wav2Vec2PositionalConvEmbedding(nn.Module): - - """ - This class represents a positional convolutional embedding layer in the Wav2Vec2 model architecture. - It inherits from nn.Module and is designed to process hidden states through convolutional and activation operations. - - Attributes: - config: Wav2Vec2Config - An instance of Wav2Vec2Config containing configuration parameters for the layer. - - Methods: - __init__: - Initializes the Wav2Vec2PositionalConvEmbedding with the provided configuration. - - forward: - Applies positional convolutional embedding operations on the input hidden_states and returns the - transformed output. - - Usage: - Instantiate this class by providing a Wav2Vec2Config object as configuration, then call the forward method - with hidden states to process them. - - Note: - This class utilizes a convolutional layer, padding layer, and activation function to process hidden states - efficiently. - """ - def __init__(self, config: Wav2Vec2Config): - """ - Initializes a new instance of the Wav2Vec2PositionalConvEmbedding class. - - Args: - self: An instance of the Wav2Vec2PositionalConvEmbedding class. - config (Wav2Vec2Config): The configuration object containing various settings for the Wav2Vec2 model. - - Returns: - None - - Raises: - None - """ + def __init__(self, config): super().__init__() self.conv = nn.Conv1d( config.hidden_size, @@ -536,88 +357,32 @@ def __init__(self, config: Wav2Vec2Config): kernel_size=config.num_conv_pos_embeddings, padding=config.num_conv_pos_embeddings // 2, groups=config.num_conv_pos_embedding_groups, - bias=True, ) - self.conv = F.weight_norm(self.conv, name='weight', dim=2) + weight_norm = nn.utils.weight_norm + + self.conv = weight_norm(self.conv, name="weight", dim=2) + self.padding = Wav2Vec2SamePadLayer(config.num_conv_pos_embeddings) self.activation = ACT2FN[config.feat_extract_activation] def forward(self, hidden_states): - """ - This method forwards the positional convolutional embedding for the Wav2Vec2 model. - - Args: - self (Wav2Vec2PositionalConvEmbedding): The instance of the Wav2Vec2PositionalConvEmbedding class. - hidden_states (array-like): The input hidden states with shape (batch_size, sequence_length, hidden_size). - - Returns: - None: This method does not return any value. The positional convolutional embedding is applied to the - input hidden states in place. + hidden_states = ops.transpose(hidden_states, 1, 2) - Raises: - ValueError: If the input hidden_states is not in the expected format or shape. - RuntimeError: If an error occurs during the convolution or activation process. - """ - hidden_states = hidden_states.swapaxes(1, 2) hidden_states = self.conv(hidden_states) hidden_states = self.padding(hidden_states) hidden_states = self.activation(hidden_states) - hidden_states = hidden_states.swapaxes(1, 2) + + hidden_states = ops.transpose(hidden_states, 1, 2) return hidden_states class Wav2Vec2SamePadLayer(nn.Module): - - """ - This class represents a layer in the Wav2Vec2 model that performs padding removal. - - Wav2Vec2SamePadLayer is a subclass of nn.Module and is designed to remove padding from hidden states in the - Wav2Vec2 model. It is primarily used in the Wav2Vec2 model for speech recognition tasks. 
- - Attributes: - num_pad_remove (int): The number of padding elements to remove from the hidden states. - - Methods: - __init__: Initializes a new instance of the Wav2Vec2SamePadLayer class. - forward: Removes padding elements from the hidden states. - - """ def __init__(self, num_conv_pos_embeddings): - """ - Initializes an instance of the Wav2Vec2SamePadLayer class. - - Args: - self (Wav2Vec2SamePadLayer): The current instance of the Wav2Vec2SamePadLayer class. - num_conv_pos_embeddings (int): The number of convolutional positional embeddings. - It is used to determine the value of the num_pad_remove attribute. - The value must be a non-negative integer. - - Returns: - None. - - Raises: - None. - """ super().__init__() self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0 def forward(self, hidden_states): - """ - Constructs the hidden states of the Wav2Vec2SamePadLayer. - - Args: - self (Wav2Vec2SamePadLayer): An instance of the Wav2Vec2SamePadLayer class. - hidden_states (torch.Tensor): The hidden states to be processed. - Expected shape is (batch_size, sequence_length, hidden_size). - The hidden states are processed based on the `num_pad_remove` value. - - Returns: - None. - - Raises: - None. - """ if self.num_pad_remove > 0: hidden_states = hidden_states[:, :, : -self.num_pad_remove] return hidden_states @@ -625,29 +390,8 @@ def forward(self, hidden_states): class Wav2Vec2FeatureEncoder(nn.Module): """Construct the features from raw audio waveform""" - def __init__(self, config: Wav2Vec2Config): - """ - Initializes a new instance of the Wav2Vec2FeatureEncoder class. - - Args: - self: The object itself. - config (Wav2Vec2Config): - The configuration object for the feature encoder. - - - config.feat_extract_norm (str): The type of normalization to be applied during feature extraction. - - - 'group': Applies group normalization to the convolutional layers. - - 'layer': Applies layer normalization to the convolutional layers. - - - config.num_feat_extract_layers (int): The number of feature extraction layers. - - Returns: - None. - - Raises: - ValueError: If `config.feat_extract_norm` is not one of ['group', 'layer']. - """ + def __init__(self, config): super().__init__() if config.feat_extract_norm == "group": @@ -663,74 +407,35 @@ def __init__(self, config: Wav2Vec2Config): f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']" ) self.conv_layers = nn.ModuleList(conv_layers) + self.gradient_checkpointing = False self._requires_grad = True def _freeze_parameters(self): - """ - Freezes the parameters of the Wav2Vec2FeatureEncoder. - - Args: - self: An instance of the Wav2Vec2FeatureEncoder class. - - Returns: - None. - - Raises: - None. - """ - for _, param in self.parameters_and_names(): + for param in self.parameters(): param.requires_grad = False self._requires_grad = False def forward(self, input_values): - """ - Method 'forward' in the class 'Wav2Vec2FeatureEncoder' forwards the hidden states from the input values - using convolutional layers. - - Args: - self (object): The instance of the class. - input_values (tensor): The input values for forwarding hidden states. It is expected to be a 2D tensor. + hidden_states = input_values[:, None] - Returns: - tensor: The forwarded hidden states after passing through the convolutional layers. 
+ # make sure hidden_states require grad for gradient_checkpointing + if self._requires_grad and self.training: + hidden_states.requires_grad = True - Raises: - None - """ - hidden_states = input_values[:, None] for conv_layer in self.conv_layers: - hidden_states = conv_layer(hidden_states) + if self._requires_grad and self.gradient_checkpointing and self.training: + hidden_states = self._gradient_checkpointing_func( + conv_layer.__call__, + hidden_states, + ) + else: + hidden_states = conv_layer(hidden_states) + return hidden_states class Wav2Vec2FeatureExtractor(Wav2Vec2FeatureEncoder): - - """ - Wav2Vec2FeatureExtractor is a class that represents a feature extractor for Wav2Vec2 models. - It is designed to extract features from audio data for use in Wav2Vec2 models. - - This class inherits from Wav2Vec2FeatureEncoder, and it is recommended to use Wav2Vec2FeatureEncoder instead of - this class, as Wav2Vec2FeatureExtractor has been deprecated. - - Please refer to the documentation for Wav2Vec2FeatureEncoder for feature extraction and encoding in Wav2Vec2 models. - """ - def __init__(self, config: Wav2Vec2Config): - """ - This method initializes an instance of the Wav2Vec2FeatureExtractor class. - - Args: - self: The instance of the class. - config (Wav2Vec2Config): An instance of the Wav2Vec2Config class containing the configuration parameters - for the feature extractor. - - Returns: - None. - - Raises: - FutureWarning: If the class Wav2Vec2FeatureExtractor is used, a FutureWarning is raised indicating that - the class has been depreciated. It is recommended to use the base - class instead. - """ + def __init__(self, config): super().__init__(config) warnings.warn( f"The class `{self.__class__.__name__}` has been depreciated " @@ -741,63 +446,13 @@ class instead. class Wav2Vec2FeatureProjection(nn.Module): - - """ - Wav2Vec2FeatureProjection is a Python class that represents a feature projection module for Wav2Vec2. - This class inherits from nn.Module and contains methods for initializing the feature projection and forwarding the - hidden states. - - The __init__ method initializes the feature projection module by setting up layer normalization, dense projection, - and dropout. - - The forward method applies layer normalization to the hidden states, projects the normalized states using dense - projection, and applies dropout to the projected states before returning the hidden states and the normalized - hidden states. - """ - def __init__(self, config: Wav2Vec2Config): - """ - Initializes the Wav2Vec2FeatureProjection class. - - Args: - self: The instance of the Wav2Vec2FeatureProjection class. - config (Wav2Vec2Config): An instance of the Wav2Vec2Config class containing the configuration parameters - for the Wav2Vec2 feature projection. It specifies the configuration for the layer - normalization, projection, and dropout layers. - - Returns: - None. - - Raises: - TypeError: If the config parameter is not of type Wav2Vec2Config. - ValueError: If the config.conv_dim[-1] is not valid or if the config.hidden_size is not valid. - RuntimeError: If an error occurs during the initialization of layer normalization, projection, - or dropout layers. 
- """ + def __init__(self, config): super().__init__() self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps) self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size) - self.dropout = nn.Dropout(p=config.feat_proj_dropout) + self.dropout = nn.Dropout(config.feat_proj_dropout) def forward(self, hidden_states): - """ - This method forwards the hidden states by applying layer normalization, projection, and dropout. - - Args: - self (Wav2Vec2FeatureProjection): The instance of the Wav2Vec2FeatureProjection class. - hidden_states (Tensor): The input hidden states to be processed. It should be a tensor of shape - (batch_size, sequence_length, feature_dim). - - Returns: - Tuple[Tensor, Tensor]: - A tuple containing two tensors: - - - hidden_states (Tensor): The processed hidden states after applying layer normalization, projection, - and dropout. - - norm_hidden_states (Tensor): The normalized hidden states obtained after applying layer normalization. - - Raises: - None. - """ # non-projected hidden states are needed for quantization norm_hidden_states = self.layer_norm(hidden_states) hidden_states = self.projection(norm_hidden_states) @@ -808,6 +463,7 @@ def forward(self, hidden_states): # Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Wav2Vec2 class Wav2Vec2Attention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" + def __init__( self, embed_dim: int, @@ -818,24 +474,6 @@ def __init__( is_causal: bool = False, config: Optional[Wav2Vec2Config] = None, ): - """ - Initializes an instance of the Wav2Vec2Attention class. - - Args: - embed_dim (int): The dimension of the input embeddings. - num_heads (int): The number of attention heads. - dropout (float, optional): The dropout probability. Defaults to 0.0. - is_decoder (bool, optional): Whether the attention module is used as a decoder. Defaults to False. - bias (bool, optional): Whether to include bias in linear projections. Defaults to True. - is_causal (bool, optional): Whether the attention is causal. Defaults to False. - config (Optional[Wav2Vec2Config], optional): The configuration object. Defaults to None. - - Returns: - None - - Raises: - ValueError: If embed_dim is not divisible by num_heads. - """ super().__init__() self.embed_dim = embed_dim self.num_heads = num_heads @@ -857,34 +495,20 @@ def __init__( self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - def _shape(self, tensor: Tensor, seq_len: int, bsz: int): - """ - This method '_shape' is defined in the class 'Wav2Vec2Attention' and is used to reshape the input tensor to - the specified shape. - - Args: - tensor (Tensor): The input tensor to be reshaped. It should be of type Tensor. - seq_len (int): The length of the sequence. It should be an integer. - bsz (int): The batch size. It should be an integer. - - Returns: - None. - - Raises: - None. 
- """ - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).swapaxes(1, 2) + def _shape(self, tensor: mindspore.Tensor, seq_len: int, bsz: int): + return ops.transpose(tensor.view(bsz, seq_len, self.num_heads, self.head_dim), 1, 2) def forward( self, - hidden_states: Tensor, - key_value_states: Optional[Tensor] = None, - past_key_value: Optional[Tuple[Tensor]] = None, - attention_mask: Optional[Tensor] = None, - layer_head_mask: Optional[Tensor] = None, + hidden_states: mindspore.Tensor, + key_value_states: Optional[mindspore.Tensor] = None, + past_key_value: Optional[Tuple[mindspore.Tensor]] = None, + attention_mask: Optional[mindspore.Tensor] = None, + layer_head_mask: Optional[mindspore.Tensor] = None, output_attentions: bool = False, - ) -> Tuple[Tensor, Optional[Tensor], Optional[Tuple[Tensor]]]: + ) -> Tuple[mindspore.Tensor, Optional[mindspore.Tensor], Optional[Tuple[mindspore.Tensor]]]: """Input shape: Batch x Time x Channel""" + # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None @@ -921,10 +545,10 @@ def forward( value_states = self._shape(self.v_proj(hidden_states), -1, bsz) if self.is_decoder: - # if cross_attention save Tuple(Tensor, Tensor) of all cross attention key/value_states. + # if cross_attention save Tuple(mindspore.Tensor, mindspore.Tensor) of all cross attention key/value_states. # Further calls to cross_attention layer can then reuse all cross-attention # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(Tensor, Tensor) of + # if uni-directional self-attention (decoder) save Tuple(mindspore.Tensor, mindspore.Tensor) of # all previous decoder key/value_states. Further calls to uni-directional self-attention # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) # if encoder bi-directional self-attention `past_key_value` is always `None` @@ -936,7 +560,7 @@ def forward( value_states = value_states.reshape(*proj_shape) src_len = key_states.shape[1] - attn_weights = ops.bmm(query_states, key_states.swapaxes(1, 2)) + attn_weights = ops.bmm(query_states, ops.transpose(key_states, 1, 2)) if attn_weights.shape != (bsz * self.num_heads, tgt_len, src_len): raise ValueError( @@ -952,7 +576,7 @@ def forward( attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - attn_weights = ops.softmax(attn_weights, dim=-1) + attn_weights = nn.functional.softmax(attn_weights, dim=-1) if layer_head_mask is not None: if layer_head_mask.shape != (self.num_heads,): @@ -973,7 +597,7 @@ def forward( else: attn_weights_reshaped = None - attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training) + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) attn_output = ops.bmm(attn_probs, value_states) @@ -984,7 +608,7 @@ def forward( ) attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) - attn_output = attn_output.swapaxes(1, 2) + attn_output = ops.transpose(attn_output, 1, 2) # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be # partitioned across GPUs when using tensor-parallelism. 
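(Note on the `_shape` change in the hunk above: the helper only reorders axes, so swapping `Tensor.swapaxes(1, 2)` for the torch-style `ops.transpose(tensor, 1, 2)` should be a pure API rename with identical layout semantics. Below is a minimal NumPy sketch of that layout transformation, not the patch's own code; the dimension sizes are illustrative and not taken from any Wav2Vec2 checkpoint.)

```python
import numpy as np

# Illustrative sizes only; real values come from the model config.
bsz, seq_len, num_heads, head_dim = 2, 5, 4, 8
embed_dim = num_heads * head_dim

# Projected states as produced by q_proj / k_proj / v_proj: (batch, seq_len, embed_dim).
states = np.random.randn(bsz, seq_len, embed_dim)

# Old path: split heads, then exchange the seq_len and head axes with swapaxes(1, 2).
old = states.reshape(bsz, seq_len, num_heads, head_dim).swapaxes(1, 2)

# New path: same split, axes exchanged via an explicit permutation (torch-style transpose(1, 2)).
new = states.reshape(bsz, seq_len, num_heads, head_dim).transpose(0, 2, 1, 3)

assert old.shape == new.shape == (bsz, num_heads, seq_len, head_dim)
assert np.array_equal(old, new)

# Either result is then flattened to (bsz * num_heads, seq_len, head_dim) for the bmm-based attention.
flat = new.reshape(bsz * num_heads, seq_len, head_dim)
assert flat.shape == (bsz * num_heads, seq_len, head_dim)
```

(The same equivalence applies to the other `swapaxes` → `ops.transpose` substitutions in this file.)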
@@ -995,49 +619,15 @@ def forward( return attn_output, attn_weights_reshaped, past_key_value -class Wav2Vec2FeedForward(nn.Module): - - """ - Wav2Vec2FeedForward is a class representing the feedforward network for the Wav2Vec2 model. - This class inherits from nn.Module and contains methods for initializing the network and forwarding the - feedforward layers. - - The __init__ method initializes the feedforward network with the provided configuration. - It sets up the intermediate dropout, intermediate dense, intermediate activation function, output dense, and output - dropout layers based on the configuration parameters. - - The forward method takes hidden states as input and processes them through the intermediate dense layer, - intermediate activation function, intermediate dropout layer, output dense layer, and output dropout layer. - It then returns the processed hidden states. - - Note: - This docstring is based on the provided code snippet and may need to be updated with additional information once - the entire class implementation is available. - """ - def __init__(self, config: Wav2Vec2Config): - """ - Initialize the Wav2Vec2FeedForward class. +WAV2VEC2_ATTENTION_CLASSES = { + "eager": Wav2Vec2Attention, +} - Args: - self: Instance of the class. - config (Wav2Vec2Config): Configuration object containing parameters for initialization. - The config parameter is of type Wav2Vec2Config and holds the configuration settings required for - initializing the feed-forward module. - It is expected to contain the following attributes: - - activation_dropout (float): Dropout probability for intermediate layers. - - hidden_size (int): Size of the hidden layers. - - intermediate_size (int): Size of the intermediate layer. - - hidden_act (str or function): Activation function for the hidden layers. - - Returns: - None. - - Raises: - None. - """ +class Wav2Vec2FeedForward(nn.Module): + def __init__(self, config): super().__init__() - self.intermediate_dropout = nn.Dropout(p=config.activation_dropout) + self.intermediate_dropout = nn.Dropout(config.activation_dropout) self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size) if isinstance(config.hidden_act, str): @@ -1046,30 +636,9 @@ def __init__(self, config: Wav2Vec2Config): self.intermediate_act_fn = config.hidden_act self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.output_dropout = nn.Dropout(p=config.hidden_dropout) + self.output_dropout = nn.Dropout(config.hidden_dropout) def forward(self, hidden_states): - """ - Constructs the feed-forward network for the Wav2Vec2 model. - - Args: - self (Wav2Vec2FeedForward): An instance of the Wav2Vec2FeedForward class. - hidden_states (torch.Tensor): The input hidden states to be passed through the feed-forward network. - - Returns: - torch.Tensor: The output hidden states after passing through the feed-forward network. - - Raises: - TypeError: If the input hidden_states is not of type torch.Tensor. - ValueError: If the input hidden_states does not have a rank of 2. - - This method takes the input hidden states and passes them through a feed-forward network consisting of several - layers. The feed-forward network is forwarded using intermediate dense layers, activation functions, - and dropout layers. The hidden_states are first passed through the intermediate dense layer, followed by the - intermediate activation function and dropout layer. The resulting hidden_states are then passed through the - output dense layer and another dropout layer. 
The final output hidden_states are returned. - Note that the input hidden_states must be a tensor of rank 2, representing a batch of hidden states. - """ hidden_states = self.intermediate_dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) hidden_states = self.intermediate_dropout(hidden_states) @@ -1080,94 +649,21 @@ def forward(self, hidden_states): class Wav2Vec2EncoderLayer(nn.Module): - - """A class representing an encoder layer of the Wav2Vec2 model. - - The Wav2Vec2EncoderLayer class inherits from the nn.Module class and implements the functionality of a single encoder - layer in the Wav2Vec2 model architecture. It consists of multiple sub-modules, including an attention mechanism, - dropout layers, layer normalization, and a feed-forward neural network. - - Attributes: - attention (Wav2Vec2Attention): The attention mechanism used in the layer. - dropout (nn.Dropout): The dropout layer applied to the hidden states. - layer_norm (nn.LayerNorm): The layer normalization applied to the hidden states. - feed_forward (Wav2Vec2FeedForward): The feed-forward neural network used in the layer. - final_layer_norm (nn.LayerNorm): The final layer normalization applied to the hidden states. - - Methods: - forward(hidden_states, attention_mask=None, output_attentions=False): - Applies the forward pass of the encoder layer. - - Args: - - - hidden_states (Tensor): The input hidden states. - - attention_mask (Tensor, optional): The attention mask to apply to the attention mechanism (default: None). - - output_attentions (bool, optional): Whether to return the attention weights (default: False). - - Returns: - - - outputs (tuple): A tuple containing the output hidden states. If output_attentions is True, the tuple - also contains the attention weights. - - Note: - The Wav2Vec2EncoderLayer class is designed to be used within the Wav2Vec2Encoder class, which stacks multiple - encoder layers to form the complete Wav2Vec2 model. - """ - def __init__(self, config: Wav2Vec2Config): - """ - Initializes a Wav2Vec2EncoderLayer instance. - - Args: - self (Wav2Vec2EncoderLayer): The instance of the Wav2Vec2EncoderLayer class. - config (Wav2Vec2Config): - An instance of Wav2Vec2Config containing configuration parameters for the encoder layer. - - - Wav2Vec2Config.hidden_size (int): The hidden size for the encoder layer. - - Wav2Vec2Config.num_attention_heads (int): The number of attention heads in the attention mechanism. - - Wav2Vec2Config.attention_dropout (float): The dropout probability for the attention mechanism. - - Wav2Vec2Config.hidden_dropout (float): The dropout probability for the hidden layers. - - Wav2Vec2Config.layer_norm_eps (float): The epsilon value for layer normalization. - - Returns: - None. - - Raises: - None. - """ + def __init__(self, config): super().__init__() - self.attention = Wav2Vec2Attention( + self.attention = WAV2VEC2_ATTENTION_CLASSES[config._attn_implementation]( embed_dim=config.hidden_size, num_heads=config.num_attention_heads, dropout=config.attention_dropout, is_decoder=False, ) - self.dropout = nn.Dropout(p=config.hidden_dropout) + + self.dropout = nn.Dropout(config.hidden_dropout) self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.feed_forward = Wav2Vec2FeedForward(config) self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) def forward(self, hidden_states, attention_mask=None, output_attentions=False): - """ - Constructs the Wav2Vec2EncoderLayer. 
- - This method applies the Wav2Vec2EncoderLayer to the input hidden_states. It performs attention, residual - connections, layer normalization, feed-forward, and final layer normalization. - - Args: - self (Wav2Vec2EncoderLayer): The instance of the Wav2Vec2EncoderLayer class. - hidden_states (torch.Tensor): The input hidden states of shape (batch_size, sequence_length, hidden_size). - attention_mask (torch.Tensor, optional): The attention mask of shape (batch_size, sequence_length). - Defaults to None. - output_attentions (bool, optional): Whether to output the attention weights. Defaults to False. - - Returns: - tuple: A tuple containing the hidden states of shape (batch_size, sequence_length, hidden_size). - If output_attentions is True, the tuple also contains the attention weights of shape (batch_size, - num_heads, sequence_length, sequence_length). - - Raises: - None - """ attn_residual = hidden_states hidden_states, attn_weights, _ = self.attention( hidden_states, attention_mask=attention_mask, output_attentions=output_attentions @@ -1188,59 +684,15 @@ def forward(self, hidden_states, attention_mask=None, output_attentions=False): class Wav2Vec2EncoderLayerStableLayerNorm(nn.Module): - - """ - This class represents an encoder layer in the Wav2Vec2 model with stable layer normalization. - It inherits from the nn.Module class. - - Attributes: - attention (Wav2Vec2Attention): An instance of the Wav2Vec2Attention class for attention mechanism. - dropout (nn.Dropout): An instance of the nn.Dropout class for dropout regularization. - layer_norm (nn.LayerNorm): An instance of the nn.LayerNorm class for stable layer normalization. - feed_forward (Wav2Vec2FeedForward): An instance of the Wav2Vec2FeedForward class for feed-forward layer. - final_layer_norm (nn.LayerNorm): An instance of the nn.LayerNorm class for stable layer normalization of final - output. - adapter_layer (Wav2Vec2AttnAdapterLayer or None): An instance of the Wav2Vec2AttnAdapterLayer class for adapter - layer, if provided. None otherwise. - - Methods: - forward: - Applies the encoder layer operations on the input hidden states. - - Args: - - - hidden_states (Tensor): The input hidden states. - - attention_mask (Optional[Tensor]): The attention mask tensor, if provided. Defaults to None. - - output_attentions (bool): Whether to output attention weights. Defaults to False. - - Returns: - - - Tuple[Tensor, Union[Tensor, None]]: A tuple containing the final hidden states and optionally the - attention weights, if output_attentions is True. - """ - def __init__(self, config: Wav2Vec2Config): - """ - Initializes a new instance of the Wav2Vec2EncoderLayerStableLayerNorm class. - - Args: - self: The instance of the class. - config (Wav2Vec2Config): The configuration object containing the settings for the encoder layer. - It should be an instance of the Wav2Vec2Config class. - - Returns: - None. - - Raises: - None. 
- """ + def __init__(self, config): super().__init__() - self.attention = Wav2Vec2Attention( + self.attention = WAV2VEC2_ATTENTION_CLASSES[config._attn_implementation]( embed_dim=config.hidden_size, num_heads=config.num_attention_heads, dropout=config.attention_dropout, is_decoder=False, ) - self.dropout = nn.Dropout(p=config.hidden_dropout) + self.dropout = nn.Dropout(config.hidden_dropout) self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.feed_forward = Wav2Vec2FeedForward(config) self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -1252,27 +704,10 @@ def __init__(self, config: Wav2Vec2Config): def forward( self, - hidden_states: Tensor, - attention_mask: Optional[Tensor] = None, + hidden_states: mindspore.Tensor, + attention_mask: Optional[mindspore.Tensor] = None, output_attentions: bool = False, ): - """ - Constructs the Wav2Vec2EncoderLayerStableLayerNorm. - - Args: - self: Instance of the Wav2Vec2EncoderLayerStableLayerNorm class. - hidden_states (Tensor): The input hidden states to be processed by the encoder layer. - attention_mask (Optional[Tensor]): Optional tensor representing the attention mask. - Defaults to None. If provided, masks certain elements in the attention computation. - output_attentions (bool): Flag indicating whether to output attention weights during computation. - Defaults to False. - - Returns: - Tuple: A tuple containing the processed hidden states and optionally the attention weights. - - Raises: - None. - """ attn_residual = hidden_states hidden_states = self.layer_norm(hidden_states) hidden_states, attn_weights, _ = self.attention( @@ -1294,103 +729,41 @@ def forward( class Wav2Vec2Encoder(nn.Module): - - """ - A class representing the Wav2Vec2Encoder in the Wav2Vec2 model architecture. - - The Wav2Vec2Encoder is responsible for encoding the input hidden states with positional embeddings and applying - a series of Wav2Vec2EncoderLayer for feature extraction. - - Attributes: - config (Wav2Vec2Config): The configuration for the Wav2Vec2 model. - pos_conv_embed (Wav2Vec2PositionalConvEmbedding): The positional convolutional embedding layer. - layer_norm (nn.LayerNorm): The layer normalization layer. - dropout (nn.Dropout): The dropout layer. - layers (nn.ModuleList): The list of Wav2Vec2EncoderLayer instances. - - Methods: - forward(hidden_states, attention_mask=None, output_attentions=False, output_hidden_states=False, return_dict=True): - Applies the Wav2Vec2Encoder layer-wise to the hidden states. - - Args: - - - hidden_states (Tensor): The input hidden states. - - attention_mask (Optional[Tensor], optional): The attention mask tensor. Defaults to None. - - output_attentions (bool, optional): Whether to output the attentions. Defaults to False. - - output_hidden_states (bool, optional): Whether to output the hidden states. Defaults to False. - - return_dict (bool, optional): Whether to return a BaseModelOutput dictionary. Defaults to True. - - Returns: - - - BaseModelOutput or Tuple[Tensor, Tuple[Tensor], Tuple[Tensor]]: The encoded hidden states, all hidden - states (if output_hidden_states=True), and all self-attentions (if output_attentions=True). - """ - def __init__(self, config: Wav2Vec2Config): - """ - Initializes the Wav2Vec2Encoder class. - - Args: - self: The instance of the class. - config (Wav2Vec2Config): An instance of the Wav2Vec2Config class containing the configuration parameters - for the encoder. 
It specifies the configuration for the Wav2Vec2 model, such as hidden size, - layer normalization epsilon, hidden dropout probability, and the number of hidden layers. - - Returns: - None. - - Raises: - None: This method does not raise any exceptions explicitly. However, exceptions may be raised during the - initialization of the Wav2Vec2PositionalConvEmbedding, nn.LayerNorm, nn.Dropout, and nn.ModuleList objects. - """ + def __init__(self, config): super().__init__() self.config = config self.pos_conv_embed = Wav2Vec2PositionalConvEmbedding(config) self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(p=config.hidden_dropout) + self.dropout = nn.Dropout(config.hidden_dropout) self.layers = nn.ModuleList([Wav2Vec2EncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" def forward( self, - hidden_states: Tensor, - attention_mask: Optional[Tensor] = None, + hidden_states: mindspore.tensor, + attention_mask: Optional[mindspore.Tensor] = None, output_attentions: bool = False, output_hidden_states: bool = False, return_dict: bool = True, ): - """ - Constructs the Wav2Vec2Encoder. - - Args: - self (Wav2Vec2Encoder): The instance of the Wav2Vec2Encoder class. - hidden_states (Tensor): The input hidden states. A tensor of shape (batch_size, sequence_length, hidden_size). - attention_mask (Optional[Tensor]): An optional tensor specifying the attention mask. Defaults to None. - output_attentions (bool): Whether to output attentions. Defaults to False. - output_hidden_states (bool): Whether to output hidden states. Defaults to False. - return_dict (bool): Whether to return a dictionary. Defaults to True. - - Returns: - None. - - Raises: - ValueError: If the hidden_states tensor has invalid shape or type. - ValueError: If the attention_mask tensor has invalid shape or type. - TypeError: If the output_attentions or output_hidden_states parameters are not of type bool. - TypeError: If the return_dict parameter is not of type bool. 
- """ all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None if attention_mask is not None: # make sure padded tokens output 0 - expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + expand_attention_mask = attention_mask.unsqueeze(-1).tile((1, 1, hidden_states.shape[2])) hidden_states[~expand_attention_mask] = 0 - - # extend attention_mask - attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype) - attention_mask = attention_mask * float(ops.finfo(hidden_states.dtype).min) - attention_mask = attention_mask.expand( - attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] - ) + if self._use_flash_attention_2: + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + else: + # extend attention_mask + attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype) + attention_mask = attention_mask * float(ops.finfo(hidden_states.dtype).min) + attention_mask = attention_mask.broadcast_to( + (attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]) + ) position_embeddings = self.pos_conv_embed(hidden_states) hidden_states = hidden_states + position_embeddings @@ -1406,9 +779,18 @@ def forward( skip_the_layer = self.training and (dropout_probability < self.config.layerdrop) if not skip_the_layer: - layer_outputs = layer( - hidden_states, attention_mask=attention_mask, output_attentions=output_attentions - ) + # under deepspeed zero3 all gpus must run in sync + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer.__call__, + hidden_states, + attention_mask, + output_attentions, + ) + else: + layer_outputs = layer( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) hidden_states = layer_outputs[0] if skip_the_layer: @@ -1430,46 +812,17 @@ def forward( class Wav2Vec2EncoderStableLayerNorm(nn.Module): - - """ - Wav2Vec2EncoderStableLayerNorm is a Python class that represents an encoder with stable layer normalization for - the Wav2Vec2 model. This class inherits from the nn.Module module. - - This class initializes with a Wav2Vec2Config object and forwards a series of encoder layers with stable - layer normalization. The encoder layers operate on the input hidden states and optionally apply - attention masks, producing hidden states with added positional embeddings and layer normalization. - - The forward method applies the encoder layers to the input hidden states, handling attention masks, - outputting hidden states, and attentions based on the specified configurations. - - This class provides functionalities for building and using a stable layer normalization encoder for the Wav2Vec2 - model, supporting various output options and configurations. - - For detailed information on the class methods and usage, please refer to the specific method docstrings within - the source code. - """ - def __init__(self, config: Wav2Vec2Config): - """ - Initializes an instance of the Wav2Vec2EncoderStableLayerNorm class. - - Args: - self: The object instance. - config (Wav2Vec2Config): The configuration object for the Wav2Vec2 model. 
- - Returns: - None - - Raises: - None - """ + def __init__(self, config): super().__init__() self.config = config self.pos_conv_embed = Wav2Vec2PositionalConvEmbedding(config) self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(p=config.hidden_dropout) + self.dropout = nn.Dropout(config.hidden_dropout) self.layers = nn.ModuleList( [Wav2Vec2EncoderLayerStableLayerNorm(config) for _ in range(config.num_hidden_layers)] ) + self.gradient_checkpointing = False + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" def forward( self, @@ -1479,43 +832,29 @@ def forward( output_hidden_states=False, return_dict=True, ): - """ - Constructs the Wav2Vec2EncoderStableLayerNorm. - - Args: - - - hidden_states: The input hidden states of shape (batch_size, sequence_length, hidden_size). - - attention_mask: Optional attention mask of shape (batch_size, sequence_length). - It is used to mask the attention scores. - - output_attentions: Boolean flag indicating whether to output attention weights. Defaults to False. - - output_hidden_states: Boolean flag indicating whether to output hidden states of all layers. Defaults to False. - - return_dict: Boolean flag indicating whether to return a dictionary as output. Defaults to True. - - Returns: - None - - Raises: - None - """ all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None if attention_mask is not None: # make sure padded tokens are not attended to - expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + expand_attention_mask = attention_mask.unsqueeze(-1).tile((1, 1, hidden_states.shape[2])) hidden_states[~expand_attention_mask] = 0 - - # extend attention_mask - attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype) - attention_mask = attention_mask * float(ops.finfo(hidden_states.dtype).min) - attention_mask = attention_mask.expand( - attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] - ) + if self._use_flash_attention_2: + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + else: + # extend attention_mask + attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype) + attention_mask = attention_mask * float(ops.finfo(hidden_states.dtype).min) + attention_mask = attention_mask.broadcast_to( + (attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]) + ) position_embeddings = self.pos_conv_embed(hidden_states) hidden_states = hidden_states + position_embeddings hidden_states = self.dropout(hidden_states) + for layer in self.layers: if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) @@ -1525,9 +864,17 @@ def forward( skip_the_layer = self.training and (dropout_probability < self.config.layerdrop) if not skip_the_layer: - layer_outputs = layer( - hidden_states, attention_mask=attention_mask, output_attentions=output_attentions - ) + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer.__call__, + hidden_states, + attention_mask, + output_attentions, + ) + else: + layer_outputs = layer( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) hidden_states = layer_outputs[0] if skip_the_layer: @@ -1550,30 +897,13 @@ def forward( ) -class 
Wav2Vec2GumbelVectorQuantizer(nn.Module): - """ - Vector quantization using gumbel softmax. See `[CATEGORICAL REPARAMETERIZATION WITH - GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf) for more information. - """ - def __init__(self, config: Wav2Vec2Config): - """ - Initializes a new instance of the Wav2Vec2GumbelVectorQuantizer class. - - Args: - self: The instance of the Wav2Vec2GumbelVectorQuantizer class. - config (Wav2Vec2Config): An instance of the Wav2Vec2Config class containing configuration parameters - for the vector quantizer. - - - num_codevector_groups (int): The number of codevector groups. - - num_codevectors_per_group (int): The number of codevectors per group. - - codevector_dim (int): The dimension of the codevectors. - - Returns: - None. +class Wav2Vec2GumbelVectorQuantizer(nn.Module): + """ + Vector quantization using gumbel softmax. See `[CATEGORICAL REPARAMETERIZATION WITH + GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf) for more information. + """ - Raises: - ValueError: If `config.codevector_dim` is not divisible by `config.num_codevector_groups` for concatenation. - """ + def __init__(self, config): super().__init__() self.num_groups = config.num_codevector_groups self.num_vars = config.num_codevectors_per_group @@ -1585,8 +915,8 @@ def __init__(self, config: Wav2Vec2Config): ) # storage for codebook variables (codewords) - self.codevectors = Parameter( - ops.zeros((1, self.num_groups * self.num_vars, config.codevector_dim // self.num_groups)) + self.codevectors = nn.Parameter( + ops.randn(1, self.num_groups * self.num_vars, config.codevector_dim // self.num_groups) ) self.weight_proj = nn.Linear(config.conv_dim[-1], self.num_groups * self.num_vars) @@ -1595,53 +925,17 @@ def __init__(self, config: Wav2Vec2Config): @staticmethod def _compute_perplexity(probs, mask=None): - """ - Compute the perplexity of given probability distribution. - - Args: - probs (Tensor): The input probability distribution. It should be a tensor of shape (N, D) where N is the - number of elements and D is the dimensionality of the distribution. mask (Tensor, optional): - A boolean tensor of the same shape as probs, indicating which elements to include in the computation. - If provided, only the elements where mask is True will be considered. Defaults to None. - - Returns: - None: This method does not return anything but updates the internal state of the class. - - Raises: - ValueError: If the shape of probs and mask do not match. - ValueError: If the dimensionality of probs is not 2. - """ if mask is not None: - mask_extended = mask.flatten()[:, None, None].expand(probs.shape) + mask_extended = mask.flatten()[:, None, None].broadcast_to(probs.shape) probs = ops.where(mask_extended, probs, ops.zeros_like(probs)) - marginal_probs = probs.sum(axis=0) / mask.sum() + marginal_probs = ops.sum(probs, dim=0) / mask.sum() else: - marginal_probs = probs.mean(axis=0) + marginal_probs = ops.mean(probs, dim=0) perplexity = ops.exp(-ops.sum(marginal_probs * ops.log(marginal_probs + 1e-7), dim=-1)).sum() return perplexity def forward(self, hidden_states, mask_time_indices=None): - ''' - Constructs codevectors and computes perplexity for Wav2Vec2GumbelVectorQuantizer. - - Args: - self: The instance of the Wav2Vec2GumbelVectorQuantizer class. - hidden_states (tensor): The input hidden states with shape (batch_size, sequence_length, hidden_size). 
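`_compute_perplexity` above is the exponentiated entropy of the marginal codevector distribution, summed over groups; it later feeds the diversity loss, where higher perplexity means the codebook is used more evenly. A small NumPy restatement of the same formula, with illustrative shapes:

```python
import numpy as np
from typing import Optional

def compute_perplexity(probs: np.ndarray, mask: Optional[np.ndarray] = None) -> float:
    """probs: (frames, num_groups, num_vars) soft codevector distributions; mask: (frames,) bool."""
    if mask is not None:
        probs = np.where(mask[:, None, None], probs, 0.0)
        marginal = probs.sum(axis=0) / mask.sum()      # average over unmasked frames only
    else:
        marginal = probs.mean(axis=0)                  # (num_groups, num_vars)
    entropy = -np.sum(marginal * np.log(marginal + 1e-7), axis=-1)
    return float(np.exp(entropy).sum())                # exp(entropy) per group, summed over groups

uniform = np.full((10, 1, 4), 0.25)                    # uniform over 4 codevectors in a single group
print(compute_perplexity(uniform))                     # close to 4.0: every codevector used equally
```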
- mask_time_indices (tensor, optional): A binary mask tensor of shape (batch_size, sequence_length) where - 1s indicate valid time indices and 0s indicate masked time indices. Default is None. - - Returns: - tuple: - A tuple containing: - - - codevectors (tensor): The forwarded codevectors with shape (batch_size, sequence_length, -1). - - perplexity (tensor): The computed perplexity. - - Raises: - ValueError: If the input hidden_states tensor has an invalid shape. - RuntimeError: If the function encounters a runtime error during computation. - ''' batch_size, sequence_length, hidden_size = hidden_states.shape # project to codevector dim @@ -1650,8 +944,8 @@ def forward(self, hidden_states, mask_time_indices=None): if self.training: # sample code vector probs via gumbel in differentiateable way - codevector_probs = ops.gumbel_softmax( - hidden_states.float(), tau=float(self.temperature), hard=True + codevector_probs = nn.functional.gumbel_softmax( + hidden_states.float(), tau=self.temperature, hard=True ).type_as(hidden_states) # compute perplexity @@ -1662,13 +956,12 @@ def forward(self, hidden_states, mask_time_indices=None): else: # take argmax in non-differentiable way # comptute hard codevector distribution (one hot) - # NOTE: 把 hidden_states 变成 hardsoftmax(dim=-1) 形式 - codevector_idx = ops.argmax(hidden_states, dim=-1) # (364) => (364, 1) - x = hidden_states.new_zeros(hidden_states.shape) # (364, 320) - index = codevector_idx.view(-1, 1) - update = ops.ones_like(index, dtype=hidden_states.dtype) # fill with onehot - codevector_probs = ops.scatter(x, -1, index, update) - codevector_probs = codevector_probs.view(batch_size * sequence_length, self.num_groups, -1) # (182, 2, 320) + codevector_idx = ops.argmax(hidden_states, dim=-1).view(-1, 1) + codevector_probs = ops.scatter( + ops.zeros(hidden_states.shape, dtype=hidden_states.dtype), + -1, codevector_idx, ops.ones(codevector_idx.shape, dtype=hidden_states.dtype) + ) + codevector_probs = codevector_probs.view(batch_size * sequence_length, self.num_groups, -1) perplexity = self._compute_perplexity(codevector_probs, mask_time_indices) @@ -1682,39 +975,7 @@ def forward(self, hidden_states, mask_time_indices=None): class Wav2Vec2Adapter(nn.Module): - - """ - Wav2Vec2Adapter is a class that represents an adapter layer for adapting the hidden states of a Wav2Vec2 model. - This class inherits from nn.Module and implements methods for initializing and forwarding the adapter layer. - - Attributes: - proj (nn.Linear or None): A dense layer used for projecting hidden states if output_hidden_size is - different from hidden_size. - proj_layer_norm (nn.LayerNorm or None): A layer normalization module applied after projection if needed. - layers (nn.ModuleList): A list of Wav2Vec2AdapterLayer instances representing adapter layers. - layerdrop (float): The probability of dropping a layer during training. - - Methods: - __init__: Initializes the Wav2Vec2Adapter object with the provided configuration. - forward: Applies the adapter layer transformations to the input hidden states. - - """ - def __init__(self, config: Wav2Vec2Config): - """ - Initializes a new instance of the Wav2Vec2Adapter class. - - Args: - self: The current instance of the class. - config (Wav2Vec2Config): An instance of Wav2Vec2Config containing configuration parameters for the adapter. - This parameter is required for initializing the adapter and must be an instance of Wav2Vec2Config. - - Returns: - None. 
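At inference the quantizer above skips Gumbel sampling: it takes the argmax per group and scatters a 1 into a zero tensor, producing a deterministic one-hot distribution that then weights the codevectors. The `ops.scatter` call is equivalent to this NumPy construction (function name and values are illustrative):

```python
import numpy as np

def hard_codevector_probs(scores: np.ndarray, num_groups: int) -> np.ndarray:
    """scores: (frames * num_groups, num_vars) projected logits for every group."""
    idx = scores.argmax(axis=-1)                          # winning codevector per (frame, group)
    one_hot = np.zeros_like(scores)
    one_hot[np.arange(scores.shape[0]), idx] = 1.0        # same effect as the ops.scatter(...) of ones
    return one_hot.reshape(-1, num_groups, scores.shape[-1])

scores = np.array([[0.1, 2.0, -1.0, 0.3],                 # frame 0, group 0
                   [0.7, 0.2,  0.1, 0.0]])                # frame 0, group 1
print(hard_codevector_probs(scores, num_groups=2))        # one-hot rows: [0,1,0,0] and [1,0,0,0]
```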
- - Raises: - TypeError: If the config parameter is not of type Wav2Vec2Config. - ValueError: If the output_hidden_size in the config parameter does not match the hidden_size. - """ + def __init__(self, config): super().__init__() # feature dim might need to be down-projected @@ -1724,73 +985,28 @@ def __init__(self, config: Wav2Vec2Config): else: self.proj = self.proj_layer_norm = None - self.layers = nn.ModuleList([Wav2Vec2AdapterLayer(config) for _ in range(config.num_adapter_layers)]) + self.layers = nn.ModuleList(Wav2Vec2AdapterLayer(config) for _ in range(config.num_adapter_layers)) self.layerdrop = config.layerdrop def forward(self, hidden_states): - """ - This method forwards the hidden states by applying transformations and layers. - - Args: - self (object): The instance of the Wav2Vec2Adapter class. - hidden_states (numpy.ndarray): The input hidden states to be processed. - It is expected to be a 3D array with shape (batch_size, sequence_length, hidden_size). - - Returns: - numpy.ndarray: The processed hidden states with shape (batch_size, sequence_length, hidden_size). - - Raises: - None - """ # down project hidden_states if necessary if self.proj is not None and self.proj_layer_norm is not None: hidden_states = self.proj(hidden_states) hidden_states = self.proj_layer_norm(hidden_states) - hidden_states = hidden_states.swapaxes(1, 2) + hidden_states = ops.transpose(hidden_states, 1, 2) for layer in self.layers: layerdrop_prob = np.random.random() if not self.training or (layerdrop_prob > self.layerdrop): hidden_states = layer(hidden_states) - hidden_states = hidden_states.swapaxes(1, 2) + hidden_states = ops.transpose(hidden_states, 1, 2) return hidden_states class Wav2Vec2AdapterLayer(nn.Module): - - ''' - Wav2Vec2AdapterLayer is a Python class that represents an adapter layer for the Wav2Vec2 model. - This class inherits from nn.Module. - - The adapter layer contains methods for initialization and forwardion. - - The __init__ method initializes the adapter layer with the provided configuration. It sets up a 1D convolutional - layer with specified parameters such as kernel size, stride, padding, and bias. - - The forward method takes hidden_states as input and applies the convolutional layer followed by the - gated linear unit (GLU) activation function. It then returns the processed hidden states. - - This class provides functionality for creating and processing adapter layers within the Wav2Vec2 model. - ''' - def __init__(self, config: Wav2Vec2Config): - """ - __init__ - - Initializes a new instance of the Wav2Vec2AdapterLayer class. - - Args: - self: The instance of the Wav2Vec2AdapterLayer class. - config (Wav2Vec2Config): An instance of the Wav2Vec2Config class containing the configuration parameters - for the adapter layer. - - Returns: - None. - - Raises: - None. - """ + def __init__(self, config): super().__init__() self.conv = nn.Conv1d( config.output_hidden_size, @@ -1798,55 +1014,17 @@ def __init__(self, config: Wav2Vec2Config): config.adapter_kernel_size, stride=config.adapter_stride, padding=1, - bias=True, ) def forward(self, hidden_states): - """ - Method to forward the Wav2Vec2AdapterLayer. - - Args: - self (Wav2Vec2AdapterLayer): The instance of the Wav2Vec2AdapterLayer class. - hidden_states (Tensor): The input hidden states to be processed. It should be a tensor. - - Returns: - Tensor: The processed hidden states after applying convolution and gated linear units (GLU) operation. - - Raises: - None. 
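Each adapter layer above is a strided `Conv1d` that doubles the channel count so the following GLU can halve it back, while the stride shrinks the time axis; `Wav2Vec2Adapter.forward` transposes to channels-first before the loop and may skip whole layers via LayerDrop during training. A shape-only sketch of the downsampling, assuming the usual kernel size 3, stride 2 and padding 1 (illustrative, not read from a real config):

```python
def conv1d_output_length(seq_len: int, kernel_size: int = 3, stride: int = 2, padding: int = 1) -> int:
    """Standard Conv1d length formula, applied once per adapter layer."""
    return (seq_len + 2 * padding - kernel_size) // stride + 1

seq_len = 100
for _ in range(3):                       # e.g. three adapter layers
    seq_len = conv1d_output_length(seq_len)
print(seq_len)                           # 100 -> 50 -> 25 -> 13
```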
- """ hidden_states = self.conv(hidden_states) - hidden_states = F.glu(hidden_states, dim=1) + hidden_states = nn.functional.glu(hidden_states, dim=1) + return hidden_states class Wav2Vec2AttnAdapterLayer(nn.Module): - - """ - This class represents a single layer of an attention adapter module in the Wav2Vec2 model. The adapter module is - designed to enhance the training throughput by directly implementing the adapter modules with 3D tensor weights as - parameters, without using ModuleList. - - Attributes: - input_dim (int): The dimension of the input tensor to the adapter module. - hidden_dim (int): The hidden dimension of the adapter module. - norm (nn.LayerNorm): A layer normalization module to normalize the hidden states. - linear_1 (nn.Linear): A linear transformation module that maps the hidden states to the input dimension. - act_fn (nn.ReLU): An activation function module that applies the ReLU activation to the hidden states. - linear_2 (nn.Linear): A linear transformation module that maps the hidden states back to the hidden dimension. - - Methods: - forward: - Applies the attention adapter layer operations to the input hidden states tensor. - - Args: - - - hidden_states (Tensor): The input hidden states tensor. - Returns: - - - Tensor: The output hidden states tensor after applying the attention adapter layer operations. - """ - def __init__(self, config: Wav2Vec2Config): + def __init__(self, config): """ Implements adapter modules directly with 3D tensor weight as parameters and without using ModuleList to speed up training throughput. @@ -1860,28 +1038,13 @@ def __init__(self, config: Wav2Vec2Config): self.act_fn = nn.ReLU() self.linear_2 = nn.Linear(self.input_dim, self.hidden_dim) - def forward(self, hidden_states: Tensor): - """ - Method: forward - - Description: - Constructs the adaptation layer for the Wav2Vec2AttnAdapterModel. - - Args: - self: (Wav2Vec2AttnAdapterLayer) The instance of the Wav2Vec2AttnAdapterLayer class. - hidden_states: (Tensor) The input hidden states to be processed by the adaptation layer. - - Returns: - None - - Raises: - ValueError: If the input hidden_states tensor is empty or invalid. - TypeError: If the input hidden_states is not of type Tensor. - """ + def forward(self, hidden_states: mindspore.Tensor): hidden_states = self.norm(hidden_states) + hidden_states = self.linear_1(hidden_states) hidden_states = self.act_fn(hidden_states) hidden_states = self.linear_2(hidden_states) + return hidden_states @@ -1890,57 +1053,61 @@ class Wav2Vec2PreTrainedModel(PreTrainedModel): An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ + config_class = Wav2Vec2Config base_model_prefix = "wav2vec2" main_input_name = "input_values" + supports_gradient_checkpointing = True - def _init_weights(self, cell): + def _init_weights(self, module): """Initialize the weights""" # Wav2Vec2ForPreTraining last 2 linear layers need standard Linear init. 
- if isinstance(cell, Wav2Vec2ForPreTraining): - cell.project_hid._is_initialized = True - cell.project_q._is_initialized = True + if isinstance(module, Wav2Vec2ForPreTraining): + module.project_hid.reset_parameters() + module.project_q.reset_parameters() + module.project_hid._is_initialized = True + module.project_q._is_initialized = True # gumbel softmax requires special init - elif isinstance(cell, Wav2Vec2GumbelVectorQuantizer): - cell.weight_proj.weight.set_data(initializer(Normal(1.0), cell.weight_proj.weight.shape, cell.weight_proj.weight.dtype)) - cell.weight_proj.bias.set_data(initializer('zeros', cell.weight_proj.bias.shape, cell.weight_proj.bias.dtype)) - cell.codevectors.set_data(initializer('uniform', cell.codevectors.shape, cell.codevectors.dtype)) - elif isinstance(cell, Wav2Vec2PositionalConvEmbedding): - cell.conv.weight.set_data( - initializer(Normal(2 * math.sqrt(1 / (cell.conv.kernel_size[0] * cell.conv.in_channels))), - cell.conv.weight.shape, cell.conv.weight.dtype)) - cell.conv.bias.set_data(initializer('zeros', cell.conv.bias.shape, cell.conv.bias.dtype)) - elif isinstance(cell, Wav2Vec2FeatureProjection): - k = math.sqrt(1 / cell.projection.in_channels) - cell.projection.weight.set_data( - initializer(Uniform(k), cell.projection.weight.shape, cell.projection.weight.dtype)) - cell.projection.bias.set_data( - initializer(Uniform(k), cell.projection.bias.shape, cell.projection.bias.dtype)) - elif isinstance(cell, nn.Linear): - cell.weight.set_data(initializer(Normal(self.config.initializer_range), cell.weight.shape, cell.weight.dtype)) - if cell.bias is not None: - cell.bias.set_data(initializer('zeros', cell.bias.shape, cell.bias.dtype)) - elif isinstance(cell, (nn.LayerNorm, nn.GroupNorm)): - cell.weight.set_data(initializer('ones', cell.weight.shape, cell.weight.dtype)) - cell.bias.set_data(initializer('zeros', cell.bias.shape, cell.bias.dtype)) - elif isinstance(cell, nn.Conv1d): - cell.weight.set_data(initializer('he_normal', cell.weight.shape, cell.weight.dtype)) - if cell.bias is not None: - k = math.sqrt(cell.group / (cell.in_channels * cell.kernel_size[0])) - cell.bias.set_data(initializer(Uniform(k), cell.bias.shape, cell.bias.dtype)) + elif isinstance(module, Wav2Vec2GumbelVectorQuantizer): + nn.init.normal_(module.weight_proj.weight, mean=0.0, std=1) + nn.init.zeros_(module.weight_proj.bias) + nn.init.uniform_(module.codevectors) + elif isinstance(module, Wav2Vec2PositionalConvEmbedding): + nn.init.normal_( + module.conv.weight, + mean=0, + std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)), + ) + nn.init.constant_(module.conv.bias, 0) + elif isinstance(module, Wav2Vec2FeatureProjection): + k = math.sqrt(1 / module.projection.in_features) + nn.init.uniform_(module.projection.weight, a=-k, b=k) + nn.init.uniform_(module.projection.bias, a=-k, b=k) + elif isinstance(module, nn.Linear): + nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)): + nn.init.zeros_(module.bias) + nn.init.ones_(module.weight) + elif isinstance(module, nn.Conv1d): + nn.init.kaiming_normal_(module.weight) + if module.bias is not None: + k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0])) + nn.init.uniform_(module.bias, a=-k, b=k) def _get_feat_extract_output_lengths( - self, input_lengths: Union[Tensor, int], add_adapter: Optional[bool] = None + self, input_lengths: Union[mindspore.Tensor, int], 
add_adapter: Optional[bool] = None ): """ Computes the output length of the convolutional layers """ + add_adapter = self.config.add_adapter if add_adapter is None else add_adapter def _conv_out_length(input_length, kernel_size, stride): # 1D convolutional layer output length formula taken - # from https://pyops.org/docs/stable/generated/ops.nn.Conv1d.html - return (input_length - kernel_size) // stride + 1 + return ops.div(input_length - kernel_size, stride, rounding_mode="floor") + 1 for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride): input_lengths = _conv_out_length(input_lengths, kernel_size, stride) @@ -1952,26 +1119,11 @@ def _conv_out_length(input_length, kernel_size, stride): return input_lengths def _get_feature_vector_attention_mask( - self, feature_vector_length: int, attention_mask: Tensor, add_adapter=None + self, feature_vector_length: int, attention_mask: mindspore.Tensor, add_adapter=None ): - """ - This method calculates the attention mask for the feature vectors in a Wav2Vec2 model. - - Args: - self (Wav2Vec2PreTrainedModel): The instance of the Wav2Vec2PreTrainedModel class. - feature_vector_length (int): The length of the feature vectors. - attention_mask (Tensor): The attention mask tensor. - add_adapter (Optional): An optional parameter to add adapter. - - Returns: - attention_mask (Tensor): The attention mask tensor for the feature vectors. - - Raises: - None. - """ # Effectively attention_mask.sum(-1), but not inplace to be able to run # on inference mode. - non_padded_lengths = attention_mask.cumsum(axis=-1)[:, -1] + non_padded_lengths = ops.cumsum(attention_mask, dim=-1)[:, -1] output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter) output_lengths = output_lengths.to(mindspore.int64) @@ -1983,37 +1135,21 @@ def _get_feature_vector_attention_mask( ) # these two operations makes sure that all values before the output lengths idxs are attended to attention_mask[(ops.arange(attention_mask.shape[0]), output_lengths - 1)] = 1 - attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool() + attention_mask = attention_mask.flip([-1]).int().cumsum(-1).flip([-1]).bool() return attention_mask def _get_adapters(self): - """ - Method _get_adapters in the class Wav2Vec2PreTrainedModel. - - Args: - self (object): The instance of the class Wav2Vec2PreTrainedModel. - - Returns: - dict: A dictionary containing adapter weights. - The keys are composed of the parameter names from the adapter layers and the LM head, and the values are - the corresponding parameters. - - Raises: - ValueError: If the 'adapter_attn_dim' attribute in 'config' is not defined, a ValueError is raised with - a message indicating that the class has no adapter layers and prompting to define - 'config.adapter_attn_dim'. - """ if self.config.adapter_attn_dim is None: raise ValueError(f"{self.__class__} has no adapter layers. 
Make sure to define `config.adapter_attn_dim`.") adapter_weights = {} - for name, module in self.parameters_and_names(): + for name, module in self.named_modules(): if isinstance(module, Wav2Vec2AttnAdapterLayer): - for param_name, param in module.parameters_and_names(): + for param_name, param in module.named_parameters(): adapter_weights[".".join([name, param_name])] = param if isinstance(self, Wav2Vec2ForCTC): - for name, param in self.lm_head.parameters_and_names(): + for name, param in self.lm_head.named_parameters(): adapter_weights[".".join(["lm_head", name])] = param return adapter_weights @@ -2023,7 +1159,7 @@ def init_adapter_layers(self): (Re-)initialize attention adapter layers and lm head for adapter-only fine-tuning """ # init attention adapters - for module in self.cells(): + for module in self.modules(): if isinstance(module, Wav2Vec2AttnAdapterLayer): self._init_weights(module) @@ -2047,9 +1183,9 @@ def load_adapter(self, target_lang: str, force_load=True, **kwargs): force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. @@ -2060,7 +1196,7 @@ def load_adapter(self, target_lang: str, force_load=True, **kwargs): the token generated when running `huggingface-cli login` (stored in `~/.huggingface`). revision (`str`, *optional*, defaults to `"main"`): The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on hf-mirror.com, so `revision` can be any + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any identifier allowed by git. @@ -2076,22 +1212,23 @@ def load_adapter(self, target_lang: str, force_load=True, **kwargs): - Activate the special ["offline-mode"](https://hf-mirror.com/transformers/installation.html#offline-mode) to + Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to use this method in a firewalled environment. - Example: - ```python - >>> from transformers import Wav2Vec2ForCTC, AutoProcessor - ... 
- >>> ckpt = "facebook/mms-1b-all" - >>> processor = AutoProcessor.from_pretrained(ckpt) - >>> model = Wav2Vec2ForCTC.from_pretrained(ckpt, target_lang="eng") - >>> # set specific language - >>> processor.tokenizer.set_target_lang("spa") - >>> model.load_adapter("spa") - ``` + Examples: + + ```python + >>> from transformers import Wav2Vec2ForCTC, AutoProcessor + + >>> ckpt = "facebook/mms-1b-all" + >>> processor = AutoProcessor.from_pretrained(ckpt) + >>> model = Wav2Vec2ForCTC.from_pretrained(ckpt, target_lang="eng") + >>> # set specific language + >>> processor.tokenizer.set_target_lang("spa") + >>> model.load_adapter("spa") + ``` """ if self.config.adapter_attn_dim is None: raise ValueError(f"Cannot load_adapter for {target_lang} if `config.adapter_attn_dim` is not defined.") @@ -2102,16 +1239,17 @@ def load_adapter(self, target_lang: str, force_load=True, **kwargs): cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) + resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", False) token = kwargs.pop("token", None) use_auth_token = kwargs.pop("use_auth_token", None) - use_safetensors = kwargs.pop("use_safetensors", False) + revision = kwargs.pop("revision", None) + use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False) if use_auth_token is not None: warnings.warn( - "The `use_auth_token` argument is deprecated. Please use `token` instead.", + "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.", FutureWarning, ) if token is not None: @@ -2135,26 +1273,28 @@ def load_adapter(self, target_lang: str, force_load=True, **kwargs): resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, + token=token, + revision=revision, cache_dir=cache_dir, ) - # state_dict = safe_load_file(weight_path) - state_dict = None + state_dict = safe_load_file(weight_path) + except EnvironmentError: if use_safetensors: # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted # to the original exception. raise - except Exception as exc: + except Exception: # For any other exception, we throw a generic error. if use_safetensors: raise EnvironmentError( f"Can't load the model for '{model_path_or_id}'. If you were trying to load it" - " from 'https://hf-mirror.com/models', make sure you don't have a local directory with the" + " from 'https://huggingface.co/models', make sure you don't have a local directory with the" f" same name. Otherwise, make sure '{model_path_or_id}' is the correct path to a" f" directory containing a file named {filepath}." - ) from exc + ) # 2. If this didn't work let's try loading a PyTorch adapter weight if state_dict is None: @@ -2168,29 +1308,27 @@ def load_adapter(self, target_lang: str, force_load=True, **kwargs): resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, + token=token, + revision=revision, cache_dir=cache_dir, ) - weights_only_kwarg = {"weights_only": True} - state_dict = ops.load( - weight_path, - map_location="cpu", - **weights_only_kwarg, - ) + state_dict = load(weight_path) except EnvironmentError: # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted # to the original exception. 
raise - except Exception as exc: + except Exception as e: + print(e) # For any other exception, we throw a generic error. raise EnvironmentError( f"Can't load the model for '{model_path_or_id}'. If you were trying to load it" - " from 'https://hf-mirror.com/models', make sure you don't have a local directory with the" + " from 'https://huggingface.co/models', make sure you don't have a local directory with the" f" same name. Otherwise, make sure '{model_path_or_id}' is the correct path to a" f" directory containing a file named {filepath}." - ) from exc + ) adapter_weights = self._get_adapters() unexpected_keys = set(state_dict.keys()) - set(adapter_weights.keys()) @@ -2210,7 +1348,7 @@ def load_adapter(self, target_lang: str, force_load=True, **kwargs): self.config.vocab_size = target_vocab_size # make sure that adapter weights are put in exactly the same precision and device placement and overwritten adapter weights - state_dict = {k: v.to(adapter_weights[k]) for k, v in state_dict.items()} + state_dict = {k: v.to(adapter_weights[k].dtype) for k, v in state_dict.items()} self.load_state_dict(state_dict, strict=False) # set target language corectly @@ -2218,58 +1356,7 @@ def load_adapter(self, target_lang: str, force_load=True, **kwargs): class Wav2Vec2Model(Wav2Vec2PreTrainedModel): - - """ - The `Wav2Vec2Model` class is a Python class that represents a Wav2Vec2 model for speech recognition. - It is a subclass of the `Wav2Vec2PreTrainedModel` class. - - Wav2Vec2Model inherits the following attributes and methods from the parent class: - - - `config`: An instance of the `Wav2Vec2Config` class, containing the configuration parameters for the model. - - `feature_extractor`: An instance of the `Wav2Vec2FeatureEncoder` class, responsible for extracting features - from the input waveform. - - `feature_projection`: An instance of the `Wav2Vec2FeatureProjection` class, responsible for projecting the - extracted features. - - `encoder`: An instance of the `Wav2Vec2Encoder` or `Wav2Vec2EncoderStableLayerNorm` class, responsible for - encoding the hidden states. - - `adapter`: An instance of the `Wav2Vec2Adapter` class, used to adapt the hidden states (optional). - - `post_init()`: A method called after the initialization of the model. - - The `Wav2Vec2Model` class also defines the following methods: - - - `freeze_feature_extractor`: Disables the gradient computation for the feature encoder, preventing its parameters - from being updated during training. - - `freeze_feature_encoder`: Disables the gradient computation for the feature encoder, preventing its parameters - from being updated during training. - - `_mask_hidden_states`: Masks extracted features along - the time axis and/or the feature axis according to SpecAugment. - - `forward`: Constructs the model by processing the input values and returns the model outputs. - - Please note that the `freeze_feature_extractor()` method is deprecated. - The equivalent `freeze_feature_encoder()` method should be used instead. - - For more information about the Wav2Vec2 model, please refer to the official paper [SpecAugment] - (https://arxiv.org/abs/1904.08779). - """ def __init__(self, config: Wav2Vec2Config): - """ - Initializes a new instance of the Wav2Vec2Model class. - - Args: - self: The instance of the Wav2Vec2Model class. - config (Wav2Vec2Config): An instance of the Wav2Vec2Config class containing the configuration parameters - for the model. - - Returns: - None. - - Raises: - TypeError: If the config parameter is not of type Wav2Vec2Config. 
- ValueError: If the config parameters mask_time_prob or mask_feature_prob are less than 0.0. - ValueError: If the config parameter do_stable_layer_norm is not a boolean value. - ValueError: If the config parameter hidden_size is not defined. - ValueError: If an error occurs during the initialization process. - """ super().__init__(config) self.config = config self.feature_extractor = Wav2Vec2FeatureEncoder(config) @@ -2277,7 +1364,7 @@ def __init__(self, config: Wav2Vec2Config): # model only needs masking vector if mask prob is > 0.0 if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0: - self.masked_spec_embed = Parameter(initializer(Uniform(), (config.hidden_size,), dtype=mindspore.float32)) + self.masked_spec_embed = nn.Parameter(ops.randn(config.hidden_size)) if config.do_stable_layer_norm: self.encoder = Wav2Vec2EncoderStableLayerNorm(config) @@ -2295,7 +1382,7 @@ def freeze_feature_extractor(self): not be updated during training. """ warnings.warn( - "The method `freeze_feature_extractor` is deprecated. " + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " "Please use the equivalent `freeze_feature_encoder` method instead.", FutureWarning, ) @@ -2310,14 +1397,15 @@ def freeze_feature_encoder(self): def _mask_hidden_states( self, - hidden_states: Tensor, - mask_time_indices: Optional[Tensor] = None, - attention_mask: Optional[Tensor] = None, + hidden_states: mindspore.Tensor, + mask_time_indices: Optional[mindspore.Tensor] = None, + attention_mask: Optional[mindspore.Tensor] = None, ): """ Masks extracted features along time axis and/or along feature axis according to [SpecAugment](https://arxiv.org/abs/1904.08779). """ + # `config.apply_spec_augment` can set masking to False if not getattr(self.config, "apply_spec_augment", True): return hidden_states @@ -2336,7 +1424,7 @@ def _mask_hidden_states( attention_mask=attention_mask, min_masks=self.config.mask_time_min_masks, ) - mask_time_indices = Tensor(mask_time_indices, dtype=mindspore.bool_) + mask_time_indices = mindspore.tensor(mask_time_indices, dtype=mindspore.bool_) hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) if self.config.mask_feature_prob > 0 and self.training: @@ -2347,43 +1435,21 @@ def _mask_hidden_states( mask_length=self.config.mask_feature_length, min_masks=self.config.mask_feature_min_masks, ) - mask_feature_indices = Tensor(mask_feature_indices, dtype=mindspore.bool_) - mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) + mask_feature_indices = mindspore.tensor(mask_feature_indices, dtype=mindspore.bool_) + mask_feature_indices = mask_feature_indices[:, None].broadcast_to((-1, sequence_length, -1)) hidden_states[mask_feature_indices] = 0 return hidden_states def forward( self, - input_values: Optional[Tensor], - attention_mask: Optional[Tensor] = None, - mask_time_indices: Optional[Tensor] = None, + input_values: Optional[mindspore.Tensor], + attention_mask: Optional[mindspore.Tensor] = None, + mask_time_indices: Optional[mindspore.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, Wav2Vec2BaseModelOutput]: - """ - Constructs the Wav2Vec2 model for processing input audio data. - - Args: - self (Wav2Vec2Model): The instance of the Wav2Vec2Model class. - input_values (Optional[Tensor]): The input audio data values with shape (batch_size, audio_length). 
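`_mask_hidden_states` above applies SpecAugment-style masking: sampled time spans are overwritten with the learned `masked_spec_embed` vector and sampled feature channels are zeroed across the whole sequence. A NumPy sketch of applying precomputed boolean masks (the span sampling itself lives in `_compute_mask_indices` and is omitted; all values are illustrative):

```python
import numpy as np

def apply_spec_augment(hidden_states, mask_time, mask_feature, masked_embed):
    """hidden_states: (batch, seq, hidden); mask_time: (batch, seq) bool;
    mask_feature: (batch, hidden) bool; masked_embed: (hidden,) learned fill vector."""
    out = hidden_states.copy()
    out[mask_time] = masked_embed                                        # overwrite masked time steps
    out[np.broadcast_to(mask_feature[:, None, :], out.shape)] = 0.0      # zero masked channels everywhere
    return out

states = np.ones((1, 5, 4))
mask_time = np.array([[False, True, True, False, False]])
mask_feature = np.array([[False, False, True, False]])
print(apply_spec_augment(states, mask_time, mask_feature, np.full(4, -1.0)))
```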
- attention_mask (Optional[Tensor]): The attention mask for the input audio data with shape - (batch_size, audio_length). - mask_time_indices (Optional[Tensor]): The mask for time indices with shape (batch_size, audio_length). - output_attentions (Optional[bool]): Whether to output attentions. Defaults to None. - output_hidden_states (Optional[bool]): Whether to output hidden states. Defaults to None. - return_dict (Optional[bool]): Whether to return a dictionary of output. Defaults to None. - - Returns: - Union[Tuple, Wav2Vec2BaseModelOutput]: The forwarded model output, which can be a tuple or a - Wav2Vec2BaseModelOutput object. - - Raises: - ValueError: If the input_values and attention_mask have mismatched shapes. - TypeError: If the input_values or attention_mask is not a Tensor. - RuntimeError: If the encoder fails to process the input audio data. - """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -2391,7 +1457,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict extract_features = self.feature_extractor(input_values) - extract_features = extract_features.swapaxes(1, 2) + extract_features = ops.transpose(extract_features, 1, 2) if attention_mask is not None: # compute reduced attention_mask corresponding to feature vectors @@ -2429,47 +1495,10 @@ def forward( class Wav2Vec2ForPreTraining(Wav2Vec2PreTrainedModel): - - """Wav2Vec2ForPreTraining - - This class represents a pre-training model for Wav2Vec2, which is used for pre-training the Wav2Vec2 model. - It includes methods for setting Gumbel softmax temperature, freezing the feature encoder, computing contrastive - logits, and forwarding the model for pre-training. - - Methods: - set_gumbel_temperature: Set the Gumbel softmax temperature to a given value. Only necessary for training. - freeze_feature_extractor: Disable gradient computation for the feature encoder to prevent parameter updates - during training. - freeze_feature_encoder: Disable gradient computation for the feature encoder to prevent parameter updates - during training. - compute_contrastive_logits: Compute logits for contrastive loss based on cosine similarity between features - and apply temperature. - forward: Construct the model for pre-training, including masking features for contrastive loss. - - Attributes: - wav2vec2: Wav2Vec2Model instance for the Wav2Vec2 model. - dropout_features: Dropout layer for feature vectors. - quantizer: Wav2Vec2GumbelVectorQuantizer instance for quantization. - project_hid: Dense layer for projecting hidden states. - project_q: Dense layer for projecting quantized features. - """ def __init__(self, config: Wav2Vec2Config): - """ - Initializes a new instance of the Wav2Vec2ForPreTraining class. - - Args: - self: The instance of the Wav2Vec2ForPreTraining class. - config (Wav2Vec2Config): The configuration object for the Wav2Vec2 model. - - Returns: - None. - - Raises: - None - """ super().__init__(config) self.wav2vec2 = Wav2Vec2Model(config) - self.dropout_features = nn.Dropout(p=config.feat_quantizer_dropout) + self.dropout_features = nn.Dropout(config.feat_quantizer_dropout) self.quantizer = Wav2Vec2GumbelVectorQuantizer(config) @@ -2491,7 +1520,7 @@ def freeze_feature_extractor(self): not be updated during training. """ warnings.warn( - "The method `freeze_feature_extractor` is deprecated. 
" + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " "Please use the equivalent `freeze_feature_encoder` method instead.", FutureWarning, ) @@ -2506,9 +1535,9 @@ def freeze_feature_encoder(self): @staticmethod def compute_contrastive_logits( - target_features: Tensor, - negative_features: Tensor, - predicted_features: Tensor, + target_features: mindspore.Tensor, + negative_features: mindspore.Tensor, + predicted_features: mindspore.Tensor, temperature: int = 0.1, ): """ @@ -2516,78 +1545,82 @@ def compute_contrastive_logits( `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied. """ target_features = ops.cat([target_features, negative_features], dim=0) - logits = ops.cosine_similarity(predicted_features.float(), target_features.float(), dim=-1).type_as(target_features) + + logits = nn.functional.cosine_similarity(predicted_features.float(), target_features.float(), dim=-1).type_as( + target_features + ) + # apply temperature logits = logits / temperature return logits def forward( self, - input_values: Optional[Tensor], - attention_mask: Optional[Tensor] = None, - mask_time_indices: Optional[Tensor] = None, - sampled_negative_indices: Optional[Tensor] = None, + input_values: Optional[mindspore.Tensor], + attention_mask: Optional[mindspore.Tensor] = None, + mask_time_indices: Optional[mindspore.Tensor] = None, + sampled_negative_indices: Optional[mindspore.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, Wav2Vec2ForPreTrainingOutput]: r""" - Args: - mask_time_indices (`Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict - masked extracted features in *config.proj_codevector_dim* space. - sampled_negative_indices (`Tensor` of shape `(batch_size, sequence_length, num_negatives)`, *optional*): - Indices indicating which quantized target vectors are used as negative sampled vectors in contrastive loss. - Required input for pre-training. + mask_time_indices (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict + masked extracted features in *config.proj_codevector_dim* space. + sampled_negative_indices (`mindspore.Tensor` of shape `(batch_size, sequence_length, num_negatives)`, *optional*): + Indices indicating which quantized target vectors are used as negative sampled vectors in contrastive loss. + Required input for pre-training. Returns: - Union[Tuple, Wav2Vec2ForPreTrainingOutput] Example: - ```python - >>> import torch - >>> from transformers import AutoFeatureExtractor, Wav2Vec2ForPreTraining - >>> from transformers.models.wav2vec2.modeling_wav2vec2 import _compute_mask_indices, _sample_negative_indices - >>> from datasets import load_dataset - ... - >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") - >>> model = Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-base") - ... - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - >>> input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values # Batch size 1 - ... 
- >>> # compute masked indices - >>> batch_size, raw_sequence_length = input_values.shape - >>> sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length).item() - >>> mask_time_indices = _compute_mask_indices( - ... shape=(batch_size, sequence_length), mask_prob=0.2, mask_length=2 - ... ) - >>> sampled_negative_indices = _sample_negative_indices( - ... features_shape=(batch_size, sequence_length), - ... num_negatives=model.config.num_negatives, - ... mask_time_indices=mask_time_indices, - ... ) - >>> mask_time_indices = Tensor(data=mask_time_indices, device=input_values.device, dtype=mindspore.int64) - >>> sampled_negative_indices = Tensor( - ... data=sampled_negative_indices, device=input_values.device, dtype=mindspore.int64 - ... ) - ... - >>> with ops.no_grad(): - ... outputs = model(input_values, mask_time_indices=mask_time_indices) - ... - >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states) - >>> cosine_sim = ops.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1) - ... - >>> # show that cosine similarity is much higher than random - >>> cosine_sim[mask_time_indices.to(mindspore.bool_)].mean() > 0.5 - tensor(True) - >>> # for contrastive loss training model should be put into train mode - >>> model = model.train() - >>> loss = model( - ... input_values, mask_time_indices=mask_time_indices, sampled_negative_indices=sampled_negative_indices - ... ).loss - ``` - """ + + ```python + >>> import torch + >>> from transformers import AutoFeatureExtractor, Wav2Vec2ForPreTraining + >>> from transformers.models.wav2vec2.modeling_wav2vec2 import _compute_mask_indices, _sample_negative_indices + >>> from datasets import load_dataset + + >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") + >>> model = Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-base") + + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + >>> input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values # Batch size 1 + + >>> # compute masked indices + >>> batch_size, raw_sequence_length = input_values.shape + >>> sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length).item() + >>> mask_time_indices = _compute_mask_indices( + ... shape=(batch_size, sequence_length), mask_prob=0.2, mask_length=2 + ... ) + >>> sampled_negative_indices = _sample_negative_indices( + ... features_shape=(batch_size, sequence_length), + ... num_negatives=model.config.num_negatives, + ... mask_time_indices=mask_time_indices, + ... ) + >>> mask_time_indices = mindspore.tensor(data=mask_time_indices, dtype=mindspore.int64) + >>> sampled_negative_indices = mindspore.tensor( + ... data=sampled_negative_indices, dtype=mindspore.int64 + ... ) + + >>> with no_grad(): + ... outputs = model(input_values, mask_time_indices=mask_time_indices) + + >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states) + >>> cosine_sim = ops.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1) + + >>> # show that cosine similarity is much higher than random + >>> cosine_sim[mask_time_indices.to(mindspore.bool_)].mean() > 0.5 + tensor(True) + + >>> # for contrastive loss training model should be put into train mode + >>> model = model.train() + >>> loss = model( + ... 
input_values, mask_time_indices=mask_time_indices, sampled_negative_indices=sampled_negative_indices + ... ).loss + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if mask_time_indices is not None: @@ -2617,6 +1650,8 @@ def forward( quantized_features, codevector_perplexity = self.quantizer( extract_features, mask_time_indices=mask_time_indices ) + + quantized_features = quantized_features.to(self.project_q.weight.dtype) quantized_features = self.project_q(quantized_features) loss = contrastive_loss = diversity_loss = None @@ -2648,16 +1683,14 @@ def forward( neg_is_pos = (quantized_features == negative_quantized_features).all(-1) if neg_is_pos.any(): - # NOTE: avoid loss NaN - # float("-inf") => finfo(logits.dtype, 'min') := -3.40282e+38 - logits[1:][neg_is_pos] = -3.40282e+35 + logits[1:][neg_is_pos] = float("-inf") # 6. compute contrastive loss \mathbf{L}_m = cross_entropy(logs) = # -log(exp(sim(c_t, q_t)/\kappa) / \sum_{\sim{q}} exp(sim(c_t, \sim{q})/\kappa)) - logits = logits.swapaxes(0, 2).reshape(-1, logits.shape[0]) - target = ((1 - mask_time_indices.long()) * -100).swapaxes(0, 1).flatten() + logits = ops.transpose(logits, 0, 2).reshape(-1, logits.shape[0]) + target = ops.transpose(((1 - mask_time_indices.long()) * -100), 0, 1).flatten() - contrastive_loss = F.cross_entropy(logits.float(), target, reduction="sum") + contrastive_loss = nn.functional.cross_entropy(logits.float(), target, reduction="sum") # 7. compute diversity loss: \mathbf{L}_d num_codevectors = self.config.num_codevectors_per_group * self.config.num_codevector_groups diversity_loss = ((num_codevectors - codevector_perplexity) / num_codevectors) * mask_time_indices.sum() @@ -2683,63 +1716,7 @@ def forward( class Wav2Vec2ForMaskedLM(Wav2Vec2PreTrainedModel): - - """ - This class represents a Wav2Vec2 model for Masked Language Modeling (MLM). - It is deprecated and should be replaced with `Wav2Vec2ForCTC`. - - The `Wav2Vec2ForMaskedLM` class inherits from the `Wav2Vec2PreTrainedModel` class. - - Attributes: - `wav2vec2`: The underlying Wav2Vec2Model. - `dropout`: A dropout layer for regularization. - `lm_head`: A dense layer for language modeling prediction. - - Methods: - `__init__`: Initializes a new instance of the `Wav2Vec2ForMaskedLM` class. - `forward`: Constructs the model for masked language modeling. - - Note: - This class is deprecated and should be replaced with `Wav2Vec2ForCTC`. - """ - def __init__(self, config: Wav2Vec2Config): - """ - Initializes an instance of the 'Wav2Vec2ForMaskedLM' class. - - Args: - self: The object instance. - config (Wav2Vec2Config): - The configuration object containing various hyperparameters for the model. - - - `config` should be an instance of the 'Wav2Vec2Config' class. - - This parameter is required. - - Returns: - None - - Raises: - FutureWarning: Raised if the class `Wav2Vec2ForMaskedLM` is used, as it is deprecated. - Recommends using `Wav2Vec2ForCTC` instead. - This warning is raised as a future version may not support the deprecated class. - - Description: - This method initializes an instance of the 'Wav2Vec2ForMaskedLM' class. It sets up the model architecture - and initializes the necessary components. The initialization process includes the following steps: - - 1. Calls the parent class '__init__' method using 'super()' to initialize the base class. - 2. Raises a 'FutureWarning' to notify users that the class `Wav2Vec2ForMaskedLM` is deprecated and - recommends using `Wav2Vec2ForCTC` instead. - 3. 
Initializes the 'wav2vec2' attribute as an instance of 'Wav2Vec2Model' using the provided 'config'. - 4. Initializes the 'dropout' attribute as an instance of 'nn.Dropout' with the dropout probability specified - in 'config'. - 5. Initializes the 'lm_head' attribute as an instance of 'nn.Linear' with the hidden size and vocabulary - size specified in 'config'. - 6. Calls the 'post_init' method to perform any additional post-initialization steps. - - Note: - The 'Wav2Vec2ForMaskedLM' class is deprecated and may not be supported in future versions. It is recommended - to use the 'Wav2Vec2ForCTC' class instead. - """ + def __init__(self, config): super().__init__(config) warnings.warn( @@ -2747,7 +1724,7 @@ def __init__(self, config: Wav2Vec2Config): ) self.wav2vec2 = Wav2Vec2Model(config) - self.dropout = nn.Dropout(p=config.final_dropout) + self.dropout = nn.Dropout(config.final_dropout) self.lm_head = nn.Linear(config.hidden_size, config.vocab_size) # Initialize weights and apply final processing @@ -2755,41 +1732,13 @@ def __init__(self, config: Wav2Vec2Config): def forward( self, - input_values: Tensor, - attention_mask: Optional[Tensor] = None, + input_values: mindspore.Tensor, + attention_mask: Optional[mindspore.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - labels: Optional[Tensor] = None, + labels: Optional[mindspore.Tensor] = None, ) -> Union[Tuple, MaskedLMOutput]: - """ - Args: - self (Wav2Vec2ForMaskedLM): The instance of the Wav2Vec2ForMaskedLM class. - input_values (Tensor): The input tensor representing the input audio features. Its shape is - (batch_size, sequence_length, feature_dim). - attention_mask (Optional[Tensor]): Optional tensor representing the attention mask for the input. - If provided, should have the shape (batch_size, sequence_length). - output_attentions (Optional[bool]): Optional flag to indicate whether to return attentions in the output. - Defaults to None. - output_hidden_states (Optional[bool]): Optional flag to indicate whether to return hidden states - in the output. Defaults to None. - return_dict (Optional[bool]): Optional flag to indicate whether to return the output as a dictionary. - If not provided, it defaults to the value specified in the configuration. - labels (Optional[Tensor]): Optional tensor representing the labels for the masked language modeling task. - Its shape is (batch_size, sequence_length). - - Returns: - Union[Tuple, MaskedLMOutput]: - The return value can be either a tuple or a MaskedLMOutput object. - - - If return_dict is False, it returns a tuple containing the logits and, optionally, the hidden states - and attentions. - - If return_dict is True, it returns a MaskedLMOutput object containing the logits, - hidden states, and attentions. - - Raises: - None - """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.wav2vec2( @@ -2811,50 +1760,11 @@ def forward( class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel): - - """ - This class represents a Wav2Vec2 model fine-tuned for Connectionist Temporal Classification (CTC) tasks. - It inherits from the Wav2Vec2PreTrainedModel, providing methods for initializing the model, tying weights, - freezing the feature extractor, feature encoder, and base model, as well as forwarding the model - for inference and training. 
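For reference, the pre-training objective defined earlier (`compute_contrastive_logits` plus the cross-entropy in `Wav2Vec2ForPreTraining.forward`) scores the true quantized vector and its sampled negatives by cosine similarity against the transformer output, scales by the temperature, and treats index 0, the positive, as the correct class. A NumPy restatement for a single masked time step (all values made up):

```python
import numpy as np

def contrastive_logits(target, negatives, predicted, temperature=0.1):
    """Cosine similarity between `predicted` and [target] + negatives, scaled by temperature."""
    candidates = np.vstack([target[None, :], negatives])        # positive first: shape (1 + K, dim)
    sims = candidates @ predicted / (
        np.linalg.norm(candidates, axis=-1) * np.linalg.norm(predicted) + 1e-8
    )
    return sims / temperature

target = np.array([1.0, 0.0])                                   # the true quantized vector
negatives = np.array([[0.0, 1.0], [-1.0, 0.0]])                 # two sampled distractors
predicted = np.array([0.9, 0.1])                                # transformer output at the masked step

logits = contrastive_logits(target, negatives, predicted)
loss = -logits[0] + np.log(np.exp(logits).sum())                # cross-entropy with class 0 as positive
print(logits, loss)
```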
- - The Wav2Vec2ForCTC class encapsulates the Wav2Vec2 model with additional methods for CTC-specific functionality, - such as handling labels for CTC, computing CTC loss, and processing input values for CTC tasks. - - The class provides methods for fine-tuning the Wav2Vec2 model for CTC tasks, including freezing specific components - of the model, as well as forwarding the model for CTC inference and training. - - Additionally, the class provides methods for tying weights and freezing specific components of the model to ensure - compatibility with adapter weights and to control parameter updates during training. - - This class is designed for fine-tuning the Wav2Vec2 model for CTC tasks, providing a comprehensive set of methods - for customizing the model's behavior and supporting CTC-specific functionality. - """ - def __init__(self, config: Wav2Vec2Config, target_lang: Optional[str] = None): - """ - Initializes a new instance of the Wav2Vec2ForCTC class. - - Args: - self: The object itself. - config (Wav2Vec2Config): The configuration for the Wav2Vec2Model. - target_lang (Optional[str], optional): The target language. Defaults to None. - - Returns: - None - - Raises: - ValueError: If the configuration does not define the vocabulary size of the language model head. - - Note: - The vocabulary size of the language model head must be defined either by instantiating the model - with `Wav2Vec2ForCTC.from_pretrained(..., vocab_size=vocab_size)` or by explicitly defining the - `vocab_size` in the model's configuration. - - """ + def __init__(self, config, target_lang: Optional[str] = None): super().__init__(config) self.wav2vec2 = Wav2Vec2Model(config) - self.dropout = nn.Dropout(p=config.final_dropout) + self.dropout = nn.Dropout(config.final_dropout) self.target_lang = target_lang @@ -2880,6 +1790,7 @@ def tie_weights(self): This method is **not** supposed to be called by the user and is prone to be changed in the future. """ + # Note that `tie_weights` is usually used to tie input and output embedding weights. The method is re-purposed to # correctly load adapter layers for Wav2Vec2 so that we do not have to introduce a new API to # [`PreTrainedModel`]. While slightly hacky, Wav2Vec2 never has to tie input and output embeddings, so that it is @@ -2899,7 +1810,7 @@ def freeze_feature_extractor(self): not be updated during training. """ warnings.warn( - "The method `freeze_feature_extractor` is deprecated. " + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " "Please use the equivalent `freeze_feature_encoder` method instead.", FutureWarning, ) @@ -2917,28 +1828,30 @@ def freeze_base_model(self): Calling this function will disable the gradient computation for the base model so that its parameters will not be updated during training. Only the classification head will be updated. 
""" - for _, param in self.wav2vec2.parameters_and_names(): + for param in self.wav2vec2.parameters(): param.requires_grad = False def forward( self, - input_values: Optional[Tensor], - attention_mask: Optional[Tensor] = None, + input_values: Optional[mindspore.Tensor], + attention_mask: Optional[mindspore.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - labels: Optional[Tensor] = None, + labels: Optional[mindspore.Tensor] = None, ) -> Union[Tuple, CausalLMOutput]: r""" - Args: - labels (`Tensor` of shape `(batch_size, target_length)`, *optional*): - Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to - the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. - All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., - config.vocab_size - 1]`. + labels (`mindspore.Tensor` of shape `(batch_size, target_length)`, *optional*): + Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to + the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. + All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., + config.vocab_size - 1]`. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None and labels.max() >= self.config.vocab_size: + raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") + outputs = self.wav2vec2( input_values, attention_mask=attention_mask, @@ -2954,10 +1867,6 @@ def forward( loss = None if labels is not None: - labels = labels.astype(mindspore.int32) - if labels.max() >= self.config.vocab_size: - raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") - # retrieve loss input_lengths from attention_mask attention_mask = ( attention_mask if attention_mask is not None else ops.ones_like(input_values, dtype=mindspore.int64) @@ -2971,11 +1880,11 @@ def forward( flattened_targets = labels.masked_select(labels_mask) # ctc_loss doesn't support fp16 - log_probs = F.log_softmax(logits, dim=-1).swapaxes(0, 1) + log_probs = ops.transpose(nn.functional.log_softmax(logits, dim=-1, dtype=mindspore.float32), 0, 1) - loss, log_alpha = F.ctc_loss( + loss = nn.functional.ctc_loss( log_probs, - labels, # flattened_targets + labels, input_lengths, target_lengths, blank=self.config.pad_token_id, @@ -2993,39 +1902,7 @@ def forward( class Wav2Vec2ForSequenceClassification(Wav2Vec2PreTrainedModel): - - """ - The `Wav2Vec2ForSequenceClassification` class represents a Wav2Vec2 model for sequence classification tasks. - It inherits from the `Wav2Vec2PreTrainedModel` class. This class provides methods for initializing the model, - freezing specific components, and computing the sequence classification output. It also includes methods for - handling the feature extractor, feature encoder, and base model. The class supports the forwardion of the sequence - classification output and provides options for setting various parameters such as attention masks, output attentions, - output hidden states, and labels. - - Deprecated methods such as `freeze_feature_extractor` and `freeze_base_model` are included along with their - corresponding replacements. 
The `forward` method computes the sequence classification/regression loss and handles - the classification output based on the input values, attention masks, and labels. The class allows for fine-tuning - the model for sequence classification tasks while providing flexibility in handling different components and - parameters. - - For detailed information about the class and its methods, refer to the individual method docstrings and the base - class `Wav2Vec2PreTrainedModel` for additional context and functionality. - """ - def __init__(self, config: Wav2Vec2Config): - """ - Initializes a new instance of the Wav2Vec2ForSequenceClassification class. - - Args: - self: The object itself. - config (Wav2Vec2Config): An instance of Wav2Vec2Config containing the configuration settings for the model. - - Returns: - None. - - Raises: - ValueError: Raised if the 'add_adapter' attribute is set to True in the config, as sequence classification - does not support the use of Wav2Vec2 adapters. - """ + def __init__(self, config): super().__init__(config) if hasattr(config, "add_adapter") and config.add_adapter: @@ -3035,7 +1912,7 @@ def __init__(self, config: Wav2Vec2Config): self.wav2vec2 = Wav2Vec2Model(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: - self.layer_weights = Parameter(ops.ones(num_layers) / num_layers) + self.layer_weights = nn.Parameter(ops.ones(num_layers) / num_layers) self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size) self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels) @@ -3048,7 +1925,7 @@ def freeze_feature_extractor(self): not be updated during training. """ warnings.warn( - "The method `freeze_feature_extractor` is deprecated. " + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " "Please use the equivalent `freeze_feature_encoder` method instead.", FutureWarning, ) @@ -3066,25 +1943,25 @@ def freeze_base_model(self): Calling this function will disable the gradient computation for the base model so that its parameters will not be updated during training. Only the classification head will be updated. """ - for _, param in self.wav2vec2.parameters_and_names(): + for param in self.wav2vec2.parameters(): param.requires_grad = False def forward( self, - input_values: Optional[Tensor], - attention_mask: Optional[Tensor] = None, + input_values: Optional[mindspore.Tensor], + attention_mask: Optional[mindspore.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - labels: Optional[Tensor] = None, + labels: Optional[mindspore.Tensor] = None, ) -> Union[Tuple, SequenceClassifierOutput]: r""" - Args: - labels (`Tensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states @@ -3099,25 +1976,25 @@ def forward( if self.config.use_weighted_layer_sum: hidden_states = outputs[_HIDDEN_STATES_START_POSITION] hidden_states = ops.stack(hidden_states, dim=1) - norm_weights = ops.softmax(self.layer_weights, dim=-1) - hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(axis=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) else: hidden_states = outputs[0] hidden_states = self.projector(hidden_states) if attention_mask is None: - pooled_output = hidden_states.mean(axis=1) + pooled_output = ops.mean(hidden_states, dim=1) else: padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask) hidden_states[~padding_mask] = 0.0 - pooled_output = hidden_states.sum(axis=1) / padding_mask.sum(axis=1).view(-1, 1) + pooled_output = ops.sum(hidden_states, dim=1) / ops.sum(padding_mask, dim=1).view(-1, 1) logits = self.classifier(pooled_output) loss = None if labels is not None: - labels = labels.astype(mindspore.int32) - loss = F.cross_entropy(logits.view(-1, self.config.num_labels), labels.view(-1)) + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) if not return_dict: output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] @@ -3130,53 +2007,8 @@ def forward( attentions=outputs.attentions, ) - class Wav2Vec2ForAudioFrameClassification(Wav2Vec2PreTrainedModel): - - """ - This class represents a Wav2Vec2 model for audio frame classification. It inherits from the Wav2Vec2PreTrainedModel - and includes methods for initializing the model, freezing the feature encoder and base model, as well as - forwarding the model for inference and training. - - Attributes: - wav2vec2 (Wav2Vec2Model): The Wav2Vec2Model used for audio frame classification. - classifier (nn.Linear): The classification head for the model. - num_labels (int): The number of labels for classification. - layer_weights (Parameter, optional): The weights for weighted layer sum if configured. - - Methods: - __init__: - Initializes the Wav2Vec2ForAudioFrameClassification model with the provided configuration. - - freeze_feature_encoder: - Disables the gradient computation for the feature encoder, preventing its parameters from being updated - during training. - - freeze_base_model: - Disables the gradient computation for the base model, preventing its parameters from being updated during - training while allowing the classification head to be updated. - - forward: - Constructs the model for inference and training, handling input values, attention masks, labels, and other - optional parameters. Returns TokenClassifierOutput containing loss, logits, hidden states, and attentions. - """ - def __init__(self, config: Wav2Vec2Config): - """ - Initializes a new instance of the Wav2Vec2ForAudioFrameClassification class. - - Args: - self: The instance of the class. - config (Wav2Vec2Config): The configuration object for the Wav2Vec2 model. - It specifies the parameters and settings for the model initialization. - Must be an instance of Wav2Vec2Config. - - Returns: - None. - - Raises: - ValueError: If the 'config' object has the attribute 'add_adapter' set to True, - which is not supported for audio frame classification with Wav2Vec2. 
- """ + def __init__(self, config): super().__init__(config) if hasattr(config, "add_adapter") and config.add_adapter: @@ -3186,12 +2018,24 @@ def __init__(self, config: Wav2Vec2Config): self.wav2vec2 = Wav2Vec2Model(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: - self.layer_weights = Parameter(ops.ones(num_layers) / num_layers) + self.layer_weights = nn.Parameter(ops.ones(num_layers) / num_layers) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.num_labels = config.num_labels self.init_weights() + def freeze_feature_extractor(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will @@ -3204,25 +2048,25 @@ def freeze_base_model(self): Calling this function will disable the gradient computation for the base model so that its parameters will not be updated during training. Only the classification head will be updated. """ - for _, param in self.wav2vec2.parameters_and_names(): + for param in self.wav2vec2.parameters(): param.requires_grad = False def forward( self, - input_values: Optional[Tensor], - attention_mask: Optional[Tensor] = None, - labels: Optional[Tensor] = None, + input_values: Optional[mindspore.Tensor], + attention_mask: Optional[mindspore.Tensor] = None, + labels: Optional[mindspore.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, TokenClassifierOutput]: r""" - Args: - labels (`Tensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states @@ -3237,8 +2081,8 @@ def forward( if self.config.use_weighted_layer_sum: hidden_states = outputs[_HIDDEN_STATES_START_POSITION] hidden_states = ops.stack(hidden_states, dim=1) - norm_weights = ops.softmax(self.layer_weights, dim=-1) - hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(axis=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) else: hidden_states = outputs[0] @@ -3246,8 +2090,8 @@ def forward( loss = None if labels is not None: - labels = labels.astype(mindspore.int32) - loss = F.cross_entropy(logits.view(-1, self.num_labels), ops.argmax(labels.view(-1, self.num_labels), dim=1)) + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), ops.argmax(labels.view(-1, self.num_labels), dim=1)) if not return_dict: output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] @@ -3262,120 +2106,30 @@ def forward( class AMSoftmaxLoss(nn.Module): - - """ - The AMSoftmaxLoss class represents a neural network cell for computing the AM-Softmax loss. This class inherits - from nn.Module and provides methods for initializing the loss function and forwarding the computation graph. - - Attributes: - scale (float): The scale parameter for the AM-Softmax loss function. - margin (float): The margin parameter for the AM-Softmax loss function. - num_labels (int): The number of unique labels in the dataset. - weight (Parameter): The weight parameter for the neural network. - - Methods: - __init__: Initializes the AMSoftmaxLoss instance with input dimension, number of labels, scale, and margin. - - forward: Constructs the computation graph for the AM-Softmax loss function using the given - hidden states and labels. - - Note: - The AMSoftmaxLoss class is designed for use in neural network training and optimization tasks. - """ def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4): - """ - __init__ - - Initializes an instance of the AMSoftmaxLoss class. - - Args: - self (object): The instance of the class. - input_dim (int): The dimension of the input features. - num_labels (int): The number of unique labels for classification. - scale (float, optional): The scale factor for the angular margin. Defaults to 30.0. - margin (float, optional): The angular margin value. Defaults to 0.4. - - Returns: - None. - - Raises: - ValueError: If input_dim or num_labels are not positive integers. - TypeError: If scale or margin are not of type float. - """ - super().__init__() + super(AMSoftmaxLoss, self).__init__() self.scale = scale self.margin = margin self.num_labels = num_labels - self.weight = Parameter(ops.randn(input_dim, num_labels), requires_grad=True) + self.weight = nn.Parameter(ops.randn(input_dim, num_labels), requires_grad=True) + self.loss = nn.CrossEntropyLoss() def forward(self, hidden_states, labels): - """ - This method forwards an AMSoftmax loss function. - - Args: - self (object): The instance of the AMSoftmaxLoss class. - hidden_states (tensor): A tensor representing the hidden states of the model. - labels (tensor): A tensor containing the ground truth labels for the corresponding hidden states. - It is expected that the labels are flattened for processing. - - Returns: - None. 
- - Raises: - ValueError: If the dimensions of the weight tensor and hidden_states tensor are not compatible - for matrix multiplication. - RuntimeError: If there is an issue with the normalization operation on the weight or hidden_states tensor. - ValueError: If the labels tensor does not match the expected shape for one-hot encoding. - RuntimeError: If there is a problem with the cross-entropy calculation. - """ labels = labels.flatten() - weight = self.weight / ops.norm(self.weight, dim=0, keepdim=True) - hidden_states = hidden_states / ops.norm(hidden_states, dim=1, keepdim=True) + weight = nn.functional.normalize(self.weight, dim=0) + hidden_states = nn.functional.normalize(hidden_states, dim=1) cos_theta = ops.mm(hidden_states, weight) psi = cos_theta - self.margin - onehot = ops.one_hot(labels, self.num_labels) + onehot = nn.functional.one_hot(labels, self.num_labels) logits = self.scale * ops.where(onehot.bool(), psi, cos_theta) - loss = F.cross_entropy(logits, labels) + loss = self.loss(logits, labels) + return loss class TDNNLayer(nn.Module): - - """TDNNLayer represents a time-delay neural network (TDNN) layer for processing sequential data. - It inherits from nn.Module and is initialized with a Wav2Vec2Config and an optional layer_id. - - Attributes: - config (Wav2Vec2Config): The configuration for the Wav2Vec2 model. - layer_id (int): The index of the TDNN layer. - - Methods: - forward(hidden_states): Applies the TDNN layer operations to the input hidden_states. - - The TDNNLayer class applies a convolutional layer with specified kernel size and dilation to the input data. - It then applies a ReLU activation function to the output. - - Note: - This class is part of the Wav2Vec2 model architecture. - - """ - def __init__(self, config: Wav2Vec2Config, layer_id=0): - """ - Initializes a TDNNLayer object. - - Args: - self: The instance of the TDNNLayer class. - config (Wav2Vec2Config): An instance of Wav2Vec2Config that holds configuration parameters for the layer. - layer_id (int): An integer representing the ID of the layer. Default is 0. Must be within the range of - available layers in the configuration. - - Returns: - None. - - Raises: - TypeError: If the config parameter is not of type Wav2Vec2Config. - ValueError: If the layer_id is outside the valid range of available layers in the configuration. - """ + def __init__(self, config, layer_id=0): super().__init__() self.in_conv_dim = config.tdnn_dim[layer_id - 1] if layer_id > 0 else config.tdnn_dim[layer_id] self.out_conv_dim = config.tdnn_dim[layer_id] @@ -3385,76 +2139,32 @@ def __init__(self, config: Wav2Vec2Config, layer_id=0): self.kernel = nn.Linear(self.in_conv_dim * self.kernel_size, self.out_conv_dim) self.activation = nn.ReLU() - def forward(self, hidden_states): - ''' - Constructs the TDNN layer with the input hidden_states. - - Args: - self (TDNNLayer): The instance of the TDNNLayer class. - hidden_states (Tensor): The input hidden states to be processed by the TDNN layer. - It should be a tensor of shape (batch_size, in_channels, sequence_length). + def forward(self, hidden_states: mindspore.Tensor) -> mindspore.Tensor: + from ....peft.tuners.lora import LoraLayer + if isinstance(self.kernel, LoraLayer): + warnings.warn( + "Detected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. " + "You should exclude TDNNLayer from LoRA's target modules.", + ) - Returns: - hidden_states (Tensor): The processed hidden states after applying the TDNN layer operations. 
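AMSoftmaxLoss above is additive-margin softmax: embeddings and class weights are L2-normalized, the margin is subtracted from the target-class cosine only, and the scaled logits go through ordinary cross-entropy. A NumPy sketch of that computation (dimensions and values are illustrative only):

import numpy as np

rng = np.random.default_rng(0)
batch, input_dim, num_labels = 4, 16, 3
scale, margin = 30.0, 0.4
hidden_states = rng.standard_normal((batch, input_dim))
weight = rng.standard_normal((input_dim, num_labels))
labels = np.array([0, 2, 1, 2])

# L2-normalize embeddings (per row) and class weights (per column)
hidden_states = hidden_states / np.linalg.norm(hidden_states, axis=1, keepdims=True)
weight = weight / np.linalg.norm(weight, axis=0, keepdims=True)

cos_theta = hidden_states @ weight       # cosine similarity to every class
psi = cos_theta - margin                 # margin applied to the target class only
onehot = np.eye(num_labels, dtype=bool)[labels]
logits = scale * np.where(onehot, psi, cos_theta)

# ordinary cross-entropy on the margin-adjusted, scaled logits
log_probs = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
loss = -log_probs[np.arange(batch), labels].mean()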
- It will be a tensor of shape (batch_size, out_channels, new_length), where out_channels is the number - of output channels and new_length is the length of the output sequence. - - Raises: - TypeError: If the input hidden_states is not a tensor. - ValueError: If the input hidden_states does not have the expected shape or dimensions. - ''' - hidden_states = hidden_states.unsqueeze(1) - hidden_states = F.unfold( - hidden_states, - (self.kernel_size, self.in_conv_dim), - stride=(1, self.in_conv_dim), - dilation=(self.dilation, 1), - ) - hidden_states = hidden_states.swapaxes(1, 2) - hidden_states = self.kernel(hidden_states) + # for backward compatibility, we keep nn.Linear but call F.conv1d for speed up + hidden_states = ops.transpose(hidden_states, 1, 2) + weight = ops.transpose(self.kernel.weight.view(self.out_conv_dim, self.kernel_size, self.in_conv_dim), 1, 2) + hidden_states = nn.functional.conv1d(hidden_states, weight, self.kernel.bias, dilation=self.dilation) + hidden_states = ops.transpose(hidden_states, 1, 2) hidden_states = self.activation(hidden_states) return hidden_states class Wav2Vec2ForXVector(Wav2Vec2PreTrainedModel): - - """ - This class represents a Wav2Vec2 model for extracting x-vector embeddings from audio data. It inherits from the - Wav2Vec2PreTrainedModel class, and provides methods for freezing specific model components and computing x-vector - embeddings from input audio data. - - The class contains methods for freezing the feature extractor, freezing the feature encoder, and freezing the base - model to disable gradient computation for specific model components. Additionally, it includes methods for computing - the output length of the TDNN layers and for forwarding x-vector embeddings from input audio data. - - The forward method takes input audio data and optional parameters such as attention mask and labels, and returns - x-vector embeddings along with optional loss and hidden states. The method also supports outputting hidden states - and attentions based on the configuration settings. - - This class is designed to be used for x-vector extraction tasks and provides flexibility for customizing the model's - behavior and freezing specific components during training. - """ - def __init__(self, config: Wav2Vec2Config): - """ - Initializes an instance of the Wav2Vec2ForXVector class. - - Args: - self: The instance of the Wav2Vec2ForXVector class. - config (Wav2Vec2Config): An object of type Wav2Vec2Config containing configuration settings for the model. - - Returns: - None. - - Raises: - None. - """ + def __init__(self, config): super().__init__(config) self.wav2vec2 = Wav2Vec2Model(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: - self.layer_weights = Parameter(ops.ones(num_layers) / num_layers) + self.layer_weights = nn.Parameter(ops.ones(num_layers) / num_layers) self.projector = nn.Linear(config.hidden_size, config.tdnn_dim[0]) tdnn_layers = [TDNNLayer(config, i) for i in range(len(config.tdnn_dim))] @@ -3473,7 +2183,7 @@ def freeze_feature_extractor(self): not be updated during training. """ warnings.warn( - "The method `freeze_feature_extractor` is deprecated. " + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. 
" "Please use the equivalent `freeze_feature_encoder` method instead.", FutureWarning, ) @@ -3491,16 +2201,16 @@ def freeze_base_model(self): Calling this function will disable the gradient computation for the base model so that its parameters will not be updated during training. Only the classification head will be updated. """ - for named, param in self.wav2vec2.parameters_and_names(): + for param in self.wav2vec2.parameters(): param.requires_grad = False - def _get_tdnn_output_lengths(self, input_lengths: Union[Tensor, int]): + def _get_tdnn_output_lengths(self, input_lengths: Union[mindspore.Tensor, int]): """ Computes the output length of the TDNN layers """ + def _conv_out_length(input_length, kernel_size, stride): # 1D convolutional layer output length formula taken - # from https://pyops.org/docs/stable/generated/ops.nn.Conv1d.html return (input_length - kernel_size) // stride + 1 for kernel_size in self.config.tdnn_kernel: @@ -3510,20 +2220,20 @@ def _conv_out_length(input_length, kernel_size, stride): def forward( self, - input_values: Optional[Tensor], - attention_mask: Optional[Tensor] = None, + input_values: Optional[mindspore.Tensor], + attention_mask: Optional[mindspore.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - labels: Optional[Tensor] = None, + labels: Optional[mindspore.Tensor] = None, ) -> Union[Tuple, XVectorOutput]: r""" - Args: - labels (`Tensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states @@ -3538,8 +2248,8 @@ def forward( if self.config.use_weighted_layer_sum: hidden_states = outputs[_HIDDEN_STATES_START_POSITION] hidden_states = ops.stack(hidden_states, dim=1) - norm_weights = ops.softmax(self.layer_weights, dim=-1) - hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(axis=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) else: hidden_states = outputs[0] @@ -3550,17 +2260,16 @@ def forward( # Statistic Pooling if attention_mask is None: - mean_features = hidden_states.mean(axis=1) - #std_features = hidden_states.std(axis=1) # NOTE: buggy API - std_features = ops.std(hidden_states, dim=1, keepdim=True).squeeze(1) + mean_features = ops.mean(hidden_states, dim=1) + std_features = ops.std(hidden_states, dim=1) else: - feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(axis=1)) + feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(dim=1)) tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths) mean_features = [] std_features = [] for i, length in enumerate(tdnn_output_lengths): - mean_features.append(hidden_states[i, :length].mean(axis=0)) - std_features.append(hidden_states[i, :length].std(axis=0)) + mean_features.append(ops.mean(hidden_states[i, :length], dim=0)) + std_features.append(ops.std(hidden_states[i, :length], dim=0)) mean_features = ops.stack(mean_features) std_features = ops.stack(std_features) statistic_pooling = ops.cat([mean_features, std_features], dim=-1) @@ -3570,7 +2279,6 @@ def forward( loss = None if labels is not None: - labels = labels.astype(mindspore.int32) loss = self.objective(logits, labels) if not return_dict: @@ -3584,3 +2292,14 @@ def forward( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + +__all__ = [ + "Wav2Vec2ForAudioFrameClassification", + "Wav2Vec2ForCTC", + "Wav2Vec2ForMaskedLM", + "Wav2Vec2ForPreTraining", + "Wav2Vec2ForSequenceClassification", + "Wav2Vec2ForXVector", + "Wav2Vec2Model", + "Wav2Vec2PreTrainedModel", +] diff --git a/mindnlp/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py b/mindnlp/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py index fca66596d..eef59c342 100644 --- a/mindnlp/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +++ b/mindnlp/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py @@ -15,15 +15,12 @@ """MindSpore Wav2Vec2-BERT model.""" import math +import warnings from typing import Optional, Tuple, Union import numpy as np import mindspore -from mindspore import Parameter -from mindspore.common.initializer import initializer, Normal, Uniform, HeNormal, XavierUniform - from mindnlp.core import nn, ops -from mindnlp.core.nn import functional as F from mindnlp.core.nn import CrossEntropyLoss from ...activations import ACT2FN @@ -48,6 +45,18 @@ _HIDDEN_STATES_START_POSITION = 2 +# General docstring +_CONFIG_FOR_DOC = "Wav2Vec2BertConfig" + +# Base docstring +_BASE_CHECKPOINT_FOR_DOC = "facebook/w2v-bert-2.0" +_PRETRAINED_CHECKPOINT_FOR_DOC = "hf-audio/wav2vec2-bert-CV16-en" +_EXPECTED_OUTPUT_SHAPE = [1, 146, 1024] + +# CTC docstring +_CTC_EXPECTED_OUTPUT = "'mr quilter is the apostle of the middle classes and we are glad to welcome his gospel'" +_CTC_EXPECTED_LOSS = 
17.04 + # Copied from transformers.models.seamless_m4t_v2.modeling_seamless_m4t_v2._compute_new_attention_mask def _compute_new_attention_mask(hidden_states: mindspore.Tensor, seq_lens: mindspore.Tensor): @@ -64,9 +73,9 @@ def _compute_new_attention_mask(hidden_states: mindspore.Tensor, seq_lens: minds """ batch_size, mask_seq_len = hidden_states.shape[:2] - indices = ops.arange(mask_seq_len).expand(batch_size, -1) + indices = ops.arange(mask_seq_len).broadcast_to((batch_size, -1)) - bool_mask = indices >= seq_lens.unsqueeze(1).expand(-1, mask_seq_len) + bool_mask = indices >= seq_lens.unsqueeze(1).broadcast_to((-1, mask_seq_len)) mask = hidden_states.new_ones((batch_size, mask_seq_len)) @@ -213,7 +222,6 @@ def _sample_negative_indices( mask_time_indices = ( mask_time_indices.astype(bool) if mask_time_indices is not None else np.ones(features_shape, dtype=bool) ) - mask_time_indices = mask_time_indices.numpy() if isinstance(mask_time_indices, mindspore.Tensor) else mask_time_indices for batch_idx in range(batch_size): high = mask_time_indices[batch_idx].sum() - 1 @@ -246,7 +254,7 @@ def __init__(self, config): inv_freq = 1.0 / (base ** (ops.arange(0, dim, 2, dtype=mindspore.int64).float() / dim)) # Ignore copy - self.inv_freq = inv_freq + self.register_buffer("inv_freq", inv_freq, persistent=False) self.cached_sequence_length = None self.cached_rotary_positional_embedding = None @@ -278,7 +286,7 @@ def __init__(self, config): self.max_len = config.max_source_positions self.d_model = config.hidden_size self.pe = None - self.extend_pe(mindspore.tensor(0.0).expand(1, self.max_len)) + self.extend_pe(mindspore.tensor(0.0).broadcast_to((1, self.max_len))) def extend_pe(self, x): # Reset the positional encodings @@ -325,7 +333,7 @@ def __init__(self, config): super().__init__() self.layer_norm = nn.LayerNorm(config.feature_projection_input_dim, eps=config.layer_norm_eps) self.projection = nn.Linear(config.feature_projection_input_dim, config.hidden_size) - self.dropout = nn.Dropout(p=config.feat_proj_dropout) + self.dropout = nn.Dropout(config.feat_proj_dropout) def forward(self, hidden_states): # non-projected hidden states are needed for quantization @@ -340,13 +348,13 @@ def __init__(self, config, act_fn=None, hidden_size=None): super().__init__() act_fn = act_fn if act_fn is not None else config.hidden_act hidden_size = hidden_size if hidden_size is not None else config.hidden_size - self.intermediate_dropout = nn.Dropout(p=config.activation_dropout) + self.intermediate_dropout = nn.Dropout(config.activation_dropout) self.intermediate_dense = nn.Linear(hidden_size, config.intermediate_size) self.intermediate_act_fn = ACT2FN[act_fn] if isinstance(act_fn, str) else act_fn self.output_dense = nn.Linear(config.intermediate_size, hidden_size) - self.output_dropout = nn.Dropout(p=config.hidden_dropout) + self.output_dropout = nn.Dropout(config.hidden_dropout) # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeedForward.forward def forward(self, hidden_states): @@ -396,7 +404,7 @@ def __init__(self, config): padding=0, bias=False, ) - self.dropout = nn.Dropout(p=config.conformer_conv_dropout) + self.dropout = nn.Dropout(config.conformer_conv_dropout) def forward(self, hidden_states, attention_mask=None): hidden_states = self.layer_norm(hidden_states) @@ -416,7 +424,7 @@ def forward(self, hidden_states, attention_mask=None): hidden_states = self.glu(hidden_states) # Pad the sequence entirely on the left because of causal convolution. 
- hidden_states = ops.pad(hidden_states, (self.depthwise_conv.kernel_size[0] - 1, 0)) + hidden_states = nn.functional.pad(hidden_states, (self.depthwise_conv.kernel_size[0] - 1, 0)) # 1D Depthwise Conv hidden_states = self.depthwise_conv(hidden_states) @@ -456,8 +464,8 @@ def __init__(self, config, is_adapter_attention=False): self.linear_pos = nn.Linear(hidden_size, hidden_size, bias=False) # these two learnable bias are used in matrix c and matrix d # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = Parameter(ops.zeros(self.num_heads, self.head_size)) - self.pos_bias_v = Parameter(ops.zeros(self.num_heads, self.head_size)) + self.pos_bias_u = nn.Parameter(ops.zeros(self.num_heads, self.head_size)) + self.pos_bias_v = nn.Parameter(ops.zeros(self.num_heads, self.head_size)) if self.position_embeddings_type == "relative_key": self.left_max_position_embeddings = config.left_max_position_embeddings @@ -616,7 +624,7 @@ def __init__(self, config): # Self-Attention self.self_attn_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) - self.self_attn_dropout = nn.Dropout(p=dropout) + self.self_attn_dropout = nn.Dropout(dropout) self.self_attn = Wav2Vec2BertSelfAttention(config) # Conformer Convolution @@ -680,8 +688,9 @@ def __init__(self, config): else: self.embed_positions = None - self.dropout = nn.Dropout(p=config.hidden_dropout) + self.dropout = nn.Dropout(config.hidden_dropout) self.layers = nn.ModuleList([Wav2Vec2BertEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False def forward( self, @@ -702,8 +711,8 @@ def forward( # extend attention_mask attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype) attention_mask = attention_mask * float(ops.finfo(hidden_states.dtype).min) - attention_mask = attention_mask.expand( - attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] + attention_mask = attention_mask.broadcast_to( + (attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]) ) hidden_states = self.dropout(hidden_states) @@ -722,13 +731,24 @@ def forward( skip_the_layer = self.training and (dropout_probability < self.config.layerdrop) if not skip_the_layer: - layer_outputs = layer( - hidden_states, - attention_mask=attention_mask, - relative_position_embeddings=relative_position_embeddings, - output_attentions=output_attentions, - conv_attention_mask=conv_attention_mask, - ) + # under deepspeed zero3 all gpus must run in sync + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer.__call__, + hidden_states, + attention_mask, + relative_position_embeddings, + output_attentions, + conv_attention_mask, + ) + else: + layer_outputs = layer( + hidden_states, + attention_mask=attention_mask, + relative_position_embeddings=relative_position_embeddings, + output_attentions=output_attentions, + conv_attention_mask=conv_attention_mask, + ) hidden_states = layer_outputs[0] if skip_the_layer: @@ -758,7 +778,7 @@ def __init__(self, config): self.proj_layer_norm = nn.LayerNorm(config.output_hidden_size, eps=config.layer_norm_eps) else: self.proj = self.proj_layer_norm = None - self.layers = nn.ModuleList([Wav2Vec2BertAdapterLayer(config) for _ in range(config.num_adapter_layers)]) + self.layers = nn.ModuleList(Wav2Vec2BertAdapterLayer(config) for _ in range(config.num_adapter_layers)) self.layerdrop = config.layerdrop self.kernel_size = config.adapter_kernel_size @@ -769,7 +789,7 @@ 
def _compute_sub_sample_lengths_from_attention_mask(self, seq_lens): return seq_lens pad = self.kernel_size // 2 seq_lens = ((seq_lens + 2 * pad - self.kernel_size) / self.stride) + 1 - return seq_lens #.floor() + return seq_lens.floor() def forward(self, hidden_states, attention_mask=None): # down project hidden_states if necessary @@ -822,7 +842,7 @@ def __init__(self, config): padding=self.stride // 2, ) self.self_attn = Wav2Vec2BertSelfAttention(config, is_adapter_attention=True) - self.self_attn_dropout = nn.Dropout(p=dropout) + self.self_attn_dropout = nn.Dropout(dropout) # Feed-forward self.ffn_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) @@ -890,34 +910,34 @@ class Wav2Vec2BertPreTrainedModel(PreTrainedModel): config_class = Wav2Vec2BertConfig base_model_prefix = "wav2vec2_bert" main_input_name = "input_features" - supports_gradient_checkpointing = False + supports_gradient_checkpointing = True # Ignore copy def _init_weights(self, module): """Initialize the weights""" if isinstance(module, Wav2Vec2BertSelfAttention): if hasattr(module, "pos_bias_u"): - module.pos_bias_u.set_data(initializer(XavierUniform(), module.pos_bias_u.shape, module.pos_bias_u.dtype)) + nn.init.xavier_uniform_(module.pos_bias_u) if hasattr(module, "pos_bias_v"): - module.pos_bias_v.set_data(initializer(XavierUniform(), module.pos_bias_v.shape, module.pos_bias_v.dtype)) + nn.init.xavier_uniform_(module.pos_bias_v) elif isinstance(module, Wav2Vec2BertFeatureProjection): - k = math.sqrt(1 / module.projection.in_channels) - module.projection.weight.set_data(initializer(Uniform(k), module.projection.weight.shape, module.projection.weight.dtype)) - module.projection.bias.set_data(initializer(Uniform(k), module.projection.bias.shape, module.projection.bias.dtype)) + k = math.sqrt(1 / module.projection.in_features) + nn.init.uniform_(module.projection.weight, a=-k, b=k) + nn.init.uniform_(module.projection.bias, a=-k, b=k) elif isinstance(module, nn.Linear): - module.weight.set_data(initializer(Normal(self.config.initializer_range), module.weight.shape, module.weight.dtype)) + nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range) if module.bias is not None: - module.bias.set_data(initializer('zeros', module.bias.shape, module.bias.dtype)) + nn.init.zeros_(module.bias) elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)): - module.bias.set_data(initializer('zeros', module.bias.shape, module.bias.dtype)) - module.weight.set_data(initializer('ones', module.weight.shape, module.weight.dtype)) + nn.init.zeros_(module.bias) + nn.init.ones_(module.weight) elif isinstance(module, nn.Conv1d): - module.weight.set_data(initializer(HeNormal(), module.weight.shape, module.weight.dtype)) + nn.init.kaiming_normal_(module.weight) if module.bias is not None: - k = math.sqrt(module.group / (module.in_channels * module.kernel_size[0])) - module.bias.set_data(initializer(Uniform(k), module.bias.shape, module.bias.dtype)) + k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0])) + nn.init.uniform_(module.bias, a=-k, b=k) # Ignore copy def _get_feat_extract_output_lengths( @@ -931,7 +951,6 @@ def _get_feat_extract_output_lengths( def _conv_out_length(input_length, kernel_size, stride, padding): # 1D convolutional layer output length formula taken - # from https://pyops.org/docs/stable/generated/ops.nn.Conv1d.html return ops.div(input_length + 2 * padding - kernel_size, stride, rounding_mode="floor") + 1 if add_adapter: @@ -948,7 +967,7 @@ def 
_get_feature_vector_attention_mask( ): # Effectively attention_mask.sum(-1), but not inplace to be able to run # on inference mode. - non_padded_lengths = attention_mask.cumsum(axis=-1)[:, -1] + non_padded_lengths = ops.cumsum(attention_mask, dim=-1)[:, -1] output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter) output_lengths = output_lengths.to(mindspore.int64) @@ -960,54 +979,10 @@ def _get_feature_vector_attention_mask( ) # these two operations makes sure that all values before the output lengths idxs are attended to attention_mask[(ops.arange(attention_mask.shape[0]), output_lengths - 1)] = 1 - attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool() + attention_mask = attention_mask.flip([-1]).int().cumsum(-1).flip([-1]).bool() return attention_mask -WAV2VEC2_BERT_START_DOCSTRING = r""" - Wav2Vec2Bert was proposed in [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech - Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael - Auli. - - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving etc.). - - This model is a MindSpore [nn.Module](https://pyops.org/docs/stable/nn.html#nn.Module) sub-class. Use it as a - regular MindSpore Module and refer to the MindSpore documentation for all matter related to general usage and behavior. - - Parameters: - config ([`Wav2Vec2BertConfig`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -WAV2VEC2_BERT_INPUTS_DOCSTRING = r""" - Args: - input_features (`mindspore.Tensor` of shape `(batch_size, sequence_length)`): - Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_features`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `mindspore.Tensor`. See [`Wav2Vec2BertProcessor.__call__`] for details. - attention_mask (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0, - 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
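The _get_feature_vector_attention_mask helper above rebuilds a frame-level attention mask from per-example output lengths: it marks the last valid frame of each row, then runs a reversed cumulative sum so every earlier position becomes non-zero. A NumPy sketch of that trick with made-up lengths:

import numpy as np

batch, feature_len = 2, 8
output_lengths = np.array([5, 3])

attention_mask = np.zeros((batch, feature_len), dtype=np.int64)
# mark the last valid frame of each example ...
attention_mask[np.arange(batch), output_lengths - 1] = 1
# ... then flip, cumulative-sum, flip back: all positions before the mark are attended to
attention_mask = np.flip(np.cumsum(np.flip(attention_mask, axis=-1), axis=-1), axis=-1).astype(bool)
print(attention_mask.astype(int))
# [[1 1 1 1 1 0 0 0]
#  [1 1 1 0 0 0 0 0]]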
-""" - - class Wav2Vec2BertModel(Wav2Vec2BertPreTrainedModel): def __init__(self, config: Wav2Vec2BertConfig): super().__init__(config) @@ -1016,8 +991,7 @@ def __init__(self, config: Wav2Vec2BertConfig): # model only needs masking vector if mask prob is > 0.0 if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0: - tensor = ops.zeros([config.hidden_size]) - self.masked_spec_embed = Parameter(initializer(Uniform(), tensor.shape, tensor.dtype)) + self.masked_spec_embed = nn.Parameter(ops.randn(config.hidden_size)) self.encoder = Wav2Vec2BertEncoder(config) @@ -1072,7 +1046,7 @@ def _mask_hidden_states( min_masks=self.config.mask_feature_min_masks, ) mask_feature_indices = mindspore.tensor(mask_feature_indices, dtype=mindspore.bool_) - mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) + mask_feature_indices = mask_feature_indices[:, None].broadcast_to((-1, sequence_length, -1)) hidden_states[mask_feature_indices] = 0 return hidden_states @@ -1126,11 +1100,12 @@ def forward( class Wav2Vec2BertForCTC(Wav2Vec2BertPreTrainedModel): + # Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerForCTC.__init__ with Wav2Vec2Conformer->Wav2Vec2Bert,WAV2VEC2_CONFORMER->WAV2VEC2_BERT,wav2vec2_conformer->wav2vec2_bert def __init__(self, config, target_lang: Optional[str] = None): super().__init__(config) self.wav2vec2_bert = Wav2Vec2BertModel(config) - self.dropout = nn.Dropout(p=config.final_dropout) + self.dropout = nn.Dropout(config.final_dropout) self.target_lang = target_lang @@ -1185,8 +1160,6 @@ def forward( loss = None if labels is not None: - labels = labels.astype(mindspore.int32) - # retrieve loss input_lengths from attention_mask attention_mask = ( attention_mask @@ -1202,11 +1175,11 @@ def forward( flattened_targets = labels.masked_select(labels_mask) # ctc_loss doesn't support fp16 - log_probs = F.log_softmax(logits, dim=-1).swapaxes(0, 1) + log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=mindspore.float32).swapaxes(0, 1) - loss = F.ctc_loss( + loss = nn.functional.ctc_loss( log_probs, - labels, # flattened_targets + labels, input_lengths, target_lengths, blank=self.config.pad_token_id, @@ -1222,7 +1195,6 @@ def forward( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions ) - class Wav2Vec2BertForSequenceClassification(Wav2Vec2BertPreTrainedModel): # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.__init__ with Wav2Vec2->Wav2Vec2Bert,wav2vec2->wav2vec2_bert def __init__(self, config): @@ -1235,7 +1207,7 @@ def __init__(self, config): self.wav2vec2_bert = Wav2Vec2BertModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: - self.layer_weights = Parameter(ops.ones(num_layers) / num_layers) + self.layer_weights = nn.Parameter(ops.ones(num_layers) / num_layers) self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size) self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels) @@ -1281,24 +1253,23 @@ def forward( if self.config.use_weighted_layer_sum: hidden_states = outputs[_HIDDEN_STATES_START_POSITION] hidden_states = ops.stack(hidden_states, dim=1) - norm_weights = ops.softmax(self.layer_weights, dim=-1) - hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(axis=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = ops.sum((hidden_states * 
norm_weights.view(-1, 1, 1)), dim=1) else: hidden_states = outputs[0] hidden_states = self.projector(hidden_states) if attention_mask is None: - pooled_output = hidden_states.mean(axis=1) + pooled_output = ops.mean(hidden_states, dim=1) else: padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask) hidden_states[~padding_mask] = 0.0 - pooled_output = hidden_states.sum(axis=1) / padding_mask.sum(axis=1).view(-1, 1) + pooled_output = ops.sum(hidden_states, dim=1) / ops.sum(padding_mask, dim=1).view(-1, 1) logits = self.classifier(pooled_output) loss = None if labels is not None: - labels = labels.astype(mindspore.int32) loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) @@ -1313,8 +1284,8 @@ def forward( attentions=outputs.attentions, ) - class Wav2Vec2BertForAudioFrameClassification(Wav2Vec2BertPreTrainedModel): + # Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerForAudioFrameClassification.__init__ with Wav2Vec2Conformer->Wav2Vec2Bert,WAV2VEC2_CONFORMER->WAV2VEC2_BERT,wav2vec2_conformer->wav2vec2_bert def __init__(self, config): super().__init__(config) @@ -1325,7 +1296,7 @@ def __init__(self, config): self.wav2vec2_bert = Wav2Vec2BertModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: - self.layer_weights = Parameter(ops.ones(num_layers) / num_layers) + self.layer_weights = nn.Parameter(ops.ones(num_layers) / num_layers) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.num_labels = config.num_labels @@ -1371,8 +1342,8 @@ def forward( if self.config.use_weighted_layer_sum: hidden_states = outputs[_HIDDEN_STATES_START_POSITION] hidden_states = ops.stack(hidden_states, dim=1) - norm_weights = ops.softmax(self.layer_weights, dim=-1) - hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(axis=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = ops.sum((hidden_states * norm_weights.view(-1, 1, 1)), dim=1) else: hidden_states = outputs[0] @@ -1380,7 +1351,6 @@ def forward( loss = None if labels is not None: - labels = labels.astype(mindspore.int32) loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), ops.argmax(labels.view(-1, self.num_labels), dim=1)) @@ -1403,17 +1373,17 @@ def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4): self.scale = scale self.margin = margin self.num_labels = num_labels - self.weight = Parameter(ops.randn(input_dim, num_labels), requires_grad=True) + self.weight = nn.Parameter(ops.randn(input_dim, num_labels), requires_grad=True) self.loss = nn.CrossEntropyLoss() def forward(self, hidden_states, labels): labels = labels.flatten() - weight = F.normalize(self.weight, dim=0) - hidden_states = F.normalize(hidden_states, dim=1) + weight = nn.functional.normalize(self.weight, dim=0) + hidden_states = nn.functional.normalize(hidden_states, dim=1) cos_theta = ops.mm(hidden_states, weight) psi = cos_theta - self.margin - onehot = ops.one_hot(labels, self.num_labels) + onehot = nn.functional.one_hot(labels, self.num_labels) logits = self.scale * ops.where(onehot.bool(), psi, cos_theta) loss = self.loss(logits, labels) @@ -1433,10 +1403,18 @@ def __init__(self, config, layer_id=0): self.activation = nn.ReLU() def forward(self, hidden_states: mindspore.Tensor) -> mindspore.Tensor: + from ....peft.tuners.lora import LoraLayer + + if isinstance(self.kernel, 
LoraLayer): + warnings.warn( + "Detected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. " + "You should exclude TDNNLayer from LoRA's target modules.", + ) + # for backward compatibility, we keep nn.Linear but call F.conv1d for speed up hidden_states = hidden_states.swapaxes(1, 2) weight = self.kernel.weight.view(self.out_conv_dim, self.kernel_size, self.in_conv_dim).swapaxes(1, 2) - hidden_states = ops.conv1d(hidden_states, weight, self.kernel.bias, dilation=self.dilation) + hidden_states = nn.functional.conv1d(hidden_states, weight, self.kernel.bias, dilation=self.dilation) hidden_states = hidden_states.swapaxes(1, 2) hidden_states = self.activation(hidden_states) @@ -1444,13 +1422,14 @@ def forward(self, hidden_states: mindspore.Tensor) -> mindspore.Tensor: class Wav2Vec2BertForXVector(Wav2Vec2BertPreTrainedModel): + # Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerForXVector.__init__ with Wav2Vec2Conformer->Wav2Vec2Bert,WAV2VEC2_CONFORMER->WAV2VEC2_BERT,wav2vec2_conformer->wav2vec2_bert def __init__(self, config): super().__init__(config) self.wav2vec2_bert = Wav2Vec2BertModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: - self.layer_weights = Parameter(ops.ones(num_layers) / num_layers) + self.layer_weights = nn.Parameter(ops.ones(num_layers) / num_layers) self.projector = nn.Linear(config.hidden_size, config.tdnn_dim[0]) tdnn_layers = [TDNNLayer(config, i) for i in range(len(config.tdnn_dim))] @@ -1480,7 +1459,6 @@ def _get_tdnn_output_lengths(self, input_lengths: Union[mindspore.Tensor, int]): def _conv_out_length(input_length, kernel_size, stride): # 1D convolutional layer output length formula taken - # from https://pyops.org/docs/stable/generated/ops.nn.Conv1d.html return (input_length - kernel_size) // stride + 1 for kernel_size in self.config.tdnn_kernel: @@ -1519,8 +1497,8 @@ def forward( if self.config.use_weighted_layer_sum: hidden_states = outputs[_HIDDEN_STATES_START_POSITION] hidden_states = ops.stack(hidden_states, dim=1) - norm_weights = ops.softmax(self.layer_weights, dim=-1) - hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(axis=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = ops.sum((hidden_states * norm_weights.view(-1, 1, 1)), dim=1) else: hidden_states = outputs[0] @@ -1531,16 +1509,16 @@ def forward( # Statistic Pooling if attention_mask is None: - mean_features = hidden_states.mean(axis=1) - std_features = ops.std(hidden_states, dim=1, keepdim=True).squeeze(1) + mean_features = ops.mean(hidden_states, dim=1) + std_features = ops.std(hidden_states, dim=1) else: - feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(axis=1)) + feat_extract_output_lengths = self._get_feat_extract_output_lengths(ops.sum(attention_mask, dim=1)) tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths) mean_features = [] std_features = [] for i, length in enumerate(tdnn_output_lengths): - mean_features.append(hidden_states[i, :length].mean(axis=0)) - std_features.append(hidden_states[i, :length].std(axis=0)) + mean_features.append(ops.mean(hidden_states[i, :length], dim=0)) + std_features.append(ops.std(hidden_states[i, :length], dim=0)) mean_features = ops.stack(mean_features) std_features = ops.stack(std_features) statistic_pooling = ops.cat([mean_features, std_features], dim=-1) @@ -1564,7 +1542,6 @@ def forward( 
attentions=outputs.attentions, ) - __all__ = [ 'Wav2Vec2BertForAudioFrameClassification', 'Wav2Vec2BertForCTC', diff --git a/mindnlp/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py b/mindnlp/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py index f6ce98c36..6a7f6cfce 100644 --- a/mindnlp/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +++ b/mindnlp/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""mindnlp Wav2Vec2-Conformer model.""" +"""MindSpore Wav2Vec2-Conformer model.""" import math import warnings @@ -21,15 +21,9 @@ import numpy as np import mindspore -from mindspore import Tensor, Parameter -from mindspore.common.initializer import initializer, Uniform, Normal - from mindnlp.core import nn, ops -from mindnlp.core.nn import functional as F -from mindnlp.utils import ( - ModelOutput, - logging, -) +from mindnlp.core.nn import CrossEntropyLoss + from ...activations import ACT2FN from ...modeling_outputs import ( BaseModelOutput, @@ -40,20 +34,15 @@ XVectorOutput, ) from ...modeling_utils import PreTrainedModel +from ....utils import ( + ModelOutput, + logging, +) from .configuration_wav2vec2_conformer import Wav2Vec2ConformerConfig logger = logging.get_logger(__name__) -__all__ = [ - "Wav2Vec2ConformerForAudioFrameClassification", - "Wav2Vec2ConformerForCTC", - "Wav2Vec2ConformerForPreTraining", - "Wav2Vec2ConformerForSequenceClassification", - "Wav2Vec2ConformerForXVector", - "Wav2Vec2ConformerModel", - "Wav2Vec2ConformerPreTrainedModel", -] _HIDDEN_STATES_START_POSITION = 2 @@ -85,14 +74,12 @@ class Wav2Vec2ConformerForPreTrainingOutput(ModelOutput): projected_quantized_states (`mindspore.Tensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive target vectors for contrastive loss. - hidden_states (`tuple(mindspore.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed - or when `config.output_hidden_states=True`): + hidden_states (`tuple(mindspore.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `mindspore.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(mindspore.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when - `config.output_attentions=True`): + attentions (`tuple(mindspore.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `mindspore.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. @@ -114,14 +101,6 @@ class Wav2Vec2ConformerForPreTrainingOutput(ModelOutput): diversity_loss: Optional[mindspore.Tensor] = None -def is_deepspeed_zero3_enabled(): - return False - - -def is_peft_available(): - return False - - # Copied from transformers.models.wav2vec2.modeling_wav2vec2._compute_mask_indices def _compute_mask_indices( shape: Tuple[int, int], @@ -137,15 +116,15 @@ def _compute_mask_indices( Args: shape: The shape for which to compute masks. 
This should be of a tuple of size 2 where - the first element is the batch size and the second element is the length of the axis to span. - mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of - independently generated mask spans of length `mask_length` is computed by - `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the - actual percentage will be smaller. + the first element is the batch size and the second element is the length of the axis to span. + mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of + independently generated mask spans of length `mask_length` is computed by + `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the + actual percentage will be smaller. mask_length: size of the mask min_masks: minimum number of masked spans attention_mask: A (right-padded) attention mask which independently shortens the feature axis of - each batch dimension. + each batch dimension. """ batch_size, sequence_length = shape @@ -178,7 +157,7 @@ def compute_num_masked_span(input_length): # compute number of masked spans in batch input_lengths = ( - attention_mask.sum(-1).asnumpy().tolist() + attention_mask.sum(-1).tolist() if attention_mask is not None else [sequence_length for _ in range(batch_size)] ) @@ -219,7 +198,7 @@ def compute_num_masked_span(input_length): spec_aug_mask_idxs = np.array(spec_aug_mask_idxs) - # expand(broadcast_to) masked indices to masked spans + # expand masked indices to masked spans spec_aug_mask_idxs = np.broadcast_to( spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length) ) @@ -257,8 +236,6 @@ def _sample_negative_indices( # get `num_negatives` random vector indices from the same utterance sampled_negative_indices = np.zeros(shape=(batch_size, sequence_length, num_negatives), dtype=np.int32) - if isinstance(mask_time_indices, Tensor): - mask_time_indices = mask_time_indices.asnumpy() mask_time_indices = ( mask_time_indices.astype(bool) if mask_time_indices is not None else np.ones(features_shape, dtype=bool) ) @@ -317,7 +294,7 @@ def __init__(self, config, layer_id=0): stride=config.conv_stride[layer_id], bias=config.conv_bias, ) - self.layer_norm = nn.LayerNorm([self.out_conv_dim]) + self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True) self.activation = ACT2FN[config.feat_extract_activation] def forward(self, hidden_states): @@ -369,7 +346,9 @@ def __init__(self, config): ) weight_norm = nn.utils.weight_norm - self.conv = weight_norm(self.conv, name='weight', dim=2) + + self.conv = weight_norm(self.conv, name="weight", dim=2) + self.padding = Wav2Vec2ConformerSamePadLayer(config.num_conv_pos_embeddings) self.activation = ACT2FN[config.feat_extract_activation] @@ -395,7 +374,7 @@ def __init__(self, config): base = config.rotary_embedding_base inv_freq = 1.0 / (base ** (ops.arange(0, dim, 2, dtype=mindspore.int64).float() / dim)) - self.inv_freq = inv_freq + self.register_buffer("inv_freq", inv_freq) self.cached_sequence_length = None self.cached_rotary_positional_embedding = None @@ -426,7 +405,7 @@ def __init__(self, config): self.max_len = config.max_source_positions self.d_model = config.hidden_size self.pe = None - self.extend_pe(mindspore.Tensor(0.0).broadcast_to((1, self.max_len))) + self.extend_pe(mindspore.tensor(0.0).broadcast_to((1, self.max_len))) def extend_pe(self, x): # Reset the positional encodings @@ -434,7 +413,7 @@ def 
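For reference on `_compute_mask_indices`: after sampling span start indices, each start is expanded into a contiguous span of `mask_length` frames by broadcasting and adding offsets, as the "expand masked indices to masked spans" comment describes. A rough NumPy sketch of that expansion step (simplified: no per-sample lengths, no `min_masks`, and the `expand_mask_spans` name is only for illustration):

```python
import numpy as np

def expand_mask_spans(span_starts, mask_length, sequence_length):
    """Turn start indices (batch, num_spans) into a boolean mask (batch, seq_len)."""
    batch_size = span_starts.shape[0]
    # add offsets 0..mask_length-1 to every start -> (batch, num_spans, mask_length)
    idxs = span_starts[:, :, None] + np.arange(mask_length)[None, None, :]
    idxs = idxs.reshape(batch_size, -1)

    mask = np.zeros((batch_size, sequence_length), dtype=bool)
    np.put_along_axis(mask, idxs, True, axis=-1)    # scatter True over every span
    return mask

starts = np.array([[1, 6], [0, 3]])
mask = expand_mask_spans(starts, mask_length=3, sequence_length=10)
assert mask[0].sum() == 6 and mask[1, :3].all()
```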
extend_pe(self, x): # self.pe contains both positive and negative parts # the length of self.pe is 2 * input_len - 1 if self.pe.shape[1] >= x.shape[1] * 2 - 1: - if self.pe.dtype != x.dtype : + if self.pe.dtype != x.dtype: self.pe = self.pe.to(dtype=x.dtype) return # Suppose `i` is the position of query vector and `j` is the @@ -501,30 +480,40 @@ def __init__(self, config): f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']" ) self.conv_layers = nn.ModuleList(conv_layers) - # self.gradient_checkpointing = False + self.gradient_checkpointing = False self._requires_grad = True def _freeze_parameters(self): - for _, param in self.parameters_and_names(): + for param in self.parameters(): param.requires_grad = False self._requires_grad = False def forward(self, input_values): hidden_states = input_values[:, None] - for conv_layer in self.conv_layers: - hidden_states = conv_layer(hidden_states) - return hidden_states + # make sure hidden_states require grad for gradient_checkpointing + if self._requires_grad and self.training: + hidden_states.requires_grad = True + for conv_layer in self.conv_layers: + if self._requires_grad and self.gradient_checkpointing and self.training: + hidden_states = self._gradient_checkpointing_func( + conv_layer.__call__, + hidden_states, + ) + else: + hidden_states = conv_layer(hidden_states) + + return hidden_states # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureProjection with Wav2Vec2->Wav2Vec2Conformer class Wav2Vec2ConformerFeatureProjection(nn.Module): def __init__(self, config): super().__init__() - self.layer_norm = nn.LayerNorm([config.conv_dim[-1]], eps=config.layer_norm_eps) + self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps) self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size) - self.dropout = nn.Dropout(p = config.feat_proj_dropout) + self.dropout = nn.Dropout(config.feat_proj_dropout) def forward(self, hidden_states): # non-projected hidden states are needed for quantization @@ -538,7 +527,7 @@ def forward(self, hidden_states): class Wav2Vec2ConformerFeedForward(nn.Module): def __init__(self, config): super().__init__() - self.intermediate_dropout = nn.Dropout(p = config.activation_dropout) + self.intermediate_dropout = nn.Dropout(config.activation_dropout) self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size) if isinstance(config.hidden_act, str): @@ -547,7 +536,7 @@ def __init__(self, config): self.intermediate_act_fn = config.hidden_act self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.output_dropout = nn.Dropout(p = config.hidden_dropout) + self.output_dropout = nn.Dropout(config.hidden_dropout) def forward(self, hidden_states): hidden_states = self.intermediate_dense(hidden_states) @@ -566,7 +555,7 @@ def __init__(self, config): super().__init__() if (config.conv_depthwise_kernel_size - 1) % 2 == 1: raise ValueError("`config.conv_depthwise_kernel_size` should be a odd number for 'SAME' padding") - self.layer_norm = nn.LayerNorm([config.hidden_size]) + self.layer_norm = nn.LayerNorm(config.hidden_size) self.pointwise_conv1 = nn.Conv1d( config.hidden_size, 2 * config.hidden_size, @@ -595,7 +584,7 @@ def __init__(self, config): padding=0, bias=False, ) - self.dropout = nn.Dropout(p = config.conformer_conv_dropout) + self.dropout = nn.Dropout(config.conformer_conv_dropout) def forward(self, hidden_states): hidden_states = self.layer_norm(hidden_states) @@ -620,8 +609,7 @@ 
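The odd-kernel check in the convolution module above exists because 'SAME'-style padding for a stride-1 convolution only preserves the sequence length exactly when the kernel size is odd, with `padding = (kernel_size - 1) // 2` on each side. A quick sanity check of the standard output-length formula (plain Python, independent of the model code):

```python
def conv1d_out_len(seq_len, kernel_size, stride=1, padding=0):
    # standard 1D convolution output-length formula
    return (seq_len + 2 * padding - kernel_size) // stride + 1

for k in (3, 5, 31):   # odd kernels keep the length with symmetric padding
    assert conv1d_out_len(100, k, padding=(k - 1) // 2) == 100

# an even kernel cannot keep the length unchanged with symmetric padding
assert conv1d_out_len(100, 4, padding=1) == 99
```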
def forward(self, hidden_states): class Wav2Vec2ConformerSelfAttention(nn.Module): - """ - Construct an Wav2Vec2ConformerSelfAttention object. + """Construct an Wav2Vec2ConformerSelfAttention object. Can be enhanced with rotary or relative position embeddings. """ @@ -644,8 +632,8 @@ def __init__(self, config): self.linear_pos = nn.Linear(config.hidden_size, config.hidden_size, bias=False) # these two learnable bias are used in matrix c and matrix d # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = mindspore.Parameter(ops.zeros(self.num_heads, self.head_size),'pos_bias_u') - self.pos_bias_v = mindspore.Parameter(ops.zeros(self.num_heads, self.head_size),'pos_bias_v') + self.pos_bias_u = nn.Parameter(ops.zeros(self.num_heads, self.head_size)) + self.pos_bias_v = nn.Parameter(ops.zeros(self.num_heads, self.head_size)) def forward( self, @@ -777,21 +765,21 @@ def __init__(self, config): dropout = config.attention_dropout # Feed-forward 1 - self.ffn1_layer_norm = nn.LayerNorm([embed_dim]) + self.ffn1_layer_norm = nn.LayerNorm(embed_dim) self.ffn1 = Wav2Vec2ConformerFeedForward(config) # Self-Attention - self.self_attn_layer_norm = nn.LayerNorm([embed_dim]) - self.self_attn_dropout = nn.Dropout(p = dropout) + self.self_attn_layer_norm = nn.LayerNorm(embed_dim) + self.self_attn_dropout = nn.Dropout(dropout) self.self_attn = Wav2Vec2ConformerSelfAttention(config) # Conformer Convolution self.conv_module = Wav2Vec2ConformerConvolutionModule(config) # Feed-forward 2 - self.ffn2_layer_norm = nn.LayerNorm([embed_dim]) + self.ffn2_layer_norm = nn.LayerNorm(embed_dim) self.ffn2 = Wav2Vec2ConformerFeedForward(config) - self.final_layer_norm = nn.LayerNorm([embed_dim]) + self.final_layer_norm = nn.LayerNorm(embed_dim) def forward( self, @@ -800,7 +788,6 @@ def forward( relative_position_embeddings: Optional[mindspore.Tensor] = None, output_attentions: bool = False, ): - # 1. 
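For context on the `pos_bias_u` / `pos_bias_v` parameters re-declared above: in Transformer-XL style relative attention (the arXiv 1901.02860 reference in the comment) the attention score splits into a content term and a position term, roughly `(q + u)·kᵀ + (q + v)·rᵀ`, where `r` are projected relative position embeddings. A toy single-head NumPy sketch of that decomposition; the real layer additionally applies a relative-shift rearrangement to align offsets, which is omitted here:

```python
import numpy as np

def relative_attention_scores(q, k, r, u, v):
    """q, k: (seq, head_dim); r: (num_rel_pos, head_dim); u, v: (head_dim,)."""
    content_scores = (q + u) @ k.T    # terms (a) + (c) of Transformer-XL
    position_scores = (q + v) @ r.T   # terms (b) + (d), before the relative shift
    return content_scores, position_scores

d = 8
q, k = np.random.randn(5, d), np.random.randn(5, d)
r = np.random.randn(2 * 5 - 1, d)     # one embedding per relative offset
u, v = np.zeros(d), np.zeros(d)
c_scores, p_scores = relative_attention_scores(q, k, r, u, v)
assert c_scores.shape == (5, 5) and p_scores.shape == (5, 9)
```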
Feed-Forward 1 layer residual = hidden_states hidden_states = self.ffn1_layer_norm(hidden_states) @@ -847,8 +834,8 @@ def __init__(self, config): self.embed_positions = None self.pos_conv_embed = Wav2Vec2ConformerPositionalConvEmbedding(config) - self.layer_norm = nn.LayerNorm([config.hidden_size], eps=config.layer_norm_eps) - self.dropout = nn.Dropout(p = config.hidden_dropout) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout) self.layers = nn.ModuleList([Wav2Vec2ConformerEncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False @@ -870,9 +857,9 @@ def forward( # extend attention_mask attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype) attention_mask = attention_mask * float(ops.finfo(hidden_states.dtype).min) - attention_mask = attention_mask.broadcast_to(( - attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] - )) + attention_mask = attention_mask.broadcast_to( + (attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]) + ) hidden_states = self.dropout(hidden_states) @@ -881,7 +868,6 @@ def forward( else: relative_position_embeddings = None - deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() for i, layer in enumerate(self.layers): if output_hidden_states: @@ -891,7 +877,7 @@ def forward( dropout_probability = ops.rand([]) skip_the_layer = self.training and (dropout_probability < self.config.layerdrop) - if not skip_the_layer or deepspeed_zero3_is_enabled: + if not skip_the_layer: # under deepspeed zero3 all gpus must run in sync if self.gradient_checkpointing and self.training: layer_outputs = self._gradient_checkpointing_func( @@ -948,9 +934,7 @@ def __init__(self, config): ) # storage for codebook variables (codewords) - self.codevectors = Parameter( - ops.zeros((1, self.num_groups * self.num_vars, config.codevector_dim // self.num_groups)) - ) + self.codevectors = nn.Parameter(ops.randn(1, self.num_groups * self.num_vars, config.codevector_dim // self.num_groups)) self.weight_proj = nn.Linear(config.conv_dim[-1], self.num_groups * self.num_vars) # can be decayed for training @@ -959,11 +943,11 @@ def __init__(self, config): @staticmethod def _compute_perplexity(probs, mask=None): if mask is not None: - mask_extended = mask.flatten()[:, None, None].broadcast_to((probs.shape)) + mask_extended = mask.flatten()[:, None, None].broadcast_to(probs.shape) probs = ops.where(mask_extended, probs, ops.zeros_like(probs)) - marginal_probs = probs.sum(axis=0) / mask.sum() + marginal_probs = ops.sum(probs, dim=0) / mask.sum() else: - marginal_probs = probs.mean(axis=0) + marginal_probs = ops.mean(probs, dim=0) perplexity = ops.exp(-ops.sum(marginal_probs * ops.log(marginal_probs + 1e-7), dim=-1)).sum() return perplexity @@ -977,7 +961,7 @@ def forward(self, hidden_states, mask_time_indices=None): if self.training: # sample code vector probs via gumbel in differentiateable way - codevector_probs = ops.gumbel_softmax( + codevector_probs = nn.functional.gumbel_softmax( hidden_states.float(), tau=self.temperature, hard=True ).type_as(hidden_states) @@ -989,12 +973,12 @@ def forward(self, hidden_states, mask_time_indices=None): else: # take argmax in non-differentiable way # comptute hard codevector distribution (one hot) - codevector_idx = ops.argmax(hidden_states,dim=-1) - x = hidden_states.new_zeros(hidden_states.shape) # (364, 320) - index = codevector_idx.view(-1, 1) - update = 
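The `_compute_perplexity` helper touched above measures how evenly the quantizer uses its codebook: it averages the soft codevector distributions over frames and exponentiates the entropy, so uniform usage gives roughly `num_groups * num_vars` while a collapsed codebook gives roughly `num_groups`. A small NumPy sketch of the same quantity (toy shapes; the `codebook_perplexity` name is only for illustration):

```python
import numpy as np

def codebook_perplexity(probs):
    """probs: (num_frames, num_groups, num_vars) soft codevector distributions."""
    marginal = probs.mean(axis=0)                                    # average usage per group
    entropy = -np.sum(marginal * np.log(marginal + 1e-7), axis=-1)   # per-group entropy
    return np.exp(entropy).sum()                                     # summed over groups

uniform = np.full((100, 2, 8), 1 / 8)       # every codevector used equally
collapsed = np.zeros((100, 2, 8))
collapsed[..., 0] = 1.0                     # only the first codevector is ever picked

assert codebook_perplexity(uniform) > 15.9      # ~ 2 groups * 8 vars
assert codebook_perplexity(collapsed) < 2.1     # ~ 2 groups * 1 var
```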
ops.ones_like(index, dtype=hidden_states.dtype) # fill with onehot - codevector_probs = ops.scatter(x, -1, index, update) - codevector_probs = codevector_probs.view(batch_size * sequence_length, self.num_groups, -1) # (182, 2, 320) + codevector_idx = ops.argmax(hidden_states, dim=-1).view(-1, 1) + codevector_probs = ops.scatter( + ops.zeros(hidden_states.shape, dtype=hidden_states.dtype), + -1, codevector_idx, ops.ones(codevector_idx.shape, dtype=hidden_states.dtype) + ) + codevector_probs = codevector_probs.view(batch_size * sequence_length, self.num_groups, -1) perplexity = self._compute_perplexity(codevector_probs, mask_time_indices) @@ -1015,11 +999,11 @@ def __init__(self, config): # feature dim might need to be down-projected if config.output_hidden_size != config.hidden_size: self.proj = nn.Linear(config.hidden_size, config.output_hidden_size) - self.proj_layer_norm = nn.LayerNorm([config.output_hidden_size]) + self.proj_layer_norm = nn.LayerNorm(config.output_hidden_size) else: self.proj = self.proj_layer_norm = None - self.layers = nn.ModuleList([Wav2Vec2ConformerAdapterLayer(config) for _ in range(config.num_adapter_layers)]) + self.layers = nn.ModuleList(Wav2Vec2ConformerAdapterLayer(config) for _ in range(config.num_adapter_layers)) self.layerdrop = config.layerdrop def forward(self, hidden_states): @@ -1053,7 +1037,7 @@ def __init__(self, config): def forward(self, hidden_states): hidden_states = self.conv(hidden_states) - hidden_states = F.glu(hidden_states, dim=1) + hidden_states = nn.functional.glu(hidden_states, dim=1) return hidden_states @@ -1069,46 +1053,49 @@ class Wav2Vec2ConformerPreTrainedModel(PreTrainedModel): main_input_name = "input_values" supports_gradient_checkpointing = True - - def _init_weights(self, cell): + def _init_weights(self, module): """Initialize the weights""" # Wav2Vec2ForPreTraining last 2 linear layers need standard Linear init. 
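The rewritten non-differentiable branch above builds the hard codevector distribution by scattering ones at the per-frame argmax positions. An equivalent NumPy sketch of that scatter-into-one-hot step (illustrative helper name and shapes):

```python
import numpy as np

def hard_one_hot(logits):
    """logits: (num_frames, num_codevectors) -> one-hot of the argmax per row."""
    idx = logits.argmax(axis=-1).reshape(-1, 1)       # (num_frames, 1)
    one_hot = np.zeros_like(logits)
    np.put_along_axis(one_hot, idx, 1.0, axis=-1)     # scatter a 1 at each argmax
    return one_hot

logits = np.array([[0.1, 2.0, -1.0], [3.0, 0.0, 0.5]])
expected = np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 0.0]])
assert np.array_equal(hard_one_hot(logits), expected)
```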
- if isinstance(cell, Wav2Vec2ConformerForPreTraining): - cell.project_hid._is_hf_initialized = True - cell.project_q._is_hf_initialized = True + if isinstance(module, Wav2Vec2ConformerForPreTraining): + module.project_hid.reset_parameters() + module.project_q.reset_parameters() + module.project_hid._is_initialized = True + module.project_q._is_initialized = True # gumbel softmax requires special init - elif isinstance(cell, Wav2Vec2ConformerGumbelVectorQuantizer): - cell.weight_proj.weight.set_data(initializer(Normal(1.0), cell.weight_proj.weight.shape, cell.weight_proj.weight.dtype)) - cell.weight_proj.bias.set_data(initializer('zeros', cell.weight_proj.bias.shape, cell.weight_proj.bias.dtype)) - cell.codevectors.set_data(initializer('uniform', cell.codevectors.shape, cell.codevectors.dtype)) - elif isinstance(cell, Wav2Vec2ConformerSelfAttention): - if hasattr(cell, "pos_bias_u"): - cell.pos_bias_u.set_data(initializer('XavierUniform', cell.pos_bias_u.shape, cell.pos_bias_u.dtype)) - if hasattr(cell, "pos_bias_v"): - cell.pos_bias_v.set_data(initializer('XavierUniform', cell.pos_bias_u.shape, cell.pos_bias_u.dtype)) - elif isinstance(cell, Wav2Vec2ConformerPositionalConvEmbedding): - cell.conv.weight.set_data( - initializer(Normal(2 * math.sqrt(1 / (cell.conv.kernel_size[0] * cell.conv.in_channels))), - cell.conv.weight.shape, cell.conv.weight.dtype)) - cell.conv.bias.set_data(initializer('zeros', cell.conv.bias.shape, cell.conv.bias.dtype)) - elif isinstance(cell, Wav2Vec2ConformerFeatureProjection): - k = math.sqrt(1 / cell.projection.in_channels) - cell.projection.weight.set_data( - initializer(Uniform(k), cell.projection.weight.shape, cell.projection.weight.dtype)) - cell.projection.bias.set_data( - initializer(Uniform(k), cell.projection.bias.shape, cell.projection.bias.dtype)) - elif isinstance(cell, nn.Linear): - cell.weight.set_data(initializer(Normal(self.config.initializer_range), cell.weight.shape, cell.weight.dtype)) - if cell.bias is not None: - cell.bias.set_data(initializer('zeros', cell.bias.shape, cell.bias.dtype)) - elif isinstance(cell, (nn.LayerNorm, nn.GroupNorm)): - cell.weight.set_data(initializer('ones', cell.weight.shape, cell.weight.dtype)) - cell.bias.set_data(initializer('zeros', cell.bias.shape, cell.bias.dtype)) - elif isinstance(cell, nn.Conv1d): - cell.weight.set_data(initializer('he_normal', cell.weight.shape, cell.weight.dtype)) - if cell.bias is not None: - k = math.sqrt(cell.group / (cell.in_channels * cell.kernel_size[0])) - cell.bias.set_data(initializer(Uniform(k), cell.bias.shape, cell.bias.dtype)) + elif isinstance(module, Wav2Vec2ConformerGumbelVectorQuantizer): + nn.init.normal_(module.weight_proj.weight, mean=0.0, std=1) + nn.init.zeros_(module.weight_proj.bias) + nn.init.uniform_(module.codevectors) + elif isinstance(module, Wav2Vec2ConformerSelfAttention): + if hasattr(module, "pos_bias_u"): + nn.init.xavier_uniform_(module.pos_bias_u) + if hasattr(module, "pos_bias_v"): + nn.init.xavier_uniform_(module.pos_bias_v) + elif isinstance(module, Wav2Vec2ConformerPositionalConvEmbedding): + nn.init.normal_( + module.conv.weight, + mean=0, + std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)), + ) + nn.init.constant_(module.conv.bias, 0) + elif isinstance(module, Wav2Vec2ConformerFeatureProjection): + k = math.sqrt(1 / module.projection.in_features) + nn.init.uniform_(module.projection.weight, a=-k, b=k) + nn.init.uniform_(module.projection.bias, a=-k, b=k) + elif isinstance(module, nn.Linear): + 
nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range) + + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)): + nn.init.zeros_(module.bias) + nn.init.ones_(module.weight) + elif isinstance(module, nn.Conv1d): + nn.init.kaiming_normal_(module.weight) + + if module.bias is not None: + k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0])) + nn.init.uniform_(module.bias, a=-k, b=k) def _get_feat_extract_output_lengths( self, input_lengths: Union[mindspore.Tensor, int], add_adapter: Optional[bool] = None @@ -1121,8 +1108,7 @@ def _get_feat_extract_output_lengths( def _conv_out_length(input_length, kernel_size, stride): # 1D convolutional layer output length formula taken - # from https://pyops.org/docs/stable/generated/ops.nn.Conv1d.html - return (input_length - kernel_size) // stride + 1 + return ops.div(input_length - kernel_size, stride, rounding_mode="floor") + 1 for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride): input_lengths = _conv_out_length(input_lengths, kernel_size, stride) @@ -1138,7 +1124,7 @@ def _get_feature_vector_attention_mask( ): # Effectively attention_mask.sum(-1), but not inplace to be able to run # on inference mode. - non_padded_lengths = attention_mask.cumsum(axis=-1)[:, -1] + non_padded_lengths = ops.cumsum(attention_mask, dim=-1)[:, -1] output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter) output_lengths = output_lengths.to(mindspore.int64) @@ -1150,66 +1136,10 @@ def _get_feature_vector_attention_mask( ) # these two operations makes sure that all values before the output lengths idxs are attended to attention_mask[(ops.arange(attention_mask.shape[0]), output_lengths - 1)] = 1 - attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool() + attention_mask = attention_mask.flip([-1]).int().cumsum(-1).flip([-1]).bool() return attention_mask -WAV2VEC2_CONFORMER_START_DOCSTRING = r""" - Wav2Vec2Conformer was proposed in [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech - Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael - Auli. - - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving etc.). - - This model is a PyTorch [nn.Module](https://pyops.org/docs/stable/nn.html#nn.Module) sub-class. Use it as a - regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. - - Parameters: - config ([`Wav2Vec2ConformerConfig`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -WAV2VEC2_CONFORMER_INPUTS_DOCSTRING = r""" - Args: - input_values (`mindspore.Tensor` of shape `(batch_size, sequence_length)`): - Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `mindspore.Tensor`. 
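On the `flip(...).cumsum(...).flip(...)` line kept in `_get_feature_vector_attention_mask` above: a single 1 placed at index `output_length - 1` is turned into a mask of ones over every position up to and including that index. A NumPy sketch of why the trick works (hypothetical lengths, not values from the model):

```python
import numpy as np

output_lengths = np.array([3, 5])
mask = np.zeros((2, 6), dtype=np.int64)
mask[np.arange(2), output_lengths - 1] = 1   # a single 1 at the last valid frame

# reverse, cumulative-sum, reverse: everything left of (and including) the 1 becomes 1
mask = np.flip(np.cumsum(np.flip(mask, axis=-1), axis=-1), axis=-1).astype(bool)

assert mask.tolist() == [
    [True, True, True, False, False, False],
    [True, True, True, True, True, False],
]
```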
See [`Wav2Vec2Processor.__call__`] for details. - attention_mask (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - - - `attention_mask` should only be passed if the corresponding processor has `config.return_attention_mask == - True`. For all models whose processor has `config.return_attention_mask == False`, such as - [wav2vec2-conformer-rel-pos-large](https://huggingface.co/facebook/wav2vec2-conformer-rel-pos-large), - `attention_mask` should **not** be passed to avoid degraded performance when doing batched inference. For - such models `input_values` should simply be padded with 0 and passed without `attention_mask`. Be aware - that these models also yield slightly different results depending on whether `input_values` is padded or - not. - - - - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - class Wav2Vec2ConformerModel(Wav2Vec2ConformerPreTrainedModel): def __init__(self, config: Wav2Vec2ConformerConfig): super().__init__(config) @@ -1219,7 +1149,7 @@ def __init__(self, config: Wav2Vec2ConformerConfig): # model only needs masking vector if mask prob is > 0.0 if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0: - self.masked_spec_embed = Parameter(initializer(Normal(), [config.hidden_size]), 'masked_spec_embed') + self.masked_spec_embed = nn.Parameter(ops.randn(config.hidden_size)) self.encoder = Wav2Vec2ConformerEncoder(config) @@ -1266,7 +1196,7 @@ def _mask_hidden_states( attention_mask=attention_mask, min_masks=self.config.mask_time_min_masks, ) - mask_time_indices = mindspore.Tensor(mask_time_indices, dtype=mindspore.bool_) + mask_time_indices = mindspore.tensor(mask_time_indices, dtype=mindspore.bool_) hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) if self.config.mask_feature_prob > 0 and self.training: @@ -1277,7 +1207,7 @@ def _mask_hidden_states( mask_length=self.config.mask_feature_length, min_masks=self.config.mask_feature_min_masks, ) - mask_feature_indices = mindspore.Tensor(mask_feature_indices, dtype=mindspore.bool_) + mask_feature_indices = mindspore.tensor(mask_feature_indices, dtype=mindspore.bool_) mask_feature_indices = mask_feature_indices[:, None].broadcast_to((-1, sequence_length, -1)) hidden_states[mask_feature_indices] = 0 @@ -1299,22 +1229,20 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - extract_features = self.feature_extractor(input_values) extract_features = extract_features.swapaxes(1, 2) - if attention_mask is not None: # compute reduced attention_mask corresponding to feature vectors attention_mask = self._get_feature_vector_attention_mask( extract_features.shape[1], attention_mask, add_adapter=False ) - hidden_states, extract_features = self.feature_projection(extract_features) hidden_states = self._mask_hidden_states( hidden_states, 
mask_time_indices=mask_time_indices, attention_mask=attention_mask ) + encoder_outputs = self.encoder( hidden_states, attention_mask=attention_mask, @@ -1344,7 +1272,7 @@ class Wav2Vec2ConformerForPreTraining(Wav2Vec2ConformerPreTrainedModel): def __init__(self, config: Wav2Vec2ConformerConfig): super().__init__(config) self.wav2vec2_conformer = Wav2Vec2ConformerModel(config) - self.dropout_features = nn.Dropout(p = config.feat_quantizer_dropout) + self.dropout_features = nn.Dropout(config.feat_quantizer_dropout) self.quantizer = Wav2Vec2ConformerGumbelVectorQuantizer(config) @@ -1403,62 +1331,61 @@ def forward( return_dict: Optional[bool] = None, ) -> Union[Tuple, Wav2Vec2ConformerForPreTrainingOutput]: r""" - Args: - mask_time_indices (`ops.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict - masked extracted features in *config.proj_codevector_dim* space. - sampled_negative_indices (`ops.BoolTensor` of shape `(batch_size, sequence_length, num_negatives)`, *optional*): - Indices indicating which quantized target vectors are used as negative sampled vectors in contrastive loss. - Required input for pre-training. + mask_time_indices (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict + masked extracted features in *config.proj_codevector_dim* space. + sampled_negative_indices (`mindspore.Tensor` of shape `(batch_size, sequence_length, num_negatives)`, *optional*): + Indices indicating which quantized target vectors are used as negative sampled vectors in contrastive loss. + Required input for pre-training. Returns: - `Union[Tuple, Wav2Vec2ConformerForPreTrainingOutput]` Example: - ```python - >>> import torch - >>> from transformers import AutoFeatureExtractor, Wav2Vec2ConformerForPreTraining - >>> from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer import _compute_mask_indices, _sample_negative_indices - >>> from datasets import load_dataset - ... - >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-conformer-rel-pos-large") - >>> model = Wav2Vec2ConformerForPreTraining.from_pretrained("facebook/wav2vec2-conformer-rel-pos-large") - ... - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - >>> input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values # Batch size 1 - ... - >>> # compute masked indices - >>> batch_size, raw_sequence_length = input_values.shape - >>> sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length).item() - >>> mask_time_indices = _compute_mask_indices( - ... shape=(batch_size, sequence_length), mask_prob=0.2, mask_length=2 - ... ) - >>> sampled_negative_indices = _sample_negative_indices( - ... features_shape=(batch_size, sequence_length), - ... num_negatives=model.config.num_negatives, - ... mask_time_indices=mask_time_indices, - ... ) - >>> mask_time_indices = mindspore.Tensor(data=mask_time_indices, dtype=mindspore.int64) - >>> sampled_negative_indices = mindspore.Tensor( - ... data=sampled_negative_indices, dtype=mindspore.int64 - ... ) - ... - >>> with ops.no_grad(): - ... outputs = model(input_values, mask_time_indices=mask_time_indices) - ... 
- >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states) - >>> cosine_sim = ops.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1) - ... - >>> # show that cosine similarity is much higher than random - >>> cosine_sim[mask_time_indices.to(ops.bool)].mean() > 0.5 - tensor(True) - >>> # for contrastive loss training model should be put into train mode - >>> model = model.train() - >>> loss = model( - ... input_values, mask_time_indices=mask_time_indices, sampled_negative_indices=sampled_negative_indices - ... ).loss - ``` - """ + + ```python + >>> import torch + >>> from transformers import AutoFeatureExtractor, Wav2Vec2ConformerForPreTraining + >>> from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer import _compute_mask_indices, _sample_negative_indices + >>> from datasets import load_dataset + + >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-conformer-rel-pos-large") + >>> model = Wav2Vec2ConformerForPreTraining.from_pretrained("facebook/wav2vec2-conformer-rel-pos-large") + + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + >>> input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values # Batch size 1 + + >>> # compute masked indices + >>> batch_size, raw_sequence_length = input_values.shape + >>> sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length).item() + >>> mask_time_indices = _compute_mask_indices( + ... shape=(batch_size, sequence_length), mask_prob=0.2, mask_length=2 + ... ) + >>> sampled_negative_indices = _sample_negative_indices( + ... features_shape=(batch_size, sequence_length), + ... num_negatives=model.config.num_negatives, + ... mask_time_indices=mask_time_indices, + ... ) + >>> mask_time_indices = mindspore.tensor(data=mask_time_indices, dtype=mindspore.int64) + >>> sampled_negative_indices = mindspore.tensor( + ... data=sampled_negative_indices, dtype=mindspore.int64 + ... ) + + >>> with no_grad(): + ... outputs = model(input_values, mask_time_indices=mask_time_indices) + + >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states) + >>> cosine_sim = ops.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1) + + >>> # show that cosine similarity is much higher than random + >>> cosine_sim[mask_time_indices.to(mindspore.bool_)].mean() > 0.5 + tensor(True) + + >>> # for contrastive loss training model should be put into train mode + >>> model = model.train() + >>> loss = model( + ... input_values, mask_time_indices=mask_time_indices, sampled_negative_indices=sampled_negative_indices + ... ).loss + ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1522,16 +1449,14 @@ def forward( neg_is_pos = (quantized_features == negative_quantized_features).all(-1) if neg_is_pos.any(): - # NOTE: avoid loss NaN - # float("-inf") => finfo(logits.dtype, 'min') := -3.40282e+38 - logits[1:][neg_is_pos] = -3.40282e+35 + logits[1:][neg_is_pos] = float("-inf") # 6. 
compute contrastive loss \mathbf{L}_m = cross_entropy(logs) = # -log(exp(sim(c_t, q_t)/\kappa) / \sum_{\sim{q}} exp(sim(c_t, \sim{q})/\kappa)) logits = logits.swapaxes(0, 2).reshape(-1, logits.shape[0]) target = ((1 - mask_time_indices.long()) * -100).swapaxes(0, 1).flatten() - contrastive_loss = F.cross_entropy(logits.float(), target, reduction="sum") + contrastive_loss = nn.functional.cross_entropy(logits.float(), target, reduction="sum") # 7. compute diversity loss: \mathbf{L}_d num_codevectors = self.config.num_codevectors_per_group * self.config.num_codevector_groups diversity_loss = ((num_codevectors - codevector_perplexity) / num_codevectors) * mask_time_indices.sum() @@ -1555,13 +1480,14 @@ def forward( diversity_loss=diversity_loss, ) + class Wav2Vec2ConformerForCTC(Wav2Vec2ConformerPreTrainedModel): # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.__init__ with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer def __init__(self, config, target_lang: Optional[str] = None): super().__init__(config) self.wav2vec2_conformer = Wav2Vec2ConformerModel(config) - self.dropout = nn.Dropout(p = config.final_dropout) + self.dropout = nn.Dropout(config.final_dropout) self.target_lang = target_lang @@ -1580,39 +1506,6 @@ def __init__(self, config, target_lang: Optional[str] = None): # Initialize weights and apply final processing self.post_init() - #Copied from wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.freeze_feature_encoder with wav2vec2->wav2vec2_conformer - def tie_weights(self): - """ - This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when - passing `target_lang=...` to `from_pretrained(...)`. - - This method is **not** supposed to be called by the user and is prone to be changed in the future. - """ - # Note that `tie_weights` is usually used to tie input and output embedding weights. The method is re-purposed to - # correctly load adapter layers for Wav2Vec2 so that we do not have to introduce a new API to - # [`PreTrainedModel`]. While slightly hacky, Wav2Vec2 never has to tie input and output embeddings, so that it is - # ok to repurpose this function here. - target_lang = self.target_lang - - if target_lang is not None and getattr(self.config, "adapter_attn_dim", None) is None: - raise ValueError(f"Cannot pass `target_lang`: {target_lang} if `config.adapter_attn_dim` is not defined.") - elif target_lang is None and getattr(self.config, "adapter_attn_dim", None) is not None: - logger.info("By default `target_lang` is set to 'eng'.") - elif target_lang is not None: - self.load_adapter(target_lang) - - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameter will - not be updated during training. - """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated. " - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.freeze_feature_encoder with wav2vec2->wav2vec2_conformer def freeze_feature_encoder(self): """ @@ -1621,14 +1514,6 @@ def freeze_feature_encoder(self): """ self.wav2vec2_conformer.feature_extractor._freeze_parameters() - def freeze_base_model(self): - """ - Calling this function will disable the gradient computation for the base model so that its parameters will not - be updated during training. 
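On the `float("-inf")` change in the contrastive loss above: when a sampled negative happens to be identical to the positive codevector, its logit is pushed to negative infinity so the softmax inside the cross-entropy assigns it exactly zero probability, rather than relying on the previous large-negative sentinel value. A tiny NumPy illustration of that effect (toy logits, not model outputs):

```python
import numpy as np

def softmax(x):
    x = x - x.max()
    e = np.exp(x)
    return e / e.sum()

# row 0 is the positive; rows 1.. are negatives; negative #2 collided with the positive
logits = np.array([4.0, 1.0, 4.0, 0.5])
logits[2] = -np.inf                 # drop the colliding negative from the softmax
probs = softmax(logits)

assert probs[2] == 0.0              # the duplicate contributes nothing to the loss
assert np.isclose(probs.sum(), 1.0)
```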
Only the classification head will be updated. - """ - for _, param in self.wav2vec2.parameters_and_names(): - param.requires_grad = False - # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.forward with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer def forward( self, @@ -1640,16 +1525,17 @@ def forward( labels: Optional[mindspore.Tensor] = None, ) -> Union[Tuple, CausalLMOutput]: r""" - Args: - labels (`mindspore.Tensor` of shape `(batch_size, target_length)`, *optional*): - Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to - the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. - All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., - config.vocab_size - 1]`. + labels (`mindspore.Tensor` of shape `(batch_size, target_length)`, *optional*): + Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to + the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. + All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., + config.vocab_size - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None and labels.max() >= self.config.vocab_size: + raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") + outputs = self.wav2vec2_conformer( input_values, attention_mask=attention_mask, @@ -1665,8 +1551,6 @@ def forward( loss = None if labels is not None: - if labels.max() >= self.config.vocab_size: - raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") # retrieve loss input_lengths from attention_mask attention_mask = ( attention_mask if attention_mask is not None else ops.ones_like(input_values, dtype=mindspore.int64) @@ -1680,11 +1564,11 @@ def forward( flattened_targets = labels.masked_select(labels_mask) # ctc_loss doesn't support fp16 - log_probs = F.log_softmax(logits, dim=-1).swapaxes(0, 1) + log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=mindspore.float32).swapaxes(0, 1) - loss = F.ctc_loss( + loss = nn.functional.ctc_loss( log_probs, - labels, # flattened_targets + labels, input_lengths, target_lengths, blank=self.config.pad_token_id, @@ -1713,7 +1597,7 @@ def __init__(self, config): self.wav2vec2_conformer = Wav2Vec2ConformerModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: - self.layer_weights = mindspore.Parameter(ops.ones(num_layers) / num_layers,'layer_weights') + self.layer_weights = nn.Parameter(ops.ones(num_layers) / num_layers) self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size) self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels) @@ -1733,7 +1617,7 @@ def freeze_base_model(self): Calling this function will disable the gradient computation for the base model so that its parameters will not be updated during training. Only the classification head will be updated. 
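A note on the CTC inputs assembled in the CTC forward above: `input_lengths` come from the downsampled attention mask, while `target_lengths` and the flattened targets follow the convention stated in the docstring that label positions set to -100 are ignored. A NumPy sketch of that target preparation, using made-up label values:

```python
import numpy as np

labels = np.array([
    [5, 2, 9, -100, -100],      # 3 real tokens, 2 padded positions
    [1, 1, -100, -100, -100],   # 2 real tokens
])
labels_mask = labels >= 0
target_lengths = labels_mask.sum(axis=-1)   # per-sample label lengths for the CTC loss
flattened_targets = labels[labels_mask]     # 1-D concatenation of all real labels

assert target_lengths.tolist() == [3, 2]
assert flattened_targets.tolist() == [5, 2, 9, 1, 1]
```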
""" - for _, param in self.wav2vec2_conformer.parameters_and_names(): + for param in self.wav2vec2_conformer.parameters(): param.requires_grad = False # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.forward with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer,WAV_2_VEC_2->WAV2VEC2_CONFORMER @@ -1747,11 +1631,10 @@ def forward( labels: Optional[mindspore.Tensor] = None, ) -> Union[Tuple, SequenceClassifierOutput]: r""" - Args: - labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1768,24 +1651,25 @@ def forward( if self.config.use_weighted_layer_sum: hidden_states = outputs[_HIDDEN_STATES_START_POSITION] hidden_states = ops.stack(hidden_states, dim=1) - norm_weights = ops.softmax(self.layer_weights, dim=-1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) else: hidden_states = outputs[0] hidden_states = self.projector(hidden_states) if attention_mask is None: - pooled_output = hidden_states.mean(axis=1) + pooled_output = ops.mean(hidden_states, dim=1) else: padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask) hidden_states[~padding_mask] = 0.0 - pooled_output = hidden_states.sum(axis=1) / padding_mask.sum(axis=1).view(-1, 1) + pooled_output = ops.sum(hidden_states, dim=1) / ops.sum(padding_mask, dim=1).view(-1, 1) logits = self.classifier(pooled_output) loss = None if labels is not None: - loss = F.cross_entropy(logits.view(-1, self.config.num_labels), labels.view(-1)) + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) if not return_dict: output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] @@ -1811,7 +1695,7 @@ def __init__(self, config): self.wav2vec2_conformer = Wav2Vec2ConformerModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: - self.layer_weights = mindspore.Parameter(ops.ones(num_layers) / num_layers,'layer_weights') + self.layer_weights = nn.Parameter(ops.ones(num_layers) / num_layers) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.num_labels = config.num_labels @@ -1845,11 +1729,10 @@ def forward( return_dict: Optional[bool] = None, ) -> Union[Tuple, TokenClassifierOutput]: r""" - Args: - labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1866,8 +1749,8 @@ def forward( if self.config.use_weighted_layer_sum: hidden_states = outputs[_HIDDEN_STATES_START_POSITION] hidden_states = ops.stack(hidden_states, dim=1) - norm_weights = ops.softmax(self.layer_weights, dim=-1) - hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(axis=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) else: hidden_states = outputs[0] @@ -1875,7 +1758,8 @@ def forward( loss = None if labels is not None: - loss = F.cross_entropy(logits.view(-1, self.num_labels), ops.argmax(labels.view(-1, self.num_labels), dim=1)) + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), ops.argmax(labels.view(-1, self.num_labels), dim=1)) if not return_dict: output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] @@ -1896,20 +1780,19 @@ def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4): self.scale = scale self.margin = margin self.num_labels = num_labels - # self.weight = mindspore.Parameter(ops.randn(input_dim, num_labels), 'weight').requires_grad = True - self.weight = Parameter(ops.randn(input_dim, num_labels), requires_grad=True) - #self.loss = F.cross_entropy() - + self.weight = nn.Parameter(ops.randn(input_dim, num_labels), requires_grad=True) + self.loss = nn.CrossEntropyLoss() def forward(self, hidden_states, labels): labels = labels.flatten() - weight = self.weight / ops.norm(self.weight, dim=0, keepdim=True) - hidden_states = hidden_states / ops.norm(hidden_states, dim=1, keepdim=True) + weight = nn.functional.normalize(self.weight, dim=0) + hidden_states = nn.functional.normalize(hidden_states, dim=1) cos_theta = ops.mm(hidden_states, weight) psi = cos_theta - self.margin - onehot = ops.one_hot(labels, self.num_labels) + + onehot = nn.functional.one_hot(labels, self.num_labels) logits = self.scale * ops.where(onehot.bool(), psi, cos_theta) - loss = F.cross_entropy(logits, labels) + loss = self.loss(logits, labels) return loss @@ -1927,17 +1810,18 @@ def __init__(self, config, layer_id=0): self.activation = nn.ReLU() def forward(self, hidden_states: mindspore.Tensor) -> mindspore.Tensor: - # if is_peft_available(): - # from peft.tuners.lora import LoraLayer - # if isinstance(self.kernel, LoraLayer): - # warnings.warn( - # "Detected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. " - # "You should exclude TDNNLayer from LoRA's target modules.", - # ) + from ....peft.tuners.lora import LoraLayer + + if isinstance(self.kernel, LoraLayer): + warnings.warn( + "Detected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. 
" + "You should exclude TDNNLayer from LoRA's target modules.", + ) + # for backward compatibility, we keep nn.Linear but call F.conv1d for speed up hidden_states = hidden_states.swapaxes(1, 2) weight = self.kernel.weight.view(self.out_conv_dim, self.kernel_size, self.in_conv_dim).swapaxes(1, 2) - hidden_states = ops.conv1d(hidden_states, weight, self.kernel.bias, dilation=self.dilation) + hidden_states = nn.functional.conv1d(hidden_states, weight, self.kernel.bias, dilation=self.dilation) hidden_states = hidden_states.swapaxes(1, 2) hidden_states = self.activation(hidden_states) @@ -1951,7 +1835,7 @@ def __init__(self, config): self.wav2vec2_conformer = Wav2Vec2ConformerModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: - self.layer_weights = mindspore.Parameter(ops.ones(num_layers) / num_layers) + self.layer_weights = nn.Parameter(ops.ones(num_layers) / num_layers) self.projector = nn.Linear(config.hidden_size, config.tdnn_dim[0]) tdnn_layers = [TDNNLayer(config, i) for i in range(len(config.tdnn_dim))] @@ -1978,7 +1862,7 @@ def freeze_base_model(self): Calling this function will disable the gradient computation for the base model so that its parameters will not be updated during training. Only the classification head will be updated. """ - for _, param in self.wav2vec2_conformer.parameters_and_names(): + for param in self.wav2vec2_conformer.parameters(): param.requires_grad = False # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForXVector._get_tdnn_output_lengths with wav2vec2->wav2vec2_conformer @@ -1989,7 +1873,6 @@ def _get_tdnn_output_lengths(self, input_lengths: Union[mindspore.Tensor, int]): def _conv_out_length(input_length, kernel_size, stride): # 1D convolutional layer output length formula taken - # from https://pyops.org/docs/stable/generated/ops.nn.Conv1d.html return (input_length - kernel_size) // stride + 1 for kernel_size in self.config.tdnn_kernel: @@ -2008,11 +1891,10 @@ def forward( labels: Optional[mindspore.Tensor] = None, ) -> Union[Tuple, XVectorOutput]: r""" - Args: - labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -2029,8 +1911,8 @@ def forward( if self.config.use_weighted_layer_sum: hidden_states = outputs[_HIDDEN_STATES_START_POSITION] hidden_states = ops.stack(hidden_states, dim=1) - norm_weights = ops.softmax(self.layer_weights, dim=-1) - hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(axis=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) else: hidden_states = outputs[0] @@ -2041,16 +1923,16 @@ def forward( # Statistic Pooling if attention_mask is None: - mean_features = hidden_states.mean(axis=1) - std_features = ops.std(hidden_states, dim=1, keepdim=True).squeeze(1) + mean_features = ops.mean(hidden_states, dim=1) + std_features = ops.std(hidden_states, dim=1) else: feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(dim=1)) tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths) mean_features = [] std_features = [] for i, length in enumerate(tdnn_output_lengths): - mean_features.append(hidden_states[i, :length].mean(dim=0)) - std_features.append(hidden_states[i, :length].std(dim=0)) + mean_features.append(ops.mean(hidden_states[i, :length], dim=0)) + std_features.append(ops.std(hidden_states[i, :length], dim=0)) mean_features = ops.stack(mean_features) std_features = ops.stack(std_features) statistic_pooling = ops.cat([mean_features, std_features], dim=-1) @@ -2060,7 +1942,6 @@ def forward( loss = None if labels is not None: - labels = labels.astype(mindspore.int32) loss = self.objective(logits, labels) if not return_dict: @@ -2074,3 +1955,13 @@ def forward( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + +__all__ = [ + "Wav2Vec2ConformerForAudioFrameClassification", + "Wav2Vec2ConformerForCTC", + "Wav2Vec2ConformerForPreTraining", + "Wav2Vec2ConformerForSequenceClassification", + "Wav2Vec2ConformerForXVector", + "Wav2Vec2ConformerModel", + "Wav2Vec2ConformerPreTrainedModel", +] diff --git a/mindnlp/transformers/models/wavlm/modeling_wavlm.py b/mindnlp/transformers/models/wavlm/modeling_wavlm.py index 33884f413..28d121aa7 100644 --- a/mindnlp/transformers/models/wavlm/modeling_wavlm.py +++ b/mindnlp/transformers/models/wavlm/modeling_wavlm.py @@ -16,17 +16,14 @@ import math import warnings -from typing import Optional, Tuple, Union, List +from typing import Optional, Tuple, Union import numpy as np import mindspore -from mindspore.common.initializer import initializer, Normal, TruncatedNormal, Uniform, HeNormal - -from mindnlp.core import nn, ops from mindnlp.core.nn import functional as F -from mindnlp.utils import logging +from mindnlp.core import nn, ops +from mindnlp.core.nn import CrossEntropyLoss -from .configuration_wavlm import WavLMConfig from ...activations import ACT2FN from ...modeling_outputs import ( BaseModelOutput, @@ -37,6 +34,11 @@ XVectorOutput, ) from ...modeling_utils import PreTrainedModel +from ....utils import ( + logging, +) +from .configuration_wavlm import WavLMConfig + logger = logging.get_logger(__name__) @@ -63,112 +65,6 @@ _XVECTOR_EXPECTED_OUTPUT = 0.97 -def _canonical_mask( - mask: Optional[mindspore.Tensor], - mask_name: str, - other_type: Optional[int], - other_name: str, - target_type: int, - check_other: bool = True, -) -> Optional[mindspore.Tensor]: - if mask is not None: - _mask_dtype = mask.dtype - _mask_is_float = ops.is_floating_point(mask) - if _mask_dtype != 
mindspore.bool_ and not _mask_is_float: - raise AssertionError( - f"only bool and floating types of {mask_name} are supported") - if check_other and other_type is not None: - if _mask_dtype != other_type: - warnings.warn( - f"Support for mismatched {mask_name} and {other_name} " - "is deprecated. Use same type for both instead." - ) - if not _mask_is_float: - zero_tensor = ops.zeros_like(mask, dtype=target_type) - mask = ops.where(mask, mindspore.Tensor(float("-inf"), target_type), zero_tensor) - # mask = ( - # ops.zeros_like(mask, dtype=target_type) - # .masked_fill_(mask, float("-inf")) - # ) - return mask - -def linear(x, weight, bias): - """inner linear""" - out = ops.matmul(x, weight.swapaxes(-1, -2)) - if bias is not None: - out = out + bias - return out - -def _none_or_dtype(input: Optional[mindspore.Tensor]) -> Optional[int]: - if input is None: - return None - elif isinstance(input, mindspore.Tensor): - return input.dtype - raise RuntimeError("input to _none_or_dtype() must be None or mindspore.Tensor") - -def _in_projection_packed( - q: mindspore.Tensor, - k: mindspore.Tensor, - v: mindspore.Tensor, - w: mindspore.Tensor, - b: Optional[mindspore.Tensor] = None, -) -> List[mindspore.Tensor]: - r"""Perform the in-projection step of the attention operation, using packed weights. - - Output is a triple containing projection tensors for query, key and value. - - Args: - q, k, v: query, key and value tensors to be projected. For self-attention, - these are typically the same tensor; for encoder-decoder attention, - k and v are typically the same tensor. (We take advantage of these - identities for performance if they are present.) Regardless, q, k and v - must share a common embedding dimension; otherwise their shapes may vary. - w: projection weights for q, k and v, packed into a single tensor. Weights - are packed along dimension 0, in q, k, v order. - b: optional projection biases for q, k and v, packed into a single tensor - in q, k, v order. - - Shape: - Inputs: - - q: :math:`(..., E)` where E is the embedding dimension - - k: :math:`(..., E)` where E is the embedding dimension - - v: :math:`(..., E)` where E is the embedding dimension - - w: :math:`(E * 3, E)` where E is the embedding dimension - - b: :math:`E * 3` where E is the embedding dimension - - Output: - - in output list :math:`[q', k', v']`, each output tensor will have the - same shape as the corresponding input tensor. 
- """ - E = q.size(-1) - if k is v: - if q is k: - # self-attention - proj = linear(q, w, b) - # reshape to 3, E and not E, 3 is deliberate for better memory coalescing and keeping same order as chunk() - proj = proj.unflatten(-1, (3, E)).unsqueeze(0).swapaxes(0, -2).squeeze(-2) - return proj[0], proj[1], proj[2] - else: - # encoder-decoder attention - w_q, w_kv = w.split([E, E * 2]) - if b is None: - b_q = b_kv = None - else: - b_q, b_kv = b.split([E, E * 2]) - q_proj = linear(q, w_q, b_q) - kv_proj = linear(k, w_kv, b_kv) - # reshape to 2, E and not E, 2 is deliberate for better memory coalescing and keeping same order as chunk() - kv_proj = kv_proj.unflatten(-1, (2, E)).unsqueeze(0).swapaxes(0, -2).squeeze(-2) - return (q_proj, kv_proj[0], kv_proj[1]) - else: - w_q, w_k, w_v = w.chunk(3) - if b is None: - b_q = b_k = b_v = None - else: - b_q, b_k, b_v = b.chunk(3) - return linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v) - - # Copied from transformers.models.wav2vec2.modeling_wav2vec2._compute_mask_indices def _compute_mask_indices( shape: Tuple[int, int], @@ -184,15 +80,15 @@ def _compute_mask_indices( Args: shape: The shape for which to compute masks. This should be of a tuple of size 2 where - the first element is the batch size and the second element is the length of the axis to span. + the first element is the batch size and the second element is the length of the axis to span. mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of - independently generated mask spans of length `mask_length` is computed by - `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the - actual percentage will be smaller. + independently generated mask spans of length `mask_length` is computed by + `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the + actual percentage will be smaller. mask_length: size of the mask min_masks: minimum number of masked spans attention_mask: A (right-padded) attention mask which independently shortens the feature axis of - each batch dimension. + each batch dimension. 
""" batch_size, sequence_length = shape @@ -225,7 +121,7 @@ def compute_num_masked_span(input_length): # compute number of masked spans in batch input_lengths = ( - ops.stop_gradient(attention_mask.sum(-1)).tolist() + attention_mask.sum(-1).tolist() if attention_mask is not None else [sequence_length for _ in range(batch_size)] ) @@ -325,15 +221,15 @@ def __init__(self, config, layer_id=0): stride=config.conv_stride[layer_id], bias=config.conv_bias, ) - self.layer_norm = nn.LayerNorm(self.out_conv_dim) + self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True) self.activation = ACT2FN[config.feat_extract_activation] def forward(self, hidden_states): hidden_states = self.conv(hidden_states) - hidden_states = hidden_states.swapaxes(-2, -1) + hidden_states = ops.transpose(hidden_states, -2, -1) hidden_states = self.layer_norm(hidden_states) - hidden_states = hidden_states.swapaxes(-2, -1) + hidden_states = ops.transpose(hidden_states, -2, -1) hidden_states = self.activation(hidden_states) return hidden_states @@ -374,22 +270,22 @@ def __init__(self, config): kernel_size=config.num_conv_pos_embeddings, padding=config.num_conv_pos_embeddings // 2, groups=config.num_conv_pos_embedding_groups, - bias=True ) - self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2) + weight_norm = nn.utils.weight_norm + self.conv = weight_norm(self.conv, name="weight", dim=2) self.padding = WavLMSamePadLayer(config.num_conv_pos_embeddings) self.activation = ACT2FN[config.feat_extract_activation] def forward(self, hidden_states): - hidden_states = hidden_states.swapaxes(1, 2) + hidden_states = ops.transpose(hidden_states, 1, 2) hidden_states = self.conv(hidden_states) hidden_states = self.padding(hidden_states) hidden_states = self.activation(hidden_states) - hidden_states = hidden_states.swapaxes(1, 2) + hidden_states = ops.transpose(hidden_states, 1, 2) return hidden_states @@ -427,7 +323,7 @@ def __init__(self, config): self._requires_grad = True def _freeze_parameters(self): - for param in self.get_parameters(): + for param in self.parameters(): param.requires_grad = False self._requires_grad = False @@ -467,7 +363,7 @@ def __init__(self, config): super().__init__() self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps) self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size) - self.dropout = nn.Dropout(p=config.feat_proj_dropout) + self.dropout = nn.Dropout(config.feat_proj_dropout) def forward(self, hidden_states): # non-projected hidden states are needed for quantization @@ -510,7 +406,7 @@ def __init__( self.num_buckets = num_buckets self.max_distance = max_distance - self.gru_rel_pos_const = mindspore.Parameter(ops.ones(1, self.num_heads, 1, 1)) + self.gru_rel_pos_const = nn.Parameter(ops.ones(1, self.num_heads, 1, 1)) self.gru_rel_pos_linear = nn.Linear(self.head_dim, 8) if has_relative_position_bias: @@ -531,7 +427,7 @@ def forward( if position_bias is None: position_bias = self.compute_bias(tgt_len, tgt_len) position_bias = ( - position_bias.unsqueeze(0).repeat(bsz, 1, 1, 1).view(bsz * self.num_heads, tgt_len, tgt_len) + position_bias.unsqueeze(0).tile((bsz, 1, 1, 1)).view(bsz * self.num_heads, tgt_len, tgt_len) ) # Compute relative position bias: @@ -544,7 +440,7 @@ def forward( relative_position_proj = relative_position_proj.view(gated_hidden_states.shape[:-1] + (2, 4)).sum(-1) # 3) compute gate for position bias from projected hidden states - gate_a, gate_b = ops.sigmoid(relative_position_proj).chunk(2, axis=-1) + gate_a, gate_b = 
ops.chunk(ops.sigmoid(relative_position_proj), 2, dim=-1) gate_output = gate_a * (gate_b * self.gru_rel_pos_const - 1.0) + 2.0 # 4) apply gate to position bias to compute gated position_bias @@ -555,7 +451,6 @@ def forward( hidden_states, attention_mask, gated_position_bias, output_attentions ) - return attn_output, attn_weights, position_bias def torch_multi_head_self_attention( @@ -567,14 +462,13 @@ def torch_multi_head_self_attention( ) -> (mindspore.Tensor, mindspore.Tensor): """simple wrapper around torch's multi_head_attention_forward function""" # self-attention assumes q = k = v - query = key = value = hidden_states.swapaxes(0, 1) + query = key = value = ops.transpose(hidden_states, 0, 1) key_padding_mask = attention_mask.ne(1) if attention_mask is not None else None # disable bias and add_zero_attn bias_k = bias_v = None add_zero_attn = False - # PyTorch 1.3.0 has F.multi_head_attention_forward defined # so no problem with backwards compatibility attn_output, attn_weights = F.multi_head_attention_forward( @@ -583,7 +477,7 @@ def torch_multi_head_self_attention( value, self.embed_dim, self.num_heads, - ops.zeros([0]), + ops.empty([0]), ops.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)), bias_k, bias_v, @@ -593,8 +487,8 @@ def torch_multi_head_self_attention( self.out_proj.bias, self.training, key_padding_mask, - # attention_mask, - attn_mask=gated_position_bias, + output_attentions, + gated_position_bias, use_separate_proj_weight=True, q_proj_weight=self.q_proj.weight, k_proj_weight=self.k_proj.weight, @@ -602,7 +496,7 @@ def torch_multi_head_self_attention( ) # [Seq_Len, Batch Size, ...] -> [Batch Size, Seq_Len, ...] - attn_output = attn_output.swapaxes(0, 1) + attn_output = ops.transpose(attn_output, 0, 1) if attn_weights is not None: # IMPORTANT: Attention weights are averaged weights @@ -626,7 +520,7 @@ def compute_bias(self, query_length: int, key_length: int) -> mindspore.Tensor: def _relative_positions_bucket(self, relative_positions: mindspore.Tensor) -> mindspore.Tensor: num_buckets = self.num_buckets // 2 - relative_buckets = (relative_positions > 0).astype(mindspore.int64) * num_buckets + relative_buckets = (relative_positions > 0).to(mindspore.int64) * num_buckets relative_positions = ops.abs(relative_positions) max_exact = num_buckets // 2 @@ -635,17 +529,11 @@ def _relative_positions_bucket(self, relative_positions: mindspore.Tensor) -> mi relative_positions_if_large = ops.log(relative_positions.float() / max_exact) relative_positions_if_large = relative_positions_if_large / math.log(self.max_distance / max_exact) relative_positions_if_large = relative_positions_if_large * (num_buckets - max_exact) - relative_position_if_large = (max_exact + relative_positions_if_large).astype(mindspore.int64) - # relative_position_if_large = ops.min( - # relative_position_if_large, ops.full_like(relative_position_if_large, num_buckets - 1) - # ) - relative_position_if_large = ops.where( - relative_position_if_large < ops.full_like(relative_position_if_large, num_buckets - 1), - relative_position_if_large, - ops.full_like(relative_position_if_large, num_buckets - 1) + relative_position_if_large = (max_exact + relative_positions_if_large).to(mindspore.int64) + relative_position_if_large = ops.minimum( + relative_position_if_large, ops.full_like(relative_position_if_large, num_buckets - 1) ) - relative_buckets += ops.where(is_small, relative_positions, relative_position_if_large) return relative_buckets @@ -654,7 +542,7 @@ def _relative_positions_bucket(self, 
relative_positions: mindspore.Tensor) -> mi class WavLMFeedForward(nn.Module): def __init__(self, config): super().__init__() - self.intermediate_dropout = nn.Dropout(p=config.activation_dropout) + self.intermediate_dropout = nn.Dropout(config.activation_dropout) self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size) if isinstance(config.hidden_act, str): @@ -663,7 +551,7 @@ def __init__(self, config): self.intermediate_act_fn = config.hidden_act self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.output_dropout = nn.Dropout(p=config.hidden_dropout) + self.output_dropout = nn.Dropout(config.hidden_dropout) def forward(self, hidden_states): hidden_states = self.intermediate_dense(hidden_states) @@ -686,7 +574,7 @@ def __init__(self, config: WavLMConfig, has_relative_position_bias: bool = True) max_distance=config.max_bucket_distance, has_relative_position_bias=has_relative_position_bias, ) - self.dropout = nn.Dropout(p=config.hidden_dropout) + self.dropout = nn.Dropout(config.hidden_dropout) self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.feed_forward = WavLMFeedForward(config) self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -700,8 +588,6 @@ def forward(self, hidden_states, attention_mask=None, position_bias=None, output output_attentions=output_attentions, index=index, ) - - hidden_states = self.dropout(hidden_states) hidden_states = attn_residual + hidden_states @@ -715,9 +601,6 @@ def forward(self, hidden_states, attention_mask=None, position_bias=None, output if output_attentions: outputs += (attn_weights,) - - - return outputs @@ -732,7 +615,7 @@ def __init__(self, config: WavLMConfig, has_relative_position_bias: bool = True) max_distance=config.max_bucket_distance, has_relative_position_bias=has_relative_position_bias, ) - self.dropout = nn.Dropout(p=config.hidden_dropout) + self.dropout = nn.Dropout(config.hidden_dropout) self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.feed_forward = WavLMFeedForward(config) self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -764,7 +647,7 @@ def __init__(self, config): self.config = config self.pos_conv_embed = WavLMPositionalConvEmbedding(config) self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(p=config.hidden_dropout) + self.dropout = nn.Dropout(config.hidden_dropout) self.layers = nn.ModuleList( [WavLMEncoderLayer(config, has_relative_position_bias=(i == 0)) for i in range(config.num_hidden_layers)] ) @@ -845,7 +728,7 @@ def __init__(self, config): self.config = config self.pos_conv_embed = WavLMPositionalConvEmbedding(config) self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(p=config.hidden_dropout) + self.dropout = nn.Dropout(config.hidden_dropout) self.layers = nn.ModuleList( [ WavLMEncoderLayerStableLayerNorm(config, has_relative_position_bias=(i == 0)) @@ -938,7 +821,7 @@ def __init__(self, config): ) # storage for codebook variables (codewords) - self.codevectors = mindspore.Parameter( + self.codevectors = nn.Parameter( mindspore.Tensor(1, self.num_groups * self.num_vars, config.codevector_dim // self.num_groups) ) self.weight_proj = nn.Linear(config.conv_dim[-1], self.num_groups * self.num_vars) @@ -948,7 +831,7 @@ def __init__(self, config): @staticmethod def _compute_perplexity(probs): - marginal_probs = probs.mean(axis=0) + 
marginal_probs = probs.mean(dim=0) perplexity = ops.exp(-ops.sum(marginal_probs * ops.log(marginal_probs + 1e-7), dim=-1)).sum() return perplexity @@ -961,7 +844,7 @@ def forward(self, hidden_states): if self.training: # sample code vector probs via gumbel in differentiateable way - codevector_probs = ops.gumbel_softmax(hidden_states.float(), tau=self.temperature, hard=True) + codevector_probs = nn.functional.gumbel_softmax(hidden_states.float(), tau=self.temperature, hard=True) codevector_probs = codevector_probs.type_as(hidden_states) # compute perplexity @@ -1010,14 +893,14 @@ def forward(self, hidden_states): hidden_states = self.proj(hidden_states) hidden_states = self.proj_layer_norm(hidden_states) - hidden_states = hidden_states.swapaxes(1, 2) + hidden_states = ops.transpose(hidden_states, 1, 2) for layer in self.layers: layerdrop_prob = np.random.random() if not self.training or (layerdrop_prob > self.layerdrop): hidden_states = layer(hidden_states) - hidden_states = hidden_states.swapaxes(1, 2) + hidden_states = ops.transpose(hidden_states, 1, 2) return hidden_states @@ -1035,7 +918,7 @@ def __init__(self, config): def forward(self, hidden_states): hidden_states = self.conv(hidden_states) - hidden_states = F.glu(hidden_states, dim=1) + hidden_states = nn.functional.glu(hidden_states, dim=1) return hidden_states @@ -1051,69 +934,38 @@ class WavLMPreTrainedModel(PreTrainedModel): main_input_name = "input_values" supports_gradient_checkpointing = True - def _init_weights(self, cell): + def _init_weights(self, module): """Initialize the weights""" # gumbel softmax requires special init - if isinstance(cell, WavLMGumbelVectorQuantizer): - # module.weight_proj.weight.data.normal_(mean=0.0, std=1) - # module.weight_proj.bias.data.zero_() - # nn.init.uniform_(module.codevectors) - cell.weight_proj.weight.set_data(initializer(Normal(1), - cell.weight.shape, cell.weight.dtype)) - cell.weight_proj.bias.set_data(initializer('zeros', cell.bias.shape, cell.bias.dtype)) - cell.codevectors.set_data(initializer(TruncatedNormal(sigma=0.2, mean=0.5, a=-2.5, b=2.5), - cell.codevectors.shape, cell.codevectors.dtype)) - - elif isinstance(cell, WavLMPositionalConvEmbedding): - # nn.init.normal_( - # module.conv.weight, - # mean=0, - # std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)), - # ) - # nn.init.constant_(module.conv.bias, 0) - cell.conv.weight.set_data(initializer(Normal(2 * math.sqrt(1 / (cell.conv.kernel_size[0] * cell.conv.in_channels))), - cell.conv.weight.shape, cell.conv.weight.dtype)) - cell.conv.bias.set_data(initializer('zeros', cell.conv.bias.shape, cell.conv.bias.dtype)) - - - - elif isinstance(cell, WavLMFeatureProjection): - # k = math.sqrt(1 / module.projection.in_features) - # nn.init.uniform_(module.projection.weight, a=-k, b=k) - # nn.init.uniform_(module.projection.bias, a=-k, b=k) - k = math.sqrt(1 / cell.projection.in_channels) - cell.projection.weight.set_data(initializer(Uniform(scale=k), - cell.projection.weight.shape, cell.projection.weight.dtype)) - cell.projection.bias.set_data(initializer(Uniform(scale=k), - cell.projection.bias.shape, cell.projection.bias.dtype)) - - elif isinstance(cell, nn.Linear): - # module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - # - # if module.bias is not None: - # module.bias.data.zero_() - cell.weight.set_data(initializer(Normal(self.config.initializer_range), - cell.weight.shape, cell.weight.dtype)) - if cell.bias is not None: - cell.bias.set_data(initializer('zeros', cell.bias.shape, 
cell.bias.dtype)) - elif isinstance(cell, (nn.LayerNorm, nn.GroupNorm)): - # module.bias.data.zero_() - # module.weight.data.fill_(1.0) - cell.weight.set_data(initializer('ones', cell.weight.shape, cell.weight.dtype)) - cell.bias.set_data(initializer('zeros', cell.bias.shape, cell.bias.dtype)) - elif isinstance(cell, nn.Conv1d): - # nn.init.kaiming_normal_(module.weight) - # - # if module.bias is not None: - # k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0])) - # nn.init.uniform_(module.bias, a=-k, b=k) - cell.weight.set_data( - initializer(HeNormal(),cell.weight.shape, cell.weight.dtype)) - if cell.bias is not None: - k = math.sqrt(cell.group / (cell.in_channels * cell.kernel_size[0])) - cell.bias.set_data(initializer(Uniform(scale=k), - cell.bias.shape, cell.bias.dtype)) - + if isinstance(module, WavLMGumbelVectorQuantizer): + nn.init.normal_(module.weight_proj.weight, mean=0.0, std=1) + nn.init.zeros_(module.weight_proj.bias) + nn.init.uniform_(module.codevectors) + elif isinstance(module, WavLMPositionalConvEmbedding): + nn.init.normal_( + module.conv.weight, + mean=0, + std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)), + ) + nn.init.constant_(module.conv.bias, 0) + elif isinstance(module, WavLMFeatureProjection): + k = math.sqrt(1 / module.projection.in_features) + nn.init.uniform_(module.projection.weight, a=-k, b=k) + nn.init.uniform_(module.projection.bias, a=-k, b=k) + elif isinstance(module, nn.Linear): + nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range) + + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)): + nn.init.zeros_(module.bias) + nn.init.ones_(module.weight) + elif isinstance(module, nn.Conv1d): + nn.init.kaiming_normal_(module.weight) + + if module.bias is not None: + k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0])) + nn.init.uniform_(module.bias, a=-k, b=k) def _get_feat_extract_output_lengths( self, input_lengths: Union[mindspore.Tensor, int], add_adapter: Optional[bool] = None @@ -1126,7 +978,6 @@ def _get_feat_extract_output_lengths( def _conv_out_length(input_length, kernel_size, stride): # 1D convolutional layer output length formula taken - # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html return ops.div(input_length - kernel_size, stride, rounding_mode="floor") + 1 for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride): @@ -1143,10 +994,10 @@ def _get_feature_vector_attention_mask( ): # Effectively attention_mask.sum(-1), but not inplace to be able to run # on inference mode. 
- non_padded_lengths = attention_mask.cumsum(axis=-1)[:, -1] + non_padded_lengths = ops.cumsum(attention_mask, dim=-1)[:, -1] output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter) - output_lengths = output_lengths.astype(mindspore.int64) + output_lengths = output_lengths.to(mindspore.int64) batch_size = attention_mask.shape[0] @@ -1155,66 +1006,10 @@ def _get_feature_vector_attention_mask( ) # these two operations makes sure that all values before the output lengths idxs are attended to attention_mask[(ops.arange(attention_mask.shape[0]), output_lengths - 1)] = 1 - attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool() + attention_mask = ops.cumsum(attention_mask.flip([-1]), -1).flip([-1]).bool() return attention_mask -WAVLM_START_DOCSTRING = r""" - WavLM was proposed in [WavLM: Unified Speech Representation Learning with Labeled and Unlabeled - Data](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo - Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, - Jian Wu, Michael Zeng, Xiangzhan Yu, Furu Wei. - - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving etc.). - - This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use - it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and - behavior. - - Parameters: - config ([`WavLMConfig`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -WAVLM_INPUTS_DOCSTRING = r""" - Args: - input_values (`mindspore.Tensor` of shape `(batch_size, sequence_length)`): - Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `mindspore.Tensor`. See [`Wav2Vec2Processor.__call__`] for details. - attention_mask (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - - - `attention_mask` should only be passed if the corresponding processor has `config.return_attention_mask == - True`. For all models whose processor has `config.return_attention_mask == False`, `attention_mask` should - **not** be passed to avoid degraded performance when doing batched inference. For such models - `input_values` should simply be padded with 0 and passed without `attention_mask`. Be aware that these - models also yield slightly different results depending on whether `input_values` is padded or not. - - - - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. 
- output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model with Wav2Vec2->WavLM, wav2vec2->wavlm, WAV_2_VEC_2->WAVLM, WavLMBaseModelOutput->Wav2Vec2BaseModelOutput class WavLMModel(WavLMPreTrainedModel): def __init__(self, config: WavLMConfig): @@ -1225,99 +1020,7 @@ def __init__(self, config: WavLMConfig): # model only needs masking vector if mask prob is > 0.0 if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0: - - self.masked_spec_embed_fixed = mindspore.Tensor([0.6690, 0.8174, 0.0483, 0.8542, 0.5385, 0.7270, 0.8509, 0.7227, 0.4435, - 0.9075, 0.5943, 0.5755, 0.2277, 0.5103, 0.1635, 0.6906, 0.3977, 0.9756, - 0.0362, 0.9023, 0.3385, 0.1798, 0.5457, 0.9846, 0.8872, 0.7534, 0.7174, - 0.9129, 0.0361, 0.5914, 0.6458, 0.0551, 0.4543, 0.2475, 0.5665, 0.5622, - 0.7827, 0.2933, 0.4264, 0.2142, 0.8809, 0.7395, 0.8117, 0.8880, 0.9114, - 0.7873, 0.1974, 0.5749, 0.2186, 0.7509, 0.9451, 0.5604, 0.4548, 0.3830, - 0.8748, 0.0481, 0.7892, 0.6930, 0.6757, 0.3346, 0.5754, 0.0830, 0.3630, - 0.3927, 0.4438, 0.3057, 0.2056, 0.6541, 0.8959, 0.3882, 0.3742, 0.6756, - 0.2212, 0.4545, 0.4845, 0.5233, 0.9661, 0.8705, 0.0297, 0.2031, 0.9059, - 0.2570, 0.3765, 0.6301, 0.2756, 0.4591, 0.2101, 0.5576, 0.1532, 0.3753, - 0.6413, 0.1778, 0.5639, 0.7753, 0.4551, 0.7990, 0.1866, 0.0881, 0.5993, - 0.0529, 0.9180, 0.4496, 0.7429, 0.7545, 0.8755, 0.8374, 0.0907, 0.7265, - 0.7455, 0.0652, 0.0794, 0.3860, 0.9730, 0.7865, 0.8821, 0.2630, 0.2690, - 0.6491, 0.0887, 0.4657, 0.8514, 0.0096, 0.6633, 0.7675, 0.9290, 0.9126, - 0.0885, 0.7826, 0.8512, 0.6113, 0.7821, 0.0923, 0.9687, 0.3606, 0.7457, - 0.3216, 0.4239, 0.0411, 0.1968, 0.6589, 0.9997, 0.6803, 0.3238, 0.0318, - 0.3006, 0.0840, 0.3048, 0.7558, 0.5318, 0.0110, 0.6965, 0.9264, 0.8576, - 0.8286, 0.7549, 0.3492, 0.6382, 0.4695, 0.6429, 0.8461, 0.4037, 0.6143, - 0.6750, 0.0130, 0.5454, 0.8819, 0.7204, 0.8509, 0.5713, 0.3463, 0.3251, - 0.1364, 0.9822, 0.1932, 0.4651, 0.8423, 0.0824, 0.0385, 0.6319, 0.4540, - 0.9898, 0.0858, 0.2168, 0.8091, 0.2082, 0.0317, 0.5799, 0.8108, 0.2224, - 0.1679, 0.2297, 0.1149, 0.6511, 0.8530, 0.2673, 0.2593, 0.1479, 0.6914, - 0.1220, 0.2791, 0.2264, 0.3477, 0.0301, 0.4977, 0.9622, 0.9822, 0.1609, - 0.9212, 0.2130, 0.7508, 0.9012, 0.8798, 0.9235, 0.2774, 0.1695, 0.1931, - 0.6583, 0.8880, 0.1824, 0.5290, 0.8476, 0.5914, 0.2393, 0.2043, 0.5509, - 0.4092, 0.5522, 0.1584, 0.1846, 0.5055, 0.3038, 0.2121, 0.1347, 0.8977, - 0.4759, 0.3980, 0.1729, 0.5186, 0.3864, 0.1076, 0.7897, 0.5062, 0.6262, - 0.3445, 0.7281, 0.5154, 0.1098, 0.8532, 0.8998, 0.1109, 0.1660, 0.2890, - 0.3983, 0.9154, 0.2710, 0.6147, 0.1245, 0.2494, 0.1251, 0.6717, 0.4353, - 0.8889, 0.4446, 0.2871, 0.5897, 0.8086, 0.4644, 0.5078, 0.5242, 0.4318, - 0.9208, 0.2187, 0.1061, 0.2322, 0.9779, 0.1891, 0.5374, 0.8748, 0.2969, - 0.9084, 0.4123, 0.2679, 0.1227, 0.2493, 0.0069, 0.4302, 0.7309, 0.6150, - 0.8707, 0.9405, 0.0665, 0.0617, 0.4912, 0.8631, 0.3454, 0.5959, 0.4082, - 0.5628, 0.1539, 0.4820, 0.2230, 0.7901, 0.9863, 0.3853, 0.6251, 0.0294, - 0.5922, 0.4190, 0.1238, 0.9131, 0.7443, 0.7243, 0.2333, 0.5575, 0.9056, - 0.6038, 0.6373, 0.3231, 0.1106, 0.7115, 0.0738, 0.1821, 0.5646, 0.6631, - 0.9203, 0.3644, 0.8854, 0.7089, 0.9513, 0.6969, 0.6221, 0.9998, 0.3835, - 0.1778, 0.8368, 0.4535, 
0.0226, 0.7247, 0.3746, 0.3204, 0.0739, 0.5398, - 0.9403, 0.6918, 0.7779, 0.1451, 0.2665, 0.2724, 0.9406, 0.7556, 0.4615, - 0.9865, 0.9019, 0.4024, 0.0430, 0.5586, 0.0194, 0.4044, 0.8839, 0.6115, - 0.9678, 0.0424, 0.1750, 0.1324, 0.3528, 0.0426, 0.4412, 0.0817, 0.5239, - 0.1943, 0.2168, 0.1862, 0.1268, 0.9675, 0.7493, 0.9916, 0.0120, 0.6652, - 0.3382, 0.1434, 0.0340, 0.5746, 0.2504, 0.6652, 0.4948, 0.9776, 0.8149, - 0.8904, 0.6182, 0.5081, 0.9500, 0.6186, 0.7949, 0.9912, 0.0316, 0.5226, - 0.6809, 0.6388, 0.8631, 0.3738, 0.3314, 0.0405, 0.1620, 0.3713, 0.8028, - 0.9732, 0.9597, 0.3242, 0.2495, 0.2347, 0.2002, 0.5536, 0.1284, 0.7263, - 0.5329, 0.3998, 0.5114, 0.9307, 0.3562, 0.7596, 0.7474, 0.5452, 0.6765, - 0.9079, 0.6698, 0.3373, 0.7954, 0.8829, 0.8574, 0.2378, 0.5754, 0.4218, - 0.4776, 0.6210, 0.0870, 0.7172, 0.4000, 0.7223, 0.3835, 0.0187, 0.6055, - 0.2987, 0.1763, 0.9496, 0.0019, 0.6128, 0.2233, 0.6464, 0.6703, 0.3060, - 0.5027, 0.5011, 0.1066, 0.9224, 0.6772, 0.1122, 0.4799, 0.0956, 0.6784, - 0.2987, 0.4378, 0.8626, 0.1457, 0.8810, 0.2955, 0.3982, 0.9872, 0.2424, - 0.4985, 0.9825, 0.8322, 0.6646, 0.5974, 0.9266, 0.7363, 0.8470, 0.3441, - 0.6455, 0.0959, 0.3900, 0.0110, 0.5135, 0.7431, 0.9956, 0.4753, 0.2459, - 0.1745, 0.4280, 0.3137, 0.5803, 0.8807, 0.0013, 0.2719, 0.2735, 0.0174, - 0.5792, 0.2755, 0.7145, 0.6616, 0.7531, 0.0317, 0.1691, 0.2877, 0.9014, - 0.3965, 0.5576, 0.0569, 0.0952, 0.7354, 0.6605, 0.4193, 0.0895, 0.3981, - 0.5928, 0.1463, 0.7944, 0.8587, 0.8905, 0.5828, 0.8698, 0.0869, 0.5440, - 0.0108, 0.9643, 0.2618, 0.0239, 0.5285, 0.9577, 0.5655, 0.6379, 0.2955, - 0.6893, 0.6071, 0.1768, 0.3647, 0.6052, 0.7924, 0.8311, 0.4018, 0.4684, - 0.7488, 0.9257, 0.1174, 0.9175, 0.2108, 0.7104, 0.0650, 0.9683, 0.1456, - 0.3139, 0.9895, 0.4817, 0.3550, 0.3194, 0.2714, 0.3304, 0.3714, 0.6225, - 0.5636, 0.6906, 0.1564, 0.2612, 0.8385, 0.2389, 0.6572, 0.1156, 0.5804, - 0.3947, 0.0016, 0.2312, 0.0136, 0.2436, 0.7072, 0.4118, 0.6912, 0.1629, - 0.0368, 0.5640, 0.7028, 0.0881, 0.9698, 0.7337, 0.0634, 0.7968, 0.0754, - 0.6724, 0.2065, 0.7023, 0.1979, 0.4276, 0.3267, 0.3916, 0.9641, 0.5335, - 0.3355, 0.5741, 0.9364, 0.7964, 0.2325, 0.4632, 0.0586, 0.4343, 0.9153, - 0.3367, 0.3897, 0.8585, 0.4316, 0.3008, 0.4461, 0.3888, 0.4275, 0.2071, - 0.7893, 0.7605, 0.4429, 0.1573, 0.0303, 0.7489, 0.9437, 0.2839, 0.2179, - 0.3195, 0.4809, 0.1952, 0.8383, 0.0198, 0.8895, 0.4406, 0.9321, 0.5931, - 0.3670, 0.9503, 0.5326, 0.9467, 0.2632, 0.4534, 0.7885, 0.7485, 0.9038, - 0.5202, 0.4448, 0.6610, 0.1788, 0.2415, 0.0186, 0.3090, 0.3962, 0.7363, - 0.5319, 0.0024, 0.5918, 0.0702, 0.3051, 0.3310, 0.6551, 0.7465, 0.2650, - 0.3644, 0.8870, 0.9065, 0.9198, 0.6367, 0.5113, 0.1910, 0.8260, 0.4486, - 0.8939, 0.9591, 0.0051, 0.9798, 0.6846, 0.9752, 0.6470, 0.2136, 0.8094, - 0.1351, 0.6637, 0.1317, 0.5875, 0.3815, 0.3004, 0.5598, 0.2138, 0.2395, - 0.7725, 0.4870, 0.2897, 0.5427, 0.7458, 0.4651, 0.7445, 0.5091, 0.5224, - 0.1761, 0.3968, 0.8253, 0.0378, 0.1911, 0.2917, 0.8945, 0.5533, 0.9208, - 0.9452, 0.5043, 0.4790, 0.6593, 0.4681, 0.5305, 0.2849, 0.7655, 0.8555, - 0.2354, 0.5224, 0.2482, 0.6614, 0.4972, 0.8426, 0.3883, 0.1001, 0.4299, - 0.6966, 0.4446, 0.9288, 0.4683, 0.0273, 0.1940, 0.8093, 0.3530, 0.8765, - 0.8774, 0.7397, 0.6672, 0.8504, 0.9556, 0.9929, 0.3112, 0.7945, 0.2682, - 0.4824, 0.1706, 0.8585, 0.9539, 0.1334, 0.0866, 0.8030, 0.8256, 0.1504, - 0.0553, 0.5819, 0.3482, 0.9587, 0.3867, 0.5643, 0.7611, 0.5880, 0.2536, - 0.6834, 0.3636, 0.3593, 0.1886, 0.2166, 0.0668, 0.8122, 0.2461, 0.5877, - 0.0802, 0.4127, 0.1399]) - 
if config.hidden_size >= 768: - self.masked_spec_embed=self.masked_spec_embed_fixed - else: - self.masked_spec_embed = ops.abs(mindspore.Tensor(shape=(config.hidden_size), dtype=mindspore.float32, init=Uniform(1.0))) - - + self.masked_spec_embed = nn.Parameter(ops.randn(config.hidden_size)) if config.do_stable_layer_norm: self.encoder = WavLMEncoderStableLayerNorm(config) @@ -1335,7 +1038,7 @@ def freeze_feature_extractor(self): not be updated during training. """ warnings.warn( - "The method `freeze_feature_extractor` is deprecated. " + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " "Please use the equivalent `freeze_feature_encoder` method instead.", FutureWarning, ) @@ -1368,7 +1071,7 @@ def _mask_hidden_states( if mask_time_indices is not None: # apply SpecAugment along time axis with given mask_time_indices - hidden_states[mask_time_indices] = self.masked_spec_embed.astype(hidden_states.dtype) + hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) elif self.config.mask_time_prob > 0 and self.training: mask_time_indices = _compute_mask_indices( (batch_size, sequence_length), @@ -1377,10 +1080,10 @@ def _mask_hidden_states( attention_mask=attention_mask, min_masks=self.config.mask_time_min_masks, ) - mask_time_indices = mindspore.Tensor(mask_time_indices, dtype=mindspore.bool_) - hidden_states[mask_time_indices] = self.masked_spec_embed.astype(hidden_states.dtype) + mask_time_indices = mindspore.tensor(mask_time_indices, dtype=mindspore.bool_) + hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) - if self.config.mask_feature_prob > 0: + if self.config.mask_feature_prob > 0 and self.training: # generate indices & apply SpecAugment along feature axis mask_feature_indices = _compute_mask_indices( (batch_size, hidden_size), @@ -1388,11 +1091,10 @@ def _mask_hidden_states( mask_length=self.config.mask_feature_length, min_masks=self.config.mask_feature_min_masks, ) - mask_feature_indices = mindspore.Tensor(mask_feature_indices, dtype=mindspore.bool_) - mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) + mask_feature_indices = mindspore.tensor(mask_feature_indices, dtype=mindspore.bool_) + mask_feature_indices = mask_feature_indices[:, None].broadcast_to((-1, sequence_length, -1)) hidden_states[mask_feature_indices] = 0 - return hidden_states def forward( @@ -1411,7 +1113,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict extract_features = self.feature_extractor(input_values) - extract_features = extract_features.swapaxes(1, 2) + extract_features = ops.transpose(extract_features, 1, 2) if attention_mask is not None: # compute reduced attention_mask corresponding to feature vectors @@ -1424,8 +1126,6 @@ def forward( hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask ) - - encoder_outputs = self.encoder( hidden_states, attention_mask=attention_mask, @@ -1436,7 +1136,6 @@ def forward( hidden_states = encoder_outputs[0] - if self.adapter is not None: hidden_states = self.adapter(hidden_states) @@ -1457,7 +1156,7 @@ def __init__(self, config, target_lang: Optional[str] = None): super().__init__(config) self.wavlm = WavLMModel(config) - self.dropout = nn.Dropout(p=config.final_dropout) + self.dropout = nn.Dropout(config.final_dropout) self.target_lang = target_lang @@ -1503,7 +1202,7 @@ def freeze_feature_extractor(self): not be updated during training. 
""" warnings.warn( - "The method `freeze_feature_extractor` is deprecated. " + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " "Please use the equivalent `freeze_feature_encoder` method instead.", FutureWarning, ) @@ -1521,7 +1220,7 @@ def freeze_base_model(self): Calling this function will disable the gradient computation for the base model so that its parameters will not be updated during training. Only the classification head will be updated. """ - for param in self.wavlm.get_parameters(): + for param in self.wavlm.parameters(): param.requires_grad = False def forward( @@ -1534,14 +1233,14 @@ def forward( labels: Optional[mindspore.Tensor] = None, ) -> Union[Tuple, CausalLMOutput]: r""" - Args: - labels (`mindspore.Tensor` of shape `(batch_size, target_length)`, *optional*): - Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to - the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. - All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., - config.vocab_size - 1]`. + labels (`mindspore.Tensor` of shape `(batch_size, target_length)`, *optional*): + Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to + the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. + All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., + config.vocab_size - 1]`. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None and labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") @@ -1564,7 +1263,7 @@ def forward( attention_mask = ( attention_mask if attention_mask is not None else ops.ones_like(input_values, dtype=mindspore.int64) ) - input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).astype(mindspore.int64) + input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(mindspore.int64) # assuming that padded tokens are filled with -100 # when not being attended to @@ -1573,10 +1272,9 @@ def forward( flattened_targets = labels.masked_select(labels_mask) # ctc_loss doesn't support fp16 - log_probs = F.log_softmax(logits.astype(mindspore.float32), dim=-1).swapaxes(0, 1) + log_probs = ops.transpose(nn.functional.log_softmax(logits, dim=-1, dtype=mindspore.float32), 0, 1) - # with torch.backends.cudnn.flags(enabled=False): - loss = F.ctc_loss( + loss = nn.functional.ctc_loss( log_probs, labels, input_lengths, @@ -1606,7 +1304,7 @@ def __init__(self, config): self.wavlm = WavLMModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: - self.layer_weights = mindspore.Parameter(ops.ones(num_layers) / num_layers) + self.layer_weights = nn.Parameter(ops.ones(num_layers) / num_layers) self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size) self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels) @@ -1620,7 +1318,7 @@ def freeze_feature_extractor(self): not be updated during training. """ warnings.warn( - "The method `freeze_feature_extractor` is deprecated. " + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. 
" "Please use the equivalent `freeze_feature_encoder` method instead.", FutureWarning, ) @@ -1640,7 +1338,7 @@ def freeze_base_model(self): Calling this function will disable the gradient computation for the base model so that its parameters will not be updated during training. Only the classification head will be updated. """ - for param in self.wavlm.get_parameters(): + for param in self.wavlm.parameters(): param.requires_grad = False # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.forward with Wav2Vec2->WavLM, wav2vec2->wavlm @@ -1654,11 +1352,10 @@ def forward( labels: Optional[mindspore.Tensor] = None, ) -> Union[Tuple, SequenceClassifierOutput]: r""" - Args: - labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1672,28 +1369,28 @@ def forward( return_dict=return_dict, ) - if self.config.use_weighted_layer_sum: hidden_states = outputs[_HIDDEN_STATES_START_POSITION] hidden_states = ops.stack(hidden_states, dim=1) - norm_weights = ops.softmax(self.layer_weights, dim=-1) - hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(axis=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) else: hidden_states = outputs[0] hidden_states = self.projector(hidden_states) if attention_mask is None: - pooled_output = hidden_states.mean(axis=1) + pooled_output = ops.mean(hidden_states, dim=1) else: padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask) hidden_states[~padding_mask] = 0.0 - pooled_output = hidden_states.sum(axis=1) / padding_mask.sum(axis=1).view(-1, 1) + pooled_output = ops.sum(hidden_states, dim=1) / ops.sum(padding_mask, dim=1).view(-1, 1) logits = self.classifier(pooled_output) loss = None if labels is not None: - loss = F.cross_entropy(logits.view(-1, self.config.num_labels), labels.view(-1)) + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) if not return_dict: output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] @@ -1706,6 +1403,7 @@ def forward( attentions=outputs.attentions, ) + # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForAudioFrameClassification with Wav2Vec2->WavLM, wav2vec2->wavlm, WAV_2_VEC_2->WAVLM class WavLMForAudioFrameClassification(WavLMPreTrainedModel): def __init__(self, config): @@ -1718,7 +1416,7 @@ def __init__(self, config): self.wavlm = WavLMModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: - self.layer_weights = mindspore.Parameter(ops.ones(num_layers) / num_layers) + self.layer_weights = nn.Parameter(ops.ones(num_layers) / num_layers) self.classifier = 
nn.Linear(config.hidden_size, config.num_labels) self.num_labels = config.num_labels @@ -1730,7 +1428,7 @@ def freeze_feature_extractor(self): not be updated during training. """ warnings.warn( - "The method `freeze_feature_extractor` is deprecated. " + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " "Please use the equivalent `freeze_feature_encoder` method instead.", FutureWarning, ) @@ -1748,7 +1446,7 @@ def freeze_base_model(self): Calling this function will disable the gradient computation for the base model so that its parameters will not be updated during training. Only the classification head will be updated. """ - for param in self.wavlm.get_parameters(): + for param in self.wavlm.parameters(): param.requires_grad = False def forward( @@ -1761,11 +1459,10 @@ def forward( return_dict: Optional[bool] = None, ) -> Union[Tuple, TokenClassifierOutput]: r""" - Args: - labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1782,8 +1479,8 @@ def forward( if self.config.use_weighted_layer_sum: hidden_states = outputs[_HIDDEN_STATES_START_POSITION] hidden_states = ops.stack(hidden_states, dim=1) - norm_weights = ops.softmax(self.layer_weights, dim=-1) - hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(axis=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) else: hidden_states = outputs[0] @@ -1791,8 +1488,8 @@ def forward( loss = None if labels is not None: - loss = F.cross_entropy(logits.view(-1, self.num_labels), ops.argmax(labels.view(-1, self.num_labels), dim=1)) - + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), ops.argmax(labels.view(-1, self.num_labels), dim=1)) if not return_dict: output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] @@ -1813,19 +1510,19 @@ def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4): self.scale = scale self.margin = margin self.num_labels = num_labels - self.weight = mindspore.Parameter(ops.randn(input_dim, num_labels), requires_grad=True) - # self.loss = nn.CrossEntropyLoss() + self.weight = nn.Parameter(ops.randn(input_dim, num_labels), requires_grad=True) + self.loss = nn.CrossEntropyLoss() def forward(self, hidden_states, labels): labels = labels.flatten() - weight = F.normalize(self.weight, dim=0) - hidden_states = F.normalize(hidden_states, dim=1) + weight = nn.functional.normalize(self.weight, dim=0) + hidden_states = nn.functional.normalize(hidden_states, dim=1) cos_theta = ops.mm(hidden_states, weight) psi = cos_theta - self.margin - onehot = F.one_hot(labels, self.num_labels) + onehot = nn.functional.one_hot(labels, self.num_labels) logits = self.scale * ops.where(onehot.bool(), psi, cos_theta) - loss = 
F.cross_entropy(logits, labels) + loss = self.loss(logits, labels) return loss @@ -1843,20 +1540,19 @@ def __init__(self, config, layer_id=0): self.activation = nn.ReLU() def forward(self, hidden_states: mindspore.Tensor) -> mindspore.Tensor: - # if is_peft_available(): - # from peft.tuners.lora import LoraLayer - # - # if isinstance(self.kernel, LoraLayer): - # warnings.warn( - # "Detected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. " - # "You should exclude TDNNLayer from LoRA's target modules.", - # ) + from ....peft.tuners.lora import LoraLayer + + if isinstance(self.kernel, LoraLayer): + warnings.warn( + "Detected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. " + "You should exclude TDNNLayer from LoRA's target modules.", + ) # for backward compatibility, we keep nn.Linear but call F.conv1d for speed up - hidden_states = hidden_states.swapaxes(1, 2) - weight = self.kernel.weight.view(self.out_conv_dim, self.kernel_size, self.in_conv_dim).swapaxes(1, 2) - hidden_states = ops.conv1d(hidden_states, weight, self.kernel.bias, dilation=self.dilation) - hidden_states = hidden_states.swapaxes(1, 2) + hidden_states = ops.transpose(hidden_states, 1, 2) + weight = ops.transpose(self.kernel.weight.view(self.out_conv_dim, self.kernel_size, self.in_conv_dim), 1, 2) + hidden_states = nn.functional.conv1d(hidden_states, weight, self.kernel.bias, dilation=self.dilation) + hidden_states = ops.transpose(hidden_states, 1, 2) hidden_states = self.activation(hidden_states) return hidden_states @@ -1870,7 +1566,7 @@ def __init__(self, config): self.wavlm = WavLMModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: - self.layer_weights = mindspore.Parameter(ops.ones(num_layers) / num_layers) + self.layer_weights = nn.Parameter(ops.ones(num_layers) / num_layers) self.projector = nn.Linear(config.hidden_size, config.tdnn_dim[0]) tdnn_layers = [TDNNLayer(config, i) for i in range(len(config.tdnn_dim))] @@ -1889,7 +1585,7 @@ def freeze_feature_extractor(self): not be updated during training. """ warnings.warn( - "The method `freeze_feature_extractor` is deprecated. " + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " "Please use the equivalent `freeze_feature_encoder` method instead.", FutureWarning, ) @@ -1907,7 +1603,7 @@ def freeze_base_model(self): Calling this function will disable the gradient computation for the base model so that its parameters will not be updated during training. Only the classification head will be updated. """ - for param in self.wavlm.get_parameters(): + for param in self.wavlm.parameters(): param.requires_grad = False def _get_tdnn_output_lengths(self, input_lengths: Union[mindspore.Tensor, int]): @@ -1917,7 +1613,6 @@ def _get_tdnn_output_lengths(self, input_lengths: Union[mindspore.Tensor, int]): def _conv_out_length(input_length, kernel_size, stride): # 1D convolutional layer output length formula taken - # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html return (input_length - kernel_size) // stride + 1 for kernel_size in self.config.tdnn_kernel: @@ -1935,11 +1630,10 @@ def forward( labels: Optional[mindspore.Tensor] = None, ) -> Union[Tuple, XVectorOutput]: r""" - Args: - labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. 
If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1956,8 +1650,8 @@ def forward( if self.config.use_weighted_layer_sum: hidden_states = outputs[_HIDDEN_STATES_START_POSITION] hidden_states = ops.stack(hidden_states, dim=1) - norm_weights = ops.softmax(self.layer_weights, dim=-1) - hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(axis=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) else: hidden_states = outputs[0] @@ -1968,15 +1662,15 @@ def forward( # Statistic Pooling if attention_mask is None: - mean_features = hidden_states.mean(axis=1) + mean_features = ops.mean(hidden_states, dim=1) std_features = ops.std(hidden_states, dim=1) else: - feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(axis=1)) + feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(dim=1)) tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths) mean_features = [] std_features = [] for i, length in enumerate(tdnn_output_lengths): - mean_features.append(hidden_states[i, :length].mean(axis=0)) + mean_features.append(ops.mean(hidden_states[i, :length], dim=0)) std_features.append(ops.std(hidden_states[i, :length], dim=0)) mean_features = ops.stack(mean_features) std_features = ops.stack(std_features) diff --git a/tests/ut/transformers/models/deta/test_modeling_deta.py b/tests/ut/transformers/models/deta/test_modeling_deta.py index 54318889b..f7fbed3a7 100644 --- a/tests/ut/transformers/models/deta/test_modeling_deta.py +++ b/tests/ut/transformers/models/deta/test_modeling_deta.py @@ -370,6 +370,10 @@ def test_resize_tokens_embeddings(self): def test_feed_forward_chunking(self): pass + @unittest.skip(reason="grid_sampler_2d_grad_cpu_kernel.h:162] store] memcpy_s failed. errorno is: 34") + def test_training(self): + pass + def test_attention_outputs(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True diff --git a/tests/ut/transformers/models/wav2vec2/test_modeling_wav2vec2.py b/tests/ut/transformers/models/wav2vec2/test_modeling_wav2vec2.py index 1c183c6c7..ea6268be1 100644 --- a/tests/ut/transformers/models/wav2vec2/test_modeling_wav2vec2.py +++ b/tests/ut/transformers/models/wav2vec2/test_modeling_wav2vec2.py @@ -1,4 +1,3 @@ - # coding=utf-8 # Copyright 2021 The HuggingFace Inc. team. All rights reserved. # @@ -13,63 +12,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-# pylint: disable=missing-class-docstring -# pylint: disable=missing-function-docstring -# pylint: disable=unused-argument -# pylint: disable=unused-variable -# pylint: disable=invalid-name -# pylint: disable=consider-using-enumerate -# pylint: disable=import-error -# pylint: disable=redefined-builtin -# pylint: disable=ungrouped-imports -""" Testing suite for the PyTorch Wav2Vec2 model. """ +"""Testing suite for the PyTorch Wav2Vec2 model.""" import gc import math import multiprocessing import os +import pickle import tempfile import traceback import unittest import numpy as np -import librosa as L from datasets import load_dataset +from pytest import mark -import mindspore as ms -import mindspore.ops as F -import mindspore.numpy as mnp -from mindspore import nn -from mindspore import Tensor - -from mindnlp.transformers import Wav2Vec2Config +from mindnlp.transformers import Wav2Vec2Config, is_mindspore_available from mindnlp.utils.testing_utils import ( CaptureLogger, is_pyctcdecode_available, - require_librosa, - require_mindspore, require_pyctcdecode, + require_soundfile, + require_mindspore, run_test_in_subprocess, slow, ) -from mindnlp.transformers import ( - Wav2Vec2FeatureExtractor, - Wav2Vec2ForAudioFrameClassification, - Wav2Vec2ForCTC, - Wav2Vec2ForMaskedLM, - Wav2Vec2ForPreTraining, - Wav2Vec2ForSequenceClassification, - Wav2Vec2ForXVector, - Wav2Vec2Model, - Wav2Vec2Processor, -) -from mindnlp.transformers.models.wav2vec2.modeling_wav2vec2 import ( - WAV2VEC2_ADAPTER_PT_FILE, - WAV2VEC2_ADAPTER_SAFE_FILE, - Wav2Vec2GumbelVectorQuantizer, - _compute_mask_indices, - _sample_negative_indices, -) from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( @@ -79,8 +46,34 @@ ids_tensor, random_attention_mask, ) +# from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_mindspore_available(): + import mindspore + from mindnlp.core import ops, no_grad, nn + from mindnlp.core.nn import functional + from mindnlp.core.serialization import safe_save_file, save + + from mindnlp.transformers import ( + Wav2Vec2FeatureExtractor, + Wav2Vec2ForAudioFrameClassification, + Wav2Vec2ForCTC, + Wav2Vec2ForMaskedLM, + Wav2Vec2ForPreTraining, + Wav2Vec2ForSequenceClassification, + Wav2Vec2ForXVector, + Wav2Vec2Model, + Wav2Vec2Processor, + ) + from mindnlp.transformers.models.wav2vec2.modeling_wav2vec2 import ( + WAV2VEC2_ADAPTER_PT_FILE, + WAV2VEC2_ADAPTER_SAFE_FILE, + Wav2Vec2GumbelVectorQuantizer, + _compute_mask_indices, + _sample_negative_indices, + ) -mnp.allclose = lambda x, y, *args, **kwargs: np.allclose(x.asnumpy(), y.asnumpy(), *args, **kwargs) if is_pyctcdecode_available(): import pyctcdecode.decoder @@ -89,27 +82,32 @@ from mindnlp.transformers.models.wav2vec2_with_lm import processing_wav2vec2_with_lm + def _test_wav2vec2_with_lm_invalid_pool(in_queue, out_queue, timeout): error = None try: _ = in_queue.get(timeout=timeout) - ds = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True) + ds = load_dataset( + "mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True + ) sample = next(iter(ds)) - resampled_audio = L.resample( - ms.tensor(sample["audio"]["array"]).numpy(), orig_sr=48_000, target_sr=16_000 - ) + resampled_audio = torchaudio.functional.resample( + mindspore.tensor(sample["audio"]["array"]), 48_000, 16_000 + ).numpy() model = Wav2Vec2ForCTC.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm") processor = 
Wav2Vec2ProcessorWithLM.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm") input_values = processor(resampled_audio, return_tensors="ms").input_values - logits = model(input_values).logits + + with no_grad(): + logits = model(input_values).logits # use a spawn pool, which should trigger a warning if different than fork with CaptureLogger(pyctcdecode.decoder.logger) as cl, multiprocessing.get_context("spawn").Pool(1) as pool: - transcription = processor.batch_decode(logits, pool).text + transcription = processor.batch_decode(logits.asnumpy(), pool).text unittest.TestCase().assertIn("Falling back to sequential decoding.", cl.out) unittest.TestCase().assertEqual(transcription[0], "habitan aguas poco profundas y rocosas") @@ -117,11 +115,11 @@ def _test_wav2vec2_with_lm_invalid_pool(in_queue, out_queue, timeout): # force batch_decode to internally create a spawn pool, which should trigger a warning if different than fork multiprocessing.set_start_method("spawn", force=True) with CaptureLogger(processing_wav2vec2_with_lm.logger) as cl: - transcription = processor.batch_decode(logits).text + transcription = processor.batch_decode(logits.asnumpy()).text unittest.TestCase().assertIn("Falling back to sequential decoding.", cl.out) unittest.TestCase().assertEqual(transcription[0], "habitan aguas poco profundas y rocosas") - except: # pylint: disable=bare-except + except Exception: error = f"{traceback.format_exc()}" results = {"error": error} @@ -247,7 +245,7 @@ def get_config(self): def create_and_check_model(self, config, input_values, attention_mask): model = Wav2Vec2Model(config=config) - model.set_train(False) + model.eval() result = model(input_values, attention_mask=attention_mask) self.parent.assertEqual( result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size) @@ -256,7 +254,7 @@ def create_and_check_model(self, config, input_values, attention_mask): def create_and_check_model_with_adapter(self, config, input_values, attention_mask): config.add_adapter = True model = Wav2Vec2Model(config=config) - model.set_train(False) + model.eval() result = model(input_values, attention_mask=attention_mask) self.parent.assertEqual( result.last_hidden_state.shape, (self.batch_size, self.adapter_output_seq_length, self.hidden_size) @@ -266,7 +264,7 @@ def create_and_check_model_with_adapter_for_ctc(self, config, input_values, atte config.add_adapter = True config.output_hidden_size = 2 * config.hidden_size model = Wav2Vec2ForCTC(config=config) - model.set_train(False) + model.eval() result = model(input_values, attention_mask=attention_mask) self.parent.assertEqual( result.logits.shape, (self.batch_size, self.adapter_output_seq_length, self.vocab_size) @@ -276,7 +274,7 @@ def create_and_check_model_with_adapter_proj_dim(self, config, input_values, att config.add_adapter = True config.output_hidden_size = 8 model = Wav2Vec2Model(config=config) - model.set_train(False) + model.eval() result = model(input_values, attention_mask=attention_mask) self.parent.assertEqual( result.last_hidden_state.shape, @@ -289,7 +287,7 @@ def create_and_check_model_with_attn_adapter(self, config, input_values, attenti self.parent.assertIsNotNone(model._get_adapters()) - model.set_train(False) + model.eval() result = model(input_values, attention_mask=attention_mask) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.output_seq_length, self.vocab_size)) @@ -297,10 +295,10 @@ def create_and_check_batch_inference(self, config, input_values, *args): # test 
does not pass for models making use of `group_norm` # check: https://github.com/pytorch/fairseq/issues/3227 model = Wav2Vec2Model(config=config) - model.set_train(False) + model.eval() input_values = input_values[:3] - attention_mask = F.ones(input_values.shape, dtype=ms.bool_) + attention_mask = ops.ones(input_values.shape, dtype=mindspore.bool_) input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] @@ -316,17 +314,19 @@ def create_and_check_batch_inference(self, config, input_values, *args): output = model(input_slice).last_hidden_state batch_output = batch_outputs[i : i + 1, : output.shape[1]] - self.parent.assertTrue(mnp.allclose(output, batch_output, atol=1e-3)) + self.parent.assertTrue(ops.allclose(output, batch_output, atol=1e-3)) def check_ctc_loss(self, config, input_values, *args): model = Wav2Vec2ForCTC(config=config) - model.set_train(False) # make sure that dropout is disabled + + # make sure that dropout is disabled + model.eval() input_values = input_values[:3] - attention_mask = F.ones(input_values.shape, dtype=ms.int64) + attention_mask = ops.ones(input_values.shape, dtype=mindspore.int64) input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(ms.tensor(input_lengths)) + max_length_labels = model._get_feat_extract_output_lengths(mindspore.tensor(input_lengths)) labels = ids_tensor((input_values.shape[0], min(max_length_labels).item() - 1), model.config.vocab_size) # pad input @@ -345,10 +345,12 @@ def check_ctc_loss(self, config, input_values, *args): def check_seq_classifier_loss(self, config, input_values, *args): model = Wav2Vec2ForSequenceClassification(config=config) - model.set_train(False) # make sure that dropout is disabled + + # make sure that dropout is disabled + model.eval() input_values = input_values[:3] - attention_mask = F.ones(input_values.shape, dtype=ms.int64) + attention_mask = ops.ones(input_values.shape, dtype=mindspore.int64) input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) @@ -365,11 +367,10 @@ def check_seq_classifier_loss(self, config, input_values, *args): self.parent.assertTrue(isinstance(unmasked_loss, float)) self.parent.assertTrue(masked_loss != unmasked_loss) - @unittest.skip('ignore train temporarily') def check_ctc_training(self, config, input_values, *args): config.ctc_zero_infinity = True model = Wav2Vec2ForCTC(config=config) - model.set_train(True) + model.train() # freeze feature encoder model.freeze_feature_encoder() @@ -377,7 +378,7 @@ def check_ctc_training(self, config, input_values, *args): input_values = input_values[:3] input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(ms.tensor(input_lengths)) + max_length_labels = model._get_feat_extract_output_lengths(mindspore.tensor(input_lengths)) labels = ids_tensor((input_values.shape[0], max(max_length_labels).item() - 2), model.config.vocab_size) # pad input @@ -390,15 +391,14 @@ def check_ctc_training(self, config, input_values, *args): labels[i, max_length_labels[i] - 1 :] = -100 loss = model(input_values, labels=labels).loss - self.parent.assertFalse(F.isinf(loss).item()) + self.parent.assertFalse(ops.isinf(loss).item()) - # TODO: loss.backward() + loss.backward() - @unittest.skip('ignore train temporarily') def check_seq_classifier_training(self, config, input_values, *args): config.ctc_zero_infinity = True model = 
Wav2Vec2ForSequenceClassification(config=config) - model.set_train(True) + model.train() # freeze everything but the classification head model.freeze_base_model() @@ -413,15 +413,14 @@ def check_seq_classifier_training(self, config, input_values, *args): input_values[i, input_lengths[i] :] = 0.0 loss = model(input_values, labels=labels).loss - self.parent.assertFalse(F.isinf(loss).item()) + self.parent.assertFalse(ops.isinf(loss).item()) - # TODO: loss.backward() + loss.backward() - @unittest.skip('ignore train temporarily') def check_xvector_training(self, config, input_values, *args): config.ctc_zero_infinity = True model = Wav2Vec2ForXVector(config=config) - model.set_train(True) + model.train() # freeze everything but the classification head model.freeze_base_model() @@ -436,18 +435,18 @@ def check_xvector_training(self, config, input_values, *args): input_values[i, input_lengths[i] :] = 0.0 loss = model(input_values, labels=labels).loss - self.parent.assertFalse(F.isinf(loss).item()) + self.parent.assertFalse(ops.isinf(loss).item()) - # TODO: loss.backward() + loss.backward() def check_labels_out_of_vocab(self, config, input_values, *args): model = Wav2Vec2ForCTC(config) - model.set_train(True) + model.train() input_values = input_values[:3] input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(ms.tensor(input_lengths)) + max_length_labels = model._get_feat_extract_output_lengths(mindspore.tensor(input_lengths)) labels = ids_tensor((input_values.shape[0], max(max_length_labels).item() - 2), model.config.vocab_size + 100) with self.parent.assertRaises(ValueError): @@ -461,7 +460,11 @@ def prepare_config_and_inputs_for_common(self): @require_mindspore class Wav2Vec2ModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (Wav2Vec2ForCTC, Wav2Vec2Model, Wav2Vec2ForMaskedLM, Wav2Vec2ForSequenceClassification, Wav2Vec2ForPreTraining) + all_model_classes = ( + (Wav2Vec2ForCTC, Wav2Vec2Model, Wav2Vec2ForMaskedLM, Wav2Vec2ForSequenceClassification, Wav2Vec2ForPreTraining) + if is_mindspore_available() + else () + ) pipeline_model_mapping = ( { "audio-classification": Wav2Vec2ForSequenceClassification, @@ -469,6 +472,8 @@ class Wav2Vec2ModelTest(ModelTesterMixin, unittest.TestCase): "feature-extraction": Wav2Vec2Model, "fill-mask": Wav2Vec2ForMaskedLM, } + if is_mindspore_available() + else {} ) fx_compatible = True test_pruning = False @@ -513,7 +518,7 @@ def test_seq_classifier_train(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.check_seq_classifier_training(*config_and_inputs) - def test_xvector_traintest_xvector_train(self): + def test_xvector_train(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.check_xvector_training(*config_and_inputs) @@ -521,22 +526,19 @@ def test_labels_out_of_vocab(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - # Wav2Vec2 has no inputs_embeds + @unittest.skip(reason="Model has no inputs_embeds") def test_inputs_embeds(self): pass - # `input_ids` is renamed to `input_values` + @unittest.skip(reason="Model has input_values instead of input_ids") def test_forward_signature(self): pass - # Wav2Vec2 cannot resize token embeddings - # since it has no tokens embeddings + @unittest.skip(reason="Model has no tokens embeds") def test_resize_tokens_embeddings(self): pass - # Wav2Vec2 has no inputs_embeds - # and thus 
the `get_input_embeddings` fn - # is not implemented + @unittest.skip(reason="Model has no inputs_embeds") def test_model_get_set_embeddings(self): pass @@ -546,7 +548,7 @@ def test_initialization(self): configs_no_init = _config_zero_init(config) for model_class in self.all_model_classes: model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): + for name, param in model.named_parameters(): uniform_init_parms = [ "conv.weight", "conv.parametrizations.weight", @@ -564,21 +566,36 @@ def test_initialization(self): if param.requires_grad: if any(x in name for x in uniform_init_parms): self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, + -1.0 <= ((param.mean() * 1e9).round() / 1e9).item() <= 1.0, msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) else: self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), + ((param.mean() * 1e9).round() / 1e9).item(), [0.0, 1.0], msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) + # overwrite from test_modeling_common + def _mock_init_weights(self, module): + if hasattr(module, "weight") and module.weight is not None: + module.weight.fill_(3) + if hasattr(module, "weight_g") and module.weight_g is not None: + module.weight_g.fill_(3) + if hasattr(module, "weight_v") and module.weight_v is not None: + module.weight_v.fill_(3) + if hasattr(module, "bias") and module.bias is not None: + module.bias.fill_(3) + if hasattr(module, "codevectors") and module.codevectors is not None: + module.codevectors.fill_(3) + if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: + module.masked_spec_embed.fill_(3) + def test_mask_feature_prob_ctc(self): model = Wav2Vec2ForCTC.from_pretrained( "hf-internal-testing/tiny-random-wav2vec2", mask_feature_prob=0.2, mask_feature_length=2 ) - model.set_train(True) + model.train() processor = Wav2Vec2Processor.from_pretrained( "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True ) @@ -601,7 +618,7 @@ def test_mask_time_prob_ctc(self): model = Wav2Vec2ForCTC.from_pretrained( "hf-internal-testing/tiny-random-wav2vec2", mask_time_prob=0.2, mask_time_length=2 ) - model.set_train(True) + model.train() processor = Wav2Vec2Processor.from_pretrained( "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True ) @@ -633,13 +650,17 @@ def test_model_from_pretrained(self): @require_mindspore class Wav2Vec2RobustModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = ( - Wav2Vec2ForCTC, - Wav2Vec2Model, - Wav2Vec2ForMaskedLM, - Wav2Vec2ForSequenceClassification, - Wav2Vec2ForPreTraining, - Wav2Vec2ForAudioFrameClassification, - Wav2Vec2ForXVector, + ( + Wav2Vec2ForCTC, + Wav2Vec2Model, + Wav2Vec2ForMaskedLM, + Wav2Vec2ForSequenceClassification, + Wav2Vec2ForPreTraining, + Wav2Vec2ForAudioFrameClassification, + Wav2Vec2ForXVector, + ) + if is_mindspore_available() + else () ) test_pruning = False test_headmasking = False @@ -697,22 +718,19 @@ def test_labels_out_of_vocab(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - # Wav2Vec2 has no inputs_embeds + @unittest.skip(reason="Model has no input_embeds") def test_inputs_embeds(self): pass - # `input_ids` is renamed to `input_values` + @unittest.skip(reason="Model has input_values instead of input_ids") def test_forward_signature(self): pass - # Wav2Vec2 cannot resize token embeddings - # since it has no 
tokens embeddings + @unittest.skip(reason="Model has no token embeddings") def test_resize_tokens_embeddings(self): pass - # Wav2Vec2 has no inputs_embeds - # and thus the `get_input_embeddings` fn - # is not implemented + @unittest.skip(reason="Model has no input_embeds") def test_model_get_set_embeddings(self): pass @@ -722,7 +740,7 @@ def test_initialization(self): configs_no_init = _config_zero_init(config) for model_class in self.all_model_classes: model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): + for name, param in model.named_parameters(): uniform_init_parms = [ "conv.weight", "conv.parametrizations.weight", @@ -740,16 +758,31 @@ def test_initialization(self): if param.requires_grad: if any(x in name for x in uniform_init_parms): self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, + -1.0 <= ((param.mean() * 1e9).round() / 1e9).item() <= 1.0, msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) else: self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), + ((param.mean() * 1e9).round() / 1e9).item(), [0.0, 1.0], msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) + # overwrite from test_modeling_common + def _mock_init_weights(self, module): + if hasattr(module, "weight") and module.weight is not None: + module.weight.fill_(3) + if hasattr(module, "weight_g") and module.weight_g is not None: + module.weight_g.fill_(3) + if hasattr(module, "weight_v") and module.weight_v is not None: + module.weight_v.fill_(3) + if hasattr(module, "bias") and module.bias is not None: + module.bias.fill_(3) + if hasattr(module, "codevectors") and module.codevectors is not None: + module.codevectors.fill_(3) + if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: + module.masked_spec_embed.fill_(3) + def test_model_for_pretraining(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() model = Wav2Vec2ForPreTraining(config) @@ -767,8 +800,8 @@ def test_model_for_pretraining(self): ) sampled_negative_indices = _sample_negative_indices(features_shape, 10, mask_time_indices) - mask_time_indices = Tensor.from_numpy(mask_time_indices) - sampled_negative_indices = Tensor.from_numpy(sampled_negative_indices) + mask_time_indices = ops.from_numpy(mask_time_indices) + sampled_negative_indices = ops.from_numpy(sampled_negative_indices) loss = model( inputs_dict["input_values"], @@ -780,8 +813,8 @@ def test_model_for_pretraining(self): # more losses mask_time_indices[:, : mask_time_indices.shape[-1] // 2] = True - sampled_negative_indices = _sample_negative_indices(features_shape, 10, mask_time_indices) - sampled_negative_indices = Tensor.from_numpy(sampled_negative_indices) + sampled_negative_indices = _sample_negative_indices(features_shape, 10, mask_time_indices.asnumpy()) + sampled_negative_indices = ops.from_numpy(sampled_negative_indices) loss_more_masked = model( inputs_dict["input_values"], attention_mask=inputs_dict["attention_mask"], @@ -796,7 +829,7 @@ def test_mask_feature_prob_ctc(self): model = Wav2Vec2ForCTC.from_pretrained( "hf-internal-testing/tiny-random-wav2vec2", mask_feature_prob=0.2, mask_feature_length=2 ) - model.set_train(True) + model.train() processor = Wav2Vec2Processor.from_pretrained( "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True ) @@ -819,7 +852,7 @@ def test_mask_time_prob_ctc(self): model = Wav2Vec2ForCTC.from_pretrained( 
"hf-internal-testing/tiny-random-wav2vec2", mask_time_prob=0.2, mask_time_length=2 ) - model.set_train(True) + model.train() processor = Wav2Vec2Processor.from_pretrained( "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True ) @@ -846,7 +879,7 @@ def test_mask_time_feature_prob_ctc_single_batch(self): mask_time_length=2, mask_feature_length=2, ) - model.set_train(True) + model.train() processor = Wav2Vec2Processor.from_pretrained( "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True ) @@ -869,9 +902,10 @@ def test_mask_time_feature_prob_ctc_single_batch(self): def test_feed_forward_chunking(self): pass - @unittest.skip(reason="resource url unavailable") def test_load_and_set_attn_adapter(self): - processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True) + processor = Wav2Vec2Processor.from_pretrained( + "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True + ) def get_logits(model, input_features): batch = processor( @@ -880,10 +914,12 @@ def get_logits(model, input_features): sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="ms", ) - logits = model( - input_values=batch["input_values"], - attention_mask=batch["attention_mask"], - ).logits + + with no_grad(): + logits = model( + input_values=batch["input_values"], + attention_mask=batch["attention_mask"], + ).logits return logits input_features = [np.random.random(16_000 * s) for s in [1, 3, 2, 6]] @@ -897,9 +933,8 @@ def get_logits(model, input_features): logits_2 = get_logits(model_2, input_features) - self.assertTrue(mnp.allclose(logits, logits_2, atol=1e-3)) + self.assertTrue(ops.allclose(logits, logits_2, atol=1e-3)) - @unittest.skip('no torch support') # test that loading adapter weights with mismatched vocab sizes can be loaded def test_load_target_lang_with_mismatched_size(self): processor = Wav2Vec2Processor.from_pretrained( @@ -913,15 +948,19 @@ def get_logits(model, input_features): sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="ms", ) - logits = model( - input_values=batch["input_values"], - attention_mask=batch["attention_mask"], - ).logits + + with no_grad(): + logits = model( + input_values=batch["input_values"], + attention_mask=batch["attention_mask"], + ).logits return logits input_features = [np.random.random(16_000 * s) for s in [1, 3, 2, 6]] - model = Wav2Vec2ForCTC.from_pretrained("hf-internal-testing/tiny-random-wav2vec2-adapter", target_lang="fr", ignore_mismatched_sizes=True) + model = Wav2Vec2ForCTC.from_pretrained( + "hf-internal-testing/tiny-random-wav2vec2-adapter", target_lang="fr", ignore_mismatched_sizes=True + ) logits = get_logits(model, input_features) @@ -930,9 +969,8 @@ def get_logits(model, input_features): logits_2 = get_logits(model_2, input_features) - self.assertTrue(mnp.allclose(logits, logits_2, atol=1e-3)) + self.assertTrue(ops.allclose(logits, logits_2, atol=1e-3)) - @unittest.skip(reason="no pytorch support") def test_load_attn_adapter(self): processor = Wav2Vec2Processor.from_pretrained( "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True @@ -945,10 +983,12 @@ def get_logits(model, input_features): sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="ms", ) - logits = model( - input_values=batch["input_values"], - attention_mask=batch["attention_mask"], - ).logits + + with no_grad(): + logits = model( + input_values=batch["input_values"], + attention_mask=batch["attention_mask"], + ).logits 
return logits input_features = [np.random.random(16_000 * s) for s in [1, 3, 2, 6]] @@ -964,7 +1004,7 @@ def get_logits(model, input_features): # save safe weights safe_filepath = os.path.join(tempdir, WAV2VEC2_ADAPTER_SAFE_FILE.format("eng")) - safe_save_file(adapter_weights, safe_filepath, metadata={"format": "pt"}) # pylint: disable=undefined-variable + safe_save_file(adapter_weights, safe_filepath, metadata={"format": "ms"}) model.load_adapter("eng") model.load_adapter("eng", use_safetensors=True) @@ -975,7 +1015,7 @@ def get_logits(model, input_features): model.load_adapter("ita", use_safetensors=True) logits_2 = get_logits(model, input_features) - self.assertTrue(mnp.allclose(logits, logits_2, atol=1e-3)) + self.assertTrue(ops.allclose(logits, logits_2, atol=1e-3)) with tempfile.TemporaryDirectory() as tempdir: model.save_pretrained(tempdir) @@ -986,7 +1026,7 @@ def get_logits(model, input_features): # save pt weights pt_filepath = os.path.join(tempdir, WAV2VEC2_ADAPTER_PT_FILE.format("eng")) - F.save(adapter_weights, pt_filepath) + save(adapter_weights, pt_filepath) model.load_adapter("eng") model.load_adapter("eng", use_safetensors=False) @@ -996,7 +1036,7 @@ def get_logits(model, input_features): logits_2 = get_logits(model, input_features) - self.assertTrue(mnp.allclose(logits, logits_2, atol=1e-3)) + self.assertTrue(ops.allclose(logits, logits_2, atol=1e-3)) model = Wav2Vec2ForCTC.from_pretrained("hf-internal-testing/tiny-random-wav2vec2-adapter") logits = get_logits(model, input_features) @@ -1007,7 +1047,7 @@ def get_logits(model, input_features): logits_2 = get_logits(model, input_features) - self.assertTrue(mnp.allclose(logits, logits_2, atol=1e-3)) + self.assertTrue(ops.allclose(logits, logits_2, atol=1e-3)) @slow def test_model_from_pretrained(self): @@ -1024,7 +1064,7 @@ def test_compute_mask_indices(self): mask_length = 1 mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = Tensor.from_numpy(mask) + mask = ops.from_numpy(mask) self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)]) @@ -1043,9 +1083,10 @@ def test_compute_mask_indices_low_prob(self): for _ in range(n_trials): mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = Tensor.from_numpy(mask) + mask = ops.from_numpy(mask) + + num_masks = ops.sum(mask).item() - num_masks = F.sum(mask).item() if num_masks > 0: count_dimensions_masked += 1 else: @@ -1064,7 +1105,7 @@ def test_compute_mask_indices_overlap(self): mask_length = 4 mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = Tensor.from_numpy(mask) + mask = ops.from_numpy(mask) # because of overlap mask don't have to add up exactly to `mask_prob * sequence_length`, but have to be smaller or equal for batch_sum in mask.sum(axis=-1): @@ -1076,44 +1117,27 @@ def test_compute_mask_indices_attn_mask_overlap(self): mask_prob = 0.5 mask_length = 4 - attention_mask = F.ones((batch_size, sequence_length), dtype=ms.int64) + attention_mask = ops.ones((batch_size, sequence_length), dtype=mindspore.int64) attention_mask[:2, sequence_length // 2 :] = 0 mask = _compute_mask_indices( (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask ) - mask = Tensor.from_numpy(mask) + mask = ops.from_numpy(mask) for batch_sum in mask.sum(axis=-1): self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) self.assertTrue(mask[:2, sequence_length // 2 :].sum() == 0) - def 
test_compute_mask_indices_short_audio(self): - batch_size = 4 - sequence_length = 100 - mask_prob = 0.05 - mask_length = 10 - - attention_mask = F.ones((batch_size, sequence_length), dtype=ms.int64) - # force one example to be heavily padded - attention_mask[0, 5:] = 0 - - mask = _compute_mask_indices( - (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask, min_masks=2 - ) - - # make sure that non-padded examples cannot be padded - self.assertFalse(mask[0][attention_mask[0].astype(ms.bool_).asnumpy()].any()) - def test_compute_perplexity(self): - probs = F.arange(100, dtype=ms.float32).reshape(2, 5, 10) / 100 + probs = ops.arange(100).reshape(2, 5, 10) / 100 ppl = Wav2Vec2GumbelVectorQuantizer._compute_perplexity(probs) self.assertTrue(abs(ppl.item() - 141.4291) < 1e-3) # mask half of the input - mask = F.ones((2,), dtype=ms.bool_) + mask = ops.ones((2,), dtype=mindspore.bool_) mask[0] = 0 ppl = Wav2Vec2GumbelVectorQuantizer._compute_perplexity(probs, mask) @@ -1124,15 +1148,15 @@ def test_sample_negatives(self): sequence_length = 10 hidden_size = 4 num_negatives = 3 - sequence = F.div( - F.arange(sequence_length * hidden_size), hidden_size, rounding_mode="floor" + sequence = ops.div( + ops.arange(sequence_length * hidden_size), hidden_size, rounding_mode="floor" ) features = sequence.view(sequence_length, hidden_size) # each value in vector consits of same value - features = features[None, :].expand(batch_size, sequence_length, hidden_size) + features = features[None, :].broadcast_to((batch_size, sequence_length, hidden_size)) # sample negative indices sampled_negative_indices = _sample_negative_indices((batch_size, sequence_length), num_negatives, None) - sampled_negative_indices = Tensor.from_numpy(sampled_negative_indices) + sampled_negative_indices = ops.from_numpy(sampled_negative_indices) negatives = features.view(-1, hidden_size)[sampled_negative_indices.long().view(-1)] negatives = negatives.view(batch_size, sequence_length, -1, hidden_size).permute(2, 0, 1, 3) self.assertTrue(negatives.shape == (num_negatives, batch_size, sequence_length, hidden_size)) @@ -1142,13 +1166,7 @@ def test_sample_negatives(self): self.assertTrue(((negative - features) == 0).sum() == 0.0) # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim - #self.assertEqual(negatives.unique(dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) - # NOTE: which means [:, :, :, i] is equal for all i - self.assertEqual(negatives.shape[:-1], (num_negatives, batch_size, sequence_length)) - ref = negatives[:, :, :, 0] - for i in range(1, negatives.shape[-1]): - x = negatives[:, :, :, i] - self.assertTrue(F.all(ref == x)) + self.assertEqual(ops.unique(negatives, dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) def test_sample_negatives_with_mask(self): batch_size = 2 @@ -1157,27 +1175,28 @@ def test_sample_negatives_with_mask(self): num_negatives = 3 # second half of last input tensor is padded - mask = F.ones((batch_size, sequence_length), dtype=ms.int64) + mask = ops.ones((batch_size, sequence_length), dtype=mindspore.int64) mask[-1, sequence_length // 2 :] = 0 - sequence = F.div( - F.arange(sequence_length * hidden_size), hidden_size, rounding_mode="floor" + sequence = ops.div( + ops.arange(sequence_length * hidden_size), hidden_size, rounding_mode="floor" ) features = sequence.view(sequence_length, hidden_size) # each value in vector consits of same value - features = 
features[None, :].expand(batch_size, sequence_length, hidden_size) + features = features[None, :].broadcast_to((batch_size, sequence_length, hidden_size)) # replace masked feature vectors with -100 to test that those are not sampled - features = F.where(mask[:, :, None].expand(features.shape).bool(), features, -100) + features = ops.where(mask[:, :, None].broadcast_to(features.shape).bool(), features, -100) # sample negative indices sampled_negative_indices = _sample_negative_indices( - (batch_size, sequence_length), num_negatives, mask + (batch_size, sequence_length), num_negatives, mask.asnumpy() ) - sampled_negative_indices = Tensor.from_numpy(sampled_negative_indices) + sampled_negative_indices = ops.from_numpy(sampled_negative_indices) negatives = features.view(-1, hidden_size)[sampled_negative_indices.long().view(-1)] negatives = negatives.view(batch_size, sequence_length, -1, hidden_size).permute(2, 0, 1, 3) self.assertTrue((negatives >= 0).all().item()) + self.assertTrue(negatives.shape == (num_negatives, batch_size, sequence_length, hidden_size)) # make sure no negatively sampled vector is actually a positive one @@ -1185,17 +1204,11 @@ def test_sample_negatives_with_mask(self): self.assertTrue(((negative - features) == 0).sum() == 0.0) # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim - #self.assertEqual(negatives.unique(dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) - # NOTE: which means [:, :, :, i] is equal for all i - self.assertEqual(negatives.shape[:-1], (num_negatives, batch_size, sequence_length)) - ref = negatives[:, :, :, 0] - for i in range(1, negatives.shape[-1]): - print('i = ', i) - x = negatives[:, :, :, i] - self.assertTrue(F.all(ref == x)) + self.assertEqual(ops.unique(negatives, dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) + @require_mindspore -@require_librosa +@require_soundfile @slow class Wav2Vec2ModelIntegrationTest(unittest.TestCase): def tearDown(self): @@ -1204,15 +1217,17 @@ def tearDown(self): gc.collect() def _load_datasamples(self, num_samples): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").filter( lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] )[:num_samples]["audio"] + return [x["array"] for x in speech_samples] def _load_superb(self, task, num_samples): ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True) + return ds[:num_samples] def test_inference_ctc_normal(self): @@ -1221,9 +1236,11 @@ def test_inference_ctc_normal(self): input_speech = self._load_datasamples(1) input_values = processor(input_speech, return_tensors="ms").input_values - logits = model(input_values).logits - predicted_ids = F.argmax(logits, dim=-1) + with no_grad(): + logits = model(input_values).logits + + predicted_ids = ops.argmax(logits, dim=-1) predicted_trans = processor.batch_decode(predicted_ids) EXPECTED_TRANSCRIPTIONS = ["a man said to the universe sir i exist"] @@ -1234,11 +1251,15 @@ def test_inference_ctc_normal_batched(self): processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h", do_lower_case=True) input_speech = self._load_datasamples(2) + inputs = processor(input_speech, return_tensors="ms", padding=True) + input_values = 
inputs.input_values - logits = model(input_values).logits - predicted_ids = F.argmax(logits, dim=-1) + with no_grad(): + logits = model(input_values).logits + + predicted_ids = ops.argmax(logits, dim=-1) predicted_trans = processor.batch_decode(predicted_ids) EXPECTED_TRANSCRIPTIONS = [ @@ -1252,13 +1273,16 @@ def test_inference_ctc_robust_batched(self): processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self", do_lower_case=True) input_speech = self._load_datasamples(4) + inputs = processor(input_speech, return_tensors="ms", padding=True) input_values = inputs.input_values attention_mask = inputs.attention_mask - logits = model(input_values, attention_mask=attention_mask).logits - predicted_ids = F.argmax(logits, dim=-1) + with no_grad(): + logits = model(input_values, attention_mask=attention_mask).logits + + predicted_ids = ops.argmax(logits, dim=-1) predicted_trans = processor.batch_decode(predicted_ids) EXPECTED_TRANSCRIPTIONS = [ @@ -1270,7 +1294,6 @@ def test_inference_ctc_robust_batched(self): ] self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) - @unittest.skipIf(ms.get_context('device_target') != "CPU", "cannot make deterministic on GPU") def test_inference_integration(self): model = Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-base") feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base") @@ -1290,11 +1313,13 @@ def test_inference_integration(self): model.config.mask_time_length, min_masks=2, ) - mask_time_indices = Tensor.from_numpy(mask_time_indices) - outputs = model( - inputs_dict.input_values, - mask_time_indices=mask_time_indices, - ) + mask_time_indices = ops.from_numpy(mask_time_indices) + + with no_grad(): + outputs = model( + inputs_dict.input_values, + mask_time_indices=mask_time_indices, + ) # compute cosine similarity cosine_sim = F.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1) @@ -1305,7 +1330,7 @@ def test_inference_integration(self): # cosine similarity of model is all > 0.5 as model is # pre-trained on contrastive loss # fmt: off - expected_cosine_sim_masked = ms.tensor([ + expected_cosine_sim_masked = mindspore.tensor([ 0.8523, 0.5860, 0.6905, 0.5557, 0.7456, 0.5249, 0.6639, 0.7654, 0.7565, 0.8167, 0.8222, 0.7960, 0.8034, 0.8166, 0.8310, 0.8263, 0.8274, 0.8258, 0.8179, 0.8412, 0.8536, 0.5098, 0.4728, 0.6461, 0.4498, 0.6002, 0.5774, @@ -1314,7 +1339,7 @@ def test_inference_integration(self): ]) # fmt: on - self.assertTrue(mnp.allclose(cosine_sim_masked, expected_cosine_sim_masked, atol=1e-3)) + self.assertTrue(ops.allclose(cosine_sim_masked, expected_cosine_sim_masked, atol=1e-3)) def test_inference_pretrained(self): model = Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-base") @@ -1330,18 +1355,21 @@ def test_inference_pretrained(self): features_shape = (batch_size, feature_seq_length) + mindspore.manual_seed(0) mask_time_indices = _compute_mask_indices( features_shape, model.config.mask_time_prob, model.config.mask_time_length, min_masks=2, ) - mask_time_indices = Tensor.from_numpy(mask_time_indices) - outputs = model( - inputs_dict.input_values, - attention_mask=inputs_dict.attention_mask, - mask_time_indices=mask_time_indices, - ) + mask_time_indices = ops.from_numpy(mask_time_indices) + + with no_grad(): + outputs = model( + inputs_dict.input_values, + attention_mask=inputs_dict.attention_mask, + mask_time_indices=mask_time_indices, + ) # compute cosine similarity cosine_sim = 
F.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1) @@ -1352,15 +1380,19 @@ def test_inference_pretrained(self): # ... now compare to randomly initialized model config = Wav2Vec2Config.from_pretrained("facebook/wav2vec2-base") - model_rand = Wav2Vec2ForPreTraining(config) - outputs_rand = model_rand( - inputs_dict.input_values, - attention_mask=inputs_dict.attention_mask, - mask_time_indices=mask_time_indices, - ) + model_rand = Wav2Vec2ForPreTraining(config).eval() + + with no_grad(): + outputs_rand = model_rand( + inputs_dict.input_values, + attention_mask=inputs_dict.attention_mask, + mask_time_indices=mask_time_indices, + ) # compute cosine similarity - cosine_sim_rand = F.cosine_similarity(outputs_rand.projected_states, outputs_rand.projected_quantized_states, dim=-1) + cosine_sim_rand = F.cosine_similarity( + outputs_rand.projected_states, outputs_rand.projected_quantized_states, dim=-1 + ) # retrieve cosine sim of masked features cosine_sim_masked_rand = cosine_sim_rand[mask_time_indices] @@ -1371,7 +1403,6 @@ def test_inference_pretrained(self): # => the cosine similarity between quantized states and predicted states is very likely < 0.1 self.assertTrue(cosine_sim_masked.mean().item() - 5 * cosine_sim_masked_rand.mean().item() > 0) - @unittest.skipIf(ms.get_context('device_target') != "CPU", "cannot make deterministic on GPU") def test_loss_pretraining(self): model = Wav2Vec2ForPreTraining.from_pretrained( "facebook/wav2vec2-base", @@ -1380,7 +1411,7 @@ def test_loss_pretraining(self): hidden_dropout=0.0, layerdrop=0.0, ) - model.set_train(True) + model.train() feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( "facebook/wav2vec2-base", return_attention_mask=True @@ -1394,7 +1425,7 @@ def test_loss_pretraining(self): features_shape = (batch_size, feature_seq_length) - ms.set_seed(0) + mindspore.manual_seed(0) np.random.seed(0) mask_time_indices = _compute_mask_indices( @@ -1407,14 +1438,16 @@ def test_loss_pretraining(self): mask_time_indices.shape, model.config.num_negatives, mask_time_indices ) - mask_time_indices = Tensor.from_numpy(mask_time_indices) - sampled_negative_indices = Tensor.from_numpy(sampled_negative_indices) - outputs = model( - inputs_dict.input_values, - attention_mask=inputs_dict.attention_mask, - mask_time_indices=mask_time_indices, - sampled_negative_indices=sampled_negative_indices, - ) + mask_time_indices = ops.from_numpy(mask_time_indices) + sampled_negative_indices = ops.from_numpy(sampled_negative_indices) + + with no_grad(): + outputs = model( + inputs_dict.input_values, + attention_mask=inputs_dict.attention_mask, + mask_time_indices=mask_time_indices, + sampled_negative_indices=sampled_negative_indices, + ) # check diversity loss num_codevectors = model.config.num_codevectors_per_group * model.config.num_codevector_groups @@ -1424,8 +1457,7 @@ def test_loss_pretraining(self): # check overall loss (contrastive loss + diversity loss) expected_loss = 116.7094 - # NOTE: Mindspore's gumbel_softmax differs in implementation detail - #self.assertTrue(abs(outputs.loss.item() - expected_loss) < 1e-3) + self.assertTrue(abs(outputs.loss.item() - expected_loss) < 1e-3) def test_inference_keyword_spotting(self): model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-ks") @@ -1435,15 +1467,16 @@ def test_inference_keyword_spotting(self): input_values = inputs.input_values attention_mask = inputs.attention_mask - outputs = model(input_values, attention_mask=attention_mask) - 
predicted_logits, predicted_ids = F.max(outputs.logits, axis=-1) + with no_grad(): + outputs = model(input_values, attention_mask=attention_mask) + predicted_logits, predicted_ids = ops.max(outputs.logits, dim=-1) expected_labels = [7, 6, 10, 9] # s3prl logits for the same batch - expected_logits = ms.tensor([6.1186, 11.8961, 10.2931, 6.0898]) + expected_logits = mindspore.tensor([6.1186, 11.8961, 10.2931, 6.0898]) self.assertListEqual(predicted_ids.tolist(), expected_labels) - self.assertTrue(mnp.allclose(predicted_logits, expected_logits, atol=1e-2)) + self.assertTrue(ops.allclose(predicted_logits, expected_logits, atol=1e-2)) def test_inference_intent_classification(self): model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-ic") @@ -1453,26 +1486,27 @@ def test_inference_intent_classification(self): input_values = inputs.input_values attention_mask = inputs.attention_mask - outputs = model(input_values, attention_mask=attention_mask) + with no_grad(): + outputs = model(input_values, attention_mask=attention_mask) - predicted_logits_action, predicted_ids_action = F.max(outputs.logits[:, :6], axis=-1) - predicted_logits_object, predicted_ids_object = F.max(outputs.logits[:, 6:20], axis=-1) - predicted_logits_location, predicted_ids_location = F.max(outputs.logits[:, 20:24], axis=-1) + predicted_logits_action, predicted_ids_action = ops.max(outputs.logits[:, :6], dim=-1) + predicted_logits_object, predicted_ids_object = ops.max(outputs.logits[:, 6:20], dim=-1) + predicted_logits_location, predicted_ids_location = ops.max(outputs.logits[:, 20:24], dim=-1) expected_labels_action = [0, 0, 2, 3] - expected_logits_action = ms.tensor([0.4568, 11.0848, 1.6621, 9.3841]) + expected_logits_action = mindspore.tensor([0.4568, 11.0848, 1.6621, 9.3841]) expected_labels_object = [3, 10, 3, 4] - expected_logits_object = ms.tensor([1.5322, 10.7094, 5.2469, 22.1318]) + expected_logits_object = mindspore.tensor([1.5322, 10.7094, 5.2469, 22.1318]) expected_labels_location = [0, 0, 0, 1] - expected_logits_location = ms.tensor([1.5335, 6.5096, 10.5704, 11.0569]) + expected_logits_location = mindspore.tensor([1.5335, 6.5096, 10.5704, 11.0569]) self.assertListEqual(predicted_ids_action.tolist(), expected_labels_action) self.assertListEqual(predicted_ids_object.tolist(), expected_labels_object) self.assertListEqual(predicted_ids_location.tolist(), expected_labels_location) - self.assertTrue(mnp.allclose(predicted_logits_action, expected_logits_action, atol=1e-2)) - self.assertTrue(mnp.allclose(predicted_logits_object, expected_logits_object, atol=1e-2)) - self.assertTrue(mnp.allclose(predicted_logits_location, expected_logits_location, atol=1e-2)) + self.assertTrue(ops.allclose(predicted_logits_action, expected_logits_action, atol=1e-2)) + self.assertTrue(ops.allclose(predicted_logits_object, expected_logits_object, atol=1e-2)) + self.assertTrue(ops.allclose(predicted_logits_location, expected_logits_location, atol=1e-2)) def test_inference_speaker_identification(self): model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-sid") @@ -1480,19 +1514,20 @@ def test_inference_speaker_identification(self): input_data = self._load_superb("si", 4) output_logits = [] - for example in input_data["speech"]: - input = processor(example, return_tensors="ms", padding=True) - output = model(input.input_values, attention_mask=None) - output_logits.append(output.logits[0]) - output_logits = F.stack(output_logits) - predicted_logits, predicted_ids = 
F.max(output_logits, axis=-1) + with no_grad(): + for example in input_data["speech"]: + input = processor(example, return_tensors="ms", padding=True) + output = model(input.input_values, attention_mask=None) + output_logits.append(output.logits[0]) + output_logits = ops.stack(output_logits) + predicted_logits, predicted_ids = ops.max(output_logits, dim=-1) expected_labels = [251, 1, 1, 3] # s3prl logits for the same batch - expected_logits = ms.tensor([37.5627, 71.6362, 64.2419, 31.7778]) + expected_logits = mindspore.tensor([37.5627, 71.6362, 64.2419, 31.7778]) self.assertListEqual(predicted_ids.tolist(), expected_labels) - self.assertTrue(mnp.allclose(predicted_logits, expected_logits, atol=1e-2)) + self.assertTrue(ops.allclose(predicted_logits, expected_logits, atol=1e-2)) def test_inference_emotion_recognition(self): model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-er") @@ -1502,29 +1537,32 @@ def test_inference_emotion_recognition(self): input_values = inputs.input_values attention_mask = inputs.attention_mask - outputs = model(input_values, attention_mask=attention_mask) - predicted_logits, predicted_ids = F.max(outputs.logits, axis=-1) + with no_grad(): + outputs = model(input_values, attention_mask=attention_mask) + predicted_logits, predicted_ids = ops.max(outputs.logits, dim=-1) expected_labels = [1, 1, 2, 2] # s3prl logits for the same batch - expected_logits = ms.tensor([2.1722, 3.0779, 8.0287, 6.6797]) + expected_logits = mindspore.tensor([2.1722, 3.0779, 8.0287, 6.6797]) self.assertListEqual(predicted_ids.tolist(), expected_labels) - self.assertTrue(mnp.allclose(predicted_logits, expected_logits, atol=1e-2)) + self.assertTrue(ops.allclose(predicted_logits, expected_logits, atol=1e-2)) - @unittest.skip("espeak not available on Windows") def test_phoneme_recognition(self): model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft") processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft") input_speech = self._load_datasamples(4) + inputs = processor(input_speech, return_tensors="ms", padding=True) input_values = inputs.input_values attention_mask = inputs.attention_mask - logits = model(input_values, attention_mask=attention_mask).logits - predicted_ids = F.argmax(logits, dim=-1) + with no_grad(): + logits = model(input_values, attention_mask=attention_mask).logits + + predicted_ids = ops.argmax(logits, dim=-1) predicted_trans = processor.batch_decode(predicted_ids) EXPECTED_TRANSCRIPTIONS = [ @@ -1546,44 +1584,50 @@ def test_phoneme_recognition(self): self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) @require_pyctcdecode - @require_librosa def test_wav2vec2_with_lm(self): - ds = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True) + ds = load_dataset( + "mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True + ) sample = next(iter(ds)) - resampled_audio = L.resample( - ms.tensor(sample["audio"]["array"]).numpy(), orig_sr=48_000, target_sr=16_000 - ) + resampled_audio = torchaudio.functional.resample( + mindspore.tensor(sample["audio"]["array"]), 48_000, 16_000 + ).numpy() model = Wav2Vec2ForCTC.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm") processor = Wav2Vec2ProcessorWithLM.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm") input_values = processor(resampled_audio, return_tensors="ms").input_values - logits = 
model(input_values).logits - transcription = processor.batch_decode(logits).text + with no_grad(): + logits = model(input_values).logits + + transcription = processor.batch_decode(logits.asnumpy()).text self.assertEqual(transcription[0], "habitan aguas poco profundas y rocosas") @require_pyctcdecode - @require_librosa def test_wav2vec2_with_lm_pool(self): - ds = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True) + ds = load_dataset( + "mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True + ) sample = next(iter(ds)) - resampled_audio = L.resample( - ms.tensor(sample["audio"]["array"]).numpy(), orig_sr=48_000, target_sr=16_000 - ) + resampled_audio = torchaudio.functional.resample( + mindspore.tensor(sample["audio"]["array"]), 48_000, 16_000 + ).numpy() model = Wav2Vec2ForCTC.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm") processor = Wav2Vec2ProcessorWithLM.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm") input_values = processor(resampled_audio, return_tensors="ms").input_values - logits = model(input_values).logits + + with no_grad(): + logits = model(input_values).logits # test user-managed pool with multiprocessing.get_context("fork").Pool(2) as pool: - transcription = processor.batch_decode(logits, pool).text + transcription = processor.batch_decode(logits.asnumpy(), pool).text self.assertEqual(transcription[0], "habitan aguas poco profundas y rocosas") @@ -1591,7 +1635,7 @@ def test_wav2vec2_with_lm_pool(self): with CaptureLogger(processing_wav2vec2_with_lm.logger) as cl, multiprocessing.get_context("fork").Pool( 2 ) as pool: - transcription = processor.batch_decode(logits, pool, num_processes=2).text + transcription = processor.batch_decode(logits.asnumpy(), pool, num_processes=2).text self.assertIn("num_process", cl.out) self.assertIn("it will be ignored", cl.out) @@ -1599,7 +1643,6 @@ def test_wav2vec2_with_lm_pool(self): self.assertEqual(transcription[0], "habitan aguas poco profundas y rocosas") @require_pyctcdecode - @require_librosa def test_wav2vec2_with_lm_invalid_pool(self): run_test_in_subprocess(test_case=self, target_func=_test_wav2vec2_with_lm_invalid_pool, inputs=None) @@ -1611,12 +1654,13 @@ def test_inference_diarization(self): input_values = inputs.input_values attention_mask = inputs.attention_mask - outputs = model(input_values, attention_mask=attention_mask) + with no_grad(): + outputs = model(input_values, attention_mask=attention_mask) # labels is a one-hot array of shape (num_frames, num_speakers) labels = (outputs.logits > 0).long() # s3prl logits for the same batch - expected_logits = ms.tensor( + expected_logits = mindspore.tensor( [ [[-5.2807, -5.1272], [-5.4059, -4.7757], [-5.2764, -4.9621], [-5.0117, -4.5851]], [[-1.7643, -0.5462], [-1.7369, -0.2649], [-1.5066, -0.6200], [-4.5703, -2.4863]], @@ -1626,7 +1670,7 @@ def test_inference_diarization(self): ) self.assertEqual(labels[0, :, 0].sum(), 555) self.assertEqual(labels[0, :, 1].sum(), 299) - self.assertTrue(mnp.allclose(outputs.logits[:, :4], expected_logits, atol=1e-2)) + self.assertTrue(ops.allclose(outputs.logits[:, :4], expected_logits, atol=1e-2)) def test_inference_speaker_verification(self): model = Wav2Vec2ForXVector.from_pretrained("anton-l/wav2vec2-base-superb-sv") @@ -1634,14 +1678,15 @@ def test_inference_speaker_verification(self): input_data = self._load_superb("si", 4) inputs = processor(input_data["speech"], return_tensors="ms", 
padding=True, sampling_rate=16_000) - labels = ms.tensor([5, 1, 1, 3]).T + labels = mindspore.tensor([5, 1, 1, 3]).T - input_values = inputs.input_values - attention_mask = inputs.attention_mask - outputs = model(input_values, attention_mask=attention_mask, labels=labels) - embeddings = outputs.embeddings / F.norm(outputs.embeddings, dim=-1, keepdim=True) + with no_grad(): + input_values = inputs.input_values + attention_mask = inputs.attention_mask + outputs = model(input_values, attention_mask=attention_mask, labels=labels) + embeddings = nn.functional.normalize(outputs.embeddings, dim=-1) - cosine_sim = lambda x, y: F.cosine_similarity(x, y, dim=-1) + cosine_sim = nn.CosineSimilarity(dim=-1) # id10002 vs id10002 self.assertAlmostEqual(cosine_sim(embeddings[1], embeddings[2]).numpy(), 0.9758, 3) # id10006 vs id10002 @@ -1651,8 +1696,6 @@ def test_inference_speaker_verification(self): self.assertAlmostEqual(outputs.loss.item(), 17.7963, 2) - @unittest.skip('no torch support') - @require_librosa def test_inference_mms_1b_all(self): model = Wav2Vec2ForCTC.from_pretrained("facebook/mms-1b-all") processor = Wav2Vec2Processor.from_pretrained("facebook/mms-1b-all") @@ -1660,7 +1703,9 @@ def test_inference_mms_1b_all(self): LANG_MAP = {"it": "ita", "es": "spa", "fr": "fra", "en": "eng"} def run_model(lang): - ds = load_dataset("mozilla-foundation/common_voice_11_0", lang, split="test", streaming=True, trust_remote_code=True) + ds = load_dataset( + "mozilla-foundation/common_voice_11_0", lang, split="test", streaming=True, trust_remote_code=True + ) sample = next(iter(ds)) wav2vec2_lang = LANG_MAP[lang] @@ -1668,15 +1713,18 @@ def run_model(lang): model.load_adapter(wav2vec2_lang) processor.tokenizer.set_target_lang(wav2vec2_lang) - resampled_audio = L.resample( - ms.tensor(sample["audio"]["array"]).numpy(), orig_sr=48_000, target_sr=16_000 - ) + resampled_audio = torchaudio.functional.resample( + mindspore.tensor(sample["audio"]["array"]), 48_000, 16_000 + ).numpy() inputs = processor(resampled_audio, sampling_rate=16_000, return_tensors="ms") input_values = inputs.input_values attention_mask = inputs.attention_mask - outputs = model(input_values, attention_mask=attention_mask).logits - ids = F.argmax(outputs, dim=-1)[0] + + with no_grad(): + outputs = model(input_values, attention_mask=attention_mask).logits + + ids = ops.argmax(outputs, dim=-1)[0] transcription = processor.decode(ids) return transcription @@ -1688,5 +1736,5 @@ def run_model(lang): "en": "joe keton disapproved of films and buster also had reservations about the media", } - for lang in LANG_MAP: + for lang in LANG_MAP.keys(): assert run_model(lang) == TRANSCRIPTIONS[lang] diff --git a/tests/ut/transformers/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py b/tests/ut/transformers/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py index 1e74b4809..a8164e97f 100644 --- a/tests/ut/transformers/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py +++ b/tests/ut/transformers/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py @@ -21,7 +21,6 @@ from datasets import load_dataset import numpy as np -from mindspore import ops from mindspore import Tensor from mindnlp.transformers import Wav2Vec2BertConfig from mindnlp.utils.testing_utils import ( @@ -43,6 +42,7 @@ if is_mindspore_available(): import mindspore + from mindnlp.core import ops from mindnlp.transformers import ( AutoFeatureExtractor, @@ -779,7 +779,7 @@ def test_sample_negatives(self): features = (ops.arange(sequence_length * hidden_size) // hidden_size).view( 
sequence_length, hidden_size ) # each value in vector consits of same value - features = features[None, :].expand(batch_size, sequence_length, hidden_size) + features = features[None, :].broadcast_to((batch_size, sequence_length, hidden_size)) # sample negative indices sampled_negative_indices = _sample_negative_indices((batch_size, sequence_length), num_negatives, None) @@ -814,16 +814,16 @@ def test_sample_negatives_with_mask(self): features = (ops.arange(sequence_length * hidden_size) // hidden_size).view( sequence_length, hidden_size ) # each value in vector consits of same value - features = features[None, :].expand(batch_size, sequence_length, hidden_size) + features = features[None, :].broadcast_to((batch_size, sequence_length, hidden_size)) # replace masked feature vectors with -100 to test that those are not sampled - features = ops.where(mask[:, :, None].expand(features.shape).bool(), features, -100) + features = ops.where(mask[:, :, None].broadcast_to(features.shape).bool(), features, -100) # sample negative indices sampled_negative_indices = _sample_negative_indices( - (batch_size, sequence_length), num_negatives, mask + (batch_size, sequence_length), num_negatives, mask.asnumpy() ) - sampled_negative_indices = Tensor.from_numpy(sampled_negative_indices) + sampled_negative_indices = ops.from_numpy(sampled_negative_indices) negatives = features.view(-1, hidden_size)[sampled_negative_indices.long().view(-1)] negatives = negatives.view(batch_size, sequence_length, -1, hidden_size).permute(2, 0, 1, 3) @@ -836,12 +836,7 @@ def test_sample_negatives_with_mask(self): self.assertTrue(((negative - features) == 0).sum() == 0.0) # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim - #self.assertTrue(negatives.unique(dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) - self.assertEqual(negatives.shape[:-1], (num_negatives, batch_size, sequence_length)) - ref = negatives[:, :, :, 0] - for i in range(1, negatives.shape[-1]): - x = negatives[:, :, :, i] - self.assertTrue(ops.all(ref == x)) + self.assertEqual(ops.unique(negatives, dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) @require_mindspore @slow diff --git a/tests/ut/transformers/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py b/tests/ut/transformers/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py index 5a9f87db7..a13f7d8cc 100644 --- a/tests/ut/transformers/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py +++ b/tests/ut/transformers/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py @@ -19,7 +19,7 @@ import unittest import mindspore -from mindspore import ops, Tensor +from mindspore import Tensor import numpy as np from datasets import load_dataset @@ -42,6 +42,7 @@ if is_mindspore_available(): import mindspore + from mindnlp.core import ops from mindnlp.transformers import ( Wav2Vec2ConformerForAudioFrameClassification, @@ -519,65 +520,6 @@ def test_resize_tokens_embeddings(self): def test_model_get_set_embeddings(self): pass - #@is_pt_flax_cross_test - # non-robust architecture does not exist in Flax - def test_equivalence_flax_to_pt(self): - pass - - #@is_pt_flax_cross_test - # non-robust architecture does not exist in Flax - def test_equivalence_pt_to_flax(self): - pass - - @unittest.skip('delated in wav2vec2') - def test_retain_grad_hidden_states_attentions(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - -
config.output_hidden_states = True - config.output_attentions = True - - # no need to test all models as different heads yield the same functionality - model_class = self.all_model_classes[0] - model = model_class(config) - - - # set layer drop to 0 - model.config.layerdrop = 0.0 - - input_values = inputs_dict["input_values"] - attention_mask_ = inputs_dict["attention_mask"] - - - tmp = ops.ones_like(attention_mask_,dtype = mindspore.int64) - inputs_dict["attention_mask"] = tmp - - outputs = model(**inputs_dict) - - input_lengths = Tensor( - [input_values.shape[1] for _ in range(input_values.shape[0])], dtype=mindspore.int64 - ) - # output_lengths = model._get_feat_extract_output_lengths(input_lengths) - # labels = ids_tensor([input_values.shape[0], output_lengths[0] - 2], self.model_tester.vocab_size) - - # inputs_dict["labels"] = labels - - # print(inputs_dict) - - output = outputs[0] - - # Encoder-/Decoder-only models - hidden_states = outputs.hidden_states[0] - attentions = outputs.attentions[0] - - grad_fn = ops.GradOperation(get_by_list=True) - hidden_states.retain_grad() - attentions.retain_grad() - - output.flatten()[0].backward(retain_graph=True) - - self.assertIsNotNone(hidden_states.grad) - self.assertIsNotNone(attentions.grad) - def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -788,8 +730,8 @@ def test_compute_mask_indices_short_audio(self): (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask, min_masks=2 ) - mask_bool = ops.cast(Tensor(mask[0]), mindspore.bool_) - attention_mask_bool = ops.cast(attention_mask[0], mindspore.bool_) + mask_bool = Tensor(mask[0], mindspore.bool_) + attention_mask_bool = attention_mask[0].to(mindspore.bool_) # make sure that non-padded examples cannot be padded self.assertFalse(mask_bool[attention_mask_bool].any()) @@ -815,7 +757,7 @@ def test_sample_negatives(self): features = (ops.arange(sequence_length * hidden_size) // hidden_size).view( sequence_length, hidden_size ) # each value in vector consits of same value - features = features[None, :].expand(batch_size, sequence_length, hidden_size).contiguous() + features = features[None, :].broadcast_to((batch_size, sequence_length, hidden_size)) # sample negative indices sampled_negative_indices = _sample_negative_indices((batch_size, sequence_length), num_negatives, None) @@ -836,7 +778,7 @@ def test_sample_negatives(self): # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim self.assertTrue(unique_negatives_tensor.shape, (num_negatives, batch_size, sequence_length, 1)) - def test_sample_negatives_with_mask(self): #TODO + def test_sample_negatives_with_mask(self): batch_size = 2 sequence_length = 10 hidden_size = 4 @@ -846,22 +788,19 @@ def test_sample_negatives_with_mask(self): #TODO mask = ops.ones((batch_size, sequence_length), dtype=mindspore.int64) mask[-1, sequence_length // 2 :] = 0 - sequence = ops.div( - ops.arange(sequence_length * hidden_size), - hidden_size, - rounding_mode="floor" - ) - features = sequence.view(sequence_length, hidden_size) # each value in vector consits of same value - features = features[None, :].expand(batch_size, sequence_length, hidden_size) + features = (ops.arange(sequence_length * hidden_size) // hidden_size).view( + sequence_length, hidden_size + ) # each value in vector consits of same value + features = features[None, :].broadcast_to((batch_size, sequence_length, hidden_size)) 
# replace masked feature vectors with -100 to test that those are not sampled - features = ops.where(mask[:, :, None].expand(features.shape).bool(), features, -100) + features = ops.where(mask[:, :, None].broadcast_to(features.shape).bool(), features, -100) # sample negative indices sampled_negative_indices = _sample_negative_indices( - (batch_size, sequence_length), num_negatives, mask + (batch_size, sequence_length), num_negatives, mask.asnumpy() ) - sampled_negative_indices = Tensor.from_numpy(sampled_negative_indices) + sampled_negative_indices = ops.from_numpy(sampled_negative_indices) negatives = features.view(-1, hidden_size)[sampled_negative_indices.long().view(-1)] negatives = negatives.view(batch_size, sequence_length, -1, hidden_size).permute(2, 0, 1, 3) @@ -874,14 +813,7 @@ def test_sample_negatives_with_mask(self): #TODO self.assertTrue(((negative - features) == 0).sum() == 0.0) # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim - # self.assertTrue(check_unique_values(negatives,dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) - # copy from wav2vec2 - self.assertEqual(negatives.shape[:-1], (num_negatives, batch_size, sequence_length)) - ref = negatives[:, :, :, 0] - for i in range(1, negatives.shape[-1]): - print('i = ', i) - x = negatives[:, :, :, i] - self.assertTrue(ops.all(ref == x)) + self.assertEqual(ops.unique(negatives, dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) @require_mindspore @slow diff --git a/tests/ut/transformers/models/wavlm/test_modeling_wavlm.py b/tests/ut/transformers/models/wavlm/test_modeling_wavlm.py index 4e9c14edf..3e3beeda2 100644 --- a/tests/ut/transformers/models/wavlm/test_modeling_wavlm.py +++ b/tests/ut/transformers/models/wavlm/test_modeling_wavlm.py @@ -38,7 +38,7 @@ if is_mindspore_available(): import mindspore - from mindspore import ops + from mindnlp.core import ops from mindnlp.transformers import ( Wav2Vec2FeatureExtractor, @@ -210,10 +210,10 @@ def check_ctc_loss(self, config, input_values, *args): attention_mask[i, input_lengths[i] :] = 0 model.config.ctc_loss_reduction = "sum" - sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss[0].item() + sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() model.config.ctc_loss_reduction = "mean" - mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss[0].item() + mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() self.parent.assertTrue(isinstance(sum_loss, float)) self.parent.assertTrue(isinstance(mean_loss, float)) @@ -267,7 +267,7 @@ def check_ctc_training(self, config, input_values, *args): # one shorter than logit lengths to prevent -inf labels[i, max_length_labels[i] - 1 :] = -100 - loss = model(input_values, labels=labels).loss[0] + loss = model(input_values, labels=labels).loss self.parent.assertFalse(ops.isinf(loss).item()) @@ -441,8 +441,7 @@ def test_initialization(self): configs_no_init = _config_zero_init(config) for model_class in self.all_model_classes: model = model_class(config=configs_no_init) - for param in model.get_parameters(): - name=param.name + for name, param in model.named_parameters(): uniform_init_parms = [ "conv.weight", "conv.parametrizations.weight", @@ -462,12 +461,12 @@ if param.requires_grad: if any(x in name for x in uniform_init_parms): self.assertTrue( - -1.0 <=
((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, + -1.0 <= ((param.mean() * 1e9).round() / 1e9).item() <= 1.0, msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) else: self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), + ((param.mean() * 1e9).round() / 1e9).item(), [0.0, 1.0], msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) @@ -475,17 +474,17 @@ def test_initialization(self): # overwrite from test_modeling_common def _mock_init_weights(self, module): if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(3) + module.weight.fill_(3) if hasattr(module, "weight_g") and module.weight_g is not None: - module.weight_g.data.fill_(3) + module.weight_g.fill_(3) if hasattr(module, "weight_v") and module.weight_v is not None: - module.weight_v.data.fill_(3) + module.weight_v.fill_(3) if hasattr(module, "bias") and module.bias is not None: - module.bias.data.fill_(3) + module.bias.fill_(3) if hasattr(module, "codevectors") and module.codevectors is not None: - module.codevectors.data.fill_(3) + module.codevectors.fill_(3) if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: - module.masked_spec_embed.data.fill_(3) + module.masked_spec_embed.fill_(3) @unittest.skip(reason="Feed forward chunking is not implemented for WavLM") def test_feed_forward_chunking(self):