diff --git a/mindnlp/core/nn/functional.py b/mindnlp/core/nn/functional.py index 47d060454..133d68be4 100644 --- a/mindnlp/core/nn/functional.py +++ b/mindnlp/core/nn/functional.py @@ -177,6 +177,25 @@ def binary_cross_entropy_with_logits(input, target, weight=None, reduction='mean return mindspore.mint.nn.functional.binary_cross_entropy_with_logits(input, target, weight, reduction, pos_weight) return ops.binary_cross_entropy_with_logits(input, target, weight, pos_weight, reduction) +def gumbel_softmax(logits: Tensor, tau: float = 1, hard: bool = False, eps: float = 1e-10, dim: int = -1) -> Tensor: + if eps != 1e-10: + warnings.warn("`eps` parameter is deprecated and has no effect.") + + uniform_samples = _get_cache_prim(ops.UniformReal)()(logits.shape) + gumbels = -ops.log(-ops.log(uniform_samples + eps) + eps) # ~Gumbel(0, 1) + gumbels = (logits + gumbels) / tau # ~Gumbel(logits,tau) + y_soft = softmax(gumbels, dim) + + if hard: + # Straight through. + index = y_soft.argmax(dim) + y_hard = one_hot(index, logits.shape[dim]) + ret = ops.stop_gradient(y_hard - y_soft) + y_soft + else: + # Reparametrization trick. + ret = y_soft + return ret + def log_softmax(input, dim=-1, dtype=None): out = ops.log_softmax(input, dim) if dtype is not None: @@ -791,7 +810,7 @@ def multi_head_attention_forward( assert key_padding_mask.shape == (bsz, src_len), \ f"expecting key_padding_mask shape of {(bsz, src_len)}, but got {key_padding_mask.shape}" key_padding_mask = key_padding_mask.view(bsz, 1, 1, src_len). \ - expand(-1, num_heads, -1, -1).reshape(bsz * num_heads, 1, src_len) + broadcast_to((-1, num_heads, -1, -1)).reshape(bsz * num_heads, 1, src_len) if attn_mask is None: attn_mask = key_padding_mask else: diff --git a/mindnlp/core/serialization.py b/mindnlp/core/serialization.py index b7164cda7..eec39ac9b 100644 --- a/mindnlp/core/serialization.py +++ b/mindnlp/core/serialization.py @@ -35,7 +35,7 @@ import numpy as np import mindspore -from mindspore import Tensor +from mindspore import Tensor, Parameter from mindspore.train.serialization import _exec_save, _parse_ckpt_proto, tensor_to_np_type, tensor_to_ms_type import safetensors @@ -756,6 +756,13 @@ def _open_zipfile_writer(name_or_buffer): container = _open_zipfile_writer_buffer return container(name_or_buffer) +def _rebuild_parameter(data, requires_grad, backward_hooks): + param = Parameter(data, requires_grad=requires_grad) + # NB: This line exists only for backwards compatibility; the + # general expectation is that backward_hooks is an empty + # OrderedDict. See Note [Don't serialize hooks] + return param + def _rebuild_tensor_v2(storage, storage_offset, size, stride, requires_grad, backward_hooks, metadata=None): '''Rebuilds a tensor based on the provided parameters. diff --git a/mindnlp/peft/peft_model.py b/mindnlp/peft/peft_model.py index 922f34c57..32e51bbcd 100644 --- a/mindnlp/peft/peft_model.py +++ b/mindnlp/peft/peft_model.py @@ -253,7 +253,7 @@ def load_adapter(self, model_id: str, adapter_name: str, is_trainable: bool = Fa return load_result - def get_nb_trainable_parameters(self) -> tuple[int, int]: + def get_nb_trainable_parameters(self): r""" Returns the number of trainable parameters and the number of all parameters in the model. 
""" diff --git a/mindnlp/transformers/cache_utils.py b/mindnlp/transformers/cache_utils.py index 98206eb11..71e154499 100644 --- a/mindnlp/transformers/cache_utils.py +++ b/mindnlp/transformers/cache_utils.py @@ -990,7 +990,7 @@ def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: """Returns the sequence length of the cached states. A layer index can be optionally passed.""" if len(self.self_attention_cache.key_cache) <= layer_idx: return 0 - return (ops.any(self.self_attention_cache.key_cache[layer_idx][0, 0], dim=-1)).sum().item() + return (ops.any(self.self_attention_cache.key_cache[layer_idx][0, 0].bool(), dim=-1)).sum().item() def reset(self): if hasattr(self.self_attention_cache, "reset"): diff --git a/mindnlp/transformers/models/data2vec/modeling_data2vec_audio.py b/mindnlp/transformers/models/data2vec/modeling_data2vec_audio.py index 4d477afcb..0f511cc1a 100644 --- a/mindnlp/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/mindnlp/transformers/models/data2vec/modeling_data2vec_audio.py @@ -20,7 +20,7 @@ import mindspore from mindspore.common.initializer import Uniform, HeNormal, initializer,Normal -from mindnlp.core import nn, ops +from mindnlp.core import nn, ops, no_grad from mindnlp.core.nn import functional as F from mindnlp.utils import logging from ...activations import ACT2FN @@ -969,8 +969,9 @@ def forward( if labels is not None: # retrieve loss input_lengths from attention_mask labels = labels.astype(mindspore.int32) - # if labels.max() >= self.config.vocab_size: - # raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") + with no_grad(): + if ops.max(labels) >= self.config.vocab_size: + raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") attention_mask = ( attention_mask if attention_mask is not None else ops.ones_like(input_values, dtype=mindspore.int64) ) diff --git a/mindnlp/transformers/models/deformable_detr/modeling_deformable_detr.py b/mindnlp/transformers/models/deformable_detr/modeling_deformable_detr.py index e4e129845..2620b6384 100644 --- a/mindnlp/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/mindnlp/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -1687,9 +1687,9 @@ def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes) proposals.append(proposal) _cur += height * width output_proposals = ops.cat(proposals, 1) - output_proposals_valid = ( - (output_proposals > 0.01).int() & (output_proposals < 0.99).int() - ).all(-1, keep_dims=True) + output_proposals_valid = ops.all( + ((output_proposals > 0.01).int() & (output_proposals < 0.99).int()).bool(), -1, keepdim=True + ) output_proposals = ops.log( output_proposals / (1 - output_proposals) ) # inverse sigmoid @@ -2291,8 +2291,9 @@ def loss_labels(self, outputs, targets, indices, num_boxes): source_logits.shape[2] + 1, dtype=source_logits.dtype, ) + target_classes = target_classes.unsqueeze(-1) target_classes_onehot = ops.scatter( - target_classes_onehot, 2, target_classes.unsqueeze(-1), ops.ones_like(target_classes_onehot) + target_classes_onehot, 2, target_classes, ops.ones_like(target_classes, dtype=target_classes_onehot.dtype) ) target_classes_onehot = target_classes_onehot[:, :, :-1] loss_ce = ( diff --git a/mindnlp/transformers/models/wav2vec2/__init__.py b/mindnlp/transformers/models/wav2vec2/__init__.py index e1790d678..c380c1179 100644 --- a/mindnlp/transformers/models/wav2vec2/__init__.py +++ b/mindnlp/transformers/models/wav2vec2/__init__.py @@ -15,21 +15,11 
@@ ''' Wav2Vec2 Model ''' from . import configuration_wav2vec2, feature_extraction_wav2vec2, processing_wav2vec2, tokenization_wav2vec2, modeling_wav2vec2 -from .configuration_wav2vec2 import WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP, Wav2Vec2Config +from .configuration_wav2vec2 import * from .feature_extraction_wav2vec2 import Wav2Vec2FeatureExtractor from .processing_wav2vec2 import Wav2Vec2Processor from .tokenization_wav2vec2 import Wav2Vec2CTCTokenizer, Wav2Vec2Tokenizer -from .modeling_wav2vec2 import ( - WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST, - Wav2Vec2ForAudioFrameClassification, - Wav2Vec2ForCTC, - Wav2Vec2ForMaskedLM, - Wav2Vec2ForPreTraining, - Wav2Vec2ForSequenceClassification, - Wav2Vec2ForXVector, - Wav2Vec2Model, - Wav2Vec2PreTrainedModel, -) +from .modeling_wav2vec2 import * __all__ = [] __all__.extend(configuration_wav2vec2.__all__) diff --git a/mindnlp/transformers/models/wav2vec2/modeling_wav2vec2.py b/mindnlp/transformers/models/wav2vec2/modeling_wav2vec2.py index 10d0f3ad4..73b795581 100644 --- a/mindnlp/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/mindnlp/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Mindspore Wav2Vec2 model. """ +"""MindSpore Wav2Vec2 model.""" import math import warnings @@ -21,14 +21,12 @@ import numpy as np import mindspore -from mindspore import Tensor, Parameter -from mindspore.common.initializer import initializer, Normal, Uniform - from mindnlp.core import nn, ops -from mindnlp.core.nn import functional as F +from mindnlp.core.nn import CrossEntropyLoss +from mindnlp.core.serialization import load, safe_load_file + from ...activations import ACT2FN from ...modeling_outputs import ( - ModelOutput, BaseModelOutput, CausalLMOutput, MaskedLMOutput, @@ -39,37 +37,46 @@ ) from ...modeling_utils import PreTrainedModel from ....utils import ( + ModelOutput, cached_file, + is_safetensors_available, logging, ) - from .configuration_wav2vec2 import Wav2Vec2Config -__all__ = [ - 'WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST', - 'Wav2Vec2PreTrainedModel', - 'Wav2Vec2Model', - 'Wav2Vec2ForPreTraining', - 'Wav2Vec2ForMaskedLM', - 'Wav2Vec2ForCTC', - 'Wav2Vec2ForSequenceClassification', - 'Wav2Vec2ForAudioFrameClassification', - 'Wav2Vec2ForXVector', -] + +WAV2VEC2_ADAPTER_PT_FILE = "adapter.{}.bin" +WAV2VEC2_ADAPTER_SAFE_FILE = "adapter.{}.safetensors" + logger = logging.get_logger(__name__) + _HIDDEN_STATES_START_POSITION = 2 -WAV2VEC2_ADAPTER_PT_FILE = "adapter.{}.bin" -WAV2VEC2_ADAPTER_SAFE_FILE = "adapter.{}.safetensors" -WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "facebook/wav2vec2-base-960h", - "facebook/wav2vec2-large-960h", - "facebook/wav2vec2-large-960h-lv60", - "facebook/wav2vec2-large-960h-lv60-self", - # See all Wav2Vec2 models at https://hf-mirror.com/models?filter=wav2vec2 -] +# General docstring +_CONFIG_FOR_DOC = "Wav2Vec2Config" + +# Base docstring +_CHECKPOINT_FOR_DOC = "facebook/wav2vec2-base-960h" +_EXPECTED_OUTPUT_SHAPE = [1, 292, 768] + +# CTC docstring +_CTC_EXPECTED_OUTPUT = "'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'" +_CTC_EXPECTED_LOSS = 53.48 + +# Audio class docstring +_SEQ_CLASS_CHECKPOINT = "superb/wav2vec2-base-superb-ks" +_SEQ_CLASS_EXPECTED_OUTPUT = "'_unknown_'" +_SEQ_CLASS_EXPECTED_LOSS = 6.54 + +# Frame class docstring +_FRAME_CLASS_CHECKPOINT = 
"anton-l/wav2vec2-base-superb-sd" +_FRAME_EXPECTED_OUTPUT = [0, 0] + +# Speaker Verification docstring +_XVECTOR_CHECKPOINT = "anton-l/wav2vec2-base-superb-sv" +_XVECTOR_EXPECTED_OUTPUT = 0.98 @dataclass @@ -78,48 +85,47 @@ class Wav2Vec2ForPreTrainingOutput(ModelOutput): Output type of [`Wav2Vec2ForPreTraining`], with potential hidden states and attentions. Args: - loss (*optional*, returned when `sample_negative_indices` are passed, `Tensor` of shape `(1,)`): + loss (*optional*, returned when `sample_negative_indices` are passed, `mindspore.Tensor` of shape `(1,)`): Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) . (classification) loss. - projected_states (`Tensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): + projected_states (`mindspore.Tensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked projected quantized states. - projected_quantized_states (`Tensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): + projected_quantized_states (`mindspore.Tensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive target vectors for contrastive loss. - hidden_states (`tuple(Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when - `config.output_hidden_states=True`): - Tuple of `Tensor` (one for the output of the embeddings + one for the output of each layer) of + hidden_states (`tuple(mindspore.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `mindspore.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(Tensor)`, *optional*, returned when `output_attentions=True` is passed or when - `config.output_attentions=True`): - Tuple of `Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + attentions (`tuple(mindspore.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `mindspore.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - contrastive_loss (*optional*, returned when `sample_negative_indices` are passed, `Tensor` of shape `(1,)`): + contrastive_loss (*optional*, returned when `sample_negative_indices` are passed, `mindspore.Tensor` of shape `(1,)`): The contrastive loss (L_m) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) . - diversity_loss (*optional*, returned when `sample_negative_indices` are passed, `Tensor` of shape `(1,)`): + diversity_loss (*optional*, returned when `sample_negative_indices` are passed, `mindspore.Tensor` of shape `(1,)`): The diversity loss (L_d) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) . 
""" - loss: Optional[Tensor] = None - projected_states: Tensor = None - projected_quantized_states: Tensor = None - codevector_perplexity: Tensor = None - hidden_states: Optional[Tuple[Tensor]] = None - attentions: Optional[Tuple[Tensor]] = None - contrastive_loss: Optional[Tensor] = None - diversity_loss: Optional[Tensor] = None + + loss: Optional[mindspore.Tensor] = None + projected_states: mindspore.Tensor = None + projected_quantized_states: mindspore.Tensor = None + codevector_perplexity: mindspore.Tensor = None + hidden_states: Optional[Tuple[mindspore.Tensor]] = None + attentions: Optional[Tuple[mindspore.Tensor]] = None + contrastive_loss: Optional[mindspore.Tensor] = None + diversity_loss: Optional[mindspore.Tensor] = None def _compute_mask_indices( shape: Tuple[int, int], mask_prob: float, mask_length: int, - attention_mask: Optional[Tensor] = None, + attention_mask: Optional[mindspore.Tensor] = None, min_masks: int = 0, ) -> np.ndarray: """ @@ -129,15 +135,15 @@ def _compute_mask_indices( Args: shape: The shape for which to compute masks. This should be of a tuple of size 2 where - the first element is the batch size and the second element is the length of the axis to span. + the first element is the batch size and the second element is the length of the axis to span. mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of - independently generated mask spans of length `mask_length` is computed by - `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the - actual percentage will be smaller. + independently generated mask spans of length `mask_length` is computed by + `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the + actual percentage will be smaller. mask_length: size of the mask min_masks: minimum number of masked spans attention_mask: A (right-padded) attention mask which independently shortens the feature axis of - each batch dimension. + each batch dimension. """ batch_size, sequence_length = shape @@ -248,8 +254,6 @@ def _sample_negative_indices( # get `num_negatives` random vector indices from the same utterance sampled_negative_indices = np.zeros(shape=(batch_size, sequence_length, num_negatives), dtype=np.int32) - if isinstance(mask_time_indices, Tensor): - mask_time_indices = mask_time_indices.asnumpy() mask_time_indices = ( mask_time_indices.astype(bool) if mask_time_indices is not None else np.ones(features_shape, dtype=bool) ) @@ -273,48 +277,7 @@ def _sample_negative_indices( class Wav2Vec2NoLayerNormConvLayer(nn.Module): - - """ - Wav2Vec2NoLayerNormConvLayer is a Python class representing a convolutional layer without layer normalization for - the Wav2Vec2 model. This class inherits from nn.Module and is used for processing audio features. - - Attributes: - config (Wav2Vec2Config): The configuration object for the Wav2Vec2 model. - layer_id (int): The index of the convolutional layer. - in_conv_dim (int): The input dimension of the convolutional layer. - out_conv_dim (int): The output dimension of the convolutional layer. - conv (nn.Conv1d): The 1D convolutional operation applied to the input. - activation (function): The activation function used to process the convolutional output. - - Methods: - __init__: Initializes the Wav2Vec2NoLayerNormConvLayer with the provided configuration and layer index. - forward: Applies the convolutional and activation operations to the input hidden_states. 
- - Note: - This class is part of the Wav2Vec2 model and is specifically designed for processing audio features without - layer normalization. - """ - def __init__(self, config: Wav2Vec2Config, layer_id=0): - """ - __init__(self, config: Wav2Vec2Config, layer_id=0) - - Initializes a new instance of the Wav2Vec2NoLayerNormConvLayer class. - - Args: - self: The instance of the class. - config (Wav2Vec2Config): An instance of the Wav2Vec2Config class containing the configuration parameters - for the Wav2Vec2 model. - layer_id (int, optional): The index of the layer. Defaults to 0. Specifies the layer for which the - convolutional layer is initialized. - - Returns: - None. - - Raises: - ValueError: If the layer_id is less than 0. - AttributeError: If the layer_id exceeds the maximum index available in the configuration parameters. - TypeError: If the provided config parameter is not an instance of the Wav2Vec2Config class. - """ + def __init__(self, config, layer_id=0): super().__init__() self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1 self.out_conv_dim = config.conv_dim[layer_id] @@ -329,56 +292,13 @@ def __init__(self, config: Wav2Vec2Config, layer_id=0): self.activation = ACT2FN[config.feat_extract_activation] def forward(self, hidden_states): - """ - Constructs the hidden states using convolutional layer and activation function. - - Args: - self (Wav2Vec2NoLayerNormConvLayer): The instance of the Wav2Vec2NoLayerNormConvLayer class. - hidden_states (torch.Tensor): The input hidden states tensor. - - Returns: - torch.Tensor: The forwarded hidden states after applying convolution and activation. - - Raises: - TypeError: If the input hidden_states is not a torch.Tensor. - """ hidden_states = self.conv(hidden_states) hidden_states = self.activation(hidden_states) return hidden_states class Wav2Vec2LayerNormConvLayer(nn.Module): - - """ - This class represents a convolutional layer with layer normalization in the Wav2Vec2 model. - It inherits from the nn.Module class. - - Attributes: - config (Wav2Vec2Config): The configuration object for the Wav2Vec2 model. - layer_id (int): The ID of the current layer. - - Methods: - __init__: - Initializes the Wav2Vec2LayerNormConvLayer with the given configuration and layer ID. - - forward: - Applies the convolutional layer with layer normalization to the input hidden states. - - """ - def __init__(self, config: Wav2Vec2Config, layer_id=0): - """ - Initialize the Wav2Vec2LayerNormConvLayer. - - Args: - config (Wav2Vec2Config): The configuration object containing the parameters for the layer. - layer_id (int, optional): The ID of the layer. Defaults to 0. - - Returns: - None - - Raises: - None - """ + def __init__(self, config, layer_id=0): super().__init__() self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1 self.out_conv_dim = config.conv_dim[layer_id] @@ -390,71 +310,22 @@ def __init__(self, config: Wav2Vec2Config, layer_id=0): stride=config.conv_stride[layer_id], bias=config.conv_bias, ) - self.layer_norm = nn.LayerNorm(self.out_conv_dim) + self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True) self.activation = ACT2FN[config.feat_extract_activation] def forward(self, hidden_states): - """ - Construct the hidden states using the Wav2Vec2LayerNormConvLayer method. - - Args: - self (Wav2Vec2LayerNormConvLayer): An instance of the Wav2Vec2LayerNormConvLayer class. - hidden_states (Tensor): The input hidden states to be processed. 
- It should have the shape (batch_size, sequence_length, feature_dim). - - Returns: - None. - - Raises: - None. - """ hidden_states = self.conv(hidden_states) - hidden_states = hidden_states.swapaxes(-2, -1) + + hidden_states = ops.transpose(hidden_states, -2, -1) hidden_states = self.layer_norm(hidden_states) - hidden_states = hidden_states.swapaxes(-2, -1) + hidden_states = ops.transpose(hidden_states, -2, -1) + hidden_states = self.activation(hidden_states) return hidden_states class Wav2Vec2GroupNormConvLayer(nn.Module): - - """ - This class represents a group normalization convolutional layer used in the Wav2Vec2 model. - It applies a 1D convolution operation followed by group normalization, activation, and layer normalization to the - input hidden states. - - Args: - config (Wav2Vec2Config): The configuration object containing the settings for the Wav2Vec2 model. - layer_id (int, optional): The index of the convolutional layer in the model. Defaults to 0. - - Attributes: - in_conv_dim (int): The input dimension of the convolutional layer. - out_conv_dim (int): The output dimension of the convolutional layer. - conv (nn.Conv1d): The 1D convolutional layer used to process the hidden states. - activation (function): The activation function applied to the processed hidden states. - layer_norm (nn.GroupNorm): The group normalization layer applied to the hidden states. - - Methods: - forward: Applies the convolutional layer, normalization, activation, and returns the processed hidden states. - - """ - def __init__(self, config: Wav2Vec2Config, layer_id=0): - """ - Initializes an instance of the Wav2Vec2GroupNormConvLayer class. - - Args: - self: The current instance of the class. - config (Wav2Vec2Config): An instance of the Wav2Vec2Config class containing configuration settings. - layer_id (int): The index of the convolutional layer within the configuration. Defaults to 0. - - Returns: - None. - - Raises: - ValueError: If the layer_id is less than 0. - KeyError: If the specified activation function in config is not found in the ACT2FN dictionary. - ValueError: If the specified pad_mode in the nn.Conv1d function is not 'valid'. - """ + def __init__(self, config, layer_id=0): super().__init__() self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1 self.out_conv_dim = config.conv_dim[layer_id] @@ -467,68 +338,18 @@ def __init__(self, config: Wav2Vec2Config, layer_id=0): bias=config.conv_bias, ) self.activation = ACT2FN[config.feat_extract_activation] + self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True) def forward(self, hidden_states): - """ - This method forwards a group normalization convolutional layer for the Wav2Vec2 model. - - Args: - self (Wav2Vec2GroupNormConvLayer): The instance of the Wav2Vec2GroupNormConvLayer class. - hidden_states (torch.Tensor): The input tensor representing the hidden states to be processed by the group normalization convolutional layer. - - Returns: - torch.Tensor: The processed tensor representing the hidden states after passing through the group normalization convolutional layer. - - Raises: - None. 
- """ hidden_states = self.conv(hidden_states) - hidden_states = self.layer_norm(hidden_states.unsqueeze(-1)).squeeze(-1) # tmfix: GroupNorm only support 4D + hidden_states = self.layer_norm(hidden_states) hidden_states = self.activation(hidden_states) return hidden_states class Wav2Vec2PositionalConvEmbedding(nn.Module): - - """ - This class represents a positional convolutional embedding layer in the Wav2Vec2 model architecture. - It inherits from nn.Module and is designed to process hidden states through convolutional and activation operations. - - Attributes: - config: Wav2Vec2Config - An instance of Wav2Vec2Config containing configuration parameters for the layer. - - Methods: - __init__: - Initializes the Wav2Vec2PositionalConvEmbedding with the provided configuration. - - forward: - Applies positional convolutional embedding operations on the input hidden_states and returns the - transformed output. - - Usage: - Instantiate this class by providing a Wav2Vec2Config object as configuration, then call the forward method - with hidden states to process them. - - Note: - This class utilizes a convolutional layer, padding layer, and activation function to process hidden states - efficiently. - """ - def __init__(self, config: Wav2Vec2Config): - """ - Initializes a new instance of the Wav2Vec2PositionalConvEmbedding class. - - Args: - self: An instance of the Wav2Vec2PositionalConvEmbedding class. - config (Wav2Vec2Config): The configuration object containing various settings for the Wav2Vec2 model. - - Returns: - None - - Raises: - None - """ + def __init__(self, config): super().__init__() self.conv = nn.Conv1d( config.hidden_size, @@ -536,88 +357,32 @@ def __init__(self, config: Wav2Vec2Config): kernel_size=config.num_conv_pos_embeddings, padding=config.num_conv_pos_embeddings // 2, groups=config.num_conv_pos_embedding_groups, - bias=True, ) - self.conv = F.weight_norm(self.conv, name='weight', dim=2) + weight_norm = nn.utils.weight_norm + + self.conv = weight_norm(self.conv, name="weight", dim=2) + self.padding = Wav2Vec2SamePadLayer(config.num_conv_pos_embeddings) self.activation = ACT2FN[config.feat_extract_activation] def forward(self, hidden_states): - """ - This method forwards the positional convolutional embedding for the Wav2Vec2 model. - - Args: - self (Wav2Vec2PositionalConvEmbedding): The instance of the Wav2Vec2PositionalConvEmbedding class. - hidden_states (array-like): The input hidden states with shape (batch_size, sequence_length, hidden_size). - - Returns: - None: This method does not return any value. The positional convolutional embedding is applied to the - input hidden states in place. + hidden_states = ops.transpose(hidden_states, 1, 2) - Raises: - ValueError: If the input hidden_states is not in the expected format or shape. - RuntimeError: If an error occurs during the convolution or activation process. - """ - hidden_states = hidden_states.swapaxes(1, 2) hidden_states = self.conv(hidden_states) hidden_states = self.padding(hidden_states) hidden_states = self.activation(hidden_states) - hidden_states = hidden_states.swapaxes(1, 2) + + hidden_states = ops.transpose(hidden_states, 1, 2) return hidden_states class Wav2Vec2SamePadLayer(nn.Module): - - """ - This class represents a layer in the Wav2Vec2 model that performs padding removal. - - Wav2Vec2SamePadLayer is a subclass of nn.Module and is designed to remove padding from hidden states in the - Wav2Vec2 model. It is primarily used in the Wav2Vec2 model for speech recognition tasks. 
- - Attributes: - num_pad_remove (int): The number of padding elements to remove from the hidden states. - - Methods: - __init__: Initializes a new instance of the Wav2Vec2SamePadLayer class. - forward: Removes padding elements from the hidden states. - - """ def __init__(self, num_conv_pos_embeddings): - """ - Initializes an instance of the Wav2Vec2SamePadLayer class. - - Args: - self (Wav2Vec2SamePadLayer): The current instance of the Wav2Vec2SamePadLayer class. - num_conv_pos_embeddings (int): The number of convolutional positional embeddings. - It is used to determine the value of the num_pad_remove attribute. - The value must be a non-negative integer. - - Returns: - None. - - Raises: - None. - """ super().__init__() self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0 def forward(self, hidden_states): - """ - Constructs the hidden states of the Wav2Vec2SamePadLayer. - - Args: - self (Wav2Vec2SamePadLayer): An instance of the Wav2Vec2SamePadLayer class. - hidden_states (torch.Tensor): The hidden states to be processed. - Expected shape is (batch_size, sequence_length, hidden_size). - The hidden states are processed based on the `num_pad_remove` value. - - Returns: - None. - - Raises: - None. - """ if self.num_pad_remove > 0: hidden_states = hidden_states[:, :, : -self.num_pad_remove] return hidden_states @@ -625,29 +390,8 @@ def forward(self, hidden_states): class Wav2Vec2FeatureEncoder(nn.Module): """Construct the features from raw audio waveform""" - def __init__(self, config: Wav2Vec2Config): - """ - Initializes a new instance of the Wav2Vec2FeatureEncoder class. - - Args: - self: The object itself. - config (Wav2Vec2Config): - The configuration object for the feature encoder. - - - config.feat_extract_norm (str): The type of normalization to be applied during feature extraction. - - - 'group': Applies group normalization to the convolutional layers. - - 'layer': Applies layer normalization to the convolutional layers. - - - config.num_feat_extract_layers (int): The number of feature extraction layers. - - Returns: - None. - - Raises: - ValueError: If `config.feat_extract_norm` is not one of ['group', 'layer']. - """ + def __init__(self, config): super().__init__() if config.feat_extract_norm == "group": @@ -663,74 +407,35 @@ def __init__(self, config: Wav2Vec2Config): f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']" ) self.conv_layers = nn.ModuleList(conv_layers) + self.gradient_checkpointing = False self._requires_grad = True def _freeze_parameters(self): - """ - Freezes the parameters of the Wav2Vec2FeatureEncoder. - - Args: - self: An instance of the Wav2Vec2FeatureEncoder class. - - Returns: - None. - - Raises: - None. - """ - for _, param in self.parameters_and_names(): + for param in self.parameters(): param.requires_grad = False self._requires_grad = False def forward(self, input_values): - """ - Method 'forward' in the class 'Wav2Vec2FeatureEncoder' forwards the hidden states from the input values - using convolutional layers. - - Args: - self (object): The instance of the class. - input_values (tensor): The input values for forwarding hidden states. It is expected to be a 2D tensor. + hidden_states = input_values[:, None] - Returns: - tensor: The forwarded hidden states after passing through the convolutional layers. 
+ # make sure hidden_states require grad for gradient_checkpointing + if self._requires_grad and self.training: + hidden_states.requires_grad = True - Raises: - None - """ - hidden_states = input_values[:, None] for conv_layer in self.conv_layers: - hidden_states = conv_layer(hidden_states) + if self._requires_grad and self.gradient_checkpointing and self.training: + hidden_states = self._gradient_checkpointing_func( + conv_layer.__call__, + hidden_states, + ) + else: + hidden_states = conv_layer(hidden_states) + return hidden_states class Wav2Vec2FeatureExtractor(Wav2Vec2FeatureEncoder): - - """ - Wav2Vec2FeatureExtractor is a class that represents a feature extractor for Wav2Vec2 models. - It is designed to extract features from audio data for use in Wav2Vec2 models. - - This class inherits from Wav2Vec2FeatureEncoder, and it is recommended to use Wav2Vec2FeatureEncoder instead of - this class, as Wav2Vec2FeatureExtractor has been deprecated. - - Please refer to the documentation for Wav2Vec2FeatureEncoder for feature extraction and encoding in Wav2Vec2 models. - """ - def __init__(self, config: Wav2Vec2Config): - """ - This method initializes an instance of the Wav2Vec2FeatureExtractor class. - - Args: - self: The instance of the class. - config (Wav2Vec2Config): An instance of the Wav2Vec2Config class containing the configuration parameters - for the feature extractor. - - Returns: - None. - - Raises: - FutureWarning: If the class Wav2Vec2FeatureExtractor is used, a FutureWarning is raised indicating that - the class has been depreciated. It is recommended to use the base - class instead. - """ + def __init__(self, config): super().__init__(config) warnings.warn( f"The class `{self.__class__.__name__}` has been depreciated " @@ -741,63 +446,13 @@ class instead. class Wav2Vec2FeatureProjection(nn.Module): - - """ - Wav2Vec2FeatureProjection is a Python class that represents a feature projection module for Wav2Vec2. - This class inherits from nn.Module and contains methods for initializing the feature projection and forwarding the - hidden states. - - The __init__ method initializes the feature projection module by setting up layer normalization, dense projection, - and dropout. - - The forward method applies layer normalization to the hidden states, projects the normalized states using dense - projection, and applies dropout to the projected states before returning the hidden states and the normalized - hidden states. - """ - def __init__(self, config: Wav2Vec2Config): - """ - Initializes the Wav2Vec2FeatureProjection class. - - Args: - self: The instance of the Wav2Vec2FeatureProjection class. - config (Wav2Vec2Config): An instance of the Wav2Vec2Config class containing the configuration parameters - for the Wav2Vec2 feature projection. It specifies the configuration for the layer - normalization, projection, and dropout layers. - - Returns: - None. - - Raises: - TypeError: If the config parameter is not of type Wav2Vec2Config. - ValueError: If the config.conv_dim[-1] is not valid or if the config.hidden_size is not valid. - RuntimeError: If an error occurs during the initialization of layer normalization, projection, - or dropout layers. 
- """ + def __init__(self, config): super().__init__() self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps) self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size) - self.dropout = nn.Dropout(p=config.feat_proj_dropout) + self.dropout = nn.Dropout(config.feat_proj_dropout) def forward(self, hidden_states): - """ - This method forwards the hidden states by applying layer normalization, projection, and dropout. - - Args: - self (Wav2Vec2FeatureProjection): The instance of the Wav2Vec2FeatureProjection class. - hidden_states (Tensor): The input hidden states to be processed. It should be a tensor of shape - (batch_size, sequence_length, feature_dim). - - Returns: - Tuple[Tensor, Tensor]: - A tuple containing two tensors: - - - hidden_states (Tensor): The processed hidden states after applying layer normalization, projection, - and dropout. - - norm_hidden_states (Tensor): The normalized hidden states obtained after applying layer normalization. - - Raises: - None. - """ # non-projected hidden states are needed for quantization norm_hidden_states = self.layer_norm(hidden_states) hidden_states = self.projection(norm_hidden_states) @@ -808,6 +463,7 @@ def forward(self, hidden_states): # Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Wav2Vec2 class Wav2Vec2Attention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" + def __init__( self, embed_dim: int, @@ -818,24 +474,6 @@ def __init__( is_causal: bool = False, config: Optional[Wav2Vec2Config] = None, ): - """ - Initializes an instance of the Wav2Vec2Attention class. - - Args: - embed_dim (int): The dimension of the input embeddings. - num_heads (int): The number of attention heads. - dropout (float, optional): The dropout probability. Defaults to 0.0. - is_decoder (bool, optional): Whether the attention module is used as a decoder. Defaults to False. - bias (bool, optional): Whether to include bias in linear projections. Defaults to True. - is_causal (bool, optional): Whether the attention is causal. Defaults to False. - config (Optional[Wav2Vec2Config], optional): The configuration object. Defaults to None. - - Returns: - None - - Raises: - ValueError: If embed_dim is not divisible by num_heads. - """ super().__init__() self.embed_dim = embed_dim self.num_heads = num_heads @@ -857,34 +495,20 @@ def __init__( self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - def _shape(self, tensor: Tensor, seq_len: int, bsz: int): - """ - This method '_shape' is defined in the class 'Wav2Vec2Attention' and is used to reshape the input tensor to - the specified shape. - - Args: - tensor (Tensor): The input tensor to be reshaped. It should be of type Tensor. - seq_len (int): The length of the sequence. It should be an integer. - bsz (int): The batch size. It should be an integer. - - Returns: - None. - - Raises: - None. 
- """ - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).swapaxes(1, 2) + def _shape(self, tensor: mindspore.Tensor, seq_len: int, bsz: int): + return ops.transpose(tensor.view(bsz, seq_len, self.num_heads, self.head_dim), 1, 2) def forward( self, - hidden_states: Tensor, - key_value_states: Optional[Tensor] = None, - past_key_value: Optional[Tuple[Tensor]] = None, - attention_mask: Optional[Tensor] = None, - layer_head_mask: Optional[Tensor] = None, + hidden_states: mindspore.Tensor, + key_value_states: Optional[mindspore.Tensor] = None, + past_key_value: Optional[Tuple[mindspore.Tensor]] = None, + attention_mask: Optional[mindspore.Tensor] = None, + layer_head_mask: Optional[mindspore.Tensor] = None, output_attentions: bool = False, - ) -> Tuple[Tensor, Optional[Tensor], Optional[Tuple[Tensor]]]: + ) -> Tuple[mindspore.Tensor, Optional[mindspore.Tensor], Optional[Tuple[mindspore.Tensor]]]: """Input shape: Batch x Time x Channel""" + # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None @@ -921,10 +545,10 @@ def forward( value_states = self._shape(self.v_proj(hidden_states), -1, bsz) if self.is_decoder: - # if cross_attention save Tuple(Tensor, Tensor) of all cross attention key/value_states. + # if cross_attention save Tuple(mindspore.Tensor, mindspore.Tensor) of all cross attention key/value_states. # Further calls to cross_attention layer can then reuse all cross-attention # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(Tensor, Tensor) of + # if uni-directional self-attention (decoder) save Tuple(mindspore.Tensor, mindspore.Tensor) of # all previous decoder key/value_states. Further calls to uni-directional self-attention # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) # if encoder bi-directional self-attention `past_key_value` is always `None` @@ -936,7 +560,7 @@ def forward( value_states = value_states.reshape(*proj_shape) src_len = key_states.shape[1] - attn_weights = ops.bmm(query_states, key_states.swapaxes(1, 2)) + attn_weights = ops.bmm(query_states, ops.transpose(key_states, 1, 2)) if attn_weights.shape != (bsz * self.num_heads, tgt_len, src_len): raise ValueError( @@ -952,7 +576,7 @@ def forward( attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - attn_weights = ops.softmax(attn_weights, dim=-1) + attn_weights = nn.functional.softmax(attn_weights, dim=-1) if layer_head_mask is not None: if layer_head_mask.shape != (self.num_heads,): @@ -973,7 +597,7 @@ def forward( else: attn_weights_reshaped = None - attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training) + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) attn_output = ops.bmm(attn_probs, value_states) @@ -984,7 +608,7 @@ def forward( ) attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) - attn_output = attn_output.swapaxes(1, 2) + attn_output = ops.transpose(attn_output, 1, 2) # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be # partitioned across GPUs when using tensor-parallelism. 
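(Note on the `_shape` change in the hunk above: the helper only reorders axes, so swapping `Tensor.swapaxes(1, 2)` for the torch-style `ops.transpose(tensor, 1, 2)` should be a pure API rename with identical layout semantics. Below is a minimal NumPy sketch of that layout transformation, not the patch's own code; the dimension sizes are illustrative and not taken from any Wav2Vec2 checkpoint.)

```python
import numpy as np

# Illustrative sizes only; real values come from the model config.
bsz, seq_len, num_heads, head_dim = 2, 5, 4, 8
embed_dim = num_heads * head_dim

# Projected states as produced by q_proj / k_proj / v_proj: (batch, seq_len, embed_dim).
states = np.random.randn(bsz, seq_len, embed_dim)

# Old path: split heads, then exchange the seq_len and head axes with swapaxes(1, 2).
old = states.reshape(bsz, seq_len, num_heads, head_dim).swapaxes(1, 2)

# New path: same split, axes exchanged via an explicit permutation (torch-style transpose(1, 2)).
new = states.reshape(bsz, seq_len, num_heads, head_dim).transpose(0, 2, 1, 3)

assert old.shape == new.shape == (bsz, num_heads, seq_len, head_dim)
assert np.array_equal(old, new)

# Either result is then flattened to (bsz * num_heads, seq_len, head_dim) for the bmm-based attention.
flat = new.reshape(bsz * num_heads, seq_len, head_dim)
assert flat.shape == (bsz * num_heads, seq_len, head_dim)
```

(The same equivalence applies to the other `swapaxes` → `ops.transpose` substitutions in this file.)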
@@ -995,49 +619,15 @@ def forward( return attn_output, attn_weights_reshaped, past_key_value -class Wav2Vec2FeedForward(nn.Module): - - """ - Wav2Vec2FeedForward is a class representing the feedforward network for the Wav2Vec2 model. - This class inherits from nn.Module and contains methods for initializing the network and forwarding the - feedforward layers. - - The __init__ method initializes the feedforward network with the provided configuration. - It sets up the intermediate dropout, intermediate dense, intermediate activation function, output dense, and output - dropout layers based on the configuration parameters. - - The forward method takes hidden states as input and processes them through the intermediate dense layer, - intermediate activation function, intermediate dropout layer, output dense layer, and output dropout layer. - It then returns the processed hidden states. - - Note: - This docstring is based on the provided code snippet and may need to be updated with additional information once - the entire class implementation is available. - """ - def __init__(self, config: Wav2Vec2Config): - """ - Initialize the Wav2Vec2FeedForward class. +WAV2VEC2_ATTENTION_CLASSES = { + "eager": Wav2Vec2Attention, +} - Args: - self: Instance of the class. - config (Wav2Vec2Config): Configuration object containing parameters for initialization. - The config parameter is of type Wav2Vec2Config and holds the configuration settings required for - initializing the feed-forward module. - It is expected to contain the following attributes: - - activation_dropout (float): Dropout probability for intermediate layers. - - hidden_size (int): Size of the hidden layers. - - intermediate_size (int): Size of the intermediate layer. - - hidden_act (str or function): Activation function for the hidden layers. - - Returns: - None. - - Raises: - None. - """ +class Wav2Vec2FeedForward(nn.Module): + def __init__(self, config): super().__init__() - self.intermediate_dropout = nn.Dropout(p=config.activation_dropout) + self.intermediate_dropout = nn.Dropout(config.activation_dropout) self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size) if isinstance(config.hidden_act, str): @@ -1046,30 +636,9 @@ def __init__(self, config: Wav2Vec2Config): self.intermediate_act_fn = config.hidden_act self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.output_dropout = nn.Dropout(p=config.hidden_dropout) + self.output_dropout = nn.Dropout(config.hidden_dropout) def forward(self, hidden_states): - """ - Constructs the feed-forward network for the Wav2Vec2 model. - - Args: - self (Wav2Vec2FeedForward): An instance of the Wav2Vec2FeedForward class. - hidden_states (torch.Tensor): The input hidden states to be passed through the feed-forward network. - - Returns: - torch.Tensor: The output hidden states after passing through the feed-forward network. - - Raises: - TypeError: If the input hidden_states is not of type torch.Tensor. - ValueError: If the input hidden_states does not have a rank of 2. - - This method takes the input hidden states and passes them through a feed-forward network consisting of several - layers. The feed-forward network is forwarded using intermediate dense layers, activation functions, - and dropout layers. The hidden_states are first passed through the intermediate dense layer, followed by the - intermediate activation function and dropout layer. The resulting hidden_states are then passed through the - output dense layer and another dropout layer. 
The final output hidden_states are returned. - Note that the input hidden_states must be a tensor of rank 2, representing a batch of hidden states. - """ hidden_states = self.intermediate_dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) hidden_states = self.intermediate_dropout(hidden_states) @@ -1080,94 +649,21 @@ def forward(self, hidden_states): class Wav2Vec2EncoderLayer(nn.Module): - - """A class representing an encoder layer of the Wav2Vec2 model. - - The Wav2Vec2EncoderLayer class inherits from the nn.Module class and implements the functionality of a single encoder - layer in the Wav2Vec2 model architecture. It consists of multiple sub-modules, including an attention mechanism, - dropout layers, layer normalization, and a feed-forward neural network. - - Attributes: - attention (Wav2Vec2Attention): The attention mechanism used in the layer. - dropout (nn.Dropout): The dropout layer applied to the hidden states. - layer_norm (nn.LayerNorm): The layer normalization applied to the hidden states. - feed_forward (Wav2Vec2FeedForward): The feed-forward neural network used in the layer. - final_layer_norm (nn.LayerNorm): The final layer normalization applied to the hidden states. - - Methods: - forward(hidden_states, attention_mask=None, output_attentions=False): - Applies the forward pass of the encoder layer. - - Args: - - - hidden_states (Tensor): The input hidden states. - - attention_mask (Tensor, optional): The attention mask to apply to the attention mechanism (default: None). - - output_attentions (bool, optional): Whether to return the attention weights (default: False). - - Returns: - - - outputs (tuple): A tuple containing the output hidden states. If output_attentions is True, the tuple - also contains the attention weights. - - Note: - The Wav2Vec2EncoderLayer class is designed to be used within the Wav2Vec2Encoder class, which stacks multiple - encoder layers to form the complete Wav2Vec2 model. - """ - def __init__(self, config: Wav2Vec2Config): - """ - Initializes a Wav2Vec2EncoderLayer instance. - - Args: - self (Wav2Vec2EncoderLayer): The instance of the Wav2Vec2EncoderLayer class. - config (Wav2Vec2Config): - An instance of Wav2Vec2Config containing configuration parameters for the encoder layer. - - - Wav2Vec2Config.hidden_size (int): The hidden size for the encoder layer. - - Wav2Vec2Config.num_attention_heads (int): The number of attention heads in the attention mechanism. - - Wav2Vec2Config.attention_dropout (float): The dropout probability for the attention mechanism. - - Wav2Vec2Config.hidden_dropout (float): The dropout probability for the hidden layers. - - Wav2Vec2Config.layer_norm_eps (float): The epsilon value for layer normalization. - - Returns: - None. - - Raises: - None. - """ + def __init__(self, config): super().__init__() - self.attention = Wav2Vec2Attention( + self.attention = WAV2VEC2_ATTENTION_CLASSES[config._attn_implementation]( embed_dim=config.hidden_size, num_heads=config.num_attention_heads, dropout=config.attention_dropout, is_decoder=False, ) - self.dropout = nn.Dropout(p=config.hidden_dropout) + + self.dropout = nn.Dropout(config.hidden_dropout) self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.feed_forward = Wav2Vec2FeedForward(config) self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) def forward(self, hidden_states, attention_mask=None, output_attentions=False): - """ - Constructs the Wav2Vec2EncoderLayer. 
- - This method applies the Wav2Vec2EncoderLayer to the input hidden_states. It performs attention, residual - connections, layer normalization, feed-forward, and final layer normalization. - - Args: - self (Wav2Vec2EncoderLayer): The instance of the Wav2Vec2EncoderLayer class. - hidden_states (torch.Tensor): The input hidden states of shape (batch_size, sequence_length, hidden_size). - attention_mask (torch.Tensor, optional): The attention mask of shape (batch_size, sequence_length). - Defaults to None. - output_attentions (bool, optional): Whether to output the attention weights. Defaults to False. - - Returns: - tuple: A tuple containing the hidden states of shape (batch_size, sequence_length, hidden_size). - If output_attentions is True, the tuple also contains the attention weights of shape (batch_size, - num_heads, sequence_length, sequence_length). - - Raises: - None - """ attn_residual = hidden_states hidden_states, attn_weights, _ = self.attention( hidden_states, attention_mask=attention_mask, output_attentions=output_attentions @@ -1188,59 +684,15 @@ def forward(self, hidden_states, attention_mask=None, output_attentions=False): class Wav2Vec2EncoderLayerStableLayerNorm(nn.Module): - - """ - This class represents an encoder layer in the Wav2Vec2 model with stable layer normalization. - It inherits from the nn.Module class. - - Attributes: - attention (Wav2Vec2Attention): An instance of the Wav2Vec2Attention class for attention mechanism. - dropout (nn.Dropout): An instance of the nn.Dropout class for dropout regularization. - layer_norm (nn.LayerNorm): An instance of the nn.LayerNorm class for stable layer normalization. - feed_forward (Wav2Vec2FeedForward): An instance of the Wav2Vec2FeedForward class for feed-forward layer. - final_layer_norm (nn.LayerNorm): An instance of the nn.LayerNorm class for stable layer normalization of final - output. - adapter_layer (Wav2Vec2AttnAdapterLayer or None): An instance of the Wav2Vec2AttnAdapterLayer class for adapter - layer, if provided. None otherwise. - - Methods: - forward: - Applies the encoder layer operations on the input hidden states. - - Args: - - - hidden_states (Tensor): The input hidden states. - - attention_mask (Optional[Tensor]): The attention mask tensor, if provided. Defaults to None. - - output_attentions (bool): Whether to output attention weights. Defaults to False. - - Returns: - - - Tuple[Tensor, Union[Tensor, None]]: A tuple containing the final hidden states and optionally the - attention weights, if output_attentions is True. - """ - def __init__(self, config: Wav2Vec2Config): - """ - Initializes a new instance of the Wav2Vec2EncoderLayerStableLayerNorm class. - - Args: - self: The instance of the class. - config (Wav2Vec2Config): The configuration object containing the settings for the encoder layer. - It should be an instance of the Wav2Vec2Config class. - - Returns: - None. - - Raises: - None. 
- """ + def __init__(self, config): super().__init__() - self.attention = Wav2Vec2Attention( + self.attention = WAV2VEC2_ATTENTION_CLASSES[config._attn_implementation]( embed_dim=config.hidden_size, num_heads=config.num_attention_heads, dropout=config.attention_dropout, is_decoder=False, ) - self.dropout = nn.Dropout(p=config.hidden_dropout) + self.dropout = nn.Dropout(config.hidden_dropout) self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.feed_forward = Wav2Vec2FeedForward(config) self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -1252,27 +704,10 @@ def __init__(self, config: Wav2Vec2Config): def forward( self, - hidden_states: Tensor, - attention_mask: Optional[Tensor] = None, + hidden_states: mindspore.Tensor, + attention_mask: Optional[mindspore.Tensor] = None, output_attentions: bool = False, ): - """ - Constructs the Wav2Vec2EncoderLayerStableLayerNorm. - - Args: - self: Instance of the Wav2Vec2EncoderLayerStableLayerNorm class. - hidden_states (Tensor): The input hidden states to be processed by the encoder layer. - attention_mask (Optional[Tensor]): Optional tensor representing the attention mask. - Defaults to None. If provided, masks certain elements in the attention computation. - output_attentions (bool): Flag indicating whether to output attention weights during computation. - Defaults to False. - - Returns: - Tuple: A tuple containing the processed hidden states and optionally the attention weights. - - Raises: - None. - """ attn_residual = hidden_states hidden_states = self.layer_norm(hidden_states) hidden_states, attn_weights, _ = self.attention( @@ -1294,103 +729,41 @@ def forward( class Wav2Vec2Encoder(nn.Module): - - """ - A class representing the Wav2Vec2Encoder in the Wav2Vec2 model architecture. - - The Wav2Vec2Encoder is responsible for encoding the input hidden states with positional embeddings and applying - a series of Wav2Vec2EncoderLayer for feature extraction. - - Attributes: - config (Wav2Vec2Config): The configuration for the Wav2Vec2 model. - pos_conv_embed (Wav2Vec2PositionalConvEmbedding): The positional convolutional embedding layer. - layer_norm (nn.LayerNorm): The layer normalization layer. - dropout (nn.Dropout): The dropout layer. - layers (nn.ModuleList): The list of Wav2Vec2EncoderLayer instances. - - Methods: - forward(hidden_states, attention_mask=None, output_attentions=False, output_hidden_states=False, return_dict=True): - Applies the Wav2Vec2Encoder layer-wise to the hidden states. - - Args: - - - hidden_states (Tensor): The input hidden states. - - attention_mask (Optional[Tensor], optional): The attention mask tensor. Defaults to None. - - output_attentions (bool, optional): Whether to output the attentions. Defaults to False. - - output_hidden_states (bool, optional): Whether to output the hidden states. Defaults to False. - - return_dict (bool, optional): Whether to return a BaseModelOutput dictionary. Defaults to True. - - Returns: - - - BaseModelOutput or Tuple[Tensor, Tuple[Tensor], Tuple[Tensor]]: The encoded hidden states, all hidden - states (if output_hidden_states=True), and all self-attentions (if output_attentions=True). - """ - def __init__(self, config: Wav2Vec2Config): - """ - Initializes the Wav2Vec2Encoder class. - - Args: - self: The instance of the class. - config (Wav2Vec2Config): An instance of the Wav2Vec2Config class containing the configuration parameters - for the encoder. 
It specifies the configuration for the Wav2Vec2 model, such as hidden size, - layer normalization epsilon, hidden dropout probability, and the number of hidden layers. - - Returns: - None. - - Raises: - None: This method does not raise any exceptions explicitly. However, exceptions may be raised during the - initialization of the Wav2Vec2PositionalConvEmbedding, nn.LayerNorm, nn.Dropout, and nn.ModuleList objects. - """ + def __init__(self, config): super().__init__() self.config = config self.pos_conv_embed = Wav2Vec2PositionalConvEmbedding(config) self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(p=config.hidden_dropout) + self.dropout = nn.Dropout(config.hidden_dropout) self.layers = nn.ModuleList([Wav2Vec2EncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" def forward( self, - hidden_states: Tensor, - attention_mask: Optional[Tensor] = None, + hidden_states: mindspore.tensor, + attention_mask: Optional[mindspore.Tensor] = None, output_attentions: bool = False, output_hidden_states: bool = False, return_dict: bool = True, ): - """ - Constructs the Wav2Vec2Encoder. - - Args: - self (Wav2Vec2Encoder): The instance of the Wav2Vec2Encoder class. - hidden_states (Tensor): The input hidden states. A tensor of shape (batch_size, sequence_length, hidden_size). - attention_mask (Optional[Tensor]): An optional tensor specifying the attention mask. Defaults to None. - output_attentions (bool): Whether to output attentions. Defaults to False. - output_hidden_states (bool): Whether to output hidden states. Defaults to False. - return_dict (bool): Whether to return a dictionary. Defaults to True. - - Returns: - None. - - Raises: - ValueError: If the hidden_states tensor has invalid shape or type. - ValueError: If the attention_mask tensor has invalid shape or type. - TypeError: If the output_attentions or output_hidden_states parameters are not of type bool. - TypeError: If the return_dict parameter is not of type bool. 
- """ all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None if attention_mask is not None: # make sure padded tokens output 0 - expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + expand_attention_mask = attention_mask.unsqueeze(-1).tile((1, 1, hidden_states.shape[2])) hidden_states[~expand_attention_mask] = 0 - - # extend attention_mask - attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype) - attention_mask = attention_mask * float(ops.finfo(hidden_states.dtype).min) - attention_mask = attention_mask.expand( - attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] - ) + if self._use_flash_attention_2: + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + else: + # extend attention_mask + attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype) + attention_mask = attention_mask * float(ops.finfo(hidden_states.dtype).min) + attention_mask = attention_mask.broadcast_to( + (attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]) + ) position_embeddings = self.pos_conv_embed(hidden_states) hidden_states = hidden_states + position_embeddings @@ -1406,9 +779,18 @@ def forward( skip_the_layer = self.training and (dropout_probability < self.config.layerdrop) if not skip_the_layer: - layer_outputs = layer( - hidden_states, attention_mask=attention_mask, output_attentions=output_attentions - ) + # under deepspeed zero3 all gpus must run in sync + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer.__call__, + hidden_states, + attention_mask, + output_attentions, + ) + else: + layer_outputs = layer( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) hidden_states = layer_outputs[0] if skip_the_layer: @@ -1430,46 +812,17 @@ def forward( class Wav2Vec2EncoderStableLayerNorm(nn.Module): - - """ - Wav2Vec2EncoderStableLayerNorm is a Python class that represents an encoder with stable layer normalization for - the Wav2Vec2 model. This class inherits from the nn.Module module. - - This class initializes with a Wav2Vec2Config object and forwards a series of encoder layers with stable - layer normalization. The encoder layers operate on the input hidden states and optionally apply - attention masks, producing hidden states with added positional embeddings and layer normalization. - - The forward method applies the encoder layers to the input hidden states, handling attention masks, - outputting hidden states, and attentions based on the specified configurations. - - This class provides functionalities for building and using a stable layer normalization encoder for the Wav2Vec2 - model, supporting various output options and configurations. - - For detailed information on the class methods and usage, please refer to the specific method docstrings within - the source code. - """ - def __init__(self, config: Wav2Vec2Config): - """ - Initializes an instance of the Wav2Vec2EncoderStableLayerNorm class. - - Args: - self: The object instance. - config (Wav2Vec2Config): The configuration object for the Wav2Vec2 model. 
- - Returns: - None - - Raises: - None - """ + def __init__(self, config): super().__init__() self.config = config self.pos_conv_embed = Wav2Vec2PositionalConvEmbedding(config) self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(p=config.hidden_dropout) + self.dropout = nn.Dropout(config.hidden_dropout) self.layers = nn.ModuleList( [Wav2Vec2EncoderLayerStableLayerNorm(config) for _ in range(config.num_hidden_layers)] ) + self.gradient_checkpointing = False + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" def forward( self, @@ -1479,43 +832,29 @@ def forward( output_hidden_states=False, return_dict=True, ): - """ - Constructs the Wav2Vec2EncoderStableLayerNorm. - - Args: - - - hidden_states: The input hidden states of shape (batch_size, sequence_length, hidden_size). - - attention_mask: Optional attention mask of shape (batch_size, sequence_length). - It is used to mask the attention scores. - - output_attentions: Boolean flag indicating whether to output attention weights. Defaults to False. - - output_hidden_states: Boolean flag indicating whether to output hidden states of all layers. Defaults to False. - - return_dict: Boolean flag indicating whether to return a dictionary as output. Defaults to True. - - Returns: - None - - Raises: - None - """ all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None if attention_mask is not None: # make sure padded tokens are not attended to - expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + expand_attention_mask = attention_mask.unsqueeze(-1).tile((1, 1, hidden_states.shape[2])) hidden_states[~expand_attention_mask] = 0 - - # extend attention_mask - attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype) - attention_mask = attention_mask * float(ops.finfo(hidden_states.dtype).min) - attention_mask = attention_mask.expand( - attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] - ) + if self._use_flash_attention_2: + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + else: + # extend attention_mask + attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype) + attention_mask = attention_mask * float(ops.finfo(hidden_states.dtype).min) + attention_mask = attention_mask.broadcast_to( + (attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]) + ) position_embeddings = self.pos_conv_embed(hidden_states) hidden_states = hidden_states + position_embeddings hidden_states = self.dropout(hidden_states) + for layer in self.layers: if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) @@ -1525,9 +864,17 @@ def forward( skip_the_layer = self.training and (dropout_probability < self.config.layerdrop) if not skip_the_layer: - layer_outputs = layer( - hidden_states, attention_mask=attention_mask, output_attentions=output_attentions - ) + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer.__call__, + hidden_states, + attention_mask, + output_attentions, + ) + else: + layer_outputs = layer( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) hidden_states = layer_outputs[0] if skip_the_layer: @@ -1550,30 +897,13 @@ def forward( ) -class 
Wav2Vec2GumbelVectorQuantizer(nn.Module): - """ - Vector quantization using gumbel softmax. See `[CATEGORICAL REPARAMETERIZATION WITH - GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf) for more information. - """ - def __init__(self, config: Wav2Vec2Config): - """ - Initializes a new instance of the Wav2Vec2GumbelVectorQuantizer class. - - Args: - self: The instance of the Wav2Vec2GumbelVectorQuantizer class. - config (Wav2Vec2Config): An instance of the Wav2Vec2Config class containing configuration parameters - for the vector quantizer. - - - num_codevector_groups (int): The number of codevector groups. - - num_codevectors_per_group (int): The number of codevectors per group. - - codevector_dim (int): The dimension of the codevectors. - - Returns: - None. +class Wav2Vec2GumbelVectorQuantizer(nn.Module): + """ + Vector quantization using gumbel softmax. See `[CATEGORICAL REPARAMETERIZATION WITH + GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf) for more information. + """ - Raises: - ValueError: If `config.codevector_dim` is not divisible by `config.num_codevector_groups` for concatenation. - """ + def __init__(self, config): super().__init__() self.num_groups = config.num_codevector_groups self.num_vars = config.num_codevectors_per_group @@ -1585,8 +915,8 @@ def __init__(self, config: Wav2Vec2Config): ) # storage for codebook variables (codewords) - self.codevectors = Parameter( - ops.zeros((1, self.num_groups * self.num_vars, config.codevector_dim // self.num_groups)) + self.codevectors = nn.Parameter( + ops.randn(1, self.num_groups * self.num_vars, config.codevector_dim // self.num_groups) ) self.weight_proj = nn.Linear(config.conv_dim[-1], self.num_groups * self.num_vars) @@ -1595,53 +925,17 @@ def __init__(self, config: Wav2Vec2Config): @staticmethod def _compute_perplexity(probs, mask=None): - """ - Compute the perplexity of given probability distribution. - - Args: - probs (Tensor): The input probability distribution. It should be a tensor of shape (N, D) where N is the - number of elements and D is the dimensionality of the distribution. mask (Tensor, optional): - A boolean tensor of the same shape as probs, indicating which elements to include in the computation. - If provided, only the elements where mask is True will be considered. Defaults to None. - - Returns: - None: This method does not return anything but updates the internal state of the class. - - Raises: - ValueError: If the shape of probs and mask do not match. - ValueError: If the dimensionality of probs is not 2. - """ if mask is not None: - mask_extended = mask.flatten()[:, None, None].expand(probs.shape) + mask_extended = mask.flatten()[:, None, None].broadcast_to(probs.shape) probs = ops.where(mask_extended, probs, ops.zeros_like(probs)) - marginal_probs = probs.sum(axis=0) / mask.sum() + marginal_probs = ops.sum(probs, dim=0) / mask.sum() else: - marginal_probs = probs.mean(axis=0) + marginal_probs = ops.mean(probs, dim=0) perplexity = ops.exp(-ops.sum(marginal_probs * ops.log(marginal_probs + 1e-7), dim=-1)).sum() return perplexity def forward(self, hidden_states, mask_time_indices=None): - ''' - Constructs codevectors and computes perplexity for Wav2Vec2GumbelVectorQuantizer. - - Args: - self: The instance of the Wav2Vec2GumbelVectorQuantizer class. - hidden_states (tensor): The input hidden states with shape (batch_size, sequence_length, hidden_size). 
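`_compute_perplexity` above is the exponentiated entropy of the marginal codevector distribution, summed over groups; it later feeds the diversity loss, where higher perplexity means the codebook is used more evenly. A small NumPy restatement of the same formula, with illustrative shapes:

```python
import numpy as np
from typing import Optional

def compute_perplexity(probs: np.ndarray, mask: Optional[np.ndarray] = None) -> float:
    """probs: (frames, num_groups, num_vars) soft codevector distributions; mask: (frames,) bool."""
    if mask is not None:
        probs = np.where(mask[:, None, None], probs, 0.0)
        marginal = probs.sum(axis=0) / mask.sum()      # average over unmasked frames only
    else:
        marginal = probs.mean(axis=0)                  # (num_groups, num_vars)
    entropy = -np.sum(marginal * np.log(marginal + 1e-7), axis=-1)
    return float(np.exp(entropy).sum())                # exp(entropy) per group, summed over groups

uniform = np.full((10, 1, 4), 0.25)                    # uniform over 4 codevectors in a single group
print(compute_perplexity(uniform))                     # close to 4.0: every codevector used equally
```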
- mask_time_indices (tensor, optional): A binary mask tensor of shape (batch_size, sequence_length) where - 1s indicate valid time indices and 0s indicate masked time indices. Default is None. - - Returns: - tuple: - A tuple containing: - - - codevectors (tensor): The forwarded codevectors with shape (batch_size, sequence_length, -1). - - perplexity (tensor): The computed perplexity. - - Raises: - ValueError: If the input hidden_states tensor has an invalid shape. - RuntimeError: If the function encounters a runtime error during computation. - ''' batch_size, sequence_length, hidden_size = hidden_states.shape # project to codevector dim @@ -1650,8 +944,8 @@ def forward(self, hidden_states, mask_time_indices=None): if self.training: # sample code vector probs via gumbel in differentiateable way - codevector_probs = ops.gumbel_softmax( - hidden_states.float(), tau=float(self.temperature), hard=True + codevector_probs = nn.functional.gumbel_softmax( + hidden_states.float(), tau=self.temperature, hard=True ).type_as(hidden_states) # compute perplexity @@ -1662,13 +956,12 @@ def forward(self, hidden_states, mask_time_indices=None): else: # take argmax in non-differentiable way # comptute hard codevector distribution (one hot) - # NOTE: 把 hidden_states 变成 hardsoftmax(dim=-1) 形式 - codevector_idx = ops.argmax(hidden_states, dim=-1) # (364) => (364, 1) - x = hidden_states.new_zeros(hidden_states.shape) # (364, 320) - index = codevector_idx.view(-1, 1) - update = ops.ones_like(index, dtype=hidden_states.dtype) # fill with onehot - codevector_probs = ops.scatter(x, -1, index, update) - codevector_probs = codevector_probs.view(batch_size * sequence_length, self.num_groups, -1) # (182, 2, 320) + codevector_idx = ops.argmax(hidden_states, dim=-1).view(-1, 1) + codevector_probs = ops.scatter( + ops.zeros(hidden_states.shape, dtype=hidden_states.dtype), + -1, codevector_idx, ops.ones(codevector_idx.shape, dtype=hidden_states.dtype) + ) + codevector_probs = codevector_probs.view(batch_size * sequence_length, self.num_groups, -1) perplexity = self._compute_perplexity(codevector_probs, mask_time_indices) @@ -1682,39 +975,7 @@ def forward(self, hidden_states, mask_time_indices=None): class Wav2Vec2Adapter(nn.Module): - - """ - Wav2Vec2Adapter is a class that represents an adapter layer for adapting the hidden states of a Wav2Vec2 model. - This class inherits from nn.Module and implements methods for initializing and forwarding the adapter layer. - - Attributes: - proj (nn.Linear or None): A dense layer used for projecting hidden states if output_hidden_size is - different from hidden_size. - proj_layer_norm (nn.LayerNorm or None): A layer normalization module applied after projection if needed. - layers (nn.ModuleList): A list of Wav2Vec2AdapterLayer instances representing adapter layers. - layerdrop (float): The probability of dropping a layer during training. - - Methods: - __init__: Initializes the Wav2Vec2Adapter object with the provided configuration. - forward: Applies the adapter layer transformations to the input hidden states. - - """ - def __init__(self, config: Wav2Vec2Config): - """ - Initializes a new instance of the Wav2Vec2Adapter class. - - Args: - self: The current instance of the class. - config (Wav2Vec2Config): An instance of Wav2Vec2Config containing configuration parameters for the adapter. - This parameter is required for initializing the adapter and must be an instance of Wav2Vec2Config. - - Returns: - None. 
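At inference the quantizer above skips Gumbel sampling: it takes the argmax per group and scatters a 1 into a zero tensor, producing a deterministic one-hot distribution that then weights the codevectors. The `ops.scatter` call is equivalent to this NumPy construction (function name and values are illustrative):

```python
import numpy as np

def hard_codevector_probs(scores: np.ndarray, num_groups: int) -> np.ndarray:
    """scores: (frames * num_groups, num_vars) projected logits for every group."""
    idx = scores.argmax(axis=-1)                          # winning codevector per (frame, group)
    one_hot = np.zeros_like(scores)
    one_hot[np.arange(scores.shape[0]), idx] = 1.0        # same effect as the ops.scatter(...) of ones
    return one_hot.reshape(-1, num_groups, scores.shape[-1])

scores = np.array([[0.1, 2.0, -1.0, 0.3],                 # frame 0, group 0
                   [0.7, 0.2,  0.1, 0.0]])                # frame 0, group 1
print(hard_codevector_probs(scores, num_groups=2))        # one-hot rows: [0,1,0,0] and [1,0,0,0]
```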
- - Raises: - TypeError: If the config parameter is not of type Wav2Vec2Config. - ValueError: If the output_hidden_size in the config parameter does not match the hidden_size. - """ + def __init__(self, config): super().__init__() # feature dim might need to be down-projected @@ -1724,73 +985,28 @@ def __init__(self, config: Wav2Vec2Config): else: self.proj = self.proj_layer_norm = None - self.layers = nn.ModuleList([Wav2Vec2AdapterLayer(config) for _ in range(config.num_adapter_layers)]) + self.layers = nn.ModuleList(Wav2Vec2AdapterLayer(config) for _ in range(config.num_adapter_layers)) self.layerdrop = config.layerdrop def forward(self, hidden_states): - """ - This method forwards the hidden states by applying transformations and layers. - - Args: - self (object): The instance of the Wav2Vec2Adapter class. - hidden_states (numpy.ndarray): The input hidden states to be processed. - It is expected to be a 3D array with shape (batch_size, sequence_length, hidden_size). - - Returns: - numpy.ndarray: The processed hidden states with shape (batch_size, sequence_length, hidden_size). - - Raises: - None - """ # down project hidden_states if necessary if self.proj is not None and self.proj_layer_norm is not None: hidden_states = self.proj(hidden_states) hidden_states = self.proj_layer_norm(hidden_states) - hidden_states = hidden_states.swapaxes(1, 2) + hidden_states = ops.transpose(hidden_states, 1, 2) for layer in self.layers: layerdrop_prob = np.random.random() if not self.training or (layerdrop_prob > self.layerdrop): hidden_states = layer(hidden_states) - hidden_states = hidden_states.swapaxes(1, 2) + hidden_states = ops.transpose(hidden_states, 1, 2) return hidden_states class Wav2Vec2AdapterLayer(nn.Module): - - ''' - Wav2Vec2AdapterLayer is a Python class that represents an adapter layer for the Wav2Vec2 model. - This class inherits from nn.Module. - - The adapter layer contains methods for initialization and forwardion. - - The __init__ method initializes the adapter layer with the provided configuration. It sets up a 1D convolutional - layer with specified parameters such as kernel size, stride, padding, and bias. - - The forward method takes hidden_states as input and applies the convolutional layer followed by the - gated linear unit (GLU) activation function. It then returns the processed hidden states. - - This class provides functionality for creating and processing adapter layers within the Wav2Vec2 model. - ''' - def __init__(self, config: Wav2Vec2Config): - """ - __init__ - - Initializes a new instance of the Wav2Vec2AdapterLayer class. - - Args: - self: The instance of the Wav2Vec2AdapterLayer class. - config (Wav2Vec2Config): An instance of the Wav2Vec2Config class containing the configuration parameters - for the adapter layer. - - Returns: - None. - - Raises: - None. - """ + def __init__(self, config): super().__init__() self.conv = nn.Conv1d( config.output_hidden_size, @@ -1798,55 +1014,17 @@ def __init__(self, config: Wav2Vec2Config): config.adapter_kernel_size, stride=config.adapter_stride, padding=1, - bias=True, ) def forward(self, hidden_states): - """ - Method to forward the Wav2Vec2AdapterLayer. - - Args: - self (Wav2Vec2AdapterLayer): The instance of the Wav2Vec2AdapterLayer class. - hidden_states (Tensor): The input hidden states to be processed. It should be a tensor. - - Returns: - Tensor: The processed hidden states after applying convolution and gated linear units (GLU) operation. - - Raises: - None. 
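Each adapter layer above is a strided `Conv1d` that doubles the channel count so the following GLU can halve it back, while the stride shrinks the time axis; `Wav2Vec2Adapter.forward` transposes to channels-first before the loop and may skip whole layers via LayerDrop during training. A shape-only sketch of the downsampling, assuming the usual kernel size 3, stride 2 and padding 1 (illustrative, not read from a real config):

```python
def conv1d_output_length(seq_len: int, kernel_size: int = 3, stride: int = 2, padding: int = 1) -> int:
    """Standard Conv1d length formula, applied once per adapter layer."""
    return (seq_len + 2 * padding - kernel_size) // stride + 1

seq_len = 100
for _ in range(3):                       # e.g. three adapter layers
    seq_len = conv1d_output_length(seq_len)
print(seq_len)                           # 100 -> 50 -> 25 -> 13
```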
- """ hidden_states = self.conv(hidden_states) - hidden_states = F.glu(hidden_states, dim=1) + hidden_states = nn.functional.glu(hidden_states, dim=1) + return hidden_states class Wav2Vec2AttnAdapterLayer(nn.Module): - - """ - This class represents a single layer of an attention adapter module in the Wav2Vec2 model. The adapter module is - designed to enhance the training throughput by directly implementing the adapter modules with 3D tensor weights as - parameters, without using ModuleList. - - Attributes: - input_dim (int): The dimension of the input tensor to the adapter module. - hidden_dim (int): The hidden dimension of the adapter module. - norm (nn.LayerNorm): A layer normalization module to normalize the hidden states. - linear_1 (nn.Linear): A linear transformation module that maps the hidden states to the input dimension. - act_fn (nn.ReLU): An activation function module that applies the ReLU activation to the hidden states. - linear_2 (nn.Linear): A linear transformation module that maps the hidden states back to the hidden dimension. - - Methods: - forward: - Applies the attention adapter layer operations to the input hidden states tensor. - - Args: - - - hidden_states (Tensor): The input hidden states tensor. - Returns: - - - Tensor: The output hidden states tensor after applying the attention adapter layer operations. - """ - def __init__(self, config: Wav2Vec2Config): + def __init__(self, config): """ Implements adapter modules directly with 3D tensor weight as parameters and without using ModuleList to speed up training throughput. @@ -1860,28 +1038,13 @@ def __init__(self, config: Wav2Vec2Config): self.act_fn = nn.ReLU() self.linear_2 = nn.Linear(self.input_dim, self.hidden_dim) - def forward(self, hidden_states: Tensor): - """ - Method: forward - - Description: - Constructs the adaptation layer for the Wav2Vec2AttnAdapterModel. - - Args: - self: (Wav2Vec2AttnAdapterLayer) The instance of the Wav2Vec2AttnAdapterLayer class. - hidden_states: (Tensor) The input hidden states to be processed by the adaptation layer. - - Returns: - None - - Raises: - ValueError: If the input hidden_states tensor is empty or invalid. - TypeError: If the input hidden_states is not of type Tensor. - """ + def forward(self, hidden_states: mindspore.Tensor): hidden_states = self.norm(hidden_states) + hidden_states = self.linear_1(hidden_states) hidden_states = self.act_fn(hidden_states) hidden_states = self.linear_2(hidden_states) + return hidden_states @@ -1890,57 +1053,61 @@ class Wav2Vec2PreTrainedModel(PreTrainedModel): An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ + config_class = Wav2Vec2Config base_model_prefix = "wav2vec2" main_input_name = "input_values" + supports_gradient_checkpointing = True - def _init_weights(self, cell): + def _init_weights(self, module): """Initialize the weights""" # Wav2Vec2ForPreTraining last 2 linear layers need standard Linear init. 
- if isinstance(cell, Wav2Vec2ForPreTraining): - cell.project_hid._is_initialized = True - cell.project_q._is_initialized = True + if isinstance(module, Wav2Vec2ForPreTraining): + module.project_hid.reset_parameters() + module.project_q.reset_parameters() + module.project_hid._is_initialized = True + module.project_q._is_initialized = True # gumbel softmax requires special init - elif isinstance(cell, Wav2Vec2GumbelVectorQuantizer): - cell.weight_proj.weight.set_data(initializer(Normal(1.0), cell.weight_proj.weight.shape, cell.weight_proj.weight.dtype)) - cell.weight_proj.bias.set_data(initializer('zeros', cell.weight_proj.bias.shape, cell.weight_proj.bias.dtype)) - cell.codevectors.set_data(initializer('uniform', cell.codevectors.shape, cell.codevectors.dtype)) - elif isinstance(cell, Wav2Vec2PositionalConvEmbedding): - cell.conv.weight.set_data( - initializer(Normal(2 * math.sqrt(1 / (cell.conv.kernel_size[0] * cell.conv.in_channels))), - cell.conv.weight.shape, cell.conv.weight.dtype)) - cell.conv.bias.set_data(initializer('zeros', cell.conv.bias.shape, cell.conv.bias.dtype)) - elif isinstance(cell, Wav2Vec2FeatureProjection): - k = math.sqrt(1 / cell.projection.in_channels) - cell.projection.weight.set_data( - initializer(Uniform(k), cell.projection.weight.shape, cell.projection.weight.dtype)) - cell.projection.bias.set_data( - initializer(Uniform(k), cell.projection.bias.shape, cell.projection.bias.dtype)) - elif isinstance(cell, nn.Linear): - cell.weight.set_data(initializer(Normal(self.config.initializer_range), cell.weight.shape, cell.weight.dtype)) - if cell.bias is not None: - cell.bias.set_data(initializer('zeros', cell.bias.shape, cell.bias.dtype)) - elif isinstance(cell, (nn.LayerNorm, nn.GroupNorm)): - cell.weight.set_data(initializer('ones', cell.weight.shape, cell.weight.dtype)) - cell.bias.set_data(initializer('zeros', cell.bias.shape, cell.bias.dtype)) - elif isinstance(cell, nn.Conv1d): - cell.weight.set_data(initializer('he_normal', cell.weight.shape, cell.weight.dtype)) - if cell.bias is not None: - k = math.sqrt(cell.group / (cell.in_channels * cell.kernel_size[0])) - cell.bias.set_data(initializer(Uniform(k), cell.bias.shape, cell.bias.dtype)) + elif isinstance(module, Wav2Vec2GumbelVectorQuantizer): + nn.init.normal_(module.weight_proj.weight, mean=0.0, std=1) + nn.init.zeros_(module.weight_proj.bias) + nn.init.uniform_(module.codevectors) + elif isinstance(module, Wav2Vec2PositionalConvEmbedding): + nn.init.normal_( + module.conv.weight, + mean=0, + std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)), + ) + nn.init.constant_(module.conv.bias, 0) + elif isinstance(module, Wav2Vec2FeatureProjection): + k = math.sqrt(1 / module.projection.in_features) + nn.init.uniform_(module.projection.weight, a=-k, b=k) + nn.init.uniform_(module.projection.bias, a=-k, b=k) + elif isinstance(module, nn.Linear): + nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)): + nn.init.zeros_(module.bias) + nn.init.ones_(module.weight) + elif isinstance(module, nn.Conv1d): + nn.init.kaiming_normal_(module.weight) + if module.bias is not None: + k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0])) + nn.init.uniform_(module.bias, a=-k, b=k) def _get_feat_extract_output_lengths( - self, input_lengths: Union[Tensor, int], add_adapter: Optional[bool] = None + self, input_lengths: Union[mindspore.Tensor, int], 
add_adapter: Optional[bool] = None ): """ Computes the output length of the convolutional layers """ + add_adapter = self.config.add_adapter if add_adapter is None else add_adapter def _conv_out_length(input_length, kernel_size, stride): # 1D convolutional layer output length formula taken - # from https://pyops.org/docs/stable/generated/ops.nn.Conv1d.html - return (input_length - kernel_size) // stride + 1 + return ops.div(input_length - kernel_size, stride, rounding_mode="floor") + 1 for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride): input_lengths = _conv_out_length(input_lengths, kernel_size, stride) @@ -1952,26 +1119,11 @@ def _conv_out_length(input_length, kernel_size, stride): return input_lengths def _get_feature_vector_attention_mask( - self, feature_vector_length: int, attention_mask: Tensor, add_adapter=None + self, feature_vector_length: int, attention_mask: mindspore.Tensor, add_adapter=None ): - """ - This method calculates the attention mask for the feature vectors in a Wav2Vec2 model. - - Args: - self (Wav2Vec2PreTrainedModel): The instance of the Wav2Vec2PreTrainedModel class. - feature_vector_length (int): The length of the feature vectors. - attention_mask (Tensor): The attention mask tensor. - add_adapter (Optional): An optional parameter to add adapter. - - Returns: - attention_mask (Tensor): The attention mask tensor for the feature vectors. - - Raises: - None. - """ # Effectively attention_mask.sum(-1), but not inplace to be able to run # on inference mode. - non_padded_lengths = attention_mask.cumsum(axis=-1)[:, -1] + non_padded_lengths = ops.cumsum(attention_mask, dim=-1)[:, -1] output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter) output_lengths = output_lengths.to(mindspore.int64) @@ -1983,37 +1135,21 @@ def _get_feature_vector_attention_mask( ) # these two operations makes sure that all values before the output lengths idxs are attended to attention_mask[(ops.arange(attention_mask.shape[0]), output_lengths - 1)] = 1 - attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool() + attention_mask = attention_mask.flip([-1]).int().cumsum(-1).flip([-1]).bool() return attention_mask def _get_adapters(self): - """ - Method _get_adapters in the class Wav2Vec2PreTrainedModel. - - Args: - self (object): The instance of the class Wav2Vec2PreTrainedModel. - - Returns: - dict: A dictionary containing adapter weights. - The keys are composed of the parameter names from the adapter layers and the LM head, and the values are - the corresponding parameters. - - Raises: - ValueError: If the 'adapter_attn_dim' attribute in 'config' is not defined, a ValueError is raised with - a message indicating that the class has no adapter layers and prompting to define - 'config.adapter_attn_dim'. - """ if self.config.adapter_attn_dim is None: raise ValueError(f"{self.__class__} has no adapter layers. 
Make sure to define `config.adapter_attn_dim`.") adapter_weights = {} - for name, module in self.parameters_and_names(): + for name, module in self.named_modules(): if isinstance(module, Wav2Vec2AttnAdapterLayer): - for param_name, param in module.parameters_and_names(): + for param_name, param in module.named_parameters(): adapter_weights[".".join([name, param_name])] = param if isinstance(self, Wav2Vec2ForCTC): - for name, param in self.lm_head.parameters_and_names(): + for name, param in self.lm_head.named_parameters(): adapter_weights[".".join(["lm_head", name])] = param return adapter_weights @@ -2023,7 +1159,7 @@ def init_adapter_layers(self): (Re-)initialize attention adapter layers and lm head for adapter-only fine-tuning """ # init attention adapters - for module in self.cells(): + for module in self.modules(): if isinstance(module, Wav2Vec2AttnAdapterLayer): self._init_weights(module) @@ -2047,9 +1183,9 @@ def load_adapter(self, target_lang: str, force_load=True, **kwargs): force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. @@ -2060,7 +1196,7 @@ def load_adapter(self, target_lang: str, force_load=True, **kwargs): the token generated when running `huggingface-cli login` (stored in `~/.huggingface`). revision (`str`, *optional*, defaults to `"main"`): The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on hf-mirror.com, so `revision` can be any + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any identifier allowed by git. @@ -2076,22 +1212,23 @@ def load_adapter(self, target_lang: str, force_load=True, **kwargs): - Activate the special ["offline-mode"](https://hf-mirror.com/transformers/installation.html#offline-mode) to + Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to use this method in a firewalled environment. - Example: - ```python - >>> from transformers import Wav2Vec2ForCTC, AutoProcessor - ... 
- >>> ckpt = "facebook/mms-1b-all" - >>> processor = AutoProcessor.from_pretrained(ckpt) - >>> model = Wav2Vec2ForCTC.from_pretrained(ckpt, target_lang="eng") - >>> # set specific language - >>> processor.tokenizer.set_target_lang("spa") - >>> model.load_adapter("spa") - ``` + Examples: + + ```python + >>> from transformers import Wav2Vec2ForCTC, AutoProcessor + + >>> ckpt = "facebook/mms-1b-all" + >>> processor = AutoProcessor.from_pretrained(ckpt) + >>> model = Wav2Vec2ForCTC.from_pretrained(ckpt, target_lang="eng") + >>> # set specific language + >>> processor.tokenizer.set_target_lang("spa") + >>> model.load_adapter("spa") + ``` """ if self.config.adapter_attn_dim is None: raise ValueError(f"Cannot load_adapter for {target_lang} if `config.adapter_attn_dim` is not defined.") @@ -2102,16 +1239,17 @@ def load_adapter(self, target_lang: str, force_load=True, **kwargs): cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) + resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", False) token = kwargs.pop("token", None) use_auth_token = kwargs.pop("use_auth_token", None) - use_safetensors = kwargs.pop("use_safetensors", False) + revision = kwargs.pop("revision", None) + use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False) if use_auth_token is not None: warnings.warn( - "The `use_auth_token` argument is deprecated. Please use `token` instead.", + "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.", FutureWarning, ) if token is not None: @@ -2135,26 +1273,28 @@ def load_adapter(self, target_lang: str, force_load=True, **kwargs): resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, + token=token, + revision=revision, cache_dir=cache_dir, ) - # state_dict = safe_load_file(weight_path) - state_dict = None + state_dict = safe_load_file(weight_path) + except EnvironmentError: if use_safetensors: # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted # to the original exception. raise - except Exception as exc: + except Exception: # For any other exception, we throw a generic error. if use_safetensors: raise EnvironmentError( f"Can't load the model for '{model_path_or_id}'. If you were trying to load it" - " from 'https://hf-mirror.com/models', make sure you don't have a local directory with the" + " from 'https://huggingface.co/models', make sure you don't have a local directory with the" f" same name. Otherwise, make sure '{model_path_or_id}' is the correct path to a" f" directory containing a file named {filepath}." - ) from exc + ) # 2. If this didn't work let's try loading a PyTorch adapter weight if state_dict is None: @@ -2168,29 +1308,27 @@ def load_adapter(self, target_lang: str, force_load=True, **kwargs): resume_download=resume_download, proxies=proxies, local_files_only=local_files_only, + token=token, + revision=revision, cache_dir=cache_dir, ) - weights_only_kwarg = {"weights_only": True} - state_dict = ops.load( - weight_path, - map_location="cpu", - **weights_only_kwarg, - ) + state_dict = load(weight_path) except EnvironmentError: # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted # to the original exception. 
raise - except Exception as exc: + except Exception as e: + print(e) # For any other exception, we throw a generic error. raise EnvironmentError( f"Can't load the model for '{model_path_or_id}'. If you were trying to load it" - " from 'https://hf-mirror.com/models', make sure you don't have a local directory with the" + " from 'https://huggingface.co/models', make sure you don't have a local directory with the" f" same name. Otherwise, make sure '{model_path_or_id}' is the correct path to a" f" directory containing a file named {filepath}." - ) from exc + ) adapter_weights = self._get_adapters() unexpected_keys = set(state_dict.keys()) - set(adapter_weights.keys()) @@ -2210,7 +1348,7 @@ def load_adapter(self, target_lang: str, force_load=True, **kwargs): self.config.vocab_size = target_vocab_size # make sure that adapter weights are put in exactly the same precision and device placement and overwritten adapter weights - state_dict = {k: v.to(adapter_weights[k]) for k, v in state_dict.items()} + state_dict = {k: v.to(adapter_weights[k].dtype) for k, v in state_dict.items()} self.load_state_dict(state_dict, strict=False) # set target language corectly @@ -2218,58 +1356,7 @@ def load_adapter(self, target_lang: str, force_load=True, **kwargs): class Wav2Vec2Model(Wav2Vec2PreTrainedModel): - - """ - The `Wav2Vec2Model` class is a Python class that represents a Wav2Vec2 model for speech recognition. - It is a subclass of the `Wav2Vec2PreTrainedModel` class. - - Wav2Vec2Model inherits the following attributes and methods from the parent class: - - - `config`: An instance of the `Wav2Vec2Config` class, containing the configuration parameters for the model. - - `feature_extractor`: An instance of the `Wav2Vec2FeatureEncoder` class, responsible for extracting features - from the input waveform. - - `feature_projection`: An instance of the `Wav2Vec2FeatureProjection` class, responsible for projecting the - extracted features. - - `encoder`: An instance of the `Wav2Vec2Encoder` or `Wav2Vec2EncoderStableLayerNorm` class, responsible for - encoding the hidden states. - - `adapter`: An instance of the `Wav2Vec2Adapter` class, used to adapt the hidden states (optional). - - `post_init()`: A method called after the initialization of the model. - - The `Wav2Vec2Model` class also defines the following methods: - - - `freeze_feature_extractor`: Disables the gradient computation for the feature encoder, preventing its parameters - from being updated during training. - - `freeze_feature_encoder`: Disables the gradient computation for the feature encoder, preventing its parameters - from being updated during training. - - `_mask_hidden_states`: Masks extracted features along - the time axis and/or the feature axis according to SpecAugment. - - `forward`: Constructs the model by processing the input values and returns the model outputs. - - Please note that the `freeze_feature_extractor()` method is deprecated. - The equivalent `freeze_feature_encoder()` method should be used instead. - - For more information about the Wav2Vec2 model, please refer to the official paper [SpecAugment] - (https://arxiv.org/abs/1904.08779). - """ def __init__(self, config: Wav2Vec2Config): - """ - Initializes a new instance of the Wav2Vec2Model class. - - Args: - self: The instance of the Wav2Vec2Model class. - config (Wav2Vec2Config): An instance of the Wav2Vec2Config class containing the configuration parameters - for the model. - - Returns: - None. - - Raises: - TypeError: If the config parameter is not of type Wav2Vec2Config. 
- ValueError: If the config parameters mask_time_prob or mask_feature_prob are less than 0.0. - ValueError: If the config parameter do_stable_layer_norm is not a boolean value. - ValueError: If the config parameter hidden_size is not defined. - ValueError: If an error occurs during the initialization process. - """ super().__init__(config) self.config = config self.feature_extractor = Wav2Vec2FeatureEncoder(config) @@ -2277,7 +1364,7 @@ def __init__(self, config: Wav2Vec2Config): # model only needs masking vector if mask prob is > 0.0 if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0: - self.masked_spec_embed = Parameter(initializer(Uniform(), (config.hidden_size,), dtype=mindspore.float32)) + self.masked_spec_embed = nn.Parameter(ops.randn(config.hidden_size)) if config.do_stable_layer_norm: self.encoder = Wav2Vec2EncoderStableLayerNorm(config) @@ -2295,7 +1382,7 @@ def freeze_feature_extractor(self): not be updated during training. """ warnings.warn( - "The method `freeze_feature_extractor` is deprecated. " + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " "Please use the equivalent `freeze_feature_encoder` method instead.", FutureWarning, ) @@ -2310,14 +1397,15 @@ def freeze_feature_encoder(self): def _mask_hidden_states( self, - hidden_states: Tensor, - mask_time_indices: Optional[Tensor] = None, - attention_mask: Optional[Tensor] = None, + hidden_states: mindspore.Tensor, + mask_time_indices: Optional[mindspore.Tensor] = None, + attention_mask: Optional[mindspore.Tensor] = None, ): """ Masks extracted features along time axis and/or along feature axis according to [SpecAugment](https://arxiv.org/abs/1904.08779). """ + # `config.apply_spec_augment` can set masking to False if not getattr(self.config, "apply_spec_augment", True): return hidden_states @@ -2336,7 +1424,7 @@ def _mask_hidden_states( attention_mask=attention_mask, min_masks=self.config.mask_time_min_masks, ) - mask_time_indices = Tensor(mask_time_indices, dtype=mindspore.bool_) + mask_time_indices = mindspore.tensor(mask_time_indices, dtype=mindspore.bool_) hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) if self.config.mask_feature_prob > 0 and self.training: @@ -2347,43 +1435,21 @@ def _mask_hidden_states( mask_length=self.config.mask_feature_length, min_masks=self.config.mask_feature_min_masks, ) - mask_feature_indices = Tensor(mask_feature_indices, dtype=mindspore.bool_) - mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) + mask_feature_indices = mindspore.tensor(mask_feature_indices, dtype=mindspore.bool_) + mask_feature_indices = mask_feature_indices[:, None].broadcast_to((-1, sequence_length, -1)) hidden_states[mask_feature_indices] = 0 return hidden_states def forward( self, - input_values: Optional[Tensor], - attention_mask: Optional[Tensor] = None, - mask_time_indices: Optional[Tensor] = None, + input_values: Optional[mindspore.Tensor], + attention_mask: Optional[mindspore.Tensor] = None, + mask_time_indices: Optional[mindspore.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, Wav2Vec2BaseModelOutput]: - """ - Constructs the Wav2Vec2 model for processing input audio data. - - Args: - self (Wav2Vec2Model): The instance of the Wav2Vec2Model class. - input_values (Optional[Tensor]): The input audio data values with shape (batch_size, audio_length). 
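`_mask_hidden_states` above applies SpecAugment-style masking: sampled time spans are overwritten with the learned `masked_spec_embed` vector and sampled feature channels are zeroed across the whole sequence. A NumPy sketch of applying precomputed boolean masks (the span sampling itself lives in `_compute_mask_indices` and is omitted; all values are illustrative):

```python
import numpy as np

def apply_spec_augment(hidden_states, mask_time, mask_feature, masked_embed):
    """hidden_states: (batch, seq, hidden); mask_time: (batch, seq) bool;
    mask_feature: (batch, hidden) bool; masked_embed: (hidden,) learned fill vector."""
    out = hidden_states.copy()
    out[mask_time] = masked_embed                                        # overwrite masked time steps
    out[np.broadcast_to(mask_feature[:, None, :], out.shape)] = 0.0      # zero masked channels everywhere
    return out

states = np.ones((1, 5, 4))
mask_time = np.array([[False, True, True, False, False]])
mask_feature = np.array([[False, False, True, False]])
print(apply_spec_augment(states, mask_time, mask_feature, np.full(4, -1.0)))
```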
- attention_mask (Optional[Tensor]): The attention mask for the input audio data with shape - (batch_size, audio_length). - mask_time_indices (Optional[Tensor]): The mask for time indices with shape (batch_size, audio_length). - output_attentions (Optional[bool]): Whether to output attentions. Defaults to None. - output_hidden_states (Optional[bool]): Whether to output hidden states. Defaults to None. - return_dict (Optional[bool]): Whether to return a dictionary of output. Defaults to None. - - Returns: - Union[Tuple, Wav2Vec2BaseModelOutput]: The forwarded model output, which can be a tuple or a - Wav2Vec2BaseModelOutput object. - - Raises: - ValueError: If the input_values and attention_mask have mismatched shapes. - TypeError: If the input_values or attention_mask is not a Tensor. - RuntimeError: If the encoder fails to process the input audio data. - """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -2391,7 +1457,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict extract_features = self.feature_extractor(input_values) - extract_features = extract_features.swapaxes(1, 2) + extract_features = ops.transpose(extract_features, 1, 2) if attention_mask is not None: # compute reduced attention_mask corresponding to feature vectors @@ -2429,47 +1495,10 @@ def forward( class Wav2Vec2ForPreTraining(Wav2Vec2PreTrainedModel): - - """Wav2Vec2ForPreTraining - - This class represents a pre-training model for Wav2Vec2, which is used for pre-training the Wav2Vec2 model. - It includes methods for setting Gumbel softmax temperature, freezing the feature encoder, computing contrastive - logits, and forwarding the model for pre-training. - - Methods: - set_gumbel_temperature: Set the Gumbel softmax temperature to a given value. Only necessary for training. - freeze_feature_extractor: Disable gradient computation for the feature encoder to prevent parameter updates - during training. - freeze_feature_encoder: Disable gradient computation for the feature encoder to prevent parameter updates - during training. - compute_contrastive_logits: Compute logits for contrastive loss based on cosine similarity between features - and apply temperature. - forward: Construct the model for pre-training, including masking features for contrastive loss. - - Attributes: - wav2vec2: Wav2Vec2Model instance for the Wav2Vec2 model. - dropout_features: Dropout layer for feature vectors. - quantizer: Wav2Vec2GumbelVectorQuantizer instance for quantization. - project_hid: Dense layer for projecting hidden states. - project_q: Dense layer for projecting quantized features. - """ def __init__(self, config: Wav2Vec2Config): - """ - Initializes a new instance of the Wav2Vec2ForPreTraining class. - - Args: - self: The instance of the Wav2Vec2ForPreTraining class. - config (Wav2Vec2Config): The configuration object for the Wav2Vec2 model. - - Returns: - None. - - Raises: - None - """ super().__init__(config) self.wav2vec2 = Wav2Vec2Model(config) - self.dropout_features = nn.Dropout(p=config.feat_quantizer_dropout) + self.dropout_features = nn.Dropout(config.feat_quantizer_dropout) self.quantizer = Wav2Vec2GumbelVectorQuantizer(config) @@ -2491,7 +1520,7 @@ def freeze_feature_extractor(self): not be updated during training. """ warnings.warn( - "The method `freeze_feature_extractor` is deprecated. 
" + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " "Please use the equivalent `freeze_feature_encoder` method instead.", FutureWarning, ) @@ -2506,9 +1535,9 @@ def freeze_feature_encoder(self): @staticmethod def compute_contrastive_logits( - target_features: Tensor, - negative_features: Tensor, - predicted_features: Tensor, + target_features: mindspore.Tensor, + negative_features: mindspore.Tensor, + predicted_features: mindspore.Tensor, temperature: int = 0.1, ): """ @@ -2516,78 +1545,82 @@ def compute_contrastive_logits( `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied. """ target_features = ops.cat([target_features, negative_features], dim=0) - logits = ops.cosine_similarity(predicted_features.float(), target_features.float(), dim=-1).type_as(target_features) + + logits = nn.functional.cosine_similarity(predicted_features.float(), target_features.float(), dim=-1).type_as( + target_features + ) + # apply temperature logits = logits / temperature return logits def forward( self, - input_values: Optional[Tensor], - attention_mask: Optional[Tensor] = None, - mask_time_indices: Optional[Tensor] = None, - sampled_negative_indices: Optional[Tensor] = None, + input_values: Optional[mindspore.Tensor], + attention_mask: Optional[mindspore.Tensor] = None, + mask_time_indices: Optional[mindspore.Tensor] = None, + sampled_negative_indices: Optional[mindspore.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, Wav2Vec2ForPreTrainingOutput]: r""" - Args: - mask_time_indices (`Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict - masked extracted features in *config.proj_codevector_dim* space. - sampled_negative_indices (`Tensor` of shape `(batch_size, sequence_length, num_negatives)`, *optional*): - Indices indicating which quantized target vectors are used as negative sampled vectors in contrastive loss. - Required input for pre-training. + mask_time_indices (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict + masked extracted features in *config.proj_codevector_dim* space. + sampled_negative_indices (`mindspore.Tensor` of shape `(batch_size, sequence_length, num_negatives)`, *optional*): + Indices indicating which quantized target vectors are used as negative sampled vectors in contrastive loss. + Required input for pre-training. Returns: - Union[Tuple, Wav2Vec2ForPreTrainingOutput] Example: - ```python - >>> import torch - >>> from transformers import AutoFeatureExtractor, Wav2Vec2ForPreTraining - >>> from transformers.models.wav2vec2.modeling_wav2vec2 import _compute_mask_indices, _sample_negative_indices - >>> from datasets import load_dataset - ... - >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") - >>> model = Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-base") - ... - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - >>> input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values # Batch size 1 - ... 
- >>> # compute masked indices - >>> batch_size, raw_sequence_length = input_values.shape - >>> sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length).item() - >>> mask_time_indices = _compute_mask_indices( - ... shape=(batch_size, sequence_length), mask_prob=0.2, mask_length=2 - ... ) - >>> sampled_negative_indices = _sample_negative_indices( - ... features_shape=(batch_size, sequence_length), - ... num_negatives=model.config.num_negatives, - ... mask_time_indices=mask_time_indices, - ... ) - >>> mask_time_indices = Tensor(data=mask_time_indices, device=input_values.device, dtype=mindspore.int64) - >>> sampled_negative_indices = Tensor( - ... data=sampled_negative_indices, device=input_values.device, dtype=mindspore.int64 - ... ) - ... - >>> with ops.no_grad(): - ... outputs = model(input_values, mask_time_indices=mask_time_indices) - ... - >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states) - >>> cosine_sim = ops.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1) - ... - >>> # show that cosine similarity is much higher than random - >>> cosine_sim[mask_time_indices.to(mindspore.bool_)].mean() > 0.5 - tensor(True) - >>> # for contrastive loss training model should be put into train mode - >>> model = model.train() - >>> loss = model( - ... input_values, mask_time_indices=mask_time_indices, sampled_negative_indices=sampled_negative_indices - ... ).loss - ``` - """ + + ```python + >>> import torch + >>> from transformers import AutoFeatureExtractor, Wav2Vec2ForPreTraining + >>> from transformers.models.wav2vec2.modeling_wav2vec2 import _compute_mask_indices, _sample_negative_indices + >>> from datasets import load_dataset + + >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") + >>> model = Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-base") + + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + >>> input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values # Batch size 1 + + >>> # compute masked indices + >>> batch_size, raw_sequence_length = input_values.shape + >>> sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length).item() + >>> mask_time_indices = _compute_mask_indices( + ... shape=(batch_size, sequence_length), mask_prob=0.2, mask_length=2 + ... ) + >>> sampled_negative_indices = _sample_negative_indices( + ... features_shape=(batch_size, sequence_length), + ... num_negatives=model.config.num_negatives, + ... mask_time_indices=mask_time_indices, + ... ) + >>> mask_time_indices = mindspore.tensor(data=mask_time_indices, dtype=mindspore.int64) + >>> sampled_negative_indices = mindspore.tensor( + ... data=sampled_negative_indices, dtype=mindspore.int64 + ... ) + + >>> with no_grad(): + ... outputs = model(input_values, mask_time_indices=mask_time_indices) + + >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states) + >>> cosine_sim = ops.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1) + + >>> # show that cosine similarity is much higher than random + >>> cosine_sim[mask_time_indices.to(mindspore.bool_)].mean() > 0.5 + tensor(True) + + >>> # for contrastive loss training model should be put into train mode + >>> model = model.train() + >>> loss = model( + ... 
input_values, mask_time_indices=mask_time_indices, sampled_negative_indices=sampled_negative_indices + ... ).loss + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if mask_time_indices is not None: @@ -2617,6 +1650,8 @@ def forward( quantized_features, codevector_perplexity = self.quantizer( extract_features, mask_time_indices=mask_time_indices ) + + quantized_features = quantized_features.to(self.project_q.weight.dtype) quantized_features = self.project_q(quantized_features) loss = contrastive_loss = diversity_loss = None @@ -2648,16 +1683,14 @@ def forward( neg_is_pos = (quantized_features == negative_quantized_features).all(-1) if neg_is_pos.any(): - # NOTE: avoid loss NaN - # float("-inf") => finfo(logits.dtype, 'min') := -3.40282e+38 - logits[1:][neg_is_pos] = -3.40282e+35 + logits[1:][neg_is_pos] = float("-inf") # 6. compute contrastive loss \mathbf{L}_m = cross_entropy(logs) = # -log(exp(sim(c_t, q_t)/\kappa) / \sum_{\sim{q}} exp(sim(c_t, \sim{q})/\kappa)) - logits = logits.swapaxes(0, 2).reshape(-1, logits.shape[0]) - target = ((1 - mask_time_indices.long()) * -100).swapaxes(0, 1).flatten() + logits = ops.transpose(logits, 0, 2).reshape(-1, logits.shape[0]) + target = ops.transpose(((1 - mask_time_indices.long()) * -100), 0, 1).flatten() - contrastive_loss = F.cross_entropy(logits.float(), target, reduction="sum") + contrastive_loss = nn.functional.cross_entropy(logits.float(), target, reduction="sum") # 7. compute diversity loss: \mathbf{L}_d num_codevectors = self.config.num_codevectors_per_group * self.config.num_codevector_groups diversity_loss = ((num_codevectors - codevector_perplexity) / num_codevectors) * mask_time_indices.sum() @@ -2683,63 +1716,7 @@ def forward( class Wav2Vec2ForMaskedLM(Wav2Vec2PreTrainedModel): - - """ - This class represents a Wav2Vec2 model for Masked Language Modeling (MLM). - It is deprecated and should be replaced with `Wav2Vec2ForCTC`. - - The `Wav2Vec2ForMaskedLM` class inherits from the `Wav2Vec2PreTrainedModel` class. - - Attributes: - `wav2vec2`: The underlying Wav2Vec2Model. - `dropout`: A dropout layer for regularization. - `lm_head`: A dense layer for language modeling prediction. - - Methods: - `__init__`: Initializes a new instance of the `Wav2Vec2ForMaskedLM` class. - `forward`: Constructs the model for masked language modeling. - - Note: - This class is deprecated and should be replaced with `Wav2Vec2ForCTC`. - """ - def __init__(self, config: Wav2Vec2Config): - """ - Initializes an instance of the 'Wav2Vec2ForMaskedLM' class. - - Args: - self: The object instance. - config (Wav2Vec2Config): - The configuration object containing various hyperparameters for the model. - - - `config` should be an instance of the 'Wav2Vec2Config' class. - - This parameter is required. - - Returns: - None - - Raises: - FutureWarning: Raised if the class `Wav2Vec2ForMaskedLM` is used, as it is deprecated. - Recommends using `Wav2Vec2ForCTC` instead. - This warning is raised as a future version may not support the deprecated class. - - Description: - This method initializes an instance of the 'Wav2Vec2ForMaskedLM' class. It sets up the model architecture - and initializes the necessary components. The initialization process includes the following steps: - - 1. Calls the parent class '__init__' method using 'super()' to initialize the base class. - 2. Raises a 'FutureWarning' to notify users that the class `Wav2Vec2ForMaskedLM` is deprecated and - recommends using `Wav2Vec2ForCTC` instead. - 3. 
Initializes the 'wav2vec2' attribute as an instance of 'Wav2Vec2Model' using the provided 'config'. - 4. Initializes the 'dropout' attribute as an instance of 'nn.Dropout' with the dropout probability specified - in 'config'. - 5. Initializes the 'lm_head' attribute as an instance of 'nn.Linear' with the hidden size and vocabulary - size specified in 'config'. - 6. Calls the 'post_init' method to perform any additional post-initialization steps. - - Note: - The 'Wav2Vec2ForMaskedLM' class is deprecated and may not be supported in future versions. It is recommended - to use the 'Wav2Vec2ForCTC' class instead. - """ + def __init__(self, config): super().__init__(config) warnings.warn( @@ -2747,7 +1724,7 @@ def __init__(self, config: Wav2Vec2Config): ) self.wav2vec2 = Wav2Vec2Model(config) - self.dropout = nn.Dropout(p=config.final_dropout) + self.dropout = nn.Dropout(config.final_dropout) self.lm_head = nn.Linear(config.hidden_size, config.vocab_size) # Initialize weights and apply final processing @@ -2755,41 +1732,13 @@ def __init__(self, config: Wav2Vec2Config): def forward( self, - input_values: Tensor, - attention_mask: Optional[Tensor] = None, + input_values: mindspore.Tensor, + attention_mask: Optional[mindspore.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - labels: Optional[Tensor] = None, + labels: Optional[mindspore.Tensor] = None, ) -> Union[Tuple, MaskedLMOutput]: - """ - Args: - self (Wav2Vec2ForMaskedLM): The instance of the Wav2Vec2ForMaskedLM class. - input_values (Tensor): The input tensor representing the input audio features. Its shape is - (batch_size, sequence_length, feature_dim). - attention_mask (Optional[Tensor]): Optional tensor representing the attention mask for the input. - If provided, should have the shape (batch_size, sequence_length). - output_attentions (Optional[bool]): Optional flag to indicate whether to return attentions in the output. - Defaults to None. - output_hidden_states (Optional[bool]): Optional flag to indicate whether to return hidden states - in the output. Defaults to None. - return_dict (Optional[bool]): Optional flag to indicate whether to return the output as a dictionary. - If not provided, it defaults to the value specified in the configuration. - labels (Optional[Tensor]): Optional tensor representing the labels for the masked language modeling task. - Its shape is (batch_size, sequence_length). - - Returns: - Union[Tuple, MaskedLMOutput]: - The return value can be either a tuple or a MaskedLMOutput object. - - - If return_dict is False, it returns a tuple containing the logits and, optionally, the hidden states - and attentions. - - If return_dict is True, it returns a MaskedLMOutput object containing the logits, - hidden states, and attentions. - - Raises: - None - """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.wav2vec2( @@ -2811,50 +1760,11 @@ def forward( class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel): - - """ - This class represents a Wav2Vec2 model fine-tuned for Connectionist Temporal Classification (CTC) tasks. - It inherits from the Wav2Vec2PreTrainedModel, providing methods for initializing the model, tying weights, - freezing the feature extractor, feature encoder, and base model, as well as forwarding the model - for inference and training. 
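For reference, the pre-training objective defined earlier (`compute_contrastive_logits` plus the cross-entropy in `Wav2Vec2ForPreTraining.forward`) scores the true quantized vector and its sampled negatives by cosine similarity against the transformer output, scales by the temperature, and treats index 0, the positive, as the correct class. A NumPy restatement for a single masked time step (all values made up):

```python
import numpy as np

def contrastive_logits(target, negatives, predicted, temperature=0.1):
    """Cosine similarity between `predicted` and [target] + negatives, scaled by temperature."""
    candidates = np.vstack([target[None, :], negatives])        # positive first: shape (1 + K, dim)
    sims = candidates @ predicted / (
        np.linalg.norm(candidates, axis=-1) * np.linalg.norm(predicted) + 1e-8
    )
    return sims / temperature

target = np.array([1.0, 0.0])                                   # the true quantized vector
negatives = np.array([[0.0, 1.0], [-1.0, 0.0]])                 # two sampled distractors
predicted = np.array([0.9, 0.1])                                # transformer output at the masked step

logits = contrastive_logits(target, negatives, predicted)
loss = -logits[0] + np.log(np.exp(logits).sum())                # cross-entropy with class 0 as positive
print(logits, loss)
```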
- - The Wav2Vec2ForCTC class encapsulates the Wav2Vec2 model with additional methods for CTC-specific functionality, - such as handling labels for CTC, computing CTC loss, and processing input values for CTC tasks. - - The class provides methods for fine-tuning the Wav2Vec2 model for CTC tasks, including freezing specific components - of the model, as well as forwarding the model for CTC inference and training. - - Additionally, the class provides methods for tying weights and freezing specific components of the model to ensure - compatibility with adapter weights and to control parameter updates during training. - - This class is designed for fine-tuning the Wav2Vec2 model for CTC tasks, providing a comprehensive set of methods - for customizing the model's behavior and supporting CTC-specific functionality. - """ - def __init__(self, config: Wav2Vec2Config, target_lang: Optional[str] = None): - """ - Initializes a new instance of the Wav2Vec2ForCTC class. - - Args: - self: The object itself. - config (Wav2Vec2Config): The configuration for the Wav2Vec2Model. - target_lang (Optional[str], optional): The target language. Defaults to None. - - Returns: - None - - Raises: - ValueError: If the configuration does not define the vocabulary size of the language model head. - - Note: - The vocabulary size of the language model head must be defined either by instantiating the model - with `Wav2Vec2ForCTC.from_pretrained(..., vocab_size=vocab_size)` or by explicitly defining the - `vocab_size` in the model's configuration. - - """ + def __init__(self, config, target_lang: Optional[str] = None): super().__init__(config) self.wav2vec2 = Wav2Vec2Model(config) - self.dropout = nn.Dropout(p=config.final_dropout) + self.dropout = nn.Dropout(config.final_dropout) self.target_lang = target_lang @@ -2880,6 +1790,7 @@ def tie_weights(self): This method is **not** supposed to be called by the user and is prone to be changed in the future. """ + # Note that `tie_weights` is usually used to tie input and output embedding weights. The method is re-purposed to # correctly load adapter layers for Wav2Vec2 so that we do not have to introduce a new API to # [`PreTrainedModel`]. While slightly hacky, Wav2Vec2 never has to tie input and output embeddings, so that it is @@ -2899,7 +1810,7 @@ def freeze_feature_extractor(self): not be updated during training. """ warnings.warn( - "The method `freeze_feature_extractor` is deprecated. " + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " "Please use the equivalent `freeze_feature_encoder` method instead.", FutureWarning, ) @@ -2917,28 +1828,30 @@ def freeze_base_model(self): Calling this function will disable the gradient computation for the base model so that its parameters will not be updated during training. Only the classification head will be updated. 
""" - for _, param in self.wav2vec2.parameters_and_names(): + for param in self.wav2vec2.parameters(): param.requires_grad = False def forward( self, - input_values: Optional[Tensor], - attention_mask: Optional[Tensor] = None, + input_values: Optional[mindspore.Tensor], + attention_mask: Optional[mindspore.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - labels: Optional[Tensor] = None, + labels: Optional[mindspore.Tensor] = None, ) -> Union[Tuple, CausalLMOutput]: r""" - Args: - labels (`Tensor` of shape `(batch_size, target_length)`, *optional*): - Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to - the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. - All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., - config.vocab_size - 1]`. + labels (`mindspore.Tensor` of shape `(batch_size, target_length)`, *optional*): + Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to + the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. + All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., + config.vocab_size - 1]`. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None and labels.max() >= self.config.vocab_size: + raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") + outputs = self.wav2vec2( input_values, attention_mask=attention_mask, @@ -2954,10 +1867,6 @@ def forward( loss = None if labels is not None: - labels = labels.astype(mindspore.int32) - if labels.max() >= self.config.vocab_size: - raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") - # retrieve loss input_lengths from attention_mask attention_mask = ( attention_mask if attention_mask is not None else ops.ones_like(input_values, dtype=mindspore.int64) @@ -2971,11 +1880,11 @@ def forward( flattened_targets = labels.masked_select(labels_mask) # ctc_loss doesn't support fp16 - log_probs = F.log_softmax(logits, dim=-1).swapaxes(0, 1) + log_probs = ops.transpose(nn.functional.log_softmax(logits, dim=-1, dtype=mindspore.float32), 0, 1) - loss, log_alpha = F.ctc_loss( + loss = nn.functional.ctc_loss( log_probs, - labels, # flattened_targets + labels, input_lengths, target_lengths, blank=self.config.pad_token_id, @@ -2993,39 +1902,7 @@ def forward( class Wav2Vec2ForSequenceClassification(Wav2Vec2PreTrainedModel): - - """ - The `Wav2Vec2ForSequenceClassification` class represents a Wav2Vec2 model for sequence classification tasks. - It inherits from the `Wav2Vec2PreTrainedModel` class. This class provides methods for initializing the model, - freezing specific components, and computing the sequence classification output. It also includes methods for - handling the feature extractor, feature encoder, and base model. The class supports the forwardion of the sequence - classification output and provides options for setting various parameters such as attention masks, output attentions, - output hidden states, and labels. - - Deprecated methods such as `freeze_feature_extractor` and `freeze_base_model` are included along with their - corresponding replacements. 
The `forward` method computes the sequence classification/regression loss and handles - the classification output based on the input values, attention masks, and labels. The class allows for fine-tuning - the model for sequence classification tasks while providing flexibility in handling different components and - parameters. - - For detailed information about the class and its methods, refer to the individual method docstrings and the base - class `Wav2Vec2PreTrainedModel` for additional context and functionality. - """ - def __init__(self, config: Wav2Vec2Config): - """ - Initializes a new instance of the Wav2Vec2ForSequenceClassification class. - - Args: - self: The object itself. - config (Wav2Vec2Config): An instance of Wav2Vec2Config containing the configuration settings for the model. - - Returns: - None. - - Raises: - ValueError: Raised if the 'add_adapter' attribute is set to True in the config, as sequence classification - does not support the use of Wav2Vec2 adapters. - """ + def __init__(self, config): super().__init__(config) if hasattr(config, "add_adapter") and config.add_adapter: @@ -3035,7 +1912,7 @@ def __init__(self, config: Wav2Vec2Config): self.wav2vec2 = Wav2Vec2Model(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: - self.layer_weights = Parameter(ops.ones(num_layers) / num_layers) + self.layer_weights = nn.Parameter(ops.ones(num_layers) / num_layers) self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size) self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels) @@ -3048,7 +1925,7 @@ def freeze_feature_extractor(self): not be updated during training. """ warnings.warn( - "The method `freeze_feature_extractor` is deprecated. " + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " "Please use the equivalent `freeze_feature_encoder` method instead.", FutureWarning, ) @@ -3066,25 +1943,25 @@ def freeze_base_model(self): Calling this function will disable the gradient computation for the base model so that its parameters will not be updated during training. Only the classification head will be updated. """ - for _, param in self.wav2vec2.parameters_and_names(): + for param in self.wav2vec2.parameters(): param.requires_grad = False def forward( self, - input_values: Optional[Tensor], - attention_mask: Optional[Tensor] = None, + input_values: Optional[mindspore.Tensor], + attention_mask: Optional[mindspore.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - labels: Optional[Tensor] = None, + labels: Optional[mindspore.Tensor] = None, ) -> Union[Tuple, SequenceClassifierOutput]: r""" - Args: - labels (`Tensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states @@ -3099,25 +1976,25 @@ def forward( if self.config.use_weighted_layer_sum: hidden_states = outputs[_HIDDEN_STATES_START_POSITION] hidden_states = ops.stack(hidden_states, dim=1) - norm_weights = ops.softmax(self.layer_weights, dim=-1) - hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(axis=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) else: hidden_states = outputs[0] hidden_states = self.projector(hidden_states) if attention_mask is None: - pooled_output = hidden_states.mean(axis=1) + pooled_output = ops.mean(hidden_states, dim=1) else: padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask) hidden_states[~padding_mask] = 0.0 - pooled_output = hidden_states.sum(axis=1) / padding_mask.sum(axis=1).view(-1, 1) + pooled_output = ops.sum(hidden_states, dim=1) / ops.sum(padding_mask, dim=1).view(-1, 1) logits = self.classifier(pooled_output) loss = None if labels is not None: - labels = labels.astype(mindspore.int32) - loss = F.cross_entropy(logits.view(-1, self.config.num_labels), labels.view(-1)) + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) if not return_dict: output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] @@ -3130,53 +2007,8 @@ def forward( attentions=outputs.attentions, ) - class Wav2Vec2ForAudioFrameClassification(Wav2Vec2PreTrainedModel): - - """ - This class represents a Wav2Vec2 model for audio frame classification. It inherits from the Wav2Vec2PreTrainedModel - and includes methods for initializing the model, freezing the feature encoder and base model, as well as - forwarding the model for inference and training. - - Attributes: - wav2vec2 (Wav2Vec2Model): The Wav2Vec2Model used for audio frame classification. - classifier (nn.Linear): The classification head for the model. - num_labels (int): The number of labels for classification. - layer_weights (Parameter, optional): The weights for weighted layer sum if configured. - - Methods: - __init__: - Initializes the Wav2Vec2ForAudioFrameClassification model with the provided configuration. - - freeze_feature_encoder: - Disables the gradient computation for the feature encoder, preventing its parameters from being updated - during training. - - freeze_base_model: - Disables the gradient computation for the base model, preventing its parameters from being updated during - training while allowing the classification head to be updated. - - forward: - Constructs the model for inference and training, handling input values, attention masks, labels, and other - optional parameters. Returns TokenClassifierOutput containing loss, logits, hidden states, and attentions. - """ - def __init__(self, config: Wav2Vec2Config): - """ - Initializes a new instance of the Wav2Vec2ForAudioFrameClassification class. - - Args: - self: The instance of the class. - config (Wav2Vec2Config): The configuration object for the Wav2Vec2 model. - It specifies the parameters and settings for the model initialization. - Must be an instance of Wav2Vec2Config. - - Returns: - None. - - Raises: - ValueError: If the 'config' object has the attribute 'add_adapter' set to True, - which is not supported for audio frame classification with Wav2Vec2. 
- """ + def __init__(self, config): super().__init__(config) if hasattr(config, "add_adapter") and config.add_adapter: @@ -3186,12 +2018,24 @@ def __init__(self, config: Wav2Vec2Config): self.wav2vec2 = Wav2Vec2Model(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: - self.layer_weights = Parameter(ops.ones(num_layers) / num_layers) + self.layer_weights = nn.Parameter(ops.ones(num_layers) / num_layers) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.num_labels = config.num_labels self.init_weights() + def freeze_feature_extractor(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + def freeze_feature_encoder(self): """ Calling this function will disable the gradient computation for the feature encoder so that its parameter will @@ -3204,25 +2048,25 @@ def freeze_base_model(self): Calling this function will disable the gradient computation for the base model so that its parameters will not be updated during training. Only the classification head will be updated. """ - for _, param in self.wav2vec2.parameters_and_names(): + for param in self.wav2vec2.parameters(): param.requires_grad = False def forward( self, - input_values: Optional[Tensor], - attention_mask: Optional[Tensor] = None, - labels: Optional[Tensor] = None, + input_values: Optional[mindspore.Tensor], + attention_mask: Optional[mindspore.Tensor] = None, + labels: Optional[mindspore.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, TokenClassifierOutput]: r""" - Args: - labels (`Tensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states @@ -3237,8 +2081,8 @@ def forward( if self.config.use_weighted_layer_sum: hidden_states = outputs[_HIDDEN_STATES_START_POSITION] hidden_states = ops.stack(hidden_states, dim=1) - norm_weights = ops.softmax(self.layer_weights, dim=-1) - hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(axis=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) else: hidden_states = outputs[0] @@ -3246,8 +2090,8 @@ def forward( loss = None if labels is not None: - labels = labels.astype(mindspore.int32) - loss = F.cross_entropy(logits.view(-1, self.num_labels), ops.argmax(labels.view(-1, self.num_labels), dim=1)) + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), ops.argmax(labels.view(-1, self.num_labels), dim=1)) if not return_dict: output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] @@ -3262,120 +2106,30 @@ def forward( class AMSoftmaxLoss(nn.Module): - - """ - The AMSoftmaxLoss class represents a neural network cell for computing the AM-Softmax loss. This class inherits - from nn.Module and provides methods for initializing the loss function and forwarding the computation graph. - - Attributes: - scale (float): The scale parameter for the AM-Softmax loss function. - margin (float): The margin parameter for the AM-Softmax loss function. - num_labels (int): The number of unique labels in the dataset. - weight (Parameter): The weight parameter for the neural network. - - Methods: - __init__: Initializes the AMSoftmaxLoss instance with input dimension, number of labels, scale, and margin. - - forward: Constructs the computation graph for the AM-Softmax loss function using the given - hidden states and labels. - - Note: - The AMSoftmaxLoss class is designed for use in neural network training and optimization tasks. - """ def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4): - """ - __init__ - - Initializes an instance of the AMSoftmaxLoss class. - - Args: - self (object): The instance of the class. - input_dim (int): The dimension of the input features. - num_labels (int): The number of unique labels for classification. - scale (float, optional): The scale factor for the angular margin. Defaults to 30.0. - margin (float, optional): The angular margin value. Defaults to 0.4. - - Returns: - None. - - Raises: - ValueError: If input_dim or num_labels are not positive integers. - TypeError: If scale or margin are not of type float. - """ - super().__init__() + super(AMSoftmaxLoss, self).__init__() self.scale = scale self.margin = margin self.num_labels = num_labels - self.weight = Parameter(ops.randn(input_dim, num_labels), requires_grad=True) + self.weight = nn.Parameter(ops.randn(input_dim, num_labels), requires_grad=True) + self.loss = nn.CrossEntropyLoss() def forward(self, hidden_states, labels): - """ - This method forwards an AMSoftmax loss function. - - Args: - self (object): The instance of the AMSoftmaxLoss class. - hidden_states (tensor): A tensor representing the hidden states of the model. - labels (tensor): A tensor containing the ground truth labels for the corresponding hidden states. - It is expected that the labels are flattened for processing. - - Returns: - None. 
- - Raises: - ValueError: If the dimensions of the weight tensor and hidden_states tensor are not compatible - for matrix multiplication. - RuntimeError: If there is an issue with the normalization operation on the weight or hidden_states tensor. - ValueError: If the labels tensor does not match the expected shape for one-hot encoding. - RuntimeError: If there is a problem with the cross-entropy calculation. - """ labels = labels.flatten() - weight = self.weight / ops.norm(self.weight, dim=0, keepdim=True) - hidden_states = hidden_states / ops.norm(hidden_states, dim=1, keepdim=True) + weight = nn.functional.normalize(self.weight, dim=0) + hidden_states = nn.functional.normalize(hidden_states, dim=1) cos_theta = ops.mm(hidden_states, weight) psi = cos_theta - self.margin - onehot = ops.one_hot(labels, self.num_labels) + onehot = nn.functional.one_hot(labels, self.num_labels) logits = self.scale * ops.where(onehot.bool(), psi, cos_theta) - loss = F.cross_entropy(logits, labels) + loss = self.loss(logits, labels) + return loss class TDNNLayer(nn.Module): - - """TDNNLayer represents a time-delay neural network (TDNN) layer for processing sequential data. - It inherits from nn.Module and is initialized with a Wav2Vec2Config and an optional layer_id. - - Attributes: - config (Wav2Vec2Config): The configuration for the Wav2Vec2 model. - layer_id (int): The index of the TDNN layer. - - Methods: - forward(hidden_states): Applies the TDNN layer operations to the input hidden_states. - - The TDNNLayer class applies a convolutional layer with specified kernel size and dilation to the input data. - It then applies a ReLU activation function to the output. - - Note: - This class is part of the Wav2Vec2 model architecture. - - """ - def __init__(self, config: Wav2Vec2Config, layer_id=0): - """ - Initializes a TDNNLayer object. - - Args: - self: The instance of the TDNNLayer class. - config (Wav2Vec2Config): An instance of Wav2Vec2Config that holds configuration parameters for the layer. - layer_id (int): An integer representing the ID of the layer. Default is 0. Must be within the range of - available layers in the configuration. - - Returns: - None. - - Raises: - TypeError: If the config parameter is not of type Wav2Vec2Config. - ValueError: If the layer_id is outside the valid range of available layers in the configuration. - """ + def __init__(self, config, layer_id=0): super().__init__() self.in_conv_dim = config.tdnn_dim[layer_id - 1] if layer_id > 0 else config.tdnn_dim[layer_id] self.out_conv_dim = config.tdnn_dim[layer_id] @@ -3385,76 +2139,32 @@ def __init__(self, config: Wav2Vec2Config, layer_id=0): self.kernel = nn.Linear(self.in_conv_dim * self.kernel_size, self.out_conv_dim) self.activation = nn.ReLU() - def forward(self, hidden_states): - ''' - Constructs the TDNN layer with the input hidden_states. - - Args: - self (TDNNLayer): The instance of the TDNNLayer class. - hidden_states (Tensor): The input hidden states to be processed by the TDNN layer. - It should be a tensor of shape (batch_size, in_channels, sequence_length). + def forward(self, hidden_states: mindspore.Tensor) -> mindspore.Tensor: + from ....peft.tuners.lora import LoraLayer + if isinstance(self.kernel, LoraLayer): + warnings.warn( + "Detected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. " + "You should exclude TDNNLayer from LoRA's target modules.", + ) - Returns: - hidden_states (Tensor): The processed hidden states after applying the TDNN layer operations. 
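AMSoftmaxLoss above is additive-margin softmax: embeddings and class weights are L2-normalized, the margin is subtracted from the target-class cosine only, and the scaled logits go through ordinary cross-entropy. A NumPy sketch of that computation (dimensions and values are illustrative only):

import numpy as np

rng = np.random.default_rng(0)
batch, input_dim, num_labels = 4, 16, 3
scale, margin = 30.0, 0.4
hidden_states = rng.standard_normal((batch, input_dim))
weight = rng.standard_normal((input_dim, num_labels))
labels = np.array([0, 2, 1, 2])

# L2-normalize embeddings (per row) and class weights (per column)
hidden_states = hidden_states / np.linalg.norm(hidden_states, axis=1, keepdims=True)
weight = weight / np.linalg.norm(weight, axis=0, keepdims=True)

cos_theta = hidden_states @ weight       # cosine similarity to every class
psi = cos_theta - margin                 # margin applied to the target class only
onehot = np.eye(num_labels, dtype=bool)[labels]
logits = scale * np.where(onehot, psi, cos_theta)

# ordinary cross-entropy on the margin-adjusted, scaled logits
log_probs = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
loss = -log_probs[np.arange(batch), labels].mean()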
- It will be a tensor of shape (batch_size, out_channels, new_length), where out_channels is the number - of output channels and new_length is the length of the output sequence. - - Raises: - TypeError: If the input hidden_states is not a tensor. - ValueError: If the input hidden_states does not have the expected shape or dimensions. - ''' - hidden_states = hidden_states.unsqueeze(1) - hidden_states = F.unfold( - hidden_states, - (self.kernel_size, self.in_conv_dim), - stride=(1, self.in_conv_dim), - dilation=(self.dilation, 1), - ) - hidden_states = hidden_states.swapaxes(1, 2) - hidden_states = self.kernel(hidden_states) + # for backward compatibility, we keep nn.Linear but call F.conv1d for speed up + hidden_states = ops.transpose(hidden_states, 1, 2) + weight = ops.transpose(self.kernel.weight.view(self.out_conv_dim, self.kernel_size, self.in_conv_dim), 1, 2) + hidden_states = nn.functional.conv1d(hidden_states, weight, self.kernel.bias, dilation=self.dilation) + hidden_states = ops.transpose(hidden_states, 1, 2) hidden_states = self.activation(hidden_states) return hidden_states class Wav2Vec2ForXVector(Wav2Vec2PreTrainedModel): - - """ - This class represents a Wav2Vec2 model for extracting x-vector embeddings from audio data. It inherits from the - Wav2Vec2PreTrainedModel class, and provides methods for freezing specific model components and computing x-vector - embeddings from input audio data. - - The class contains methods for freezing the feature extractor, freezing the feature encoder, and freezing the base - model to disable gradient computation for specific model components. Additionally, it includes methods for computing - the output length of the TDNN layers and for forwarding x-vector embeddings from input audio data. - - The forward method takes input audio data and optional parameters such as attention mask and labels, and returns - x-vector embeddings along with optional loss and hidden states. The method also supports outputting hidden states - and attentions based on the configuration settings. - - This class is designed to be used for x-vector extraction tasks and provides flexibility for customizing the model's - behavior and freezing specific components during training. - """ - def __init__(self, config: Wav2Vec2Config): - """ - Initializes an instance of the Wav2Vec2ForXVector class. - - Args: - self: The instance of the Wav2Vec2ForXVector class. - config (Wav2Vec2Config): An object of type Wav2Vec2Config containing configuration settings for the model. - - Returns: - None. - - Raises: - None. - """ + def __init__(self, config): super().__init__(config) self.wav2vec2 = Wav2Vec2Model(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: - self.layer_weights = Parameter(ops.ones(num_layers) / num_layers) + self.layer_weights = nn.Parameter(ops.ones(num_layers) / num_layers) self.projector = nn.Linear(config.hidden_size, config.tdnn_dim[0]) tdnn_layers = [TDNNLayer(config, i) for i in range(len(config.tdnn_dim))] @@ -3473,7 +2183,7 @@ def freeze_feature_extractor(self): not be updated during training. """ warnings.warn( - "The method `freeze_feature_extractor` is deprecated. " + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. 
" "Please use the equivalent `freeze_feature_encoder` method instead.", FutureWarning, ) @@ -3491,16 +2201,16 @@ def freeze_base_model(self): Calling this function will disable the gradient computation for the base model so that its parameters will not be updated during training. Only the classification head will be updated. """ - for named, param in self.wav2vec2.parameters_and_names(): + for param in self.wav2vec2.parameters(): param.requires_grad = False - def _get_tdnn_output_lengths(self, input_lengths: Union[Tensor, int]): + def _get_tdnn_output_lengths(self, input_lengths: Union[mindspore.Tensor, int]): """ Computes the output length of the TDNN layers """ + def _conv_out_length(input_length, kernel_size, stride): # 1D convolutional layer output length formula taken - # from https://pyops.org/docs/stable/generated/ops.nn.Conv1d.html return (input_length - kernel_size) // stride + 1 for kernel_size in self.config.tdnn_kernel: @@ -3510,20 +2220,20 @@ def _conv_out_length(input_length, kernel_size, stride): def forward( self, - input_values: Optional[Tensor], - attention_mask: Optional[Tensor] = None, + input_values: Optional[mindspore.Tensor], + attention_mask: Optional[mindspore.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - labels: Optional[Tensor] = None, + labels: Optional[mindspore.Tensor] = None, ) -> Union[Tuple, XVectorOutput]: r""" - Args: - labels (`Tensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states @@ -3538,8 +2248,8 @@ def forward( if self.config.use_weighted_layer_sum: hidden_states = outputs[_HIDDEN_STATES_START_POSITION] hidden_states = ops.stack(hidden_states, dim=1) - norm_weights = ops.softmax(self.layer_weights, dim=-1) - hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(axis=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) else: hidden_states = outputs[0] @@ -3550,17 +2260,16 @@ def forward( # Statistic Pooling if attention_mask is None: - mean_features = hidden_states.mean(axis=1) - #std_features = hidden_states.std(axis=1) # NOTE: buggy API - std_features = ops.std(hidden_states, dim=1, keepdim=True).squeeze(1) + mean_features = ops.mean(hidden_states, dim=1) + std_features = ops.std(hidden_states, dim=1) else: - feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(axis=1)) + feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(dim=1)) tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths) mean_features = [] std_features = [] for i, length in enumerate(tdnn_output_lengths): - mean_features.append(hidden_states[i, :length].mean(axis=0)) - std_features.append(hidden_states[i, :length].std(axis=0)) + mean_features.append(ops.mean(hidden_states[i, :length], dim=0)) + std_features.append(ops.std(hidden_states[i, :length], dim=0)) mean_features = ops.stack(mean_features) std_features = ops.stack(std_features) statistic_pooling = ops.cat([mean_features, std_features], dim=-1) @@ -3570,7 +2279,6 @@ def forward( loss = None if labels is not None: - labels = labels.astype(mindspore.int32) loss = self.objective(logits, labels) if not return_dict: @@ -3584,3 +2292,14 @@ def forward( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + +__all__ = [ + "Wav2Vec2ForAudioFrameClassification", + "Wav2Vec2ForCTC", + "Wav2Vec2ForMaskedLM", + "Wav2Vec2ForPreTraining", + "Wav2Vec2ForSequenceClassification", + "Wav2Vec2ForXVector", + "Wav2Vec2Model", + "Wav2Vec2PreTrainedModel", +] diff --git a/mindnlp/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py b/mindnlp/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py index fca66596d..eef59c342 100644 --- a/mindnlp/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +++ b/mindnlp/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py @@ -15,15 +15,12 @@ """MindSpore Wav2Vec2-BERT model.""" import math +import warnings from typing import Optional, Tuple, Union import numpy as np import mindspore -from mindspore import Parameter -from mindspore.common.initializer import initializer, Normal, Uniform, HeNormal, XavierUniform - from mindnlp.core import nn, ops -from mindnlp.core.nn import functional as F from mindnlp.core.nn import CrossEntropyLoss from ...activations import ACT2FN @@ -48,6 +45,18 @@ _HIDDEN_STATES_START_POSITION = 2 +# General docstring +_CONFIG_FOR_DOC = "Wav2Vec2BertConfig" + +# Base docstring +_BASE_CHECKPOINT_FOR_DOC = "facebook/w2v-bert-2.0" +_PRETRAINED_CHECKPOINT_FOR_DOC = "hf-audio/wav2vec2-bert-CV16-en" +_EXPECTED_OUTPUT_SHAPE = [1, 146, 1024] + +# CTC docstring +_CTC_EXPECTED_OUTPUT = "'mr quilter is the apostle of the middle classes and we are glad to welcome his gospel'" +_CTC_EXPECTED_LOSS = 
17.04 + # Copied from transformers.models.seamless_m4t_v2.modeling_seamless_m4t_v2._compute_new_attention_mask def _compute_new_attention_mask(hidden_states: mindspore.Tensor, seq_lens: mindspore.Tensor): @@ -64,9 +73,9 @@ def _compute_new_attention_mask(hidden_states: mindspore.Tensor, seq_lens: minds """ batch_size, mask_seq_len = hidden_states.shape[:2] - indices = ops.arange(mask_seq_len).expand(batch_size, -1) + indices = ops.arange(mask_seq_len).broadcast_to((batch_size, -1)) - bool_mask = indices >= seq_lens.unsqueeze(1).expand(-1, mask_seq_len) + bool_mask = indices >= seq_lens.unsqueeze(1).broadcast_to((-1, mask_seq_len)) mask = hidden_states.new_ones((batch_size, mask_seq_len)) @@ -213,7 +222,6 @@ def _sample_negative_indices( mask_time_indices = ( mask_time_indices.astype(bool) if mask_time_indices is not None else np.ones(features_shape, dtype=bool) ) - mask_time_indices = mask_time_indices.numpy() if isinstance(mask_time_indices, mindspore.Tensor) else mask_time_indices for batch_idx in range(batch_size): high = mask_time_indices[batch_idx].sum() - 1 @@ -246,7 +254,7 @@ def __init__(self, config): inv_freq = 1.0 / (base ** (ops.arange(0, dim, 2, dtype=mindspore.int64).float() / dim)) # Ignore copy - self.inv_freq = inv_freq + self.register_buffer("inv_freq", inv_freq, persistent=False) self.cached_sequence_length = None self.cached_rotary_positional_embedding = None @@ -278,7 +286,7 @@ def __init__(self, config): self.max_len = config.max_source_positions self.d_model = config.hidden_size self.pe = None - self.extend_pe(mindspore.tensor(0.0).expand(1, self.max_len)) + self.extend_pe(mindspore.tensor(0.0).broadcast_to((1, self.max_len))) def extend_pe(self, x): # Reset the positional encodings @@ -325,7 +333,7 @@ def __init__(self, config): super().__init__() self.layer_norm = nn.LayerNorm(config.feature_projection_input_dim, eps=config.layer_norm_eps) self.projection = nn.Linear(config.feature_projection_input_dim, config.hidden_size) - self.dropout = nn.Dropout(p=config.feat_proj_dropout) + self.dropout = nn.Dropout(config.feat_proj_dropout) def forward(self, hidden_states): # non-projected hidden states are needed for quantization @@ -340,13 +348,13 @@ def __init__(self, config, act_fn=None, hidden_size=None): super().__init__() act_fn = act_fn if act_fn is not None else config.hidden_act hidden_size = hidden_size if hidden_size is not None else config.hidden_size - self.intermediate_dropout = nn.Dropout(p=config.activation_dropout) + self.intermediate_dropout = nn.Dropout(config.activation_dropout) self.intermediate_dense = nn.Linear(hidden_size, config.intermediate_size) self.intermediate_act_fn = ACT2FN[act_fn] if isinstance(act_fn, str) else act_fn self.output_dense = nn.Linear(config.intermediate_size, hidden_size) - self.output_dropout = nn.Dropout(p=config.hidden_dropout) + self.output_dropout = nn.Dropout(config.hidden_dropout) # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeedForward.forward def forward(self, hidden_states): @@ -396,7 +404,7 @@ def __init__(self, config): padding=0, bias=False, ) - self.dropout = nn.Dropout(p=config.conformer_conv_dropout) + self.dropout = nn.Dropout(config.conformer_conv_dropout) def forward(self, hidden_states, attention_mask=None): hidden_states = self.layer_norm(hidden_states) @@ -416,7 +424,7 @@ def forward(self, hidden_states, attention_mask=None): hidden_states = self.glu(hidden_states) # Pad the sequence entirely on the left because of causal convolution. 
- hidden_states = ops.pad(hidden_states, (self.depthwise_conv.kernel_size[0] - 1, 0)) + hidden_states = nn.functional.pad(hidden_states, (self.depthwise_conv.kernel_size[0] - 1, 0)) # 1D Depthwise Conv hidden_states = self.depthwise_conv(hidden_states) @@ -456,8 +464,8 @@ def __init__(self, config, is_adapter_attention=False): self.linear_pos = nn.Linear(hidden_size, hidden_size, bias=False) # these two learnable bias are used in matrix c and matrix d # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = Parameter(ops.zeros(self.num_heads, self.head_size)) - self.pos_bias_v = Parameter(ops.zeros(self.num_heads, self.head_size)) + self.pos_bias_u = nn.Parameter(ops.zeros(self.num_heads, self.head_size)) + self.pos_bias_v = nn.Parameter(ops.zeros(self.num_heads, self.head_size)) if self.position_embeddings_type == "relative_key": self.left_max_position_embeddings = config.left_max_position_embeddings @@ -616,7 +624,7 @@ def __init__(self, config): # Self-Attention self.self_attn_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) - self.self_attn_dropout = nn.Dropout(p=dropout) + self.self_attn_dropout = nn.Dropout(dropout) self.self_attn = Wav2Vec2BertSelfAttention(config) # Conformer Convolution @@ -680,8 +688,9 @@ def __init__(self, config): else: self.embed_positions = None - self.dropout = nn.Dropout(p=config.hidden_dropout) + self.dropout = nn.Dropout(config.hidden_dropout) self.layers = nn.ModuleList([Wav2Vec2BertEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False def forward( self, @@ -702,8 +711,8 @@ def forward( # extend attention_mask attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype) attention_mask = attention_mask * float(ops.finfo(hidden_states.dtype).min) - attention_mask = attention_mask.expand( - attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] + attention_mask = attention_mask.broadcast_to( + (attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]) ) hidden_states = self.dropout(hidden_states) @@ -722,13 +731,24 @@ def forward( skip_the_layer = self.training and (dropout_probability < self.config.layerdrop) if not skip_the_layer: - layer_outputs = layer( - hidden_states, - attention_mask=attention_mask, - relative_position_embeddings=relative_position_embeddings, - output_attentions=output_attentions, - conv_attention_mask=conv_attention_mask, - ) + # under deepspeed zero3 all gpus must run in sync + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer.__call__, + hidden_states, + attention_mask, + relative_position_embeddings, + output_attentions, + conv_attention_mask, + ) + else: + layer_outputs = layer( + hidden_states, + attention_mask=attention_mask, + relative_position_embeddings=relative_position_embeddings, + output_attentions=output_attentions, + conv_attention_mask=conv_attention_mask, + ) hidden_states = layer_outputs[0] if skip_the_layer: @@ -758,7 +778,7 @@ def __init__(self, config): self.proj_layer_norm = nn.LayerNorm(config.output_hidden_size, eps=config.layer_norm_eps) else: self.proj = self.proj_layer_norm = None - self.layers = nn.ModuleList([Wav2Vec2BertAdapterLayer(config) for _ in range(config.num_adapter_layers)]) + self.layers = nn.ModuleList(Wav2Vec2BertAdapterLayer(config) for _ in range(config.num_adapter_layers)) self.layerdrop = config.layerdrop self.kernel_size = config.adapter_kernel_size @@ -769,7 +789,7 @@ 
def _compute_sub_sample_lengths_from_attention_mask(self, seq_lens): return seq_lens pad = self.kernel_size // 2 seq_lens = ((seq_lens + 2 * pad - self.kernel_size) / self.stride) + 1 - return seq_lens #.floor() + return seq_lens.floor() def forward(self, hidden_states, attention_mask=None): # down project hidden_states if necessary @@ -822,7 +842,7 @@ def __init__(self, config): padding=self.stride // 2, ) self.self_attn = Wav2Vec2BertSelfAttention(config, is_adapter_attention=True) - self.self_attn_dropout = nn.Dropout(p=dropout) + self.self_attn_dropout = nn.Dropout(dropout) # Feed-forward self.ffn_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) @@ -890,34 +910,34 @@ class Wav2Vec2BertPreTrainedModel(PreTrainedModel): config_class = Wav2Vec2BertConfig base_model_prefix = "wav2vec2_bert" main_input_name = "input_features" - supports_gradient_checkpointing = False + supports_gradient_checkpointing = True # Ignore copy def _init_weights(self, module): """Initialize the weights""" if isinstance(module, Wav2Vec2BertSelfAttention): if hasattr(module, "pos_bias_u"): - module.pos_bias_u.set_data(initializer(XavierUniform(), module.pos_bias_u.shape, module.pos_bias_u.dtype)) + nn.init.xavier_uniform_(module.pos_bias_u) if hasattr(module, "pos_bias_v"): - module.pos_bias_v.set_data(initializer(XavierUniform(), module.pos_bias_v.shape, module.pos_bias_v.dtype)) + nn.init.xavier_uniform_(module.pos_bias_v) elif isinstance(module, Wav2Vec2BertFeatureProjection): - k = math.sqrt(1 / module.projection.in_channels) - module.projection.weight.set_data(initializer(Uniform(k), module.projection.weight.shape, module.projection.weight.dtype)) - module.projection.bias.set_data(initializer(Uniform(k), module.projection.bias.shape, module.projection.bias.dtype)) + k = math.sqrt(1 / module.projection.in_features) + nn.init.uniform_(module.projection.weight, a=-k, b=k) + nn.init.uniform_(module.projection.bias, a=-k, b=k) elif isinstance(module, nn.Linear): - module.weight.set_data(initializer(Normal(self.config.initializer_range), module.weight.shape, module.weight.dtype)) + nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range) if module.bias is not None: - module.bias.set_data(initializer('zeros', module.bias.shape, module.bias.dtype)) + nn.init.zeros_(module.bias) elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)): - module.bias.set_data(initializer('zeros', module.bias.shape, module.bias.dtype)) - module.weight.set_data(initializer('ones', module.weight.shape, module.weight.dtype)) + nn.init.zeros_(module.bias) + nn.init.ones_(module.weight) elif isinstance(module, nn.Conv1d): - module.weight.set_data(initializer(HeNormal(), module.weight.shape, module.weight.dtype)) + nn.init.kaiming_normal_(module.weight) if module.bias is not None: - k = math.sqrt(module.group / (module.in_channels * module.kernel_size[0])) - module.bias.set_data(initializer(Uniform(k), module.bias.shape, module.bias.dtype)) + k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0])) + nn.init.uniform_(module.bias, a=-k, b=k) # Ignore copy def _get_feat_extract_output_lengths( @@ -931,7 +951,6 @@ def _get_feat_extract_output_lengths( def _conv_out_length(input_length, kernel_size, stride, padding): # 1D convolutional layer output length formula taken - # from https://pyops.org/docs/stable/generated/ops.nn.Conv1d.html return ops.div(input_length + 2 * padding - kernel_size, stride, rounding_mode="floor") + 1 if add_adapter: @@ -948,7 +967,7 @@ def 
_get_feature_vector_attention_mask( ): # Effectively attention_mask.sum(-1), but not inplace to be able to run # on inference mode. - non_padded_lengths = attention_mask.cumsum(axis=-1)[:, -1] + non_padded_lengths = ops.cumsum(attention_mask, dim=-1)[:, -1] output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter) output_lengths = output_lengths.to(mindspore.int64) @@ -960,54 +979,10 @@ def _get_feature_vector_attention_mask( ) # these two operations makes sure that all values before the output lengths idxs are attended to attention_mask[(ops.arange(attention_mask.shape[0]), output_lengths - 1)] = 1 - attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool() + attention_mask = attention_mask.flip([-1]).int().cumsum(-1).flip([-1]).bool() return attention_mask -WAV2VEC2_BERT_START_DOCSTRING = r""" - Wav2Vec2Bert was proposed in [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech - Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael - Auli. - - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving etc.). - - This model is a MindSpore [nn.Module](https://pyops.org/docs/stable/nn.html#nn.Module) sub-class. Use it as a - regular MindSpore Module and refer to the MindSpore documentation for all matter related to general usage and behavior. - - Parameters: - config ([`Wav2Vec2BertConfig`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -WAV2VEC2_BERT_INPUTS_DOCSTRING = r""" - Args: - input_features (`mindspore.Tensor` of shape `(batch_size, sequence_length)`): - Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_features`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `mindspore.Tensor`. See [`Wav2Vec2BertProcessor.__call__`] for details. - attention_mask (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0, - 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
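The _get_feature_vector_attention_mask helper above rebuilds a frame-level attention mask from per-example output lengths: it marks the last valid frame of each row, then runs a reversed cumulative sum so every earlier position becomes non-zero. A NumPy sketch of that trick with made-up lengths:

import numpy as np

batch, feature_len = 2, 8
output_lengths = np.array([5, 3])

attention_mask = np.zeros((batch, feature_len), dtype=np.int64)
# mark the last valid frame of each example ...
attention_mask[np.arange(batch), output_lengths - 1] = 1
# ... then flip, cumulative-sum, flip back: all positions before the mark are attended to
attention_mask = np.flip(np.cumsum(np.flip(attention_mask, axis=-1), axis=-1), axis=-1).astype(bool)
print(attention_mask.astype(int))
# [[1 1 1 1 1 0 0 0]
#  [1 1 1 0 0 0 0 0]]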
-""" - - class Wav2Vec2BertModel(Wav2Vec2BertPreTrainedModel): def __init__(self, config: Wav2Vec2BertConfig): super().__init__(config) @@ -1016,8 +991,7 @@ def __init__(self, config: Wav2Vec2BertConfig): # model only needs masking vector if mask prob is > 0.0 if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0: - tensor = ops.zeros([config.hidden_size]) - self.masked_spec_embed = Parameter(initializer(Uniform(), tensor.shape, tensor.dtype)) + self.masked_spec_embed = nn.Parameter(ops.randn(config.hidden_size)) self.encoder = Wav2Vec2BertEncoder(config) @@ -1072,7 +1046,7 @@ def _mask_hidden_states( min_masks=self.config.mask_feature_min_masks, ) mask_feature_indices = mindspore.tensor(mask_feature_indices, dtype=mindspore.bool_) - mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) + mask_feature_indices = mask_feature_indices[:, None].broadcast_to((-1, sequence_length, -1)) hidden_states[mask_feature_indices] = 0 return hidden_states @@ -1126,11 +1100,12 @@ def forward( class Wav2Vec2BertForCTC(Wav2Vec2BertPreTrainedModel): + # Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerForCTC.__init__ with Wav2Vec2Conformer->Wav2Vec2Bert,WAV2VEC2_CONFORMER->WAV2VEC2_BERT,wav2vec2_conformer->wav2vec2_bert def __init__(self, config, target_lang: Optional[str] = None): super().__init__(config) self.wav2vec2_bert = Wav2Vec2BertModel(config) - self.dropout = nn.Dropout(p=config.final_dropout) + self.dropout = nn.Dropout(config.final_dropout) self.target_lang = target_lang @@ -1185,8 +1160,6 @@ def forward( loss = None if labels is not None: - labels = labels.astype(mindspore.int32) - # retrieve loss input_lengths from attention_mask attention_mask = ( attention_mask @@ -1202,11 +1175,11 @@ def forward( flattened_targets = labels.masked_select(labels_mask) # ctc_loss doesn't support fp16 - log_probs = F.log_softmax(logits, dim=-1).swapaxes(0, 1) + log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=mindspore.float32).swapaxes(0, 1) - loss = F.ctc_loss( + loss = nn.functional.ctc_loss( log_probs, - labels, # flattened_targets + labels, input_lengths, target_lengths, blank=self.config.pad_token_id, @@ -1222,7 +1195,6 @@ def forward( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions ) - class Wav2Vec2BertForSequenceClassification(Wav2Vec2BertPreTrainedModel): # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.__init__ with Wav2Vec2->Wav2Vec2Bert,wav2vec2->wav2vec2_bert def __init__(self, config): @@ -1235,7 +1207,7 @@ def __init__(self, config): self.wav2vec2_bert = Wav2Vec2BertModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: - self.layer_weights = Parameter(ops.ones(num_layers) / num_layers) + self.layer_weights = nn.Parameter(ops.ones(num_layers) / num_layers) self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size) self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels) @@ -1281,24 +1253,23 @@ def forward( if self.config.use_weighted_layer_sum: hidden_states = outputs[_HIDDEN_STATES_START_POSITION] hidden_states = ops.stack(hidden_states, dim=1) - norm_weights = ops.softmax(self.layer_weights, dim=-1) - hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(axis=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = ops.sum((hidden_states * 
norm_weights.view(-1, 1, 1)), dim=1) else: hidden_states = outputs[0] hidden_states = self.projector(hidden_states) if attention_mask is None: - pooled_output = hidden_states.mean(axis=1) + pooled_output = ops.mean(hidden_states, dim=1) else: padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask) hidden_states[~padding_mask] = 0.0 - pooled_output = hidden_states.sum(axis=1) / padding_mask.sum(axis=1).view(-1, 1) + pooled_output = ops.sum(hidden_states, dim=1) / ops.sum(padding_mask, dim=1).view(-1, 1) logits = self.classifier(pooled_output) loss = None if labels is not None: - labels = labels.astype(mindspore.int32) loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) @@ -1313,8 +1284,8 @@ def forward( attentions=outputs.attentions, ) - class Wav2Vec2BertForAudioFrameClassification(Wav2Vec2BertPreTrainedModel): + # Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerForAudioFrameClassification.__init__ with Wav2Vec2Conformer->Wav2Vec2Bert,WAV2VEC2_CONFORMER->WAV2VEC2_BERT,wav2vec2_conformer->wav2vec2_bert def __init__(self, config): super().__init__(config) @@ -1325,7 +1296,7 @@ def __init__(self, config): self.wav2vec2_bert = Wav2Vec2BertModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: - self.layer_weights = Parameter(ops.ones(num_layers) / num_layers) + self.layer_weights = nn.Parameter(ops.ones(num_layers) / num_layers) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.num_labels = config.num_labels @@ -1371,8 +1342,8 @@ def forward( if self.config.use_weighted_layer_sum: hidden_states = outputs[_HIDDEN_STATES_START_POSITION] hidden_states = ops.stack(hidden_states, dim=1) - norm_weights = ops.softmax(self.layer_weights, dim=-1) - hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(axis=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = ops.sum((hidden_states * norm_weights.view(-1, 1, 1)), dim=1) else: hidden_states = outputs[0] @@ -1380,7 +1351,6 @@ def forward( loss = None if labels is not None: - labels = labels.astype(mindspore.int32) loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), ops.argmax(labels.view(-1, self.num_labels), dim=1)) @@ -1403,17 +1373,17 @@ def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4): self.scale = scale self.margin = margin self.num_labels = num_labels - self.weight = Parameter(ops.randn(input_dim, num_labels), requires_grad=True) + self.weight = nn.Parameter(ops.randn(input_dim, num_labels), requires_grad=True) self.loss = nn.CrossEntropyLoss() def forward(self, hidden_states, labels): labels = labels.flatten() - weight = F.normalize(self.weight, dim=0) - hidden_states = F.normalize(hidden_states, dim=1) + weight = nn.functional.normalize(self.weight, dim=0) + hidden_states = nn.functional.normalize(hidden_states, dim=1) cos_theta = ops.mm(hidden_states, weight) psi = cos_theta - self.margin - onehot = ops.one_hot(labels, self.num_labels) + onehot = nn.functional.one_hot(labels, self.num_labels) logits = self.scale * ops.where(onehot.bool(), psi, cos_theta) loss = self.loss(logits, labels) @@ -1433,10 +1403,18 @@ def __init__(self, config, layer_id=0): self.activation = nn.ReLU() def forward(self, hidden_states: mindspore.Tensor) -> mindspore.Tensor: + from ....peft.tuners.lora import LoraLayer + + if isinstance(self.kernel, 
LoraLayer): + warnings.warn( + "Detected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. " + "You should exclude TDNNLayer from LoRA's target modules.", + ) + # for backward compatibility, we keep nn.Linear but call F.conv1d for speed up hidden_states = hidden_states.swapaxes(1, 2) weight = self.kernel.weight.view(self.out_conv_dim, self.kernel_size, self.in_conv_dim).swapaxes(1, 2) - hidden_states = ops.conv1d(hidden_states, weight, self.kernel.bias, dilation=self.dilation) + hidden_states = nn.functional.conv1d(hidden_states, weight, self.kernel.bias, dilation=self.dilation) hidden_states = hidden_states.swapaxes(1, 2) hidden_states = self.activation(hidden_states) @@ -1444,13 +1422,14 @@ def forward(self, hidden_states: mindspore.Tensor) -> mindspore.Tensor: class Wav2Vec2BertForXVector(Wav2Vec2BertPreTrainedModel): + # Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerForXVector.__init__ with Wav2Vec2Conformer->Wav2Vec2Bert,WAV2VEC2_CONFORMER->WAV2VEC2_BERT,wav2vec2_conformer->wav2vec2_bert def __init__(self, config): super().__init__(config) self.wav2vec2_bert = Wav2Vec2BertModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: - self.layer_weights = Parameter(ops.ones(num_layers) / num_layers) + self.layer_weights = nn.Parameter(ops.ones(num_layers) / num_layers) self.projector = nn.Linear(config.hidden_size, config.tdnn_dim[0]) tdnn_layers = [TDNNLayer(config, i) for i in range(len(config.tdnn_dim))] @@ -1480,7 +1459,6 @@ def _get_tdnn_output_lengths(self, input_lengths: Union[mindspore.Tensor, int]): def _conv_out_length(input_length, kernel_size, stride): # 1D convolutional layer output length formula taken - # from https://pyops.org/docs/stable/generated/ops.nn.Conv1d.html return (input_length - kernel_size) // stride + 1 for kernel_size in self.config.tdnn_kernel: @@ -1519,8 +1497,8 @@ def forward( if self.config.use_weighted_layer_sum: hidden_states = outputs[_HIDDEN_STATES_START_POSITION] hidden_states = ops.stack(hidden_states, dim=1) - norm_weights = ops.softmax(self.layer_weights, dim=-1) - hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(axis=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = ops.sum((hidden_states * norm_weights.view(-1, 1, 1)), dim=1) else: hidden_states = outputs[0] @@ -1531,16 +1509,16 @@ def forward( # Statistic Pooling if attention_mask is None: - mean_features = hidden_states.mean(axis=1) - std_features = ops.std(hidden_states, dim=1, keepdim=True).squeeze(1) + mean_features = ops.mean(hidden_states, dim=1) + std_features = ops.std(hidden_states, dim=1) else: - feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(axis=1)) + feat_extract_output_lengths = self._get_feat_extract_output_lengths(ops.sum(attention_mask, dim=1)) tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths) mean_features = [] std_features = [] for i, length in enumerate(tdnn_output_lengths): - mean_features.append(hidden_states[i, :length].mean(axis=0)) - std_features.append(hidden_states[i, :length].std(axis=0)) + mean_features.append(ops.mean(hidden_states[i, :length], dim=0)) + std_features.append(ops.std(hidden_states[i, :length], dim=0)) mean_features = ops.stack(mean_features) std_features = ops.stack(std_features) statistic_pooling = ops.cat([mean_features, std_features], dim=-1) @@ -1564,7 +1542,6 @@ def forward( 
attentions=outputs.attentions, ) - __all__ = [ 'Wav2Vec2BertForAudioFrameClassification', 'Wav2Vec2BertForCTC', diff --git a/mindnlp/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py b/mindnlp/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py index f6ce98c36..6a7f6cfce 100644 --- a/mindnlp/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +++ b/mindnlp/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""mindnlp Wav2Vec2-Conformer model.""" +"""MindSpore Wav2Vec2-Conformer model.""" import math import warnings @@ -21,15 +21,9 @@ import numpy as np import mindspore -from mindspore import Tensor, Parameter -from mindspore.common.initializer import initializer, Uniform, Normal - from mindnlp.core import nn, ops -from mindnlp.core.nn import functional as F -from mindnlp.utils import ( - ModelOutput, - logging, -) +from mindnlp.core.nn import CrossEntropyLoss + from ...activations import ACT2FN from ...modeling_outputs import ( BaseModelOutput, @@ -40,20 +34,15 @@ XVectorOutput, ) from ...modeling_utils import PreTrainedModel +from ....utils import ( + ModelOutput, + logging, +) from .configuration_wav2vec2_conformer import Wav2Vec2ConformerConfig logger = logging.get_logger(__name__) -__all__ = [ - "Wav2Vec2ConformerForAudioFrameClassification", - "Wav2Vec2ConformerForCTC", - "Wav2Vec2ConformerForPreTraining", - "Wav2Vec2ConformerForSequenceClassification", - "Wav2Vec2ConformerForXVector", - "Wav2Vec2ConformerModel", - "Wav2Vec2ConformerPreTrainedModel", -] _HIDDEN_STATES_START_POSITION = 2 @@ -85,14 +74,12 @@ class Wav2Vec2ConformerForPreTrainingOutput(ModelOutput): projected_quantized_states (`mindspore.Tensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive target vectors for contrastive loss. - hidden_states (`tuple(mindspore.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed - or when `config.output_hidden_states=True`): + hidden_states (`tuple(mindspore.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `mindspore.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(mindspore.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when - `config.output_attentions=True`): + attentions (`tuple(mindspore.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `mindspore.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. @@ -114,14 +101,6 @@ class Wav2Vec2ConformerForPreTrainingOutput(ModelOutput): diversity_loss: Optional[mindspore.Tensor] = None -def is_deepspeed_zero3_enabled(): - return False - - -def is_peft_available(): - return False - - # Copied from transformers.models.wav2vec2.modeling_wav2vec2._compute_mask_indices def _compute_mask_indices( shape: Tuple[int, int], @@ -137,15 +116,15 @@ def _compute_mask_indices( Args: shape: The shape for which to compute masks. 
This should be of a tuple of size 2 where - the first element is the batch size and the second element is the length of the axis to span. - mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of - independently generated mask spans of length `mask_length` is computed by - `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the - actual percentage will be smaller. + the first element is the batch size and the second element is the length of the axis to span. + mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of + independently generated mask spans of length `mask_length` is computed by + `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the + actual percentage will be smaller. mask_length: size of the mask min_masks: minimum number of masked spans attention_mask: A (right-padded) attention mask which independently shortens the feature axis of - each batch dimension. + each batch dimension. """ batch_size, sequence_length = shape @@ -178,7 +157,7 @@ def compute_num_masked_span(input_length): # compute number of masked spans in batch input_lengths = ( - attention_mask.sum(-1).asnumpy().tolist() + attention_mask.sum(-1).tolist() if attention_mask is not None else [sequence_length for _ in range(batch_size)] ) @@ -219,7 +198,7 @@ def compute_num_masked_span(input_length): spec_aug_mask_idxs = np.array(spec_aug_mask_idxs) - # expand(broadcast_to) masked indices to masked spans + # expand masked indices to masked spans spec_aug_mask_idxs = np.broadcast_to( spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length) ) @@ -257,8 +236,6 @@ def _sample_negative_indices( # get `num_negatives` random vector indices from the same utterance sampled_negative_indices = np.zeros(shape=(batch_size, sequence_length, num_negatives), dtype=np.int32) - if isinstance(mask_time_indices, Tensor): - mask_time_indices = mask_time_indices.asnumpy() mask_time_indices = ( mask_time_indices.astype(bool) if mask_time_indices is not None else np.ones(features_shape, dtype=bool) ) @@ -317,7 +294,7 @@ def __init__(self, config, layer_id=0): stride=config.conv_stride[layer_id], bias=config.conv_bias, ) - self.layer_norm = nn.LayerNorm([self.out_conv_dim]) + self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True) self.activation = ACT2FN[config.feat_extract_activation] def forward(self, hidden_states): @@ -369,7 +346,9 @@ def __init__(self, config): ) weight_norm = nn.utils.weight_norm - self.conv = weight_norm(self.conv, name='weight', dim=2) + + self.conv = weight_norm(self.conv, name="weight", dim=2) + self.padding = Wav2Vec2ConformerSamePadLayer(config.num_conv_pos_embeddings) self.activation = ACT2FN[config.feat_extract_activation] @@ -395,7 +374,7 @@ def __init__(self, config): base = config.rotary_embedding_base inv_freq = 1.0 / (base ** (ops.arange(0, dim, 2, dtype=mindspore.int64).float() / dim)) - self.inv_freq = inv_freq + self.register_buffer("inv_freq", inv_freq) self.cached_sequence_length = None self.cached_rotary_positional_embedding = None @@ -426,7 +405,7 @@ def __init__(self, config): self.max_len = config.max_source_positions self.d_model = config.hidden_size self.pe = None - self.extend_pe(mindspore.Tensor(0.0).broadcast_to((1, self.max_len))) + self.extend_pe(mindspore.tensor(0.0).broadcast_to((1, self.max_len))) def extend_pe(self, x): # Reset the positional encodings @@ -434,7 +413,7 @@ def 
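For reference on `_compute_mask_indices`: after sampling span start indices, each start is expanded into a contiguous span of `mask_length` frames by broadcasting and adding offsets, as the "expand masked indices to masked spans" comment describes. A rough NumPy sketch of that expansion step (simplified: no per-sample lengths, no `min_masks`, and the `expand_mask_spans` name is only for illustration):

```python
import numpy as np

def expand_mask_spans(span_starts, mask_length, sequence_length):
    """Turn start indices (batch, num_spans) into a boolean mask (batch, seq_len)."""
    batch_size = span_starts.shape[0]
    # add offsets 0..mask_length-1 to every start -> (batch, num_spans, mask_length)
    idxs = span_starts[:, :, None] + np.arange(mask_length)[None, None, :]
    idxs = idxs.reshape(batch_size, -1)

    mask = np.zeros((batch_size, sequence_length), dtype=bool)
    np.put_along_axis(mask, idxs, True, axis=-1)    # scatter True over every span
    return mask

starts = np.array([[1, 6], [0, 3]])
mask = expand_mask_spans(starts, mask_length=3, sequence_length=10)
assert mask[0].sum() == 6 and mask[1, :3].all()
```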
extend_pe(self, x): # self.pe contains both positive and negative parts # the length of self.pe is 2 * input_len - 1 if self.pe.shape[1] >= x.shape[1] * 2 - 1: - if self.pe.dtype != x.dtype : + if self.pe.dtype != x.dtype: self.pe = self.pe.to(dtype=x.dtype) return # Suppose `i` is the position of query vector and `j` is the @@ -501,30 +480,40 @@ def __init__(self, config): f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']" ) self.conv_layers = nn.ModuleList(conv_layers) - # self.gradient_checkpointing = False + self.gradient_checkpointing = False self._requires_grad = True def _freeze_parameters(self): - for _, param in self.parameters_and_names(): + for param in self.parameters(): param.requires_grad = False self._requires_grad = False def forward(self, input_values): hidden_states = input_values[:, None] - for conv_layer in self.conv_layers: - hidden_states = conv_layer(hidden_states) - return hidden_states + # make sure hidden_states require grad for gradient_checkpointing + if self._requires_grad and self.training: + hidden_states.requires_grad = True + for conv_layer in self.conv_layers: + if self._requires_grad and self.gradient_checkpointing and self.training: + hidden_states = self._gradient_checkpointing_func( + conv_layer.__call__, + hidden_states, + ) + else: + hidden_states = conv_layer(hidden_states) + + return hidden_states # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureProjection with Wav2Vec2->Wav2Vec2Conformer class Wav2Vec2ConformerFeatureProjection(nn.Module): def __init__(self, config): super().__init__() - self.layer_norm = nn.LayerNorm([config.conv_dim[-1]], eps=config.layer_norm_eps) + self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps) self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size) - self.dropout = nn.Dropout(p = config.feat_proj_dropout) + self.dropout = nn.Dropout(config.feat_proj_dropout) def forward(self, hidden_states): # non-projected hidden states are needed for quantization @@ -538,7 +527,7 @@ def forward(self, hidden_states): class Wav2Vec2ConformerFeedForward(nn.Module): def __init__(self, config): super().__init__() - self.intermediate_dropout = nn.Dropout(p = config.activation_dropout) + self.intermediate_dropout = nn.Dropout(config.activation_dropout) self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size) if isinstance(config.hidden_act, str): @@ -547,7 +536,7 @@ def __init__(self, config): self.intermediate_act_fn = config.hidden_act self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.output_dropout = nn.Dropout(p = config.hidden_dropout) + self.output_dropout = nn.Dropout(config.hidden_dropout) def forward(self, hidden_states): hidden_states = self.intermediate_dense(hidden_states) @@ -566,7 +555,7 @@ def __init__(self, config): super().__init__() if (config.conv_depthwise_kernel_size - 1) % 2 == 1: raise ValueError("`config.conv_depthwise_kernel_size` should be a odd number for 'SAME' padding") - self.layer_norm = nn.LayerNorm([config.hidden_size]) + self.layer_norm = nn.LayerNorm(config.hidden_size) self.pointwise_conv1 = nn.Conv1d( config.hidden_size, 2 * config.hidden_size, @@ -595,7 +584,7 @@ def __init__(self, config): padding=0, bias=False, ) - self.dropout = nn.Dropout(p = config.conformer_conv_dropout) + self.dropout = nn.Dropout(config.conformer_conv_dropout) def forward(self, hidden_states): hidden_states = self.layer_norm(hidden_states) @@ -620,8 +609,7 @@ 
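The odd-kernel check in the convolution module above exists because 'SAME'-style padding for a stride-1 convolution only preserves the sequence length exactly when the kernel size is odd, with `padding = (kernel_size - 1) // 2` on each side. A quick sanity check of the standard output-length formula (plain Python, independent of the model code):

```python
def conv1d_out_len(seq_len, kernel_size, stride=1, padding=0):
    # standard 1D convolution output-length formula
    return (seq_len + 2 * padding - kernel_size) // stride + 1

for k in (3, 5, 31):   # odd kernels keep the length with symmetric padding
    assert conv1d_out_len(100, k, padding=(k - 1) // 2) == 100

# an even kernel cannot keep the length unchanged with symmetric padding
assert conv1d_out_len(100, 4, padding=1) == 99
```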
def forward(self, hidden_states): class Wav2Vec2ConformerSelfAttention(nn.Module): - """ - Construct an Wav2Vec2ConformerSelfAttention object. + """Construct an Wav2Vec2ConformerSelfAttention object. Can be enhanced with rotary or relative position embeddings. """ @@ -644,8 +632,8 @@ def __init__(self, config): self.linear_pos = nn.Linear(config.hidden_size, config.hidden_size, bias=False) # these two learnable bias are used in matrix c and matrix d # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - self.pos_bias_u = mindspore.Parameter(ops.zeros(self.num_heads, self.head_size),'pos_bias_u') - self.pos_bias_v = mindspore.Parameter(ops.zeros(self.num_heads, self.head_size),'pos_bias_v') + self.pos_bias_u = nn.Parameter(ops.zeros(self.num_heads, self.head_size)) + self.pos_bias_v = nn.Parameter(ops.zeros(self.num_heads, self.head_size)) def forward( self, @@ -777,21 +765,21 @@ def __init__(self, config): dropout = config.attention_dropout # Feed-forward 1 - self.ffn1_layer_norm = nn.LayerNorm([embed_dim]) + self.ffn1_layer_norm = nn.LayerNorm(embed_dim) self.ffn1 = Wav2Vec2ConformerFeedForward(config) # Self-Attention - self.self_attn_layer_norm = nn.LayerNorm([embed_dim]) - self.self_attn_dropout = nn.Dropout(p = dropout) + self.self_attn_layer_norm = nn.LayerNorm(embed_dim) + self.self_attn_dropout = nn.Dropout(dropout) self.self_attn = Wav2Vec2ConformerSelfAttention(config) # Conformer Convolution self.conv_module = Wav2Vec2ConformerConvolutionModule(config) # Feed-forward 2 - self.ffn2_layer_norm = nn.LayerNorm([embed_dim]) + self.ffn2_layer_norm = nn.LayerNorm(embed_dim) self.ffn2 = Wav2Vec2ConformerFeedForward(config) - self.final_layer_norm = nn.LayerNorm([embed_dim]) + self.final_layer_norm = nn.LayerNorm(embed_dim) def forward( self, @@ -800,7 +788,6 @@ def forward( relative_position_embeddings: Optional[mindspore.Tensor] = None, output_attentions: bool = False, ): - # 1. 
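For context on the `pos_bias_u` / `pos_bias_v` parameters re-declared above: in Transformer-XL style relative attention (the arXiv 1901.02860 reference in the comment) the attention score splits into a content term and a position term, roughly `(q + u)·kᵀ + (q + v)·rᵀ`, where `r` are projected relative position embeddings. A toy single-head NumPy sketch of that decomposition; the real layer additionally applies a relative-shift rearrangement to align offsets, which is omitted here:

```python
import numpy as np

def relative_attention_scores(q, k, r, u, v):
    """q, k: (seq, head_dim); r: (num_rel_pos, head_dim); u, v: (head_dim,)."""
    content_scores = (q + u) @ k.T    # terms (a) + (c) of Transformer-XL
    position_scores = (q + v) @ r.T   # terms (b) + (d), before the relative shift
    return content_scores, position_scores

d = 8
q, k = np.random.randn(5, d), np.random.randn(5, d)
r = np.random.randn(2 * 5 - 1, d)     # one embedding per relative offset
u, v = np.zeros(d), np.zeros(d)
c_scores, p_scores = relative_attention_scores(q, k, r, u, v)
assert c_scores.shape == (5, 5) and p_scores.shape == (5, 9)
```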
Feed-Forward 1 layer residual = hidden_states hidden_states = self.ffn1_layer_norm(hidden_states) @@ -847,8 +834,8 @@ def __init__(self, config): self.embed_positions = None self.pos_conv_embed = Wav2Vec2ConformerPositionalConvEmbedding(config) - self.layer_norm = nn.LayerNorm([config.hidden_size], eps=config.layer_norm_eps) - self.dropout = nn.Dropout(p = config.hidden_dropout) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout) self.layers = nn.ModuleList([Wav2Vec2ConformerEncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False @@ -870,9 +857,9 @@ def forward( # extend attention_mask attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype) attention_mask = attention_mask * float(ops.finfo(hidden_states.dtype).min) - attention_mask = attention_mask.broadcast_to(( - attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] - )) + attention_mask = attention_mask.broadcast_to( + (attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]) + ) hidden_states = self.dropout(hidden_states) @@ -881,7 +868,6 @@ def forward( else: relative_position_embeddings = None - deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() for i, layer in enumerate(self.layers): if output_hidden_states: @@ -891,7 +877,7 @@ def forward( dropout_probability = ops.rand([]) skip_the_layer = self.training and (dropout_probability < self.config.layerdrop) - if not skip_the_layer or deepspeed_zero3_is_enabled: + if not skip_the_layer: # under deepspeed zero3 all gpus must run in sync if self.gradient_checkpointing and self.training: layer_outputs = self._gradient_checkpointing_func( @@ -948,9 +934,7 @@ def __init__(self, config): ) # storage for codebook variables (codewords) - self.codevectors = Parameter( - ops.zeros((1, self.num_groups * self.num_vars, config.codevector_dim // self.num_groups)) - ) + self.codevectors = nn.Parameter(ops.randn(1, self.num_groups * self.num_vars, config.codevector_dim // self.num_groups)) self.weight_proj = nn.Linear(config.conv_dim[-1], self.num_groups * self.num_vars) # can be decayed for training @@ -959,11 +943,11 @@ def __init__(self, config): @staticmethod def _compute_perplexity(probs, mask=None): if mask is not None: - mask_extended = mask.flatten()[:, None, None].broadcast_to((probs.shape)) + mask_extended = mask.flatten()[:, None, None].broadcast_to(probs.shape) probs = ops.where(mask_extended, probs, ops.zeros_like(probs)) - marginal_probs = probs.sum(axis=0) / mask.sum() + marginal_probs = ops.sum(probs, dim=0) / mask.sum() else: - marginal_probs = probs.mean(axis=0) + marginal_probs = ops.mean(probs, dim=0) perplexity = ops.exp(-ops.sum(marginal_probs * ops.log(marginal_probs + 1e-7), dim=-1)).sum() return perplexity @@ -977,7 +961,7 @@ def forward(self, hidden_states, mask_time_indices=None): if self.training: # sample code vector probs via gumbel in differentiateable way - codevector_probs = ops.gumbel_softmax( + codevector_probs = nn.functional.gumbel_softmax( hidden_states.float(), tau=self.temperature, hard=True ).type_as(hidden_states) @@ -989,12 +973,12 @@ def forward(self, hidden_states, mask_time_indices=None): else: # take argmax in non-differentiable way # comptute hard codevector distribution (one hot) - codevector_idx = ops.argmax(hidden_states,dim=-1) - x = hidden_states.new_zeros(hidden_states.shape) # (364, 320) - index = codevector_idx.view(-1, 1) - update = 
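The `_compute_perplexity` helper touched above measures how evenly the quantizer uses its codebook: it averages the soft codevector distributions over frames and exponentiates the entropy, so uniform usage gives roughly `num_groups * num_vars` while a collapsed codebook gives roughly `num_groups`. A small NumPy sketch of the same quantity (toy shapes; the `codebook_perplexity` name is only for illustration):

```python
import numpy as np

def codebook_perplexity(probs):
    """probs: (num_frames, num_groups, num_vars) soft codevector distributions."""
    marginal = probs.mean(axis=0)                                    # average usage per group
    entropy = -np.sum(marginal * np.log(marginal + 1e-7), axis=-1)   # per-group entropy
    return np.exp(entropy).sum()                                     # summed over groups

uniform = np.full((100, 2, 8), 1 / 8)       # every codevector used equally
collapsed = np.zeros((100, 2, 8))
collapsed[..., 0] = 1.0                     # only the first codevector is ever picked

assert codebook_perplexity(uniform) > 15.9      # ~ 2 groups * 8 vars
assert codebook_perplexity(collapsed) < 2.1     # ~ 2 groups * 1 var
```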
ops.ones_like(index, dtype=hidden_states.dtype) # fill with onehot - codevector_probs = ops.scatter(x, -1, index, update) - codevector_probs = codevector_probs.view(batch_size * sequence_length, self.num_groups, -1) # (182, 2, 320) + codevector_idx = ops.argmax(hidden_states, dim=-1).view(-1, 1) + codevector_probs = ops.scatter( + ops.zeros(hidden_states.shape, dtype=hidden_states.dtype), + -1, codevector_idx, ops.ones(codevector_idx.shape, dtype=hidden_states.dtype) + ) + codevector_probs = codevector_probs.view(batch_size * sequence_length, self.num_groups, -1) perplexity = self._compute_perplexity(codevector_probs, mask_time_indices) @@ -1015,11 +999,11 @@ def __init__(self, config): # feature dim might need to be down-projected if config.output_hidden_size != config.hidden_size: self.proj = nn.Linear(config.hidden_size, config.output_hidden_size) - self.proj_layer_norm = nn.LayerNorm([config.output_hidden_size]) + self.proj_layer_norm = nn.LayerNorm(config.output_hidden_size) else: self.proj = self.proj_layer_norm = None - self.layers = nn.ModuleList([Wav2Vec2ConformerAdapterLayer(config) for _ in range(config.num_adapter_layers)]) + self.layers = nn.ModuleList(Wav2Vec2ConformerAdapterLayer(config) for _ in range(config.num_adapter_layers)) self.layerdrop = config.layerdrop def forward(self, hidden_states): @@ -1053,7 +1037,7 @@ def __init__(self, config): def forward(self, hidden_states): hidden_states = self.conv(hidden_states) - hidden_states = F.glu(hidden_states, dim=1) + hidden_states = nn.functional.glu(hidden_states, dim=1) return hidden_states @@ -1069,46 +1053,49 @@ class Wav2Vec2ConformerPreTrainedModel(PreTrainedModel): main_input_name = "input_values" supports_gradient_checkpointing = True - - def _init_weights(self, cell): + def _init_weights(self, module): """Initialize the weights""" # Wav2Vec2ForPreTraining last 2 linear layers need standard Linear init. 
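The rewritten non-differentiable branch above builds the hard codevector distribution by scattering ones at the per-frame argmax positions. An equivalent NumPy sketch of that scatter-into-one-hot step (illustrative helper name and shapes):

```python
import numpy as np

def hard_one_hot(logits):
    """logits: (num_frames, num_codevectors) -> one-hot of the argmax per row."""
    idx = logits.argmax(axis=-1).reshape(-1, 1)       # (num_frames, 1)
    one_hot = np.zeros_like(logits)
    np.put_along_axis(one_hot, idx, 1.0, axis=-1)     # scatter a 1 at each argmax
    return one_hot

logits = np.array([[0.1, 2.0, -1.0], [3.0, 0.0, 0.5]])
expected = np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 0.0]])
assert np.array_equal(hard_one_hot(logits), expected)
```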
- if isinstance(cell, Wav2Vec2ConformerForPreTraining): - cell.project_hid._is_hf_initialized = True - cell.project_q._is_hf_initialized = True + if isinstance(module, Wav2Vec2ConformerForPreTraining): + module.project_hid.reset_parameters() + module.project_q.reset_parameters() + module.project_hid._is_initialized = True + module.project_q._is_initialized = True # gumbel softmax requires special init - elif isinstance(cell, Wav2Vec2ConformerGumbelVectorQuantizer): - cell.weight_proj.weight.set_data(initializer(Normal(1.0), cell.weight_proj.weight.shape, cell.weight_proj.weight.dtype)) - cell.weight_proj.bias.set_data(initializer('zeros', cell.weight_proj.bias.shape, cell.weight_proj.bias.dtype)) - cell.codevectors.set_data(initializer('uniform', cell.codevectors.shape, cell.codevectors.dtype)) - elif isinstance(cell, Wav2Vec2ConformerSelfAttention): - if hasattr(cell, "pos_bias_u"): - cell.pos_bias_u.set_data(initializer('XavierUniform', cell.pos_bias_u.shape, cell.pos_bias_u.dtype)) - if hasattr(cell, "pos_bias_v"): - cell.pos_bias_v.set_data(initializer('XavierUniform', cell.pos_bias_u.shape, cell.pos_bias_u.dtype)) - elif isinstance(cell, Wav2Vec2ConformerPositionalConvEmbedding): - cell.conv.weight.set_data( - initializer(Normal(2 * math.sqrt(1 / (cell.conv.kernel_size[0] * cell.conv.in_channels))), - cell.conv.weight.shape, cell.conv.weight.dtype)) - cell.conv.bias.set_data(initializer('zeros', cell.conv.bias.shape, cell.conv.bias.dtype)) - elif isinstance(cell, Wav2Vec2ConformerFeatureProjection): - k = math.sqrt(1 / cell.projection.in_channels) - cell.projection.weight.set_data( - initializer(Uniform(k), cell.projection.weight.shape, cell.projection.weight.dtype)) - cell.projection.bias.set_data( - initializer(Uniform(k), cell.projection.bias.shape, cell.projection.bias.dtype)) - elif isinstance(cell, nn.Linear): - cell.weight.set_data(initializer(Normal(self.config.initializer_range), cell.weight.shape, cell.weight.dtype)) - if cell.bias is not None: - cell.bias.set_data(initializer('zeros', cell.bias.shape, cell.bias.dtype)) - elif isinstance(cell, (nn.LayerNorm, nn.GroupNorm)): - cell.weight.set_data(initializer('ones', cell.weight.shape, cell.weight.dtype)) - cell.bias.set_data(initializer('zeros', cell.bias.shape, cell.bias.dtype)) - elif isinstance(cell, nn.Conv1d): - cell.weight.set_data(initializer('he_normal', cell.weight.shape, cell.weight.dtype)) - if cell.bias is not None: - k = math.sqrt(cell.group / (cell.in_channels * cell.kernel_size[0])) - cell.bias.set_data(initializer(Uniform(k), cell.bias.shape, cell.bias.dtype)) + elif isinstance(module, Wav2Vec2ConformerGumbelVectorQuantizer): + nn.init.normal_(module.weight_proj.weight, mean=0.0, std=1) + nn.init.zeros_(module.weight_proj.bias) + nn.init.uniform_(module.codevectors) + elif isinstance(module, Wav2Vec2ConformerSelfAttention): + if hasattr(module, "pos_bias_u"): + nn.init.xavier_uniform_(module.pos_bias_u) + if hasattr(module, "pos_bias_v"): + nn.init.xavier_uniform_(module.pos_bias_v) + elif isinstance(module, Wav2Vec2ConformerPositionalConvEmbedding): + nn.init.normal_( + module.conv.weight, + mean=0, + std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)), + ) + nn.init.constant_(module.conv.bias, 0) + elif isinstance(module, Wav2Vec2ConformerFeatureProjection): + k = math.sqrt(1 / module.projection.in_features) + nn.init.uniform_(module.projection.weight, a=-k, b=k) + nn.init.uniform_(module.projection.bias, a=-k, b=k) + elif isinstance(module, nn.Linear): + 
nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range) + + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)): + nn.init.zeros_(module.bias) + nn.init.ones_(module.weight) + elif isinstance(module, nn.Conv1d): + nn.init.kaiming_normal_(module.weight) + + if module.bias is not None: + k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0])) + nn.init.uniform_(module.bias, a=-k, b=k) def _get_feat_extract_output_lengths( self, input_lengths: Union[mindspore.Tensor, int], add_adapter: Optional[bool] = None @@ -1121,8 +1108,7 @@ def _get_feat_extract_output_lengths( def _conv_out_length(input_length, kernel_size, stride): # 1D convolutional layer output length formula taken - # from https://pyops.org/docs/stable/generated/ops.nn.Conv1d.html - return (input_length - kernel_size) // stride + 1 + return ops.div(input_length - kernel_size, stride, rounding_mode="floor") + 1 for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride): input_lengths = _conv_out_length(input_lengths, kernel_size, stride) @@ -1138,7 +1124,7 @@ def _get_feature_vector_attention_mask( ): # Effectively attention_mask.sum(-1), but not inplace to be able to run # on inference mode. - non_padded_lengths = attention_mask.cumsum(axis=-1)[:, -1] + non_padded_lengths = ops.cumsum(attention_mask, dim=-1)[:, -1] output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter) output_lengths = output_lengths.to(mindspore.int64) @@ -1150,66 +1136,10 @@ def _get_feature_vector_attention_mask( ) # these two operations makes sure that all values before the output lengths idxs are attended to attention_mask[(ops.arange(attention_mask.shape[0]), output_lengths - 1)] = 1 - attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool() + attention_mask = attention_mask.flip([-1]).int().cumsum(-1).flip([-1]).bool() return attention_mask -WAV2VEC2_CONFORMER_START_DOCSTRING = r""" - Wav2Vec2Conformer was proposed in [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech - Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael - Auli. - - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving etc.). - - This model is a PyTorch [nn.Module](https://pyops.org/docs/stable/nn.html#nn.Module) sub-class. Use it as a - regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. - - Parameters: - config ([`Wav2Vec2ConformerConfig`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -WAV2VEC2_CONFORMER_INPUTS_DOCSTRING = r""" - Args: - input_values (`mindspore.Tensor` of shape `(batch_size, sequence_length)`): - Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `mindspore.Tensor`. 
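On the `flip(...).cumsum(...).flip(...)` line kept in `_get_feature_vector_attention_mask` above: a single 1 placed at index `output_length - 1` is turned into a mask of ones over every position up to and including that index. A NumPy sketch of why the trick works (hypothetical lengths, not values from the model):

```python
import numpy as np

output_lengths = np.array([3, 5])
mask = np.zeros((2, 6), dtype=np.int64)
mask[np.arange(2), output_lengths - 1] = 1   # a single 1 at the last valid frame

# reverse, cumulative-sum, reverse: everything left of (and including) the 1 becomes 1
mask = np.flip(np.cumsum(np.flip(mask, axis=-1), axis=-1), axis=-1).astype(bool)

assert mask.tolist() == [
    [True, True, True, False, False, False],
    [True, True, True, True, True, False],
]
```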
See [`Wav2Vec2Processor.__call__`] for details. - attention_mask (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - - - `attention_mask` should only be passed if the corresponding processor has `config.return_attention_mask == - True`. For all models whose processor has `config.return_attention_mask == False`, such as - [wav2vec2-conformer-rel-pos-large](https://huggingface.co/facebook/wav2vec2-conformer-rel-pos-large), - `attention_mask` should **not** be passed to avoid degraded performance when doing batched inference. For - such models `input_values` should simply be padded with 0 and passed without `attention_mask`. Be aware - that these models also yield slightly different results depending on whether `input_values` is padded or - not. - - - - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - class Wav2Vec2ConformerModel(Wav2Vec2ConformerPreTrainedModel): def __init__(self, config: Wav2Vec2ConformerConfig): super().__init__(config) @@ -1219,7 +1149,7 @@ def __init__(self, config: Wav2Vec2ConformerConfig): # model only needs masking vector if mask prob is > 0.0 if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0: - self.masked_spec_embed = Parameter(initializer(Normal(), [config.hidden_size]), 'masked_spec_embed') + self.masked_spec_embed = nn.Parameter(ops.randn(config.hidden_size)) self.encoder = Wav2Vec2ConformerEncoder(config) @@ -1266,7 +1196,7 @@ def _mask_hidden_states( attention_mask=attention_mask, min_masks=self.config.mask_time_min_masks, ) - mask_time_indices = mindspore.Tensor(mask_time_indices, dtype=mindspore.bool_) + mask_time_indices = mindspore.tensor(mask_time_indices, dtype=mindspore.bool_) hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) if self.config.mask_feature_prob > 0 and self.training: @@ -1277,7 +1207,7 @@ def _mask_hidden_states( mask_length=self.config.mask_feature_length, min_masks=self.config.mask_feature_min_masks, ) - mask_feature_indices = mindspore.Tensor(mask_feature_indices, dtype=mindspore.bool_) + mask_feature_indices = mindspore.tensor(mask_feature_indices, dtype=mindspore.bool_) mask_feature_indices = mask_feature_indices[:, None].broadcast_to((-1, sequence_length, -1)) hidden_states[mask_feature_indices] = 0 @@ -1299,22 +1229,20 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - extract_features = self.feature_extractor(input_values) extract_features = extract_features.swapaxes(1, 2) - if attention_mask is not None: # compute reduced attention_mask corresponding to feature vectors attention_mask = self._get_feature_vector_attention_mask( extract_features.shape[1], attention_mask, add_adapter=False ) - hidden_states, extract_features = self.feature_projection(extract_features) hidden_states = self._mask_hidden_states( hidden_states, 
mask_time_indices=mask_time_indices, attention_mask=attention_mask ) + encoder_outputs = self.encoder( hidden_states, attention_mask=attention_mask, @@ -1344,7 +1272,7 @@ class Wav2Vec2ConformerForPreTraining(Wav2Vec2ConformerPreTrainedModel): def __init__(self, config: Wav2Vec2ConformerConfig): super().__init__(config) self.wav2vec2_conformer = Wav2Vec2ConformerModel(config) - self.dropout_features = nn.Dropout(p = config.feat_quantizer_dropout) + self.dropout_features = nn.Dropout(config.feat_quantizer_dropout) self.quantizer = Wav2Vec2ConformerGumbelVectorQuantizer(config) @@ -1403,62 +1331,61 @@ def forward( return_dict: Optional[bool] = None, ) -> Union[Tuple, Wav2Vec2ConformerForPreTrainingOutput]: r""" - Args: - mask_time_indices (`ops.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict - masked extracted features in *config.proj_codevector_dim* space. - sampled_negative_indices (`ops.BoolTensor` of shape `(batch_size, sequence_length, num_negatives)`, *optional*): - Indices indicating which quantized target vectors are used as negative sampled vectors in contrastive loss. - Required input for pre-training. + mask_time_indices (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict + masked extracted features in *config.proj_codevector_dim* space. + sampled_negative_indices (`mindspore.Tensor` of shape `(batch_size, sequence_length, num_negatives)`, *optional*): + Indices indicating which quantized target vectors are used as negative sampled vectors in contrastive loss. + Required input for pre-training. Returns: - `Union[Tuple, Wav2Vec2ConformerForPreTrainingOutput]` Example: - ```python - >>> import torch - >>> from transformers import AutoFeatureExtractor, Wav2Vec2ConformerForPreTraining - >>> from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer import _compute_mask_indices, _sample_negative_indices - >>> from datasets import load_dataset - ... - >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-conformer-rel-pos-large") - >>> model = Wav2Vec2ConformerForPreTraining.from_pretrained("facebook/wav2vec2-conformer-rel-pos-large") - ... - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - >>> input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values # Batch size 1 - ... - >>> # compute masked indices - >>> batch_size, raw_sequence_length = input_values.shape - >>> sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length).item() - >>> mask_time_indices = _compute_mask_indices( - ... shape=(batch_size, sequence_length), mask_prob=0.2, mask_length=2 - ... ) - >>> sampled_negative_indices = _sample_negative_indices( - ... features_shape=(batch_size, sequence_length), - ... num_negatives=model.config.num_negatives, - ... mask_time_indices=mask_time_indices, - ... ) - >>> mask_time_indices = mindspore.Tensor(data=mask_time_indices, dtype=mindspore.int64) - >>> sampled_negative_indices = mindspore.Tensor( - ... data=sampled_negative_indices, dtype=mindspore.int64 - ... ) - ... - >>> with ops.no_grad(): - ... outputs = model(input_values, mask_time_indices=mask_time_indices) - ... 
- >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states) - >>> cosine_sim = ops.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1) - ... - >>> # show that cosine similarity is much higher than random - >>> cosine_sim[mask_time_indices.to(ops.bool)].mean() > 0.5 - tensor(True) - >>> # for contrastive loss training model should be put into train mode - >>> model = model.train() - >>> loss = model( - ... input_values, mask_time_indices=mask_time_indices, sampled_negative_indices=sampled_negative_indices - ... ).loss - ``` - """ + + ```python + >>> import torch + >>> from transformers import AutoFeatureExtractor, Wav2Vec2ConformerForPreTraining + >>> from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer import _compute_mask_indices, _sample_negative_indices + >>> from datasets import load_dataset + + >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-conformer-rel-pos-large") + >>> model = Wav2Vec2ConformerForPreTraining.from_pretrained("facebook/wav2vec2-conformer-rel-pos-large") + + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + >>> input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values # Batch size 1 + + >>> # compute masked indices + >>> batch_size, raw_sequence_length = input_values.shape + >>> sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length).item() + >>> mask_time_indices = _compute_mask_indices( + ... shape=(batch_size, sequence_length), mask_prob=0.2, mask_length=2 + ... ) + >>> sampled_negative_indices = _sample_negative_indices( + ... features_shape=(batch_size, sequence_length), + ... num_negatives=model.config.num_negatives, + ... mask_time_indices=mask_time_indices, + ... ) + >>> mask_time_indices = mindspore.tensor(data=mask_time_indices, dtype=mindspore.int64) + >>> sampled_negative_indices = mindspore.tensor( + ... data=sampled_negative_indices, dtype=mindspore.int64 + ... ) + + >>> with no_grad(): + ... outputs = model(input_values, mask_time_indices=mask_time_indices) + + >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states) + >>> cosine_sim = ops.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1) + + >>> # show that cosine similarity is much higher than random + >>> cosine_sim[mask_time_indices.to(mindspore.bool_)].mean() > 0.5 + tensor(True) + + >>> # for contrastive loss training model should be put into train mode + >>> model = model.train() + >>> loss = model( + ... input_values, mask_time_indices=mask_time_indices, sampled_negative_indices=sampled_negative_indices + ... ).loss + ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1522,16 +1449,14 @@ def forward( neg_is_pos = (quantized_features == negative_quantized_features).all(-1) if neg_is_pos.any(): - # NOTE: avoid loss NaN - # float("-inf") => finfo(logits.dtype, 'min') := -3.40282e+38 - logits[1:][neg_is_pos] = -3.40282e+35 + logits[1:][neg_is_pos] = float("-inf") # 6. 
compute contrastive loss \mathbf{L}_m = cross_entropy(logs) = # -log(exp(sim(c_t, q_t)/\kappa) / \sum_{\sim{q}} exp(sim(c_t, \sim{q})/\kappa)) logits = logits.swapaxes(0, 2).reshape(-1, logits.shape[0]) target = ((1 - mask_time_indices.long()) * -100).swapaxes(0, 1).flatten() - contrastive_loss = F.cross_entropy(logits.float(), target, reduction="sum") + contrastive_loss = nn.functional.cross_entropy(logits.float(), target, reduction="sum") # 7. compute diversity loss: \mathbf{L}_d num_codevectors = self.config.num_codevectors_per_group * self.config.num_codevector_groups diversity_loss = ((num_codevectors - codevector_perplexity) / num_codevectors) * mask_time_indices.sum() @@ -1555,13 +1480,14 @@ def forward( diversity_loss=diversity_loss, ) + class Wav2Vec2ConformerForCTC(Wav2Vec2ConformerPreTrainedModel): # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.__init__ with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer def __init__(self, config, target_lang: Optional[str] = None): super().__init__(config) self.wav2vec2_conformer = Wav2Vec2ConformerModel(config) - self.dropout = nn.Dropout(p = config.final_dropout) + self.dropout = nn.Dropout(config.final_dropout) self.target_lang = target_lang @@ -1580,39 +1506,6 @@ def __init__(self, config, target_lang: Optional[str] = None): # Initialize weights and apply final processing self.post_init() - #Copied from wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.freeze_feature_encoder with wav2vec2->wav2vec2_conformer - def tie_weights(self): - """ - This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when - passing `target_lang=...` to `from_pretrained(...)`. - - This method is **not** supposed to be called by the user and is prone to be changed in the future. - """ - # Note that `tie_weights` is usually used to tie input and output embedding weights. The method is re-purposed to - # correctly load adapter layers for Wav2Vec2 so that we do not have to introduce a new API to - # [`PreTrainedModel`]. While slightly hacky, Wav2Vec2 never has to tie input and output embeddings, so that it is - # ok to repurpose this function here. - target_lang = self.target_lang - - if target_lang is not None and getattr(self.config, "adapter_attn_dim", None) is None: - raise ValueError(f"Cannot pass `target_lang`: {target_lang} if `config.adapter_attn_dim` is not defined.") - elif target_lang is None and getattr(self.config, "adapter_attn_dim", None) is not None: - logger.info("By default `target_lang` is set to 'eng'.") - elif target_lang is not None: - self.load_adapter(target_lang) - - def freeze_feature_extractor(self): - """ - Calling this function will disable the gradient computation for the feature encoder so that its parameter will - not be updated during training. - """ - warnings.warn( - "The method `freeze_feature_extractor` is deprecated. " - "Please use the equivalent `freeze_feature_encoder` method instead.", - FutureWarning, - ) - self.freeze_feature_encoder() - # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.freeze_feature_encoder with wav2vec2->wav2vec2_conformer def freeze_feature_encoder(self): """ @@ -1621,14 +1514,6 @@ def freeze_feature_encoder(self): """ self.wav2vec2_conformer.feature_extractor._freeze_parameters() - def freeze_base_model(self): - """ - Calling this function will disable the gradient computation for the base model so that its parameters will not - be updated during training. 
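On the `float("-inf")` change in the contrastive loss above: when a sampled negative happens to be identical to the positive codevector, its logit is pushed to negative infinity so the softmax inside the cross-entropy assigns it exactly zero probability, rather than relying on the previous large-negative sentinel value. A tiny NumPy illustration of that effect (toy logits, not model outputs):

```python
import numpy as np

def softmax(x):
    x = x - x.max()
    e = np.exp(x)
    return e / e.sum()

# row 0 is the positive; rows 1.. are negatives; negative #2 collided with the positive
logits = np.array([4.0, 1.0, 4.0, 0.5])
logits[2] = -np.inf                 # drop the colliding negative from the softmax
probs = softmax(logits)

assert probs[2] == 0.0              # the duplicate contributes nothing to the loss
assert np.isclose(probs.sum(), 1.0)
```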
Only the classification head will be updated. - """ - for _, param in self.wav2vec2.parameters_and_names(): - param.requires_grad = False - # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.forward with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer def forward( self, @@ -1640,16 +1525,17 @@ def forward( labels: Optional[mindspore.Tensor] = None, ) -> Union[Tuple, CausalLMOutput]: r""" - Args: - labels (`mindspore.Tensor` of shape `(batch_size, target_length)`, *optional*): - Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to - the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. - All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., - config.vocab_size - 1]`. + labels (`mindspore.Tensor` of shape `(batch_size, target_length)`, *optional*): + Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to + the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. + All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., + config.vocab_size - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None and labels.max() >= self.config.vocab_size: + raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") + outputs = self.wav2vec2_conformer( input_values, attention_mask=attention_mask, @@ -1665,8 +1551,6 @@ def forward( loss = None if labels is not None: - if labels.max() >= self.config.vocab_size: - raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") # retrieve loss input_lengths from attention_mask attention_mask = ( attention_mask if attention_mask is not None else ops.ones_like(input_values, dtype=mindspore.int64) @@ -1680,11 +1564,11 @@ def forward( flattened_targets = labels.masked_select(labels_mask) # ctc_loss doesn't support fp16 - log_probs = F.log_softmax(logits, dim=-1).swapaxes(0, 1) + log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=mindspore.float32).swapaxes(0, 1) - loss = F.ctc_loss( + loss = nn.functional.ctc_loss( log_probs, - labels, # flattened_targets + labels, input_lengths, target_lengths, blank=self.config.pad_token_id, @@ -1713,7 +1597,7 @@ def __init__(self, config): self.wav2vec2_conformer = Wav2Vec2ConformerModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: - self.layer_weights = mindspore.Parameter(ops.ones(num_layers) / num_layers,'layer_weights') + self.layer_weights = nn.Parameter(ops.ones(num_layers) / num_layers) self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size) self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels) @@ -1733,7 +1617,7 @@ def freeze_base_model(self): Calling this function will disable the gradient computation for the base model so that its parameters will not be updated during training. Only the classification head will be updated. 
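A note on the CTC inputs assembled in the CTC forward above: `input_lengths` come from the downsampled attention mask, while `target_lengths` and the flattened targets follow the convention stated in the docstring that label positions set to -100 are ignored. A NumPy sketch of that target preparation, using made-up label values:

```python
import numpy as np

labels = np.array([
    [5, 2, 9, -100, -100],      # 3 real tokens, 2 padded positions
    [1, 1, -100, -100, -100],   # 2 real tokens
])
labels_mask = labels >= 0
target_lengths = labels_mask.sum(axis=-1)   # per-sample label lengths for the CTC loss
flattened_targets = labels[labels_mask]     # 1-D concatenation of all real labels

assert target_lengths.tolist() == [3, 2]
assert flattened_targets.tolist() == [5, 2, 9, 1, 1]
```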
""" - for _, param in self.wav2vec2_conformer.parameters_and_names(): + for param in self.wav2vec2_conformer.parameters(): param.requires_grad = False # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.forward with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer,WAV_2_VEC_2->WAV2VEC2_CONFORMER @@ -1747,11 +1631,10 @@ def forward( labels: Optional[mindspore.Tensor] = None, ) -> Union[Tuple, SequenceClassifierOutput]: r""" - Args: - labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1768,24 +1651,25 @@ def forward( if self.config.use_weighted_layer_sum: hidden_states = outputs[_HIDDEN_STATES_START_POSITION] hidden_states = ops.stack(hidden_states, dim=1) - norm_weights = ops.softmax(self.layer_weights, dim=-1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) else: hidden_states = outputs[0] hidden_states = self.projector(hidden_states) if attention_mask is None: - pooled_output = hidden_states.mean(axis=1) + pooled_output = ops.mean(hidden_states, dim=1) else: padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask) hidden_states[~padding_mask] = 0.0 - pooled_output = hidden_states.sum(axis=1) / padding_mask.sum(axis=1).view(-1, 1) + pooled_output = ops.sum(hidden_states, dim=1) / ops.sum(padding_mask, dim=1).view(-1, 1) logits = self.classifier(pooled_output) loss = None if labels is not None: - loss = F.cross_entropy(logits.view(-1, self.config.num_labels), labels.view(-1)) + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) if not return_dict: output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] @@ -1811,7 +1695,7 @@ def __init__(self, config): self.wav2vec2_conformer = Wav2Vec2ConformerModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: - self.layer_weights = mindspore.Parameter(ops.ones(num_layers) / num_layers,'layer_weights') + self.layer_weights = nn.Parameter(ops.ones(num_layers) / num_layers) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.num_labels = config.num_labels @@ -1845,11 +1729,10 @@ def forward( return_dict: Optional[bool] = None, ) -> Union[Tuple, TokenClassifierOutput]: r""" - Args: - labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1866,8 +1749,8 @@ def forward( if self.config.use_weighted_layer_sum: hidden_states = outputs[_HIDDEN_STATES_START_POSITION] hidden_states = ops.stack(hidden_states, dim=1) - norm_weights = ops.softmax(self.layer_weights, dim=-1) - hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(axis=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) else: hidden_states = outputs[0] @@ -1875,7 +1758,8 @@ def forward( loss = None if labels is not None: - loss = F.cross_entropy(logits.view(-1, self.num_labels), ops.argmax(labels.view(-1, self.num_labels), dim=1)) + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), ops.argmax(labels.view(-1, self.num_labels), dim=1)) if not return_dict: output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] @@ -1896,20 +1780,19 @@ def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4): self.scale = scale self.margin = margin self.num_labels = num_labels - # self.weight = mindspore.Parameter(ops.randn(input_dim, num_labels), 'weight').requires_grad = True - self.weight = Parameter(ops.randn(input_dim, num_labels), requires_grad=True) - #self.loss = F.cross_entropy() - + self.weight = nn.Parameter(ops.randn(input_dim, num_labels), requires_grad=True) + self.loss = nn.CrossEntropyLoss() def forward(self, hidden_states, labels): labels = labels.flatten() - weight = self.weight / ops.norm(self.weight, dim=0, keepdim=True) - hidden_states = hidden_states / ops.norm(hidden_states, dim=1, keepdim=True) + weight = nn.functional.normalize(self.weight, dim=0) + hidden_states = nn.functional.normalize(hidden_states, dim=1) cos_theta = ops.mm(hidden_states, weight) psi = cos_theta - self.margin - onehot = ops.one_hot(labels, self.num_labels) + + onehot = nn.functional.one_hot(labels, self.num_labels) logits = self.scale * ops.where(onehot.bool(), psi, cos_theta) - loss = F.cross_entropy(logits, labels) + loss = self.loss(logits, labels) return loss @@ -1927,17 +1810,18 @@ def __init__(self, config, layer_id=0): self.activation = nn.ReLU() def forward(self, hidden_states: mindspore.Tensor) -> mindspore.Tensor: - # if is_peft_available(): - # from peft.tuners.lora import LoraLayer - # if isinstance(self.kernel, LoraLayer): - # warnings.warn( - # "Detected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. " - # "You should exclude TDNNLayer from LoRA's target modules.", - # ) + from ....peft.tuners.lora import LoraLayer + + if isinstance(self.kernel, LoraLayer): + warnings.warn( + "Detected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. 
" + "You should exclude TDNNLayer from LoRA's target modules.", + ) + # for backward compatibility, we keep nn.Linear but call F.conv1d for speed up hidden_states = hidden_states.swapaxes(1, 2) weight = self.kernel.weight.view(self.out_conv_dim, self.kernel_size, self.in_conv_dim).swapaxes(1, 2) - hidden_states = ops.conv1d(hidden_states, weight, self.kernel.bias, dilation=self.dilation) + hidden_states = nn.functional.conv1d(hidden_states, weight, self.kernel.bias, dilation=self.dilation) hidden_states = hidden_states.swapaxes(1, 2) hidden_states = self.activation(hidden_states) @@ -1951,7 +1835,7 @@ def __init__(self, config): self.wav2vec2_conformer = Wav2Vec2ConformerModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: - self.layer_weights = mindspore.Parameter(ops.ones(num_layers) / num_layers) + self.layer_weights = nn.Parameter(ops.ones(num_layers) / num_layers) self.projector = nn.Linear(config.hidden_size, config.tdnn_dim[0]) tdnn_layers = [TDNNLayer(config, i) for i in range(len(config.tdnn_dim))] @@ -1978,7 +1862,7 @@ def freeze_base_model(self): Calling this function will disable the gradient computation for the base model so that its parameters will not be updated during training. Only the classification head will be updated. """ - for _, param in self.wav2vec2_conformer.parameters_and_names(): + for param in self.wav2vec2_conformer.parameters(): param.requires_grad = False # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForXVector._get_tdnn_output_lengths with wav2vec2->wav2vec2_conformer @@ -1989,7 +1873,6 @@ def _get_tdnn_output_lengths(self, input_lengths: Union[mindspore.Tensor, int]): def _conv_out_length(input_length, kernel_size, stride): # 1D convolutional layer output length formula taken - # from https://pyops.org/docs/stable/generated/ops.nn.Conv1d.html return (input_length - kernel_size) // stride + 1 for kernel_size in self.config.tdnn_kernel: @@ -2008,11 +1891,10 @@ def forward( labels: Optional[mindspore.Tensor] = None, ) -> Union[Tuple, XVectorOutput]: r""" - Args: - labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -2029,8 +1911,8 @@ def forward( if self.config.use_weighted_layer_sum: hidden_states = outputs[_HIDDEN_STATES_START_POSITION] hidden_states = ops.stack(hidden_states, dim=1) - norm_weights = ops.softmax(self.layer_weights, dim=-1) - hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(axis=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) else: hidden_states = outputs[0] @@ -2041,16 +1923,16 @@ def forward( # Statistic Pooling if attention_mask is None: - mean_features = hidden_states.mean(axis=1) - std_features = ops.std(hidden_states, dim=1, keepdim=True).squeeze(1) + mean_features = ops.mean(hidden_states, dim=1) + std_features = ops.std(hidden_states, dim=1) else: feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(dim=1)) tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths) mean_features = [] std_features = [] for i, length in enumerate(tdnn_output_lengths): - mean_features.append(hidden_states[i, :length].mean(dim=0)) - std_features.append(hidden_states[i, :length].std(dim=0)) + mean_features.append(ops.mean(hidden_states[i, :length], dim=0)) + std_features.append(ops.std(hidden_states[i, :length], dim=0)) mean_features = ops.stack(mean_features) std_features = ops.stack(std_features) statistic_pooling = ops.cat([mean_features, std_features], dim=-1) @@ -2060,7 +1942,6 @@ def forward( loss = None if labels is not None: - labels = labels.astype(mindspore.int32) loss = self.objective(logits, labels) if not return_dict: @@ -2074,3 +1955,13 @@ def forward( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + +__all__ = [ + "Wav2Vec2ConformerForAudioFrameClassification", + "Wav2Vec2ConformerForCTC", + "Wav2Vec2ConformerForPreTraining", + "Wav2Vec2ConformerForSequenceClassification", + "Wav2Vec2ConformerForXVector", + "Wav2Vec2ConformerModel", + "Wav2Vec2ConformerPreTrainedModel", +] diff --git a/mindnlp/transformers/models/wavlm/modeling_wavlm.py b/mindnlp/transformers/models/wavlm/modeling_wavlm.py index 33884f413..28d121aa7 100644 --- a/mindnlp/transformers/models/wavlm/modeling_wavlm.py +++ b/mindnlp/transformers/models/wavlm/modeling_wavlm.py @@ -16,17 +16,14 @@ import math import warnings -from typing import Optional, Tuple, Union, List +from typing import Optional, Tuple, Union import numpy as np import mindspore -from mindspore.common.initializer import initializer, Normal, TruncatedNormal, Uniform, HeNormal - -from mindnlp.core import nn, ops from mindnlp.core.nn import functional as F -from mindnlp.utils import logging +from mindnlp.core import nn, ops +from mindnlp.core.nn import CrossEntropyLoss -from .configuration_wavlm import WavLMConfig from ...activations import ACT2FN from ...modeling_outputs import ( BaseModelOutput, @@ -37,6 +34,11 @@ XVectorOutput, ) from ...modeling_utils import PreTrainedModel +from ....utils import ( + logging, +) +from .configuration_wavlm import WavLMConfig + logger = logging.get_logger(__name__) @@ -63,112 +65,6 @@ _XVECTOR_EXPECTED_OUTPUT = 0.97 -def _canonical_mask( - mask: Optional[mindspore.Tensor], - mask_name: str, - other_type: Optional[int], - other_name: str, - target_type: int, - check_other: bool = True, -) -> Optional[mindspore.Tensor]: - if mask is not None: - _mask_dtype = mask.dtype - _mask_is_float = ops.is_floating_point(mask) - if _mask_dtype != 
mindspore.bool_ and not _mask_is_float: - raise AssertionError( - f"only bool and floating types of {mask_name} are supported") - if check_other and other_type is not None: - if _mask_dtype != other_type: - warnings.warn( - f"Support for mismatched {mask_name} and {other_name} " - "is deprecated. Use same type for both instead." - ) - if not _mask_is_float: - zero_tensor = ops.zeros_like(mask, dtype=target_type) - mask = ops.where(mask, mindspore.Tensor(float("-inf"), target_type), zero_tensor) - # mask = ( - # ops.zeros_like(mask, dtype=target_type) - # .masked_fill_(mask, float("-inf")) - # ) - return mask - -def linear(x, weight, bias): - """inner linear""" - out = ops.matmul(x, weight.swapaxes(-1, -2)) - if bias is not None: - out = out + bias - return out - -def _none_or_dtype(input: Optional[mindspore.Tensor]) -> Optional[int]: - if input is None: - return None - elif isinstance(input, mindspore.Tensor): - return input.dtype - raise RuntimeError("input to _none_or_dtype() must be None or mindspore.Tensor") - -def _in_projection_packed( - q: mindspore.Tensor, - k: mindspore.Tensor, - v: mindspore.Tensor, - w: mindspore.Tensor, - b: Optional[mindspore.Tensor] = None, -) -> List[mindspore.Tensor]: - r"""Perform the in-projection step of the attention operation, using packed weights. - - Output is a triple containing projection tensors for query, key and value. - - Args: - q, k, v: query, key and value tensors to be projected. For self-attention, - these are typically the same tensor; for encoder-decoder attention, - k and v are typically the same tensor. (We take advantage of these - identities for performance if they are present.) Regardless, q, k and v - must share a common embedding dimension; otherwise their shapes may vary. - w: projection weights for q, k and v, packed into a single tensor. Weights - are packed along dimension 0, in q, k, v order. - b: optional projection biases for q, k and v, packed into a single tensor - in q, k, v order. - - Shape: - Inputs: - - q: :math:`(..., E)` where E is the embedding dimension - - k: :math:`(..., E)` where E is the embedding dimension - - v: :math:`(..., E)` where E is the embedding dimension - - w: :math:`(E * 3, E)` where E is the embedding dimension - - b: :math:`E * 3` where E is the embedding dimension - - Output: - - in output list :math:`[q', k', v']`, each output tensor will have the - same shape as the corresponding input tensor. 
- """ - E = q.size(-1) - if k is v: - if q is k: - # self-attention - proj = linear(q, w, b) - # reshape to 3, E and not E, 3 is deliberate for better memory coalescing and keeping same order as chunk() - proj = proj.unflatten(-1, (3, E)).unsqueeze(0).swapaxes(0, -2).squeeze(-2) - return proj[0], proj[1], proj[2] - else: - # encoder-decoder attention - w_q, w_kv = w.split([E, E * 2]) - if b is None: - b_q = b_kv = None - else: - b_q, b_kv = b.split([E, E * 2]) - q_proj = linear(q, w_q, b_q) - kv_proj = linear(k, w_kv, b_kv) - # reshape to 2, E and not E, 2 is deliberate for better memory coalescing and keeping same order as chunk() - kv_proj = kv_proj.unflatten(-1, (2, E)).unsqueeze(0).swapaxes(0, -2).squeeze(-2) - return (q_proj, kv_proj[0], kv_proj[1]) - else: - w_q, w_k, w_v = w.chunk(3) - if b is None: - b_q = b_k = b_v = None - else: - b_q, b_k, b_v = b.chunk(3) - return linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v) - - # Copied from transformers.models.wav2vec2.modeling_wav2vec2._compute_mask_indices def _compute_mask_indices( shape: Tuple[int, int], @@ -184,15 +80,15 @@ def _compute_mask_indices( Args: shape: The shape for which to compute masks. This should be of a tuple of size 2 where - the first element is the batch size and the second element is the length of the axis to span. + the first element is the batch size and the second element is the length of the axis to span. mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of - independently generated mask spans of length `mask_length` is computed by - `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the - actual percentage will be smaller. + independently generated mask spans of length `mask_length` is computed by + `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the + actual percentage will be smaller. mask_length: size of the mask min_masks: minimum number of masked spans attention_mask: A (right-padded) attention mask which independently shortens the feature axis of - each batch dimension. + each batch dimension. 
""" batch_size, sequence_length = shape @@ -225,7 +121,7 @@ def compute_num_masked_span(input_length): # compute number of masked spans in batch input_lengths = ( - ops.stop_gradient(attention_mask.sum(-1)).tolist() + attention_mask.sum(-1).tolist() if attention_mask is not None else [sequence_length for _ in range(batch_size)] ) @@ -325,15 +221,15 @@ def __init__(self, config, layer_id=0): stride=config.conv_stride[layer_id], bias=config.conv_bias, ) - self.layer_norm = nn.LayerNorm(self.out_conv_dim) + self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True) self.activation = ACT2FN[config.feat_extract_activation] def forward(self, hidden_states): hidden_states = self.conv(hidden_states) - hidden_states = hidden_states.swapaxes(-2, -1) + hidden_states = ops.transpose(hidden_states, -2, -1) hidden_states = self.layer_norm(hidden_states) - hidden_states = hidden_states.swapaxes(-2, -1) + hidden_states = ops.transpose(hidden_states, -2, -1) hidden_states = self.activation(hidden_states) return hidden_states @@ -374,22 +270,22 @@ def __init__(self, config): kernel_size=config.num_conv_pos_embeddings, padding=config.num_conv_pos_embeddings // 2, groups=config.num_conv_pos_embedding_groups, - bias=True ) - self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2) + weight_norm = nn.utils.weight_norm + self.conv = weight_norm(self.conv, name="weight", dim=2) self.padding = WavLMSamePadLayer(config.num_conv_pos_embeddings) self.activation = ACT2FN[config.feat_extract_activation] def forward(self, hidden_states): - hidden_states = hidden_states.swapaxes(1, 2) + hidden_states = ops.transpose(hidden_states, 1, 2) hidden_states = self.conv(hidden_states) hidden_states = self.padding(hidden_states) hidden_states = self.activation(hidden_states) - hidden_states = hidden_states.swapaxes(1, 2) + hidden_states = ops.transpose(hidden_states, 1, 2) return hidden_states @@ -427,7 +323,7 @@ def __init__(self, config): self._requires_grad = True def _freeze_parameters(self): - for param in self.get_parameters(): + for param in self.parameters(): param.requires_grad = False self._requires_grad = False @@ -467,7 +363,7 @@ def __init__(self, config): super().__init__() self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps) self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size) - self.dropout = nn.Dropout(p=config.feat_proj_dropout) + self.dropout = nn.Dropout(config.feat_proj_dropout) def forward(self, hidden_states): # non-projected hidden states are needed for quantization @@ -510,7 +406,7 @@ def __init__( self.num_buckets = num_buckets self.max_distance = max_distance - self.gru_rel_pos_const = mindspore.Parameter(ops.ones(1, self.num_heads, 1, 1)) + self.gru_rel_pos_const = nn.Parameter(ops.ones(1, self.num_heads, 1, 1)) self.gru_rel_pos_linear = nn.Linear(self.head_dim, 8) if has_relative_position_bias: @@ -531,7 +427,7 @@ def forward( if position_bias is None: position_bias = self.compute_bias(tgt_len, tgt_len) position_bias = ( - position_bias.unsqueeze(0).repeat(bsz, 1, 1, 1).view(bsz * self.num_heads, tgt_len, tgt_len) + position_bias.unsqueeze(0).tile((bsz, 1, 1, 1)).view(bsz * self.num_heads, tgt_len, tgt_len) ) # Compute relative position bias: @@ -544,7 +440,7 @@ def forward( relative_position_proj = relative_position_proj.view(gated_hidden_states.shape[:-1] + (2, 4)).sum(-1) # 3) compute gate for position bias from projected hidden states - gate_a, gate_b = ops.sigmoid(relative_position_proj).chunk(2, axis=-1) + gate_a, gate_b = 
ops.chunk(ops.sigmoid(relative_position_proj), 2, dim=-1) gate_output = gate_a * (gate_b * self.gru_rel_pos_const - 1.0) + 2.0 # 4) apply gate to position bias to compute gated position_bias @@ -555,7 +451,6 @@ def forward( hidden_states, attention_mask, gated_position_bias, output_attentions ) - return attn_output, attn_weights, position_bias def torch_multi_head_self_attention( @@ -567,14 +462,13 @@ def torch_multi_head_self_attention( ) -> (mindspore.Tensor, mindspore.Tensor): """simple wrapper around torch's multi_head_attention_forward function""" # self-attention assumes q = k = v - query = key = value = hidden_states.swapaxes(0, 1) + query = key = value = ops.transpose(hidden_states, 0, 1) key_padding_mask = attention_mask.ne(1) if attention_mask is not None else None # disable bias and add_zero_attn bias_k = bias_v = None add_zero_attn = False - # PyTorch 1.3.0 has F.multi_head_attention_forward defined # so no problem with backwards compatibility attn_output, attn_weights = F.multi_head_attention_forward( @@ -583,7 +477,7 @@ def torch_multi_head_self_attention( value, self.embed_dim, self.num_heads, - ops.zeros([0]), + ops.empty([0]), ops.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)), bias_k, bias_v, @@ -593,8 +487,8 @@ def torch_multi_head_self_attention( self.out_proj.bias, self.training, key_padding_mask, - # attention_mask, - attn_mask=gated_position_bias, + output_attentions, + gated_position_bias, use_separate_proj_weight=True, q_proj_weight=self.q_proj.weight, k_proj_weight=self.k_proj.weight, @@ -602,7 +496,7 @@ def torch_multi_head_self_attention( ) # [Seq_Len, Batch Size, ...] -> [Batch Size, Seq_Len, ...] - attn_output = attn_output.swapaxes(0, 1) + attn_output = ops.transpose(attn_output, 0, 1) if attn_weights is not None: # IMPORTANT: Attention weights are averaged weights @@ -626,7 +520,7 @@ def compute_bias(self, query_length: int, key_length: int) -> mindspore.Tensor: def _relative_positions_bucket(self, relative_positions: mindspore.Tensor) -> mindspore.Tensor: num_buckets = self.num_buckets // 2 - relative_buckets = (relative_positions > 0).astype(mindspore.int64) * num_buckets + relative_buckets = (relative_positions > 0).to(mindspore.int64) * num_buckets relative_positions = ops.abs(relative_positions) max_exact = num_buckets // 2 @@ -635,17 +529,11 @@ def _relative_positions_bucket(self, relative_positions: mindspore.Tensor) -> mi relative_positions_if_large = ops.log(relative_positions.float() / max_exact) relative_positions_if_large = relative_positions_if_large / math.log(self.max_distance / max_exact) relative_positions_if_large = relative_positions_if_large * (num_buckets - max_exact) - relative_position_if_large = (max_exact + relative_positions_if_large).astype(mindspore.int64) - # relative_position_if_large = ops.min( - # relative_position_if_large, ops.full_like(relative_position_if_large, num_buckets - 1) - # ) - relative_position_if_large = ops.where( - relative_position_if_large < ops.full_like(relative_position_if_large, num_buckets - 1), - relative_position_if_large, - ops.full_like(relative_position_if_large, num_buckets - 1) + relative_position_if_large = (max_exact + relative_positions_if_large).to(mindspore.int64) + relative_position_if_large = ops.minimum( + relative_position_if_large, ops.full_like(relative_position_if_large, num_buckets - 1) ) - relative_buckets += ops.where(is_small, relative_positions, relative_position_if_large) return relative_buckets @@ -654,7 +542,7 @@ def _relative_positions_bucket(self, 
relative_positions: mindspore.Tensor) -> mi class WavLMFeedForward(nn.Module): def __init__(self, config): super().__init__() - self.intermediate_dropout = nn.Dropout(p=config.activation_dropout) + self.intermediate_dropout = nn.Dropout(config.activation_dropout) self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size) if isinstance(config.hidden_act, str): @@ -663,7 +551,7 @@ def __init__(self, config): self.intermediate_act_fn = config.hidden_act self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.output_dropout = nn.Dropout(p=config.hidden_dropout) + self.output_dropout = nn.Dropout(config.hidden_dropout) def forward(self, hidden_states): hidden_states = self.intermediate_dense(hidden_states) @@ -686,7 +574,7 @@ def __init__(self, config: WavLMConfig, has_relative_position_bias: bool = True) max_distance=config.max_bucket_distance, has_relative_position_bias=has_relative_position_bias, ) - self.dropout = nn.Dropout(p=config.hidden_dropout) + self.dropout = nn.Dropout(config.hidden_dropout) self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.feed_forward = WavLMFeedForward(config) self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -700,8 +588,6 @@ def forward(self, hidden_states, attention_mask=None, position_bias=None, output output_attentions=output_attentions, index=index, ) - - hidden_states = self.dropout(hidden_states) hidden_states = attn_residual + hidden_states @@ -715,9 +601,6 @@ def forward(self, hidden_states, attention_mask=None, position_bias=None, output if output_attentions: outputs += (attn_weights,) - - - return outputs @@ -732,7 +615,7 @@ def __init__(self, config: WavLMConfig, has_relative_position_bias: bool = True) max_distance=config.max_bucket_distance, has_relative_position_bias=has_relative_position_bias, ) - self.dropout = nn.Dropout(p=config.hidden_dropout) + self.dropout = nn.Dropout(config.hidden_dropout) self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.feed_forward = WavLMFeedForward(config) self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -764,7 +647,7 @@ def __init__(self, config): self.config = config self.pos_conv_embed = WavLMPositionalConvEmbedding(config) self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(p=config.hidden_dropout) + self.dropout = nn.Dropout(config.hidden_dropout) self.layers = nn.ModuleList( [WavLMEncoderLayer(config, has_relative_position_bias=(i == 0)) for i in range(config.num_hidden_layers)] ) @@ -845,7 +728,7 @@ def __init__(self, config): self.config = config self.pos_conv_embed = WavLMPositionalConvEmbedding(config) self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(p=config.hidden_dropout) + self.dropout = nn.Dropout(config.hidden_dropout) self.layers = nn.ModuleList( [ WavLMEncoderLayerStableLayerNorm(config, has_relative_position_bias=(i == 0)) @@ -938,7 +821,7 @@ def __init__(self, config): ) # storage for codebook variables (codewords) - self.codevectors = mindspore.Parameter( + self.codevectors = nn.Parameter( mindspore.Tensor(1, self.num_groups * self.num_vars, config.codevector_dim // self.num_groups) ) self.weight_proj = nn.Linear(config.conv_dim[-1], self.num_groups * self.num_vars) @@ -948,7 +831,7 @@ def __init__(self, config): @staticmethod def _compute_perplexity(probs): - marginal_probs = probs.mean(axis=0) + 
marginal_probs = probs.mean(dim=0) perplexity = ops.exp(-ops.sum(marginal_probs * ops.log(marginal_probs + 1e-7), dim=-1)).sum() return perplexity @@ -961,7 +844,7 @@ def forward(self, hidden_states): if self.training: # sample code vector probs via gumbel in differentiateable way - codevector_probs = ops.gumbel_softmax(hidden_states.float(), tau=self.temperature, hard=True) + codevector_probs = nn.functional.gumbel_softmax(hidden_states.float(), tau=self.temperature, hard=True) codevector_probs = codevector_probs.type_as(hidden_states) # compute perplexity @@ -1010,14 +893,14 @@ def forward(self, hidden_states): hidden_states = self.proj(hidden_states) hidden_states = self.proj_layer_norm(hidden_states) - hidden_states = hidden_states.swapaxes(1, 2) + hidden_states = ops.transpose(hidden_states, 1, 2) for layer in self.layers: layerdrop_prob = np.random.random() if not self.training or (layerdrop_prob > self.layerdrop): hidden_states = layer(hidden_states) - hidden_states = hidden_states.swapaxes(1, 2) + hidden_states = ops.transpose(hidden_states, 1, 2) return hidden_states @@ -1035,7 +918,7 @@ def __init__(self, config): def forward(self, hidden_states): hidden_states = self.conv(hidden_states) - hidden_states = F.glu(hidden_states, dim=1) + hidden_states = nn.functional.glu(hidden_states, dim=1) return hidden_states @@ -1051,69 +934,38 @@ class WavLMPreTrainedModel(PreTrainedModel): main_input_name = "input_values" supports_gradient_checkpointing = True - def _init_weights(self, cell): + def _init_weights(self, module): """Initialize the weights""" # gumbel softmax requires special init - if isinstance(cell, WavLMGumbelVectorQuantizer): - # module.weight_proj.weight.data.normal_(mean=0.0, std=1) - # module.weight_proj.bias.data.zero_() - # nn.init.uniform_(module.codevectors) - cell.weight_proj.weight.set_data(initializer(Normal(1), - cell.weight.shape, cell.weight.dtype)) - cell.weight_proj.bias.set_data(initializer('zeros', cell.bias.shape, cell.bias.dtype)) - cell.codevectors.set_data(initializer(TruncatedNormal(sigma=0.2, mean=0.5, a=-2.5, b=2.5), - cell.codevectors.shape, cell.codevectors.dtype)) - - elif isinstance(cell, WavLMPositionalConvEmbedding): - # nn.init.normal_( - # module.conv.weight, - # mean=0, - # std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)), - # ) - # nn.init.constant_(module.conv.bias, 0) - cell.conv.weight.set_data(initializer(Normal(2 * math.sqrt(1 / (cell.conv.kernel_size[0] * cell.conv.in_channels))), - cell.conv.weight.shape, cell.conv.weight.dtype)) - cell.conv.bias.set_data(initializer('zeros', cell.conv.bias.shape, cell.conv.bias.dtype)) - - - - elif isinstance(cell, WavLMFeatureProjection): - # k = math.sqrt(1 / module.projection.in_features) - # nn.init.uniform_(module.projection.weight, a=-k, b=k) - # nn.init.uniform_(module.projection.bias, a=-k, b=k) - k = math.sqrt(1 / cell.projection.in_channels) - cell.projection.weight.set_data(initializer(Uniform(scale=k), - cell.projection.weight.shape, cell.projection.weight.dtype)) - cell.projection.bias.set_data(initializer(Uniform(scale=k), - cell.projection.bias.shape, cell.projection.bias.dtype)) - - elif isinstance(cell, nn.Linear): - # module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - # - # if module.bias is not None: - # module.bias.data.zero_() - cell.weight.set_data(initializer(Normal(self.config.initializer_range), - cell.weight.shape, cell.weight.dtype)) - if cell.bias is not None: - cell.bias.set_data(initializer('zeros', cell.bias.shape, 
cell.bias.dtype)) - elif isinstance(cell, (nn.LayerNorm, nn.GroupNorm)): - # module.bias.data.zero_() - # module.weight.data.fill_(1.0) - cell.weight.set_data(initializer('ones', cell.weight.shape, cell.weight.dtype)) - cell.bias.set_data(initializer('zeros', cell.bias.shape, cell.bias.dtype)) - elif isinstance(cell, nn.Conv1d): - # nn.init.kaiming_normal_(module.weight) - # - # if module.bias is not None: - # k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0])) - # nn.init.uniform_(module.bias, a=-k, b=k) - cell.weight.set_data( - initializer(HeNormal(),cell.weight.shape, cell.weight.dtype)) - if cell.bias is not None: - k = math.sqrt(cell.group / (cell.in_channels * cell.kernel_size[0])) - cell.bias.set_data(initializer(Uniform(scale=k), - cell.bias.shape, cell.bias.dtype)) - + if isinstance(module, WavLMGumbelVectorQuantizer): + nn.init.normal_(module.weight_proj.weight, mean=0.0, std=1) + nn.init.zeros_(module.weight_proj.bias) + nn.init.uniform_(module.codevectors) + elif isinstance(module, WavLMPositionalConvEmbedding): + nn.init.normal_( + module.conv.weight, + mean=0, + std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)), + ) + nn.init.constant_(module.conv.bias, 0) + elif isinstance(module, WavLMFeatureProjection): + k = math.sqrt(1 / module.projection.in_features) + nn.init.uniform_(module.projection.weight, a=-k, b=k) + nn.init.uniform_(module.projection.bias, a=-k, b=k) + elif isinstance(module, nn.Linear): + nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range) + + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)): + nn.init.zeros_(module.bias) + nn.init.ones_(module.weight) + elif isinstance(module, nn.Conv1d): + nn.init.kaiming_normal_(module.weight) + + if module.bias is not None: + k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0])) + nn.init.uniform_(module.bias, a=-k, b=k) def _get_feat_extract_output_lengths( self, input_lengths: Union[mindspore.Tensor, int], add_adapter: Optional[bool] = None @@ -1126,7 +978,6 @@ def _get_feat_extract_output_lengths( def _conv_out_length(input_length, kernel_size, stride): # 1D convolutional layer output length formula taken - # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html return ops.div(input_length - kernel_size, stride, rounding_mode="floor") + 1 for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride): @@ -1143,10 +994,10 @@ def _get_feature_vector_attention_mask( ): # Effectively attention_mask.sum(-1), but not inplace to be able to run # on inference mode. 
- non_padded_lengths = attention_mask.cumsum(axis=-1)[:, -1] + non_padded_lengths = ops.cumsum(attention_mask, dim=-1)[:, -1] output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter) - output_lengths = output_lengths.astype(mindspore.int64) + output_lengths = output_lengths.to(mindspore.int64) batch_size = attention_mask.shape[0] @@ -1155,66 +1006,10 @@ def _get_feature_vector_attention_mask( ) # these two operations makes sure that all values before the output lengths idxs are attended to attention_mask[(ops.arange(attention_mask.shape[0]), output_lengths - 1)] = 1 - attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool() + attention_mask = ops.cumsum(attention_mask.flip([-1]), -1).flip([-1]).bool() return attention_mask -WAVLM_START_DOCSTRING = r""" - WavLM was proposed in [WavLM: Unified Speech Representation Learning with Labeled and Unlabeled - Data](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo - Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, - Jian Wu, Michael Zeng, Xiangzhan Yu, Furu Wei. - - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving etc.). - - This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use - it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and - behavior. - - Parameters: - config ([`WavLMConfig`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -WAVLM_INPUTS_DOCSTRING = r""" - Args: - input_values (`mindspore.Tensor` of shape `(batch_size, sequence_length)`): - Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `mindspore.Tensor`. See [`Wav2Vec2Processor.__call__`] for details. - attention_mask (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - - - `attention_mask` should only be passed if the corresponding processor has `config.return_attention_mask == - True`. For all models whose processor has `config.return_attention_mask == False`, `attention_mask` should - **not** be passed to avoid degraded performance when doing batched inference. For such models - `input_values` should simply be padded with 0 and passed without `attention_mask`. Be aware that these - models also yield slightly different results depending on whether `input_values` is padded or not. - - - - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. 
- output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model with Wav2Vec2->WavLM, wav2vec2->wavlm, WAV_2_VEC_2->WAVLM, WavLMBaseModelOutput->Wav2Vec2BaseModelOutput class WavLMModel(WavLMPreTrainedModel): def __init__(self, config: WavLMConfig): @@ -1225,99 +1020,7 @@ def __init__(self, config: WavLMConfig): # model only needs masking vector if mask prob is > 0.0 if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0: - - self.masked_spec_embed_fixed = mindspore.Tensor([0.6690, 0.8174, 0.0483, 0.8542, 0.5385, 0.7270, 0.8509, 0.7227, 0.4435, - 0.9075, 0.5943, 0.5755, 0.2277, 0.5103, 0.1635, 0.6906, 0.3977, 0.9756, - 0.0362, 0.9023, 0.3385, 0.1798, 0.5457, 0.9846, 0.8872, 0.7534, 0.7174, - 0.9129, 0.0361, 0.5914, 0.6458, 0.0551, 0.4543, 0.2475, 0.5665, 0.5622, - 0.7827, 0.2933, 0.4264, 0.2142, 0.8809, 0.7395, 0.8117, 0.8880, 0.9114, - 0.7873, 0.1974, 0.5749, 0.2186, 0.7509, 0.9451, 0.5604, 0.4548, 0.3830, - 0.8748, 0.0481, 0.7892, 0.6930, 0.6757, 0.3346, 0.5754, 0.0830, 0.3630, - 0.3927, 0.4438, 0.3057, 0.2056, 0.6541, 0.8959, 0.3882, 0.3742, 0.6756, - 0.2212, 0.4545, 0.4845, 0.5233, 0.9661, 0.8705, 0.0297, 0.2031, 0.9059, - 0.2570, 0.3765, 0.6301, 0.2756, 0.4591, 0.2101, 0.5576, 0.1532, 0.3753, - 0.6413, 0.1778, 0.5639, 0.7753, 0.4551, 0.7990, 0.1866, 0.0881, 0.5993, - 0.0529, 0.9180, 0.4496, 0.7429, 0.7545, 0.8755, 0.8374, 0.0907, 0.7265, - 0.7455, 0.0652, 0.0794, 0.3860, 0.9730, 0.7865, 0.8821, 0.2630, 0.2690, - 0.6491, 0.0887, 0.4657, 0.8514, 0.0096, 0.6633, 0.7675, 0.9290, 0.9126, - 0.0885, 0.7826, 0.8512, 0.6113, 0.7821, 0.0923, 0.9687, 0.3606, 0.7457, - 0.3216, 0.4239, 0.0411, 0.1968, 0.6589, 0.9997, 0.6803, 0.3238, 0.0318, - 0.3006, 0.0840, 0.3048, 0.7558, 0.5318, 0.0110, 0.6965, 0.9264, 0.8576, - 0.8286, 0.7549, 0.3492, 0.6382, 0.4695, 0.6429, 0.8461, 0.4037, 0.6143, - 0.6750, 0.0130, 0.5454, 0.8819, 0.7204, 0.8509, 0.5713, 0.3463, 0.3251, - 0.1364, 0.9822, 0.1932, 0.4651, 0.8423, 0.0824, 0.0385, 0.6319, 0.4540, - 0.9898, 0.0858, 0.2168, 0.8091, 0.2082, 0.0317, 0.5799, 0.8108, 0.2224, - 0.1679, 0.2297, 0.1149, 0.6511, 0.8530, 0.2673, 0.2593, 0.1479, 0.6914, - 0.1220, 0.2791, 0.2264, 0.3477, 0.0301, 0.4977, 0.9622, 0.9822, 0.1609, - 0.9212, 0.2130, 0.7508, 0.9012, 0.8798, 0.9235, 0.2774, 0.1695, 0.1931, - 0.6583, 0.8880, 0.1824, 0.5290, 0.8476, 0.5914, 0.2393, 0.2043, 0.5509, - 0.4092, 0.5522, 0.1584, 0.1846, 0.5055, 0.3038, 0.2121, 0.1347, 0.8977, - 0.4759, 0.3980, 0.1729, 0.5186, 0.3864, 0.1076, 0.7897, 0.5062, 0.6262, - 0.3445, 0.7281, 0.5154, 0.1098, 0.8532, 0.8998, 0.1109, 0.1660, 0.2890, - 0.3983, 0.9154, 0.2710, 0.6147, 0.1245, 0.2494, 0.1251, 0.6717, 0.4353, - 0.8889, 0.4446, 0.2871, 0.5897, 0.8086, 0.4644, 0.5078, 0.5242, 0.4318, - 0.9208, 0.2187, 0.1061, 0.2322, 0.9779, 0.1891, 0.5374, 0.8748, 0.2969, - 0.9084, 0.4123, 0.2679, 0.1227, 0.2493, 0.0069, 0.4302, 0.7309, 0.6150, - 0.8707, 0.9405, 0.0665, 0.0617, 0.4912, 0.8631, 0.3454, 0.5959, 0.4082, - 0.5628, 0.1539, 0.4820, 0.2230, 0.7901, 0.9863, 0.3853, 0.6251, 0.0294, - 0.5922, 0.4190, 0.1238, 0.9131, 0.7443, 0.7243, 0.2333, 0.5575, 0.9056, - 0.6038, 0.6373, 0.3231, 0.1106, 0.7115, 0.0738, 0.1821, 0.5646, 0.6631, - 0.9203, 0.3644, 0.8854, 0.7089, 0.9513, 0.6969, 0.6221, 0.9998, 0.3835, - 0.1778, 0.8368, 0.4535, 
0.0226, 0.7247, 0.3746, 0.3204, 0.0739, 0.5398, - 0.9403, 0.6918, 0.7779, 0.1451, 0.2665, 0.2724, 0.9406, 0.7556, 0.4615, - 0.9865, 0.9019, 0.4024, 0.0430, 0.5586, 0.0194, 0.4044, 0.8839, 0.6115, - 0.9678, 0.0424, 0.1750, 0.1324, 0.3528, 0.0426, 0.4412, 0.0817, 0.5239, - 0.1943, 0.2168, 0.1862, 0.1268, 0.9675, 0.7493, 0.9916, 0.0120, 0.6652, - 0.3382, 0.1434, 0.0340, 0.5746, 0.2504, 0.6652, 0.4948, 0.9776, 0.8149, - 0.8904, 0.6182, 0.5081, 0.9500, 0.6186, 0.7949, 0.9912, 0.0316, 0.5226, - 0.6809, 0.6388, 0.8631, 0.3738, 0.3314, 0.0405, 0.1620, 0.3713, 0.8028, - 0.9732, 0.9597, 0.3242, 0.2495, 0.2347, 0.2002, 0.5536, 0.1284, 0.7263, - 0.5329, 0.3998, 0.5114, 0.9307, 0.3562, 0.7596, 0.7474, 0.5452, 0.6765, - 0.9079, 0.6698, 0.3373, 0.7954, 0.8829, 0.8574, 0.2378, 0.5754, 0.4218, - 0.4776, 0.6210, 0.0870, 0.7172, 0.4000, 0.7223, 0.3835, 0.0187, 0.6055, - 0.2987, 0.1763, 0.9496, 0.0019, 0.6128, 0.2233, 0.6464, 0.6703, 0.3060, - 0.5027, 0.5011, 0.1066, 0.9224, 0.6772, 0.1122, 0.4799, 0.0956, 0.6784, - 0.2987, 0.4378, 0.8626, 0.1457, 0.8810, 0.2955, 0.3982, 0.9872, 0.2424, - 0.4985, 0.9825, 0.8322, 0.6646, 0.5974, 0.9266, 0.7363, 0.8470, 0.3441, - 0.6455, 0.0959, 0.3900, 0.0110, 0.5135, 0.7431, 0.9956, 0.4753, 0.2459, - 0.1745, 0.4280, 0.3137, 0.5803, 0.8807, 0.0013, 0.2719, 0.2735, 0.0174, - 0.5792, 0.2755, 0.7145, 0.6616, 0.7531, 0.0317, 0.1691, 0.2877, 0.9014, - 0.3965, 0.5576, 0.0569, 0.0952, 0.7354, 0.6605, 0.4193, 0.0895, 0.3981, - 0.5928, 0.1463, 0.7944, 0.8587, 0.8905, 0.5828, 0.8698, 0.0869, 0.5440, - 0.0108, 0.9643, 0.2618, 0.0239, 0.5285, 0.9577, 0.5655, 0.6379, 0.2955, - 0.6893, 0.6071, 0.1768, 0.3647, 0.6052, 0.7924, 0.8311, 0.4018, 0.4684, - 0.7488, 0.9257, 0.1174, 0.9175, 0.2108, 0.7104, 0.0650, 0.9683, 0.1456, - 0.3139, 0.9895, 0.4817, 0.3550, 0.3194, 0.2714, 0.3304, 0.3714, 0.6225, - 0.5636, 0.6906, 0.1564, 0.2612, 0.8385, 0.2389, 0.6572, 0.1156, 0.5804, - 0.3947, 0.0016, 0.2312, 0.0136, 0.2436, 0.7072, 0.4118, 0.6912, 0.1629, - 0.0368, 0.5640, 0.7028, 0.0881, 0.9698, 0.7337, 0.0634, 0.7968, 0.0754, - 0.6724, 0.2065, 0.7023, 0.1979, 0.4276, 0.3267, 0.3916, 0.9641, 0.5335, - 0.3355, 0.5741, 0.9364, 0.7964, 0.2325, 0.4632, 0.0586, 0.4343, 0.9153, - 0.3367, 0.3897, 0.8585, 0.4316, 0.3008, 0.4461, 0.3888, 0.4275, 0.2071, - 0.7893, 0.7605, 0.4429, 0.1573, 0.0303, 0.7489, 0.9437, 0.2839, 0.2179, - 0.3195, 0.4809, 0.1952, 0.8383, 0.0198, 0.8895, 0.4406, 0.9321, 0.5931, - 0.3670, 0.9503, 0.5326, 0.9467, 0.2632, 0.4534, 0.7885, 0.7485, 0.9038, - 0.5202, 0.4448, 0.6610, 0.1788, 0.2415, 0.0186, 0.3090, 0.3962, 0.7363, - 0.5319, 0.0024, 0.5918, 0.0702, 0.3051, 0.3310, 0.6551, 0.7465, 0.2650, - 0.3644, 0.8870, 0.9065, 0.9198, 0.6367, 0.5113, 0.1910, 0.8260, 0.4486, - 0.8939, 0.9591, 0.0051, 0.9798, 0.6846, 0.9752, 0.6470, 0.2136, 0.8094, - 0.1351, 0.6637, 0.1317, 0.5875, 0.3815, 0.3004, 0.5598, 0.2138, 0.2395, - 0.7725, 0.4870, 0.2897, 0.5427, 0.7458, 0.4651, 0.7445, 0.5091, 0.5224, - 0.1761, 0.3968, 0.8253, 0.0378, 0.1911, 0.2917, 0.8945, 0.5533, 0.9208, - 0.9452, 0.5043, 0.4790, 0.6593, 0.4681, 0.5305, 0.2849, 0.7655, 0.8555, - 0.2354, 0.5224, 0.2482, 0.6614, 0.4972, 0.8426, 0.3883, 0.1001, 0.4299, - 0.6966, 0.4446, 0.9288, 0.4683, 0.0273, 0.1940, 0.8093, 0.3530, 0.8765, - 0.8774, 0.7397, 0.6672, 0.8504, 0.9556, 0.9929, 0.3112, 0.7945, 0.2682, - 0.4824, 0.1706, 0.8585, 0.9539, 0.1334, 0.0866, 0.8030, 0.8256, 0.1504, - 0.0553, 0.5819, 0.3482, 0.9587, 0.3867, 0.5643, 0.7611, 0.5880, 0.2536, - 0.6834, 0.3636, 0.3593, 0.1886, 0.2166, 0.0668, 0.8122, 0.2461, 0.5877, - 0.0802, 0.4127, 0.1399]) - 
if config.hidden_size >= 768: - self.masked_spec_embed=self.masked_spec_embed_fixed - else: - self.masked_spec_embed = ops.abs(mindspore.Tensor(shape=(config.hidden_size), dtype=mindspore.float32, init=Uniform(1.0))) - - + self.masked_spec_embed = nn.Parameter(ops.randn(config.hidden_size)) if config.do_stable_layer_norm: self.encoder = WavLMEncoderStableLayerNorm(config) @@ -1335,7 +1038,7 @@ def freeze_feature_extractor(self): not be updated during training. """ warnings.warn( - "The method `freeze_feature_extractor` is deprecated. " + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " "Please use the equivalent `freeze_feature_encoder` method instead.", FutureWarning, ) @@ -1368,7 +1071,7 @@ def _mask_hidden_states( if mask_time_indices is not None: # apply SpecAugment along time axis with given mask_time_indices - hidden_states[mask_time_indices] = self.masked_spec_embed.astype(hidden_states.dtype) + hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) elif self.config.mask_time_prob > 0 and self.training: mask_time_indices = _compute_mask_indices( (batch_size, sequence_length), @@ -1377,10 +1080,10 @@ def _mask_hidden_states( attention_mask=attention_mask, min_masks=self.config.mask_time_min_masks, ) - mask_time_indices = mindspore.Tensor(mask_time_indices, dtype=mindspore.bool_) - hidden_states[mask_time_indices] = self.masked_spec_embed.astype(hidden_states.dtype) + mask_time_indices = mindspore.tensor(mask_time_indices, dtype=mindspore.bool_) + hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) - if self.config.mask_feature_prob > 0: + if self.config.mask_feature_prob > 0 and self.training: # generate indices & apply SpecAugment along feature axis mask_feature_indices = _compute_mask_indices( (batch_size, hidden_size), @@ -1388,11 +1091,10 @@ def _mask_hidden_states( mask_length=self.config.mask_feature_length, min_masks=self.config.mask_feature_min_masks, ) - mask_feature_indices = mindspore.Tensor(mask_feature_indices, dtype=mindspore.bool_) - mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) + mask_feature_indices = mindspore.tensor(mask_feature_indices, dtype=mindspore.bool_) + mask_feature_indices = mask_feature_indices[:, None].broadcast_to((-1, sequence_length, -1)) hidden_states[mask_feature_indices] = 0 - return hidden_states def forward( @@ -1411,7 +1113,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict extract_features = self.feature_extractor(input_values) - extract_features = extract_features.swapaxes(1, 2) + extract_features = ops.transpose(extract_features, 1, 2) if attention_mask is not None: # compute reduced attention_mask corresponding to feature vectors @@ -1424,8 +1126,6 @@ def forward( hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask ) - - encoder_outputs = self.encoder( hidden_states, attention_mask=attention_mask, @@ -1436,7 +1136,6 @@ def forward( hidden_states = encoder_outputs[0] - if self.adapter is not None: hidden_states = self.adapter(hidden_states) @@ -1457,7 +1156,7 @@ def __init__(self, config, target_lang: Optional[str] = None): super().__init__(config) self.wavlm = WavLMModel(config) - self.dropout = nn.Dropout(p=config.final_dropout) + self.dropout = nn.Dropout(config.final_dropout) self.target_lang = target_lang @@ -1503,7 +1202,7 @@ def freeze_feature_extractor(self): not be updated during training. 
""" warnings.warn( - "The method `freeze_feature_extractor` is deprecated. " + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " "Please use the equivalent `freeze_feature_encoder` method instead.", FutureWarning, ) @@ -1521,7 +1220,7 @@ def freeze_base_model(self): Calling this function will disable the gradient computation for the base model so that its parameters will not be updated during training. Only the classification head will be updated. """ - for param in self.wavlm.get_parameters(): + for param in self.wavlm.parameters(): param.requires_grad = False def forward( @@ -1534,14 +1233,14 @@ def forward( labels: Optional[mindspore.Tensor] = None, ) -> Union[Tuple, CausalLMOutput]: r""" - Args: - labels (`mindspore.Tensor` of shape `(batch_size, target_length)`, *optional*): - Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to - the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. - All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., - config.vocab_size - 1]`. + labels (`mindspore.Tensor` of shape `(batch_size, target_length)`, *optional*): + Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to + the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. + All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., + config.vocab_size - 1]`. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None and labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") @@ -1564,7 +1263,7 @@ def forward( attention_mask = ( attention_mask if attention_mask is not None else ops.ones_like(input_values, dtype=mindspore.int64) ) - input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).astype(mindspore.int64) + input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(mindspore.int64) # assuming that padded tokens are filled with -100 # when not being attended to @@ -1573,10 +1272,9 @@ def forward( flattened_targets = labels.masked_select(labels_mask) # ctc_loss doesn't support fp16 - log_probs = F.log_softmax(logits.astype(mindspore.float32), dim=-1).swapaxes(0, 1) + log_probs = ops.transpose(nn.functional.log_softmax(logits, dim=-1, dtype=mindspore.float32), 0, 1) - # with torch.backends.cudnn.flags(enabled=False): - loss = F.ctc_loss( + loss = nn.functional.ctc_loss( log_probs, labels, input_lengths, @@ -1606,7 +1304,7 @@ def __init__(self, config): self.wavlm = WavLMModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: - self.layer_weights = mindspore.Parameter(ops.ones(num_layers) / num_layers) + self.layer_weights = nn.Parameter(ops.ones(num_layers) / num_layers) self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size) self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels) @@ -1620,7 +1318,7 @@ def freeze_feature_extractor(self): not be updated during training. """ warnings.warn( - "The method `freeze_feature_extractor` is deprecated. " + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. 
" "Please use the equivalent `freeze_feature_encoder` method instead.", FutureWarning, ) @@ -1640,7 +1338,7 @@ def freeze_base_model(self): Calling this function will disable the gradient computation for the base model so that its parameters will not be updated during training. Only the classification head will be updated. """ - for param in self.wavlm.get_parameters(): + for param in self.wavlm.parameters(): param.requires_grad = False # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.forward with Wav2Vec2->WavLM, wav2vec2->wavlm @@ -1654,11 +1352,10 @@ def forward( labels: Optional[mindspore.Tensor] = None, ) -> Union[Tuple, SequenceClassifierOutput]: r""" - Args: - labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1672,28 +1369,28 @@ def forward( return_dict=return_dict, ) - if self.config.use_weighted_layer_sum: hidden_states = outputs[_HIDDEN_STATES_START_POSITION] hidden_states = ops.stack(hidden_states, dim=1) - norm_weights = ops.softmax(self.layer_weights, dim=-1) - hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(axis=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) else: hidden_states = outputs[0] hidden_states = self.projector(hidden_states) if attention_mask is None: - pooled_output = hidden_states.mean(axis=1) + pooled_output = ops.mean(hidden_states, dim=1) else: padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask) hidden_states[~padding_mask] = 0.0 - pooled_output = hidden_states.sum(axis=1) / padding_mask.sum(axis=1).view(-1, 1) + pooled_output = ops.sum(hidden_states, dim=1) / ops.sum(padding_mask, dim=1).view(-1, 1) logits = self.classifier(pooled_output) loss = None if labels is not None: - loss = F.cross_entropy(logits.view(-1, self.config.num_labels), labels.view(-1)) + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) if not return_dict: output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] @@ -1706,6 +1403,7 @@ def forward( attentions=outputs.attentions, ) + # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForAudioFrameClassification with Wav2Vec2->WavLM, wav2vec2->wavlm, WAV_2_VEC_2->WAVLM class WavLMForAudioFrameClassification(WavLMPreTrainedModel): def __init__(self, config): @@ -1718,7 +1416,7 @@ def __init__(self, config): self.wavlm = WavLMModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: - self.layer_weights = mindspore.Parameter(ops.ones(num_layers) / num_layers) + self.layer_weights = nn.Parameter(ops.ones(num_layers) / num_layers) self.classifier = 
nn.Linear(config.hidden_size, config.num_labels) self.num_labels = config.num_labels @@ -1730,7 +1428,7 @@ def freeze_feature_extractor(self): not be updated during training. """ warnings.warn( - "The method `freeze_feature_extractor` is deprecated. " + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " "Please use the equivalent `freeze_feature_encoder` method instead.", FutureWarning, ) @@ -1748,7 +1446,7 @@ def freeze_base_model(self): Calling this function will disable the gradient computation for the base model so that its parameters will not be updated during training. Only the classification head will be updated. """ - for param in self.wavlm.get_parameters(): + for param in self.wavlm.parameters(): param.requires_grad = False def forward( @@ -1761,11 +1459,10 @@ def forward( return_dict: Optional[bool] = None, ) -> Union[Tuple, TokenClassifierOutput]: r""" - Args: - labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1782,8 +1479,8 @@ def forward( if self.config.use_weighted_layer_sum: hidden_states = outputs[_HIDDEN_STATES_START_POSITION] hidden_states = ops.stack(hidden_states, dim=1) - norm_weights = ops.softmax(self.layer_weights, dim=-1) - hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(axis=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) else: hidden_states = outputs[0] @@ -1791,8 +1488,8 @@ def forward( loss = None if labels is not None: - loss = F.cross_entropy(logits.view(-1, self.num_labels), ops.argmax(labels.view(-1, self.num_labels), dim=1)) - + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), ops.argmax(labels.view(-1, self.num_labels), dim=1)) if not return_dict: output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] @@ -1813,19 +1510,19 @@ def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4): self.scale = scale self.margin = margin self.num_labels = num_labels - self.weight = mindspore.Parameter(ops.randn(input_dim, num_labels), requires_grad=True) - # self.loss = nn.CrossEntropyLoss() + self.weight = nn.Parameter(ops.randn(input_dim, num_labels), requires_grad=True) + self.loss = nn.CrossEntropyLoss() def forward(self, hidden_states, labels): labels = labels.flatten() - weight = F.normalize(self.weight, dim=0) - hidden_states = F.normalize(hidden_states, dim=1) + weight = nn.functional.normalize(self.weight, dim=0) + hidden_states = nn.functional.normalize(hidden_states, dim=1) cos_theta = ops.mm(hidden_states, weight) psi = cos_theta - self.margin - onehot = F.one_hot(labels, self.num_labels) + onehot = nn.functional.one_hot(labels, self.num_labels) logits = self.scale * ops.where(onehot.bool(), psi, cos_theta) - loss = 
F.cross_entropy(logits, labels) + loss = self.loss(logits, labels) return loss @@ -1843,20 +1540,19 @@ def __init__(self, config, layer_id=0): self.activation = nn.ReLU() def forward(self, hidden_states: mindspore.Tensor) -> mindspore.Tensor: - # if is_peft_available(): - # from peft.tuners.lora import LoraLayer - # - # if isinstance(self.kernel, LoraLayer): - # warnings.warn( - # "Detected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. " - # "You should exclude TDNNLayer from LoRA's target modules.", - # ) + from ....peft.tuners.lora import LoraLayer + + if isinstance(self.kernel, LoraLayer): + warnings.warn( + "Detected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. " + "You should exclude TDNNLayer from LoRA's target modules.", + ) # for backward compatibility, we keep nn.Linear but call F.conv1d for speed up - hidden_states = hidden_states.swapaxes(1, 2) - weight = self.kernel.weight.view(self.out_conv_dim, self.kernel_size, self.in_conv_dim).swapaxes(1, 2) - hidden_states = ops.conv1d(hidden_states, weight, self.kernel.bias, dilation=self.dilation) - hidden_states = hidden_states.swapaxes(1, 2) + hidden_states = ops.transpose(hidden_states, 1, 2) + weight = ops.transpose(self.kernel.weight.view(self.out_conv_dim, self.kernel_size, self.in_conv_dim), 1, 2) + hidden_states = nn.functional.conv1d(hidden_states, weight, self.kernel.bias, dilation=self.dilation) + hidden_states = ops.transpose(hidden_states, 1, 2) hidden_states = self.activation(hidden_states) return hidden_states @@ -1870,7 +1566,7 @@ def __init__(self, config): self.wavlm = WavLMModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: - self.layer_weights = mindspore.Parameter(ops.ones(num_layers) / num_layers) + self.layer_weights = nn.Parameter(ops.ones(num_layers) / num_layers) self.projector = nn.Linear(config.hidden_size, config.tdnn_dim[0]) tdnn_layers = [TDNNLayer(config, i) for i in range(len(config.tdnn_dim))] @@ -1889,7 +1585,7 @@ def freeze_feature_extractor(self): not be updated during training. """ warnings.warn( - "The method `freeze_feature_extractor` is deprecated. " + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " "Please use the equivalent `freeze_feature_encoder` method instead.", FutureWarning, ) @@ -1907,7 +1603,7 @@ def freeze_base_model(self): Calling this function will disable the gradient computation for the base model so that its parameters will not be updated during training. Only the classification head will be updated. """ - for param in self.wavlm.get_parameters(): + for param in self.wavlm.parameters(): param.requires_grad = False def _get_tdnn_output_lengths(self, input_lengths: Union[mindspore.Tensor, int]): @@ -1917,7 +1613,6 @@ def _get_tdnn_output_lengths(self, input_lengths: Union[mindspore.Tensor, int]): def _conv_out_length(input_length, kernel_size, stride): # 1D convolutional layer output length formula taken - # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html return (input_length - kernel_size) // stride + 1 for kernel_size in self.config.tdnn_kernel: @@ -1935,11 +1630,10 @@ def forward( labels: Optional[mindspore.Tensor] = None, ) -> Union[Tuple, XVectorOutput]: r""" - Args: - labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. 
If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1956,8 +1650,8 @@ def forward( if self.config.use_weighted_layer_sum: hidden_states = outputs[_HIDDEN_STATES_START_POSITION] hidden_states = ops.stack(hidden_states, dim=1) - norm_weights = ops.softmax(self.layer_weights, dim=-1) - hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(axis=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) else: hidden_states = outputs[0] @@ -1968,15 +1662,15 @@ def forward( # Statistic Pooling if attention_mask is None: - mean_features = hidden_states.mean(axis=1) + mean_features = ops.mean(hidden_states, dim=1) std_features = ops.std(hidden_states, dim=1) else: - feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(axis=1)) + feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(dim=1)) tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths) mean_features = [] std_features = [] for i, length in enumerate(tdnn_output_lengths): - mean_features.append(hidden_states[i, :length].mean(axis=0)) + mean_features.append(ops.mean(hidden_states[i, :length], dim=0)) std_features.append(ops.std(hidden_states[i, :length], dim=0)) mean_features = ops.stack(mean_features) std_features = ops.stack(std_features) diff --git a/tests/ut/transformers/models/deta/test_modeling_deta.py b/tests/ut/transformers/models/deta/test_modeling_deta.py index 54318889b..f7fbed3a7 100644 --- a/tests/ut/transformers/models/deta/test_modeling_deta.py +++ b/tests/ut/transformers/models/deta/test_modeling_deta.py @@ -370,6 +370,10 @@ def test_resize_tokens_embeddings(self): def test_feed_forward_chunking(self): pass + @unittest.skip(reason="grid_sampler_2d_grad_cpu_kernel.h:162] store] memcpy_s failed. errorno is: 34") + def test_training(self): + pass + def test_attention_outputs(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True diff --git a/tests/ut/transformers/models/wav2vec2/test_modeling_wav2vec2.py b/tests/ut/transformers/models/wav2vec2/test_modeling_wav2vec2.py index 1c183c6c7..ea6268be1 100644 --- a/tests/ut/transformers/models/wav2vec2/test_modeling_wav2vec2.py +++ b/tests/ut/transformers/models/wav2vec2/test_modeling_wav2vec2.py @@ -1,4 +1,3 @@ - # coding=utf-8 # Copyright 2021 The HuggingFace Inc. team. All rights reserved. # @@ -13,63 +12,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-# pylint: disable=missing-class-docstring -# pylint: disable=missing-function-docstring -# pylint: disable=unused-argument -# pylint: disable=unused-variable -# pylint: disable=invalid-name -# pylint: disable=consider-using-enumerate -# pylint: disable=import-error -# pylint: disable=redefined-builtin -# pylint: disable=ungrouped-imports -""" Testing suite for the PyTorch Wav2Vec2 model. """ +"""Testing suite for the PyTorch Wav2Vec2 model.""" import gc import math import multiprocessing import os +import pickle import tempfile import traceback import unittest import numpy as np -import librosa as L from datasets import load_dataset +from pytest import mark -import mindspore as ms -import mindspore.ops as F -import mindspore.numpy as mnp -from mindspore import nn -from mindspore import Tensor - -from mindnlp.transformers import Wav2Vec2Config +from mindnlp.transformers import Wav2Vec2Config, is_mindspore_available from mindnlp.utils.testing_utils import ( CaptureLogger, is_pyctcdecode_available, - require_librosa, - require_mindspore, require_pyctcdecode, + require_soundfile, + require_mindspore, run_test_in_subprocess, slow, ) -from mindnlp.transformers import ( - Wav2Vec2FeatureExtractor, - Wav2Vec2ForAudioFrameClassification, - Wav2Vec2ForCTC, - Wav2Vec2ForMaskedLM, - Wav2Vec2ForPreTraining, - Wav2Vec2ForSequenceClassification, - Wav2Vec2ForXVector, - Wav2Vec2Model, - Wav2Vec2Processor, -) -from mindnlp.transformers.models.wav2vec2.modeling_wav2vec2 import ( - WAV2VEC2_ADAPTER_PT_FILE, - WAV2VEC2_ADAPTER_SAFE_FILE, - Wav2Vec2GumbelVectorQuantizer, - _compute_mask_indices, - _sample_negative_indices, -) from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( @@ -79,8 +46,34 @@ ids_tensor, random_attention_mask, ) +# from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_mindspore_available(): + import mindspore + from mindnlp.core import ops, no_grad, nn + from mindnlp.core.nn import functional + from mindnlp.core.serialization import safe_save_file, save + + from mindnlp.transformers import ( + Wav2Vec2FeatureExtractor, + Wav2Vec2ForAudioFrameClassification, + Wav2Vec2ForCTC, + Wav2Vec2ForMaskedLM, + Wav2Vec2ForPreTraining, + Wav2Vec2ForSequenceClassification, + Wav2Vec2ForXVector, + Wav2Vec2Model, + Wav2Vec2Processor, + ) + from mindnlp.transformers.models.wav2vec2.modeling_wav2vec2 import ( + WAV2VEC2_ADAPTER_PT_FILE, + WAV2VEC2_ADAPTER_SAFE_FILE, + Wav2Vec2GumbelVectorQuantizer, + _compute_mask_indices, + _sample_negative_indices, + ) -mnp.allclose = lambda x, y, *args, **kwargs: np.allclose(x.asnumpy(), y.asnumpy(), *args, **kwargs) if is_pyctcdecode_available(): import pyctcdecode.decoder @@ -89,27 +82,32 @@ from mindnlp.transformers.models.wav2vec2_with_lm import processing_wav2vec2_with_lm + def _test_wav2vec2_with_lm_invalid_pool(in_queue, out_queue, timeout): error = None try: _ = in_queue.get(timeout=timeout) - ds = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True) + ds = load_dataset( + "mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True + ) sample = next(iter(ds)) - resampled_audio = L.resample( - ms.tensor(sample["audio"]["array"]).numpy(), orig_sr=48_000, target_sr=16_000 - ) + resampled_audio = torchaudio.functional.resample( + mindspore.tensor(sample["audio"]["array"]), 48_000, 16_000 + ).numpy() model = Wav2Vec2ForCTC.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm") processor = 
Wav2Vec2ProcessorWithLM.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm") input_values = processor(resampled_audio, return_tensors="ms").input_values - logits = model(input_values).logits + + with no_grad(): + logits = model(input_values).logits # use a spawn pool, which should trigger a warning if different than fork with CaptureLogger(pyctcdecode.decoder.logger) as cl, multiprocessing.get_context("spawn").Pool(1) as pool: - transcription = processor.batch_decode(logits, pool).text + transcription = processor.batch_decode(logits.asnumpy(), pool).text unittest.TestCase().assertIn("Falling back to sequential decoding.", cl.out) unittest.TestCase().assertEqual(transcription[0], "habitan aguas poco profundas y rocosas") @@ -117,11 +115,11 @@ def _test_wav2vec2_with_lm_invalid_pool(in_queue, out_queue, timeout): # force batch_decode to internally create a spawn pool, which should trigger a warning if different than fork multiprocessing.set_start_method("spawn", force=True) with CaptureLogger(processing_wav2vec2_with_lm.logger) as cl: - transcription = processor.batch_decode(logits).text + transcription = processor.batch_decode(logits.asnumpy()).text unittest.TestCase().assertIn("Falling back to sequential decoding.", cl.out) unittest.TestCase().assertEqual(transcription[0], "habitan aguas poco profundas y rocosas") - except: # pylint: disable=bare-except + except Exception: error = f"{traceback.format_exc()}" results = {"error": error} @@ -247,7 +245,7 @@ def get_config(self): def create_and_check_model(self, config, input_values, attention_mask): model = Wav2Vec2Model(config=config) - model.set_train(False) + model.eval() result = model(input_values, attention_mask=attention_mask) self.parent.assertEqual( result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size) @@ -256,7 +254,7 @@ def create_and_check_model(self, config, input_values, attention_mask): def create_and_check_model_with_adapter(self, config, input_values, attention_mask): config.add_adapter = True model = Wav2Vec2Model(config=config) - model.set_train(False) + model.eval() result = model(input_values, attention_mask=attention_mask) self.parent.assertEqual( result.last_hidden_state.shape, (self.batch_size, self.adapter_output_seq_length, self.hidden_size) @@ -266,7 +264,7 @@ def create_and_check_model_with_adapter_for_ctc(self, config, input_values, atte config.add_adapter = True config.output_hidden_size = 2 * config.hidden_size model = Wav2Vec2ForCTC(config=config) - model.set_train(False) + model.eval() result = model(input_values, attention_mask=attention_mask) self.parent.assertEqual( result.logits.shape, (self.batch_size, self.adapter_output_seq_length, self.vocab_size) @@ -276,7 +274,7 @@ def create_and_check_model_with_adapter_proj_dim(self, config, input_values, att config.add_adapter = True config.output_hidden_size = 8 model = Wav2Vec2Model(config=config) - model.set_train(False) + model.eval() result = model(input_values, attention_mask=attention_mask) self.parent.assertEqual( result.last_hidden_state.shape, @@ -289,7 +287,7 @@ def create_and_check_model_with_attn_adapter(self, config, input_values, attenti self.parent.assertIsNotNone(model._get_adapters()) - model.set_train(False) + model.eval() result = model(input_values, attention_mask=attention_mask) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.output_seq_length, self.vocab_size)) @@ -297,10 +295,10 @@ def create_and_check_batch_inference(self, config, input_values, *args): # test 
does not pass for models making use of `group_norm` # check: https://github.com/pytorch/fairseq/issues/3227 model = Wav2Vec2Model(config=config) - model.set_train(False) + model.eval() input_values = input_values[:3] - attention_mask = F.ones(input_values.shape, dtype=ms.bool_) + attention_mask = ops.ones(input_values.shape, dtype=mindspore.bool_) input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] @@ -316,17 +314,19 @@ def create_and_check_batch_inference(self, config, input_values, *args): output = model(input_slice).last_hidden_state batch_output = batch_outputs[i : i + 1, : output.shape[1]] - self.parent.assertTrue(mnp.allclose(output, batch_output, atol=1e-3)) + self.parent.assertTrue(ops.allclose(output, batch_output, atol=1e-3)) def check_ctc_loss(self, config, input_values, *args): model = Wav2Vec2ForCTC(config=config) - model.set_train(False) # make sure that dropout is disabled + + # make sure that dropout is disabled + model.eval() input_values = input_values[:3] - attention_mask = F.ones(input_values.shape, dtype=ms.int64) + attention_mask = ops.ones(input_values.shape, dtype=mindspore.int64) input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(ms.tensor(input_lengths)) + max_length_labels = model._get_feat_extract_output_lengths(mindspore.tensor(input_lengths)) labels = ids_tensor((input_values.shape[0], min(max_length_labels).item() - 1), model.config.vocab_size) # pad input @@ -345,10 +345,12 @@ def check_ctc_loss(self, config, input_values, *args): def check_seq_classifier_loss(self, config, input_values, *args): model = Wav2Vec2ForSequenceClassification(config=config) - model.set_train(False) # make sure that dropout is disabled + + # make sure that dropout is disabled + model.eval() input_values = input_values[:3] - attention_mask = F.ones(input_values.shape, dtype=ms.int64) + attention_mask = ops.ones(input_values.shape, dtype=mindspore.int64) input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) @@ -365,11 +367,10 @@ def check_seq_classifier_loss(self, config, input_values, *args): self.parent.assertTrue(isinstance(unmasked_loss, float)) self.parent.assertTrue(masked_loss != unmasked_loss) - @unittest.skip('ignore train temporarily') def check_ctc_training(self, config, input_values, *args): config.ctc_zero_infinity = True model = Wav2Vec2ForCTC(config=config) - model.set_train(True) + model.train() # freeze feature encoder model.freeze_feature_encoder() @@ -377,7 +378,7 @@ def check_ctc_training(self, config, input_values, *args): input_values = input_values[:3] input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(ms.tensor(input_lengths)) + max_length_labels = model._get_feat_extract_output_lengths(mindspore.tensor(input_lengths)) labels = ids_tensor((input_values.shape[0], max(max_length_labels).item() - 2), model.config.vocab_size) # pad input @@ -390,15 +391,14 @@ def check_ctc_training(self, config, input_values, *args): labels[i, max_length_labels[i] - 1 :] = -100 loss = model(input_values, labels=labels).loss - self.parent.assertFalse(F.isinf(loss).item()) + self.parent.assertFalse(ops.isinf(loss).item()) - # TODO: loss.backward() + loss.backward() - @unittest.skip('ignore train temporarily') def check_seq_classifier_training(self, config, input_values, *args): config.ctc_zero_infinity = True model = 
Wav2Vec2ForSequenceClassification(config=config) - model.set_train(True) + model.train() # freeze everything but the classification head model.freeze_base_model() @@ -413,15 +413,14 @@ def check_seq_classifier_training(self, config, input_values, *args): input_values[i, input_lengths[i] :] = 0.0 loss = model(input_values, labels=labels).loss - self.parent.assertFalse(F.isinf(loss).item()) + self.parent.assertFalse(ops.isinf(loss).item()) - # TODO: loss.backward() + loss.backward() - @unittest.skip('ignore train temporarily') def check_xvector_training(self, config, input_values, *args): config.ctc_zero_infinity = True model = Wav2Vec2ForXVector(config=config) - model.set_train(True) + model.train() # freeze everything but the classification head model.freeze_base_model() @@ -436,18 +435,18 @@ def check_xvector_training(self, config, input_values, *args): input_values[i, input_lengths[i] :] = 0.0 loss = model(input_values, labels=labels).loss - self.parent.assertFalse(F.isinf(loss).item()) + self.parent.assertFalse(ops.isinf(loss).item()) - # TODO: loss.backward() + loss.backward() def check_labels_out_of_vocab(self, config, input_values, *args): model = Wav2Vec2ForCTC(config) - model.set_train(True) + model.train() input_values = input_values[:3] input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(ms.tensor(input_lengths)) + max_length_labels = model._get_feat_extract_output_lengths(mindspore.tensor(input_lengths)) labels = ids_tensor((input_values.shape[0], max(max_length_labels).item() - 2), model.config.vocab_size + 100) with self.parent.assertRaises(ValueError): @@ -461,7 +460,11 @@ def prepare_config_and_inputs_for_common(self): @require_mindspore class Wav2Vec2ModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (Wav2Vec2ForCTC, Wav2Vec2Model, Wav2Vec2ForMaskedLM, Wav2Vec2ForSequenceClassification, Wav2Vec2ForPreTraining) + all_model_classes = ( + (Wav2Vec2ForCTC, Wav2Vec2Model, Wav2Vec2ForMaskedLM, Wav2Vec2ForSequenceClassification, Wav2Vec2ForPreTraining) + if is_mindspore_available() + else () + ) pipeline_model_mapping = ( { "audio-classification": Wav2Vec2ForSequenceClassification, @@ -469,6 +472,8 @@ class Wav2Vec2ModelTest(ModelTesterMixin, unittest.TestCase): "feature-extraction": Wav2Vec2Model, "fill-mask": Wav2Vec2ForMaskedLM, } + if is_mindspore_available() + else {} ) fx_compatible = True test_pruning = False @@ -513,7 +518,7 @@ def test_seq_classifier_train(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.check_seq_classifier_training(*config_and_inputs) - def test_xvector_traintest_xvector_train(self): + def test_xvector_train(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.check_xvector_training(*config_and_inputs) @@ -521,22 +526,19 @@ def test_labels_out_of_vocab(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - # Wav2Vec2 has no inputs_embeds + @unittest.skip(reason="Model has no inputs_embeds") def test_inputs_embeds(self): pass - # `input_ids` is renamed to `input_values` + @unittest.skip(reason="Model has input_values instead of input_ids") def test_forward_signature(self): pass - # Wav2Vec2 cannot resize token embeddings - # since it has no tokens embeddings + @unittest.skip(reason="Model has no tokens embeds") def test_resize_tokens_embeddings(self): pass - # Wav2Vec2 has no inputs_embeds - # and thus 
the `get_input_embeddings` fn - # is not implemented + @unittest.skip(reason="Model has no inputs_embeds") def test_model_get_set_embeddings(self): pass @@ -546,7 +548,7 @@ def test_initialization(self): configs_no_init = _config_zero_init(config) for model_class in self.all_model_classes: model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): + for name, param in model.named_parameters(): uniform_init_parms = [ "conv.weight", "conv.parametrizations.weight", @@ -564,21 +566,36 @@ def test_initialization(self): if param.requires_grad: if any(x in name for x in uniform_init_parms): self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, + -1.0 <= ((param.mean() * 1e9).round() / 1e9).item() <= 1.0, msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) else: self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), + ((param.mean() * 1e9).round() / 1e9).item(), [0.0, 1.0], msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) + # overwrite from test_modeling_common + def _mock_init_weights(self, module): + if hasattr(module, "weight") and module.weight is not None: + module.weight.fill_(3) + if hasattr(module, "weight_g") and module.weight_g is not None: + module.weight_g.fill_(3) + if hasattr(module, "weight_v") and module.weight_v is not None: + module.weight_v.fill_(3) + if hasattr(module, "bias") and module.bias is not None: + module.bias.fill_(3) + if hasattr(module, "codevectors") and module.codevectors is not None: + module.codevectors.fill_(3) + if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: + module.masked_spec_embed.fill_(3) + def test_mask_feature_prob_ctc(self): model = Wav2Vec2ForCTC.from_pretrained( "hf-internal-testing/tiny-random-wav2vec2", mask_feature_prob=0.2, mask_feature_length=2 ) - model.set_train(True) + model.train() processor = Wav2Vec2Processor.from_pretrained( "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True ) @@ -601,7 +618,7 @@ def test_mask_time_prob_ctc(self): model = Wav2Vec2ForCTC.from_pretrained( "hf-internal-testing/tiny-random-wav2vec2", mask_time_prob=0.2, mask_time_length=2 ) - model.set_train(True) + model.train() processor = Wav2Vec2Processor.from_pretrained( "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True ) @@ -633,13 +650,17 @@ def test_model_from_pretrained(self): @require_mindspore class Wav2Vec2RobustModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = ( - Wav2Vec2ForCTC, - Wav2Vec2Model, - Wav2Vec2ForMaskedLM, - Wav2Vec2ForSequenceClassification, - Wav2Vec2ForPreTraining, - Wav2Vec2ForAudioFrameClassification, - Wav2Vec2ForXVector, + ( + Wav2Vec2ForCTC, + Wav2Vec2Model, + Wav2Vec2ForMaskedLM, + Wav2Vec2ForSequenceClassification, + Wav2Vec2ForPreTraining, + Wav2Vec2ForAudioFrameClassification, + Wav2Vec2ForXVector, + ) + if is_mindspore_available() + else () ) test_pruning = False test_headmasking = False @@ -697,22 +718,19 @@ def test_labels_out_of_vocab(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - # Wav2Vec2 has no inputs_embeds + @unittest.skip(reason="Model has no input_embeds") def test_inputs_embeds(self): pass - # `input_ids` is renamed to `input_values` + @unittest.skip(reason="Model has input_values instead of input_ids") def test_forward_signature(self): pass - # Wav2Vec2 cannot resize token embeddings - # since it has no 
tokens embeddings + @unittest.skip(reason="Model has no token embeddings") def test_resize_tokens_embeddings(self): pass - # Wav2Vec2 has no inputs_embeds - # and thus the `get_input_embeddings` fn - # is not implemented + @unittest.skip(reason="Model has no input_embeds") def test_model_get_set_embeddings(self): pass @@ -722,7 +740,7 @@ def test_initialization(self): configs_no_init = _config_zero_init(config) for model_class in self.all_model_classes: model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): + for name, param in model.named_parameters(): uniform_init_parms = [ "conv.weight", "conv.parametrizations.weight", @@ -740,16 +758,31 @@ def test_initialization(self): if param.requires_grad: if any(x in name for x in uniform_init_parms): self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, + -1.0 <= ((param.mean() * 1e9).round() / 1e9).item() <= 1.0, msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) else: self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), + ((param.mean() * 1e9).round() / 1e9).item(), [0.0, 1.0], msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) + # overwrite from test_modeling_common + def _mock_init_weights(self, module): + if hasattr(module, "weight") and module.weight is not None: + module.weight.fill_(3) + if hasattr(module, "weight_g") and module.weight_g is not None: + module.weight_g.fill_(3) + if hasattr(module, "weight_v") and module.weight_v is not None: + module.weight_v.fill_(3) + if hasattr(module, "bias") and module.bias is not None: + module.bias.fill_(3) + if hasattr(module, "codevectors") and module.codevectors is not None: + module.codevectors.fill_(3) + if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: + module.masked_spec_embed.fill_(3) + def test_model_for_pretraining(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() model = Wav2Vec2ForPreTraining(config) @@ -767,8 +800,8 @@ def test_model_for_pretraining(self): ) sampled_negative_indices = _sample_negative_indices(features_shape, 10, mask_time_indices) - mask_time_indices = Tensor.from_numpy(mask_time_indices) - sampled_negative_indices = Tensor.from_numpy(sampled_negative_indices) + mask_time_indices = ops.from_numpy(mask_time_indices) + sampled_negative_indices = ops.from_numpy(sampled_negative_indices) loss = model( inputs_dict["input_values"], @@ -780,8 +813,8 @@ def test_model_for_pretraining(self): # more losses mask_time_indices[:, : mask_time_indices.shape[-1] // 2] = True - sampled_negative_indices = _sample_negative_indices(features_shape, 10, mask_time_indices) - sampled_negative_indices = Tensor.from_numpy(sampled_negative_indices) + sampled_negative_indices = _sample_negative_indices(features_shape, 10, mask_time_indices.asnumpy()) + sampled_negative_indices = ops.from_numpy(sampled_negative_indices) loss_more_masked = model( inputs_dict["input_values"], attention_mask=inputs_dict["attention_mask"], @@ -796,7 +829,7 @@ def test_mask_feature_prob_ctc(self): model = Wav2Vec2ForCTC.from_pretrained( "hf-internal-testing/tiny-random-wav2vec2", mask_feature_prob=0.2, mask_feature_length=2 ) - model.set_train(True) + model.train() processor = Wav2Vec2Processor.from_pretrained( "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True ) @@ -819,7 +852,7 @@ def test_mask_time_prob_ctc(self): model = Wav2Vec2ForCTC.from_pretrained( 
"hf-internal-testing/tiny-random-wav2vec2", mask_time_prob=0.2, mask_time_length=2 ) - model.set_train(True) + model.train() processor = Wav2Vec2Processor.from_pretrained( "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True ) @@ -846,7 +879,7 @@ def test_mask_time_feature_prob_ctc_single_batch(self): mask_time_length=2, mask_feature_length=2, ) - model.set_train(True) + model.train() processor = Wav2Vec2Processor.from_pretrained( "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True ) @@ -869,9 +902,10 @@ def test_mask_time_feature_prob_ctc_single_batch(self): def test_feed_forward_chunking(self): pass - @unittest.skip(reason="resource url unavailable") def test_load_and_set_attn_adapter(self): - processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True) + processor = Wav2Vec2Processor.from_pretrained( + "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True + ) def get_logits(model, input_features): batch = processor( @@ -880,10 +914,12 @@ def get_logits(model, input_features): sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="ms", ) - logits = model( - input_values=batch["input_values"], - attention_mask=batch["attention_mask"], - ).logits + + with no_grad(): + logits = model( + input_values=batch["input_values"], + attention_mask=batch["attention_mask"], + ).logits return logits input_features = [np.random.random(16_000 * s) for s in [1, 3, 2, 6]] @@ -897,9 +933,8 @@ def get_logits(model, input_features): logits_2 = get_logits(model_2, input_features) - self.assertTrue(mnp.allclose(logits, logits_2, atol=1e-3)) + self.assertTrue(ops.allclose(logits, logits_2, atol=1e-3)) - @unittest.skip('no torch support') # test that loading adapter weights with mismatched vocab sizes can be loaded def test_load_target_lang_with_mismatched_size(self): processor = Wav2Vec2Processor.from_pretrained( @@ -913,15 +948,19 @@ def get_logits(model, input_features): sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="ms", ) - logits = model( - input_values=batch["input_values"], - attention_mask=batch["attention_mask"], - ).logits + + with no_grad(): + logits = model( + input_values=batch["input_values"], + attention_mask=batch["attention_mask"], + ).logits return logits input_features = [np.random.random(16_000 * s) for s in [1, 3, 2, 6]] - model = Wav2Vec2ForCTC.from_pretrained("hf-internal-testing/tiny-random-wav2vec2-adapter", target_lang="fr", ignore_mismatched_sizes=True) + model = Wav2Vec2ForCTC.from_pretrained( + "hf-internal-testing/tiny-random-wav2vec2-adapter", target_lang="fr", ignore_mismatched_sizes=True + ) logits = get_logits(model, input_features) @@ -930,9 +969,8 @@ def get_logits(model, input_features): logits_2 = get_logits(model_2, input_features) - self.assertTrue(mnp.allclose(logits, logits_2, atol=1e-3)) + self.assertTrue(ops.allclose(logits, logits_2, atol=1e-3)) - @unittest.skip(reason="no pytorch support") def test_load_attn_adapter(self): processor = Wav2Vec2Processor.from_pretrained( "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True @@ -945,10 +983,12 @@ def get_logits(model, input_features): sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="ms", ) - logits = model( - input_values=batch["input_values"], - attention_mask=batch["attention_mask"], - ).logits + + with no_grad(): + logits = model( + input_values=batch["input_values"], + attention_mask=batch["attention_mask"], + ).logits 
return logits input_features = [np.random.random(16_000 * s) for s in [1, 3, 2, 6]] @@ -964,7 +1004,7 @@ def get_logits(model, input_features): # save safe weights safe_filepath = os.path.join(tempdir, WAV2VEC2_ADAPTER_SAFE_FILE.format("eng")) - safe_save_file(adapter_weights, safe_filepath, metadata={"format": "pt"}) # pylint: disable=undefined-variable + safe_save_file(adapter_weights, safe_filepath, metadata={"format": "ms"}) model.load_adapter("eng") model.load_adapter("eng", use_safetensors=True) @@ -975,7 +1015,7 @@ def get_logits(model, input_features): model.load_adapter("ita", use_safetensors=True) logits_2 = get_logits(model, input_features) - self.assertTrue(mnp.allclose(logits, logits_2, atol=1e-3)) + self.assertTrue(ops.allclose(logits, logits_2, atol=1e-3)) with tempfile.TemporaryDirectory() as tempdir: model.save_pretrained(tempdir) @@ -986,7 +1026,7 @@ def get_logits(model, input_features): # save pt weights pt_filepath = os.path.join(tempdir, WAV2VEC2_ADAPTER_PT_FILE.format("eng")) - F.save(adapter_weights, pt_filepath) + save(adapter_weights, pt_filepath) model.load_adapter("eng") model.load_adapter("eng", use_safetensors=False) @@ -996,7 +1036,7 @@ def get_logits(model, input_features): logits_2 = get_logits(model, input_features) - self.assertTrue(mnp.allclose(logits, logits_2, atol=1e-3)) + self.assertTrue(ops.allclose(logits, logits_2, atol=1e-3)) model = Wav2Vec2ForCTC.from_pretrained("hf-internal-testing/tiny-random-wav2vec2-adapter") logits = get_logits(model, input_features) @@ -1007,7 +1047,7 @@ def get_logits(model, input_features): logits_2 = get_logits(model, input_features) - self.assertTrue(mnp.allclose(logits, logits_2, atol=1e-3)) + self.assertTrue(ops.allclose(logits, logits_2, atol=1e-3)) @slow def test_model_from_pretrained(self): @@ -1024,7 +1064,7 @@ def test_compute_mask_indices(self): mask_length = 1 mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = Tensor.from_numpy(mask) + mask = ops.from_numpy(mask) self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)]) @@ -1043,9 +1083,10 @@ def test_compute_mask_indices_low_prob(self): for _ in range(n_trials): mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = Tensor.from_numpy(mask) + mask = ops.from_numpy(mask) + + num_masks = ops.sum(mask).item() - num_masks = F.sum(mask).item() if num_masks > 0: count_dimensions_masked += 1 else: @@ -1064,7 +1105,7 @@ def test_compute_mask_indices_overlap(self): mask_length = 4 mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = Tensor.from_numpy(mask) + mask = ops.from_numpy(mask) # because of overlap mask don't have to add up exactly to `mask_prob * sequence_length`, but have to be smaller or equal for batch_sum in mask.sum(axis=-1): @@ -1076,44 +1117,27 @@ def test_compute_mask_indices_attn_mask_overlap(self): mask_prob = 0.5 mask_length = 4 - attention_mask = F.ones((batch_size, sequence_length), dtype=ms.int64) + attention_mask = ops.ones((batch_size, sequence_length), dtype=mindspore.int64) attention_mask[:2, sequence_length // 2 :] = 0 mask = _compute_mask_indices( (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask ) - mask = Tensor.from_numpy(mask) + mask = ops.from_numpy(mask) for batch_sum in mask.sum(axis=-1): self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) self.assertTrue(mask[:2, sequence_length // 2 :].sum() == 0) - def 
test_compute_mask_indices_short_audio(self): - batch_size = 4 - sequence_length = 100 - mask_prob = 0.05 - mask_length = 10 - - attention_mask = F.ones((batch_size, sequence_length), dtype=ms.int64) - # force one example to be heavily padded - attention_mask[0, 5:] = 0 - - mask = _compute_mask_indices( - (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask, min_masks=2 - ) - - # make sure that non-padded examples cannot be padded - self.assertFalse(mask[0][attention_mask[0].astype(ms.bool_).asnumpy()].any()) - def test_compute_perplexity(self): - probs = F.arange(100, dtype=ms.float32).reshape(2, 5, 10) / 100 + probs = ops.arange(100).reshape(2, 5, 10) / 100 ppl = Wav2Vec2GumbelVectorQuantizer._compute_perplexity(probs) self.assertTrue(abs(ppl.item() - 141.4291) < 1e-3) # mask half of the input - mask = F.ones((2,), dtype=ms.bool_) + mask = ops.ones((2,), dtype=mindspore.bool_) mask[0] = 0 ppl = Wav2Vec2GumbelVectorQuantizer._compute_perplexity(probs, mask) @@ -1124,15 +1148,15 @@ def test_sample_negatives(self): sequence_length = 10 hidden_size = 4 num_negatives = 3 - sequence = F.div( - F.arange(sequence_length * hidden_size), hidden_size, rounding_mode="floor" + sequence = ops.div( + ops.arange(sequence_length * hidden_size), hidden_size, rounding_mode="floor" ) features = sequence.view(sequence_length, hidden_size) # each value in vector consits of same value - features = features[None, :].expand(batch_size, sequence_length, hidden_size) + features = features[None, :].broadcast_to((batch_size, sequence_length, hidden_size)) # sample negative indices sampled_negative_indices = _sample_negative_indices((batch_size, sequence_length), num_negatives, None) - sampled_negative_indices = Tensor.from_numpy(sampled_negative_indices) + sampled_negative_indices = ops.from_numpy(sampled_negative_indices) negatives = features.view(-1, hidden_size)[sampled_negative_indices.long().view(-1)] negatives = negatives.view(batch_size, sequence_length, -1, hidden_size).permute(2, 0, 1, 3) self.assertTrue(negatives.shape == (num_negatives, batch_size, sequence_length, hidden_size)) @@ -1142,13 +1166,7 @@ def test_sample_negatives(self): self.assertTrue(((negative - features) == 0).sum() == 0.0) # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim - #self.assertEqual(negatives.unique(dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) - # NOTE: which means [:, :, :, i] is equal for all i - self.assertEqual(negatives.shape[:-1], (num_negatives, batch_size, sequence_length)) - ref = negatives[:, :, :, 0] - for i in range(1, negatives.shape[-1]): - x = negatives[:, :, :, i] - self.assertTrue(F.all(ref == x)) + self.assertEqual(ops.unique(negatives, dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) def test_sample_negatives_with_mask(self): batch_size = 2 @@ -1157,27 +1175,28 @@ def test_sample_negatives_with_mask(self): num_negatives = 3 # second half of last input tensor is padded - mask = F.ones((batch_size, sequence_length), dtype=ms.int64) + mask = ops.ones((batch_size, sequence_length), dtype=mindspore.int64) mask[-1, sequence_length // 2 :] = 0 - sequence = F.div( - F.arange(sequence_length * hidden_size), hidden_size, rounding_mode="floor" + sequence = ops.div( + ops.arange(sequence_length * hidden_size), hidden_size, rounding_mode="floor" ) features = sequence.view(sequence_length, hidden_size) # each value in vector consits of same value - features = 
features[None, :].expand(batch_size, sequence_length, hidden_size) + features = features[None, :].broadcast_to((batch_size, sequence_length, hidden_size)) # replace masked feature vectors with -100 to test that those are not sampled - features = F.where(mask[:, :, None].expand(features.shape).bool(), features, -100) + features = ops.where(mask[:, :, None].broadcast_to(features.shape).bool(), features, -100) # sample negative indices sampled_negative_indices = _sample_negative_indices( - (batch_size, sequence_length), num_negatives, mask + (batch_size, sequence_length), num_negatives, mask.asnumpy() ) - sampled_negative_indices = Tensor.from_numpy(sampled_negative_indices) + sampled_negative_indices = ops.from_numpy(sampled_negative_indices) negatives = features.view(-1, hidden_size)[sampled_negative_indices.long().view(-1)] negatives = negatives.view(batch_size, sequence_length, -1, hidden_size).permute(2, 0, 1, 3) self.assertTrue((negatives >= 0).all().item()) + self.assertTrue(negatives.shape == (num_negatives, batch_size, sequence_length, hidden_size)) # make sure no negatively sampled vector is actually a positive one @@ -1185,17 +1204,11 @@ def test_sample_negatives_with_mask(self): self.assertTrue(((negative - features) == 0).sum() == 0.0) # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim - #self.assertEqual(negatives.unique(dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) - # NOTE: which means [:, :, :, i] is equal for all i - self.assertEqual(negatives.shape[:-1], (num_negatives, batch_size, sequence_length)) - ref = negatives[:, :, :, 0] - for i in range(1, negatives.shape[-1]): - print('i = ', i) - x = negatives[:, :, :, i] - self.assertTrue(F.all(ref == x)) + self.assertEqual(ops.unique(negatives, dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) + @require_mindspore -@require_librosa +@require_soundfile @slow class Wav2Vec2ModelIntegrationTest(unittest.TestCase): def tearDown(self): @@ -1204,15 +1217,17 @@ def tearDown(self): gc.collect() def _load_datasamples(self, num_samples): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").filter( lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] )[:num_samples]["audio"] + return [x["array"] for x in speech_samples] def _load_superb(self, task, num_samples): ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True) + return ds[:num_samples] def test_inference_ctc_normal(self): @@ -1221,9 +1236,11 @@ def test_inference_ctc_normal(self): input_speech = self._load_datasamples(1) input_values = processor(input_speech, return_tensors="ms").input_values - logits = model(input_values).logits - predicted_ids = F.argmax(logits, dim=-1) + with no_grad(): + logits = model(input_values).logits + + predicted_ids = ops.argmax(logits, dim=-1) predicted_trans = processor.batch_decode(predicted_ids) EXPECTED_TRANSCRIPTIONS = ["a man said to the universe sir i exist"] @@ -1234,11 +1251,15 @@ def test_inference_ctc_normal_batched(self): processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h", do_lower_case=True) input_speech = self._load_datasamples(2) + inputs = processor(input_speech, return_tensors="ms", padding=True) + input_values = 
inputs.input_values - logits = model(input_values).logits - predicted_ids = F.argmax(logits, dim=-1) + with no_grad(): + logits = model(input_values).logits + + predicted_ids = ops.argmax(logits, dim=-1) predicted_trans = processor.batch_decode(predicted_ids) EXPECTED_TRANSCRIPTIONS = [ @@ -1252,13 +1273,16 @@ def test_inference_ctc_robust_batched(self): processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self", do_lower_case=True) input_speech = self._load_datasamples(4) + inputs = processor(input_speech, return_tensors="ms", padding=True) input_values = inputs.input_values attention_mask = inputs.attention_mask - logits = model(input_values, attention_mask=attention_mask).logits - predicted_ids = F.argmax(logits, dim=-1) + with no_grad(): + logits = model(input_values, attention_mask=attention_mask).logits + + predicted_ids = ops.argmax(logits, dim=-1) predicted_trans = processor.batch_decode(predicted_ids) EXPECTED_TRANSCRIPTIONS = [ @@ -1270,7 +1294,6 @@ def test_inference_ctc_robust_batched(self): ] self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) - @unittest.skipIf(ms.get_context('device_target') != "CPU", "cannot make deterministic on GPU") def test_inference_integration(self): model = Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-base") feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base") @@ -1290,11 +1313,13 @@ def test_inference_integration(self): model.config.mask_time_length, min_masks=2, ) - mask_time_indices = Tensor.from_numpy(mask_time_indices) - outputs = model( - inputs_dict.input_values, - mask_time_indices=mask_time_indices, - ) + mask_time_indices = ops.from_numpy(mask_time_indices) + + with no_grad(): + outputs = model( + inputs_dict.input_values, + mask_time_indices=mask_time_indices, + ) # compute cosine similarity cosine_sim = F.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1) @@ -1305,7 +1330,7 @@ def test_inference_integration(self): # cosine similarity of model is all > 0.5 as model is # pre-trained on contrastive loss # fmt: off - expected_cosine_sim_masked = ms.tensor([ + expected_cosine_sim_masked = mindspore.tensor([ 0.8523, 0.5860, 0.6905, 0.5557, 0.7456, 0.5249, 0.6639, 0.7654, 0.7565, 0.8167, 0.8222, 0.7960, 0.8034, 0.8166, 0.8310, 0.8263, 0.8274, 0.8258, 0.8179, 0.8412, 0.8536, 0.5098, 0.4728, 0.6461, 0.4498, 0.6002, 0.5774, @@ -1314,7 +1339,7 @@ def test_inference_integration(self): ]) # fmt: on - self.assertTrue(mnp.allclose(cosine_sim_masked, expected_cosine_sim_masked, atol=1e-3)) + self.assertTrue(ops.allclose(cosine_sim_masked, expected_cosine_sim_masked, atol=1e-3)) def test_inference_pretrained(self): model = Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-base") @@ -1330,18 +1355,21 @@ def test_inference_pretrained(self): features_shape = (batch_size, feature_seq_length) + mindspore.manual_seed(0) mask_time_indices = _compute_mask_indices( features_shape, model.config.mask_time_prob, model.config.mask_time_length, min_masks=2, ) - mask_time_indices = Tensor.from_numpy(mask_time_indices) - outputs = model( - inputs_dict.input_values, - attention_mask=inputs_dict.attention_mask, - mask_time_indices=mask_time_indices, - ) + mask_time_indices = ops.from_numpy(mask_time_indices) + + with no_grad(): + outputs = model( + inputs_dict.input_values, + attention_mask=inputs_dict.attention_mask, + mask_time_indices=mask_time_indices, + ) # compute cosine similarity cosine_sim = 
F.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1) @@ -1352,15 +1380,19 @@ def test_inference_pretrained(self): # ... now compare to randomly initialized model config = Wav2Vec2Config.from_pretrained("facebook/wav2vec2-base") - model_rand = Wav2Vec2ForPreTraining(config) - outputs_rand = model_rand( - inputs_dict.input_values, - attention_mask=inputs_dict.attention_mask, - mask_time_indices=mask_time_indices, - ) + model_rand = Wav2Vec2ForPreTraining(config).eval() + + with no_grad(): + outputs_rand = model_rand( + inputs_dict.input_values, + attention_mask=inputs_dict.attention_mask, + mask_time_indices=mask_time_indices, + ) # compute cosine similarity - cosine_sim_rand = F.cosine_similarity(outputs_rand.projected_states, outputs_rand.projected_quantized_states, dim=-1) + cosine_sim_rand = F.cosine_similarity( + outputs_rand.projected_states, outputs_rand.projected_quantized_states, dim=-1 + ) # retrieve cosine sim of masked features cosine_sim_masked_rand = cosine_sim_rand[mask_time_indices] @@ -1371,7 +1403,6 @@ def test_inference_pretrained(self): # => the cosine similarity between quantized states and predicted states is very likely < 0.1 self.assertTrue(cosine_sim_masked.mean().item() - 5 * cosine_sim_masked_rand.mean().item() > 0) - @unittest.skipIf(ms.get_context('device_target') != "CPU", "cannot make deterministic on GPU") def test_loss_pretraining(self): model = Wav2Vec2ForPreTraining.from_pretrained( "facebook/wav2vec2-base", @@ -1380,7 +1411,7 @@ def test_loss_pretraining(self): hidden_dropout=0.0, layerdrop=0.0, ) - model.set_train(True) + model.train() feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( "facebook/wav2vec2-base", return_attention_mask=True @@ -1394,7 +1425,7 @@ def test_loss_pretraining(self): features_shape = (batch_size, feature_seq_length) - ms.set_seed(0) + mindspore.manual_seed(0) np.random.seed(0) mask_time_indices = _compute_mask_indices( @@ -1407,14 +1438,16 @@ def test_loss_pretraining(self): mask_time_indices.shape, model.config.num_negatives, mask_time_indices ) - mask_time_indices = Tensor.from_numpy(mask_time_indices) - sampled_negative_indices = Tensor.from_numpy(sampled_negative_indices) - outputs = model( - inputs_dict.input_values, - attention_mask=inputs_dict.attention_mask, - mask_time_indices=mask_time_indices, - sampled_negative_indices=sampled_negative_indices, - ) + mask_time_indices = ops.from_numpy(mask_time_indices) + sampled_negative_indices = ops.from_numpy(sampled_negative_indices) + + with no_grad(): + outputs = model( + inputs_dict.input_values, + attention_mask=inputs_dict.attention_mask, + mask_time_indices=mask_time_indices, + sampled_negative_indices=sampled_negative_indices, + ) # check diversity loss num_codevectors = model.config.num_codevectors_per_group * model.config.num_codevector_groups @@ -1424,8 +1457,7 @@ def test_loss_pretraining(self): # check overall loss (contrastive loss + diversity loss) expected_loss = 116.7094 - # NOTE: Mindspore's gumbel_softmax differs in implementation detail - #self.assertTrue(abs(outputs.loss.item() - expected_loss) < 1e-3) + self.assertTrue(abs(outputs.loss.item() - expected_loss) < 1e-3) def test_inference_keyword_spotting(self): model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-ks") @@ -1435,15 +1467,16 @@ def test_inference_keyword_spotting(self): input_values = inputs.input_values attention_mask = inputs.attention_mask - outputs = model(input_values, attention_mask=attention_mask) - 
predicted_logits, predicted_ids = F.max(outputs.logits, axis=-1) + with no_grad(): + outputs = model(input_values, attention_mask=attention_mask) + predicted_logits, predicted_ids = ops.max(outputs.logits, dim=-1) expected_labels = [7, 6, 10, 9] # s3prl logits for the same batch - expected_logits = ms.tensor([6.1186, 11.8961, 10.2931, 6.0898]) + expected_logits = mindspore.tensor([6.1186, 11.8961, 10.2931, 6.0898]) self.assertListEqual(predicted_ids.tolist(), expected_labels) - self.assertTrue(mnp.allclose(predicted_logits, expected_logits, atol=1e-2)) + self.assertTrue(ops.allclose(predicted_logits, expected_logits, atol=1e-2)) def test_inference_intent_classification(self): model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-ic") @@ -1453,26 +1486,27 @@ def test_inference_intent_classification(self): input_values = inputs.input_values attention_mask = inputs.attention_mask - outputs = model(input_values, attention_mask=attention_mask) + with no_grad(): + outputs = model(input_values, attention_mask=attention_mask) - predicted_logits_action, predicted_ids_action = F.max(outputs.logits[:, :6], axis=-1) - predicted_logits_object, predicted_ids_object = F.max(outputs.logits[:, 6:20], axis=-1) - predicted_logits_location, predicted_ids_location = F.max(outputs.logits[:, 20:24], axis=-1) + predicted_logits_action, predicted_ids_action = ops.max(outputs.logits[:, :6], dim=-1) + predicted_logits_object, predicted_ids_object = ops.max(outputs.logits[:, 6:20], dim=-1) + predicted_logits_location, predicted_ids_location = ops.max(outputs.logits[:, 20:24], dim=-1) expected_labels_action = [0, 0, 2, 3] - expected_logits_action = ms.tensor([0.4568, 11.0848, 1.6621, 9.3841]) + expected_logits_action = mindspore.tensor([0.4568, 11.0848, 1.6621, 9.3841]) expected_labels_object = [3, 10, 3, 4] - expected_logits_object = ms.tensor([1.5322, 10.7094, 5.2469, 22.1318]) + expected_logits_object = mindspore.tensor([1.5322, 10.7094, 5.2469, 22.1318]) expected_labels_location = [0, 0, 0, 1] - expected_logits_location = ms.tensor([1.5335, 6.5096, 10.5704, 11.0569]) + expected_logits_location = mindspore.tensor([1.5335, 6.5096, 10.5704, 11.0569]) self.assertListEqual(predicted_ids_action.tolist(), expected_labels_action) self.assertListEqual(predicted_ids_object.tolist(), expected_labels_object) self.assertListEqual(predicted_ids_location.tolist(), expected_labels_location) - self.assertTrue(mnp.allclose(predicted_logits_action, expected_logits_action, atol=1e-2)) - self.assertTrue(mnp.allclose(predicted_logits_object, expected_logits_object, atol=1e-2)) - self.assertTrue(mnp.allclose(predicted_logits_location, expected_logits_location, atol=1e-2)) + self.assertTrue(ops.allclose(predicted_logits_action, expected_logits_action, atol=1e-2)) + self.assertTrue(ops.allclose(predicted_logits_object, expected_logits_object, atol=1e-2)) + self.assertTrue(ops.allclose(predicted_logits_location, expected_logits_location, atol=1e-2)) def test_inference_speaker_identification(self): model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-sid") @@ -1480,19 +1514,20 @@ def test_inference_speaker_identification(self): input_data = self._load_superb("si", 4) output_logits = [] - for example in input_data["speech"]: - input = processor(example, return_tensors="ms", padding=True) - output = model(input.input_values, attention_mask=None) - output_logits.append(output.logits[0]) - output_logits = F.stack(output_logits) - predicted_logits, predicted_ids = 
F.max(output_logits, axis=-1) + with no_grad(): + for example in input_data["speech"]: + input = processor(example, return_tensors="ms", padding=True) + output = model(input.input_values, attention_mask=None) + output_logits.append(output.logits[0]) + output_logits = ops.stack(output_logits) + predicted_logits, predicted_ids = ops.max(output_logits, dim=-1) expected_labels = [251, 1, 1, 3] # s3prl logits for the same batch - expected_logits = ms.tensor([37.5627, 71.6362, 64.2419, 31.7778]) + expected_logits = mindspore.tensor([37.5627, 71.6362, 64.2419, 31.7778]) self.assertListEqual(predicted_ids.tolist(), expected_labels) - self.assertTrue(mnp.allclose(predicted_logits, expected_logits, atol=1e-2)) + self.assertTrue(ops.allclose(predicted_logits, expected_logits, atol=1e-2)) def test_inference_emotion_recognition(self): model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-er") @@ -1502,29 +1537,32 @@ def test_inference_emotion_recognition(self): input_values = inputs.input_values attention_mask = inputs.attention_mask - outputs = model(input_values, attention_mask=attention_mask) - predicted_logits, predicted_ids = F.max(outputs.logits, axis=-1) + with no_grad(): + outputs = model(input_values, attention_mask=attention_mask) + predicted_logits, predicted_ids = ops.max(outputs.logits, dim=-1) expected_labels = [1, 1, 2, 2] # s3prl logits for the same batch - expected_logits = ms.tensor([2.1722, 3.0779, 8.0287, 6.6797]) + expected_logits = mindspore.tensor([2.1722, 3.0779, 8.0287, 6.6797]) self.assertListEqual(predicted_ids.tolist(), expected_labels) - self.assertTrue(mnp.allclose(predicted_logits, expected_logits, atol=1e-2)) + self.assertTrue(ops.allclose(predicted_logits, expected_logits, atol=1e-2)) - @unittest.skip("espeak not available on Windows") def test_phoneme_recognition(self): model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft") processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft") input_speech = self._load_datasamples(4) + inputs = processor(input_speech, return_tensors="ms", padding=True) input_values = inputs.input_values attention_mask = inputs.attention_mask - logits = model(input_values, attention_mask=attention_mask).logits - predicted_ids = F.argmax(logits, dim=-1) + with no_grad(): + logits = model(input_values, attention_mask=attention_mask).logits + + predicted_ids = ops.argmax(logits, dim=-1) predicted_trans = processor.batch_decode(predicted_ids) EXPECTED_TRANSCRIPTIONS = [ @@ -1546,44 +1584,50 @@ def test_phoneme_recognition(self): self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) @require_pyctcdecode - @require_librosa def test_wav2vec2_with_lm(self): - ds = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True) + ds = load_dataset( + "mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True + ) sample = next(iter(ds)) - resampled_audio = L.resample( - ms.tensor(sample["audio"]["array"]).numpy(), orig_sr=48_000, target_sr=16_000 - ) + resampled_audio = torchaudio.functional.resample( + mindspore.tensor(sample["audio"]["array"]), 48_000, 16_000 + ).numpy() model = Wav2Vec2ForCTC.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm") processor = Wav2Vec2ProcessorWithLM.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm") input_values = processor(resampled_audio, return_tensors="ms").input_values - logits = 
model(input_values).logits - transcription = processor.batch_decode(logits).text + with no_grad(): + logits = model(input_values).logits + + transcription = processor.batch_decode(logits.asnumpy()).text self.assertEqual(transcription[0], "habitan aguas poco profundas y rocosas") @require_pyctcdecode - @require_librosa def test_wav2vec2_with_lm_pool(self): - ds = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True) + ds = load_dataset( + "mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True + ) sample = next(iter(ds)) - resampled_audio = L.resample( - ms.tensor(sample["audio"]["array"]).numpy(), orig_sr=48_000, target_sr=16_000 - ) + resampled_audio = torchaudio.functional.resample( + mindspore.tensor(sample["audio"]["array"]), 48_000, 16_000 + ).numpy() model = Wav2Vec2ForCTC.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm") processor = Wav2Vec2ProcessorWithLM.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm") input_values = processor(resampled_audio, return_tensors="ms").input_values - logits = model(input_values).logits + + with no_grad(): + logits = model(input_values).logits # test user-managed pool with multiprocessing.get_context("fork").Pool(2) as pool: - transcription = processor.batch_decode(logits, pool).text + transcription = processor.batch_decode(logits.asnumpy(), pool).text self.assertEqual(transcription[0], "habitan aguas poco profundas y rocosas") @@ -1591,7 +1635,7 @@ def test_wav2vec2_with_lm_pool(self): with CaptureLogger(processing_wav2vec2_with_lm.logger) as cl, multiprocessing.get_context("fork").Pool( 2 ) as pool: - transcription = processor.batch_decode(logits, pool, num_processes=2).text + transcription = processor.batch_decode(logits.asnumpy(), pool, num_processes=2).text self.assertIn("num_process", cl.out) self.assertIn("it will be ignored", cl.out) @@ -1599,7 +1643,6 @@ def test_wav2vec2_with_lm_pool(self): self.assertEqual(transcription[0], "habitan aguas poco profundas y rocosas") @require_pyctcdecode - @require_librosa def test_wav2vec2_with_lm_invalid_pool(self): run_test_in_subprocess(test_case=self, target_func=_test_wav2vec2_with_lm_invalid_pool, inputs=None) @@ -1611,12 +1654,13 @@ def test_inference_diarization(self): input_values = inputs.input_values attention_mask = inputs.attention_mask - outputs = model(input_values, attention_mask=attention_mask) + with no_grad(): + outputs = model(input_values, attention_mask=attention_mask) # labels is a one-hot array of shape (num_frames, num_speakers) labels = (outputs.logits > 0).long() # s3prl logits for the same batch - expected_logits = ms.tensor( + expected_logits = mindspore.tensor( [ [[-5.2807, -5.1272], [-5.4059, -4.7757], [-5.2764, -4.9621], [-5.0117, -4.5851]], [[-1.7643, -0.5462], [-1.7369, -0.2649], [-1.5066, -0.6200], [-4.5703, -2.4863]], @@ -1626,7 +1670,7 @@ def test_inference_diarization(self): ) self.assertEqual(labels[0, :, 0].sum(), 555) self.assertEqual(labels[0, :, 1].sum(), 299) - self.assertTrue(mnp.allclose(outputs.logits[:, :4], expected_logits, atol=1e-2)) + self.assertTrue(ops.allclose(outputs.logits[:, :4], expected_logits, atol=1e-2)) def test_inference_speaker_verification(self): model = Wav2Vec2ForXVector.from_pretrained("anton-l/wav2vec2-base-superb-sv") @@ -1634,14 +1678,15 @@ def test_inference_speaker_verification(self): input_data = self._load_superb("si", 4) inputs = processor(input_data["speech"], return_tensors="ms", 
padding=True, sampling_rate=16_000) - labels = ms.tensor([5, 1, 1, 3]).T + labels = mindspore.tensor([5, 1, 1, 3]).T - input_values = inputs.input_values - attention_mask = inputs.attention_mask - outputs = model(input_values, attention_mask=attention_mask, labels=labels) - embeddings = outputs.embeddings / F.norm(outputs.embeddings, dim=-1, keepdim=True) + with no_grad(): + input_values = inputs.input_values + attention_mask = inputs.attention_mask + outputs = model(input_values, attention_mask=attention_mask, labels=labels) + embeddings = nn.functional.normalize(outputs.embeddings, dim=-1) - cosine_sim = lambda x, y: F.cosine_similarity(x, y, dim=-1) + cosine_sim = nn.CosineSimilarity(dim=-1) # id10002 vs id10002 self.assertAlmostEqual(cosine_sim(embeddings[1], embeddings[2]).numpy(), 0.9758, 3) # id10006 vs id10002 @@ -1651,8 +1696,6 @@ def test_inference_speaker_verification(self): self.assertAlmostEqual(outputs.loss.item(), 17.7963, 2) - @unittest.skip('no torch support') - @require_librosa def test_inference_mms_1b_all(self): model = Wav2Vec2ForCTC.from_pretrained("facebook/mms-1b-all") processor = Wav2Vec2Processor.from_pretrained("facebook/mms-1b-all") @@ -1660,7 +1703,9 @@ def test_inference_mms_1b_all(self): LANG_MAP = {"it": "ita", "es": "spa", "fr": "fra", "en": "eng"} def run_model(lang): - ds = load_dataset("mozilla-foundation/common_voice_11_0", lang, split="test", streaming=True, trust_remote_code=True) + ds = load_dataset( + "mozilla-foundation/common_voice_11_0", lang, split="test", streaming=True, trust_remote_code=True + ) sample = next(iter(ds)) wav2vec2_lang = LANG_MAP[lang] @@ -1668,15 +1713,18 @@ def run_model(lang): model.load_adapter(wav2vec2_lang) processor.tokenizer.set_target_lang(wav2vec2_lang) - resampled_audio = L.resample( - ms.tensor(sample["audio"]["array"]).numpy(), orig_sr=48_000, target_sr=16_000 - ) + resampled_audio = torchaudio.functional.resample( + mindspore.tensor(sample["audio"]["array"]), 48_000, 16_000 + ).numpy() inputs = processor(resampled_audio, sampling_rate=16_000, return_tensors="ms") input_values = inputs.input_values attention_mask = inputs.attention_mask - outputs = model(input_values, attention_mask=attention_mask).logits - ids = F.argmax(outputs, dim=-1)[0] + + with no_grad(): + outputs = model(input_values, attention_mask=attention_mask).logits + + ids = ops.argmax(outputs, dim=-1)[0] transcription = processor.decode(ids) return transcription @@ -1688,5 +1736,5 @@ def run_model(lang): "en": "joe keton disapproved of films and buster also had reservations about the media", } - for lang in LANG_MAP: + for lang in LANG_MAP.keys(): assert run_model(lang) == TRANSCRIPTIONS[lang] diff --git a/tests/ut/transformers/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py b/tests/ut/transformers/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py index 1e74b4809..a8164e97f 100644 --- a/tests/ut/transformers/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py +++ b/tests/ut/transformers/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py @@ -21,7 +21,6 @@ from datasets import load_dataset import numpy as np -from mindspore import ops from mindspore import Tensor from mindnlp.transformers import Wav2Vec2BertConfig from mindnlp.utils.testing_utils import ( @@ -43,6 +42,7 @@ if is_mindspore_available(): import mindspore + from mindnlp.core import ops from mindnlp.transformers import ( AutoFeatureExtractor, @@ -779,7 +779,7 @@ def test_sample_negatives(self): features = (ops.arange(sequence_length * hidden_size) // hidden_size).view( 
sequence_length, hidden_size ) # each value in vector consits of same value - features = features[None, :].expand(batch_size, sequence_length, hidden_size) + features = features[None, :].broadcast_to((batch_size, sequence_length, hidden_size)) # sample negative indices sampled_negative_indices = _sample_negative_indices((batch_size, sequence_length), num_negatives, None) @@ -814,16 +814,16 @@ def test_sample_negatives_with_mask(self): features = (ops.arange(sequence_length * hidden_size) // hidden_size).view( sequence_length, hidden_size ) # each value in vector consits of same value - features = features[None, :].expand(batch_size, sequence_length, hidden_size) + features = features[None, :].broadcast_to((batch_size, sequence_length, hidden_size)) # replace masked feature vectors with -100 to test that those are not sampled - features = ops.where(mask[:, :, None].expand(features.shape).bool(), features, -100) + features = ops.where(mask[:, :, None].broadcast_to(features.shape).bool(), features, -100) # sample negative indices sampled_negative_indices = _sample_negative_indices( - (batch_size, sequence_length), num_negatives, mask + (batch_size, sequence_length), num_negatives, mask.asnumpy() ) - sampled_negative_indices = Tensor.from_numpy(sampled_negative_indices) + sampled_negative_indices = ops.from_numpy(sampled_negative_indices) negatives = features.view(-1, hidden_size)[sampled_negative_indices.long().view(-1)] negatives = negatives.view(batch_size, sequence_length, -1, hidden_size).permute(2, 0, 1, 3) @@ -836,12 +836,7 @@ def test_sample_negatives_with_mask(self): self.assertTrue(((negative - features) == 0).sum() == 0.0) # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim - #self.assertTrue(negatives.unique(dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) - self.assertEqual(negatives.shape[:-1], (num_negatives, batch_size, sequence_length)) - ref = negatives[:, :, :, 0] - for i in range(1, negatives.shape[-1]): - x = negatives[:, :, :, i] - self.assertTrue(ops.all(ref == x)) + self.assertEqual(ops.unique(negatives, dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) @require_mindspore @slow diff --git a/tests/ut/transformers/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py b/tests/ut/transformers/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py index 5a9f87db7..a13f7d8cc 100644 --- a/tests/ut/transformers/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py +++ b/tests/ut/transformers/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py @@ -19,7 +19,7 @@ import unittest import mindspore -from mindspore import ops, Tensor +from mindspore import Tensor import numpy as np from datasets import load_dataset @@ -42,6 +42,7 @@ if is_mindspore_available(): import mindspore + from mindnlp.core import ops from mindnlp.transformers import ( Wav2Vec2ConformerForAudioFrameClassification, @@ -519,65 +520,6 @@ def test_resize_tokens_embeddings(self): def test_model_get_set_embeddings(self): pass - #@is_pt_flax_cross_test - # non-robust architecture does not exist in Flax - def test_equivalence_flax_to_pt(self): - pass - - #@is_pt_flax_cross_test - # non-robust architecture does not exist in Flax - def test_equivalence_pt_to_flax(self): - pass - - @unittest.skip('delated in wav2vec2') - def test_retain_grad_hidden_states_attentions(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - -
config.output_hidden_states = True - config.output_attentions = True - - # no need to test all models as different heads yield the same functionality - model_class = self.all_model_classes[0] - model = model_class(config) - - - # set layer drop to 0 - model.config.layerdrop = 0.0 - - input_values = inputs_dict["input_values"] - attention_mask_ = inputs_dict["attention_mask"] - - - tmp = ops.ones_like(attention_mask_,dtype = mindspore.int64) - inputs_dict["attention_mask"] = tmp - - outputs = model(**inputs_dict) - - input_lengths = Tensor( - [input_values.shape[1] for _ in range(input_values.shape[0])], dtype=mindspore.int64 - ) - # output_lengths = model._get_feat_extract_output_lengths(input_lengths) - # labels = ids_tensor([input_values.shape[0], output_lengths[0] - 2], self.model_tester.vocab_size) - - # inputs_dict["labels"] = labels - - # print(inputs_dict) - - output = outputs[0] - - # Encoder-/Decoder-only models - hidden_states = outputs.hidden_states[0] - attentions = outputs.attentions[0] - - grad_fn = ops.GradOperation(get_by_list=True) - hidden_states.retain_grad() - attentions.retain_grad() - - output.flatten()[0].backward(retain_graph=True) - - self.assertIsNotNone(hidden_states.grad) - self.assertIsNotNone(attentions.grad) - def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -788,8 +730,8 @@ def test_compute_mask_indices_short_audio(self): (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask, min_masks=2 ) - mask_bool = ops.cast(Tensor(mask[0]), mindspore.bool_) - attention_mask_bool = ops.cast(attention_mask[0], mindspore.bool_) + mask_bool = Tensor(mask[0], mindspore.bool_) + attention_mask_bool = attention_mask[0].to(mindspore.bool_) # make sure that non-padded examples cannot be padded self.assertFalse(mask_bool[attention_mask_bool].any()) @@ -815,7 +757,7 @@ def test_sample_negatives(self): features = (ops.arange(sequence_length * hidden_size) // hidden_size).view( sequence_length, hidden_size ) # each value in vector consits of same value - features = features[None, :].expand(batch_size, sequence_length, hidden_size).contiguous() + features = features[None, :].broadcast_to((batch_size, sequence_length, hidden_size)) # sample negative indices sampled_negative_indices = _sample_negative_indices((batch_size, sequence_length), num_negatives, None) @@ -836,7 +778,7 @@ def test_sample_negatives(self): # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim self.assertTrue(unique_negatives_tensor.shape, (num_negatives, batch_size, sequence_length, 1)) - def test_sample_negatives_with_mask(self): #TODO + def test_sample_negatives_with_mask(self): batch_size = 2 sequence_length = 10 hidden_size = 4 @@ -846,22 +788,19 @@ def test_sample_negatives_with_mask(self): #TODO mask = ops.ones((batch_size, sequence_length), dtype=mindspore.int64) mask[-1, sequence_length // 2 :] = 0 - sequence = ops.div( - ops.arange(sequence_length * hidden_size), - hidden_size, - rounding_mode="floor" - ) - features = sequence.view(sequence_length, hidden_size) # each value in vector consits of same value - features = features[None, :].expand(batch_size, sequence_length, hidden_size) + features = (ops.arange(sequence_length * hidden_size) // hidden_size).view( + sequence_length, hidden_size + ) # each value in vector consits of same value + features = features[None, :].broadcast_to((batch_size, sequence_length, hidden_size)) 
# replace masked feature vectors with -100 to test that those are not sampled - features = ops.where(mask[:, :, None].expand(features.shape).bool(), features, -100) + features = ops.where(mask[:, :, None].broadcast_to(features.shape).bool(), features, -100) # sample negative indices sampled_negative_indices = _sample_negative_indices( - (batch_size, sequence_length), num_negatives, mask + (batch_size, sequence_length), num_negatives, mask.asnumpy() ) - sampled_negative_indices = Tensor.from_numpy(sampled_negative_indices) + sampled_negative_indices = ops.from_numpy(sampled_negative_indices) negatives = features.view(-1, hidden_size)[sampled_negative_indices.long().view(-1)] negatives = negatives.view(batch_size, sequence_length, -1, hidden_size).permute(2, 0, 1, 3) @@ -874,14 +813,7 @@ def test_sample_negatives_with_mask(self): #TODO self.assertTrue(((negative - features) == 0).sum() == 0.0) # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim - # self.assertTrue(check_unique_values(negatives,dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) - # copy from wav2vec2 - self.assertEqual(negatives.shape[:-1], (num_negatives, batch_size, sequence_length)) - ref = negatives[:, :, :, 0] - for i in range(1, negatives.shape[-1]): - print('i = ', i) - x = negatives[:, :, :, i] - self.assertTrue(ops.all(ref == x)) + self.assertEqual(ops.unique(negatives, dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) @require_mindspore @slow diff --git a/tests/ut/transformers/models/wavlm/test_modeling_wavlm.py b/tests/ut/transformers/models/wavlm/test_modeling_wavlm.py index 4e9c14edf..3e3beeda2 100644 --- a/tests/ut/transformers/models/wavlm/test_modeling_wavlm.py +++ b/tests/ut/transformers/models/wavlm/test_modeling_wavlm.py @@ -38,7 +38,7 @@ if is_mindspore_available(): import mindspore - from mindspore import ops + from mindnlp.core import ops from mindnlp.transformers import ( Wav2Vec2FeatureExtractor, @@ -210,10 +210,10 @@ def check_ctc_loss(self, config, input_values, *args): attention_mask[i, input_lengths[i] :] = 0 model.config.ctc_loss_reduction = "sum" - sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss[0].item() + sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() model.config.ctc_loss_reduction = "mean" - mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss[0].item() + mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() self.parent.assertTrue(isinstance(sum_loss, float)) self.parent.assertTrue(isinstance(mean_loss, float)) @@ -267,7 +267,7 @@ def check_ctc_training(self, config, input_values, *args): # one shorter than logit lengths to prevent -inf labels[i, max_length_labels[i] - 1 :] = -100 - loss = model(input_values, labels=labels).loss[0] + loss = model(input_values, labels=labels).loss self.parent.assertFalse(ops.isinf(loss).item()) @@ -441,8 +441,7 @@ def test_initialization(self): configs_no_init = _config_zero_init(config) for model_class in self.all_model_classes: model = model_class(config=configs_no_init) - for param in model.get_parameters(): - name=param.name + for name, param in model.named_parameters(): uniform_init_parms = [ "conv.weight", "conv.parametrizations.weight", @@ -462,12 +461,12 @@ if param.requires_grad: if any(x in name for x in uniform_init_parms): self.assertTrue( - -1.0 <=
((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, + -1.0 <= ((param.mean() * 1e9).round() / 1e9).item() <= 1.0, msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) else: self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), + ((param.mean() * 1e9).round() / 1e9).item(), [0.0, 1.0], msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) @@ -475,17 +474,17 @@ def test_initialization(self): # overwrite from test_modeling_common def _mock_init_weights(self, module): if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(3) + module.weight.fill_(3) if hasattr(module, "weight_g") and module.weight_g is not None: - module.weight_g.data.fill_(3) + module.weight_g.fill_(3) if hasattr(module, "weight_v") and module.weight_v is not None: - module.weight_v.data.fill_(3) + module.weight_v.fill_(3) if hasattr(module, "bias") and module.bias is not None: - module.bias.data.fill_(3) + module.bias.fill_(3) if hasattr(module, "codevectors") and module.codevectors is not None: - module.codevectors.data.fill_(3) + module.codevectors.fill_(3) if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: - module.masked_spec_embed.data.fill_(3) + module.masked_spec_embed.fill_(3) @unittest.skip(reason="Feed forward chunking is not implemented for WavLM") def test_feed_forward_chunking(self):