Commit f9ba58d

add dia

1 parent: 8f6e9c9

14 files changed, +2355 −1 lines


mindone/transformers/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -429,6 +429,7 @@
 )
 from .models.depth_anything import DepthAnythingForDepthEstimation, DepthAnythingPreTrainedModel
 from .models.depth_pro import DepthProForDepthEstimation, DepthProImageProcessor, DepthProModel, DepthProPreTrainedModel
+from .models.dia import DiaForConditionalGeneration, DiaModel, DiaPreTrainedModel, DiaProcessor
 from .models.diffllama import (
     DiffLlamaForCausalLM,
     DiffLlamaForQuestionAnswering,
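
With this re-export in place, the four new Dia classes should be importable straight from the package root (a minimal sketch; it assumes nothing else in the package guards this import):

from mindone.transformers import DiaForConditionalGeneration, DiaModel, DiaPreTrainedModel, DiaProcessor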

mindone/transformers/generation/logits_process.py

Lines changed: 1 addition & 1 deletion
@@ -3197,7 +3197,7 @@ def __call__(self, input_ids: ms.Tensor, scores: ms.Tensor) -> ms.Tensor:
         # Create top k based on the combined CFG output
         _, top_k_indices = mint.topk(scores_processed, k=self.guidance_top_k, dim=-1)
         top_k_mask = mint.ones_like(scores_processed, dtype=ms.bool_)
-        top_k_mask = top_k_mask.scatter(dim=-1, index=top_k_indices, value=False)
+        top_k_mask = top_k_mask.scatter_(dim=-1, index=top_k_indices, value=False)
         # Only return conditioned logits with top k
         scores_processed = cond_logits.masked_fill(top_k_mask, -float("inf"))
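
The one-line fix swaps the out-of-place `.scatter` for the in-place `.scatter_` when clearing the top-k positions of the boolean mask (`mint` is MindSpore's torch-like functional namespace). A minimal standalone sketch of the resulting masking pattern; the tensor values and `k` below are made up for illustration, not taken from the source:

import mindspore as ms
from mindspore import mint

scores = ms.Tensor([[0.1, 0.9, 0.3, 0.7]], ms.float32)  # synthetic logits
_, top_k_indices = mint.topk(scores, k=2, dim=-1)
# start from an all-True mask, then clear the top-k slots in place
top_k_mask = mint.ones_like(scores, dtype=ms.bool_)
top_k_mask = top_k_mask.scatter_(dim=-1, index=top_k_indices, value=False)
# everything outside the top k is pushed to -inf
masked = scores.masked_fill(top_k_mask, -float("inf"))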

mindone/transformers/models/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -61,6 +61,7 @@
     deprecated,
     depth_anything,
     depth_pro,
+    dia,
     diffllama,
     dinov2,
     dinov2_with_registers,

mindone/transformers/models/auto/configuration_auto.py

Lines changed: 2 additions & 0 deletions
@@ -83,6 +83,7 @@
         ("depth_anything", "DepthAnythingConfig"),
         ("depth_pro", "DepthProConfig"),
         ("detr", "DetrConfig"),
+        ("dia", "DiaConfig"),
         ("diffllama", "DiffLlamaConfig"),
         ("dinov2", "Dinov2Config"),
         ("dinov2_with_registers", "Dinov2WithRegistersConfig"),
@@ -352,6 +353,7 @@
         ("depth_anything", "Depth Anything"),
         ("depth_pro", "DepthPro"),
         ("detr", "DETR"),
+        ("dia", "Dia"),
         ("diffllama", "DiffLlama"),
         ("dinov2", "DINOv2"),
         ("dinov2_with_registers", "DINOv2 with Registers"),

mindone/transformers/models/auto/feature_extraction_auto.py

Lines changed: 1 addition & 0 deletions
@@ -46,6 +46,7 @@
         ("chinese_clip", "ChineseCLIPFeatureExtractor"),
         ("convnext", "ConvNextFeatureExtractor"),
         ("cvt", "ConvNextFeatureExtractor"),
+        ("dia", "DiaFeatureExtractor"),
         ("flava", "FlavaFeatureExtractor"),
         ("seamless_m4t", "SeamlessM4TFeatureExtractor"),
     ]

mindone/transformers/models/auto/modeling_auto.py

Lines changed: 19 additions & 0 deletions
@@ -81,6 +81,7 @@
         ("deit", "DeiTModel"),
         ("depth_pro", "DepthProModel"),
         ("detr", "DetrModel"),
+        ("dia", "DiaModel"),
         ("diffllama", "DiffLlamaModel"),
         ("dinov2", "Dinov2Model"),
         ("dinov2_with_registers", "Dinov2WithRegistersModel"),
@@ -353,6 +354,7 @@
         ("data2vec-text", "Data2VecTextForMaskedLM"),
         ("deberta", "DebertaForMaskedLM"),
         ("deberta-v2", "DebertaV2ForMaskedLM"),
+        ("dia", "DiaForConditionalGeneration"),
         ("distilbert", "DistilBertForMaskedLM"),
         ("esm", "EsmForMaskedLM"),
         ("electra", "ElectraForMaskedLM"),
@@ -838,6 +840,7 @@

 MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = OrderedDict(
     [
+        ("dia", "DiaForConditionalGeneration"),
         ("moonshine", "MoonshineForConditionalGeneration"),
         ("pop2piano", "Pop2PianoForConditionalGeneration"),
         ("seamless_m4t", "SeamlessM4TForSpeechToText"),
@@ -1274,6 +1277,12 @@
     ]
 )

+MODEL_FOR_AUDIO_TOKENIZATION_NAMES = OrderedDict(
+    [
+        ("dac", "DacModel"),
+    ]
+)
+

 if version.parse(transformers.__version__) >= version.parse("4.51.0"):
     MODEL_MAPPING_NAMES.update({"qwen3_moe": "Qwen3MoeModel"})
@@ -1424,6 +1433,7 @@

 MODEL_FOR_IMAGE_TO_IMAGE_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES)

+MODEL_FOR_AUDIO_TOKENIZATION_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_AUDIO_TOKENIZATION_NAMES)

 class AutoModelForMaskGeneration(_BaseAutoModelClass):
     _model_mapping = MODEL_FOR_MASK_GENERATION_MAPPING
@@ -1708,6 +1718,15 @@ class AutoModelForMaskedImageModeling(_BaseAutoModelClass):
 AutoModelForMaskedImageModeling = auto_class_update(AutoModelForMaskedImageModeling, head_doc="masked image modeling")


+class AutoModelForAudioTokenization(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_AUDIO_TOKENIZATION_MAPPING
+
+
+AutoModelForAudioTokenization = auto_class_update(
+    AutoModelForAudioTokenization, head_doc="audio tokenization through codebooks"
+)
+
+
 class AutoModelWithLMHead(_AutoModelWithLMHead):
     @classmethod
     def from_config(cls, config):
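
The new `AutoModelForAudioTokenization` head currently maps only `dac`, the neural codec Dia uses to turn waveforms into discrete codebook tokens. A minimal usage sketch; the fully qualified import is what the diff guarantees, and the checkpoint id is illustrative rather than taken from this commit:

from mindone.transformers.models.auto.modeling_auto import AutoModelForAudioTokenization

# resolves to DacModel via MODEL_FOR_AUDIO_TOKENIZATION_NAMES above
codec = AutoModelForAudioTokenization.from_pretrained("descript/dac_44khz")  # illustrative checkpoint id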

mindone/transformers/models/auto/processing_auto.py

Lines changed: 1 addition & 0 deletions
@@ -55,6 +55,7 @@
         ("chameleon", "ChameleonProcessor"),
         ("chinese_clip", "ChineseCLIPProcessor"),
         ("colpali", "ColPaliProcessor"),
+        ("dia", "DiaProcessor"),
         ("flava", "FlavaProcessor"),
         ("idefics", "IdeficsProcessor"),
         ("instructblip", "InstructBlipProcessor"),
mindone/transformers/models/dia/__init__.py (filename missing from the page; inferred from the relative imports below)

Lines changed: 20 additions & 0 deletions (new file)

# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# This code is adapted from https://github.com/huggingface/transformers
# with modifications to run transformers on mindspore.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .feature_extraction_dia import *
from .generation_dia import *
from .modeling_dia import *
from .processing_dia import *
mindone/transformers/models/dia/feature_extraction_dia.py (filename missing from the page; inferred from the module docstring below)

Lines changed: 186 additions & 0 deletions (new file)
# coding=utf-8
# Copyright 2025 The Nari Labs and HuggingFace Inc. team. All rights reserved.
#
# This code is adapted from https://github.com/huggingface/transformers
# with modifications to run transformers on mindspore.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Feature extractor class for Dia"""

from typing import Optional, Union

import numpy as np

from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
from ...feature_extraction_utils import BatchFeature
from ...utils import PaddingStrategy, TensorType, logging


logger = logging.get_logger(__name__)

class DiaFeatureExtractor(SequenceFeatureExtractor):
    r"""
    Constructs a Dia feature extractor.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    Args:
        feature_size (`int`, *optional*, defaults to 1):
            The feature dimension of the extracted features. Use 1 for mono, 2 for stereo.
        sampling_rate (`int`, *optional*, defaults to 16000):
            The sampling rate at which the audio waveform should be digitized, expressed in hertz (Hz).
        padding_value (`float`, *optional*, defaults to 0.0):
            The value that is used for padding.
        hop_length (`int`, *optional*, defaults to 512):
            Overlap length between successive windows.
    """

    model_input_names = ["input_values", "n_quantizers"]

    def __init__(
        self,
        feature_size: int = 1,
        sampling_rate: int = 16000,
        padding_value: float = 0.0,
        hop_length: int = 512,
        **kwargs,
    ):
        super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
        self.hop_length = hop_length

    def __call__(
        self,
        raw_audio: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]],
        padding: Optional[Union[bool, str, PaddingStrategy]] = None,
        truncation: Optional[bool] = False,
        max_length: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        sampling_rate: Optional[int] = None,
    ) -> BatchFeature:
        """
        Main method to featurize and prepare for the model one or several sequence(s).

        Args:
            raw_audio (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
                The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of
                float values, a list of numpy arrays or a list of lists of float values. The numpy array must be of
                shape `(num_samples,)` for mono audio (`feature_size = 1`), or `(2, num_samples)` for stereo audio
                (`feature_size = 2`).
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:

                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence is provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'`: No padding (i.e., the output batch can contain sequences of different
                  lengths).
            truncation (`bool`, *optional*, defaults to `False`):
                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
            return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `'pt'`):
                If set, will return tensors instead of lists of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
            sampling_rate (`int`, *optional*):
                The sampling rate at which the `audio` input was sampled. It is strongly recommended to pass
                `sampling_rate` at the forward call to prevent silent errors.
        """
        if sampling_rate is not None:
            if sampling_rate != self.sampling_rate:
                raise ValueError(
                    f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
                    f" {self.sampling_rate}. Please make sure that the provided audio input was sampled with"
                    f" {self.sampling_rate} and not {sampling_rate}."
                )
        else:
            logger.warning(
                f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
                "Failing to do so can result in silent errors that might be hard to debug."
            )

        if padding and truncation:
            raise ValueError("Both padding and truncation were set. Make sure you only set one.")
        elif padding is None:
            # by default let's pad the inputs
            padding = True

        is_batched = bool(
            isinstance(raw_audio, (list, tuple)) and (isinstance(raw_audio[0], (np.ndarray, tuple, list)))
        )

        if is_batched:
            raw_audio = [np.asarray(audio, dtype=np.float32).T for audio in raw_audio]
        elif not is_batched and not isinstance(raw_audio, np.ndarray):
            raw_audio = np.asarray(raw_audio, dtype=np.float32)
        elif isinstance(raw_audio, np.ndarray) and raw_audio.dtype is np.dtype(np.float64):
            raw_audio = raw_audio.astype(np.float32)

        # always return batch
        if not is_batched:
            raw_audio = [np.asarray(raw_audio).T]

        # convert stereo to mono if necessary, unique to Dia
        for idx, example in enumerate(raw_audio):
            if self.feature_size == 2 and example.ndim == 2:
                raw_audio[idx] = np.mean(example, -1)

        # verify inputs are valid
        for idx, example in enumerate(raw_audio):
            if example.ndim > 2:
                raise ValueError(f"Expected input shape (channels, length) but got shape {example.shape}")
            if self.feature_size == 1 and example.ndim != 1:
                raise ValueError(f"Expected mono audio but example has {example.shape[-1]} channels")
            if self.feature_size == 2 and example.ndim != 1:  # note the conversion before
                raise ValueError(f"Expected stereo audio but example has {example.shape[-1]} channels")

        input_values = BatchFeature({"input_values": raw_audio})

        # temporarily treat it as if we were mono as we also convert stereo to mono
        original_feature_size = self.feature_size
        self.feature_size = 1

        # normal padding on batch
        padded_inputs = self.pad(
            input_values,
            max_length=max_length,
            truncation=truncation,
            padding=padding,
            return_attention_mask=True,
            pad_to_multiple_of=self.hop_length,
        )
        padded_inputs["padding_mask"] = padded_inputs.pop("attention_mask")

        input_values = []
        for example in padded_inputs.pop("input_values"):
            if self.feature_size == 1:
                example = example[..., None]
            input_values.append(example.T)

        padded_inputs["input_values"] = input_values
        if return_tensors is not None:
            padded_inputs = padded_inputs.convert_to_tensors(return_tensors)

        # restore the original feature size
        self.feature_size = original_feature_size

        return padded_inputs

__all__ = ["DiaFeatureExtractor"]
