
Commit d195771

feat: Add convert_single_tensor_to_hf API for state dict adapter (#759)
Signed-off-by: Hemil Desai <hemild@nvidia.com>
1 parent d8b3778 commit d195771

13 files changed: +982 -116 lines changed


nemo_automodel/components/checkpoint/state_dict_adapter.py

Lines changed: 15 additions & 0 deletions
@@ -58,3 +58,18 @@ def from_hf(
             The converted native model state dict
         """
         pass
+
+    @abstractmethod
+    def convert_single_tensor_to_hf(self, fqn: str, tensor: Any, **kwargs) -> list[tuple[str, Any]]:
+        """Convert a single tensor from native format to HuggingFace format.
+
+        Args:
+            fqn: Fully qualified name of the tensor in native format
+            tensor: The tensor to convert
+            **kwargs: Additional arguments for conversion
+
+        Returns:
+            List of (fqn, tensor) tuples in HuggingFace format.
+            Returns a list because some native tensors may split into multiple HF tensors.
+        """
+        pass
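
The abstract method establishes a per-tensor conversion contract: callers can convert and persist a checkpoint tensor-by-tensor instead of materializing the full HuggingFace state dict in memory. A minimal sketch of such a caller, assuming `adapter` is any concrete StateDictAdapter subclass and `save_fn` is a hypothetical per-tensor writer (e.g. a safetensors shard writer):

def stream_to_hf(adapter, state_dict, save_fn, **kwargs):
    for fqn, tensor in state_dict.items():
        # One native tensor may map to zero (excluded by regex), one, or
        # several HF tensors, hence the list return type.
        for hf_fqn, hf_tensor in adapter.convert_single_tensor_to_hf(fqn, tensor, **kwargs):
            save_fn(hf_fqn, hf_tensor)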

nemo_automodel/components/models/deepseek_v3/state_dict_adapter.py

Lines changed: 56 additions & 8 deletions
@@ -96,15 +96,15 @@ def to_hf(
         """Convert from native model state dict to HuggingFace format.
         Automatically detects format based on backend.enable_deepep configuration.
         """
-        hf_state_dict = self._to_hf_w_split_experts(state_dict)
+        hf_state_dict = {}
+        for fqn, tensor in state_dict.items():
+            converted_tensors = self.convert_single_tensor_to_hf(
+                fqn, tensor, exclude_key_regex=exclude_key_regex, quantization=quantization, **kwargs
+            )
+            for key, value in converted_tensors:
+                hf_state_dict[key] = value
 
-        if exclude_key_regex:
-            hf_state_dict = {k: v for k, v in hf_state_dict.items() if not re.match(exclude_key_regex, k)}
-
-        if quantization:
-            return self._add_quantization_scale_inv_tensors(hf_state_dict)
-        else:
-            return hf_state_dict
+        return hf_state_dict
 
     def from_hf(
         self,
@@ -124,6 +124,54 @@ def from_hf(
             hf_state_dict = self._dequantize(hf_state_dict)
         return self._from_hf_w_merged_experts(hf_state_dict, device_mesh)
 
+    def convert_single_tensor_to_hf(self, fqn: str, tensor: Any, **kwargs) -> list[tuple[str, Any]]:
+        """Convert a single tensor from native format to HuggingFace format.
+
+        Args:
+            fqn: Fully qualified name of the tensor in native format
+            tensor: The tensor to convert
+            **kwargs: Additional arguments for conversion
+
+        Returns:
+            List of (fqn, tensor) tuples in HuggingFace format
+        """
+        quantization = kwargs.get("quantization", False)
+        exclude_key_regex = kwargs.get("exclude_key_regex", None)
+
+        expert_result = self._convert_single_merged_expert_to_hf_split_experts(fqn, tensor, **kwargs)
+        if expert_result is not None:
+            result = expert_result
+        else:
+            result = [(fqn, tensor)]
+
+        if exclude_key_regex:
+            result = [(k, v) for k, v in result if not re.match(exclude_key_regex, k)]
+
+        if quantization:
+            quantized_result = []
+            for key, value in result:
+                if key.endswith(".weight") and not any(
+                    non_quantized_key in key
+                    for non_quantized_key in [
+                        "input_layernorm.weight",
+                        "post_attention_layernorm.weight",
+                        "norm.weight",
+                        "lm_head.weight",
+                        "embed_tokens.weight",
+                        "mlp.gate.weight",
+                    ]
+                ):
+                    value = value.to(dtype=torch.float8_e4m3fn)
+                    expected_scale_shape = calculate_scale_shape(value)
+                    weight_scale_inv = torch.ones(expected_scale_shape, dtype=torch.float32, device=value.device)
+                    quantized_result.append((key, value))
+                    quantized_result.append((key + "_scale_inv", weight_scale_inv))
+                else:
+                    quantized_result.append((key, value))
+            return quantized_result
+
+        return result
+
 
 def calculate_scale_shape(weight: torch.Tensor, BLOCK_SIZE: int = BLOCK_SIZE) -> torch.Size:
     # Calculate the scale tensor shape
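
The quantization branch above emits a placeholder weight_scale_inv of ones shaped by calculate_scale_shape. For block-wise FP8 quantization that shape is one float32 scale per BLOCK_SIZE x BLOCK_SIZE tile, i.e. a ceil-division of each weight dimension. A sketch of that shape math, assuming BLOCK_SIZE is 128 (the constant's value is not shown in this diff; DeepSeek-V3 FP8 checkpoints commonly use 128):

import math
import torch

BLOCK_SIZE = 128  # assumption; the real module constant is not visible in this diff

def scale_shape(weight: torch.Tensor, block_size: int = BLOCK_SIZE) -> torch.Size:
    # One scale entry per block_size x block_size tile, rounding up at the edges.
    rows, cols = weight.shape[-2], weight.shape[-1]
    return torch.Size((math.ceil(rows / block_size), math.ceil(cols / block_size)))

# e.g. a (7168, 2048) projection weight gets a (56, 16) weight_scale_inv tensor
assert scale_shape(torch.empty(7168, 2048)) == torch.Size((56, 16))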

nemo_automodel/components/models/glm4_moe/state_dict_adapter.py

Lines changed: 33 additions & 4 deletions
@@ -61,11 +61,14 @@ def __init__(
     def to_hf(
         self, state_dict: dict[str, Any], exclude_key_regex: Optional[str] = None, quantization: bool = False, **kwargs
     ) -> dict[str, Any]:
-        hf_state_dict = self._to_hf_w_split_experts(state_dict)
-        if exclude_key_regex:
-            import re
+        hf_state_dict = {}
+        for fqn, tensor in state_dict.items():
+            converted_tensors = self.convert_single_tensor_to_hf(
+                fqn, tensor, exclude_key_regex=exclude_key_regex, quantization=quantization, **kwargs
+            )
+            for key, value in converted_tensors:
+                hf_state_dict[key] = value
 
-            hf_state_dict = {k: v for k, v in hf_state_dict.items() if not re.match(exclude_key_regex, k)}
         return hf_state_dict
 
     def from_hf(
@@ -80,3 +83,29 @@ def from_hf(
                 self._uses_model_prefix = key.startswith("model.")
                 break
         return self._from_hf_w_merged_experts(hf_state_dict, device_mesh)
+
+    def convert_single_tensor_to_hf(self, fqn: str, tensor: Any, **kwargs) -> list[tuple[str, Any]]:
+        """Convert a single tensor from native format to HuggingFace format.
+
+        Args:
+            fqn: Fully qualified name of the tensor in native format
+            tensor: The tensor to convert
+            **kwargs: Additional arguments for conversion
+
+        Returns:
+            List of (fqn, tensor) tuples in HuggingFace format
+        """
+        exclude_key_regex = kwargs.get("exclude_key_regex", None)
+
+        expert_result = self._convert_single_merged_expert_to_hf_split_experts(fqn, tensor, **kwargs)
+        if expert_result is not None:
+            result = expert_result
+        else:
+            result = [(fqn, tensor)]
+
+        if exclude_key_regex:
+            import re
+
+            result = [(k, v) for k, v in result if not re.match(exclude_key_regex, k)]
+
+        return result
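
Note that the exclude filter uses re.match, which anchors the pattern at the start of the key, so patterns must account for any leading prefix. An illustrative check (keys and pattern invented for the example):

import re

exclude_key_regex = r".*rotary_emb\.inv_freq"  # illustrative pattern
keys = [
    "model.layers.0.self_attn.rotary_emb.inv_freq",
    "model.layers.0.mlp.experts.3.gate_proj.weight",
]
kept = [k for k in keys if not re.match(exclude_key_regex, k)]
assert kept == ["model.layers.0.mlp.experts.3.gate_proj.weight"]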

nemo_automodel/components/models/gpt_oss/state_dict_adapter.py

Lines changed: 55 additions & 8 deletions
@@ -212,15 +212,14 @@ def to_hf(
         self, state_dict: dict[str, Any], exclude_key_regex: Optional[str] = None, quantization: bool = False, **kwargs
     ) -> dict[str, Any]:
         """Convert from native model state dict to HuggingFace format."""
-        hf_state_dict = dict(state_dict)
-        hf_state_dict = self._apply_key_mapping(hf_state_dict, self.internal_to_hf_map)
-
-        # Apply exclude regex if provided
-        if exclude_key_regex:
-            hf_state_dict = {k: v for k, v in hf_state_dict.items() if not re.match(exclude_key_regex, k)}
+        hf_state_dict = {}
+        for fqn, tensor in state_dict.items():
+            converted_tensors = self.convert_single_tensor_to_hf(
+                fqn, tensor, exclude_key_regex=exclude_key_regex, quantization=quantization, **kwargs
+            )
+            for key, value in converted_tensors:
+                hf_state_dict[key] = value
 
-        if quantization:
-            hf_state_dict = self._add_quantization_block_scale_tensors(hf_state_dict)
         return hf_state_dict
 
     def from_hf(
@@ -244,3 +243,51 @@ def from_hf(
         native_state_dict = self._apply_key_mapping(native_state_dict, self.hf_to_internal_map)
 
         return native_state_dict
+
+    def convert_single_tensor_to_hf(self, fqn: str, tensor: Any, **kwargs) -> list[tuple[str, Any]]:
+        """Convert a single tensor from native format to HuggingFace format.
+
+        Args:
+            fqn: Fully qualified name of the tensor in native format
+            tensor: The tensor to convert
+            **kwargs: Additional arguments for conversion
+
+        Returns:
+            List of (fqn, tensor) tuples in HuggingFace format
+        """
+        quantization = kwargs.get("quantization", False)
+        exclude_key_regex = kwargs.get("exclude_key_regex", None)
+
+        hf_fqn = fqn
+        for pattern, replacement in self.internal_to_hf_map.items():
+            if fqn.endswith(pattern):
+                hf_fqn = fqn[: -len(pattern)] + replacement
+                break
+
+        if exclude_key_regex:
+            if re.match(exclude_key_regex, hf_fqn):
+                return []
+
+        if quantization:
+            if hf_fqn.endswith("gate_up_proj") or hf_fqn.endswith("down_proj"):
+                layer_name, projection_type = hf_fqn.rsplit(".", 1)
+                n_experts, _, dim = tensor.shape
+
+                if isinstance(tensor, torch.distributed.tensor.DTensor):
+                    placements, device_mesh = tensor.placements, tensor.device_mesh
+                    blocks_tensors = torch.distributed.tensor.ones(
+                        (n_experts, dim, 90, 16), placements=placements, device_mesh=device_mesh, dtype=torch.uint8
+                    )
+                    scales_tensors = torch.distributed.tensor.ones(
+                        (n_experts, dim, 90), placements=placements, device_mesh=device_mesh, dtype=torch.uint8
+                    )
+                else:
+                    blocks_tensors = torch.ones((n_experts, dim, 90, 16), dtype=torch.uint8)
+                    scales_tensors = torch.ones((n_experts, dim, 90), dtype=torch.uint8)
+
+                return [
+                    (f"{layer_name}.{projection_type}_blocks", blocks_tensors),
+                    (f"{layer_name}.{projection_type}_scales", scales_tensors),
+                ]
+
+        return [(hf_fqn, tensor)]
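
The placeholder _blocks/_scales shapes appear to encode MXFP4 group quantization: groups of 32 elements along a 2880-wide axis give 90 groups per row, with each group packing 32 4-bit values into 16 uint8 bytes plus one uint8 scale. This reading of the constants is an inference, not something the diff states:

GROUP_SIZE = 32   # elements per MXFP4 group (assumed)
QUANT_DIM = 2880  # width of the quantized axis (assumed; matches the gpt-oss hidden size)

n_groups = QUANT_DIM // GROUP_SIZE  # 90 groups per row -> the trailing 90 dim
bytes_per_group = GROUP_SIZE // 2   # 32 4-bit values pack into 16 uint8 bytes -> the trailing 16 dim
assert (n_groups, bytes_per_group) == (90, 16)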

nemo_automodel/components/models/qwen3_moe/state_dict_adapter.py

Lines changed: 33 additions & 4 deletions
@@ -55,11 +55,14 @@ def __init__(
     def to_hf(
         self, state_dict: dict[str, Any], exclude_key_regex: Optional[str] = None, quantization: bool = False, **kwargs
     ) -> dict[str, Any]:
-        hf_state_dict = self._to_hf_w_split_experts(state_dict)
-        if exclude_key_regex:
-            import re
+        hf_state_dict = {}
+        for fqn, tensor in state_dict.items():
+            converted_tensors = self.convert_single_tensor_to_hf(
+                fqn, tensor, exclude_key_regex=exclude_key_regex, quantization=quantization, **kwargs
+            )
+            for key, value in converted_tensors:
+                hf_state_dict[key] = value
 
-            hf_state_dict = {k: v for k, v in hf_state_dict.items() if not re.match(exclude_key_regex, k)}
         return hf_state_dict
 
     def from_hf(
@@ -74,3 +77,29 @@ def from_hf(
                 self._uses_model_prefix = key.startswith("model.")
                 break
         return self._from_hf_w_merged_experts(hf_state_dict, device_mesh)
+
+    def convert_single_tensor_to_hf(self, fqn: str, tensor: Any, **kwargs) -> list[tuple[str, Any]]:
+        """Convert a single tensor from native format to HuggingFace format.
+
+        Args:
+            fqn: Fully qualified name of the tensor in native format
+            tensor: The tensor to convert
+            **kwargs: Additional arguments for conversion
+
+        Returns:
+            List of (fqn, tensor) tuples in HuggingFace format
+        """
+        exclude_key_regex = kwargs.get("exclude_key_regex", None)
+
+        expert_result = self._convert_single_merged_expert_to_hf_split_experts(fqn, tensor, **kwargs)
+        if expert_result is not None:
+            result = expert_result
+        else:
+            result = [(fqn, tensor)]
+
+        if exclude_key_regex:
+            import re
+
+            result = [(k, v) for k, v in result if not re.match(exclude_key_regex, k)]
+
+        return result
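
Since to_hf is now just a fold over convert_single_tensor_to_hf, the batch and per-tensor paths should agree key-for-key. A minimal consistency check, assuming `adapter` is a constructed adapter instance from this module and `state_dict` a native-format dict:

def check_consistency(adapter, state_dict, **kwargs):
    batch = adapter.to_hf(state_dict, **kwargs)
    streamed = {}
    for fqn, tensor in state_dict.items():
        # dict.update accepts the list of (fqn, tensor) pairs directly
        streamed.update(adapter.convert_single_tensor_to_hf(fqn, tensor, **kwargs))
    assert batch.keys() == streamed.keys()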

nemo_automodel/components/models/qwen3_next/state_dict_adapter.py

Lines changed: 42 additions & 9 deletions
@@ -93,16 +93,14 @@ def _apply_key_mapping(self, state_dict: dict[str, Any], mapping: dict[str, str]
     def to_hf(
         self, state_dict: dict[str, Any], exclude_key_regex: Optional[str] = None, quantization: bool = False, **kwargs
     ) -> dict[str, Any]:
-        # First convert routed experts from grouped to split format
-        hf_state_dict = self._to_hf_w_split_experts(state_dict)
+        hf_state_dict = {}
+        for fqn, tensor in state_dict.items():
+            converted_tensors = self.convert_single_tensor_to_hf(
+                fqn, tensor, exclude_key_regex=exclude_key_regex, quantization=quantization, **kwargs
+            )
+            for key, value in converted_tensors:
+                hf_state_dict[key] = value
 
-        # Then apply key mappings for shared experts (shared_experts -> shared_expert)
-        hf_state_dict = self._apply_key_mapping(hf_state_dict, self.internal_to_hf_map)
-
-        if exclude_key_regex:
-            import re
-
-            hf_state_dict = {k: v for k, v in hf_state_dict.items() if not re.match(exclude_key_regex, k)}
         return hf_state_dict
 
     def from_hf(
@@ -122,3 +120,38 @@ def from_hf(
 
         # Then convert routed experts from split to grouped format
        return self._from_hf_w_merged_experts(hf_state_dict, device_mesh)
+
+    def convert_single_tensor_to_hf(self, fqn: str, tensor: Any, **kwargs) -> list[tuple[str, Any]]:
+        """Convert a single tensor from native format to HuggingFace format.
+
+        Args:
+            fqn: Fully qualified name of the tensor in native format
+            tensor: The tensor to convert
+            **kwargs: Additional arguments for conversion
+
+        Returns:
+            List of (fqn, tensor) tuples in HuggingFace format
+        """
+        exclude_key_regex = kwargs.get("exclude_key_regex", None)
+
+        expert_result = self._convert_single_merged_expert_to_hf_split_experts(fqn, tensor, **kwargs)
+        if expert_result is not None:
+            result = expert_result
+        else:
+            result = [(fqn, tensor)]
+
+        mapped_result = []
+        for key, value in result:
+            new_key = key
+            for pattern, replacement in self.internal_to_hf_map.items():
+                if pattern in key:
+                    new_key = new_key.replace(pattern, replacement)
+                    break
+            mapped_result.append((new_key, value))
+
+        if exclude_key_regex:
+            import re
+
+            mapped_result = [(k, v) for k, v in mapped_result if not re.match(exclude_key_regex, k)]
+
+        return mapped_result
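
Unlike the gpt_oss adapter's suffix matching, this adapter maps substrings anywhere in the key. The removed to_hf comment names one real mapping, shared_experts -> shared_expert; the standalone sketch below mirrors the loop with a single illustrative map entry:

internal_to_hf_map = {"mlp.shared_experts.": "mlp.shared_expert."}  # illustrative entry

key = "model.layers.0.mlp.shared_experts.gate_proj.weight"
new_key = key
for pattern, replacement in internal_to_hf_map.items():
    if pattern in key:
        new_key = new_key.replace(pattern, replacement)
        break  # first matching pattern wins
assert new_key == "model.layers.0.mlp.shared_expert.gate_proj.weight"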
