
Commit 3579886

fix
Signed-off-by: Hemil Desai <hemild@nvidia.com>
1 parent 5ffda53 commit 3579886

File tree

2 files changed: +92 −6 lines changed


nemo_automodel/components/moe/parallelizer.py

Lines changed: 6 additions & 6 deletions

@@ -34,7 +34,7 @@
     GroupedExpertsDeepEP,
     MoE,
 )
-from nemo_automodel.components.moe.utils import BackendConfig
+from nemo_automodel.shared.utils import dtype_from_str

 logger = logging.getLogger(__name__)
 _CP_STREAM = None
@@ -130,9 +130,11 @@ def apply_fsdp(
     mp_policy: MixedPrecisionPolicy | None = None,
     offload_policy: OffloadPolicy | None = None,
     reshard_after_forward: bool = False,
-    backend_config: BackendConfig | None = None,
-    lm_head_precision: torch.dtype | None = None,
+    lm_head_precision: str | torch.dtype | None = None,
 ):
+    if isinstance(lm_head_precision, str):
+        lm_head_precision = dtype_from_str(lm_head_precision, default=None)
+
     if mp_policy is None:
         mp_policy = MixedPrecisionPolicy(
             param_dtype=torch.bfloat16, reduce_dtype=torch.float32, output_dtype=torch.bfloat16
@@ -232,8 +234,7 @@ def parallelize_model(
     ep_shard_axis_names: tuple[str, ...] | None = None,
     activation_checkpointing: bool = False,
     reshard_after_forward: bool = False,
-    backend_config: BackendConfig | None = None,
-    lm_head_precision: torch.dtype | None = None,
+    lm_head_precision: str | torch.dtype | None = None,
 ):
     assert tp_axis_name is None or world_mesh[tp_axis_name].size() == 1, (
         "Tensor parallelism not supported for custom MoE models"
@@ -271,6 +272,5 @@ def parallelize_model(
         ep_shard_enabled=ep_shard_mesh is not None and ep_shard_mesh.size() > 1,
         ep_shard_mesh=ep_shard_mesh,
         reshard_after_forward=reshard_after_forward,
-        backend_config=backend_config,
         lm_head_precision=lm_head_precision,
     )
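With this change, lm_head_precision can be passed either as a torch.dtype or as a plain string (for example, straight from a YAML or CLI config), and apply_fsdp normalizes the string form through dtype_from_str. A minimal usage sketch, assuming a model and world_mesh constructed elsewhere (both are placeholders, not part of this commit):

    from nemo_automodel.components.moe.parallelizer import parallelize_model

    # The string "float32" is forwarded to apply_fsdp, which converts it to
    # torch.float32 via dtype_from_str before applying the lm_head precision policy.
    parallelize_model(
        model=model,            # placeholder: an already-built MoE model wrapper
        world_mesh=world_mesh,  # placeholder: a DeviceMesh with a "dp" axis
        moe_mesh=None,
        pp_enabled=False,
        dp_axis_names=("dp",),
        lm_head_precision="float32",  # equivalent to passing torch.float32
    )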

tests/unit_tests/moe/test_parallelizer.py

Lines changed: 86 additions & 0 deletions

@@ -767,3 +767,89 @@ def __init__(self):
     apply_fsdp_mock.assert_called_once()
     _, kwargs = apply_fsdp_mock.call_args
     assert kwargs.get("lm_head_precision") == torch_stub.float32
+
+
+def test_apply_fsdp_with_lm_head_precision_string_input(monkeypatch):
+    """Test that apply_fsdp accepts string input for lm_head_precision and converts to torch.dtype."""
+    P = _import_parallelizer_with_stubs(monkeypatch)
+    monkeypatch.setattr(P, "MoE", DummyMoE)
+
+    fully_shard_mock = MagicMock()
+    mp_policy_mock = MagicMock(return_value="MP_POLICY")
+    monkeypatch.setattr(P, "fully_shard", fully_shard_mock)
+    monkeypatch.setattr(P, "MixedPrecisionPolicy", mp_policy_mock)
+
+    torch_stub = sys.modules["torch"]
+
+    # Mock dtype_from_str to convert string to torch.float32
+    def mock_dtype_from_str(val, default=None):
+        if val == "float32" or val == "torch.float32":
+            return torch_stub.float32
+        return default
+
+    monkeypatch.setattr(P, "dtype_from_str", mock_dtype_from_str)
+
+    block = DummyBlock(mlp=DummyMoE())
+    lm = object()
+    model = DummyModel([block], lm_head=lm)
+    fsdp_mesh = object()
+
+    P.apply_fsdp(
+        model=model,
+        fsdp_mesh=fsdp_mesh,
+        pp_enabled=False,
+        ep_enabled=False,
+        ep_shard_enabled=False,
+        lm_head_precision="float32",
+    )
+
+    # Find the lm_head call
+    lm_call = _find_call_by_first_arg(fully_shard_mock, lm)
+    assert lm_call is not None
+
+    # Verify custom MixedPrecisionPolicy was created with fp32 for all dtypes
+    assert mp_policy_mock.call_count >= 2  # default + lm_head
+    # Find the call for lm_head's custom policy
+    fp32_policy_calls = [
+        call for call in mp_policy_mock.call_args_list
+        if call[1].get("param_dtype") == torch_stub.float32
+        and call[1].get("reduce_dtype") == torch_stub.float32
+        and call[1].get("output_dtype") == torch_stub.float32
+    ]
+    assert len(fp32_policy_calls) == 1
+
+
+def test_parallelize_model_with_lm_head_precision_string_input(monkeypatch):
+    """Test that parallelize_model accepts string input for lm_head_precision."""
+    P = _import_parallelizer_with_stubs(monkeypatch)
+    apply_fsdp_mock = MagicMock()
+    monkeypatch.setattr(P, "apply_fsdp", apply_fsdp_mock)
+    monkeypatch.setattr(P, "apply_ep", MagicMock())
+    monkeypatch.setattr(P, "apply_ac", MagicMock())
+
+    world_mesh = FakeWorldMesh({("dp",): 2}, mesh_dim_names=["dp"])
+    moe_mesh = None
+
+    class Inner:
+        def __init__(self):
+            self.moe_config = type("MC", (), {"n_routed_experts": 4})()
+
+    class Outer:
+        def __init__(self):
+            self.model = Inner()
+
+    model = Outer()
+
+    P.parallelize_model(
+        model=model,
+        world_mesh=world_mesh,
+        moe_mesh=moe_mesh,
+        pp_enabled=False,
+        dp_axis_names=("dp",),
+        lm_head_precision="float32",
+    )
+
+    # Verify apply_fsdp was called with lm_head_precision as a string
+    apply_fsdp_mock.assert_called_once()
+    _, kwargs = apply_fsdp_mock.call_args
+    assert kwargs.get("lm_head_precision") == "float32"
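The tests stub out dtype_from_str; the real helper lives in nemo_automodel.shared.utils and is not shown in this diff. A minimal sketch of the behavior the stub assumes (a dtype_from_str(val, default=None) signature that accepts both "float32" and "torch.float32" spellings); the actual implementation may differ:

    import torch

    def dtype_from_str(val, default=None):
        # Strip an optional "torch." prefix, then look the name up on the torch module.
        name = str(val).removeprefix("torch.")
        dtype = getattr(torch, name, None)
        # Fall back to the caller-supplied default for unknown or non-dtype names.
        return dtype if isinstance(dtype, torch.dtype) else default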

0 commit comments
