@@ -40,30 +40,17 @@ def apply_compile(model: nn.Module):
     torch._dynamo.config.fail_on_recompile_limit_hit = True
     for layer_id, transformer_block in model.layers.named_children():
         if transformer_block.moe_enabled:
-            # compile the experts directly which can be wrapped by fsdp
             moe = transformer_block.moe
-
-            # transformer_block.moe.experts = torch.compile(transformer_block.moe.experts, fullgraph=True))
+            # Individually compile modules to keep fullgraph=True on FSDP wrapped experts
             moe.experts = torch.compile(moe.experts, fullgraph=True)
-            moe.router = torch.compile(moe.router, fullgraph=True)
             moe.shared_expert = torch.compile(moe.shared_expert, fullgraph=True)
+
+            # Separately compile the code around the FSDP wrapped experts
+            moe.router = torch.compile(moe.router, fullgraph=True)
         else:
             transformer_block = torch.compile(transformer_block, fullgraph=True)
         model.layers.register_module(layer_id, transformer_block)

-    # def _compile_child(parent:nn.Module, child_name: str, child: nn.Module):
-    #     parent.register_module(child_name, torch.compile(child, fullgraph=True))
-
-    # torch._dynamo.config.fail_on_recompile_limit_hit = True
-    # for layer_id, transformer_block in model.layers.named_children():
-    #     if transformer_block.moe_enabled:
-    #         # compile the experts directly which can be wrapped by fsdp
-    #         moe = transformer_block.moe
-    #         # for submod_id, submod in moe.named_children():
-    #         #     _compile_child(moe, submod_id, submod)
-    #     else:
-    #         _compile_child(transformer_block, layer_id, transformer_block)
-
     logger.info("Compiling each TransformerBlock with torch.compile")
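
For reference, a minimal sketch of how apply_compile reads after this change. The imports and the module-level logger are assumptions about the surrounding file (they are not part of this hunk); the logging setup below is only a stand-in.

import logging

import torch
import torch.nn as nn

# Assumption: the real module defines its own logger; this is a placeholder.
logger = logging.getLogger(__name__)


def apply_compile(model: nn.Module):
    # Error out instead of silently falling back to eager when the recompile limit is hit.
    torch._dynamo.config.fail_on_recompile_limit_hit = True
    for layer_id, transformer_block in model.layers.named_children():
        if transformer_block.moe_enabled:
            moe = transformer_block.moe
            # Individually compile modules to keep fullgraph=True on FSDP wrapped experts.
            moe.experts = torch.compile(moe.experts, fullgraph=True)
            moe.shared_expert = torch.compile(moe.shared_expert, fullgraph=True)

            # Separately compile the code around the FSDP wrapped experts.
            moe.router = torch.compile(moe.router, fullgraph=True)
        else:
            transformer_block = torch.compile(transformer_block, fullgraph=True)
        model.layers.register_module(layer_id, transformer_block)

    logger.info("Compiling each TransformerBlock with torch.compile")

Compiling experts, shared_expert, and router as separate torch.compile regions keeps each region fullgraph while still letting FSDP wrap the experts module at its own boundary, which is why the router is now compiled after (rather than alongside) the experts.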