Commit 792f7a8

[WIP] expert parallel dp2ep
1 parent dc7fd23

File tree

16 files changed, +676 -196 lines

run_train.sh

Lines changed: 1 addition & 1 deletion

@@ -12,7 +12,7 @@ set -ex
 # LOG_RANK=0,1 NGPU=4 ./run_train.sh
 NGPU=${NGPU:-"8"}
 export LOG_RANK=${LOG_RANK:-0}
-CONFIG_FILE=${CONFIG_FILE:-"./torchtitan/models/llama3/train_configs/debug_model.toml"}
+CONFIG_FILE=${CONFIG_FILE:-"./torchtitan/experiments/llama4/train_configs/debug_model.toml"}

 overrides=""
 if [ $# -ne 0 ]; then

scripts/estimate/estimation.py

Lines changed: 3 additions & 1 deletion

@@ -46,6 +46,7 @@ def estimate_memory(job_config: JobConfig):
         cp=parallelism_config.context_parallel_degree,
         tp=parallelism_config.tensor_parallel_degree,
         pp=parallelism_config.pipeline_parallel_degree,
+        ep=parallelism_config.expert_parallel_degree,
         world_size=world_size,
         enable_loss_parallel=not parallelism_config.disable_loss_parallel,
     )
@@ -56,8 +57,9 @@ def estimate_memory(job_config: JobConfig):
         or parallel_dims.tp_enabled
         or parallel_dims.pp_enabled
         or parallel_dims.cp_enabled
+        or parallel_dims.ep_enabled
     ):
-        logger.warning("DDP, TP, PP, CP are not supported yet.")
+        logger.warning("DDP, TP, PP, CP, EP are not supported yet.")
         return
     if not parallel_dims.dp_shard_enabled:
         logger.warning("FSDP or HSDP is not enabled. Skipping memory estimation.")

scripts/generate/test_generate.py

Lines changed: 1 addition & 0 deletions

@@ -125,6 +125,7 @@ def test_generate(
         cp=1,
         tp=world_size,
         pp=1,
+        ep=1,
         world_size=world_size,
         enable_loss_parallel=False,
     )

tests/unit_tests/test_model_converter.py

Lines changed: 1 addition & 0 deletions

@@ -21,6 +21,7 @@ def build_parallel_dims(job_config, world_size):
         cp=parallelism_config.context_parallel_degree,
         tp=parallelism_config.tensor_parallel_degree,
         pp=parallelism_config.pipeline_parallel_degree,
+        ep=parallelism_config.expert_parallel_degree,
         world_size=world_size,
         enable_loss_parallel=not parallelism_config.disable_loss_parallel,
     )

torchtitan/components/ft.py

Lines changed: 1 addition & 11 deletions

@@ -124,17 +124,7 @@ def func(
                 manager=self.ft_manager.manager,
             )

-        dims = []
-        names = []
-        for d, name in zip(
-            [self.pp, self.dp_replicate, self.dp_shard, self.cp, self.tp],
-            ["pp", "dp_replicate", "dp_shard", "cp", "tp"],
-        ):
-            if d > 1 or name == "dp_replicate":
-                dims.append(d)
-                names.append(name)
-
-        return self._build_mesh(device_type, dims, names, func)
+        return self._build_mesh(device_type, func)

     @property
     def dp_replicate_enabled(self):

torchtitan/config_manager.py

Lines changed: 8 additions & 0 deletions

@@ -363,6 +363,14 @@ class Parallelism:
     The default value is 'allgather'.
     """

+    expert_parallel_degree: int = 1
+    """
+    Expert parallelism degree. 1 means disabled.
+    Currently, only "dp2ep" is supported, with the following constraints:
+    context_parallel_degree <= expert_parallel_degree <= data_parallel_shard_degree * context_parallel_degree
+    Note that this is still an experimental feature.
+    """
+

 @dataclass
 class Checkpoint:
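
As a quick illustration of the documented dp2ep constraint (not part of this commit), the sketch below enumerates the expert_parallel_degree values allowed for one assumed sharding configuration; the helper name valid_ep_degrees and the sample degrees (dp_shard=8, cp=2) are made up for illustration.

# Illustrative sketch (not torchtitan code): enumerate expert_parallel_degree
# values satisfying cp <= ep <= dp_shard * cp with the divisibility rules
# checked in ParallelDims._validate.
def valid_ep_degrees(dp_shard: int, cp: int) -> list[int]:
    upper = dp_shard * cp
    return [
        ep
        for ep in range(cp, upper + 1)
        # EP borrows all of cp plus part of dp_shard, so both must divide evenly
        if ep % cp == 0 and upper % ep == 0
    ]

print(valid_ep_degrees(dp_shard=8, cp=2))  # [2, 4, 8, 16]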

torchtitan/distributed/parallel_dims.py

Lines changed: 101 additions & 9 deletions

@@ -23,21 +23,23 @@ class ParallelDims:
     cp: int
     tp: int
     pp: int
+    ep: int
     world_size: int
     enable_loss_parallel: bool

     def __post_init__(self):
         self._validate()

     def _validate(self):
-        dp_replicate, dp_shard, cp, tp, pp = (
+        dp_replicate, dp_shard, cp, tp, pp, ep = (
             self.dp_replicate,
             self.dp_shard,
             self.cp,
             self.tp,
             self.pp,
+            self.ep,
         )
-        for d in (dp_replicate, cp, tp, pp):
+        for d in (dp_replicate, cp, tp, pp, ep):
             assert d >= 1, "Parallelism degree should be >= 1, except for dp_shard"

         assert dp_shard == -1 or dp_shard >= 1, " dp_shard must -1 or >=1."
@@ -50,26 +52,107 @@ def _validate(self):
             f"cp({cp}) * tp({tp}) * pp({pp}) != WORLD_SIZE({self.world_size})"
         )

+        if ep > 1:
+            # EP would borrow all cp and some dp_shard degree
+            assert ep % cp == 0 and (dp_shard * cp) % ep == 0
+
     def build_mesh(self, device_type: str) -> DeviceMesh:
+        return self._build_mesh(device_type, init_device_mesh)
+
+    def _build_mesh(
+        self, device_type: str, init_device_mesh_fn: Callable
+    ) -> DeviceMesh:
+        # TODO: Current implementation of ParallelDims for dp2ep Expert Parallel
+        # is not very clean, due to the limited support from DeviceMesh
+        # for creating two staggered meshes. Will improve.
+        if self.ep > 1:
+            return self._build_mesh_with_ep(device_type, init_device_mesh_fn)
+        else:
+            return self._build_mesh_without_ep(device_type, init_device_mesh_fn)
+
+    def _build_mesh_with_ep(
+        self,
+        device_type: str,
+        init_device_mesh_fn: Callable,
+    ) -> DeviceMesh:
+        # With ep, dp_shard and ep are derived submeshes:
+        # dp_shard = dp_shard_mod_ep * dp_shard_in_ep
+        # ep = dp_shard_in_ep * cp
+        dp_shard_mod_ep = self.dp_shard * self.cp // self.ep
+        dp_shard_in_ep = self.ep // self.cp
+
         dims = []
         names = []
         for d, name in zip(
-            [self.pp, self.dp_replicate, self.dp_shard, self.cp, self.tp],
-            ["pp", "dp_replicate", "dp_shard", "cp", "tp"],
+            [
+                self.pp,
+                self.dp_replicate,
+                dp_shard_mod_ep,
+                dp_shard_in_ep,
+                self.cp,
+                self.tp,
+            ],
+            ["pp", "dp_replicate", "dp_shard_mod_ep", "dp_shard_in_ep", "cp", "tp"],
         ):
-            if d > 1:
+            # dp_shard_mod_ep is needed even if it's 1, whose FSDP wrapping
+            # helps the MoE layers do mixed precision training
+            if d > 1 or name == "dp_shard_mod_ep":
                 dims.append(d)
                 names.append(name)

-        return self._build_mesh(device_type, dims, names, init_device_mesh)
+        logger.info(f"Building {len(dims)}-D device mesh with {names}, {dims}")
+        mesh = init_device_mesh_fn(device_type, dims, mesh_dim_names=names)

-    def _build_mesh(
+        # Create all the submesh here to ensure all required process groups are
+        # initialized:
+        # Mesh for data loading (no communication on this mesh)
+        dp_mesh_dim_names = []
+        # Mesh for param sharding
+        dp_shard_cp_mesh_dim_names = []
+        # Mesh for loss all-reduce
+        dp_cp_mesh_dim_names = []
+        # Mesh for ep
+        ep_mesh_dim_names = []
+
+        if self.dp_replicate_enabled:
+            dp_mesh_dim_names.append("dp_replicate")
+            dp_cp_mesh_dim_names.append("dp_replicate")
+        # dp_shard_mod_ep is always needed, even if it's 1
+        dp_mesh_dim_names.append("dp_shard_mod_ep")
+        dp_shard_cp_mesh_dim_names.append("dp_shard_mod_ep")
+        dp_cp_mesh_dim_names.append("dp_shard_mod_ep")
+        if "dp_shard_in_ep" in names:
+            dp_mesh_dim_names.append("dp_shard_in_ep")
+            dp_shard_cp_mesh_dim_names.append("dp_shard_in_ep")
+            dp_cp_mesh_dim_names.append("dp_shard_in_ep")
+            ep_mesh_dim_names.append("dp_shard_in_ep")
+        if self.cp_enabled:
+            dp_shard_cp_mesh_dim_names.append("cp")
+            dp_cp_mesh_dim_names.append("cp")
+            ep_mesh_dim_names.append("cp")
+
+        mesh[tuple(dp_mesh_dim_names)]._flatten(mesh_dim_name="dp")
+        mesh[tuple(dp_shard_cp_mesh_dim_names)]._flatten(mesh_dim_name="dp_shard_cp")
+        mesh[tuple(dp_cp_mesh_dim_names)]._flatten(mesh_dim_name="dp_cp")
+        mesh[tuple(ep_mesh_dim_names)]._flatten(mesh_dim_name="ep")
+
+        return mesh
+
+    def _build_mesh_without_ep(
         self,
         device_type: str,
-        dims: list[int],
-        names: list[str],
         init_device_mesh_fn: Callable,
     ) -> DeviceMesh:
+        dims = []
+        names = []
+        for d, name in zip(
+            [self.pp, self.dp_replicate, self.dp_shard, self.cp, self.tp],
+            ["pp", "dp_replicate", "dp_shard", "cp", "tp"],
+        ):
+            if d > 1:
+                dims.append(d)
+                names.append(name)
+
         logger.info(f"Building {len(dims)}-D device mesh with {names}, {dims}")
         mesh = init_device_mesh_fn(device_type, dims, mesh_dim_names=names)

@@ -143,3 +226,12 @@ def loss_parallel_enabled(self):
     @cached_property
     def non_data_parallel_size(self):
         return self.cp * self.tp * self.pp
+
+    @property
+    def ep_enabled(self):
+        return self.ep > 1
+
+    @property
+    def dense_params_mesh_ndim(self):
+        # Note: EP params mesh ndim is 1 more due to the 'ep' mesh
+        return self.dp_replicate_enabled + self.fsdp_enabled + self.tp_enabled
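
For intuition, the following standalone sketch (not part of this commit; the degrees are assumed values, e.g. a 32-GPU run with dp_shard=8, cp=2, tp=2, ep=4) shows how _build_mesh_with_ep factors dp_shard and cp into dp_shard_mod_ep and dp_shard_in_ep, and which sizes the flattened dp, dp_shard_cp, dp_cp, and ep submeshes end up with.

# Standalone sketch with assumed degrees; mirrors the arithmetic in
# _build_mesh_with_ep without constructing a real DeviceMesh.
dp_replicate, dp_shard, cp, tp, pp, ep = 1, 8, 2, 2, 1, 4

# dp_shard = dp_shard_mod_ep * dp_shard_in_ep, and ep = dp_shard_in_ep * cp
dp_shard_mod_ep = dp_shard * cp // ep  # 8 * 2 // 4 = 4
dp_shard_in_ep = ep // cp              # 4 // 2 = 2
assert dp_shard_mod_ep * dp_shard_in_ep == dp_shard
assert dp_shard_in_ep * cp == ep

# Sizes of the flattened submeshes created at the end of _build_mesh_with_ep
dp = dp_replicate * dp_shard_mod_ep * dp_shard_in_ep          # data loading: 8
dp_shard_cp = dp_shard_mod_ep * dp_shard_in_ep * cp           # param sharding: 16
dp_cp = dp_replicate * dp_shard_mod_ep * dp_shard_in_ep * cp  # loss all-reduce: 16
ep_size = dp_shard_in_ep * cp                                 # expert parallel: 4
print(dp, dp_shard_cp, dp_cp, ep_size)  # 8 16 16 4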

torchtitan/distributed/utils.py

Lines changed: 67 additions & 1 deletion

@@ -307,6 +307,7 @@ def clip_grad_norm_(
     error_if_nonfinite: bool = False,
     foreach: bool | None = None,
     pp_mesh: DeviceMesh | None = None,
+    parallel_dims: ParallelDims | None = None,
 ) -> torch.Tensor:
     """
     Clip the gradient norm of an iterable of parameters.
@@ -329,11 +330,23 @@ def clip_grad_norm_(
             fall back to the slow implementation for other device types.
             Default: ``None``
         pp_mesh: pipeline parallel device mesh. If not None, will reduce gradient norm across PP stages.
+        parallel_dims: ParallelDims object which contains Expert Parallel related info.

     Returns:
         Total norm of the parameter gradients (viewed as a single vector).

     """
+    if parallel_dims and parallel_dims.ep_enabled:
+        return _clip_grad_norm_with_ep(
+            parameters,
+            max_norm,
+            norm_type,
+            error_if_nonfinite,
+            foreach,
+            pp_mesh,
+            parallel_dims,
+        )
+
     if isinstance(parameters, torch.Tensor):
         parameters = [parameters]
     else:
@@ -353,7 +366,6 @@ def clip_grad_norm_(
     if isinstance(total_norm, DTensor):
         # Will reach here if any non-PP parallelism is used.
         # If only using PP, total_norm will be a local tensor.
-
         total_norm = total_norm.full_tensor()

     if pp_mesh is not None:
@@ -366,3 +378,57 @@ def clip_grad_norm_(

     torch.nn.utils.clip_grads_with_norm_(parameters, max_norm, total_norm, foreach)
     return total_norm
+
+
+@torch.no_grad()
+def _clip_grad_norm_with_ep(
+    parameters: torch.Tensor | Iterable[torch.Tensor],
+    max_norm: float,
+    norm_type: float,
+    error_if_nonfinite: bool,
+    foreach: bool | None,
+    pp_mesh: DeviceMesh | None,
+    parallel_dims: ParallelDims,
+) -> torch.Tensor:
+    assert parallel_dims.ep_enabled
+
+    ep_params = []
+    non_ep_params = []
+    ep_grads = []
+    non_ep_grads = []
+
+    for p in parameters:
+        if p.grad is None:
+            continue
+        assert isinstance(p.grad, DTensor)
+        if p.device_mesh.ndim == parallel_dims.dense_params_mesh_ndim:
+            non_ep_params.append(p)
+            non_ep_grads.append(p.grad)
+        else:
+            ep_params.append(p)
+            ep_grads.append(p.grad)
+    ep_grads_total_norm = torch.nn.utils.get_total_norm(
+        ep_grads, norm_type, error_if_nonfinite, foreach
+    ).full_tensor()
+    non_ep_grads_total_norm = torch.nn.utils.get_total_norm(
+        non_ep_grads, norm_type, error_if_nonfinite, foreach
+    ).full_tensor()
+
+    if math.isinf(norm_type):
+        total_norm = torch.maximum(ep_grads_total_norm, non_ep_grads_total_norm)
+    else:
+        total_norm = (
+            ep_grads_total_norm**norm_type + non_ep_grads_total_norm**norm_type
+        )
+        total_norm **= 1.0 / norm_type
+
+    if pp_mesh is not None:
+        if math.isinf(norm_type):
+            dist.all_reduce(total_norm, op=dist.ReduceOp.MAX, group=pp_mesh.get_group())
+        else:
+            total_norm **= norm_type
+            dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=pp_mesh.get_group())
+            total_norm **= 1.0 / norm_type
+
+    torch.nn.utils.clip_grads_with_norm_(ep_params, max_norm, total_norm, foreach)
+    torch.nn.utils.clip_grads_with_norm_(non_ep_params, max_norm, total_norm, foreach)
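
Because EP parameters live on a mesh with an extra "ep" dimension, their gradient norm is computed separately from the dense parameters' norm and then merged before a single clipping decision. The sketch below (not part of this commit) reproduces that merge on plain local tensors, with no DTensor or process groups involved; it assumes a PyTorch build that provides torch.nn.utils.get_total_norm, which the diff itself relies on.

# Sketch of the two-group norm merge on plain tensors (no DTensor involved).
import math

import torch

def combine_group_norms(ep_norm, non_ep_norm, norm_type: float) -> torch.Tensor:
    if math.isinf(norm_type):
        # inf-norm: the global max is the max of the two group maxima
        return torch.maximum(ep_norm, non_ep_norm)
    # p-norm: total = (||g_ep||_p ** p + ||g_dense||_p ** p) ** (1/p)
    return (ep_norm**norm_type + non_ep_norm**norm_type) ** (1.0 / norm_type)

ep_grads = [torch.full((4,), 3.0)]      # L2 norm = 6
non_ep_grads = [torch.full((4,), 4.0)]  # L2 norm = 8
ep_norm = torch.nn.utils.get_total_norm(ep_grads, norm_type=2.0)
non_ep_norm = torch.nn.utils.get_total_norm(non_ep_grads, norm_type=2.0)
print(combine_group_norms(ep_norm, non_ep_norm, 2.0))  # tensor(10.)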

torchtitan/experiments/llama4/README.md

Lines changed: 1 addition & 1 deletion

@@ -6,6 +6,7 @@ https://github.com/pytorch/torchtitan/issues/1118
 #### Available features
 - Llama 4 model (text-only), including a token-choice MoE architecture with efficient bfloat16 Grouped MM kernels and auxiliary-loss-free load balancing
 - FSDP, TP, PP, CP support
+- Expert Parallel support
 - DCP checkpoint conversion scripts

 #### Download Llama 4 tokenizer
@@ -20,7 +21,6 @@ python scripts/download_tokenizer.py --repo_id meta-llama/Llama-4-Scout-17B-16E
 - multimodal support
 - Parallelism
   - Context Parallel support for FlexAttention and multimodal inputs
-  - Expert Parallel support
 - torch.compile
   - for MoE layers
 - Quantization
