@@ -72,13 +72,13 @@ def fill_indices_wrapper(
     write_offsets: torch.Tensor,
     experts_per_rank: int,
     num_ranks: int,
-    total_size: int,
+    max_len: int,
     block_size: int = 128,
-    max_blocks: int = 1024,
+    max_blocks: int = 1024,  # cap on total number of blocks to launch
 ):
-    # Allocate exact size needed instead of max_len
+    # preallocate output
     permuted_indices = torch.full(
-        (total_size,), -1, dtype=torch.int32, device=tokens_per_expert_group.device
+        (max_len,), -1, dtype=torch.int32, device=tokens_per_expert_group.device
     )

     # write offsets is per local expert...
@@ -99,37 +99,39 @@ def fill_indices_wrapper(
     return permuted_indices


-# used for reference testing only
-
-
+# reference
 def fill_indices_cpu(
     tokens_per_expert_group: torch.Tensor,
     start_index_values: torch.Tensor,
     write_offsets: torch.Tensor,
     experts_per_rank: int,
     num_ranks: int,
-    total_size: int,  # Changed from max_len to actual required size
+    max_len: int,
 ):
-    # Allocate exact size needed
+    # We need to preallocate the output - we ignore device and force it on cpu
+    # device = tokens_per_expert_group.device
     permuted_indices = torch.full(
-        (total_size,),
+        (max_len,),
         -1,
         dtype=torch.int32,
-    )
-
+    )  # device=device)
     # Fill the permuted indices
+    # For each local expert
     for e in range(experts_per_rank):
         write_start = write_offsets[e].item()
+        # For each remote rank
         for r in range(num_ranks):
             i = r * experts_per_rank + e
             start_index = start_index_values[i].item()
             length = tokens_per_expert_group[i].item()
+            # Fill in the indices
             if length > 0:
-                end_idx = min(write_start + length, total_size)
+                end_idx = min(write_start + length, max_len)
                 permuted_indices[write_start:end_idx] = torch.arange(
                     start_index,
                     start_index + (end_idx - write_start),
                     dtype=torch.int32,
+                    # device=device,
                 )
                 write_start += length
     return permuted_indices
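For intuition about the loop above: each (remote rank r, local expert e) pair contributes a contiguous run of source positions starting at start_index_values[r * experts_per_rank + e], and those runs are written back-to-back into the output starting at write_offsets[e]; slots never written keep the -1 fill up to max_len. A minimal self-contained sketch of that behavior (toy sizes and write offsets chosen here for illustration, not taken from the PR):

import torch

# 2 ranks x 2 local experts; tokens_per_expert_group is rank-major: [r0e0, r0e1, r1e0, r1e1]
tokens_per_expert_group = torch.tensor([2, 1, 3, 2], dtype=torch.int32)
experts_per_rank, num_ranks, max_len = 2, 2, 12

# exclusive prefix sum: where each (rank, expert) chunk starts in the incoming buffer
start_index_values = torch.cumsum(tokens_per_expert_group, 0) - tokens_per_expert_group
# assume expert 0 writes at offset 0 and expert 1 at offset 6
# (in the real code these come from the aligned m_sizes / m_offsets)
write_offsets = torch.tensor([0, 6])

permuted_indices = torch.full((max_len,), -1, dtype=torch.int32)
for e in range(experts_per_rank):  # local expert
    write_start = write_offsets[e].item()
    for r in range(num_ranks):  # remote rank
        i = r * experts_per_rank + e
        start = start_index_values[i].item()
        length = tokens_per_expert_group[i].item()
        end = min(write_start + length, max_len)
        permuted_indices[write_start:end] = torch.arange(start, start + (end - write_start), dtype=torch.int32)
        write_start += length

print(permuted_indices)
# tensor([ 0,  1,  3,  4,  5, -1,  2,  6,  7, -1, -1, -1], dtype=torch.int32)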
@@ -139,22 +141,24 @@ def generate_permute_indices(
     tokens_per_expert_group: torch.Tensor,
     experts_per_rank: int,
     num_ranks: int,
+    max_len: int,
     alignment: int,
     use_cpu: bool = False,
 ):
     """
     Prepare permutation indices and the number of tokens for each expert.
-    Modified version that returns a tensor of size sum(m_sizes) instead of max_len.

     Args:
         tokens_per_expert_group: number of tokens for each expert from all ranks.
         experts_per_rank: number of experts per rank.
         num_ranks: number of ranks.
+        max_len: maximum length of the output index vector.
         alignment: alignment for each returned element in `m_sizes` and padding min for zero token experts.
         use_cpu: whether to use CPU implementation.

+
     Returns:
-        permuted_indices: Tensor of indices with size sum(m_sizes), that map original token order to the expert-grouped order.
+        permuted_indices: Tensor of indices that map original token order to the expert-grouped order.
         m_sizes: aligned number of tokens for each expert (padded to alignment boundary).
         m_offsets: Cumulative sum of m_sizes. The exclusive ending position for each expert's tokens.

@@ -165,7 +169,7 @@ def generate_permute_indices(
     | 4 | 2 | 1 | 3 | 1 | 2 | 3 | 4 |
     """

-    # prefix sum to get start index of each expert
+    # prefix sum to get start index of each expert (parallel scan kernel in future?)
     start_index_values = (
         torch.cumsum(tokens_per_expert_group, 0) - tokens_per_expert_group
     )
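The m_sizes / alignment computation sits between these two hunks and is untouched by this diff. As a rough sketch of what the docstring's "aligned number of tokens for each expert (padded to alignment boundary)" and the write-offset prefix sum amount to (the helper name and rounding details below are assumptions, not the file's actual code):

import torch

def aligned_sizes_sketch(tokens_per_expert_group, experts_per_rank, num_ranks, alignment):
    # total tokens per local expert, summed over ranks
    totals = tokens_per_expert_group.view(num_ranks, experts_per_rank).sum(dim=0)
    # round up to the alignment boundary; zero-token experts still get one block
    padded = ((totals + alignment - 1) // alignment) * alignment
    return torch.clamp(padded, min=alignment).to(torch.int32)

tokens = torch.tensor([4, 0, 2, 3], dtype=torch.int32)  # 2 ranks x 2 local experts
m_sizes = aligned_sizes_sketch(tokens, experts_per_rank=2, num_ranks=2, alignment=8)
m_offsets = torch.cumsum(m_sizes, 0)
write_offsets = m_offsets - m_sizes
print(m_sizes, write_offsets)  # e.g. aligned sizes [8, 8] and write offsets [0, 8]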
@@ -182,12 +186,10 @@ def generate_permute_indices(
     )

     # additional prefix sum to get write offset of each expert in permuted_indices
+    # write offsets is per local expert, not global
     m_offsets = torch.cumsum(m_sizes, 0)
     write_offsets = m_offsets - m_sizes

-    # Calculate the actual total size needed
-    total_size = m_offsets[-1]
-
     # Select the implementation to use
     if use_cpu:
         permuted_indices = fill_indices_cpu(
@@ -196,16 +198,16 @@ def generate_permute_indices(
             write_offsets,
             experts_per_rank,
             num_ranks,
-            total_size,
+            max_len,
         )
-    else:  # gpu
+    else:
         permuted_indices = fill_indices_wrapper(
             tokens_per_expert_group,
             start_index_values,
             write_offsets,
             experts_per_rank,
             num_ranks,
-            total_size,
+            max_len,
         )

     return permuted_indices, m_sizes, m_offsets.to(torch.int32)
@@ -225,17 +227,14 @@ def simple_test():
     alignment = 32
     # Use the GPU kernel
     permuted_indices_gpu, m_sizes, _ = generate_permute_indices(
-        tokens_per_expert_group,
-        experts_per_rank,
-        num_ranks,
-        alignment,
-        use_cpu=False,
+        tokens_per_expert_group, experts_per_rank, num_ranks, max_len, alignment
     )
     # Use the CPU method
     permuted_indices_cpu, m_sizes, _ = generate_permute_indices(
         tokens_per_expert_group,
         experts_per_rank,
         num_ranks,
+        max_len,
         alignment,
         use_cpu=True,
     )
@@ -273,15 +272,16 @@ def test_with_zero_tokens():
         tokens_per_expert_group,
         experts_per_rank,
         num_ranks,
+        max_len,
         alignment,
-        use_cpu=False,
     )

     # Use the CPU method
     permuted_indices_cpu, m_sizes_cpu, m_offsets_cpu = generate_permute_indices(
         tokens_per_expert_group,
         experts_per_rank,
         num_ranks,
+        max_len,
         alignment,
         use_cpu=True,
     )
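Since this change goes back to a fixed max_len output, downstream code again has to treat the -1 tail as padding. An illustrative (not from this PR) way to consume the outputs when gathering tokens into the expert-grouped layout:

import torch

# toy inputs (hypothetical): 8 tokens of dim 4, and a padded index vector shaped the way
# generate_permute_indices returns it, with -1 marking unused slots
tokens = torch.randn(8, 4)
permuted_indices = torch.tensor([3, 7, 1, -1, -1, -1, 0, 2, 5, -1, -1, -1], dtype=torch.int32)

valid = permuted_indices >= 0
grouped = torch.zeros(permuted_indices.numel(), tokens.shape[1], dtype=tokens.dtype)
grouped[valid] = tokens[permuted_indices[valid].long()]  # padded rows stay zero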