
Commit 00a6cf3

lessw2020 authored and H-Huang committed
[deepseek][blackwell] add manual looping group gemm to enable base working inference on Blackwell (#1272)
This PR enables DeepSeek inference to run on Blackwell (B200). Currently, torch._grouped_mm is specific to Hopper, so trying to run on B200 via TorchBF16GroupGEMM yields:

~~~
Error using torch strategy: torch._grouped_mm is only supported on CUDA devices with compute capability = 9.0
~~~

This PR therefore adds a manual looping group GEMM to get DeepSeek inference working on Blackwell.

*Note that you must use Symmetric Memory for the all-to-all; dist.all_to_all_single does not yet work on Blackwell.*

With this PR:

[Screenshot (2025-06-07): https://github.com/user-attachments/assets/0a1b77d7-6423-4c2a-91aa-2f8587cae78a]

1.21 tokens per second is not great, but we have moved from 'not working' to working inference on B200.
1 parent fb8b01e commit 00a6cf3
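As a minimal sketch (not part of this commit), the strategy choice could be driven by the device's compute capability, since torch._grouped_mm is restricted to compute capability 9.0; the pick_group_gemm_strategy helper name is hypothetical, and the returned keys mirror the group_mm options in model.py below:

~~~python
import torch


def pick_group_gemm_strategy() -> str:
    """Hypothetical helper: pick a group GEMM strategy key by compute capability."""
    if torch.cuda.is_available() and torch.cuda.get_device_capability() == (9, 0):
        return "torch"  # Hopper: the torch._grouped_mm path is supported
    return "manual"  # e.g. Blackwell / B200: fall back to the looping torch.mm strategy
~~~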

File tree

3 files changed: 67 additions & 2 deletions

- torchtitan/experiments/deepseek_v3/generate.py
- torchtitan/experiments/deepseek_v3/group_gemms.py
- torchtitan/experiments/deepseek_v3/model.py


torchtitan/experiments/deepseek_v3/generate.py

Lines changed: 1 addition & 1 deletion
~~~
@@ -224,7 +224,7 @@ def generate(
     tokenizer,
     dist_config,
     messages: list[dict],
-    n_tokens: int = 200,
+    n_tokens: int = 50,
 ):
     rank = dist.get_rank()
     device = dist_config.device
~~~

torchtitan/experiments/deepseek_v3/group_gemms.py

Lines changed: 61 additions & 0 deletions
~~~
@@ -97,9 +97,70 @@ def is_available() -> bool:
     "TorchBF16GroupGEMM",
     "TorchAOBF16GroupGEMM",
     "TritonCGBF16GroupGEMM",
+    "ManualLoopGroupGEMM",
 ]


+class ManualLoopGroupGEMM(GroupGEMMStrategy):
+    """Manual looping baseline implementation for any arch (esp Blackwell) support"""
+
+    def arrange_expert_weights(self, all_weights, submod_name, module):
+        """Store weights in a stacked format"""
+        return torch.stack(all_weights)
+
+    def execute(self, contig_tokens, m_sizes, m_offsets, module):
+        """Execute using manual loops over experts"""
+        # Get weights
+
+        w_gate = module.get_parameter("gate_proj_weight")
+        w_up = module.get_parameter("up_proj_weight")
+        w_down = module.get_parameter("down_proj_weight")
+
+        # Prepare output tensor
+        hidden_size = w_gate.shape[
+            2
+        ]  # stacked weights shape [num_experts, out_dim, in_dim]
+        output = torch.zeros(
+            contig_tokens.shape[0],
+            hidden_size,
+            dtype=contig_tokens.dtype,
+            device=contig_tokens.device,
+        )
+
+        # Process each expert sequentially
+        offset = 0
+        for expert_idx, size in enumerate(m_sizes):
+            if size > 0:
+                # Get tokens for this expert
+                expert_tokens = contig_tokens[offset : offset + size]
+
+                # Get weights for this expert
+                gate_weight = w_gate[expert_idx]  # [out_dim, in_dim]
+                up_weight = w_up[expert_idx]
+                down_weight = w_down[expert_idx]
+
+                # Forward pass: gate and up projections
+                gate_out = torch.mm(expert_tokens, gate_weight.t())
+                up_out = torch.mm(expert_tokens, up_weight.t())
+
+                # Apply activation and combine
+                hidden = self.activation_function(gate_out) * up_out
+
+                # Down projection
+                expert_output = torch.mm(hidden, down_weight.t())
+
+                # Store results
+                output[offset : offset + size] = expert_output
+
+            offset += size
+
+        return output
+
+    @staticmethod
+    def is_available() -> bool:
+        return True
+
+
 class TritonCGBF16GroupGEMM(GroupGEMMStrategy):
     """Implementation of Triton Contiguous group Gemm"""
~~~

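For illustration, a self-contained sketch (not part of the diff) of the same per-expert looping pattern on dummy data; the tensor sizes and the SiLU activation are assumptions chosen to mirror ManualLoopGroupGEMM.execute above:

~~~python
import torch
import torch.nn.functional as F

# Dummy setup: 4 experts, tokens already permuted so each expert's tokens are contiguous.
num_experts, d_model, d_ff = 4, 16, 32
m_sizes = [3, 0, 5, 2]  # tokens routed to each expert (one expert gets none)
tokens = torch.randn(sum(m_sizes), d_model)  # plays the role of contig_tokens
w_gate = torch.randn(num_experts, d_ff, d_model)  # stacked [num_experts, out_dim, in_dim]
w_up = torch.randn(num_experts, d_ff, d_model)
w_down = torch.randn(num_experts, d_model, d_ff)

output = torch.zeros_like(tokens)
offset = 0
for expert_idx, size in enumerate(m_sizes):
    if size > 0:
        x = tokens[offset : offset + size]
        # Gated MLP per expert: activation(gate) * up, then down projection
        hidden = F.silu(x @ w_gate[expert_idx].t()) * (x @ w_up[expert_idx].t())
        output[offset : offset + size] = hidden @ w_down[expert_idx].t()
    offset += size

print(output.shape)  # torch.Size([10, 16])
~~~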
torchtitan/experiments/deepseek_v3/model.py

Lines changed: 5 additions & 1 deletion
~~~
@@ -46,6 +46,7 @@

 from group_gemms import (
     DSGroupGEMM,
+    ManualLoopGroupGEMM,
     TorchAOBF16GroupGEMM,
     TorchBF16GroupGEMM,
     TorchFP8GroupGEMM,
@@ -474,7 +475,7 @@ class MoE(nn.Module):
     # Group GEMM strategies
     group_gemm_strategies = None
     # which group gemm to use?
-    group_mm = "torch"  # fp8 options = ["torchfp8", "dsgemm"] bf16 = ["torch", "torchao", "tritoncg"]
+    group_mm = "manual"  # fp8 options = ["torchfp8", "dsgemm"] bf16 = ["torch", "torchao", "tritoncg", "manual"]

     def __init__(self, config):
         super().__init__()
@@ -527,7 +528,10 @@ def __init__(self, config):
     def _initialize_group_gemm_strategies(cls):
         """Initialize available group GEMM strategies"""
         cls.group_gemm_strategies = {
+            # torch._grouped_mm
             "torch": TorchBF16GroupGEMM(MLP.act_fn),
+            # torch.mm with looping
+            "manual": ManualLoopGroupGEMM(MLP.act_fn),
             "torchao": (
                 TorchAOBF16GroupGEMM(MLP.act_fn)
                 if TorchAOBF16GroupGEMM.is_available()
~~~