
[kernels][blackwell] add cutlass/cute group gemm forward for blackwell #1327

Open: wants to merge 2 commits into base: main
19 changes: 16 additions & 3 deletions torchtitan/experiments/deepseek_v3/generate.py
@@ -8,6 +8,7 @@

# use inference.sh "Your Question Here?" to run inference with a single prompt.

+import os
import sys
from dataclasses import dataclass

@@ -127,7 +128,7 @@ def create_model(dist_config: DistConfig):
model_args.ep_size = dist_config.ep_size
model_args.num_stages = dist_config.pp_size
model_args.stage_idx = dist_config.pp_rank
-model_args.max_seq_len = 4096 # 16384
+model_args.max_seq_len = 1024 # 4096 # 16384

with dist_config.device, dist_config.mesh:
model = DeepseekForCausalLM(model_args)
@@ -224,7 +225,7 @@ def generate(
tokenizer,
dist_config,
messages: list[dict],
-n_tokens: int = 200,
+n_tokens: int = 80,
):
rank = dist.get_rank()
device = dist_config.device
@@ -353,6 +354,12 @@ def generate_with_cuda_graph(


if __name__ == "__main__":
+# set device
+torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
+
+run_with_cuda_graph = False
+run_two_times = True
+
# Get user prompt from command line arguments
user_prompt = "What is 2+2?" # Default prompt
if len(sys.argv) > 1:
@@ -375,7 +382,13 @@ def generate_with_cuda_graph(
]

generate(model, pp_schedule, tokenizer, dist_config, messages)
-generate_with_cuda_graph(model, tokenizer, dist_config, messages)
+
+# we run a second time to compare the performance (i.e. compilation overhead)
+if run_two_times:
+    generate(model, pp_schedule, tokenizer, dist_config, messages)
+
+if run_with_cuda_graph:
+    generate_with_cuda_graph(model, tokenizer, dist_config, messages)

if rank == 0:
print(f"\n{color.yellow}Closing inference mesh...{color.reset}")
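Two aspects of the generate.py change are worth noting: each rank is now pinned to its local GPU via the LOCAL_RANK environment variable (set by torchrun) before any CUDA work, and a run_two_times flag repeats generation so a second pass can be compared against the first, which carries one-time compilation and warm-up overhead. Below is a minimal, self-contained sketch of that pattern; the toy Linear model and the timing helper are illustrative stand-ins, not code from this PR.

```python
# Sketch only (not code from this PR): the rank-pinning and "run twice to
# separate warm-up overhead" pattern used by generate.py above.
# Launch with torchrun so LOCAL_RANK / RANK / WORLD_SIZE are set.
import os
import time

import torch
import torch.distributed as dist


def timed_forward(model: torch.nn.Module, x: torch.Tensor) -> float:
    """Return wall-clock seconds for one forward pass, synchronizing the GPU."""
    torch.cuda.synchronize()
    start = time.perf_counter()
    with torch.no_grad():
        model(x)
    torch.cuda.synchronize()
    return time.perf_counter() - start


if __name__ == "__main__":
    # torchrun sets LOCAL_RANK; pin this process to its own GPU before any
    # CUDA work so kernels and collectives land on the right device.
    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
    dist.init_process_group("nccl")

    # Toy stand-in for the real model; illustrative only.
    model = torch.nn.Linear(4096, 4096, device="cuda", dtype=torch.bfloat16)
    x = torch.randn(1024, 4096, device="cuda", dtype=torch.bfloat16)

    # The first pass pays one-time costs (kernel compilation, autotuning,
    # allocator warm-up); the second approximates steady-state latency.
    first, second = timed_forward(model, x), timed_forward(model, x)
    if dist.get_rank() == 0:
        print(f"first run: {first * 1e3:.2f} ms, second run: {second * 1e3:.2f} ms")

    dist.destroy_process_group()
```

Calling generate() twice under the run_two_times flag gives the same before/after comparison inside the actual script.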
13 changes: 11 additions & 2 deletions torchtitan/experiments/deepseek_v3/model.py
@@ -51,12 +51,16 @@
TorchFP8GroupGEMM,
TritonCGBF16GroupGEMM,
)

from model_config import ModelArgs
from symm_mem_recipes import OnDeviceAllToAllV
from torch import nn
from torch.distributed._functional_collectives import all_to_all_single_autograd

+# blackwell specific
+from torchtitan.experiments.kernels.blackwell.cute_grouped_gemm_fwd import (
+    CUTLASSGroupedGemmStrategy,
+)

from torchtitan.experiments.kernels.moe.indices import generate_permute_indices
from torchtitan.experiments.kernels.triton_mg_group_gemm.torchao_pr import ALIGN_SIZE_M

@@ -474,7 +478,7 @@ class MoE(nn.Module):
# Group GEMM strategies
group_gemm_strategies = None
# which group gemm to use?
group_mm = "torch" # fp8 options = ["torchfp8", "dsgemm"] bf16 = ["torch", , "torchao", "tritoncg"]
group_mm = "cute" # fp8 options = ["torchfp8", "dsgemm"] bf16 = ["torch","torchao", "tritoncg"], blackwell = ["cute"]

def __init__(self, config):
super().__init__()
@@ -550,6 +554,11 @@ def _initialize_group_gemm_strategies(cls):
if TritonCGBF16GroupGEMM.is_available()
else None
),
"cute": (
CUTLASSGroupedGemmStrategy(MLP.act_fn)
if CUTLASSGroupedGemmStrategy.is_available()
else None
),
}

def combine_experts(self, submod_name: str):
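The model.py change follows the existing pattern in _initialize_group_gemm_strategies: each backend is registered under a string key and guarded by an is_available() check, and group_mm then names the entry to use (now "cute" for Blackwell). Below is a minimal sketch of that registry-with-guard pattern; the strategy classes are hypothetical stand-ins, not torchtitan's real ones.

```python
# Sketch only (not from this PR): the registry-with-availability-guard pattern
# that MoE._initialize_group_gemm_strategies uses, with hypothetical strategy
# classes standing in for the real torch / Triton / CUTLASS backends.
from typing import Dict, Optional

import torch


class GroupGemmStrategy:
    """Minimal stand-in interface for a grouped-GEMM backend."""

    @classmethod
    def is_available(cls) -> bool:
        return True


class TorchBF16Strategy(GroupGemmStrategy):
    """Always-available reference backend (hypothetical)."""


class BlackwellCuteStrategy(GroupGemmStrategy):
    """Backend that only works on Blackwell-class GPUs (hypothetical)."""

    @classmethod
    def is_available(cls) -> bool:
        # A real check would also verify that the CUTLASS/CuTe DSL is importable;
        # compute capability 10.x corresponds to SM100 (Blackwell) parts.
        return torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 10


def build_strategies() -> Dict[str, Optional[GroupGemmStrategy]]:
    # Mirror the dict in model.py: an entry is None when its backend is
    # unavailable, so a bad `group_mm` choice fails fast with a clear error.
    return {
        "torch": TorchBF16Strategy() if TorchBF16Strategy.is_available() else None,
        "cute": BlackwellCuteStrategy() if BlackwellCuteStrategy.is_available() else None,
    }


def select_strategy(group_mm: str) -> GroupGemmStrategy:
    strategy = build_strategies().get(group_mm)
    if strategy is None:
        raise RuntimeError(f"group_mm='{group_mm}' requested but that backend is unavailable")
    return strategy


if __name__ == "__main__":
    print(type(select_strategy("torch")).__name__)
```

In model.py the same effect comes from CUTLASSGroupedGemmStrategy.is_available() guarding the "cute" entry, so on pre-Blackwell hardware that entry stays None and selecting it surfaces an error instead of a silent fallback.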